xref: /aosp_15_r20/external/mesa3d/src/imagination/vulkan/pvr_cmd_buffer.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2022 Imagination Technologies Ltd.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a copy
5  * of this software and associated documentation files (the "Software"), to deal
6  * in the Software without restriction, including without limitation the rights
7  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8  * copies of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <limits.h>
26 #include <stdbool.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <string.h>
30 #include <vulkan/vulkan.h>
31 
32 #include "hwdef/rogue_hw_defs.h"
33 #include "hwdef/rogue_hw_utils.h"
34 #include "pvr_blit.h"
35 #include "pvr_bo.h"
36 #include "pvr_clear.h"
37 #include "pvr_common.h"
38 #include "pvr_csb.h"
39 #include "pvr_csb_enum_helpers.h"
40 #include "pvr_device_info.h"
41 #include "pvr_formats.h"
42 #include "pvr_hardcode.h"
43 #include "pvr_hw_pass.h"
44 #include "pvr_job_common.h"
45 #include "pvr_job_render.h"
46 #include "pvr_limits.h"
47 #include "pvr_pds.h"
48 #include "pvr_private.h"
49 #include "pvr_tex_state.h"
50 #include "pvr_types.h"
51 #include "pvr_uscgen.h"
52 #include "pvr_winsys.h"
53 #include "util/bitscan.h"
54 #include "util/bitset.h"
55 #include "util/compiler.h"
56 #include "util/list.h"
57 #include "util/macros.h"
58 #include "util/u_dynarray.h"
59 #include "util/u_math.h"
60 #include "util/u_pack_color.h"
61 #include "vk_alloc.h"
62 #include "vk_command_buffer.h"
63 #include "vk_command_pool.h"
64 #include "vk_common_entrypoints.h"
65 #include "vk_format.h"
66 #include "vk_graphics_state.h"
67 #include "vk_log.h"
68 #include "vk_object.h"
69 #include "vk_util.h"
70 
71 /* Structure used to pass data into the
72  * pvr_compute_generate_control_stream() function.
73  */
74 struct pvr_compute_kernel_info {
75    pvr_dev_addr_t indirect_buffer_addr;
76    bool global_offsets_present;
77    uint32_t usc_common_size;
78    uint32_t usc_unified_size;
79    uint32_t pds_temp_size;
80    uint32_t pds_data_size;
81    enum PVRX(CDMCTRL_USC_TARGET) usc_target;
82    bool is_fence;
83    uint32_t pds_data_offset;
84    uint32_t pds_code_offset;
85    enum PVRX(CDMCTRL_SD_TYPE) sd_type;
86    bool usc_common_shared;
87    uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
88    uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
89    uint32_t max_instances;
90 };
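/* A minimal usage sketch (illustrative only, not taken from this file): a pure
 * fence kernel would typically set .is_fence = true, leave
 * .indirect_buffer_addr unset and use 1x1x1 local/global sizes, whereas a real
 * dispatch fills in the USC/PDS sizes and code/data offsets produced for the
 * compiled compute pipeline.
 */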
91 
92 static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
93                                         struct pvr_sub_cmd *sub_cmd)
94 {
95    if (sub_cmd->owned) {
96       switch (sub_cmd->type) {
97       case PVR_SUB_CMD_TYPE_GRAPHICS:
98          util_dynarray_fini(&sub_cmd->gfx.sec_query_indices);
99          pvr_csb_finish(&sub_cmd->gfx.control_stream);
100          pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.terminate_ctrl_stream);
101          pvr_bo_suballoc_free(sub_cmd->gfx.depth_bias_bo);
102          pvr_bo_suballoc_free(sub_cmd->gfx.scissor_bo);
103          break;
104 
105       case PVR_SUB_CMD_TYPE_COMPUTE:
106       case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
107          pvr_csb_finish(&sub_cmd->compute.control_stream);
108          break;
109 
110       case PVR_SUB_CMD_TYPE_TRANSFER:
111          list_for_each_entry_safe (struct pvr_transfer_cmd,
112                                    transfer_cmd,
113                                    sub_cmd->transfer.transfer_cmds,
114                                    link) {
115             list_del(&transfer_cmd->link);
116             if (!transfer_cmd->is_deferred_clear)
117                vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
118          }
119          break;
120 
121       case PVR_SUB_CMD_TYPE_EVENT:
122          if (sub_cmd->event.type == PVR_EVENT_TYPE_WAIT)
123             vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd->event.wait.events);
124          break;
125 
126       default:
127          unreachable("Unsupported sub-command type");
128       }
129    }
130 
131    list_del(&sub_cmd->link);
132    vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
133 }
134 
135 static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
136 {
137    list_for_each_entry_safe (struct pvr_sub_cmd,
138                              sub_cmd,
139                              &cmd_buffer->sub_cmds,
140                              link) {
141       pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
142    }
143 }
144 
145 static void pvr_cmd_buffer_free_resources(struct pvr_cmd_buffer *cmd_buffer)
146 {
147    vk_free(&cmd_buffer->vk.pool->alloc,
148            cmd_buffer->state.render_pass_info.attachments);
149    vk_free(&cmd_buffer->vk.pool->alloc,
150            cmd_buffer->state.render_pass_info.clear_values);
151 
152    util_dynarray_fini(&cmd_buffer->state.query_indices);
153 
154    pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
155 
156    list_for_each_entry_safe (struct pvr_suballoc_bo,
157                              suballoc_bo,
158                              &cmd_buffer->bo_list,
159                              link) {
160       list_del(&suballoc_bo->link);
161       pvr_bo_suballoc_free(suballoc_bo);
162    }
163 
164    util_dynarray_fini(&cmd_buffer->deferred_clears);
165    util_dynarray_fini(&cmd_buffer->deferred_csb_commands);
166    util_dynarray_fini(&cmd_buffer->scissor_array);
167    util_dynarray_fini(&cmd_buffer->depth_bias_array);
168 }
169 
170 static void pvr_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
171                                  VkCommandBufferResetFlags flags)
172 {
173    struct pvr_cmd_buffer *cmd_buffer =
174       container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
175 
176    /* FIXME: For now we always free all resources as if
177     * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
178     */
179    pvr_cmd_buffer_free_resources(cmd_buffer);
180 
181    vk_command_buffer_reset(&cmd_buffer->vk);
182 
183    memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
184    memset(&cmd_buffer->scissor_words, 0, sizeof(cmd_buffer->scissor_words));
185 
186    cmd_buffer->usage_flags = 0;
187 }
188 
189 static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
190 {
191    struct pvr_cmd_buffer *cmd_buffer =
192       container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
193 
194    pvr_cmd_buffer_free_resources(cmd_buffer);
195    vk_command_buffer_finish(&cmd_buffer->vk);
196    vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
197 }
198 
199 static const struct vk_command_buffer_ops cmd_buffer_ops = {
200    .reset = pvr_cmd_buffer_reset,
201    .destroy = pvr_cmd_buffer_destroy,
202 };
203 
204 static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
205                                       struct vk_command_pool *pool,
206                                       VkCommandBufferLevel level,
207                                       VkCommandBuffer *pCommandBuffer)
208 {
209    struct pvr_cmd_buffer *cmd_buffer;
210    VkResult result;
211 
212    cmd_buffer = vk_zalloc(&pool->alloc,
213                           sizeof(*cmd_buffer),
214                           8U,
215                           VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
216    if (!cmd_buffer)
217       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
218 
219    result =
220       vk_command_buffer_init(pool, &cmd_buffer->vk, &cmd_buffer_ops, level);
221    if (result != VK_SUCCESS) {
222       vk_free(&pool->alloc, cmd_buffer);
223       return result;
224    }
225 
226    cmd_buffer->device = device;
227 
228    util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
229    util_dynarray_init(&cmd_buffer->scissor_array, NULL);
230    util_dynarray_init(&cmd_buffer->deferred_csb_commands, NULL);
231    util_dynarray_init(&cmd_buffer->deferred_clears, NULL);
232 
233    list_inithead(&cmd_buffer->sub_cmds);
234    list_inithead(&cmd_buffer->bo_list);
235 
236    *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);
237 
238    return VK_SUCCESS;
239 }
240 
241 VkResult
242 pvr_AllocateCommandBuffers(VkDevice _device,
243                            const VkCommandBufferAllocateInfo *pAllocateInfo,
244                            VkCommandBuffer *pCommandBuffers)
245 {
246    VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
247    PVR_FROM_HANDLE(pvr_device, device, _device);
248    VkResult result = VK_SUCCESS;
249    uint32_t i;
250 
251    for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
252       result = pvr_cmd_buffer_create(device,
253                                      pool,
254                                      pAllocateInfo->level,
255                                      &pCommandBuffers[i]);
256       if (result != VK_SUCCESS)
257          break;
258    }
259 
260    if (result != VK_SUCCESS) {
261       while (i--) {
262          VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
263          pvr_cmd_buffer_destroy(cmd_buffer);
264       }
265 
266       for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
267          pCommandBuffers[i] = VK_NULL_HANDLE;
268    }
269 
270    return result;
271 }
272 
273 static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
274                                            enum pvr_sub_cmd_type type)
275 {
276    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
277    uint32_t barriers;
278 
279    switch (type) {
280    case PVR_SUB_CMD_TYPE_GRAPHICS:
281       barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
282       break;
283 
284    case PVR_SUB_CMD_TYPE_COMPUTE:
285       barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
286       break;
287 
288    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
289    case PVR_SUB_CMD_TYPE_TRANSFER:
290       /* Compute jobs are used for occlusion queries, but to copy the results
291        * we have to sync with transfer jobs because vkCmdCopyQueryPoolResults()
292        * is deemed a transfer operation by the spec.
293        */
294       barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
295       break;
296 
297    case PVR_SUB_CMD_TYPE_EVENT:
298       barriers = 0;
299       break;
300 
301    default:
302       unreachable("Unsupported sub-command type");
303    }
304 
305    for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
306       state->barriers_needed[i] |= barriers;
307 }
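/* Illustrative summary (how barriers_needed is consumed elsewhere is an
 * assumption, not shown in this excerpt): recording a graphics sub-command
 * ORs GEOM | FRAG into every per-stage entry above, so a later barrier or
 * sub-command targeting any stage can tell that geometry and fragment work
 * may still be outstanding.
 */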
308 
309 static VkResult
310 pvr_cmd_buffer_upload_tables(struct pvr_device *device,
311                              struct pvr_cmd_buffer *cmd_buffer,
312                              struct pvr_sub_cmd_gfx *const sub_cmd)
313 {
314    const uint32_t cache_line_size =
315       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
316    VkResult result;
317 
318    assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);
319 
320    if (cmd_buffer->depth_bias_array.size > 0) {
321       result =
322          pvr_gpu_upload(device,
323                         device->heaps.general_heap,
324                         util_dynarray_begin(&cmd_buffer->depth_bias_array),
325                         cmd_buffer->depth_bias_array.size,
326                         cache_line_size,
327                         &sub_cmd->depth_bias_bo);
328       if (result != VK_SUCCESS)
329          return result;
330    }
331 
332    if (cmd_buffer->scissor_array.size > 0) {
333       result = pvr_gpu_upload(device,
334                               device->heaps.general_heap,
335                               util_dynarray_begin(&cmd_buffer->scissor_array),
336                               cmd_buffer->scissor_array.size,
337                               cache_line_size,
338                               &sub_cmd->scissor_bo);
339       if (result != VK_SUCCESS)
340          goto err_free_depth_bias_bo;
341    }
342 
343    util_dynarray_clear(&cmd_buffer->depth_bias_array);
344    util_dynarray_clear(&cmd_buffer->scissor_array);
345 
346    return VK_SUCCESS;
347 
348 err_free_depth_bias_bo:
349    pvr_bo_suballoc_free(sub_cmd->depth_bias_bo);
350    sub_cmd->depth_bias_bo = NULL;
351 
352    return result;
353 }
354 
355 static VkResult
356 pvr_cmd_buffer_emit_ppp_state(const struct pvr_cmd_buffer *const cmd_buffer,
357                               struct pvr_csb *const csb)
358 {
359    const struct pvr_framebuffer *const framebuffer =
360       cmd_buffer->state.render_pass_info.framebuffer;
361 
362    assert(csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS ||
363           csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED);
364 
365    pvr_csb_set_relocation_mark(csb);
366 
367    pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state0) {
368       state0.addrmsb = framebuffer->ppp_state_bo->dev_addr;
369       state0.word_count = framebuffer->ppp_state_size;
370    }
371 
372    pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state1) {
373       state1.addrlsb = framebuffer->ppp_state_bo->dev_addr;
374    }
375 
376    pvr_csb_clear_relocation_mark(csb);
377 
378    return csb->status;
379 }
380 
381 VkResult
382 pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
383                               const void *const data,
384                               const size_t size,
385                               struct pvr_suballoc_bo **const pvr_bo_out)
386 {
387    struct pvr_device *const device = cmd_buffer->device;
388    const uint32_t cache_line_size =
389       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
390    struct pvr_suballoc_bo *suballoc_bo;
391    VkResult result;
392 
393    result = pvr_gpu_upload(device,
394                            device->heaps.general_heap,
395                            data,
396                            size,
397                            cache_line_size,
398                            &suballoc_bo);
399    if (result != VK_SUCCESS)
400       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
401 
402    list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
403 
404    *pvr_bo_out = suballoc_bo;
405 
406    return VK_SUCCESS;
407 }
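/* Note: the suballocation is linked into cmd_buffer->bo_list above, so it
 * lives for the lifetime of the command buffer and is released by
 * pvr_cmd_buffer_free_resources() on reset or destroy.
 */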
408 
409 static VkResult
410 pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
411                           const void *const code,
412                           const size_t code_size,
413                           uint64_t code_alignment,
414                           struct pvr_suballoc_bo **const pvr_bo_out)
415 {
416    struct pvr_device *const device = cmd_buffer->device;
417    const uint32_t cache_line_size =
418       rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
419    struct pvr_suballoc_bo *suballoc_bo;
420    VkResult result;
421 
422    code_alignment = MAX2(code_alignment, cache_line_size);
423 
424    result =
425       pvr_gpu_upload_usc(device, code, code_size, code_alignment, &suballoc_bo);
426    if (result != VK_SUCCESS)
427       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
428 
429    list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
430 
431    *pvr_bo_out = suballoc_bo;
432 
433    return VK_SUCCESS;
434 }
435 
436 VkResult pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
437                                    const uint32_t *data,
438                                    uint32_t data_size_dwords,
439                                    uint32_t data_alignment,
440                                    const uint32_t *code,
441                                    uint32_t code_size_dwords,
442                                    uint32_t code_alignment,
443                                    uint64_t min_alignment,
444                                    struct pvr_pds_upload *const pds_upload_out)
445 {
446    struct pvr_device *const device = cmd_buffer->device;
447    VkResult result;
448 
449    result = pvr_gpu_upload_pds(device,
450                                data,
451                                data_size_dwords,
452                                data_alignment,
453                                code,
454                                code_size_dwords,
455                                code_alignment,
456                                min_alignment,
457                                pds_upload_out);
458    if (result != VK_SUCCESS)
459       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
460 
461    list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);
462 
463    return VK_SUCCESS;
464 }
465 
466 static inline VkResult
467 pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
468                                const uint32_t *data,
469                                uint32_t data_size_dwords,
470                                uint32_t data_alignment,
471                                struct pvr_pds_upload *const pds_upload_out)
472 {
473    return pvr_cmd_buffer_upload_pds(cmd_buffer,
474                                     data,
475                                     data_size_dwords,
476                                     data_alignment,
477                                     NULL,
478                                     0,
479                                     0,
480                                     data_alignment,
481                                     pds_upload_out);
482 }
483 
484 /* pbe_cs_words must be an array of emit_count entries, each
485  * ROGUE_NUM_PBESTATE_STATE_WORDS dwords long.
486  */
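/* Illustrative shape (not from this file): with emit_count == 2 the caller
 * effectively passes
 *    uint32_t pbe_cs_words[2][ROGUE_NUM_PBESTATE_STATE_WORDS];
 * i.e. consecutive groups of state words, as struct pvr_emit_state lays them
 * out further down.
 */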
487 static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
488    struct pvr_cmd_buffer *const cmd_buffer,
489    const uint32_t emit_count,
490    const uint32_t *pbe_cs_words,
491    struct pvr_pds_upload *const pds_upload_out)
492 {
493    struct pvr_pds_event_program pixel_event_program = {
494       /* No data to DMA, just a DOUTU needed. */
495       .num_emit_word_pairs = 0,
496    };
497    const uint32_t staging_buffer_size =
498       PVR_DW_TO_BYTES(cmd_buffer->device->pixel_event_data_size_in_dwords);
499    const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
500    struct pvr_device *const device = cmd_buffer->device;
501    struct pvr_suballoc_bo *usc_eot_program = NULL;
502    struct util_dynarray eot_program_bin;
503    uint32_t *staging_buffer;
504    uint32_t usc_temp_count;
505    VkResult result;
506 
507    assert(emit_count > 0);
508 
509    pvr_uscgen_eot("per-job EOT",
510                   emit_count,
511                   pbe_cs_words,
512                   &usc_temp_count,
513                   &eot_program_bin);
514 
515    result = pvr_cmd_buffer_upload_usc(cmd_buffer,
516                                       eot_program_bin.data,
517                                       eot_program_bin.size,
518                                       4,
519                                       &usc_eot_program);
520 
521    util_dynarray_fini(&eot_program_bin);
522 
523    if (result != VK_SUCCESS)
524       return result;
525 
526    pvr_pds_setup_doutu(&pixel_event_program.task_control,
527                        usc_eot_program->dev_addr.addr,
528                        usc_temp_count,
529                        PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
530                        false);
531 
532    /* TODO: We could skip allocating this and generate directly into the
533     * device buffer, removing one allocation and memcpy() per job. Would this
534     * speed things up in a noticeable way?
535     */
536    staging_buffer = vk_alloc(allocator,
537                              staging_buffer_size,
538                              8,
539                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
540    if (!staging_buffer) {
541       result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
542       goto err_free_usc_pixel_program;
543    }
544 
545    /* Generate the data segment. The code segment was uploaded earlier when
546     * setting up the PDS static heap data.
547     */
548    pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
549                                              staging_buffer,
550                                              &device->pdevice->dev_info);
551 
552    result = pvr_cmd_buffer_upload_pds_data(
553       cmd_buffer,
554       staging_buffer,
555       cmd_buffer->device->pixel_event_data_size_in_dwords,
556       4,
557       pds_upload_out);
558    if (result != VK_SUCCESS)
559       goto err_free_pixel_event_staging_buffer;
560 
561    vk_free(allocator, staging_buffer);
562 
563    return VK_SUCCESS;
564 
565 err_free_pixel_event_staging_buffer:
566    vk_free(allocator, staging_buffer);
567 
568 err_free_usc_pixel_program:
569    list_del(&usc_eot_program->link);
570    pvr_bo_suballoc_free(usc_eot_program);
571 
572    return result;
573 }
574 
575 static VkResult pvr_sub_cmd_gfx_build_terminate_ctrl_stream(
576    struct pvr_device *const device,
577    const struct pvr_cmd_buffer *const cmd_buffer,
578    struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
579 {
580    struct list_head bo_list;
581    struct pvr_csb csb;
582    VkResult result;
583 
584    pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb);
585 
586    result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, &csb);
587    if (result != VK_SUCCESS)
588       goto err_csb_finish;
589 
590    result = pvr_csb_emit_terminate(&csb);
591    if (result != VK_SUCCESS)
592       goto err_csb_finish;
593 
594    result = pvr_csb_bake(&csb, &bo_list);
595    if (result != VK_SUCCESS)
596       goto err_csb_finish;
597 
598    /* This is a trivial control stream; there's no reason it should ever
599     * require more memory than a single bo can provide.
600     */
601    assert(list_is_singular(&bo_list));
602    gfx_sub_cmd->terminate_ctrl_stream =
603       list_first_entry(&bo_list, struct pvr_bo, link);
604 
605    return VK_SUCCESS;
606 
607 err_csb_finish:
608    pvr_csb_finish(&csb);
609 
610    return result;
611 }
612 
613 static VkResult pvr_setup_texture_state_words(
614    struct pvr_device *device,
615    struct pvr_combined_image_sampler_descriptor *descriptor,
616    const struct pvr_image_view *image_view)
617 {
618    const struct pvr_image *image = vk_to_pvr_image(image_view->vk.image);
619    struct pvr_texture_state_info info = {
620       .format = image_view->vk.format,
621       .mem_layout = image->memlayout,
622       .type = image_view->vk.view_type,
623       .is_cube = image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE ||
624                  image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY,
625       .tex_state_type = PVR_TEXTURE_STATE_SAMPLE,
626       .extent = image_view->vk.extent,
627       .mip_levels = 1,
628       .sample_count = image_view->vk.image->samples,
629       .stride = image->physical_extent.width,
630       .addr = image->dev_addr,
631    };
632    const uint8_t *const swizzle = pvr_get_format_swizzle(info.format);
633    VkResult result;
634 
635    memcpy(&info.swizzle, swizzle, sizeof(info.swizzle));
636 
637    /* TODO: Can we use image_view->texture_state instead of generating here? */
638    result = pvr_pack_tex_state(device, &info, descriptor->image);
639    if (result != VK_SUCCESS)
640       return result;
641 
642    descriptor->sampler = (union pvr_sampler_descriptor){ 0 };
643 
644    pvr_csb_pack (&descriptor->sampler.data.sampler_word,
645                  TEXSTATE_SAMPLER,
646                  sampler) {
647       sampler.non_normalized_coords = true;
648       sampler.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
649       sampler.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
650       sampler.minfilter = PVRX(TEXSTATE_FILTER_POINT);
651       sampler.magfilter = PVRX(TEXSTATE_FILTER_POINT);
652       sampler.dadjust = PVRX(TEXSTATE_DADJUST_ZERO_UINT);
653    }
654 
655    return VK_SUCCESS;
656 }
657 
658 static VkResult
659 pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
660                                         const struct pvr_load_op *load_op,
661                                         pvr_dev_addr_t *const addr_out)
662 {
663    const struct pvr_render_pass_info *render_pass_info =
664       &cmd_buffer->state.render_pass_info;
665    const struct pvr_render_pass *pass = render_pass_info->pass;
666    const struct pvr_renderpass_hwsetup_render *hw_render = load_op->hw_render;
667    const struct pvr_renderpass_colorinit *color_init =
668       &hw_render->color_init[0];
669    const VkClearValue *clear_value =
670       &render_pass_info->clear_values[color_init->index];
671    struct pvr_suballoc_bo *clear_bo;
672    uint32_t attachment_count;
673    bool has_depth_clear;
674    bool has_depth_load;
675    VkResult result;
676 
677    /* These are only set up and never used for now. They will need to be
678     * uploaded into a buffer based on some compiler info.
679     */
680    /* TODO: Remove the above comment once the compiler is hooked up and we're
681     * setting up + uploading the buffer.
682     */
683    struct pvr_combined_image_sampler_descriptor
684       texture_states[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS];
685    uint32_t texture_count = 0;
686    uint32_t hw_clear_value[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS *
687                            PVR_CLEAR_COLOR_ARRAY_SIZE];
688    uint32_t next_clear_consts = 0;
689 
690    if (load_op->is_hw_object)
691       attachment_count = load_op->hw_render->color_init_count;
692    else
693       attachment_count = load_op->subpass->color_count;
694 
695    for (uint32_t i = 0; i < attachment_count; i++) {
696       struct pvr_image_view *image_view;
697       uint32_t attachment_idx;
698 
699       if (load_op->is_hw_object)
700          attachment_idx = load_op->hw_render->color_init[i].index;
701       else
702          attachment_idx = load_op->subpass->color_attachments[i];
703 
704       image_view = render_pass_info->attachments[attachment_idx];
705 
706       assert((load_op->clears_loads_state.rt_load_mask &
707               load_op->clears_loads_state.rt_clear_mask) == 0);
708       if (load_op->clears_loads_state.rt_load_mask & BITFIELD_BIT(i)) {
709          result = pvr_setup_texture_state_words(cmd_buffer->device,
710                                                 &texture_states[texture_count],
711                                                 image_view);
712          if (result != VK_SUCCESS)
713             return result;
714 
715          texture_count++;
716       } else if (load_op->clears_loads_state.rt_clear_mask & BITFIELD_BIT(i)) {
717          const uint32_t accum_fmt_size =
718             pvr_get_pbe_accum_format_size_in_bytes(image_view->vk.format);
719 
720          assert(next_clear_consts +
721                    vk_format_get_blocksize(image_view->vk.format) <=
722                 ARRAY_SIZE(hw_clear_value));
723 
724          /* FIXME: do this at the point we store the clear values? */
725          pvr_get_hw_clear_color(image_view->vk.format,
726                                 clear_value->color,
727                                 &hw_clear_value[next_clear_consts]);
728 
729          next_clear_consts += DIV_ROUND_UP(accum_fmt_size, sizeof(uint32_t));
730       }
731    }
732 
733    has_depth_load = false;
734    for (uint32_t i = 0;
735         i < ARRAY_SIZE(load_op->clears_loads_state.dest_vk_format);
736         i++) {
737       if (load_op->clears_loads_state.dest_vk_format[i] ==
738           VK_FORMAT_D32_SFLOAT) {
739          has_depth_load = true;
740          break;
741       }
742    }
743 
744    has_depth_clear = load_op->clears_loads_state.depth_clear_to_reg != -1;
745 
746    assert(!(has_depth_clear && has_depth_load));
747 
748    if (has_depth_load) {
749       const struct pvr_render_pass_attachment *attachment;
750       const struct pvr_image_view *image_view;
751 
752       assert(load_op->subpass->depth_stencil_attachment !=
753              VK_ATTACHMENT_UNUSED);
754       assert(!load_op->is_hw_object);
755       attachment =
756          &pass->attachments[load_op->subpass->depth_stencil_attachment];
757 
758       image_view = render_pass_info->attachments[attachment->index];
759 
760       result = pvr_setup_texture_state_words(cmd_buffer->device,
761                                              &texture_states[texture_count],
762                                              image_view);
763       if (result != VK_SUCCESS)
764          return result;
765 
766       texture_count++;
767    } else if (has_depth_clear) {
768       const struct pvr_render_pass_attachment *attachment;
769       VkClearValue clear_value;
770 
771       assert(load_op->subpass->depth_stencil_attachment !=
772              VK_ATTACHMENT_UNUSED);
773       attachment =
774          &pass->attachments[load_op->subpass->depth_stencil_attachment];
775 
776       clear_value = render_pass_info->clear_values[attachment->index];
777 
778       assert(next_clear_consts < ARRAY_SIZE(hw_clear_value));
779       hw_clear_value[next_clear_consts++] = fui(clear_value.depthStencil.depth);
780    }
781 
782    result = pvr_cmd_buffer_upload_general(cmd_buffer,
783                                           &hw_clear_value[0],
784                                           sizeof(hw_clear_value),
785                                           &clear_bo);
786    if (result != VK_SUCCESS)
787       return result;
788 
789    *addr_out = clear_bo->dev_addr;
790 
791    return VK_SUCCESS;
792 }
793 
794 static VkResult pvr_load_op_pds_data_create_and_upload(
795    struct pvr_cmd_buffer *cmd_buffer,
796    const struct pvr_load_op *load_op,
797    pvr_dev_addr_t constants_addr,
798    struct pvr_pds_upload *const pds_upload_out)
799 {
800    struct pvr_device *device = cmd_buffer->device;
801    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
802    struct pvr_pds_pixel_shader_sa_program program = { 0 };
803    uint32_t staging_buffer_size;
804    uint32_t *staging_buffer;
805    VkResult result;
806 
807    program.num_texture_dma_kicks = 1;
808 
809    pvr_csb_pack (&program.texture_dma_address[0],
810                  PDSINST_DOUT_FIELDS_DOUTD_SRC0,
811                  value) {
812       value.sbase = constants_addr;
813    }
814 
815    pvr_csb_pack (&program.texture_dma_control[0],
816                  PDSINST_DOUT_FIELDS_DOUTD_SRC1,
817                  value) {
818       value.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE);
819       value.a0 = load_op->shareds_dest_offset;
820       value.bsize = load_op->shareds_count;
821    }
822 
823    pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);
824 
825    staging_buffer_size = PVR_DW_TO_BYTES(program.data_size);
826 
827    staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
828                              staging_buffer_size,
829                              8,
830                              VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
831    if (!staging_buffer)
832       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
833 
834    pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
835                                                        staging_buffer,
836                                                        dev_info);
837 
838    result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
839                                            staging_buffer,
840                                            program.data_size,
841                                            1,
842                                            pds_upload_out);
843    if (result != VK_SUCCESS) {
844       vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
845       return result;
846    }
847 
848    vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
849 
850    return VK_SUCCESS;
851 }
852 
853 /* FIXME: Should this function be specific to the HW background object, in
854  * which case its name should be changed, or should it have the load op
855  * structure passed in?
856  */
857 static VkResult
858 pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
859                                    const struct pvr_load_op *load_op,
860                                    struct pvr_pds_upload *const pds_upload_out)
861 {
862    pvr_dev_addr_t constants_addr;
863    VkResult result;
864 
865    result = pvr_load_op_constants_create_and_upload(cmd_buffer,
866                                                     load_op,
867                                                     &constants_addr);
868    if (result != VK_SUCCESS)
869       return result;
870 
871    return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
872                                                  load_op,
873                                                  constants_addr,
874                                                  pds_upload_out);
875 }
876 
877 static void pvr_pds_bgnd_pack_state(
878    const struct pvr_load_op *load_op,
879    const struct pvr_pds_upload *load_op_program,
880    uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
881 {
882    pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
883       value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
884       value.texunicode_addr =
885          PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
886    }
887 
888    pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
889       value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
890    }
891 
892    pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
893       value.usc_sharedsize =
894          DIV_ROUND_UP(load_op->const_shareds_count,
895                       PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE));
896       value.pds_texturestatesize = DIV_ROUND_UP(
897          load_op_program->data_size,
898          PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE));
899       value.pds_tempsize =
900          DIV_ROUND_UP(load_op->temps_count,
901                       PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE));
902    }
903 }
904 
905 /**
906  * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
907  * format.
908  *
909  * \param[in] pitch     Width pitch in bytes.
910  * \param[in] vk_format Vulkan image format.
911  * \return Stride in pixels.
912  */
913 static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
914 {
915    const unsigned int cpp = vk_format_get_blocksize(vk_format);
916 
917    assert(pitch % cpp == 0);
918 
919    return pitch / cpp;
920 }
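/* Worked example: for VK_FORMAT_R8G8B8A8_UNORM the block size is 4 bytes, so
 * a 1024-byte pitch yields a stride of 1024 / 4 = 256 pixels.
 */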
921 
922 static void pvr_setup_pbe_state(
923    const struct pvr_device_info *dev_info,
924    const struct pvr_framebuffer *framebuffer,
925    uint32_t mrt_index,
926    const struct usc_mrt_resource *mrt_resource,
927    const struct pvr_image_view *const iview,
928    const VkRect2D *render_area,
929    const bool down_scale,
930    const uint32_t samples,
931    uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
932    uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
933 {
934    const struct pvr_image *image = pvr_image_view_get_image(iview);
935    uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;
936 
937    struct pvr_pbe_surf_params surface_params;
938    struct pvr_pbe_render_params render_params;
939    bool with_packed_usc_channel;
940    const uint8_t *swizzle;
941    uint32_t position;
942 
943    /* down_scale should be true when performing a resolve, in which case there
944     * should be more than one sample.
945     */
946    assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));
947 
948    /* Setup surface parameters. */
949 
950    if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
951       with_packed_usc_channel = vk_format_is_unorm(iview->vk.format) ||
952                                 vk_format_is_snorm(iview->vk.format);
953    } else {
954       with_packed_usc_channel = false;
955    }
956 
957    swizzle = pvr_get_format_swizzle(iview->vk.format);
958    memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));
959 
960    pvr_pbe_get_src_format_and_gamma(iview->vk.format,
961                                     PVR_PBE_GAMMA_NONE,
962                                     with_packed_usc_channel,
963                                     &surface_params.source_format,
964                                     &surface_params.gamma);
965 
966    surface_params.is_normalized = pvr_vk_format_is_fully_normalized(iview->vk.format);
967    surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
968    surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);
969 
970    /* FIXME: Should we have an inline function to return the address of a mip
971     * level?
972     */
973    surface_params.addr =
974       PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
975                           image->mip_levels[iview->vk.base_mip_level].offset);
976    surface_params.addr =
977       PVR_DEV_ADDR_OFFSET(surface_params.addr,
978                           iview->vk.base_array_layer * image->layer_size);
979 
980    surface_params.mem_layout = image->memlayout;
981    surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
982    surface_params.depth = iview->vk.extent.depth;
983    surface_params.width = iview->vk.extent.width;
984    surface_params.height = iview->vk.extent.height;
985    surface_params.z_only_render = false;
986    surface_params.down_scale = down_scale;
987 
988    /* Setup render parameters. */
989 
990    if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
991       position = mrt_resource->mem.offset_dw;
992    } else {
993       assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG);
994       assert(mrt_resource->reg.offset == 0);
995 
996       position = mrt_resource->reg.output_reg;
997    }
998 
999    assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));
1000 
1001    switch (position) {
1002    case 0:
1003    case 4:
1004       render_params.source_start = PVR_PBE_STARTPOS_BIT0;
1005       break;
1006    case 1:
1007    case 5:
1008       render_params.source_start = PVR_PBE_STARTPOS_BIT32;
1009       break;
1010    case 2:
1011    case 6:
1012       render_params.source_start = PVR_PBE_STARTPOS_BIT64;
1013       break;
1014    case 3:
1015    case 7:
1016       render_params.source_start = PVR_PBE_STARTPOS_BIT96;
1017       break;
1018    default:
1019       assert(!"Invalid output register");
1020       break;
1021    }
1022 
1023 #define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v)-1 : 0)
1024 
1025    render_params.min_x_clip = MAX2(0, render_area->offset.x);
1026    render_params.min_y_clip = MAX2(0, render_area->offset.y);
1027    render_params.max_x_clip = MIN2(
1028       framebuffer->width - 1,
1029       PVR_DEC_IF_NOT_ZERO(render_area->offset.x + render_area->extent.width));
1030    render_params.max_y_clip = MIN2(
1031       framebuffer->height - 1,
1032       PVR_DEC_IF_NOT_ZERO(render_area->offset.y + render_area->extent.height));
1033 
1034 #undef PVR_DEC_IF_NOT_ZERO
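   /* Worked example (illustrative): with render_area offset (0, 0), extent
    * 1920x1080 and a 1920x1080 framebuffer, the inclusive clip rectangle
    * computed above is x in [0, 1919] and y in [0, 1079].
    */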
1035 
1036    render_params.slice = 0;
1037    render_params.mrt_index = mrt_index;
1038 
1039    pvr_pbe_pack_state(dev_info,
1040                       &surface_params,
1041                       &render_params,
1042                       pbe_cs_words,
1043                       pbe_reg_words);
1044 }
1045 
1046 static struct pvr_render_target *
1047 pvr_get_render_target(const struct pvr_render_pass *pass,
1048                       const struct pvr_framebuffer *framebuffer,
1049                       uint32_t idx)
1050 {
1051    const struct pvr_renderpass_hwsetup_render *hw_render =
1052       &pass->hw_setup->renders[idx];
1053    uint32_t rt_idx = 0;
1054 
1055    switch (hw_render->sample_count) {
1056    case 1:
1057    case 2:
1058    case 4:
1059    case 8:
1060       rt_idx = util_logbase2(hw_render->sample_count);
1061       break;
1062 
1063    default:
1064       unreachable("Unsupported sample count");
1065       break;
1066    }
1067 
1068    return &framebuffer->render_targets[rt_idx];
1069 }
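/* Example: a hw_render with sample_count == 4 uses
 * framebuffer->render_targets[util_logbase2(4)], i.e. render_targets[2].
 */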
1070 
1071 static uint32_t
1072 pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
1073                                 uint32_t idx,
1074                                 const struct pvr_device_info *dev_info)
1075 {
1076    const struct pvr_renderpass_hwsetup_render *hw_render =
1077       &pass->hw_setup->renders[idx];
1078    /* Default value based on the maximum value found across all existing
1079     * cores. The maximum is used because this value is treated as a lower
1080     * bound, making it a "safer" choice than the minimum.
1081     */
1082    const uint32_t min_output_regs =
1083       PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
1084    const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);
1085 
1086    return util_next_power_of_two(width);
1087 }
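/* Example: with output_regs_count == 3 and the default minimum of 2, the
 * width is MAX2(3, 2) == 3, rounded up to the next power of two: 4.
 */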
1088 
1089 static inline bool
1090 pvr_ds_attachment_requires_zls(const struct pvr_ds_attachment *attachment)
1091 {
1092    bool zls_used;
1093 
1094    zls_used = attachment->load.d || attachment->load.s;
1095    zls_used |= attachment->store.d || attachment->store.s;
1096 
1097    return zls_used;
1098 }
1099 
1100 /**
1101  * \brief If depth and/or stencil attachment dimensions are not tile-aligned,
1102  * then we may need to insert some additional transfer subcommands.
1103  *
1104  * It's worth noting that we check whether the dimensions are smaller than a
1105  * tile here, rather than checking whether they're tile-aligned - this relies
1106  * on the assumption that we can safely use any attachment with dimensions
1107  * larger than a tile. If the attachment is twiddled, it will be over-allocated
1108  * to the nearest power-of-two (which will be tile-aligned). If the attachment
1109  * is not twiddled, we don't need to worry about tile-alignment at all.
1110  */
1111 static bool pvr_sub_cmd_gfx_requires_ds_subtile_alignment(
1112    const struct pvr_device_info *dev_info,
1113    const struct pvr_render_job *job)
1114 {
1115    const struct pvr_image *const ds_image =
1116       pvr_image_view_get_image(job->ds.iview);
1117    uint32_t zls_tile_size_x;
1118    uint32_t zls_tile_size_y;
1119 
1120    rogue_get_zls_tile_size_xy(dev_info, &zls_tile_size_x, &zls_tile_size_y);
1121 
1122    if (ds_image->physical_extent.width >= zls_tile_size_x &&
1123        ds_image->physical_extent.height >= zls_tile_size_y) {
1124       return false;
1125    }
1126 
1127    /* If we have the zls_subtile feature, we can skip the alignment iff:
1128     *  - The attachment is not multisampled, and
1129     *  - The depth and stencil attachments are the same.
1130     */
1131    if (PVR_HAS_FEATURE(dev_info, zls_subtile) &&
1132        ds_image->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
1133        job->has_stencil_attachment == job->has_depth_attachment) {
1134       return false;
1135    }
1136 
1137    /* No ZLS functions enabled; nothing to do. */
1138    if ((!job->has_depth_attachment && !job->has_stencil_attachment) ||
1139        !pvr_ds_attachment_requires_zls(&job->ds)) {
1140       return false;
1141    }
1142 
1143    return true;
1144 }
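/* Illustrative case (the ZLS tile size depends on the core): a single-sampled
 * 16x16 depth-only attachment with a depth store enabled, on a core with a
 * 32x32 ZLS tile and without the zls_subtile feature, is smaller than a tile
 * and therefore returns true here.
 */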
1145 
1146 static VkResult
1147 pvr_sub_cmd_gfx_align_ds_subtiles(struct pvr_cmd_buffer *const cmd_buffer,
1148                                   struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
1149 {
1150    struct pvr_sub_cmd *const prev_sub_cmd =
1151       container_of(gfx_sub_cmd, struct pvr_sub_cmd, gfx);
1152    struct pvr_ds_attachment *const ds = &gfx_sub_cmd->job.ds;
1153    const struct pvr_image *const ds_image = pvr_image_view_get_image(ds->iview);
1154    const VkFormat copy_format = pvr_get_raw_copy_format(ds_image->vk.format);
1155 
1156    struct pvr_suballoc_bo *buffer;
1157    uint32_t buffer_layer_size;
1158    VkBufferImageCopy2 region;
1159    VkExtent2D zls_tile_size;
1160    VkExtent2D rounded_size;
1161    uint32_t buffer_size;
1162    VkExtent2D scale;
1163    VkResult result;
1164 
1165    /* The operations below assume the last command in the buffer was the target
1166     * gfx subcommand. Assert that this is the case.
1167     */
1168    assert(list_last_entry(&cmd_buffer->sub_cmds, struct pvr_sub_cmd, link) ==
1169           prev_sub_cmd);
1170 
1171    if (!pvr_ds_attachment_requires_zls(ds))
1172       return VK_SUCCESS;
1173 
1174    rogue_get_zls_tile_size_xy(&cmd_buffer->device->pdevice->dev_info,
1175                               &zls_tile_size.width,
1176                               &zls_tile_size.height);
1177    rogue_get_isp_scale_xy_from_samples(ds_image->vk.samples,
1178                                        &scale.width,
1179                                        &scale.height);
1180 
1181    rounded_size = (VkExtent2D){
1182       .width = ALIGN_POT(ds_image->physical_extent.width, zls_tile_size.width),
1183       .height =
1184          ALIGN_POT(ds_image->physical_extent.height, zls_tile_size.height),
1185    };
1186 
1187    buffer_layer_size = vk_format_get_blocksize(ds_image->vk.format) *
1188                        rounded_size.width * rounded_size.height * scale.width *
1189                        scale.height;
1190 
1191    if (ds->iview->vk.layer_count > 1)
1192       buffer_layer_size = ALIGN_POT(buffer_layer_size, ds_image->alignment);
1193 
1194    buffer_size = buffer_layer_size * ds->iview->vk.layer_count;
1195 
1196    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
1197                                      cmd_buffer->device->heaps.general_heap,
1198                                      buffer_size,
1199                                      &buffer);
1200    if (result != VK_SUCCESS)
1201       return result;
1202 
1203    region = (VkBufferImageCopy2){
1204       .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
1205       .pNext = NULL,
1206       .bufferOffset = 0,
1207       .bufferRowLength = rounded_size.width,
1208       .bufferImageHeight = 0,
1209       .imageSubresource = {
1210          .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
1211          .mipLevel = ds->iview->vk.base_mip_level,
1212          .baseArrayLayer = ds->iview->vk.base_array_layer,
1213          .layerCount = ds->iview->vk.layer_count,
1214       },
1215       .imageOffset = { 0 },
1216       .imageExtent = {
1217          .width = ds->iview->vk.extent.width,
1218          .height = ds->iview->vk.extent.height,
1219          .depth = 1,
1220       },
1221    };
1222 
1223    if (ds->load.d || ds->load.s) {
1224       cmd_buffer->state.current_sub_cmd = NULL;
1225 
1226       result =
1227          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1228       if (result != VK_SUCCESS)
1229          return result;
1230 
1231       result = pvr_copy_image_to_buffer_region_format(cmd_buffer,
1232                                                       ds_image,
1233                                                       buffer->dev_addr,
1234                                                       &region,
1235                                                       copy_format,
1236                                                       copy_format);
1237       if (result != VK_SUCCESS)
1238          return result;
1239 
1240       cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1241 
1242       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1243       if (result != VK_SUCCESS)
1244          return result;
1245 
1246       /* Now we have to fiddle with cmd_buffer to place this transfer command
1247        * *before* the target gfx subcommand.
1248        */
1249       list_move_to(&cmd_buffer->state.current_sub_cmd->link,
1250                    &prev_sub_cmd->link);
1251 
1252       cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1253    }
1254 
1255    if (ds->store.d || ds->store.s) {
1256       cmd_buffer->state.current_sub_cmd = NULL;
1257 
1258       result =
1259          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1260       if (result != VK_SUCCESS)
1261          return result;
1262 
1263       result = pvr_copy_buffer_to_image_region_format(cmd_buffer,
1264                                                       buffer->dev_addr,
1265                                                       ds_image,
1266                                                       &region,
1267                                                       copy_format,
1268                                                       copy_format,
1269                                                       0);
1270       if (result != VK_SUCCESS)
1271          return result;
1272 
1273       cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1274 
1275       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1276       if (result != VK_SUCCESS)
1277          return result;
1278 
1279       cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1280    }
1281 
1282    /* Finally, patch up the target graphics sub_cmd to use the correctly-strided
1283     * buffer.
1284     */
1285    ds->has_alignment_transfers = true;
1286    ds->addr = buffer->dev_addr;
1287    ds->physical_extent = rounded_size;
1288 
1289    gfx_sub_cmd->wait_on_previous_transfer = true;
1290 
1291    return VK_SUCCESS;
1292 }
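/* Sizing sketch (illustrative, assuming a 32x32 ZLS tile and 1x1 ISP sample
 * scaling on the target core): a 20x20 VK_FORMAT_D32_SFLOAT attachment rounds
 * up to 32x32, so each layer of the staging buffer above needs
 * 4 * 32 * 32 = 4096 bytes.
 */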
1293 
1294 struct pvr_emit_state {
1295    uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
1296                         [ROGUE_NUM_PBESTATE_STATE_WORDS];
1297 
1298    uint64_t pbe_reg_words[PVR_MAX_COLOR_ATTACHMENTS]
1299                          [ROGUE_NUM_PBESTATE_REG_WORDS];
1300 
1301    uint32_t emit_count;
1302 };
1303 
1304 static void
1305 pvr_setup_emit_state(const struct pvr_device_info *dev_info,
1306                      const struct pvr_renderpass_hwsetup_render *hw_render,
1307                      struct pvr_render_pass_info *render_pass_info,
1308                      struct pvr_emit_state *emit_state)
1309 {
1310    assert(hw_render->pbe_emits <= PVR_NUM_PBE_EMIT_REGS);
1311 
1312    if (hw_render->eot_surface_count == 0) {
1313       emit_state->emit_count = 1;
1314       pvr_csb_pack (&emit_state->pbe_cs_words[0][1],
1315                     PBESTATE_STATE_WORD1,
1316                     state) {
1317          state.emptytile = true;
1318       }
1319       return;
1320    }
1321 
1322    static_assert(USC_MRT_RESOURCE_TYPE_OUTPUT_REG + 1 ==
1323                     USC_MRT_RESOURCE_TYPE_MEMORY,
1324                  "The loop below needs adjusting.");
1325 
1326    emit_state->emit_count = 0;
1327    for (uint32_t resource_type = USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
1328         resource_type <= USC_MRT_RESOURCE_TYPE_MEMORY;
1329         resource_type++) {
1330       for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
1331          const struct pvr_framebuffer *framebuffer =
1332             render_pass_info->framebuffer;
1333          const struct pvr_renderpass_hwsetup_eot_surface *surface =
1334             &hw_render->eot_surfaces[i];
1335          const struct pvr_image_view *iview =
1336             render_pass_info->attachments[surface->attachment_idx];
1337          const struct usc_mrt_resource *mrt_resource =
1338             &hw_render->eot_setup.mrt_resources[surface->mrt_idx];
1339          uint32_t samples = 1;
1340 
1341          if (mrt_resource->type != resource_type)
1342             continue;
1343 
1344          if (surface->need_resolve) {
1345             const struct pvr_image_view *resolve_src =
1346                render_pass_info->attachments[surface->src_attachment_idx];
1347 
1348             /* Attachments that are the destination of resolve operations must
1349              * be loaded before their next use.
1350              */
1351             render_pass_info->enable_bg_tag = true;
1352             render_pass_info->process_empty_tiles = true;
1353 
1354             if (surface->resolve_type != PVR_RESOLVE_TYPE_PBE)
1355                continue;
1356 
1357             samples = (uint32_t)resolve_src->vk.image->samples;
1358          }
1359 
1360          assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_cs_words));
1361          assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_reg_words));
1362 
1363          pvr_setup_pbe_state(dev_info,
1364                              framebuffer,
1365                              emit_state->emit_count,
1366                              mrt_resource,
1367                              iview,
1368                              &render_pass_info->render_area,
1369                              surface->need_resolve,
1370                              samples,
1371                              emit_state->pbe_cs_words[emit_state->emit_count],
1372                              emit_state->pbe_reg_words[emit_state->emit_count]);
1373          emit_state->emit_count += 1;
1374       }
1375    }
1376 
1377    assert(emit_state->emit_count == hw_render->pbe_emits);
1378 }
1379 
1380 static inline bool
1381 pvr_is_render_area_tile_aligned(const struct pvr_cmd_buffer *cmd_buffer,
1382                                 const struct pvr_image_view *iview)
1383 {
1384    const VkRect2D *render_area =
1385       &cmd_buffer->state.render_pass_info.render_area;
1386 
1387    return render_area->offset.x == 0 && render_area->offset.y == 0 &&
1388           render_area->extent.height == iview->vk.extent.height &&
1389           render_area->extent.width == iview->vk.extent.width;
1390 }
1391 
1392 static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
1393                                          struct pvr_cmd_buffer *cmd_buffer,
1394                                          struct pvr_sub_cmd_gfx *sub_cmd)
1395 {
1396    static const VkClearDepthStencilValue default_ds_clear_value = {
1397       .depth = 1.0f,
1398       .stencil = 0xFFFFFFFF,
1399    };
1400 
1401    const struct vk_dynamic_graphics_state *dynamic_state =
1402       &cmd_buffer->vk.dynamic_graphics_state;
1403    struct pvr_render_pass_info *render_pass_info =
1404       &cmd_buffer->state.render_pass_info;
1405    const struct pvr_renderpass_hwsetup_render *hw_render =
1406       &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
1407    struct pvr_render_job *job = &sub_cmd->job;
1408    struct pvr_pds_upload pds_pixel_event_program;
1409    struct pvr_framebuffer *framebuffer = render_pass_info->framebuffer;
1410    struct pvr_spm_bgobj_state *spm_bgobj_state =
1411       &framebuffer->spm_bgobj_state_per_render[sub_cmd->hw_render_idx];
1412    struct pvr_render_target *render_target;
1413    VkResult result;
1414 
1415    if (sub_cmd->barrier_store) {
1416       /* There can only ever be one frag job running on the hardware at any one
1417        * time, and a context switch is not allowed mid-tile, so instead of
1418        * allocating a new scratch buffer we can reuse the SPM scratch buffer to
1419        * perform the store.
1420        * Use the SPM EOT program with the SPM PBE reg words to store
1421        * the render to the SPM scratch buffer.
1422        */
1423 
1424       memcpy(job->pbe_reg_words,
1425              &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1426              sizeof(job->pbe_reg_words));
1427       job->pds_pixel_event_data_offset =
1428          framebuffer->spm_eot_state_per_render[0]
1429             .pixel_event_program_data_offset;
1430    } else {
1431       struct pvr_emit_state emit_state = { 0 };
1432 
1433       pvr_setup_emit_state(dev_info, hw_render, render_pass_info, &emit_state);
1434 
1435       memcpy(job->pbe_reg_words,
1436              emit_state.pbe_reg_words,
1437              sizeof(job->pbe_reg_words));
1438 
1439       result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
1440          cmd_buffer,
1441          emit_state.emit_count,
1442          emit_state.pbe_cs_words[0],
1443          &pds_pixel_event_program);
1444       if (result != VK_SUCCESS)
1445          return result;
1446 
1447       job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;
1448    }
1449 
1450    if (sub_cmd->barrier_load) {
1451       job->enable_bg_tag = true;
1452       job->process_empty_tiles = true;
1453 
1454       /* Load the previously stored render from the SPM scratch buffer. */
1455 
1456       STATIC_ASSERT(ARRAY_SIZE(job->pds_bgnd_reg_values) ==
1457                     ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1458       typed_memcpy(job->pds_bgnd_reg_values,
1459                    spm_bgobj_state->pds_reg_values,
1460                    ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1461    } else if (hw_render->load_op) {
1462       const struct pvr_load_op *load_op = hw_render->load_op;
1463       struct pvr_pds_upload load_op_program;
1464 
1465       /* Recalculate Background Object(s). */
1466 
1467       /* FIXME: Should we free the PDS pixel event data or let it be freed
1468        * when the pool gets emptied?
1469        */
1470       result = pvr_load_op_data_create_and_upload(cmd_buffer,
1471                                                   load_op,
1472                                                   &load_op_program);
1473       if (result != VK_SUCCESS)
1474          return result;
1475 
1476       job->enable_bg_tag = render_pass_info->enable_bg_tag;
1477       job->process_empty_tiles = render_pass_info->process_empty_tiles;
1478 
1479       pvr_pds_bgnd_pack_state(load_op,
1480                               &load_op_program,
1481                               job->pds_bgnd_reg_values);
1482    }
1483 
1484    /* TODO: In some cases a PR can be removed by storing to the color attachment
1485     * and having the background object load directly from it instead of using the
1486     * scratch buffer. In those cases we can also set this to "false" and avoid
1487     * extra fw overhead.
1488     */
1489    /* The scratch buffer is always needed and allocated to avoid data loss in
1490     * case SPM is hit, so set the flag unconditionally.
1491     */
1492    job->requires_spm_scratch_buffer = true;
1493 
1494    memcpy(job->pr_pbe_reg_words,
1495           &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1496           sizeof(job->pbe_reg_words));
1497    job->pr_pds_pixel_event_data_offset =
1498       framebuffer->spm_eot_state_per_render[0].pixel_event_program_data_offset;
1499 
1500    STATIC_ASSERT(ARRAY_SIZE(job->pds_pr_bgnd_reg_values) ==
1501                  ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1502    typed_memcpy(job->pds_pr_bgnd_reg_values,
1503                 spm_bgobj_state->pds_reg_values,
1504                 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1505 
1506    render_target = pvr_get_render_target(render_pass_info->pass,
1507                                          framebuffer,
1508                                          sub_cmd->hw_render_idx);
1509    job->rt_dataset = render_target->rt_dataset;
1510 
1511    job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
1512 
1513    if (sub_cmd->depth_bias_bo)
1514       job->depth_bias_table_addr = sub_cmd->depth_bias_bo->dev_addr;
1515    else
1516       job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;
1517 
1518    if (sub_cmd->scissor_bo)
1519       job->scissor_table_addr = sub_cmd->scissor_bo->dev_addr;
1520    else
1521       job->scissor_table_addr = PVR_DEV_ADDR_INVALID;
1522 
1523    job->pixel_output_width =
1524       pvr_pass_get_pixel_output_width(render_pass_info->pass,
1525                                       sub_cmd->hw_render_idx,
1526                                       dev_info);
1527 
1528    /* Setup depth/stencil job information. */
1529    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1530       struct pvr_image_view *ds_iview =
1531          render_pass_info->attachments[hw_render->ds_attach_idx];
1532       const struct pvr_image *ds_image = pvr_image_view_get_image(ds_iview);
1533 
1534       job->has_depth_attachment = vk_format_has_depth(ds_image->vk.format);
1535       job->has_stencil_attachment = vk_format_has_stencil(ds_image->vk.format);
1536 
1537       if (job->has_depth_attachment || job->has_stencil_attachment) {
1538          uint32_t level_pitch =
1539             ds_image->mip_levels[ds_iview->vk.base_mip_level].pitch;
1540          const bool render_area_is_tile_aligned =
1541             pvr_is_render_area_tile_aligned(cmd_buffer, ds_iview);
1542          bool store_was_optimised_out = false;
1543          bool d_store = false, s_store = false;
1544          bool d_load = false, s_load = false;
1545 
1546          job->ds.iview = ds_iview;
1547          job->ds.addr = ds_image->dev_addr;
1548 
1549          job->ds.stride =
1550             pvr_stride_from_pitch(level_pitch, ds_iview->vk.format);
1551          job->ds.height = ds_iview->vk.extent.height;
1552          job->ds.physical_extent = (VkExtent2D){
1553             .width = u_minify(ds_image->physical_extent.width,
1554                               ds_iview->vk.base_mip_level),
1555             .height = u_minify(ds_image->physical_extent.height,
1556                                ds_iview->vk.base_mip_level),
1557          };
1558          job->ds.layer_size = ds_image->layer_size;
1559 
1560          job->ds_clear_value = default_ds_clear_value;
1561 
1562          if (hw_render->ds_attach_idx < render_pass_info->clear_value_count) {
1563             const VkClearDepthStencilValue *const clear_values =
1564                &render_pass_info->clear_values[hw_render->ds_attach_idx]
1565                    .depthStencil;
1566 
1567             if (job->has_depth_attachment)
1568                job->ds_clear_value.depth = clear_values->depth;
1569 
1570             if (job->has_stencil_attachment)
1571                job->ds_clear_value.stencil = clear_values->stencil;
1572          }
1573 
1574          switch (ds_iview->vk.format) {
1575          case VK_FORMAT_D16_UNORM:
1576             job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_16BITINT);
1577             break;
1578 
1579          case VK_FORMAT_S8_UINT:
1580          case VK_FORMAT_D32_SFLOAT:
1581             job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_F32Z);
1582             break;
1583 
1584          case VK_FORMAT_D24_UNORM_S8_UINT:
1585             job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_24BITINT);
1586             break;
1587 
1588          default:
1589             unreachable("Unsupported depth stencil format");
1590          }
1591 
1592          job->ds.memlayout = ds_image->memlayout;
1593 
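         /* Descriptive note (added): decide, per aspect, whether ZLS needs to
          * store and/or load the attachment. Store when the render stores the
          * aspect or a barrier store is needed, unless the store is provably
          * redundant. Load when storing a non-tile-aligned render area, when
          * the aspect is initialised with LOAD and actually used, or for a
          * barrier load.
          */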
1594          if (job->has_depth_attachment) {
1595             if (hw_render->depth_store || sub_cmd->barrier_store) {
1596                const bool depth_init_is_clear = hw_render->depth_init ==
1597                                                 VK_ATTACHMENT_LOAD_OP_CLEAR;
1598 
1599                d_store = true;
1600 
1601                if (hw_render->depth_store && render_area_is_tile_aligned &&
1602                    !(sub_cmd->modifies_depth || depth_init_is_clear)) {
1603                   d_store = false;
1604                   store_was_optimised_out = true;
1605                }
1606             }
1607 
1608             if (d_store && !render_area_is_tile_aligned) {
1609                d_load = true;
1610             } else if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1611                enum pvr_depth_stencil_usage depth_usage = sub_cmd->depth_usage;
1612 
1613                assert(depth_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1614                d_load = (depth_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1615             } else {
1616                d_load = sub_cmd->barrier_load;
1617             }
1618          }
1619 
1620          if (job->has_stencil_attachment) {
1621             if (hw_render->stencil_store || sub_cmd->barrier_store) {
1622                const bool stencil_init_is_clear = hw_render->stencil_init ==
1623                                                   VK_ATTACHMENT_LOAD_OP_CLEAR;
1624 
1625                s_store = true;
1626 
1627                if (hw_render->stencil_store && render_area_is_tile_aligned &&
1628                    !(sub_cmd->modifies_stencil || stencil_init_is_clear)) {
1629                   s_store = false;
1630                   store_was_optimised_out = true;
1631                }
1632             }
1633 
1634             if (s_store && !render_area_is_tile_aligned) {
1635                s_load = true;
1636             } else if (hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1637                enum pvr_depth_stencil_usage stencil_usage =
1638                   sub_cmd->stencil_usage;
1639 
1640                assert(stencil_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1641                s_load = (stencil_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1642             } else {
1643                s_load = sub_cmd->barrier_load;
1644             }
1645          }
1646 
1647          job->ds.load.d = d_load;
1648          job->ds.load.s = s_load;
1649          job->ds.store.d = d_store;
1650          job->ds.store.s = s_store;
1651 
1652          /* ZLS can't do masked writes for packed depth stencil formats so if
1653           * we store anything, we have to store everything.
1654           */
1655          if ((job->ds.store.d || job->ds.store.s) &&
1656              pvr_zls_format_type_is_packed(job->ds.zls_format)) {
1657             job->ds.store.d = true;
1658             job->ds.store.s = true;
1659 
1660             /* If we are only operating on one aspect of the attachment, the
1661              * unused aspect must also be loaded so that the forced store does
1662              * not corrupt its contents.
1663              */
1664             if (hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1665                job->ds.load.d = true;
1666 
1667             if (hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1668                job->ds.load.s = true;
1669          }
1670 
1671          if (pvr_ds_attachment_requires_zls(&job->ds) ||
1672              store_was_optimised_out) {
1673             job->process_empty_tiles = true;
1674          }
1675 
1676          if (pvr_sub_cmd_gfx_requires_ds_subtile_alignment(dev_info, job)) {
1677             result = pvr_sub_cmd_gfx_align_ds_subtiles(cmd_buffer, sub_cmd);
1678             if (result != VK_SUCCESS)
1679                return result;
1680          }
1681       }
1682    } else {
1683       job->has_depth_attachment = false;
1684       job->has_stencil_attachment = false;
1685       job->ds_clear_value = default_ds_clear_value;
1686    }
1687 
1688    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1689       struct pvr_image_view *iview =
1690          render_pass_info->attachments[hw_render->ds_attach_idx];
1691       const struct pvr_image *image = pvr_image_view_get_image(iview);
1692 
1693       /* If the HW render pass has a valid depth/stencil surface, determine the
1694        * sample count from the attachment's image.
1695        */
1696       job->samples = image->vk.samples;
1697    } else if (hw_render->output_regs_count) {
1698       /* If the HW render pass has output registers, we have color attachments
1699        * to write to, so determine the sample count from the count specified for
1700        * every color attachment in this render.
1701        */
1702       job->samples = hw_render->sample_count;
1703    } else if (cmd_buffer->state.gfx_pipeline) {
1704       /* If the HW render pass has no color or depth/stencil attachments, we
1705        * determine the sample count from the count given during pipeline
1706        * creation.
1707        */
1708       job->samples = dynamic_state->ms.rasterization_samples;
1709    } else if (render_pass_info->pass->attachment_count > 0) {
1710       /* If we get here, we have a render pass with subpasses containing no
1711        * attachments. The next best thing is the largest of the sample counts
1712        * specified by the render pass attachment descriptions.
1713        */
1714       job->samples = render_pass_info->pass->max_sample_count;
1715    } else {
1716       /* No appropriate framebuffer attachment is available. */
1717       mesa_logw("Defaulting render job sample count to 1.");
1718       job->samples = VK_SAMPLE_COUNT_1_BIT;
1719    }
1720 
1721    if (sub_cmd->max_tiles_in_flight ==
1722        PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
1723       /* Use the default limit based on the partition store. */
1724       job->max_tiles_in_flight = 0U;
1725    } else {
1726       job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
1727    }
1728 
1729    job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
1730    job->disable_compute_overlap = false;
1731    job->max_shared_registers = cmd_buffer->state.max_shared_regs;
1732    job->run_frag = true;
1733    job->geometry_terminate = true;
1734 
1735    return VK_SUCCESS;
1736 }
1737 
1738 static void
1739 pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
1740                              struct pvr_cmd_buffer *cmd_buffer,
1741                              struct pvr_sub_cmd_compute *sub_cmd)
1742 {
1743    sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
1744                                    cmd_buffer->state.max_shared_regs);
1745 
1746    cmd_buffer->state.max_shared_regs = 0U;
1747 }
1748 
1749 #define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
1750    (1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE))
1751 
1752 static uint32_t
1753 pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
1754                            uint32_t coeff_regs_count,
1755                            bool use_barrier,
1756                            uint32_t total_workitems)
1757 {
1758    const struct pvr_device_runtime_info *dev_runtime_info =
1759       &pdevice->dev_runtime_info;
1760    const struct pvr_device_info *dev_info = &pdevice->dev_info;
1761    uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
1762    uint32_t max_avail_coeff_regs =
1763       dev_runtime_info->cdm_max_local_mem_size_regs;
1764    uint32_t localstore_chunks_count =
1765       DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs_count),
1766                    PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1767 
1768    /* Ensure that we cannot have more workgroups in a slot than the available
1769     * number of coefficients allows us to have.
1770     */
1771    if (coeff_regs_count > 0U) {
1772       /* If the geometry or fragment jobs can overlap with the compute job, or
1773        * if there is a vertex shader already running then we need to consider
1774        * if there is a vertex shader already running, then we need to consider
1775        */
1776       if (PVR_HAS_QUIRK(dev_info, 52354) &&
1777           (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
1778            PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
1779          /* Solve for n (number of work-groups per task). All values are in
1780           * size of common store alloc blocks:
1781           *
1782           * n + (2n + 7) * (local_memory_size_max - 1) =
1783           * 	(coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1784           * ==>
1785           * n + 2n * (local_memory_size_max - 1) =
1786           * 	(coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1787           * 	- (7 * (local_memory_size_max - 1))
1788           * ==>
1789           * n * (1 + 2 * (local_memory_size_max - 1)) =
1790           * 	(coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1791           * 	- (7 * (local_memory_size_max - 1))
1792           * ==>
1793           * n = ((coefficient_memory_pool_size) -
1794           * 	(7 * pixel_allocation_size_max) -
1795           * 	(7 * (local_memory_size_max - 1))) /
1796           * 	(1 + 2 * (local_memory_size_max - 1))
1797           */
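         /* Worked example (added, hypothetical block counts, for illustration
          * only): coefficient_memory_pool_size = 1024,
          * pixel_allocation_size_max = 64 and local_memory_size_max = 4 give
          * n = (1024 - 7 * 64 - 7 * 3) / (1 + 2 * 3) = 555 / 7 = 79,
          * which is then clamped to ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK.
          */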
1798          uint32_t max_common_store_blocks =
1799             DIV_ROUND_UP(max_avail_coeff_regs * 4U,
1800                          PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1801 
1802          /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1803           */
1804          max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1805                                     PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;
1806 
1807          /* - (7 * (local_memory_size_max - 1)) */
1808          max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1809                                      (localstore_chunks_count - 1U));
1810 
1811          /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
1812          max_workgroups_per_task = max_common_store_blocks /
1813                                    (1U + 2U * (localstore_chunks_count - 1U));
1814 
1815          max_workgroups_per_task =
1816             MIN2(max_workgroups_per_task,
1817                  ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);
1818 
1819       } else {
1820          max_workgroups_per_task =
1821             MIN2((max_avail_coeff_regs / coeff_regs_count),
1822                  max_workgroups_per_task);
1823       }
1824    }
1825 
1826    /* max_workgroups_per_task should at least be one. */
1827    assert(max_workgroups_per_task >= 1U);
1828 
1829    if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
1830       /* In this case, the work-group size will have been padded up to the
1831        * next multiple of ROGUE_MAX_INSTANCES_PER_TASK, so we just set max
1832        * instances to ROGUE_MAX_INSTANCES_PER_TASK.
1833        */
1834       return ROGUE_MAX_INSTANCES_PER_TASK;
1835    }
1836 
1837    /* In this case, the number of instances in the slot must be clamped to
1838     * accommodate whole work-groups only.
1839     */
1840    if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
1841       max_workgroups_per_task =
1842          MIN2(max_workgroups_per_task,
1843               ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
1844       return total_workitems * max_workgroups_per_task;
1845    }
1846 
1847    return MIN2(total_workitems * max_workgroups_per_task,
1848                ROGUE_MAX_INSTANCES_PER_TASK);
1849 }
1850 
1851 static void
1852 pvr_compute_generate_control_stream(struct pvr_csb *csb,
1853                                     struct pvr_sub_cmd_compute *sub_cmd,
1854                                     const struct pvr_compute_kernel_info *info)
1855 {
1856    pvr_csb_set_relocation_mark(csb);
1857 
1858    /* Compute kernel 0. */
1859    pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
1860       kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
1861       kernel0.global_offsets_present = info->global_offsets_present;
1862       kernel0.usc_common_size = info->usc_common_size;
1863       kernel0.usc_unified_size = info->usc_unified_size;
1864       kernel0.pds_temp_size = info->pds_temp_size;
1865       kernel0.pds_data_size = info->pds_data_size;
1866       kernel0.usc_target = info->usc_target;
1867       kernel0.fence = info->is_fence;
1868    }
1869 
1870    /* Compute kernel 1. */
1871    pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
1872       kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
1873       kernel1.sd_type = info->sd_type;
1874       kernel1.usc_common_shared = info->usc_common_shared;
1875    }
1876 
1877    /* Compute kernel 2. */
1878    pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
1879       kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
1880    }
1881 
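   /* Descriptive note (added): for indirect dispatches the workgroup counts
    * are read from the indirect buffer (kernel words 6 and 7); otherwise they
    * are encoded inline in kernel words 3 to 5.
    */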
1882    if (info->indirect_buffer_addr.addr) {
1883       /* Compute kernel 6. */
1884       pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
1885          kernel6.indirect_addrmsb = info->indirect_buffer_addr;
1886       }
1887 
1888       /* Compute kernel 7. */
1889       pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
1890          kernel7.indirect_addrlsb = info->indirect_buffer_addr;
1891       }
1892    } else {
1893       /* Compute kernel 3. */
1894       pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
1895          assert(info->global_size[0U] > 0U);
1896          kernel3.workgroup_x = info->global_size[0U] - 1U;
1897       }
1898 
1899       /* Compute kernel 4. */
1900       pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
1901          assert(info->global_size[1U] > 0U);
1902          kernel4.workgroup_y = info->global_size[1U] - 1U;
1903       }
1904 
1905       /* Compute kernel 5. */
1906       pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
1907          assert(info->global_size[2U] > 0U);
1908          kernel5.workgroup_z = info->global_size[2U] - 1U;
1909       }
1910    }
1911 
1912    /* Compute kernel 8. */
1913    pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
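      /* Descriptive note (added): a max_instances field value of 0 is used
       * when the full ROGUE_MAX_INSTANCES_PER_TASK is requested.
       */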
1914       if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
1915          kernel8.max_instances = 0U;
1916       else
1917          kernel8.max_instances = info->max_instances;
1918 
1919       assert(info->local_size[0U] > 0U);
1920       kernel8.workgroup_size_x = info->local_size[0U] - 1U;
1921       assert(info->local_size[1U] > 0U);
1922       kernel8.workgroup_size_y = info->local_size[1U] - 1U;
1923       assert(info->local_size[2U] > 0U);
1924       kernel8.workgroup_size_z = info->local_size[2U] - 1U;
1925    }
1926 
1927    pvr_csb_clear_relocation_mark(csb);
1928 
1929    /* Track the highest shared register usage in this dispatch.
1930     * This is used by the FW for context switching, so must be large enough
1931     * to contain all the shared registers that might be in use for this compute
1932     * job. Coefficients don't need to be included as the context switch will not
1933     * happen within the execution of a single workgroup, thus nothing needs to
1934     * be preserved.
1935     */
1936    if (info->usc_common_shared) {
1937       sub_cmd->num_shared_regs =
1938          MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
1939    }
1940 }
1941 
1942 /* TODO: This can be pre-packed and uploaded directly. Would that provide any
1943  * speed up?
1944  */
1945 static void
1946 pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
1947                             struct pvr_sub_cmd_compute *const sub_cmd)
1948 {
1949    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1950    bool *const is_sw_barrier_required =
1951       &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
1952    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
1953    struct pvr_csb *csb = &sub_cmd->control_stream;
1954    const struct pvr_pds_upload *program;
1955 
1956    if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
1957        *is_sw_barrier_required) {
1958       *is_sw_barrier_required = false;
1959       program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
1960    } else {
1961       program = &cmd_buffer->device->idfwdf_state.pds;
1962    }
1963 
1964    struct pvr_compute_kernel_info info = {
1965       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1966       .global_offsets_present = false,
1967       .usc_common_size = DIV_ROUND_UP(
1968          PVR_DW_TO_BYTES(cmd_buffer->device->idfwdf_state.usc_shareds),
1969          PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
1970       .usc_unified_size = 0U,
1971       .pds_temp_size = 0U,
1972       .pds_data_size =
1973          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
1974                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
1975       .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
1976       .is_fence = false,
1977       .pds_data_offset = program->data_offset,
1978       .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
1979       .usc_common_shared = true,
1980       .pds_code_offset = program->code_offset,
1981       .global_size = { 1U, 1U, 1U },
1982       .local_size = { 1U, 1U, 1U },
1983    };
1984 
1985    /* We don't need to pad work-group size for this case. */
1986 
1987    info.max_instances =
1988       pvr_compute_flat_slot_size(pdevice,
1989                                  cmd_buffer->device->idfwdf_state.usc_shareds,
1990                                  false,
1991                                  1U);
1992 
1993    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
1994 }
1995 
1996 void pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
1997                                 struct pvr_sub_cmd_compute *const sub_cmd,
1998                                 bool deallocate_shareds)
1999 {
2000    const struct pvr_pds_upload *program =
2001       &cmd_buffer->device->pds_compute_fence_program;
2002    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
2003    struct pvr_csb *csb = &sub_cmd->control_stream;
2004 
2005    struct pvr_compute_kernel_info info = {
2006       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
2007       .global_offsets_present = false,
2008       .usc_common_size = 0U,
2009       .usc_unified_size = 0U,
2010       .pds_temp_size = 0U,
2011       .pds_data_size =
2012          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
2013                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
2014       .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
2015       .is_fence = true,
2016       .pds_data_offset = program->data_offset,
2017       .sd_type = PVRX(CDMCTRL_SD_TYPE_PDS),
2018       .usc_common_shared = deallocate_shareds,
2019       .pds_code_offset = program->code_offset,
2020       .global_size = { 1U, 1U, 1U },
2021       .local_size = { 1U, 1U, 1U },
2022    };
2023 
2024    /* We don't need to pad work-group size for this case. */
2025    /* Here we calculate the slot size. This can depend on the use of barriers,
2026     * local memory, BRN's or other factors.
2027     * local memory, BRNs or other factors.
2028    info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);
2029 
2030    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2031 }
2032 
2033 static VkResult
2034 pvr_cmd_buffer_process_deferred_clears(struct pvr_cmd_buffer *cmd_buffer)
2035 {
2036    util_dynarray_foreach (&cmd_buffer->deferred_clears,
2037                           struct pvr_transfer_cmd,
2038                           transfer_cmd) {
2039       VkResult result;
2040 
2041       result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd);
2042       if (result != VK_SUCCESS)
2043          return result;
2044 
2045       cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
2046    }
2047 
2048    return VK_SUCCESS;
2049 }
2050 
2051 VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
2052 {
2053    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2054    struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
2055    struct pvr_device *device = cmd_buffer->device;
2056    const struct pvr_query_pool *query_pool = NULL;
2057    struct pvr_suballoc_bo *query_bo = NULL;
2058    size_t query_indices_size = 0;
2059    VkResult result;
2060 
2061    /* FIXME: Is this NULL check required because this function is called from
2062     * pvr_resolve_unemitted_resolve_attachments()? See comment about this
2063     * function being called twice in a row in pvr_CmdEndRenderPass().
2064     */
2065    if (!sub_cmd)
2066       return VK_SUCCESS;
2067 
2068    if (!sub_cmd->owned) {
2069       state->current_sub_cmd = NULL;
2070       return VK_SUCCESS;
2071    }
2072 
2073    switch (sub_cmd->type) {
2074    case PVR_SUB_CMD_TYPE_GRAPHICS: {
2075       struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;
2076 
2077       query_indices_size =
2078          util_dynarray_num_elements(&state->query_indices, char);
2079 
2080       if (query_indices_size > 0) {
2081          const bool secondary_cont =
2082             cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2083             cmd_buffer->usage_flags &
2084                VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2085 
2086          assert(gfx_sub_cmd->query_pool);
2087 
2088          if (secondary_cont) {
2089             util_dynarray_append_dynarray(&state->query_indices,
2090                                           &gfx_sub_cmd->sec_query_indices);
2091          } else {
2092             const void *data = util_dynarray_begin(&state->query_indices);
2093 
2094             result = pvr_cmd_buffer_upload_general(cmd_buffer,
2095                                                    data,
2096                                                    query_indices_size,
2097                                                    &query_bo);
2098             if (result != VK_SUCCESS)
2099                return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2100 
2101             query_pool = gfx_sub_cmd->query_pool;
2102          }
2103 
2104          gfx_sub_cmd->has_occlusion_query = true;
2105 
2106          util_dynarray_clear(&state->query_indices);
2107       }
2108 
2109       if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2110          result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
2111          if (result != VK_SUCCESS)
2112             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2113 
2114          break;
2115       }
2116 
2117       /* TODO: Check if the sub_cmd can be skipped based on
2118        * sub_cmd->gfx.empty_cmd flag.
2119        */
2120 
2121       /* TODO: Set the state in the functions called with the command buffer
2122        * instead of here.
2123        */
2124 
2125       result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
2126       if (result != VK_SUCCESS)
2127          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2128 
2129       result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer,
2130                                              &gfx_sub_cmd->control_stream);
2131       if (result != VK_SUCCESS)
2132          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2133 
2134       result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
2135       if (result != VK_SUCCESS)
2136          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2137 
2138       result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
2139                                         cmd_buffer,
2140                                         gfx_sub_cmd);
2141       if (result != VK_SUCCESS)
2142          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2143 
2144       if (pvr_sub_cmd_gfx_requires_split_submit(gfx_sub_cmd)) {
2145          result = pvr_sub_cmd_gfx_build_terminate_ctrl_stream(device,
2146                                                               cmd_buffer,
2147                                                               gfx_sub_cmd);
2148          if (result != VK_SUCCESS)
2149             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2150       }
2151 
2152       break;
2153    }
2154 
2155    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2156    case PVR_SUB_CMD_TYPE_COMPUTE: {
2157       struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
2158 
2159       pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);
2160 
2161       result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
2162       if (result != VK_SUCCESS)
2163          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2164 
2165       pvr_sub_cmd_compute_job_init(device->pdevice,
2166                                    cmd_buffer,
2167                                    compute_sub_cmd);
2168       break;
2169    }
2170 
2171    case PVR_SUB_CMD_TYPE_TRANSFER:
2172       break;
2173 
2174    case PVR_SUB_CMD_TYPE_EVENT:
2175       break;
2176 
2177    default:
2178       unreachable("Unsupported sub-command type");
2179    }
2180 
2181    state->current_sub_cmd = NULL;
2182 
2183    /* pvr_cmd_buffer_process_deferred_clears() must be called with a NULL
2184     * current_sub_cmd.
2185     *
2186     * We can start a sub_cmd of a different type from the current sub_cmd only
2187     * after having ended the current sub_cmd. However, we can't end the current
2188     * sub_cmd if this depends on starting sub_cmd(s) of a different type. Hence,
2189     * don't try to start transfer sub_cmd(s) with
2190     * pvr_cmd_buffer_process_deferred_clears() until the current one has ended.
2191     * Failing to do so would cause a circular dependency between
2192     * pvr_cmd_buffer_{end,start}_sub_cmd() and blow the stack.
2193     */
2194    if (sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
2195       result = pvr_cmd_buffer_process_deferred_clears(cmd_buffer);
2196       if (result != VK_SUCCESS)
2197          return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2198    }
2199 
2200    if (query_pool) {
2201       struct pvr_query_info query_info;
2202 
2203       assert(query_bo);
2204       assert(query_indices_size);
2205 
2206       query_info.type = PVR_QUERY_TYPE_AVAILABILITY_WRITE;
2207 
2208       /* sizeof(uint32_t) is the size of a single query index. */
2209       query_info.availability_write.num_query_indices =
2210          query_indices_size / sizeof(uint32_t);
2211       query_info.availability_write.index_bo = query_bo;
2212 
2213       query_info.availability_write.num_queries = query_pool->query_count;
2214       query_info.availability_write.availability_bo =
2215          query_pool->availability_buffer;
2216 
2217       /* Insert a barrier after the graphics sub command and before the
2218        * query sub command so that the availability write program waits for the
2219        * fragment shader to complete.
2220        */
2221 
2222       result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
2223       if (result != VK_SUCCESS)
2224          return result;
2225 
2226       cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
2227          .type = PVR_EVENT_TYPE_BARRIER,
2228          .barrier = {
2229             .wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
2230             .wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
2231          },
2232       };
2233 
2234       return pvr_add_query_program(cmd_buffer, &query_info);
2235    }
2236 
2237    return VK_SUCCESS;
2238 }
2239 
2240 void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer *const cmd_buffer,
2241                                     bool start_geom)
2242 {
2243    struct vk_dynamic_graphics_state *const dynamic_state =
2244       &cmd_buffer->vk.dynamic_graphics_state;
2245 
2246    if (start_geom) {
2247       /*
2248        * Initial geometry phase state.
2249        * It's the driver's responsibility to ensure that the state of the
2250        * hardware is correctly initialized at the start of every geometry
2251        * phase. This is required to prevent stale state from a previous
2252        * geometry phase erroneously affecting the next geometry phase.
2253        *
2254        * If a geometry phase does not contain any geometry, this restriction
2255        * can be ignored. If the first draw call in a geometry phase will only
2256        * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
2257        * in the ISP State Control Word, the PDS State Pointers
2258        * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
2259        * be supplied, since they will never reach the PDS in the fragment
2260        * phase.
2261        */
2262 
2263       cmd_buffer->state.emit_header = (struct PVRX(TA_STATE_HEADER)){
2264          .pres_stream_out_size = true,
2265          .pres_ppp_ctrl = true,
2266          .pres_varying_word2 = true,
2267          .pres_varying_word1 = true,
2268          .pres_varying_word0 = true,
2269          .pres_outselects = true,
2270          .pres_wclamp = true,
2271          .pres_viewport = true,
2272          .pres_region_clip = true,
2273          .pres_pds_state_ptr0 = true,
2274          .pres_ispctl_fb = true,
2275          .pres_ispctl = true,
2276       };
2277    } else {
2278       struct PVRX(TA_STATE_HEADER) *const emit_header =
2279          &cmd_buffer->state.emit_header;
2280 
2281       emit_header->pres_ppp_ctrl = true;
2282       emit_header->pres_varying_word1 = true;
2283       emit_header->pres_varying_word0 = true;
2284       emit_header->pres_outselects = true;
2285       emit_header->pres_viewport = true;
2286       emit_header->pres_region_clip = true;
2287       emit_header->pres_pds_state_ptr0 = true;
2288       emit_header->pres_ispctl_fb = true;
2289       emit_header->pres_ispctl = true;
2290    }
2291 
2292    memset(&cmd_buffer->state.ppp_state,
2293           0U,
2294           sizeof(cmd_buffer->state.ppp_state));
2295 
2296    cmd_buffer->state.dirty.vertex_bindings = true;
2297    cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2298 
2299    BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2300    BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
2301 }
2302 
2303 static inline bool
2304 pvr_cmd_uses_deferred_cs_cmds(const struct pvr_cmd_buffer *const cmd_buffer)
2305 {
2306    const VkCommandBufferUsageFlags deferred_control_stream_flags =
2307       VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT |
2308       VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2309 
2310    return cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2311           (cmd_buffer->usage_flags & deferred_control_stream_flags) ==
2312              deferred_control_stream_flags;
2313 }
2314 
2315 VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
2316                                       enum pvr_sub_cmd_type type)
2317 {
2318    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2319    struct pvr_device *device = cmd_buffer->device;
2320    struct pvr_sub_cmd *sub_cmd;
2321    VkResult result;
2322 
2323    /* Check the current status of the buffer. */
2324    if (vk_command_buffer_has_error(&cmd_buffer->vk))
2325       return vk_command_buffer_get_record_result(&cmd_buffer->vk);
2326 
2327    pvr_cmd_buffer_update_barriers(cmd_buffer, type);
2328 
2329    /* TODO: Add proper support for joining consecutive event sub_cmd? */
2330    if (state->current_sub_cmd) {
2331       if (state->current_sub_cmd->type == type) {
2332          /* Continue adding to the current sub command. */
2333          return VK_SUCCESS;
2334       }
2335 
2336       /* End the current sub command. */
2337       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
2338       if (result != VK_SUCCESS)
2339          return result;
2340    }
2341 
2342    sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
2343                        sizeof(*sub_cmd),
2344                        8,
2345                        VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2346    if (!sub_cmd) {
2347       return vk_command_buffer_set_error(&cmd_buffer->vk,
2348                                          VK_ERROR_OUT_OF_HOST_MEMORY);
2349    }
2350 
2351    sub_cmd->type = type;
2352    sub_cmd->owned = true;
2353 
2354    switch (type) {
2355    case PVR_SUB_CMD_TYPE_GRAPHICS:
2356       sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2357       sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2358       sub_cmd->gfx.modifies_depth = false;
2359       sub_cmd->gfx.modifies_stencil = false;
2360       sub_cmd->gfx.max_tiles_in_flight =
2361          PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
2362                                isp_max_tiles_in_flight,
2363                                1);
2364       sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
2365       sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
2366       sub_cmd->gfx.empty_cmd = true;
2367 
2368       if (state->vis_test_enabled)
2369          sub_cmd->gfx.query_pool = state->query_pool;
2370 
2371       pvr_reset_graphics_dirty_state(cmd_buffer, true);
2372 
2373       if (pvr_cmd_uses_deferred_cs_cmds(cmd_buffer)) {
2374          pvr_csb_init(device,
2375                       PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED,
2376                       &sub_cmd->gfx.control_stream);
2377       } else {
2378          pvr_csb_init(device,
2379                       PVR_CMD_STREAM_TYPE_GRAPHICS,
2380                       &sub_cmd->gfx.control_stream);
2381       }
2382 
2383       util_dynarray_init(&sub_cmd->gfx.sec_query_indices, NULL);
2384       break;
2385 
2386    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2387    case PVR_SUB_CMD_TYPE_COMPUTE:
2388       pvr_csb_init(device,
2389                    PVR_CMD_STREAM_TYPE_COMPUTE,
2390                    &sub_cmd->compute.control_stream);
2391       break;
2392 
2393    case PVR_SUB_CMD_TYPE_TRANSFER:
2394       sub_cmd->transfer.transfer_cmds = &sub_cmd->transfer.transfer_cmds_priv;
2395       list_inithead(sub_cmd->transfer.transfer_cmds);
2396       break;
2397 
2398    case PVR_SUB_CMD_TYPE_EVENT:
2399       break;
2400 
2401    default:
2402       unreachable("Unsupported sub-command type");
2403    }
2404 
2405    list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
2406    state->current_sub_cmd = sub_cmd;
2407 
2408    return VK_SUCCESS;
2409 }
2410 
2411 VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
2412                                   struct pvr_winsys_heap *heap,
2413                                   uint64_t size,
2414                                   struct pvr_suballoc_bo **const pvr_bo_out)
2415 {
2416    const uint32_t cache_line_size =
2417       rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
2418    struct pvr_suballoc_bo *suballoc_bo;
2419    struct pvr_suballocator *allocator;
2420    VkResult result;
2421 
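   /* Descriptive note (added): pick the suballocator backing the requested
    * heap; the suballocation is cache-line aligned and tracked on the command
    * buffer's BO list below.
    */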
2422    if (heap == cmd_buffer->device->heaps.general_heap)
2423       allocator = &cmd_buffer->device->suballoc_general;
2424    else if (heap == cmd_buffer->device->heaps.pds_heap)
2425       allocator = &cmd_buffer->device->suballoc_pds;
2426    else if (heap == cmd_buffer->device->heaps.transfer_frag_heap)
2427       allocator = &cmd_buffer->device->suballoc_transfer;
2428    else if (heap == cmd_buffer->device->heaps.usc_heap)
2429       allocator = &cmd_buffer->device->suballoc_usc;
2430    else
2431       unreachable("Unknown heap type");
2432 
2433    result =
2434       pvr_bo_suballoc(allocator, size, cache_line_size, false, &suballoc_bo);
2435    if (result != VK_SUCCESS)
2436       return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2437 
2438    list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
2439 
2440    *pvr_bo_out = suballoc_bo;
2441 
2442    return VK_SUCCESS;
2443 }
2444 
2445 static void pvr_cmd_bind_compute_pipeline(
2446    const struct pvr_compute_pipeline *const compute_pipeline,
2447    struct pvr_cmd_buffer *const cmd_buffer)
2448 {
2449    cmd_buffer->state.compute_pipeline = compute_pipeline;
2450    cmd_buffer->state.dirty.compute_pipeline_binding = true;
2451 }
2452 
2453 static void pvr_cmd_bind_graphics_pipeline(
2454    const struct pvr_graphics_pipeline *const gfx_pipeline,
2455    struct pvr_cmd_buffer *const cmd_buffer)
2456 {
2457    cmd_buffer->state.gfx_pipeline = gfx_pipeline;
2458    cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2459 
2460    vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
2461                                      &gfx_pipeline->dynamic_state);
2462 }
2463 
2464 void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
2465                          VkPipelineBindPoint pipelineBindPoint,
2466                          VkPipeline _pipeline)
2467 {
2468    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2469    PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2470 
2471    switch (pipelineBindPoint) {
2472    case VK_PIPELINE_BIND_POINT_COMPUTE:
2473       pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
2474                                     cmd_buffer);
2475       break;
2476 
2477    case VK_PIPELINE_BIND_POINT_GRAPHICS:
2478       pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
2479                                      cmd_buffer);
2480       break;
2481 
2482    default:
2483       unreachable("Invalid bind point.");
2484       break;
2485    }
2486 }
2487 
2488 #if MESA_DEBUG
2489 static void check_viewport_quirk_70165(const struct pvr_device *device,
2490                                        const VkViewport *pViewport)
2491 {
2492    const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
2493    float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
2494    float min_screen_space_value, max_screen_space_value;
2495    float sign_to_unsigned_offset, fixed_point_max;
2496    float guardband_width, guardband_height;
2497 
2498    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
2499       /* Max representable value in 13.4 fixed point format.
2500        * Round-down to avoid precision issues.
2501        * Calculated as (2 ** 13) - 2*(2 ** -4)
2502        */
2503       fixed_point_max = 8192.0f - 2.0f / 16.0f;
2504 
2505       if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
2506          if (pViewport->width <= 4096 && pViewport->height <= 4096) {
2507             guardband_width = pViewport->width / 4.0f;
2508             guardband_height = pViewport->height / 4.0f;
2509 
2510             /* 2k of the range is negative */
2511             sign_to_unsigned_offset = 2048.0f;
2512          } else {
2513             guardband_width = 0.0f;
2514             guardband_height = 0.0f;
2515 
2516             /* For > 4k renders, the entire range is positive */
2517             sign_to_unsigned_offset = 0.0f;
2518          }
2519       } else {
2520          guardband_width = pViewport->width / 4.0f;
2521          guardband_height = pViewport->height / 4.0f;
2522 
2523          /* 2k of the range is negative */
2524          sign_to_unsigned_offset = 2048.0f;
2525       }
2526    } else {
2527       /* Max representable value in 16.8 fixed point format
2528        * Calculated as (2 ** 16) - (2 ** -8)
2529        */
2530       fixed_point_max = 65535.99609375f;
2531       guardband_width = pViewport->width / 4.0f;
2532       guardband_height = pViewport->height / 4.0f;
2533 
2534       /* 4k/20k of the range is negative */
2535       sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
2536    }
2537 
2538    min_screen_space_value = -sign_to_unsigned_offset;
2539    max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;
2540 
2541    min_vertex_x = pViewport->x - guardband_width;
2542    max_vertex_x = pViewport->x + pViewport->width + guardband_width;
2543    min_vertex_y = pViewport->y - guardband_height;
2544    max_vertex_y = pViewport->y + pViewport->height + guardband_height;
2545    if (min_vertex_x < min_screen_space_value ||
2546        max_vertex_x > max_screen_space_value ||
2547        min_vertex_y < min_screen_space_value ||
2548        max_vertex_y > max_screen_space_value) {
2549       mesa_logw("Viewport is affected by BRN70165, geometry outside "
2550                 "the viewport could be corrupted");
2551    }
2552 }
2553 #endif
2554 
2555 void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
2556                         uint32_t firstViewport,
2557                         uint32_t viewportCount,
2558                         const VkViewport *pViewports)
2559 {
2560    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2561    const uint32_t total_count = firstViewport + viewportCount;
2562 
2563    assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
2564    assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
2565 
2566    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2567 
2568 #if MESA_DEBUG
2569    if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
2570       for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
2571          check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
2572       }
2573    }
2574 #endif
2575 
2576    vk_common_CmdSetViewport(commandBuffer,
2577                             firstViewport,
2578                             viewportCount,
2579                             pViewports);
2580 }
2581 
2582 void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2583                            float minDepthBounds,
2584                            float maxDepthBounds)
2585 {
2586    mesa_logd("No support for depth bounds testing.");
2587 }
2588 
2589 void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2590                                VkPipelineBindPoint pipelineBindPoint,
2591                                VkPipelineLayout _layout,
2592                                uint32_t firstSet,
2593                                uint32_t descriptorSetCount,
2594                                const VkDescriptorSet *pDescriptorSets,
2595                                uint32_t dynamicOffsetCount,
2596                                const uint32_t *pDynamicOffsets)
2597 {
2598    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2599    struct pvr_descriptor_state *descriptor_state;
2600 
2601    assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS);
2602 
2603    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2604 
2605    switch (pipelineBindPoint) {
2606    case VK_PIPELINE_BIND_POINT_GRAPHICS:
2607    case VK_PIPELINE_BIND_POINT_COMPUTE:
2608       break;
2609 
2610    default:
2611       unreachable("Unsupported bind point.");
2612       break;
2613    }
2614 
2615    if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2616       descriptor_state = &cmd_buffer->state.gfx_desc_state;
2617       cmd_buffer->state.dirty.gfx_desc_dirty = true;
2618    } else {
2619       descriptor_state = &cmd_buffer->state.compute_desc_state;
2620       cmd_buffer->state.dirty.compute_desc_dirty = true;
2621    }
2622 
2623    for (uint32_t i = 0; i < descriptorSetCount; i++) {
2624       PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]);
2625       uint32_t index = firstSet + i;
2626 
2627       if (descriptor_state->descriptor_sets[index] != set) {
2628          descriptor_state->descriptor_sets[index] = set;
2629          descriptor_state->valid_mask |= (1u << index);
2630       }
2631    }
2632 
2633    if (dynamicOffsetCount > 0) {
2634       PVR_FROM_HANDLE(pvr_pipeline_layout, pipeline_layout, _layout);
2635       uint32_t set_offset = 0;
2636 
2637       for (uint32_t set = 0; set < firstSet; set++)
2638          set_offset += pipeline_layout->set_layout[set]->dynamic_buffer_count;
2639 
2640       assert(set_offset + dynamicOffsetCount <=
2641              ARRAY_SIZE(descriptor_state->dynamic_offsets));
2642 
2643       /* From the Vulkan 1.3.238 spec:
2644        *
2645        *    "If any of the sets being bound include dynamic uniform or storage
2646        *    buffers, then pDynamicOffsets includes one element for each array
2647        *    element in each dynamic descriptor type binding in each set."
2648        *
2649        */
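      /* Worked example with hypothetical layouts (not taken from the code
       * above): with firstSet = 2, and set layouts 0 and 1 declaring 1 and 2
       * dynamic buffers respectively, set_offset accumulates to 3, so the
       * incoming pDynamicOffsets[] values land in dynamic_offsets[3] onwards.
       */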
2650       for (uint32_t i = 0; i < dynamicOffsetCount; i++)
2651          descriptor_state->dynamic_offsets[set_offset + i] = pDynamicOffsets[i];
2652    }
2653 }
2654 
2655 void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2656                               uint32_t firstBinding,
2657                               uint32_t bindingCount,
2658                               const VkBuffer *pBuffers,
2659                               const VkDeviceSize *pOffsets)
2660 {
2661    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2662    struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;
2663 
2664    /* We have to defer setting up the vertex buffers since we need the
2665     * buffer stride from the pipeline.
2666     */
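   /* The bindings recorded here are read back by pvr_setup_vertex_buffers()
    * further down in this file, which writes the per-attribute device
    * addresses into the PDS data section once the pipeline is bound and the
    * stride is known.
    */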
2667 
2668    assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
2669           bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);
2670 
2671    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2672 
2673    for (uint32_t i = 0; i < bindingCount; i++) {
2674       vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]);
2675       vb[firstBinding + i].offset = pOffsets[i];
2676    }
2677 
2678    cmd_buffer->state.dirty.vertex_bindings = true;
2679 }
2680 
2681 void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2682                             VkBuffer buffer,
2683                             VkDeviceSize offset,
2684                             VkIndexType indexType)
2685 {
2686    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2687    PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
2688    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2689 
2690    assert(offset < index_buffer->vk.size);
2691    assert(indexType == VK_INDEX_TYPE_UINT32 ||
2692           indexType == VK_INDEX_TYPE_UINT16 ||
2693           indexType == VK_INDEX_TYPE_UINT8_KHR);
2694 
2695    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2696 
2697    state->index_buffer_binding.buffer = index_buffer;
2698    state->index_buffer_binding.offset = offset;
2699    state->index_buffer_binding.type = indexType;
2700    state->dirty.index_buffer_binding = true;
2701 }
2702 
2703 void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
2704                           VkPipelineLayout layout,
2705                           VkShaderStageFlags stageFlags,
2706                           uint32_t offset,
2707                           uint32_t size,
2708                           const void *pValues)
2709 {
2710 #if MESA_DEBUG
2711    const uint64_t ending = (uint64_t)offset + (uint64_t)size;
2712 #endif
2713 
2714    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2715    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2716 
2717    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2718 
2719    pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);
2720 
2721    memcpy(&state->push_constants.data[offset], pValues, size);
2722 
2723    state->push_constants.dirty_stages |= stageFlags;
2724    state->push_constants.uploaded = false;
2725 }
2726 
2727 static VkResult
2728 pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer,
2729                                  const struct pvr_render_pass *pass,
2730                                  const struct pvr_framebuffer *framebuffer)
2731 {
2732    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2733    struct pvr_render_pass_info *info = &state->render_pass_info;
2734 
2735    assert(pass->attachment_count == framebuffer->attachment_count);
2736 
2737    /* Free any previously allocated attachments. */
2738    vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);
2739 
2740    if (pass->attachment_count == 0) {
2741       info->attachments = NULL;
2742       return VK_SUCCESS;
2743    }
2744 
2745    info->attachments =
2746       vk_zalloc(&cmd_buffer->vk.pool->alloc,
2747                 pass->attachment_count * sizeof(*info->attachments),
2748                 8,
2749                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2750    if (!info->attachments) {
2751       return vk_command_buffer_set_error(&cmd_buffer->vk,
2752                                          VK_ERROR_OUT_OF_HOST_MEMORY);
2753    }
2754 
2755    for (uint32_t i = 0; i < pass->attachment_count; i++)
2756       info->attachments[i] = framebuffer->attachments[i];
2757 
2758    return VK_SUCCESS;
2759 }
2760 
2761 static VkResult pvr_init_render_targets(struct pvr_device *device,
2762                                         struct pvr_render_pass *pass,
2763                                         struct pvr_framebuffer *framebuffer)
2764 {
2765    for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
2766       struct pvr_render_target *render_target =
2767          pvr_get_render_target(pass, framebuffer, i);
2768 
2769       pthread_mutex_lock(&render_target->mutex);
2770 
2771       if (!render_target->valid) {
2772          const struct pvr_renderpass_hwsetup_render *hw_render =
2773             &pass->hw_setup->renders[i];
2774          VkResult result;
2775 
2776          result = pvr_render_target_dataset_create(device,
2777                                                    framebuffer->width,
2778                                                    framebuffer->height,
2779                                                    hw_render->sample_count,
2780                                                    framebuffer->layers,
2781                                                    &render_target->rt_dataset);
2782          if (result != VK_SUCCESS) {
2783             pthread_mutex_unlock(&render_target->mutex);
2784             return result;
2785          }
2786 
2787          render_target->valid = true;
2788       }
2789 
2790       pthread_mutex_unlock(&render_target->mutex);
2791    }
2792 
2793    return VK_SUCCESS;
2794 }
2795 
2796 const struct pvr_renderpass_hwsetup_subpass *
2797 pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass)
2798 {
2799    const struct pvr_renderpass_hw_map *map =
2800       &pass->hw_setup->subpass_map[subpass];
2801 
2802    return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
2803 }
2804 
2805 static void pvr_perform_start_of_render_attachment_clear(
2806    struct pvr_cmd_buffer *cmd_buffer,
2807    const struct pvr_framebuffer *framebuffer,
2808    uint32_t index,
2809    bool is_depth_stencil,
2810    uint32_t *index_list_clear_mask)
2811 {
2812    ASSERTED static const VkImageAspectFlags dsc_aspect_flags =
2813       VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT |
2814       VK_IMAGE_ASPECT_COLOR_BIT;
2815    struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2816    const struct pvr_render_pass *pass = info->pass;
2817    const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2818    const struct pvr_renderpass_hwsetup_render *hw_render =
2819       &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2820    VkImageAspectFlags image_aspect;
2821    struct pvr_image_view *iview;
2822    uint32_t view_idx;
2823 
2824    if (is_depth_stencil) {
2825       bool stencil_clear;
2826       bool depth_clear;
2827       bool is_stencil;
2828       bool is_depth;
2829 
2830       assert(hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED);
2831       assert(index == 0);
2832 
2833       view_idx = hw_render->ds_attach_idx;
2834 
2835       is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format);
2836       is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format);
2837       depth_clear = hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2838       stencil_clear = hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2839 
2840       /* Attempt to clear the DS attachment. Do not erroneously skip an
2841        * attachment that has no depth clear but does have a stencil clear.
2842        */
2843       /* i.e. return unless (is_depth && depth_clear) || (is_stencil && stencil_clear). */
2844       if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
2845          return;
2846    } else if (hw_render->color_init[index].op != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2847       return;
2848    } else {
2849       view_idx = hw_render->color_init[index].index;
2850    }
2851 
2852    iview = info->attachments[view_idx];
2853 
2854    /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
2855     * were doing the same check (even if it's just an assert) to determine if a
2856     * clear is needed.
2857     */
2858    /* If this is single-layer fullscreen, we already do the clears in
2859     * pvr_sub_cmd_gfx_job_init().
2860     */
2861    if (pvr_is_render_area_tile_aligned(cmd_buffer, iview) &&
2862        framebuffer->layers == 1) {
2863       return;
2864    }
2865 
2866    image_aspect = vk_format_aspects(pass->attachments[view_idx].vk_format);
2867    assert((image_aspect & ~dsc_aspect_flags) == 0);
2868 
2869    if (image_aspect & VK_IMAGE_ASPECT_DEPTH_BIT &&
2870        hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2871       image_aspect &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
2872    }
2873 
2874    if (image_aspect & VK_IMAGE_ASPECT_STENCIL_BIT &&
2875        hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2876       image_aspect &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
2877    }
2878 
2879    if (image_aspect != VK_IMAGE_ASPECT_NONE) {
2880       VkClearAttachment clear_attachment = {
2881          .aspectMask = image_aspect,
2882          .colorAttachment = index,
2883          .clearValue = info->clear_values[view_idx],
2884       };
2885       VkClearRect rect = {
2886          .rect = info->render_area,
2887          .baseArrayLayer = 0,
2888          .layerCount = info->framebuffer->layers,
2889       };
2890 
2891       assert(view_idx < info->clear_value_count);
2892 
2893       pvr_clear_attachments_render_init(cmd_buffer, &clear_attachment, &rect);
2894 
2895       *index_list_clear_mask |= (1 << index);
2896    }
2897 }
2898 
2899 static void
2900 pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer)
2901 {
2902    struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2903    const struct pvr_framebuffer *framebuffer = info->framebuffer;
2904    const struct pvr_render_pass *pass = info->pass;
2905    const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2906    const struct pvr_renderpass_hwsetup_render *hw_render =
2907       &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2908 
2909    /* Mask of attachments that get cleared using index lists rather than
2910     * the background object.
2911     */
2912    uint32_t index_list_clear_mask = 0;
2913 
2914    for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
2915       pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2916                                                    framebuffer,
2917                                                    i,
2918                                                    false,
2919                                                    &index_list_clear_mask);
2920    }
2921 
2922    info->enable_bg_tag = !!hw_render->color_init_count;
2923 
2924    /* If we're not using index lists for all clears/loads then we need to
2925     * run the background object on empty tiles.
2926     */
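   /* Example with made-up values: with color_init_count = 3 a full mask would
    * be 0b111, so an index_list_clear_mask of 0b011 leaves one attachment to
    * the background object and process_empty_tiles is set below.
    */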
2927    if (hw_render->color_init_count &&
2928        index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
2929       info->process_empty_tiles = true;
2930    } else {
2931       info->process_empty_tiles = false;
2932    }
2933 
2934    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2935       uint32_t ds_index_list = 0;
2936 
2937       pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2938                                                    framebuffer,
2939                                                    0,
2940                                                    true,
2941                                                    &ds_index_list);
2942    }
2943 
2944    if (index_list_clear_mask)
2945       pvr_finishme("Add support for generating loadops shaders!");
2946 }
2947 
2948 static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
2949                                    struct pvr_sub_cmd_gfx *const sub_cmd)
2950 {
2951    const struct pvr_render_pass *pass = state->render_pass_info.pass;
2952    const struct pvr_renderpass_hwsetup_render *hw_render =
2953       &pass->hw_setup->renders[sub_cmd->hw_render_idx];
2954 
2955    if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2956       struct pvr_image_view **iviews = state->render_pass_info.attachments;
2957 
2958       state->depth_format = iviews[hw_render->ds_attach_idx]->vk.format;
2959    }
2960 }
2961 
2962 static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
2963 {
2964    for (uint32_t i = 0; i < hw_setup->render_count; i++) {
2965       struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
2966       uint32_t render_targets_count = hw_render->init_setup.num_render_targets;
2967 
2968       for (uint32_t j = 0;
2969            j < (hw_render->color_init_count * render_targets_count);
2970            j += render_targets_count) {
2971          for (uint32_t k = 0; k < hw_render->init_setup.num_render_targets;
2972               k++) {
2973             if (hw_render->color_init[j + k].op ==
2974                 VK_ATTACHMENT_LOAD_OP_CLEAR) {
2975                return true;
2976             }
2977          }
2978       }
2979       if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR ||
2980           hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR) {
2981          return true;
2982       }
2983    }
2984 
2985    return false;
2986 }
2987 
2988 static VkResult
2989 pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
2990                                 const VkRenderPassBeginInfo *pRenderPassBegin)
2991 {
2992    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2993 
2994    /* Free any previously allocated clear values. */
2995    vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);
2996 
2997    if (pRenderPassBegin->clearValueCount) {
2998       const size_t size = pRenderPassBegin->clearValueCount *
2999                           sizeof(*state->render_pass_info.clear_values);
3000 
3001       state->render_pass_info.clear_values =
3002          vk_zalloc(&cmd_buffer->vk.pool->alloc,
3003                    size,
3004                    8,
3005                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3006       if (!state->render_pass_info.clear_values) {
3007          return vk_command_buffer_set_error(&cmd_buffer->vk,
3008                                             VK_ERROR_OUT_OF_HOST_MEMORY);
3009       }
3010 
3011       memcpy(state->render_pass_info.clear_values,
3012              pRenderPassBegin->pClearValues,
3013              size);
3014    } else {
3015       state->render_pass_info.clear_values = NULL;
3016    }
3017 
3018    state->render_pass_info.clear_value_count =
3019       pRenderPassBegin->clearValueCount;
3020 
3021    return VK_SUCCESS;
3022 }
3023 
3024 /**
3025  * \brief Indicates whether to use the large or normal clear state words.
3026  *
3027  * If the current render area can fit within a quarter of the max framebuffer
3028  * that the device is capable of, we can use the normal clear state words,
3029  * otherwise the large clear state words are needed.
3030  *
3031  * The requirement of a quarter of the max framebuffer comes from the index
3032  * count used in the normal clear state words and the vertices uploaded at
3033  * device creation.
3034  *
3035  * \param[in] cmd_buffer The command buffer for the clear.
3036  * \return true if large clear state words are required.
3037  */
3038 static bool
3039 pvr_is_large_clear_required(const struct pvr_cmd_buffer *const cmd_buffer)
3040 {
3041    const struct pvr_device_info *const dev_info =
3042       &cmd_buffer->device->pdevice->dev_info;
3043    const VkRect2D render_area = cmd_buffer->state.render_pass_info.render_area;
3044    const uint32_t vf_max_x = rogue_get_param_vf_max_x(dev_info);
3045    const uint32_t vf_max_y = rogue_get_param_vf_max_x(dev_info);
3046 
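   /* Illustrative numbers only (the real limits come from the VF max
    * parameters above): if vf_max_x were 16384, render areas up to 8191x8191
    * would fit the normal clear state words and anything larger would need
    * the large ones.
    */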
3047    return (render_area.extent.width > (vf_max_x / 2) - 1) ||
3048           (render_area.extent.height > (vf_max_y / 2) - 1);
3049 }
3050 
3051 static void pvr_emit_clear_words(struct pvr_cmd_buffer *const cmd_buffer,
3052                                  struct pvr_sub_cmd_gfx *const sub_cmd)
3053 {
3054    struct pvr_device *device = cmd_buffer->device;
3055    struct pvr_csb *csb = &sub_cmd->control_stream;
3056    uint32_t vdm_state_size_in_dw;
3057    const uint32_t *vdm_state;
3058    uint32_t *stream;
3059 
3060    vdm_state_size_in_dw =
3061       pvr_clear_vdm_state_get_size_in_dw(&device->pdevice->dev_info, 1);
3062 
3063    pvr_csb_set_relocation_mark(csb);
3064 
3065    stream = pvr_csb_alloc_dwords(csb, vdm_state_size_in_dw);
3066    if (!stream) {
3067       pvr_cmd_buffer_set_error_unwarned(cmd_buffer, csb->status);
3068       return;
3069    }
3070 
3071    if (pvr_is_large_clear_required(cmd_buffer))
3072       vdm_state = device->static_clear_state.large_clear_vdm_words;
3073    else
3074       vdm_state = device->static_clear_state.vdm_words;
3075 
3076    memcpy(stream, vdm_state, PVR_DW_TO_BYTES(vdm_state_size_in_dw));
3077 
3078    pvr_csb_clear_relocation_mark(csb);
3079 }
3080 
3081 static VkResult pvr_cs_write_load_op(struct pvr_cmd_buffer *cmd_buffer,
3082                                      struct pvr_sub_cmd_gfx *sub_cmd,
3083                                      struct pvr_load_op *load_op,
3084                                      uint32_t isp_userpass)
3085 {
3086    const struct pvr_device *device = cmd_buffer->device;
3087    struct pvr_static_clear_ppp_template template =
3088       device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
3089    uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT];
3090    struct pvr_pds_upload shareds_update_program;
3091    struct pvr_suballoc_bo *pvr_bo;
3092    VkResult result;
3093 
3094    result = pvr_load_op_data_create_and_upload(cmd_buffer,
3095                                                load_op,
3096                                                &shareds_update_program);
3097    if (result != VK_SUCCESS)
3098       return result;
3099 
3100    template.config.ispctl.upass = isp_userpass;
3101 
3102    /* It might look odd that we aren't specifying the code segment's address
3103     * anywhere. This is because the hardware always assumes that the data
3104     * size is 2 128-bit words and that the code segment starts right after that.
3105     */
3106    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
3107                  TA_STATE_PDS_SHADERBASE,
3108                  shaderbase) {
3109       shaderbase.addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
3110    }
3111 
3112    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXUNICODEBASE],
3113                  TA_STATE_PDS_TEXUNICODEBASE,
3114                  texunicodebase) {
3115       texunicodebase.addr =
3116          PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
3117    }
3118 
3119    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO1],
3120                  TA_STATE_PDS_SIZEINFO1,
3121                  sizeinfo1) {
3122       /* Dummy coefficient loading program. */
3123       sizeinfo1.pds_varyingsize = 0;
3124 
3125       sizeinfo1.pds_texturestatesize = DIV_ROUND_UP(
3126          shareds_update_program.data_size,
3127          PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE));
3128 
3129       sizeinfo1.pds_tempsize =
3130          DIV_ROUND_UP(load_op->temps_count,
3131                       PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
3132    }
3133 
3134    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO2],
3135                  TA_STATE_PDS_SIZEINFO2,
3136                  sizeinfo2) {
3137       sizeinfo2.usc_sharedsize =
3138          DIV_ROUND_UP(load_op->const_shareds_count,
3139                       PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
3140    }
3141 
3142    /* Dummy coefficient loading program. */
3143    pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_VARYINGBASE] = 0;
3144 
3145    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXTUREDATABASE],
3146                  TA_STATE_PDS_TEXTUREDATABASE,
3147                  texturedatabase) {
3148       texturedatabase.addr = PVR_DEV_ADDR(shareds_update_program.data_offset);
3149    }
3150 
3151    template.config.pds_state = &pds_state;
3152 
3153    pvr_emit_ppp_from_template(&sub_cmd->control_stream, &template, &pvr_bo);
3154    list_add(&pvr_bo->link, &cmd_buffer->bo_list);
3155 
3156    pvr_emit_clear_words(cmd_buffer, sub_cmd);
3157 
3158    pvr_reset_graphics_dirty_state(cmd_buffer, false);
3159 
3160    return VK_SUCCESS;
3161 }
3162 
3163 void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
3164                              const VkRenderPassBeginInfo *pRenderPassBeginInfo,
3165                              const VkSubpassBeginInfo *pSubpassBeginInfo)
3166 {
3167    PVR_FROM_HANDLE(pvr_framebuffer,
3168                    framebuffer,
3169                    pRenderPassBeginInfo->framebuffer);
3170    PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
3171    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3172    const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
3173    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3174    VkResult result;
3175 
3176    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
3177 
3178    assert(!state->render_pass_info.pass);
3179    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3180 
3181    /* FIXME: Create a separate function for everything using pass->subpasses,
3182     * look at cmd_buffer_begin_subpass() for example. */
3183    state->render_pass_info.pass = pass;
3184    state->render_pass_info.framebuffer = framebuffer;
3185    state->render_pass_info.subpass_idx = 0;
3186    state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
3187    state->render_pass_info.current_hw_subpass = 0;
3188    state->render_pass_info.pipeline_bind_point =
3189       pass->subpasses[0].pipeline_bind_point;
3190    state->render_pass_info.isp_userpass = pass->subpasses[0].isp_userpass;
3191    state->dirty.isp_userpass = true;
3192 
3193    result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer);
3194    if (result != VK_SUCCESS)
3195       return;
3196 
3197    result = pvr_init_render_targets(cmd_buffer->device, pass, framebuffer);
3198    if (result != VK_SUCCESS) {
3199       pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
3200       return;
3201    }
3202 
3203    result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
3204    if (result != VK_SUCCESS)
3205       return;
3206 
3207    assert(pass->subpasses[0].pipeline_bind_point ==
3208           VK_PIPELINE_BIND_POINT_GRAPHICS);
3209 
3210    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3211    if (result != VK_SUCCESS)
3212       return;
3213 
3214    /* Run subpass 0 "soft" background object after the actual background
3215     * object.
3216     */
3217    hw_subpass = pvr_get_hw_subpass(pass, 0);
3218    if (hw_subpass->load_op) {
3219       result = pvr_cs_write_load_op(cmd_buffer,
3220                                     &cmd_buffer->state.current_sub_cmd->gfx,
3221                                     hw_subpass->load_op,
3222                                     0);
3223       if (result != VK_SUCCESS)
3224          return;
3225    }
3226 
3227    pvr_perform_start_of_render_clears(cmd_buffer);
3228    pvr_stash_depth_format(&cmd_buffer->state,
3229                           &cmd_buffer->state.current_sub_cmd->gfx);
3230 }
3231 
3232 VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer,
3233                                 const VkCommandBufferBeginInfo *pBeginInfo)
3234 {
3235    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3236    struct pvr_cmd_buffer_state *state;
3237    VkResult result;
3238 
3239    vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
3240 
3241    cmd_buffer->usage_flags = pBeginInfo->flags;
3242    state = &cmd_buffer->state;
3243 
3244    /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
3245     * primary level command buffers.
3246     *
3247     * From the Vulkan 1.0 spec:
3248     *
3249     *    VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
3250     *    secondary command buffer is considered to be entirely inside a render
3251     *    pass. If this is a primary command buffer, then this bit is ignored.
3252     */
3253    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3254       cmd_buffer->usage_flags &=
3255          ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
3256    }
3257 
3258    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3259       if (cmd_buffer->usage_flags &
3260           VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3261          const VkCommandBufferInheritanceInfo *inheritance_info =
3262             pBeginInfo->pInheritanceInfo;
3263          struct pvr_render_pass *pass;
3264 
3265          pass = pvr_render_pass_from_handle(inheritance_info->renderPass);
3266          state->render_pass_info.pass = pass;
3267          state->render_pass_info.framebuffer =
3268             pvr_framebuffer_from_handle(inheritance_info->framebuffer);
3269          state->render_pass_info.subpass_idx = inheritance_info->subpass;
3270          state->render_pass_info.isp_userpass =
3271             pass->subpasses[inheritance_info->subpass].isp_userpass;
3272 
3273          result =
3274             pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3275          if (result != VK_SUCCESS)
3276             return result;
3277 
3278          state->vis_test_enabled = inheritance_info->occlusionQueryEnable;
3279       }
3280 
3281       state->dirty.isp_userpass = true;
3282    }
3283 
3284    util_dynarray_init(&state->query_indices, NULL);
3285 
3286    memset(state->barriers_needed,
3287           0xFF,
3288           sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));
3289 
3290    return VK_SUCCESS;
3291 }
3292 
3293 VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer,
3294                                          struct pvr_transfer_cmd *transfer_cmd)
3295 {
3296    struct pvr_sub_cmd_transfer *sub_cmd;
3297    VkResult result;
3298 
3299    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
3300    if (result != VK_SUCCESS)
3301       return result;
3302 
3303    sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;
3304 
3305    list_addtail(&transfer_cmd->link, sub_cmd->transfer_cmds);
3306 
3307    return VK_SUCCESS;
3308 }
3309 
3310 static VkResult
3311 pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
3312                          const struct pvr_graphics_pipeline *const gfx_pipeline)
3313 {
3314    const struct pvr_vertex_shader_state *const vertex_state =
3315       &gfx_pipeline->shader_state.vertex;
3316    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3317    const struct pvr_pds_info *const pds_info = state->pds_shader.info;
3318    struct pvr_suballoc_bo *pvr_bo;
3319    const uint8_t *entries;
3320    uint32_t *dword_buffer;
3321    uint64_t *qword_buffer;
3322    VkResult result;
3323 
3324    result =
3325       pvr_cmd_buffer_alloc_mem(cmd_buffer,
3326                                cmd_buffer->device->heaps.pds_heap,
3327                                PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3328                                &pvr_bo);
3329    if (result != VK_SUCCESS)
3330       return result;
3331 
3332    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3333    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
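   /* Both pointers map the same PDS data suballocation; the entries below
    * write through the 32-bit or 64-bit view depending on the width of each
    * map entry's payload.
    */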
3334 
3335    entries = (uint8_t *)pds_info->entries;
3336 
3337    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3338       const struct pvr_const_map_entry *const entry_header =
3339          (struct pvr_const_map_entry *)entries;
3340 
3341       switch (entry_header->type) {
3342       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3343          const struct pvr_const_map_entry_literal32 *const literal =
3344             (struct pvr_const_map_entry_literal32 *)entries;
3345 
3346          PVR_WRITE(dword_buffer,
3347                    literal->literal_value,
3348                    literal->const_offset,
3349                    pds_info->data_size_in_dwords);
3350 
3351          entries += sizeof(*literal);
3352          break;
3353       }
3354 
3355       case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
3356          const struct pvr_const_map_entry_doutu_address *const doutu_addr =
3357             (struct pvr_const_map_entry_doutu_address *)entries;
3358          const pvr_dev_addr_t exec_addr =
3359             PVR_DEV_ADDR_OFFSET(vertex_state->bo->dev_addr,
3360                                 vertex_state->entry_offset);
3361          uint64_t addr = 0ULL;
3362 
3363          pvr_set_usc_execution_address64(&addr, exec_addr.addr);
3364 
3365          PVR_WRITE(qword_buffer,
3366                    addr | doutu_addr->doutu_control,
3367                    doutu_addr->const_offset,
3368                    pds_info->data_size_in_dwords);
3369 
3370          entries += sizeof(*doutu_addr);
3371          break;
3372       }
3373 
3374       case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
3375          const struct pvr_const_map_entry_base_instance *const base_instance =
3376             (struct pvr_const_map_entry_base_instance *)entries;
3377 
3378          PVR_WRITE(dword_buffer,
3379                    state->draw_state.base_instance,
3380                    base_instance->const_offset,
3381                    pds_info->data_size_in_dwords);
3382 
3383          entries += sizeof(*base_instance);
3384          break;
3385       }
3386 
3387       case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_VERTEX: {
3388          const struct pvr_const_map_entry_base_instance *const base_instance =
3389             (struct pvr_const_map_entry_base_instance *)entries;
3390 
3391          PVR_WRITE(dword_buffer,
3392                    state->draw_state.base_vertex,
3393                    base_instance->const_offset,
3394                    pds_info->data_size_in_dwords);
3395 
3396          entries += sizeof(*base_instance);
3397          break;
3398       }
3399 
3400       case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
3401          const struct pvr_const_map_entry_vertex_attribute_address
3402             *const attribute =
3403                (struct pvr_const_map_entry_vertex_attribute_address *)entries;
3404          const struct pvr_vertex_binding *const binding =
3405             &state->vertex_bindings[attribute->binding_index];
3406          /* In relation to the Vulkan spec. 22.4. Vertex Input Address
3407           * Calculation:
3408           *    Adding binding->offset corresponds to calculating the
3409           *    `bufferBindingAddress`. Adding attribute->offset corresponds to
3410           *    adding the `attribDesc.offset`. The `effectiveVertexOffset` is
3411           *    taken care by the PDS program itself with a DDMAD which will
3412           *    multiply the vertex/instance idx with the binding's stride and
3413           *    add that to the address provided here.
3414           */
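         /* Hypothetical example: binding->offset = 256 and attribute->offset
          * = 12 give a base address of buffer + 268 here; for vertex index v
          * and stride s the PDS program then reads from buffer + 268 + v * s.
          */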
3415          const pvr_dev_addr_t addr =
3416             PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3417                                 binding->offset + attribute->offset);
3418 
3419          PVR_WRITE(qword_buffer,
3420                    addr.addr,
3421                    attribute->const_offset,
3422                    pds_info->data_size_in_dwords);
3423 
3424          entries += sizeof(*attribute);
3425          break;
3426       }
3427 
3428       case PVR_PDS_CONST_MAP_ENTRY_TYPE_ROBUST_VERTEX_ATTRIBUTE_ADDRESS: {
3429          const struct pvr_const_map_entry_robust_vertex_attribute_address
3430             *const attribute =
3431                (struct pvr_const_map_entry_robust_vertex_attribute_address *)
3432                   entries;
3433          const struct pvr_vertex_binding *const binding =
3434             &state->vertex_bindings[attribute->binding_index];
3435          pvr_dev_addr_t addr;
3436 
3437          if (binding->buffer->vk.size <
3438              (attribute->offset + attribute->component_size_in_bytes)) {
3439             /* Replace with load from robustness buffer when no attribute is in
3440              * range
3441              */
3442             addr = PVR_DEV_ADDR_OFFSET(
3443                cmd_buffer->device->robustness_buffer->vma->dev_addr,
3444                attribute->robustness_buffer_offset);
3445          } else {
3446             addr = PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3447                                        binding->offset + attribute->offset);
3448          }
3449 
3450          PVR_WRITE(qword_buffer,
3451                    addr.addr,
3452                    attribute->const_offset,
3453                    pds_info->data_size_in_dwords);
3454 
3455          entries += sizeof(*attribute);
3456          break;
3457       }
3458 
3459       case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX: {
3460          const struct pvr_const_map_entry_vertex_attribute_max_index *attribute =
3461             (struct pvr_const_map_entry_vertex_attribute_max_index *)entries;
3462          const struct pvr_vertex_binding *const binding =
3463             &state->vertex_bindings[attribute->binding_index];
3464          const uint64_t bound_size = binding->buffer->vk.size - binding->offset;
3465          const uint32_t attribute_end =
3466             attribute->offset + attribute->component_size_in_bytes;
3467          uint32_t max_index;
3468 
3469          if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
3470                              pds_ddmadt)) {
3471             /* TODO: PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX
3472              * has the same define value as
3473              * PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTR_DDMADT_OOB_BUFFER_SIZE
3474              * so maybe we want to remove one of the defines or change the
3475              * values.
3476              */
3477             pvr_finishme("Unimplemented robust buffer access with DDMADT");
3478             assert(false);
3479          }
3480 
3481          /* If the stride is 0 then all attributes use the same single
3482           * element from the binding, so the maximum usable index is 0.
3483           */
3484          if (bound_size < attribute_end || attribute->stride == 0) {
3485             max_index = 0;
3486          } else {
3487             max_index = (uint32_t)(bound_size / attribute->stride) - 1;
3488 
3489             /* One final attribute still fits within the bound range. */
3490             if (bound_size % attribute->stride >= attribute_end)
3491                max_index++;
3492          }
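         /* Worked example with made-up values: bound_size = 100, stride = 16
          * and attribute_end = 12 give max_index = (100 / 16) - 1 = 5, and
          * since 100 % 16 = 4 is smaller than 12 no extra attribute fits, so
          * indices 0..5 stay in bounds.
          */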
3493 
3494          PVR_WRITE(dword_buffer,
3495                    max_index,
3496                    attribute->const_offset,
3497                    pds_info->data_size_in_dwords);
3498 
3499          entries += sizeof(*attribute);
3500          break;
3501       }
3502 
3503       default:
3504          unreachable("Unsupported data section map");
3505          break;
3506       }
3507    }
3508 
3509    state->pds_vertex_attrib_offset =
3510       pvr_bo->dev_addr.addr -
3511       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3512 
3513    return VK_SUCCESS;
3514 }
3515 
3516 static VkResult pvr_setup_descriptor_mappings_old(
3517    struct pvr_cmd_buffer *const cmd_buffer,
3518    enum pvr_stage_allocation stage,
3519    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
3520    const pvr_dev_addr_t *const num_worgroups_buff_addr,
3521    uint32_t *const descriptor_data_offset_out)
3522 {
3523    const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
3524    const struct pvr_descriptor_state *desc_state;
3525    struct pvr_suballoc_bo *pvr_bo;
3526    const uint8_t *entries;
3527    uint32_t *dword_buffer;
3528    uint64_t *qword_buffer;
3529    VkResult result;
3530 
3531    if (!pds_info->data_size_in_dwords)
3532       return VK_SUCCESS;
3533 
3534    result =
3535       pvr_cmd_buffer_alloc_mem(cmd_buffer,
3536                                cmd_buffer->device->heaps.pds_heap,
3537                                PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3538                                &pvr_bo);
3539    if (result != VK_SUCCESS)
3540       return result;
3541 
3542    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3543    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3544 
3545    entries = (uint8_t *)pds_info->entries;
3546 
3547    switch (stage) {
3548    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3549    case PVR_STAGE_ALLOCATION_FRAGMENT:
3550       desc_state = &cmd_buffer->state.gfx_desc_state;
3551       break;
3552 
3553    case PVR_STAGE_ALLOCATION_COMPUTE:
3554       desc_state = &cmd_buffer->state.compute_desc_state;
3555       break;
3556 
3557    default:
3558       unreachable("Unsupported stage.");
3559       break;
3560    }
3561 
3562    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3563       const struct pvr_const_map_entry *const entry_header =
3564          (struct pvr_const_map_entry *)entries;
3565 
3566       switch (entry_header->type) {
3567       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3568          const struct pvr_const_map_entry_literal32 *const literal =
3569             (struct pvr_const_map_entry_literal32 *)entries;
3570 
3571          PVR_WRITE(dword_buffer,
3572                    literal->literal_value,
3573                    literal->const_offset,
3574                    pds_info->data_size_in_dwords);
3575 
3576          entries += sizeof(*literal);
3577          break;
3578       }
3579 
3580       case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: {
3581          const struct pvr_const_map_entry_constant_buffer *const_buffer_entry =
3582             (struct pvr_const_map_entry_constant_buffer *)entries;
3583          const uint32_t desc_set = const_buffer_entry->desc_set;
3584          const uint32_t binding = const_buffer_entry->binding;
3585          const struct pvr_descriptor_set *descriptor_set;
3586          const struct pvr_descriptor *descriptor;
3587          pvr_dev_addr_t buffer_addr;
3588 
3589          assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
3590          descriptor_set = desc_state->descriptor_sets[desc_set];
3591 
3592          /* TODO: Handle dynamic buffers. */
3593          descriptor = &descriptor_set->descriptors[binding];
3594          assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
3595 
3596          assert(descriptor->buffer_desc_range ==
3597                 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3598          assert(descriptor->buffer_whole_range ==
3599                 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3600 
3601          buffer_addr =
3602             PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr,
3603                                 const_buffer_entry->offset * sizeof(uint32_t));
3604 
3605          PVR_WRITE(qword_buffer,
3606                    buffer_addr.addr,
3607                    const_buffer_entry->const_offset,
3608                    pds_info->data_size_in_dwords);
3609 
3610          entries += sizeof(*const_buffer_entry);
3611          break;
3612       }
3613 
3614       case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
3615          const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
3616             (struct pvr_const_map_entry_descriptor_set *)entries;
3617          const uint32_t desc_set_num = desc_set_entry->descriptor_set;
3618          const struct pvr_descriptor_set *descriptor_set;
3619          pvr_dev_addr_t desc_set_addr;
3620          uint64_t desc_portion_offset;
3621 
3622          assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS);
3623 
3624          /* TODO: Remove this when the compiler provides us with usage info?
3625           */
3626          /* We skip DMAing unbound descriptor sets. */
3627          if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) {
3628             const struct pvr_const_map_entry_literal32 *literal;
3629             uint32_t zero_literal_value;
3630 
3631             /* The code segment contains a DOUT instruction so in the data
3632              * section we have to write a DOUTD_SRC0 and a DOUTD_SRC1.
3633              * We write 0 for DOUTD_SRC0 since we don't have a buffer to DMA.
3634              * We're expecting a LITERAL32 entry containing the value for
3635              * DOUTD_SRC1 next, so make sure we get it and write it with
3636              * BSIZE set to 0, disabling the DMA operation.
3637              * We don't want the LITERAL32 to be processed as normal,
3638              * otherwise we'd be DMAing from an address of 0.
3639              */
3640 
3641             entries += sizeof(*desc_set_entry);
3642             literal = (struct pvr_const_map_entry_literal32 *)entries;
3643 
3644             assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32);
3645 
3646             zero_literal_value =
3647                literal->literal_value &
3648                PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK;
3649 
3650             PVR_WRITE(qword_buffer,
3651                       UINT64_C(0),
3652                       desc_set_entry->const_offset,
3653                       pds_info->data_size_in_dwords);
3654 
3655             PVR_WRITE(dword_buffer,
3656                       zero_literal_value,
3657                       desc_set_entry->const_offset,
3658                       pds_info->data_size_in_dwords);
3659 
3660             entries += sizeof(*literal);
3661             i++;
3662             continue;
3663          }
3664 
3665          descriptor_set = desc_state->descriptor_sets[desc_set_num];
3666 
3667          desc_set_addr = descriptor_set->pvr_bo->dev_addr;
3668 
3669          if (desc_set_entry->primary) {
3670             desc_portion_offset =
3671                descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3672                   .primary_offset;
3673          } else {
3674             desc_portion_offset =
3675                descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3676                   .secondary_offset;
3677          }
3678          desc_portion_offset = PVR_DW_TO_BYTES(desc_portion_offset);
3679 
3680          desc_set_addr =
3681             PVR_DEV_ADDR_OFFSET(desc_set_addr, desc_portion_offset);
3682 
3683          desc_set_addr = PVR_DEV_ADDR_OFFSET(
3684             desc_set_addr,
3685             PVR_DW_TO_BYTES((uint64_t)desc_set_entry->offset_in_dwords));
3686 
3687          PVR_WRITE(qword_buffer,
3688                    desc_set_addr.addr,
3689                    desc_set_entry->const_offset,
3690                    pds_info->data_size_in_dwords);
3691 
3692          entries += sizeof(*desc_set_entry);
3693          break;
3694       }
3695 
3696       case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
3697          const struct pvr_const_map_entry_special_buffer *special_buff_entry =
3698             (struct pvr_const_map_entry_special_buffer *)entries;
3699 
3700          switch (special_buff_entry->buffer_type) {
3701          case PVR_BUFFER_TYPE_COMPILE_TIME: {
3702             uint64_t addr = descriptor_state->static_consts->dev_addr.addr;
3703 
3704             PVR_WRITE(qword_buffer,
3705                       addr,
3706                       special_buff_entry->const_offset,
3707                       pds_info->data_size_in_dwords);
3708             break;
3709          }
3710 
3711          case PVR_BUFFER_TYPE_BLEND_CONSTS:
3712             /* TODO: See if instead of reusing the blend constant buffer type
3713              * entry, we can setup a new buffer type specifically for
3714              * num_workgroups or other built-in variables. The mappings are
3715              * setup at pipeline creation when creating the descriptor program.
3716              */
3717             if (stage == PVR_STAGE_ALLOCATION_COMPUTE) {
3718                assert(num_worgroups_buff_addr->addr);
3719 
3720                /* TODO: Check if we need to offset this (e.g. for just y and z),
3721                 * or cope with any reordering?
3722                 */
3723                PVR_WRITE(qword_buffer,
3724                          num_worgroups_buff_addr->addr,
3725                          special_buff_entry->const_offset,
3726                          pds_info->data_size_in_dwords);
3727             } else {
3728                pvr_finishme("Add blend constants support.");
3729             }
3730             break;
3731 
3732          default:
3733             unreachable("Unsupported special buffer type.");
3734          }
3735 
3736          entries += sizeof(*special_buff_entry);
3737          break;
3738       }
3739 
3740       default:
3741          unreachable("Unsupported map entry type.");
3742       }
3743    }
3744 
3745    *descriptor_data_offset_out =
3746       pvr_bo->dev_addr.addr -
3747       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3748 
3749    return VK_SUCCESS;
3750 }
3751 
3752 /* Note that the descriptor set doesn't have any space for dynamic buffer
3753  * descriptors so this works on the assumption that you have a buffer with space
3754  * for them at the end.
3755  */
3756 static uint16_t pvr_get_dynamic_descriptor_primary_offset(
3757    const struct pvr_device *device,
3758    const struct pvr_descriptor_set_layout *layout,
3759    const struct pvr_descriptor_set_layout_binding *binding,
3760    const uint32_t stage,
3761    const uint32_t desc_idx)
3762 {
3763    struct pvr_descriptor_size_info size_info;
3764    uint32_t offset;
3765 
3766    assert(binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3767           binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC);
3768    assert(desc_idx < binding->descriptor_count);
3769 
3770    pvr_descriptor_size_info_init(device, binding->type, &size_info);
3771 
3772    offset = layout->total_size_in_dwords;
3773    offset += binding->per_stage_offset_in_dwords[stage].primary;
3774    offset += (desc_idx * size_info.primary);
3775 
3776    /* Offset must be representable in 16 bits. */
3777    assert(offset < UINT16_MAX);
3778 
3779    return (uint16_t)offset;
3780 }
3781 
3782 /* Note that the descriptor set doesn't have any space for dynamic buffer
3783  * descriptors so this works on the assumption that you have a buffer with space
3784  * for them at the end.
3785  */
3786 static uint16_t pvr_get_dynamic_descriptor_secondary_offset(
3787    const struct pvr_device *device,
3788    const struct pvr_descriptor_set_layout *layout,
3789    const struct pvr_descriptor_set_layout_binding *binding,
3790    const uint32_t stage,
3791    const uint32_t desc_idx)
3792 {
3793    struct pvr_descriptor_size_info size_info;
3794    uint32_t offset;
3795 
3796    assert(binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3797           binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC);
3798    assert(desc_idx < binding->descriptor_count);
3799 
3800    pvr_descriptor_size_info_init(device, binding->type, &size_info);
3801 
3802    offset = layout->total_size_in_dwords;
3803    offset +=
3804       layout->memory_layout_in_dwords_per_stage[stage].primary_dynamic_size;
3805    offset += binding->per_stage_offset_in_dwords[stage].secondary;
3806    offset += (desc_idx * size_info.secondary);
3807 
3808    /* Offset must be representable in 16 bits. */
3809    assert(offset < UINT16_MAX);
3810 
3811    return (uint16_t)offset;
3812 }
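
/* Rough sketch of the patched set layout assumed by the two helpers above,
 * with the dynamic descriptors appended after the statically laid out set:
 *
 *    normal set contents    (layout->total_size_in_dwords)
 *    dynamic primaries      (per-stage primary_dynamic_size)
 *    dynamic secondaries    (per-stage secondary data)
 */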
3813 
3814 /**
3815  * \brief Upload a copy of the descriptor set with dynamic buffer offsets
3816  * applied.
3817  */
3818 /* TODO: We should probably make the compiler aware of the dynamic descriptors.
3819  * We could use push constants like Anv seems to do. This would avoid having to
3820  * duplicate all sets containing dynamic descriptors each time the offsets are
3821  * updated.
3822  */
3823 static VkResult pvr_cmd_buffer_upload_patched_desc_set(
3824    struct pvr_cmd_buffer *cmd_buffer,
3825    const struct pvr_descriptor_set *desc_set,
3826    const uint32_t *dynamic_offsets,
3827    struct pvr_suballoc_bo **const bo_out)
3828 {
3829    const struct pvr_descriptor_set_layout *layout = desc_set->layout;
3830    const uint64_t normal_desc_set_size =
3831       PVR_DW_TO_BYTES(layout->total_size_in_dwords);
3832    const uint64_t dynamic_descs_size =
3833       PVR_DW_TO_BYTES(layout->total_dynamic_size_in_dwords);
3834    struct pvr_descriptor_size_info dynamic_uniform_buffer_size_info;
3835    struct pvr_descriptor_size_info dynamic_storage_buffer_size_info;
3836    struct pvr_device *device = cmd_buffer->device;
3837    struct pvr_suballoc_bo *patched_desc_set_bo;
3838    uint32_t *src_mem_ptr, *dst_mem_ptr;
3839    uint32_t desc_idx_offset = 0;
3840    VkResult result;
3841 
3842    assert(desc_set->layout->dynamic_buffer_count > 0);
3843 
3844    pvr_descriptor_size_info_init(device,
3845                                  VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC,
3846                                  &dynamic_uniform_buffer_size_info);
3847    pvr_descriptor_size_info_init(device,
3848                                  VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC,
3849                                  &dynamic_storage_buffer_size_info);
3850 
3851    /* TODO: In the descriptor set we don't account for dynamic buffer
3852     * descriptors and take care of them in the pipeline layout. The pipeline
3853     * layout allocates them at the beginning but let's put them at the end just
3854     * because it makes things a bit easier. Ideally we should be using the
3855     * pipeline layout and use the offsets from the pipeline layout to patch
3856     * descriptors.
3857     */
3858    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
3859                                      cmd_buffer->device->heaps.general_heap,
3860                                      normal_desc_set_size + dynamic_descs_size,
3861                                      &patched_desc_set_bo);
3862    if (result != VK_SUCCESS)
3863       return result;
3864 
3865    src_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(desc_set->pvr_bo);
3866    dst_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(patched_desc_set_bo);
3867 
3868    memcpy(dst_mem_ptr, src_mem_ptr, normal_desc_set_size);
3869 
3870    for (uint32_t i = 0; i < desc_set->layout->binding_count; i++) {
3871       const struct pvr_descriptor_set_layout_binding *binding =
3872          &desc_set->layout->bindings[i];
3873       const struct pvr_descriptor *descriptors =
3874          &desc_set->descriptors[binding->descriptor_index];
3875       const struct pvr_descriptor_size_info *size_info;
3876 
3877       if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
3878          size_info = &dynamic_uniform_buffer_size_info;
3879       else if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
3880          size_info = &dynamic_storage_buffer_size_info;
3881       else
3882          continue;
3883 
3884       for (uint32_t stage = 0; stage < PVR_STAGE_ALLOCATION_COUNT; stage++) {
3885          uint32_t primary_offset;
3886          uint32_t secondary_offset;
3887 
3888          if (!(binding->shader_stage_mask & BITFIELD_BIT(stage)))
3889             continue;
3890 
3891          /* Get the offsets for the first dynamic descriptor in the current
3892           * binding.
3893           */
3894          primary_offset =
3895             pvr_get_dynamic_descriptor_primary_offset(device,
3896                                                       desc_set->layout,
3897                                                       binding,
3898                                                       stage,
3899                                                       0);
3900          secondary_offset =
3901             pvr_get_dynamic_descriptor_secondary_offset(device,
3902                                                         desc_set->layout,
3903                                                         binding,
3904                                                         stage,
3905                                                         0);
3906 
3907          /* clang-format off */
3908          for (uint32_t desc_idx = 0;
3909               desc_idx < binding->descriptor_count;
3910               desc_idx++) {
3911             /* clang-format on */
3912             const pvr_dev_addr_t addr =
3913                PVR_DEV_ADDR_OFFSET(descriptors[desc_idx].buffer_dev_addr,
3914                                    dynamic_offsets[desc_idx + desc_idx_offset]);
3915             const VkDeviceSize range =
3916                MIN2(descriptors[desc_idx].buffer_desc_range,
3917                     descriptors[desc_idx].buffer_whole_range -
3918                        dynamic_offsets[desc_idx + desc_idx_offset]);
3919 
3920 #if MESA_DEBUG
3921             uint32_t desc_primary_offset;
3922             uint32_t desc_secondary_offset;
3923 
3924             desc_primary_offset =
3925                pvr_get_dynamic_descriptor_primary_offset(device,
3926                                                          desc_set->layout,
3927                                                          binding,
3928                                                          stage,
3929                                                          desc_idx);
3930             desc_secondary_offset =
3931                pvr_get_dynamic_descriptor_secondary_offset(device,
3932                                                            desc_set->layout,
3933                                                            binding,
3934                                                            stage,
3935                                                            desc_idx);
3936 
3937             /* Check the assumption that the descriptors within a binding, for
3938              * a particular stage, are allocated consecutively.
3939              */
3940             assert(desc_primary_offset ==
3941                    primary_offset + size_info->primary * desc_idx);
3942             assert(desc_secondary_offset ==
3943                    secondary_offset + size_info->secondary * desc_idx);
3944 #endif
3945 
3946             assert(descriptors[desc_idx].type == binding->type);
3947 
3948             memcpy(dst_mem_ptr + primary_offset + size_info->primary * desc_idx,
3949                    &addr.addr,
3950                    PVR_DW_TO_BYTES(size_info->primary));
3951             memcpy(dst_mem_ptr + secondary_offset +
3952                       size_info->secondary * desc_idx,
3953                    &range,
3954                    PVR_DW_TO_BYTES(size_info->secondary));
3955          }
3956       }
3957 
3958       desc_idx_offset += binding->descriptor_count;
3959    }
3960 
3961    *bo_out = patched_desc_set_bo;
3962 
3963    return VK_SUCCESS;
3964 }
3965 
3966 #define PVR_SELECT(_geom, _frag, _compute)         \
3967    (stage == PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY) \
3968       ? (_geom)                                    \
3969       : (stage == PVR_STAGE_ALLOCATION_FRAGMENT) ? (_frag) : (_compute)
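/* Illustrative use of PVR_SELECT (it relies on a variable named `stage` being
 * in scope), taken from the helper below:
 *
 *    desc_state = PVR_SELECT(&cmd_buffer->state.gfx_desc_state,
 *                            &cmd_buffer->state.gfx_desc_state,
 *                            &cmd_buffer->state.compute_desc_state);
 *
 * i.e. the graphics descriptor state is picked for the vertex/geometry and
 * fragment allocation stages, and the compute descriptor state otherwise.
 */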
3970 
3971 static VkResult
pvr_cmd_buffer_upload_desc_set_table(struct pvr_cmd_buffer * const cmd_buffer,enum pvr_stage_allocation stage,pvr_dev_addr_t * addr_out)3972 pvr_cmd_buffer_upload_desc_set_table(struct pvr_cmd_buffer *const cmd_buffer,
3973                                      enum pvr_stage_allocation stage,
3974                                      pvr_dev_addr_t *addr_out)
3975 {
3976    uint64_t bound_desc_sets[PVR_MAX_DESCRIPTOR_SETS];
3977    const struct pvr_descriptor_state *desc_state;
3978    struct pvr_suballoc_bo *suballoc_bo;
3979    uint32_t dynamic_offset_idx = 0;
3980    VkResult result;
3981 
3982    switch (stage) {
3983    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3984    case PVR_STAGE_ALLOCATION_FRAGMENT:
3985    case PVR_STAGE_ALLOCATION_COMPUTE:
3986       break;
3987 
3988    default:
3989       unreachable("Unsupported stage.");
3990       break;
3991    }
3992 
3993    desc_state = PVR_SELECT(&cmd_buffer->state.gfx_desc_state,
3994                            &cmd_buffer->state.gfx_desc_state,
3995                            &cmd_buffer->state.compute_desc_state);
3996 
3997    for (uint32_t set = 0; set < ARRAY_SIZE(bound_desc_sets); set++)
3998       bound_desc_sets[set] = ~0;
3999 
4000    assert(util_last_bit(desc_state->valid_mask) <= ARRAY_SIZE(bound_desc_sets));
4001    for (uint32_t set = 0; set < util_last_bit(desc_state->valid_mask); set++) {
4002       const struct pvr_descriptor_set *desc_set;
4003 
4004       if (!(desc_state->valid_mask & BITFIELD_BIT(set))) {
4005          const struct pvr_pipeline_layout *pipeline_layout =
4006             PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4007                        cmd_buffer->state.gfx_pipeline->base.layout,
4008                        cmd_buffer->state.compute_pipeline->base.layout);
4009          const struct pvr_descriptor_set_layout *set_layout;
4010 
4011          assert(set < pipeline_layout->set_count);
4012 
4013          set_layout = pipeline_layout->set_layout[set];
4014          dynamic_offset_idx += set_layout->dynamic_buffer_count;
4015 
4016          continue;
4017       }
4018 
4019       desc_set = desc_state->descriptor_sets[set];
4020 
4021       /* TODO: Is it better if we don't set the valid_mask for empty sets? */
4022       if (desc_set->layout->descriptor_count == 0)
4023          continue;
4024 
4025       if (desc_set->layout->dynamic_buffer_count > 0) {
4026          struct pvr_suballoc_bo *new_desc_set_bo;
4027 
4028          assert(dynamic_offset_idx + desc_set->layout->dynamic_buffer_count <=
4029                 ARRAY_SIZE(desc_state->dynamic_offsets));
4030 
4031          result = pvr_cmd_buffer_upload_patched_desc_set(
4032             cmd_buffer,
4033             desc_set,
4034             &desc_state->dynamic_offsets[dynamic_offset_idx],
4035             &new_desc_set_bo);
4036          if (result != VK_SUCCESS)
4037             return result;
4038 
4039          dynamic_offset_idx += desc_set->layout->dynamic_buffer_count;
4040 
4041          bound_desc_sets[set] = new_desc_set_bo->dev_addr.addr;
4042       } else {
4043          bound_desc_sets[set] = desc_set->pvr_bo->dev_addr.addr;
4044       }
4045    }
4046 
4047    result = pvr_cmd_buffer_upload_general(cmd_buffer,
4048                                           bound_desc_sets,
4049                                           sizeof(bound_desc_sets),
4050                                           &suballoc_bo);
4051    if (result != VK_SUCCESS)
4052       return result;
4053 
4054    *addr_out = suballoc_bo->dev_addr;
4055    return VK_SUCCESS;
4056 }
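/* Rough sketch (illustrative) of the table uploaded by
 * pvr_cmd_buffer_upload_desc_set_table(): one 64-bit entry per descriptor set
 * slot, where unbound slots keep the ~0 fill value and bound slots hold the
 * device address of either the set's own BO or the patched copy created for
 * dynamic buffers, e.g.:
 *
 *    bound_desc_sets[] = {
 *       [0] = <set 0 BO dev addr>,
 *       [1] = ~0,                            <- not bound
 *       [2] = <patched set 2 BO dev addr>,   <- has dynamic buffers
 *       ...
 *    };
 */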
4057 
4058 static VkResult
pvr_process_addr_literal(struct pvr_cmd_buffer * cmd_buffer,enum pvr_pds_addr_literal_type addr_literal_type,enum pvr_stage_allocation stage,pvr_dev_addr_t * addr_out)4059 pvr_process_addr_literal(struct pvr_cmd_buffer *cmd_buffer,
4060                          enum pvr_pds_addr_literal_type addr_literal_type,
4061                          enum pvr_stage_allocation stage,
4062                          pvr_dev_addr_t *addr_out)
4063 {
4064    VkResult result;
4065 
4066    switch (addr_literal_type) {
4067    case PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE: {
4068       /* TODO: Maybe we want to free pvr_bo, and only link all BOs to the
4069        * command buffer once the data section has been written
4070        * successfully.
4071        */
4072       result =
4073          pvr_cmd_buffer_upload_desc_set_table(cmd_buffer, stage, addr_out);
4074       if (result != VK_SUCCESS)
4075          return result;
4076 
4077       break;
4078    }
4079 
4080    case PVR_PDS_ADDR_LITERAL_PUSH_CONSTS: {
4081       const struct pvr_pipeline_layout *layout =
4082          PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4083                     cmd_buffer->state.gfx_pipeline->base.layout,
4084                     cmd_buffer->state.compute_pipeline->base.layout);
4085       const uint32_t push_constants_offset =
4086          PVR_SELECT(layout->vert_push_constants_offset,
4087                     layout->frag_push_constants_offset,
4088                     layout->compute_push_constants_offset);
4089 
4090       *addr_out = PVR_DEV_ADDR_OFFSET(cmd_buffer->state.push_constants.dev_addr,
4091                                       push_constants_offset);
4092       break;
4093    }
4094 
4095    case PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS: {
4096       float *blend_consts =
4097          cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants;
4098       size_t size =
4099          sizeof(cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants);
4100       struct pvr_suballoc_bo *blend_consts_bo;
4101 
4102       result = pvr_cmd_buffer_upload_general(cmd_buffer,
4103                                              blend_consts,
4104                                              size,
4105                                              &blend_consts_bo);
4106       if (result != VK_SUCCESS)
4107          return result;
4108 
4109       *addr_out = blend_consts_bo->dev_addr;
4110 
4111       break;
4112    }
4113 
4114    default:
4115       unreachable("Invalid addr literal type.");
4116    }
4117 
4118    return VK_SUCCESS;
4119 }
4120 
4121 #undef PVR_SELECT
4122 
pvr_setup_descriptor_mappings_new(struct pvr_cmd_buffer * const cmd_buffer,enum pvr_stage_allocation stage,const struct pvr_stage_allocation_descriptor_state * descriptor_state,uint32_t * const descriptor_data_offset_out)4123 static VkResult pvr_setup_descriptor_mappings_new(
4124    struct pvr_cmd_buffer *const cmd_buffer,
4125    enum pvr_stage_allocation stage,
4126    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4127    uint32_t *const descriptor_data_offset_out)
4128 {
4129    const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
4130    struct pvr_suballoc_bo *pvr_bo;
4131    const uint8_t *entries;
4132    uint32_t *dword_buffer;
4133    uint64_t *qword_buffer;
4134    VkResult result;
4135 
4136    if (!pds_info->data_size_in_dwords)
4137       return VK_SUCCESS;
4138 
4139    result =
4140       pvr_cmd_buffer_alloc_mem(cmd_buffer,
4141                                cmd_buffer->device->heaps.pds_heap,
4142                                PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
4143                                &pvr_bo);
4144    if (result != VK_SUCCESS)
4145       return result;
4146 
4147    dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4148    qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4149 
4150    entries = (uint8_t *)pds_info->entries;
4151 
4152    switch (stage) {
4153    case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
4154    case PVR_STAGE_ALLOCATION_FRAGMENT:
4155    case PVR_STAGE_ALLOCATION_COMPUTE:
4156       break;
4157 
4158    default:
4159       unreachable("Unsupported stage.");
4160       break;
4161    }
4162 
4163    for (uint32_t i = 0; i < pds_info->entry_count; i++) {
4164       const struct pvr_const_map_entry *const entry_header =
4165          (struct pvr_const_map_entry *)entries;
4166 
4167       switch (entry_header->type) {
4168       case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
4169          const struct pvr_const_map_entry_literal32 *const literal =
4170             (struct pvr_const_map_entry_literal32 *)entries;
4171 
4172          PVR_WRITE(dword_buffer,
4173                    literal->literal_value,
4174                    literal->const_offset,
4175                    pds_info->data_size_in_dwords);
4176 
4177          entries += sizeof(*literal);
4178          break;
4179       }
4180 
4181       case PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER: {
4182          const struct pvr_pds_const_map_entry_addr_literal_buffer
4183             *const addr_literal_buffer_entry =
4184                (struct pvr_pds_const_map_entry_addr_literal_buffer *)entries;
4185          struct pvr_device *device = cmd_buffer->device;
4186          struct pvr_suballoc_bo *addr_literal_buffer_bo;
4187          uint32_t addr_literal_count = 0;
4188          uint64_t *addr_literal_buffer;
4189 
4190          result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
4191                                            device->heaps.general_heap,
4192                                            addr_literal_buffer_entry->size,
4193                                            &addr_literal_buffer_bo);
4194          if (result != VK_SUCCESS)
4195             return result;
4196 
4197          addr_literal_buffer =
4198             (uint64_t *)pvr_bo_suballoc_get_map_addr(addr_literal_buffer_bo);
4199 
4200          entries += sizeof(*addr_literal_buffer_entry);
4201 
4202          PVR_WRITE(qword_buffer,
4203                    addr_literal_buffer_bo->dev_addr.addr,
4204                    addr_literal_buffer_entry->const_offset,
4205                    pds_info->data_size_in_dwords);
4206 
4207          for (uint32_t j = i + 1; j < pds_info->entry_count; j++) {
4208             const struct pvr_const_map_entry *const entry_header =
4209                (struct pvr_const_map_entry *)entries;
4210             const struct pvr_pds_const_map_entry_addr_literal *addr_literal;
4211             pvr_dev_addr_t dev_addr;
4212 
4213             if (entry_header->type != PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL)
4214                break;
4215 
4216             addr_literal =
4217                (struct pvr_pds_const_map_entry_addr_literal *)entries;
4218 
4219             result = pvr_process_addr_literal(cmd_buffer,
4220                                               addr_literal->addr_type,
4221                                               stage,
4222                                               &dev_addr);
4223             if (result != VK_SUCCESS)
4224                return result;
4225 
4226             addr_literal_buffer[addr_literal_count++] = dev_addr.addr;
4227 
4228             entries += sizeof(*addr_literal);
4229          }
4230 
4231          assert(addr_literal_count * sizeof(uint64_t) ==
4232                 addr_literal_buffer_entry->size);
4233 
4234          i += addr_literal_count;
4235 
4236          break;
4237       }
4238 
4239       default:
4240          unreachable("Unsupported map entry type.");
4241       }
4242    }
4243 
4244    *descriptor_data_offset_out =
4245       pvr_bo->dev_addr.addr -
4246       cmd_buffer->device->heaps.pds_heap->base_addr.addr;
4247 
4248    return VK_SUCCESS;
4249 }
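/* Rough shape of the PDS const map entry stream consumed above (an
 * illustrative sketch, not an exhaustive list of entry types): a LITERAL32
 * entry writes a single dword into the data section, while an
 * ADDR_LITERAL_BUFFER entry is followed by N ADDR_LITERAL entries whose
 * resolved device addresses are packed into a separate buffer of N 64-bit
 * words:
 *
 *    [LITERAL32]
 *    [ADDR_LITERAL_BUFFER, size == N * sizeof(uint64_t)]
 *       [ADDR_LITERAL 0]   e.g. DESC_SET_ADDRS_TABLE
 *       ...
 *       [ADDR_LITERAL N-1]
 */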
4250 
pvr_setup_descriptor_mappings(struct pvr_cmd_buffer * const cmd_buffer,enum pvr_stage_allocation stage,const struct pvr_stage_allocation_descriptor_state * descriptor_state,const pvr_dev_addr_t * const num_worgroups_buff_addr,uint32_t * const descriptor_data_offset_out)4251 static VkResult pvr_setup_descriptor_mappings(
4252    struct pvr_cmd_buffer *const cmd_buffer,
4253    enum pvr_stage_allocation stage,
4254    const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4255    const pvr_dev_addr_t *const num_worgroups_buff_addr,
4256    uint32_t *const descriptor_data_offset_out)
4257 {
4258    const bool old_path =
4259       pvr_has_hard_coded_shaders(&cmd_buffer->device->pdevice->dev_info);
4260 
4261    if (old_path) {
4262       return pvr_setup_descriptor_mappings_old(cmd_buffer,
4263                                                stage,
4264                                                descriptor_state,
4265                                                num_worgroups_buff_addr,
4266                                                descriptor_data_offset_out);
4267    }
4268 
4269    return pvr_setup_descriptor_mappings_new(cmd_buffer,
4270                                             stage,
4271                                             descriptor_state,
4272                                             descriptor_data_offset_out);
4273 }
4274 
pvr_compute_update_shared(struct pvr_cmd_buffer * cmd_buffer,struct pvr_sub_cmd_compute * const sub_cmd)4275 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
4276                                       struct pvr_sub_cmd_compute *const sub_cmd)
4277 {
4278    const struct pvr_device *device = cmd_buffer->device;
4279    const struct pvr_physical_device *pdevice = device->pdevice;
4280    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4281    struct pvr_csb *csb = &sub_cmd->control_stream;
4282    const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4283    const uint32_t const_shared_regs =
4284       pipeline->shader_state.const_shared_reg_count;
4285    struct pvr_compute_kernel_info info;
4286 
4287    /* No shared regs, no need to use an allocation kernel. */
4288    if (!const_shared_regs)
4289       return;
4290 
4291    /* Accumulate the MAX number of shared registers across the kernels in this
4292     * dispatch. This is used by the FW for context switching, so must be large
4293     * enough to contain all the shared registers that might be in use for this
4294     * compute job. Coefficients don't need to be included as the context switch
4295     * will not happen within the execution of a single workgroup, thus nothing
4296     * needs to be preserved.
4297     */
4298    state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4299 
4300    info = (struct pvr_compute_kernel_info){
4301       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4302       .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4303 
4304       .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
4305       .usc_common_shared = true,
4306       .usc_common_size =
4307          DIV_ROUND_UP(const_shared_regs,
4308                       PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
4309 
4310       .local_size = { 1, 1, 1 },
4311       .global_size = { 1, 1, 1 },
4312    };
4313 
4314    /* Sometimes we don't have a secondary program if there were no constants to
4315     * write, but we still need to run a PDS program to accomplish the
4316     * allocation of the local/common store shared registers. Use the
4317     * pre-uploaded empty PDS program in this instance.
4318     */
4319    if (pipeline->descriptor_state.pds_info.code_size_in_dwords) {
4320       uint32_t pds_data_size_in_dwords =
4321          pipeline->descriptor_state.pds_info.data_size_in_dwords;
4322 
4323       info.pds_data_offset = state->pds_compute_descriptor_data_offset;
4324       info.pds_data_size =
4325          DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_data_size_in_dwords),
4326                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
4327 
4328       /* Check that we have uploaded the code section. */
4329       assert(pipeline->descriptor_state.pds_code.code_size);
4330       info.pds_code_offset = pipeline->descriptor_state.pds_code.code_offset;
4331    } else {
4332       const struct pvr_pds_upload *program = &device->pds_compute_empty_program;
4333 
4334       info.pds_data_offset = program->data_offset;
4335       info.pds_data_size =
4336          DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
4337                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
4338       info.pds_code_offset = program->code_offset;
4339    }
4340 
4341    /* We don't need to pad the workgroup size. */
4342 
4343    info.max_instances =
4344       pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4345 
4346    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4347 }
4348 
pvr_compute_update_shared_private(struct pvr_cmd_buffer * cmd_buffer,struct pvr_sub_cmd_compute * const sub_cmd,struct pvr_private_compute_pipeline * pipeline)4349 void pvr_compute_update_shared_private(
4350    struct pvr_cmd_buffer *cmd_buffer,
4351    struct pvr_sub_cmd_compute *const sub_cmd,
4352    struct pvr_private_compute_pipeline *pipeline)
4353 {
4354    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4355    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4356    const uint32_t const_shared_regs = pipeline->const_shared_regs_count;
4357    struct pvr_csb *csb = &sub_cmd->control_stream;
4358    struct pvr_compute_kernel_info info;
4359 
4360    /* No shared regs, no need to use an allocation kernel. */
4361    if (!const_shared_regs)
4362       return;
4363 
4364    /* See comment in pvr_compute_update_shared() for details on this. */
4365    state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4366 
4367    info = (struct pvr_compute_kernel_info){
4368       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4369       .usc_common_size =
4370          DIV_ROUND_UP(const_shared_regs,
4371                       PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
4372       .pds_data_size =
4373          DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_shared_update_data_size_dw),
4374                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4375       .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
4376       .pds_data_offset = pipeline->pds_shared_update_data_offset,
4377       .pds_code_offset = pipeline->pds_shared_update_code_offset,
4378       .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4379       .usc_common_shared = true,
4380       .local_size = { 1, 1, 1 },
4381       .global_size = { 1, 1, 1 },
4382    };
4383 
4384    /* We don't need to pad the workgroup size. */
4385 
4386    info.max_instances =
4387       pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4388 
4389    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4390 }
4391 
4392 static uint32_t
pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device * pdevice,uint32_t workgroup_size,uint32_t coeff_regs_count)4393 pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
4394                                     uint32_t workgroup_size,
4395                                     uint32_t coeff_regs_count)
4396 {
4397    const struct pvr_device_runtime_info *dev_runtime_info =
4398       &pdevice->dev_runtime_info;
4399    const struct pvr_device_info *dev_info = &pdevice->dev_info;
4400    uint32_t max_avail_coeff_regs =
4401       dev_runtime_info->cdm_max_local_mem_size_regs;
4402    uint32_t coeff_regs_count_aligned =
4403       ALIGN_POT(coeff_regs_count,
4404                 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U);
4405 
4406    /* If the work group size is greater than ROGUE_MAX_INSTANCES_PER_TASK, we
4407     * now *always* pad the work group size to the next multiple of
4408     * ROGUE_MAX_INSTANCES_PER_TASK.
4409     *
4410     * We do the same if we use more than 1/8th of the max coefficient
4411     * registers.
4412     */
4413    /* TODO: See if this can be optimized. */
4414    if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
4415        coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
4416       assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
4417 
4418       return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
4419    }
4420 
4421    return workgroup_size;
4422 }
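/* Worked example (illustrative, assuming ROGUE_MAX_INSTANCES_PER_TASK is 32):
 * a flattened workgroup size of 50 with a modest coefficient allocation is
 * padded to ALIGN_POT(50, 32) == 64, while a workgroup size of 20 that uses
 * more than 1/8th of the available coefficient registers is padded to 32. In
 * all other cases the workgroup size is returned unchanged.
 */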
4423 
pvr_compute_update_kernel_private(struct pvr_cmd_buffer * cmd_buffer,struct pvr_sub_cmd_compute * const sub_cmd,struct pvr_private_compute_pipeline * pipeline,const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])4424 void pvr_compute_update_kernel_private(
4425    struct pvr_cmd_buffer *cmd_buffer,
4426    struct pvr_sub_cmd_compute *const sub_cmd,
4427    struct pvr_private_compute_pipeline *pipeline,
4428    const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4429 {
4430    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4431    const struct pvr_device_runtime_info *dev_runtime_info =
4432       &pdevice->dev_runtime_info;
4433    struct pvr_csb *csb = &sub_cmd->control_stream;
4434 
4435    struct pvr_compute_kernel_info info = {
4436       .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4437       .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
4438       .pds_temp_size =
4439          DIV_ROUND_UP(pipeline->pds_temps_used << 2U,
4440                       PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
4441 
4442       .pds_data_size =
4443          DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_data_size_dw),
4444                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4445       .pds_data_offset = pipeline->pds_data_offset,
4446       .pds_code_offset = pipeline->pds_code_offset,
4447 
4448       .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4449 
4450       .usc_unified_size =
4451          DIV_ROUND_UP(pipeline->unified_store_regs_count << 2U,
4452                       PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
4453 
4454       /* clang-format off */
4455       .global_size = {
4456          global_workgroup_size[0],
4457          global_workgroup_size[1],
4458          global_workgroup_size[2]
4459       },
4460       /* clang-format on */
4461    };
4462 
4463    uint32_t work_size = pipeline->workgroup_size.width *
4464                         pipeline->workgroup_size.height *
4465                         pipeline->workgroup_size.depth;
4466    uint32_t coeff_regs;
4467 
4468    if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4469       /* Enforce a single workgroup per cluster through allocation starvation.
4470        */
4471       coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4472    } else {
4473       coeff_regs = pipeline->coeff_regs_count;
4474    }
4475 
4476    info.usc_common_size =
4477       DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4478                    PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
4479 
4480    /* Use a whole slot per workgroup. */
4481    work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4482 
4483    coeff_regs += pipeline->const_shared_regs_count;
4484 
4485    if (pipeline->const_shared_regs_count > 0)
4486       info.sd_type = PVRX(CDMCTRL_SD_TYPE_USC);
4487 
4488    work_size =
4489       pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4490 
4491    info.local_size[0] = work_size;
4492    info.local_size[1] = 1U;
4493    info.local_size[2] = 1U;
4494 
4495    info.max_instances =
4496       pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4497 
4498    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4499 }
4500 
4501 /* TODO: Wire up the base_workgroup variant program when implementing
4502  * VK_KHR_device_group. The values will also need patching into the program.
4503  */
pvr_compute_update_kernel(struct pvr_cmd_buffer * cmd_buffer,struct pvr_sub_cmd_compute * const sub_cmd,pvr_dev_addr_t indirect_addr,const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])4504 static void pvr_compute_update_kernel(
4505    struct pvr_cmd_buffer *cmd_buffer,
4506    struct pvr_sub_cmd_compute *const sub_cmd,
4507    pvr_dev_addr_t indirect_addr,
4508    const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4509 {
4510    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4511    const struct pvr_device_runtime_info *dev_runtime_info =
4512       &pdevice->dev_runtime_info;
4513    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4514    struct pvr_csb *csb = &sub_cmd->control_stream;
4515    const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4516    const struct pvr_compute_shader_state *shader_state =
4517       &pipeline->shader_state;
4518    const struct pvr_pds_info *program_info = &pipeline->primary_program_info;
4519 
4520    struct pvr_compute_kernel_info info = {
4521       .indirect_buffer_addr = indirect_addr,
4522       .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
4523       .pds_temp_size =
4524          DIV_ROUND_UP(program_info->temps_required << 2U,
4525                       PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
4526 
4527       .pds_data_size =
4528          DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords),
4529                       PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4530       .pds_data_offset = pipeline->primary_program.data_offset,
4531       .pds_code_offset = pipeline->primary_program.code_offset,
4532 
4533       .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4534 
4535       .usc_unified_size =
4536          DIV_ROUND_UP(shader_state->input_register_count << 2U,
4537                       PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
4538 
4539       /* clang-format off */
4540       .global_size = {
4541          global_workgroup_size[0],
4542          global_workgroup_size[1],
4543          global_workgroup_size[2]
4544       },
4545       /* clang-format on */
4546    };
4547 
4548    uint32_t work_size = shader_state->work_size;
4549    uint32_t coeff_regs;
4550 
4551    if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4552       /* Enforce a single workgroup per cluster through allocation starvation.
4553        */
4554       coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4555    } else {
4556       coeff_regs = shader_state->coefficient_register_count;
4557    }
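   /* Our reading of the starvation trick above (illustrative, not from any
    * documentation): by requesting the entire cdm_max_local_mem_size_regs
    * coefficient allocation there is no room left for a second workgroup's
    * allocation on the same cluster, so only one workgroup can be resident
    * per cluster at a time.
    */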
4558 
4559    info.usc_common_size =
4560       DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4561                    PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
4562 
4563    /* Use a whole slot per workgroup. */
4564    work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4565 
4566    coeff_regs += shader_state->const_shared_reg_count;
4567 
4568    if (shader_state->const_shared_reg_count > 0)
4569       info.sd_type = PVRX(CDMCTRL_SD_TYPE_USC);
4570 
4571    work_size =
4572       pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4573 
4574    info.local_size[0] = work_size;
4575    info.local_size[1] = 1U;
4576    info.local_size[2] = 1U;
4577 
4578    info.max_instances =
4579       pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4580 
4581    pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4582 }
4583 
pvr_cmd_upload_push_consts(struct pvr_cmd_buffer * cmd_buffer)4584 static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer)
4585 {
4586    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4587    struct pvr_suballoc_bo *suballoc_bo;
4588    VkResult result;
4589 
4590    /* TODO: Here are some possible optimizations/things to consider:
4591     *
4592     *    - Currently we upload maxPushConstantsSize. The application might only
4593     *      be using a portion of that, so we might end up with unused memory.
4594     *      Should we be smarter about this? If we intend to upload the push
4595     *      consts into shareds, we definitely want to avoid reserving unused
4596     *      regs.
4597     *
4598     *    - For now we have to upload to a new buffer each time since the shaders
4599     *      access the push constants from memory. If we were to reuse the same
4600     *      buffer we might update the contents out of sync with job submission
4601     *      and the shaders would see the updated contents while the command
4602     *      buffer was still being recorded and not yet submitted.
4603     *      If we were to upload the push constants directly to shared regs we
4604     *      could reuse the same buffer (avoiding extra allocation overhead)
4605     *      since the contents will be DMAed only on job submission when the
4606     *      control stream is processed and the PDS program is executed. This
4607     *      approach would also allow us to avoid regenerating the PDS data
4608     *      section in some cases since the buffer address will be constant.
4609     */
4610 
4611    if (cmd_buffer->state.push_constants.uploaded)
4612       return VK_SUCCESS;
4613 
4614    result = pvr_cmd_buffer_upload_general(cmd_buffer,
4615                                           state->push_constants.data,
4616                                           sizeof(state->push_constants.data),
4617                                           &suballoc_bo);
4618    if (result != VK_SUCCESS)
4619       return result;
4620 
4621    cmd_buffer->state.push_constants.dev_addr = suballoc_bo->dev_addr;
4622    cmd_buffer->state.push_constants.uploaded = true;
4623 
4624    return VK_SUCCESS;
4625 }
4626 
pvr_cmd_dispatch(struct pvr_cmd_buffer * const cmd_buffer,const pvr_dev_addr_t indirect_addr,const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])4627 static void pvr_cmd_dispatch(
4628    struct pvr_cmd_buffer *const cmd_buffer,
4629    const pvr_dev_addr_t indirect_addr,
4630    const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4631 {
4632    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4633    const struct pvr_compute_pipeline *compute_pipeline =
4634       state->compute_pipeline;
4635    struct pvr_sub_cmd_compute *sub_cmd;
4636    VkResult result;
4637 
4638    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
4639 
4640    sub_cmd = &state->current_sub_cmd->compute;
4641    sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops;
4642    sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier;
4643 
4644    if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4645       result = pvr_cmd_upload_push_consts(cmd_buffer);
4646       if (result != VK_SUCCESS)
4647          return;
4648 
4649       /* Regenerate the PDS program to use the new push consts buffer. */
4650       state->dirty.compute_desc_dirty = true;
4651 
4652       state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4653    }
4654 
4655    if (compute_pipeline->shader_state.uses_num_workgroups) {
4656       pvr_dev_addr_t descriptor_data_offset_out;
4657 
4658       if (indirect_addr.addr) {
4659          descriptor_data_offset_out = indirect_addr;
4660       } else {
4661          struct pvr_suballoc_bo *num_workgroups_bo;
4662 
4663          result = pvr_cmd_buffer_upload_general(cmd_buffer,
4664                                                 workgroup_size,
4665                                                 sizeof(*workgroup_size) *
4666                                                    PVR_WORKGROUP_DIMENSIONS,
4667                                                 &num_workgroups_bo);
4668          if (result != VK_SUCCESS)
4669             return;
4670 
4671          descriptor_data_offset_out = num_workgroups_bo->dev_addr;
4672       }
4673 
4674       result = pvr_setup_descriptor_mappings(
4675          cmd_buffer,
4676          PVR_STAGE_ALLOCATION_COMPUTE,
4677          &compute_pipeline->descriptor_state,
4678          &descriptor_data_offset_out,
4679          &state->pds_compute_descriptor_data_offset);
4680       if (result != VK_SUCCESS)
4681          return;
4682    } else if ((compute_pipeline->base.layout
4683                   ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] &&
4684                state->dirty.compute_desc_dirty) ||
4685               state->dirty.compute_pipeline_binding) {
4686       result = pvr_setup_descriptor_mappings(
4687          cmd_buffer,
4688          PVR_STAGE_ALLOCATION_COMPUTE,
4689          &compute_pipeline->descriptor_state,
4690          NULL,
4691          &state->pds_compute_descriptor_data_offset);
4692       if (result != VK_SUCCESS)
4693          return;
4694    }
4695 
4696    pvr_compute_update_shared(cmd_buffer, sub_cmd);
4697    pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size);
4698 }
4699 
pvr_CmdDispatch(VkCommandBuffer commandBuffer,uint32_t groupCountX,uint32_t groupCountY,uint32_t groupCountZ)4700 void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
4701                      uint32_t groupCountX,
4702                      uint32_t groupCountY,
4703                      uint32_t groupCountZ)
4704 {
4705    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4706 
4707    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4708 
4709    if (!groupCountX || !groupCountY || !groupCountZ)
4710       return;
4711 
4712    pvr_cmd_dispatch(cmd_buffer,
4713                     PVR_DEV_ADDR_INVALID,
4714                     (uint32_t[]){ groupCountX, groupCountY, groupCountZ });
4715 }
4716 
pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,VkBuffer _buffer,VkDeviceSize offset)4717 void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4718                              VkBuffer _buffer,
4719                              VkDeviceSize offset)
4720 {
4721    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4722    PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
4723 
4724    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4725 
4726    pvr_cmd_dispatch(cmd_buffer,
4727                     PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset),
4728                     (uint32_t[]){ 1, 1, 1 });
4729 }
4730 
4731 static void
pvr_update_draw_state(struct pvr_cmd_buffer_state * const state,const struct pvr_cmd_buffer_draw_state * const draw_state)4732 pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
4733                       const struct pvr_cmd_buffer_draw_state *const draw_state)
4734 {
4735    /* We don't have a piece of state that tells us whether base_instance is
4736     * being used, so base_instance itself doubles as a boolean - 0 means we'll
4737     * use a pds program that skips the base instance addition. If base_instance
4738     * gets used (and the last draw's base_instance was 0) then we switch to the
4739     * BASE_INSTANCE attrib program.
4740     *
4741     * If base_instance changes then we only need to update the data section.
4742     *
4743     * The only draw call state that doesn't really matter is the start vertex
4744     * as that is handled properly in the VDM state in all cases.
4745     */
4746    if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
4747        (state->draw_state.draw_indirect != draw_state->draw_indirect) ||
4748        (state->draw_state.base_instance == 0 &&
4749         draw_state->base_instance != 0)) {
4750       state->dirty.draw_variant = true;
4751    } else if (state->draw_state.base_instance != draw_state->base_instance) {
4752       state->dirty.draw_base_instance = true;
4753    }
4754 
4755    state->draw_state = *draw_state;
4756 }
4757 
pvr_calc_shared_regs_count(const struct pvr_graphics_pipeline * const gfx_pipeline)4758 static uint32_t pvr_calc_shared_regs_count(
4759    const struct pvr_graphics_pipeline *const gfx_pipeline)
4760 {
4761    const struct pvr_pipeline_stage_state *const vertex_state =
4762       &gfx_pipeline->shader_state.vertex.stage_state;
4763 
4764    uint32_t shared_regs = vertex_state->const_shared_reg_count +
4765                           vertex_state->const_shared_reg_offset;
4766 
4767    if (gfx_pipeline->shader_state.fragment.bo) {
4768       const struct pvr_pipeline_stage_state *const fragment_state =
4769          &gfx_pipeline->shader_state.fragment.stage_state;
4770 
4771       uint32_t fragment_regs = fragment_state->const_shared_reg_count +
4772                                fragment_state->const_shared_reg_offset;
4773 
4774       shared_regs = MAX2(shared_regs, fragment_regs);
4775    }
4776 
4777    return shared_regs;
4778 }
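/* Illustrative example: with a vertex stage using const shared registers
 * [offset 0, count 8] and a fragment stage using [offset 8, count 4], this
 * returns MAX2(0 + 8, 8 + 4) == 12, i.e. (as we understand it) one past the
 * highest const shared register used by either stage.
 */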
4779 
4780 static void
pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer * const cmd_buffer,struct pvr_sub_cmd_gfx * const sub_cmd,const uint32_t pds_vertex_descriptor_data_offset)4781 pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
4782                          struct pvr_sub_cmd_gfx *const sub_cmd,
4783                          const uint32_t pds_vertex_descriptor_data_offset)
4784 {
4785    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4786    const struct pvr_stage_allocation_descriptor_state
4787       *const vertex_descriptor_state =
4788          &state->gfx_pipeline->shader_state.vertex.descriptor_state;
4789    const struct pvr_pipeline_stage_state *const vertex_stage_state =
4790       &state->gfx_pipeline->shader_state.vertex.stage_state;
4791    struct pvr_csb *const csb = &sub_cmd->control_stream;
4792 
4793    if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
4794       return;
4795 
4796    pvr_csb_set_relocation_mark(csb);
4797 
4798    pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
4799       state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ALL);
4800 
4801       state0.usc_common_size =
4802          DIV_ROUND_UP(vertex_stage_state->const_shared_reg_count << 2,
4803                       PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
4804 
4805       state0.pds_data_size = DIV_ROUND_UP(
4806          PVR_DW_TO_BYTES(vertex_descriptor_state->pds_info.data_size_in_dwords),
4807          PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
4808    }
4809 
4810    pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
4811       state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
4812       state1.sd_type = PVRX(VDMCTRL_SD_TYPE_NONE);
4813    }
4814 
4815    pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
4816       state2.pds_code_addr =
4817          PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
4818    }
4819 
4820    pvr_csb_clear_relocation_mark(csb);
4821 }
4822 
pvr_setup_output_select(struct pvr_cmd_buffer * const cmd_buffer)4823 static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
4824 {
4825    const struct pvr_graphics_pipeline *const gfx_pipeline =
4826       cmd_buffer->state.gfx_pipeline;
4827    const struct pvr_vertex_shader_state *const vertex_state =
4828       &gfx_pipeline->shader_state.vertex;
4829    struct vk_dynamic_graphics_state *const dynamic_state =
4830       &cmd_buffer->vk.dynamic_graphics_state;
4831    struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
4832    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4833    uint32_t output_selects;
4834 
4835    /* TODO: Handle vertex and fragment shader state flags. */
4836 
4837    pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
4838       state.rhw_pres = true;
4839       state.vtxsize = DIV_ROUND_UP(vertex_state->vertex_output_size, 4U);
4840       state.psprite_size_pres = (dynamic_state->ia.primitive_topology ==
4841                                  VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
4842    }
4843 
4844    if (ppp_state->output_selects != output_selects) {
4845       ppp_state->output_selects = output_selects;
4846       header->pres_outselects = true;
4847    }
4848 
4849    if (ppp_state->varying_word[0] != vertex_state->varying[0]) {
4850       ppp_state->varying_word[0] = vertex_state->varying[0];
4851       header->pres_varying_word0 = true;
4852    }
4853 
4854    if (ppp_state->varying_word[1] != vertex_state->varying[1]) {
4855       ppp_state->varying_word[1] = vertex_state->varying[1];
4856       header->pres_varying_word1 = true;
4857    }
4858 }
4859 
4860 static void
pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer * const cmd_buffer,struct PVRX (TA_STATE_ISPA)* const ispa_out)4861 pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
4862                                 struct PVRX(TA_STATE_ISPA) *const ispa_out)
4863 {
4864    struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
4865    const struct pvr_fragment_shader_state *const fragment_shader_state =
4866       &cmd_buffer->state.gfx_pipeline->shader_state.fragment;
4867    const struct pvr_render_pass_info *const pass_info =
4868       &cmd_buffer->state.render_pass_info;
4869    struct vk_dynamic_graphics_state *dynamic_state =
4870       &cmd_buffer->vk.dynamic_graphics_state;
4871    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4872 
4873    const bool rasterizer_discard = dynamic_state->rs.rasterizer_discard_enable;
4874    const uint32_t subpass_idx = pass_info->subpass_idx;
4875    const uint32_t depth_stencil_attachment_idx =
4876       pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment;
4877    const struct pvr_render_pass_attachment *const attachment =
4878       depth_stencil_attachment_idx != VK_ATTACHMENT_UNUSED
4879          ? &pass_info->pass->attachments[depth_stencil_attachment_idx]
4880          : NULL;
4881 
4882    const enum PVRX(TA_OBJTYPE)
4883       obj_type = pvr_ta_objtype(dynamic_state->ia.primitive_topology);
4884 
4885    const VkImageAspectFlags ds_aspects =
4886       (!rasterizer_discard && attachment)
4887          ? vk_format_aspects(attachment->vk_format) &
4888               (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)
4889          : VK_IMAGE_ASPECT_NONE;
4890 
4891    /* This is deliberately a full copy rather than a pointer because
4892     * vk_optimize_depth_stencil_state() can only be run once against any given
4893     * instance of vk_depth_stencil_state.
4894     */
4895    struct vk_depth_stencil_state ds_state = dynamic_state->ds;
4896 
4897    uint32_t ispb_stencil_off;
4898    bool is_two_sided = false;
4899    uint32_t isp_control;
4900 
4901    uint32_t line_width;
4902    uint32_t common_a;
4903    uint32_t front_a;
4904    uint32_t front_b;
4905    uint32_t back_a;
4906    uint32_t back_b;
4907 
4908    vk_optimize_depth_stencil_state(&ds_state, ds_aspects, true);
4909 
4910    /* Convert to 4.4 fixed point format. */
4911    line_width = util_unsigned_fixed(dynamic_state->rs.line.width, 4);
4912 
4913    /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16].
4914     * If 0 it stays at 0, otherwise we subtract 1.
4915     */
4916    line_width = (!!line_width) * (line_width - 1);
4917 
4918    line_width = MIN2(line_width, PVRX(TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX));
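   /* Worked example of the encoding above (illustrative): a line width of 1.0
    * becomes 16 in 4.4 fixed point and 15 after the subtraction, which the
    * shifted [0=1/16, 255=16] range decodes back to 1.0; a width of 0.0 stays
    * at 0, i.e. the minimum of 1/16th.
    */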
4919 
4920    /* TODO: Part of the logic in this function is duplicated in another part
4921     * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
4922     */
4923 
4924    pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
4925       ispa.pointlinewidth = line_width;
4926 
4927       ispa.dcmpmode = pvr_ta_cmpmode(ds_state.depth.compare_op);
4928       ispa.dwritedisable = !ds_state.depth.write_enable;
4929 
4930       ispa.passtype = fragment_shader_state->pass_type;
4931 
4932       ispa.objtype = obj_type;
4933 
4934       /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
4935        * objtype are needed by pvr_setup_triangle_merging_flag.
4936        */
4937       if (ispa_out)
4938          *ispa_out = ispa;
4939    }
4940 
4941    /* TODO: Does this actually represent the ispb control word on stencil off?
4942     * If not, rename the variable.
4943     */
4944    pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
4945       ispb.sop3 = PVRX(TA_ISPB_STENCILOP_KEEP);
4946       ispb.sop2 = PVRX(TA_ISPB_STENCILOP_KEEP);
4947       ispb.sop1 = PVRX(TA_ISPB_STENCILOP_KEEP);
4948       ispb.scmpmode = PVRX(TA_CMPMODE_ALWAYS);
4949    }
4950 
4951    /* FIXME: This logic should be redone and improved. Can we also get rid of
4952     * the front and back variants?
4953     */
4954 
4955    front_a = common_a;
4956    back_a = common_a;
4957 
4958    if (ds_state.stencil.test_enable) {
4959       uint32_t front_a_sref;
4960       uint32_t back_a_sref;
4961 
4962       pvr_csb_pack (&front_a_sref, TA_STATE_ISPA, ispa) {
4963          ispa.sref = ds_state.stencil.front.reference;
4964       }
4965       front_a |= front_a_sref;
4966 
4967       pvr_csb_pack (&back_a_sref, TA_STATE_ISPA, ispa) {
4968          ispa.sref = ds_state.stencil.back.reference;
4969       }
4970       back_a |= back_a_sref;
4971 
4972       pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
4973          const struct vk_stencil_test_face_state *const front =
4974             &ds_state.stencil.front;
4975 
4976          if (ds_state.stencil.write_enable)
4977             ispb.swmask = front->write_mask;
4978 
4979          ispb.scmpmask = front->compare_mask;
4980 
4981          ispb.sop3 = pvr_ta_stencilop(front->op.pass);
4982          ispb.sop2 = pvr_ta_stencilop(front->op.depth_fail);
4983          ispb.sop1 = pvr_ta_stencilop(front->op.fail);
4984          ispb.scmpmode = pvr_ta_cmpmode(front->op.compare);
4985       }
4986 
4987       pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
4988          const struct vk_stencil_test_face_state *const back =
4989             &ds_state.stencil.back;
4990 
4991          if (ds_state.stencil.write_enable)
4992             ispb.swmask = back->write_mask;
4993 
4994          ispb.scmpmask = back->compare_mask;
4995 
4996          ispb.sop3 = pvr_ta_stencilop(back->op.pass);
4997          ispb.sop2 = pvr_ta_stencilop(back->op.depth_fail);
4998          ispb.sop1 = pvr_ta_stencilop(back->op.fail);
4999          ispb.scmpmode = pvr_ta_cmpmode(back->op.compare);
5000       }
5001    } else {
5002       front_b = ispb_stencil_off;
5003       back_b = ispb_stencil_off;
5004    }
5005 
5006    if (front_a != back_a || front_b != back_b) {
5007       if (dynamic_state->rs.cull_mode & VK_CULL_MODE_BACK_BIT) {
5008          /* Single face, using front state. */
5009       } else if (dynamic_state->rs.cull_mode & VK_CULL_MODE_FRONT_BIT) {
5010          /* Single face, using back state. */
5011 
5012          front_a = back_a;
5013          front_b = back_b;
5014       } else {
5015          /* Both faces. */
5016 
5017          header->pres_ispctl_ba = is_two_sided = true;
5018 
5019          if (dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) {
5020             uint32_t tmp = front_a;
5021 
5022             front_a = back_a;
5023             back_a = tmp;
5024 
5025             tmp = front_b;
5026             front_b = back_b;
5027             back_b = tmp;
5028          }
5029 
5030          /* HW defaults to stencil off. */
5031          if (back_b != ispb_stencil_off) {
5032             header->pres_ispctl_fb = true;
5033             header->pres_ispctl_bb = true;
5034          }
5035       }
5036    }
5037 
5038    if (ds_state.stencil.test_enable && front_b != ispb_stencil_off)
5039       header->pres_ispctl_fb = true;
5040 
5041    pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
5042       ispctl.upass = pass_info->isp_userpass;
5043 
5044       /* TODO: is bo ever NULL? Figure out what to do. */
5045       ispctl.tagwritedisable = rasterizer_discard || !fragment_shader_state->bo;
5046 
5047       ispctl.two_sided = is_two_sided;
5048       ispctl.bpres = header->pres_ispctl_fb || header->pres_ispctl_bb;
5049 
5050       ispctl.dbenable = !rasterizer_discard &&
5051                         dynamic_state->rs.depth_bias.enable &&
5052                         obj_type == PVRX(TA_OBJTYPE_TRIANGLE);
5053       if (!rasterizer_discard && cmd_buffer->state.vis_test_enabled) {
5054          ispctl.vistest = true;
5055          ispctl.visreg = cmd_buffer->state.vis_reg;
5056       }
5057 
5058       ispctl.scenable = !rasterizer_discard;
5059 
5060       ppp_state->isp.control_struct = ispctl;
5061    }
5062 
5063    header->pres_ispctl = true;
5064 
5065    ppp_state->isp.control = isp_control;
5066    ppp_state->isp.front_a = front_a;
5067    ppp_state->isp.front_b = front_b;
5068    ppp_state->isp.back_a = back_a;
5069    ppp_state->isp.back_b = back_b;
5070 }
5071 
5072 static float
pvr_calculate_final_depth_bias_contant_factor(struct pvr_device_info * dev_info,VkFormat format,float depth_bias)5073 pvr_calculate_final_depth_bias_contant_factor(struct pvr_device_info *dev_info,
5074                                               VkFormat format,
5075                                               float depth_bias)
5076 {
5077    /* Information for future modifiers of these depth bias calculations.
5078     * ==================================================================
5079     * Specified depth bias equations scale the specified constant factor by a
5080     * value 'r' that is guaranteed to cause a resolvable difference in depth
5081     * across the entire range of depth values.
5082     * For floating point depth formats 'r' is calculated by taking the maximum
5083     * exponent across the triangle.
5084     * For UNORM formats 'r' is constant.
5085     * Here 'n' is the number of mantissa bits stored in the floating point
5086     * representation (23 for F32).
5087     *
5088     *    UNORM Format -> z += dbcf * r + slope
5089     *    FLOAT Format -> z += dbcf * 2^(e-n) + slope
5090     *
5091     * HW Variations.
5092     * ==============
5093     * The HW either always performs the F32 depth bias equation (exponent based
5094     * r), or in the case of HW that correctly supports the integer depth bias
5095     * equation for UNORM depth formats, we can select between both equations
5096     * using the ROGUE_CR_ISP_CTL.dbias_is_int flag - this is required to
5097     * correctly perform Vulkan UNORM depth bias (constant r).
5098     *
5099     *    if ern42307:
5100     *       if DBIAS_IS_INT_EN:
5101     *          z += dbcf + slope
5102     *       else:
5103     *          z += dbcf * 2^(e-n) + slope
5104     *    else:
5105     *       z += dbcf * 2^(e-n) + slope
5106     *
5107     */
5108 
5109    float nudge_factor;
5110 
5111    if (PVR_HAS_ERN(dev_info, 42307)) {
5112       switch (format) {
5113       case VK_FORMAT_D16_UNORM:
5114          return depth_bias / (1 << 15);
5115 
5116       case VK_FORMAT_D24_UNORM_S8_UINT:
5117       case VK_FORMAT_X8_D24_UNORM_PACK32:
5118          return depth_bias / (1 << 23);
5119 
5120       default:
5121          return depth_bias;
5122       }
5123    }
5124 
5125    /* The reason for clamping/nudging the value here is that UNORM depth
5126     * formats can have higher precision than our underlying D32F
5127     * representation for some depth ranges.
5128     *
5129     * When the HW scales the depth bias value by 2^(e-n) [the 'r' term], a depth
5130     * bias of 1 can result in a value smaller than one F32 ULP, which will get
5131     * quantized to 0 - resulting in no bias.
5132     *
5133     * Biasing small values away from zero will ensure that small depth biases of
5134     * 1 still yield a result and overcome Z-fighting.
5135     */
5136    switch (format) {
5137    case VK_FORMAT_D16_UNORM:
5138       depth_bias *= 512.0f;
5139       nudge_factor = 1.0f;
5140       break;
5141 
5142    case VK_FORMAT_D24_UNORM_S8_UINT:
5143    case VK_FORMAT_X8_D24_UNORM_PACK32:
5144       depth_bias *= 2.0f;
5145       nudge_factor = 2.0f;
5146       break;
5147 
5148    default:
5149       nudge_factor = 0.0f;
5150       break;
5151    }
5152 
5153    if (nudge_factor != 0.0f) {
5154       if (depth_bias < 0.0f && depth_bias > -nudge_factor)
5155          depth_bias -= nudge_factor;
5156       else if (depth_bias > 0.0f && depth_bias < nudge_factor)
5157          depth_bias += nudge_factor;
5158    }
5159 
5160    return depth_bias;
5161 }
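/* Worked example of the non-ERN nudge path above (illustrative): for
 * VK_FORMAT_D16_UNORM a constant factor of 0.001 is first scaled to 0.512,
 * which is below the nudge factor of 1.0 and is therefore nudged to 1.512,
 * while a constant factor of 2.0 scales to 1024.0 and is left alone. On the
 * ERN 42307 path the same D16 constant factor is simply divided by 2^15.
 */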
5162 
pvr_get_viewport_scissor_overlap(const VkViewport * const viewport,const VkRect2D * const scissor,VkRect2D * const rect_out)5163 static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
5164                                              const VkRect2D *const scissor,
5165                                              VkRect2D *const rect_out)
5166 {
5167    /* TODO: See if we can remove this struct. */
5168    struct pvr_rect {
5169       int32_t x0, y0;
5170       int32_t x1, y1;
5171    };
5172 
5173    /* TODO: Worry about overflow? */
5174    const struct pvr_rect scissor_rect = {
5175       .x0 = scissor->offset.x,
5176       .y0 = scissor->offset.y,
5177       .x1 = scissor->offset.x + scissor->extent.width,
5178       .y1 = scissor->offset.y + scissor->extent.height
5179    };
5180    struct pvr_rect viewport_rect = { 0 };
5181 
5182    assert(viewport->width >= 0.0f);
5183    assert(scissor_rect.x0 >= 0);
5184    assert(scissor_rect.y0 >= 0);
5185 
5186    if (scissor->extent.width == 0 || scissor->extent.height == 0) {
5187       *rect_out = (VkRect2D){ 0 };
5188       return;
5189    }
5190 
5191    viewport_rect.x0 = (int32_t)viewport->x;
5192    viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
5193 
5194    /* TODO: Is there a mathematical way of doing all this and then clamping
5195     * at the end?
5196     */
5197    /* We flip the y0 and y1 when height is negative. */
5198    viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
5199    viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
5200 
5201    if (scissor_rect.x1 <= viewport_rect.x0 ||
5202        scissor_rect.y1 <= viewport_rect.y0 ||
5203        scissor_rect.x0 >= viewport_rect.x1 ||
5204        scissor_rect.y0 >= viewport_rect.y1) {
5205       *rect_out = (VkRect2D){ 0 };
5206       return;
5207    }
5208 
5209    /* Determine the overlapping rectangle. */
5210    viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
5211    viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
5212    viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
5213    viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
5214 
5215    /* TODO: Is this conversion safe? Is this logic right? */
5216    rect_out->offset.x = (uint32_t)viewport_rect.x0;
5217    rect_out->offset.y = (uint32_t)viewport_rect.y0;
5218    rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
5219    rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
5220 }
5221 
5222 static inline uint32_t
5223 pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
5224 {
5225    /* TODO: This should come from rogue_ppp.xml. */
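   /* i.e. 16 when the tile_size_16x16 feature is present, 32 otherwise. */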
5226    return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
5227 }
5228 
5229 static void
5230 pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
5231 {
5232    struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
5233    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5234    struct vk_dynamic_graphics_state *const dynamic_state =
5235       &cmd_buffer->vk.dynamic_graphics_state;
5236    const struct PVRX(TA_STATE_ISPCTL) *const ispctl =
5237       &ppp_state->isp.control_struct;
5238    struct pvr_device_info *const dev_info =
5239       &cmd_buffer->device->pdevice->dev_info;
5240 
5241    if (ispctl->dbenable &&
5242        (BITSET_TEST(dynamic_state->dirty,
5243                     MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5244         cmd_buffer->depth_bias_array.size == 0)) {
5245       struct pvr_depth_bias_state depth_bias = {
5246          .constant_factor = pvr_calculate_final_depth_bias_contant_factor(
5247             dev_info,
5248             cmd_buffer->state.depth_format,
5249             dynamic_state->rs.depth_bias.constant),
5250          .slope_factor = dynamic_state->rs.depth_bias.slope,
5251          .clamp = dynamic_state->rs.depth_bias.clamp,
5252       };
5253 
5254       ppp_state->depthbias_scissor_indices.depthbias_index =
5255          util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
5256                                     __typeof__(depth_bias));
5257 
5258       util_dynarray_append(&cmd_buffer->depth_bias_array,
5259                            __typeof__(depth_bias),
5260                            depth_bias);
5261 
5262       header->pres_ispctl_dbsc = true;
5263    }
5264 
5265    if (ispctl->scenable) {
5266       const uint32_t region_clip_align_size =
5267          pvr_get_geom_region_clip_align_size(dev_info);
5268       const VkViewport *const viewport = &dynamic_state->vp.viewports[0];
5269       const VkRect2D *const scissor = &dynamic_state->vp.scissors[0];
5270       struct pvr_scissor_words scissor_words;
5271       VkRect2D overlap_rect;
5272       uint32_t height;
5273       uint32_t width;
5274       uint32_t x;
5275       uint32_t y;
5276 
5277       /* For region clip. */
5278       uint32_t bottom;
5279       uint32_t right;
5280       uint32_t left;
5281       uint32_t top;
5282 
5283       /* We don't support multiple viewport calculations. */
5284       assert(dynamic_state->vp.viewport_count == 1);
5285       /* We don't support multiple scissor calculations. */
5286       assert(dynamic_state->vp.scissor_count == 1);
5287 
5288       pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
5289 
5290       x = overlap_rect.offset.x;
5291       y = overlap_rect.offset.y;
5292       width = overlap_rect.extent.width;
5293       height = overlap_rect.extent.height;
5294 
5295       pvr_csb_pack (&scissor_words.w0, IPF_SCISSOR_WORD_0, word0) {
5296          word0.scw0_xmax = x + width;
5297          word0.scw0_xmin = x;
5298       }
5299 
5300       pvr_csb_pack (&scissor_words.w1, IPF_SCISSOR_WORD_1, word1) {
5301          word1.scw1_ymax = y + height;
5302          word1.scw1_ymin = y;
5303       }
5304 
5305       if (cmd_buffer->scissor_array.size &&
5306           cmd_buffer->scissor_words.w0 == scissor_words.w0 &&
5307           cmd_buffer->scissor_words.w1 == scissor_words.w1) {
5308          return;
5309       }
5310 
5311       cmd_buffer->scissor_words = scissor_words;
5312 
5313       /* Calculate region clip. */
5314 
5315       left = x / region_clip_align_size;
5316       top = y / region_clip_align_size;
5317 
5318       /* Guard against (x + width) == 0, which would underflow right to -1. */
5319       /* TODO: Is there a better way of doing this? */
5320       if ((x + width) != 0U)
5321          right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
5322       else
5323          right = 0;
5324 
5325       if ((y + height) != 0U)
5326          bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
5327       else
5328          bottom = 0U;
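      /* Worked example (assumed values): with a 32-pixel alignment,
       * x = 10 and width = 100 give left = 10 / 32 = 0 and
       * right = DIV_ROUND_UP(110, 32) - 1 = 3, so the clip keeps aligned
       * columns 0..3.
       */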
5329 
5330       /* Setup region clip to clip everything outside what was calculated. */
5331       /* Set up region clip to clip everything outside what was calculated. */
5332       /* FIXME: Should we mask to prevent writing over other words? */
5333       pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
5334          word0.right = right;
5335          word0.left = left;
5336          word0.mode = PVRX(TA_REGION_CLIP_MODE_OUTSIDE);
5337       }
5338 
5339       pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
5340          word1.bottom = bottom;
5341          word1.top = top;
5342       }
5343 
5344       ppp_state->depthbias_scissor_indices.scissor_index =
5345          util_dynarray_num_elements(&cmd_buffer->scissor_array,
5346                                     struct pvr_scissor_words);
5347 
5348       util_dynarray_append(&cmd_buffer->scissor_array,
5349                            struct pvr_scissor_words,
5350                            cmd_buffer->scissor_words);
5351 
5352       header->pres_ispctl_dbsc = true;
5353       header->pres_region_clip = true;
5354    }
5355 }
5356 
5357 static void
5358 pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
5359                                 struct PVRX(TA_STATE_ISPA) * ispa)
5360 {
5361    struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
5362    struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5363    uint32_t merge_word;
5364    uint32_t mask;
5365 
5366    pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
5367       /* Disable triangle merging for lines, punch-through passes, or when
5368        * depth writes are disabled (DWD) and the depth compare mode is ALWAYS.
5369        */
5370       if (ispa->objtype == PVRX(TA_OBJTYPE_LINE) ||
5371           ispa->passtype == PVRX(TA_PASSTYPE_PUNCH_THROUGH) ||
5372           (ispa->dwritedisable && ispa->dcmpmode == PVRX(TA_CMPMODE_ALWAYS))) {
5373          size_info.pds_tri_merge_disable = true;
5374       }
5375    }
5376 
5377    pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
5378       size_info.pds_tri_merge_disable = true;
5379    }
5380 
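   /* Preserve every other SIZEINFO2 field from the current PPP state; only
    * pds_tri_merge_disable is recomputed here.
    */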
5381    merge_word |= ppp_state->pds.size_info2 & ~mask;
5382 
5383    if (merge_word != ppp_state->pds.size_info2) {
5384       ppp_state->pds.size_info2 = merge_word;
5385       header->pres_pds_state_ptr0 = true;
5386    }
5387 }
5388 
5389 static void
5390 pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
5391                                   struct pvr_sub_cmd_gfx *const sub_cmd)
5392 {
5393    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5394 
5395    const struct pvr_fragment_shader_state *const fragment =
5396       &state->gfx_pipeline->shader_state.fragment;
5397    const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
5398       &fragment->descriptor_state;
5399    const struct pvr_pipeline_stage_state *fragment_state =
5400       &fragment->stage_state;
5401    const struct pvr_pds_upload *pds_coeff_program =
5402       &fragment->pds_coeff_program;
5403 
5404    const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
5405    struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5406    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5407 
5408    const uint32_t pds_uniform_size =
5409       DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
5410                    PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE));
5411 
5412    const uint32_t pds_varying_state_size =
5413       DIV_ROUND_UP(pds_coeff_program->data_size,
5414                    PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE));
5415 
5416    const uint32_t usc_varying_size =
5417       DIV_ROUND_UP(fragment_state->coefficient_size,
5418                    PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
5419 
5420    const uint32_t pds_temp_size =
5421       DIV_ROUND_UP(fragment_state->pds_temps_count,
5422                    PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
5423 
5424    const uint32_t usc_shared_size =
5425       DIV_ROUND_UP(fragment_state->const_shared_reg_count,
5426                    PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
5427 
5428    const uint32_t max_tiles_in_flight =
5429       pvr_calc_fscommon_size_and_tiles_in_flight(
5430          &pdevice->dev_info,
5431          &pdevice->dev_runtime_info,
5432          usc_shared_size *
5433             PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE),
5434          1);
5435    uint32_t size_info_mask;
5436    uint32_t size_info2;
5437 
5438    if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
5439       sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
5440 
5441    pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
5442                  TA_STATE_PDS_SHADERBASE,
5443                  shader_base) {
5444       const struct pvr_pds_upload *const pds_upload =
5445          &fragment->pds_fragment_program;
5446 
5447       shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
5448    }
5449 
5450    if (descriptor_shader_state->pds_code.pvr_bo) {
5451       pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
5452                     TA_STATE_PDS_TEXUNICODEBASE,
5453                     tex_base) {
5454          tex_base.addr =
5455             PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
5456       }
5457    } else {
5458       ppp_state->pds.texture_uniform_code_base = 0U;
5459    }
5460 
5461    pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
5462       info1.pds_uniformsize = pds_uniform_size;
5463       info1.pds_texturestatesize = 0U;
5464       info1.pds_varyingsize = pds_varying_state_size;
5465       info1.usc_varyingsize = usc_varying_size;
5466       info1.pds_tempsize = pds_temp_size;
5467    }
5468 
5469    pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
5470       mask.pds_tri_merge_disable = true;
5471    }
5472 
5473    ppp_state->pds.size_info2 &= size_info_mask;
5474 
5475    pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
5476       info2.usc_sharedsize = usc_shared_size;
5477    }
5478 
5479    ppp_state->pds.size_info2 |= size_info2;
5480 
5481    if (pds_coeff_program->pvr_bo) {
5482       header->pres_pds_state_ptr1 = true;
5483 
5484       pvr_csb_pack (&ppp_state->pds.varying_base,
5485                     TA_STATE_PDS_VARYINGBASE,
5486                     base) {
5487          base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset);
5488       }
5489    } else {
5490       ppp_state->pds.varying_base = 0U;
5491    }
5492 
5493    pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
5494                  TA_STATE_PDS_UNIFORMDATABASE,
5495                  base) {
5496       base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
5497    }
5498 
5499    header->pres_pds_state_ptr0 = true;
5500    header->pres_pds_state_ptr3 = true;
5501 }
5502 
5503 static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
5504 {
5505    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5506    struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5507    struct vk_dynamic_graphics_state *const dynamic_state =
5508       &cmd_buffer->vk.dynamic_graphics_state;
5509    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5510 
5511    if (ppp_state->viewport_count != dynamic_state->vp.viewport_count) {
5512       ppp_state->viewport_count = dynamic_state->vp.viewport_count;
5513       header->pres_viewport = true;
5514    }
5515 
5516    if (dynamic_state->rs.rasterizer_discard_enable) {
5517       /* We don't want to emit any viewport data as it'll just get thrown
5518        * away. It's after the previous condition because we still want to
5519        * stash the viewport_count as it's our trigger for when
5520        * rasterizer discard gets disabled.
5521        */
5522       header->pres_viewport = false;
5523       return;
5524    }
5525 
5526    for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
5527       VkViewport *viewport = &dynamic_state->vp.viewports[i];
5528       uint32_t x_scale = fui(viewport->width * 0.5f);
5529       uint32_t y_scale = fui(viewport->height * 0.5f);
5530       uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
5531       uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
5532       uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
5533       uint32_t z_center = fui(viewport->minDepth);
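      /* e.g. (illustrative values): a 1920x1080 viewport at the origin with
       * depth range [0, 1] gives x_scale = 960, x_center = 960, y_scale = 540,
       * y_center = 540, z_scale = 1 and z_center = 0, all stored as raw float
       * bits via fui().
       */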
5534 
5535       if (ppp_state->viewports[i].a0 != x_center ||
5536           ppp_state->viewports[i].m0 != x_scale ||
5537           ppp_state->viewports[i].a1 != y_center ||
5538           ppp_state->viewports[i].m1 != y_scale ||
5539           ppp_state->viewports[i].a2 != z_center ||
5540           ppp_state->viewports[i].m2 != z_scale) {
5541          ppp_state->viewports[i].a0 = x_center;
5542          ppp_state->viewports[i].m0 = x_scale;
5543          ppp_state->viewports[i].a1 = y_center;
5544          ppp_state->viewports[i].m1 = y_scale;
5545          ppp_state->viewports[i].a2 = z_center;
5546          ppp_state->viewports[i].m2 = z_scale;
5547 
5548          header->pres_viewport = true;
5549       }
5550    }
5551 }
5552 
5553 static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
5554 {
5555    struct vk_dynamic_graphics_state *const dynamic_state =
5556       &cmd_buffer->vk.dynamic_graphics_state;
5557    const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology;
5558    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5559    struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5560    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5561    uint32_t ppp_control;
5562 
5563    pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
5564       control.drawclippededges = true;
5565       control.wclampen = true;
5566 
5567       if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
5568          control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_1);
5569       else
5570          control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_0);
5571 
5572       if (dynamic_state->rs.depth_clamp_enable)
5573          control.clip_mode = PVRX(TA_CLIP_MODE_NO_FRONT_OR_REAR);
5574       else
5575          control.clip_mode = PVRX(TA_CLIP_MODE_FRONT_REAR);
5576 
5577       /* +--- FrontIsCCW?
5578        * | +--- Cull Front?
5579        * v v
5580        * 0|0 CULLMODE_CULL_CCW,
5581        * 0|1 CULLMODE_CULL_CW,
5582        * 1|0 CULLMODE_CULL_CW,
5583        * 1|1 CULLMODE_CULL_CCW,
5584        */
5585       switch (dynamic_state->rs.cull_mode) {
5586       case VK_CULL_MODE_BACK_BIT:
5587       case VK_CULL_MODE_FRONT_BIT:
5588          if ((dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
5589              (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_BIT)) {
5590             control.cullmode = PVRX(TA_CULLMODE_CULL_CW);
5591          } else {
5592             control.cullmode = PVRX(TA_CULLMODE_CULL_CCW);
5593          }
5594 
5595          break;
5596 
5597       case VK_CULL_MODE_FRONT_AND_BACK:
5598       case VK_CULL_MODE_NONE:
5599          control.cullmode = PVRX(TA_CULLMODE_NO_CULLING);
5600          break;
5601 
5602       default:
5603          unreachable("Unsupported cull mode!");
5604       }
5605    }
5606 
5607    if (ppp_control != ppp_state->ppp_control) {
5608       ppp_state->ppp_control = ppp_control;
5609       header->pres_ppp_ctrl = true;
5610    }
5611 }
5612 
5613 /* Largest valid PPP State update in words = 31
5614  * 1 - Header
5615  * 3 - Stream Out Config words 0, 1 and 2
5616  * 1 - PPP Control word
5617  * 3 - Varying Config words 0, 1 and 2
5618  * 1 - Output Select
5619  * 1 - WClamp
5620  * 6 - Viewport Transform words
5621  * 2 - Region Clip words
5622  * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
5623  * 4 - PDS State for fragment phase (PDSSTATEPTR0)
5624  * 6 - ISP Control Words
5625  */
5626 #define PVR_MAX_PPP_STATE_DWORDS 31
5627 
5628 static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5629                                    struct pvr_sub_cmd_gfx *const sub_cmd)
5630 {
5631    const bool deferred_secondary = pvr_cmd_uses_deferred_cs_cmds(cmd_buffer);
5632    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5633    struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5634    struct pvr_csb *const control_stream = &sub_cmd->control_stream;
5635    struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5636    uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
5637    const bool emit_dbsc = header->pres_ispctl_dbsc;
5638    uint32_t *buffer_ptr = ppp_state_words;
5639    uint32_t dbsc_patching_offset = 0;
5640    uint32_t ppp_state_words_count;
5641    struct pvr_suballoc_bo *pvr_bo;
5642    VkResult result;
5643 
5644 #if !defined(NDEBUG)
5645    struct PVRX(TA_STATE_HEADER) emit_mask = *header;
5646    uint32_t packed_emit_mask;
5647 
5648    static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5649                  "EMIT_MASK_IS_CLEAR assumes 1 dword sized header.");
5650 
5651 #   define EMIT_MASK_GET(field) (emit_mask.field)
5652 #   define EMIT_MASK_SET(field, value) (emit_mask.field = (value))
5653 #   define EMIT_MASK_IS_CLEAR                                        \
5654       (pvr_cmd_pack(TA_STATE_HEADER)(&packed_emit_mask, &emit_mask), \
5655        packed_emit_mask == 0)
5656 #else
5657 #   define EMIT_MASK_GET(field)
5658 #   define EMIT_MASK_SET(field, value)
5659 #endif
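   /* In debug builds emit_mask starts as a copy of the header and each block
    * emitted below clears the corresponding bit, so the
    * assert(EMIT_MASK_IS_CLEAR) at the end verifies that every field flagged
    * in the header was actually written.
    */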
5660 
5661    header->view_port_count =
5662       (ppp_state->viewport_count == 0) ? 0U : (ppp_state->viewport_count - 1);
5663    header->pres_ispctl_fa = header->pres_ispctl;
5664 
5665    /* If deferred_secondary is true then we do a separate state update
5666     * which gets patched in vkCmdExecuteCommands().
5667     */
5668    header->pres_ispctl_dbsc &= !deferred_secondary;
5669 
5670    pvr_csb_write_struct(buffer_ptr, TA_STATE_HEADER, header);
5671 
5672    static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5673                  "Following header check assumes 1 dword sized header.");
5674    /* If the header is empty we exit early and prevent a bo alloc of 0 size. */
5675    if (ppp_state_words[0] == 0)
5676       return VK_SUCCESS;
5677 
5678    if (header->pres_ispctl) {
5679       pvr_csb_write_value(buffer_ptr, TA_STATE_ISPCTL, ppp_state->isp.control);
5680 
5681       assert(header->pres_ispctl_fa);
5682       /* This is not a mistake. FA, BA have the ISPA format, and FB, BB have the
5683        * ISPB format.
5684        */
5685       pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.front_a);
5686       EMIT_MASK_SET(pres_ispctl_fa, false);
5687 
5688       if (header->pres_ispctl_fb) {
5689          pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.front_b);
5690          EMIT_MASK_SET(pres_ispctl_fb, false);
5691       }
5692 
5693       if (header->pres_ispctl_ba) {
5694          pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.back_a);
5695          EMIT_MASK_SET(pres_ispctl_ba, false);
5696       }
5697 
5698       if (header->pres_ispctl_bb) {
5699          pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.back_b);
5700          EMIT_MASK_SET(pres_ispctl_bb, false);
5701       }
5702 
5703       EMIT_MASK_SET(pres_ispctl, false);
5704    }
5705 
5706    if (header->pres_ispctl_dbsc) {
5707       assert(!deferred_secondary);
5708 
5709       dbsc_patching_offset = buffer_ptr - ppp_state_words;
5710 
5711       pvr_csb_pack (buffer_ptr, TA_STATE_ISPDBSC, ispdbsc) {
5712          ispdbsc.dbindex = ppp_state->depthbias_scissor_indices.depthbias_index;
5713          ispdbsc.scindex = ppp_state->depthbias_scissor_indices.scissor_index;
5714       }
5715       buffer_ptr += pvr_cmd_length(TA_STATE_ISPDBSC);
5716 
5717       EMIT_MASK_SET(pres_ispctl_dbsc, false);
5718    }
5719 
5720    if (header->pres_pds_state_ptr0) {
5721       pvr_csb_write_value(buffer_ptr,
5722                           TA_STATE_PDS_SHADERBASE,
5723                           ppp_state->pds.pixel_shader_base);
5724 
5725       pvr_csb_write_value(buffer_ptr,
5726                           TA_STATE_PDS_TEXUNICODEBASE,
5727                           ppp_state->pds.texture_uniform_code_base);
5728 
5729       pvr_csb_write_value(buffer_ptr,
5730                           TA_STATE_PDS_SIZEINFO1,
5731                           ppp_state->pds.size_info1);
5732       pvr_csb_write_value(buffer_ptr,
5733                           TA_STATE_PDS_SIZEINFO2,
5734                           ppp_state->pds.size_info2);
5735 
5736       EMIT_MASK_SET(pres_pds_state_ptr0, false);
5737    }
5738 
5739    if (header->pres_pds_state_ptr1) {
5740       pvr_csb_write_value(buffer_ptr,
5741                           TA_STATE_PDS_VARYINGBASE,
5742                           ppp_state->pds.varying_base);
5743       EMIT_MASK_SET(pres_pds_state_ptr1, false);
5744    }
5745 
5746    /* We don't emit the pds_state_ptr2 (texture state programs) control word,
5747     * but that doesn't mean we need to zero it: the hardware only runs the
5748     * texture state program when
5749     * ROGUE_TA_STATE_PDS_SIZEINFO1.pds_texturestatesize is non-zero.
5750     */
5751    assert(pvr_csb_unpack(&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1)
5752              .pds_texturestatesize == 0);
5753 
5754    if (header->pres_pds_state_ptr3) {
5755       pvr_csb_write_value(buffer_ptr,
5756                           TA_STATE_PDS_UNIFORMDATABASE,
5757                           ppp_state->pds.uniform_state_data_base);
5758       EMIT_MASK_SET(pres_pds_state_ptr3, false);
5759    }
5760 
5761    if (header->pres_region_clip) {
5762       pvr_csb_write_value(buffer_ptr,
5763                           TA_REGION_CLIP0,
5764                           ppp_state->region_clipping.word0);
5765       pvr_csb_write_value(buffer_ptr,
5766                           TA_REGION_CLIP1,
5767                           ppp_state->region_clipping.word1);
5768 
5769       EMIT_MASK_SET(pres_region_clip, false);
5770    }
5771 
5772    if (header->pres_viewport) {
5773       const uint32_t viewports = MAX2(1, ppp_state->viewport_count);
5774       EMIT_MASK_SET(view_port_count, viewports);
5775 
5776       for (uint32_t i = 0; i < viewports; i++) {
5777          /* These don't have any definitions in the csbgen xml files and none
5778           * will be added.
5779           */
5780          *buffer_ptr++ = ppp_state->viewports[i].a0;
5781          *buffer_ptr++ = ppp_state->viewports[i].m0;
5782          *buffer_ptr++ = ppp_state->viewports[i].a1;
5783          *buffer_ptr++ = ppp_state->viewports[i].m1;
5784          *buffer_ptr++ = ppp_state->viewports[i].a2;
5785          *buffer_ptr++ = ppp_state->viewports[i].m2;
5786 
5787          EMIT_MASK_SET(view_port_count, EMIT_MASK_GET(view_port_count) - 1);
5788       }
5789 
5790       EMIT_MASK_SET(pres_viewport, false);
5791    }
5792 
5793    if (header->pres_wclamp) {
5794       pvr_csb_pack (buffer_ptr, TA_WCLAMP, wclamp) {
5795          wclamp.val = fui(0.00001f);
5796       }
5797       buffer_ptr += pvr_cmd_length(TA_WCLAMP);
5798       EMIT_MASK_SET(pres_wclamp, false);
5799    }
5800 
5801    if (header->pres_outselects) {
5802       pvr_csb_write_value(buffer_ptr, TA_OUTPUT_SEL, ppp_state->output_selects);
5803       EMIT_MASK_SET(pres_outselects, false);
5804    }
5805 
5806    if (header->pres_varying_word0) {
5807       pvr_csb_write_value(buffer_ptr,
5808                           TA_STATE_VARYING0,
5809                           ppp_state->varying_word[0]);
5810       EMIT_MASK_SET(pres_varying_word0, false);
5811    }
5812 
5813    if (header->pres_varying_word1) {
5814       pvr_csb_write_value(buffer_ptr,
5815                           TA_STATE_VARYING1,
5816                           ppp_state->varying_word[1]);
5817       EMIT_MASK_SET(pres_varying_word1, false);
5818    }
5819 
5820    /* We only emit this on the first draw of a render job to prevent us from
5821     * inheriting a non-zero value set elsewhere.
5822     */
5823    if (header->pres_varying_word2) {
5824       pvr_csb_write_value(buffer_ptr, TA_STATE_VARYING2, 0);
5825       EMIT_MASK_SET(pres_varying_word2, false);
5826    }
5827 
5828    if (header->pres_ppp_ctrl) {
5829       pvr_csb_write_value(buffer_ptr,
5830                           TA_STATE_PPP_CTRL,
5831                           ppp_state->ppp_control);
5832       EMIT_MASK_SET(pres_ppp_ctrl, false);
5833    }
5834 
5835    /* We only emit this on the first draw of a render job to prevent us from
5836     * inheriting a non-zero value set elsewhere.
5837     */
5838    if (header->pres_stream_out_size) {
5839       pvr_csb_write_value(buffer_ptr, TA_STATE_STREAM_OUT0, 0);
5840       EMIT_MASK_SET(pres_stream_out_size, false);
5841    }
5842 
5843    assert(EMIT_MASK_IS_CLEAR);
5844 
5845 #undef EMIT_MASK_GET
5846 #undef EMIT_MASK_SET
5847 #if !defined(NDEBUG)
5848 #   undef EMIT_MASK_IS_CLEAR
5849 #endif
5850 
5851    ppp_state_words_count = buffer_ptr - ppp_state_words;
5852    assert(ppp_state_words_count <= PVR_MAX_PPP_STATE_DWORDS);
5853 
5854    result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
5855                                      cmd_buffer->device->heaps.general_heap,
5856                                      PVR_DW_TO_BYTES(ppp_state_words_count),
5857                                      &pvr_bo);
5858    if (result != VK_SUCCESS)
5859       return result;
5860 
5861    memcpy(pvr_bo_suballoc_get_map_addr(pvr_bo),
5862           ppp_state_words,
5863           PVR_DW_TO_BYTES(ppp_state_words_count));
5864 
5865    pvr_csb_set_relocation_mark(control_stream);
5866 
5867    /* Write the VDM state update into the VDM control stream. */
5868    pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
5869       state0.word_count = ppp_state_words_count;
5870       state0.addrmsb = pvr_bo->dev_addr;
5871    }
5872 
5873    pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
5874       state1.addrlsb = pvr_bo->dev_addr;
5875    }
5876 
5877    pvr_csb_clear_relocation_mark(control_stream);
5878 
5879    if (emit_dbsc && cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
5880       struct pvr_deferred_cs_command cmd;
5881 
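      /* For deferred secondaries we reserve the VDM words in the control
       * stream now and record the depth bias/scissor indices so they can be
       * patched in vkCmdExecuteCommands(); otherwise the indices are patched
       * directly into the PPP state buffer at dbsc_patching_offset.
       */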
5882       if (deferred_secondary) {
5883          const uint32_t num_dwords = pvr_cmd_length(VDMCTRL_PPP_STATE0) +
5884                                      pvr_cmd_length(VDMCTRL_PPP_STATE1);
5885          uint32_t *vdm_state;
5886 
5887          pvr_csb_set_relocation_mark(control_stream);
5888 
5889          vdm_state = pvr_csb_alloc_dwords(control_stream, num_dwords);
5890          if (!vdm_state) {
5891             result = pvr_csb_get_status(control_stream);
5892             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
5893          }
5894 
5895          pvr_csb_clear_relocation_mark(control_stream);
5896 
5897          cmd = (struct pvr_deferred_cs_command){
5898             .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC,
5899             .dbsc = {
5900                .state = ppp_state->depthbias_scissor_indices,
5901                .vdm_state = vdm_state,
5902             },
5903          };
5904       } else {
5905          cmd = (struct pvr_deferred_cs_command){
5906             .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2,
5907             .dbsc2 = {
5908                .state = ppp_state->depthbias_scissor_indices,
5909                .ppp_cs_bo = pvr_bo,
5910                .patch_offset = dbsc_patching_offset,
5911             },
5912          };
5913       }
5914 
5915       util_dynarray_append(&cmd_buffer->deferred_csb_commands,
5916                            struct pvr_deferred_cs_command,
5917                            cmd);
5918    }
5919 
5920    state->emit_header = (struct PVRX(TA_STATE_HEADER)){ 0 };
5921 
5922    return VK_SUCCESS;
5923 }
5924 
5925 static inline bool
5926 pvr_ppp_state_update_required(const struct pvr_cmd_buffer *cmd_buffer)
5927 {
5928    const BITSET_WORD *const dynamic_dirty =
5929       cmd_buffer->vk.dynamic_graphics_state.dirty;
5930    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5931    const struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5932 
5933    /* For push constants we only need to worry about updates to the fragment
5934     * stage, since we are only updating the PDS programs used in the fragment
5935     * stage.
5936     */
5937 
5938    return header->pres_ppp_ctrl || header->pres_ispctl ||
5939           header->pres_ispctl_fb || header->pres_ispctl_ba ||
5940           header->pres_ispctl_bb || header->pres_ispctl_dbsc ||
5941           header->pres_pds_state_ptr0 || header->pres_pds_state_ptr1 ||
5942           header->pres_pds_state_ptr2 || header->pres_pds_state_ptr3 ||
5943           header->pres_region_clip || header->pres_viewport ||
5944           header->pres_wclamp || header->pres_outselects ||
5945           header->pres_varying_word0 || header->pres_varying_word1 ||
5946           header->pres_varying_word2 || header->pres_stream_out_program ||
5947           state->dirty.fragment_descriptors || state->dirty.vis_test ||
5948           state->dirty.gfx_pipeline_binding || state->dirty.isp_userpass ||
5949           state->push_constants.dirty_stages & VK_SHADER_STAGE_FRAGMENT_BIT ||
5950           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5951           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5952           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5953           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
5954           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5955           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5956           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
5957           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
5958           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
5959           BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
5960 }
5961 
5962 static VkResult
5963 pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5964                          struct pvr_sub_cmd_gfx *const sub_cmd)
5965 {
5966    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5967    struct vk_dynamic_graphics_state *const dynamic_state =
5968       &cmd_buffer->vk.dynamic_graphics_state;
5969    VkResult result;
5970 
5971    /* TODO: The emit_header will be dirty only if
5972     * pvr_reset_graphics_dirty_state() was called before this (so when command
5973     * buffer begins recording or when it's reset). Otherwise it will have been
5974     * zeroed out by the previous pvr_emit_ppp_state(). We can probably set a
5975     * flag in there and check it here instead of checking the header.
5976     * Check if this is true and implement the flag.
5977     */
5978    if (!pvr_ppp_state_update_required(cmd_buffer))
5979       return VK_SUCCESS;
5980 
5981    if (state->dirty.gfx_pipeline_binding) {
5982       struct PVRX(TA_STATE_ISPA) ispa;
5983 
5984       pvr_setup_output_select(cmd_buffer);
5985       pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
5986       pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
5987    } else if (BITSET_TEST(dynamic_state->dirty,
5988                           MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5989               BITSET_TEST(dynamic_state->dirty,
5990                           MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5991               BITSET_TEST(dynamic_state->dirty,
5992                           MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5993               BITSET_TEST(dynamic_state->dirty,
5994                           MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5995               state->dirty.isp_userpass || state->dirty.vis_test) {
5996       pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
5997    }
5998 
5999    if (!dynamic_state->rs.rasterizer_discard_enable &&
6000        state->dirty.fragment_descriptors &&
6001        state->gfx_pipeline->shader_state.fragment.bo) {
6002       pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
6003    }
6004 
6005    pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);
6006 
6007    if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
6008        BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
6009       pvr_setup_viewport(cmd_buffer);
6010 
6011    pvr_setup_ppp_control(cmd_buffer);
6012 
6013    /* The hardware doesn't have an explicit mode for this so we use a
6014     * negative viewport to make sure all objects are culled out early.
6015     */
6016    if (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
6017       /* Shift the viewport out of the guard-band, culling everything. */
6018       const uint32_t negative_vp_val = fui(-2.0f);
6019 
6020       state->ppp_state.viewports[0].a0 = negative_vp_val;
6021       state->ppp_state.viewports[0].m0 = 0;
6022       state->ppp_state.viewports[0].a1 = negative_vp_val;
6023       state->ppp_state.viewports[0].m1 = 0;
6024       state->ppp_state.viewports[0].a2 = negative_vp_val;
6025       state->ppp_state.viewports[0].m2 = 0;
6026 
6027       state->ppp_state.viewport_count = 1;
6028 
6029       state->emit_header.pres_viewport = true;
6030    }
6031 
6032    result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
6033    if (result != VK_SUCCESS)
6034       return result;
6035 
6036    return VK_SUCCESS;
6037 }
6038 
6039 void pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info,
6040                                    const uint32_t vs_output_size,
6041                                    const bool raster_enable,
6042                                    uint32_t *const cam_size_out,
6043                                    uint32_t *const vs_max_instances_out)
6044 {
6045    /* First work out the size of a vertex in the UVS and multiply by 4 for
6046     * column ordering.
6047     */
6048    const uint32_t uvs_vertex_vector_size_in_dwords =
6049       (vs_output_size + 1U + raster_enable * 4U) * 4U;
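   /* e.g. (illustrative): vs_output_size = 8 with rasterization enabled gives
    * (8 + 1 + 4) * 4 = 52 dwords.
    */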
6050    const uint32_t vdm_cam_size =
6051       PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);
6052 
6053    /* This is a proxy for 8XE. */
6054    if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
6055        vdm_cam_size < 96U) {
6056       /* Comparisons are based on size including scratch per vertex vector. */
6057       if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
6058          *cam_size_out = MIN2(31U, vdm_cam_size - 1U);
6059          *vs_max_instances_out = 16U;
6060       } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
6061          *cam_size_out = 15U;
6062          *vs_max_instances_out = 16U;
6063       } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
6064          *cam_size_out = 11U;
6065          *vs_max_instances_out = 12U;
6066       } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
6067          *cam_size_out = 7U;
6068          *vs_max_instances_out = 8U;
6069       } else if (PVR_HAS_FEATURE(dev_info,
6070                                  simple_internal_parameter_format_v2) ||
6071                  uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
6072          *cam_size_out = 7U;
6073          *vs_max_instances_out = 4U;
6074       } else {
6075          *cam_size_out = 3U;
6076          *vs_max_instances_out = 2U;
6077       }
6078    } else {
6079       /* Comparisons are based on size including scratch per vertex vector. */
6080       if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
6081          /* output size <= 27 + 5 scratch. */
6082          *cam_size_out = MIN2(95U, vdm_cam_size - 1U);
6083          *vs_max_instances_out = 0U;
6084       } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) {
6085          /* output size <= 43 + 5 scratch */
6086          *cam_size_out = 63U;
6087          if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6088             *vs_max_instances_out = 16U;
6089          else
6090             *vs_max_instances_out = 0U;
6091       } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) {
6092          /* output size <= 59 + 5 scratch. */
6093          *cam_size_out = 31U;
6094          if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6095             *vs_max_instances_out = 16U;
6096          else
6097             *vs_max_instances_out = 0U;
6098       } else {
6099          *cam_size_out = 15U;
6100          *vs_max_instances_out = 16U;
6101       }
6102    }
6103 }
6104 
6105 static void pvr_emit_dirty_vdm_state(struct pvr_cmd_buffer *const cmd_buffer,
6106                                      struct pvr_sub_cmd_gfx *const sub_cmd)
6107 {
6108    /* FIXME: Assume all state is dirty for the moment. */
6109    struct pvr_device_info *const dev_info =
6110       &cmd_buffer->device->pdevice->dev_info;
6111    ASSERTED const uint32_t max_user_vertex_output_components =
6112       pvr_get_max_user_vertex_output_components(dev_info);
6113    struct PVRX(VDMCTRL_VDM_STATE0)
6114       header = { pvr_cmd_header(VDMCTRL_VDM_STATE0) };
6115    struct vk_dynamic_graphics_state *const dynamic_state =
6116       &cmd_buffer->vk.dynamic_graphics_state;
6117    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6118    const struct pvr_vertex_shader_state *const vertex_shader_state =
6119       &state->gfx_pipeline->shader_state.vertex;
6120    struct pvr_csb *const csb = &sub_cmd->control_stream;
6121    uint32_t vs_output_size;
6122    uint32_t max_instances;
6123    uint32_t cam_size;
6124 
6125    /* CAM Calculations and HW state take vertex size aligned to DWORDS. */
6126    vs_output_size =
6127       DIV_ROUND_UP(vertex_shader_state->vertex_output_size,
6128                    PVRX(VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE));
6129 
6130    assert(vs_output_size <= max_user_vertex_output_components);
6131 
6132    pvr_calculate_vertex_cam_size(dev_info,
6133                                  vs_output_size,
6134                                  true,
6135                                  &cam_size,
6136                                  &max_instances);
6137 
6138    pvr_csb_set_relocation_mark(csb);
6139 
6140    pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
6141       state0.cam_size = cam_size;
6142 
6143       if (dynamic_state->ia.primitive_restart_enable) {
6144          state0.cut_index_enable = true;
6145          state0.cut_index_present = true;
6146       }
6147 
6148       switch (dynamic_state->ia.primitive_topology) {
6149       case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6150          state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_1);
6151          break;
6152 
6153       default:
6154          state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_0);
6155          break;
6156       }
6157 
6158       /* If we've bound a different vertex buffer, or this draw call requires
6159        * a different PDS attrib data-section from the last draw call (changed
6160        * base_instance), then we need to specify a new data section. This is
6161        * also the case if we've switched pipeline or attrib program, as the
6162        * data-section layout will be different.
6163        */
6164       state0.vs_data_addr_present =
6165          state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
6166          state->dirty.draw_base_instance || state->dirty.draw_variant;
6167 
6168       /* Need to specify new PDS Attrib program if we've bound a different
6169        * pipeline or we needed a different PDS Attrib variant for this
6170        * draw-call.
6171        */
6172       state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
6173                                 state->dirty.draw_variant;
6174 
6175       /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
6176        * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
6177        * Vulkan doesn't support stream output and the vertex position is
6178        * always emitted to the UVB.
6179        */
6180       state0.uvs_scratch_size_select =
6181          PVRX(VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE);
6182 
6183       header = state0;
6184    }
6185 
6186    if (header.cut_index_present) {
6187       pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
6188          state1.cut_index =
6189             vk_index_to_restart(state->index_buffer_binding.type);
6190       }
6191    }
6192 
6193    if (header.vs_data_addr_present) {
6194       pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
6195          state2.vs_pds_data_base_addr =
6196             PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
6197       }
6198    }
6199 
6200    if (header.vs_other_present) {
6201       const uint32_t usc_unified_store_size_in_bytes =
6202          vertex_shader_state->vertex_input_size << 2;
6203 
6204       pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
6205          state3.vs_pds_code_base_addr =
6206             PVR_DEV_ADDR(state->pds_shader.code_offset);
6207       }
6208 
6209       pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
6210          state4.vs_output_size = vs_output_size;
6211       }
6212 
6213       pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
6214          state5.vs_max_instances = max_instances;
6215          state5.vs_usc_common_size = 0U;
6216          state5.vs_usc_unified_size = DIV_ROUND_UP(
6217             usc_unified_store_size_in_bytes,
6218             PVRX(VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE));
6219          state5.vs_pds_temp_size =
6220             DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
6221                          PVRX(VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE));
6222          state5.vs_pds_data_size = DIV_ROUND_UP(
6223             PVR_DW_TO_BYTES(state->pds_shader.info->data_size_in_dwords),
6224             PVRX(VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE));
6225       }
6226    }
6227 
6228    pvr_csb_clear_relocation_mark(csb);
6229 }
6230 
6231 static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
6232 {
6233    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6234    struct vk_dynamic_graphics_state *const dynamic_state =
6235       &cmd_buffer->vk.dynamic_graphics_state;
6236    const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
6237    const struct pvr_pipeline_stage_state *const fragment_state =
6238       &gfx_pipeline->shader_state.fragment.stage_state;
6239    const struct pvr_pipeline_stage_state *const vertex_state =
6240       &gfx_pipeline->shader_state.vertex.stage_state;
6241    const struct pvr_pipeline_layout *const pipeline_layout =
6242       gfx_pipeline->base.layout;
6243    struct pvr_sub_cmd_gfx *sub_cmd;
6244    bool fstencil_writemask_zero;
6245    bool bstencil_writemask_zero;
6246    bool fstencil_keep;
6247    bool bstencil_keep;
6248    VkResult result;
6249 
6250    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
6251 
6252    sub_cmd = &state->current_sub_cmd->gfx;
6253    sub_cmd->empty_cmd = false;
6254 
6255    /* Determine pipeline depth/stencil usage. If a pipeline uses depth or
6256     * stencil testing, those attachments are using their loaded values, and
6257     * the loadOps cannot be optimized out.
6258     */
6259    /* Pipeline uses depth testing. */
6260    if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6261        dynamic_state->ds.depth.compare_op != VK_COMPARE_OP_ALWAYS) {
6262       sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6263    }
6264 
6265    /* Pipeline uses stencil testing. */
6266    if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6267        (dynamic_state->ds.stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
6268         dynamic_state->ds.stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) {
6269       sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6270    }
6271 
6272    if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6273                        compute_overlap)) {
6274       uint32_t coefficient_size =
6275          DIV_ROUND_UP(fragment_state->coefficient_size,
6276                       PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
6277 
6278       if (coefficient_size >
6279           PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE))
6280          sub_cmd->disable_compute_overlap = true;
6281    }
6282 
6283    sub_cmd->frag_uses_atomic_ops |= fragment_state->uses_atomic_ops;
6284    sub_cmd->frag_has_side_effects |= fragment_state->has_side_effects;
6285    sub_cmd->frag_uses_texture_rw |= fragment_state->uses_texture_rw;
6286    sub_cmd->vertex_uses_texture_rw |= vertex_state->uses_texture_rw;
6287 
6288    sub_cmd->job.get_vis_results = state->vis_test_enabled;
6289 
6290    fstencil_keep =
6291       (dynamic_state->ds.stencil.front.op.fail == VK_STENCIL_OP_KEEP) &&
6292       (dynamic_state->ds.stencil.front.op.pass == VK_STENCIL_OP_KEEP);
6293    bstencil_keep =
6294       (dynamic_state->ds.stencil.back.op.fail == VK_STENCIL_OP_KEEP) &&
6295       (dynamic_state->ds.stencil.back.op.pass == VK_STENCIL_OP_KEEP);
6296    fstencil_writemask_zero = (dynamic_state->ds.stencil.front.write_mask == 0);
6297    bstencil_writemask_zero = (dynamic_state->ds.stencil.back.write_mask == 0);
6298 
6299    /* Set the stencil modified flag if the front and back-facing stencil ops
6300     * are not both KEEP (fail_op/pass_op) and the front and back write masks
6301     * are not both zero.
6302     */
6303    if (!(fstencil_keep && bstencil_keep) &&
6304        !(fstencil_writemask_zero && bstencil_writemask_zero)) {
6305       sub_cmd->modifies_stencil = true;
6306    }
6307 
6308    /* Set depth modified flag if depth write is enabled. */
6309    if (dynamic_state->ds.depth.write_enable)
6310       sub_cmd->modifies_depth = true;
6311 
6312    /* If either the data or code changes for pds vertex attribs, regenerate the
6313     * data segment.
6314     */
6315    if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
6316        state->dirty.draw_variant || state->dirty.draw_base_instance) {
6317       enum pvr_pds_vertex_attrib_program_type prog_type;
6318       const struct pvr_pds_attrib_program *program;
6319 
6320       if (state->draw_state.draw_indirect)
6321          prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
6322       else if (state->draw_state.base_instance)
6323          prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
6324       else
6325          prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;
6326 
6327       program =
6328          &gfx_pipeline->shader_state.vertex.pds_attrib_programs[prog_type];
6329       state->pds_shader.info = &program->info;
6330       state->pds_shader.code_offset = program->program.code_offset;
6331 
6332       state->max_shared_regs =
6333          MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));
6334 
6335       pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
6336    }
6337 
6338    if (state->push_constants.dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS) {
6339       result = pvr_cmd_upload_push_consts(cmd_buffer);
6340       if (result != VK_SUCCESS)
6341          return result;
6342    }
6343 
6344    state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding;
6345    state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;
6346 
6347    /* Account for dirty descriptor set. */
6348    state->dirty.vertex_descriptors |=
6349       state->dirty.gfx_desc_dirty &&
6350       pipeline_layout
6351          ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
6352    state->dirty.fragment_descriptors |=
6353       state->dirty.gfx_desc_dirty &&
6354       pipeline_layout->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_FRAGMENT];
6355 
6356    if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
6357       state->dirty.fragment_descriptors = true;
6358 
6359    state->dirty.vertex_descriptors |=
6360       state->push_constants.dirty_stages &
6361       (VK_SHADER_STAGE_ALL_GRAPHICS & ~VK_SHADER_STAGE_FRAGMENT_BIT);
6362    state->dirty.fragment_descriptors |= state->push_constants.dirty_stages &
6363                                         VK_SHADER_STAGE_FRAGMENT_BIT;
6364 
6365    if (state->dirty.fragment_descriptors) {
6366       result = pvr_setup_descriptor_mappings(
6367          cmd_buffer,
6368          PVR_STAGE_ALLOCATION_FRAGMENT,
6369          &state->gfx_pipeline->shader_state.fragment.descriptor_state,
6370          NULL,
6371          &state->pds_fragment_descriptor_data_offset);
6372       if (result != VK_SUCCESS) {
6373          mesa_loge("Could not setup fragment descriptor mappings.");
6374          return result;
6375       }
6376    }
6377 
6378    if (state->dirty.vertex_descriptors) {
6379       uint32_t pds_vertex_descriptor_data_offset;
6380 
6381       result = pvr_setup_descriptor_mappings(
6382          cmd_buffer,
6383          PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
6384          &state->gfx_pipeline->shader_state.vertex.descriptor_state,
6385          NULL,
6386          &pds_vertex_descriptor_data_offset);
6387       if (result != VK_SUCCESS) {
6388          mesa_loge("Could not setup vertex descriptor mappings.");
6389          return result;
6390       }
6391 
6392       pvr_emit_dirty_pds_state(cmd_buffer,
6393                                sub_cmd,
6394                                pds_vertex_descriptor_data_offset);
6395    }
6396 
6397    pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
6398    pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);
6399 
6400    vk_dynamic_graphics_state_clear_dirty(dynamic_state);
6401    state->dirty.gfx_desc_dirty = false;
6402    state->dirty.draw_base_instance = false;
6403    state->dirty.draw_variant = false;
6404    state->dirty.fragment_descriptors = false;
6405    state->dirty.gfx_pipeline_binding = false;
6406    state->dirty.isp_userpass = false;
6407    state->dirty.vertex_bindings = false;
6408    state->dirty.vis_test = false;
6409 
6410    state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
6411 
6412    return VK_SUCCESS;
6413 }
6414 
6415 static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
6416 {
6417    switch (topology) {
6418    case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
6419       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST);
6420    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
6421       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST);
6422    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
6423       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP);
6424    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
6425       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
6426    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
6427       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP);
6428    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6429       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN);
6430    case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
6431       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ);
6432    case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
6433       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ);
6434    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
6435       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ);
6436    case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
6437       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ);
6438    case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
6439       return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST);
6440    default:
6441       unreachable("Undefined primitive topology");
6442    }
6443 }
6444 
6445 /* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */
6446 /* Aligned to 128 bit for PDS loads / stores */
6447 #define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8
6448 
6449 static VkResult
6450 pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer,
6451                                    struct pvr_csb *const csb,
6452                                    pvr_dev_addr_t idx_buffer_addr,
6453                                    uint32_t idx_stride,
6454                                    struct PVRX(VDMCTRL_INDEX_LIST0) * list_hdr,
6455                                    struct pvr_buffer *buffer,
6456                                    VkDeviceSize offset,
6457                                    uint32_t count,
6458                                    uint32_t stride)
6459 {
6460    struct pvr_pds_drawindirect_program pds_prog = { 0 };
6461    uint32_t word0;
6462 
6463    /* Draw indirect always has index offset and instance count. */
6464    list_hdr->index_offset_present = true;
6465    list_hdr->index_instance_count_present = true;
6466 
6467    pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr);
6468 
6469    pds_prog.support_base_instance = true;
6470    pds_prog.arg_buffer = buffer->dev_addr.addr + offset;
6471    pds_prog.index_buffer = idx_buffer_addr.addr;
6472    pds_prog.index_block_header = word0;
6473    pds_prog.index_stride = idx_stride;
6474    pds_prog.num_views = 1U;
6475 
6476    /* TODO: See if we can pre-upload the code section of all the pds programs
6477     * and reuse them here.
6478     */
6479    /* Generate and upload the PDS programs (code + data). */
6480    for (uint32_t i = 0U; i < count; i++) {
6481       const struct pvr_device_info *dev_info =
6482          &cmd_buffer->device->pdevice->dev_info;
6483       struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6484       struct pvr_suballoc_bo *dummy_bo;
6485       struct pvr_suballoc_bo *pds_bo;
6486       uint32_t *dummy_stream;
6487       uint32_t *pds_base;
6488       uint32_t pds_size;
6489       VkResult result;
6490 
6491       /* TODO: Move this outside the loop and allocate all of them in one go? */
6492       result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6493                                         cmd_buffer->device->heaps.general_heap,
6494                                         DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE,
6495                                         &dummy_bo);
6496       if (result != VK_SUCCESS)
6497          return result;
6498 
6499       pds_prog.increment_draw_id = (i != 0);
6500       pds_prog.index_list_addr_buffer = dummy_bo->dev_addr.addr;
6501 
6502       if (state->draw_state.draw_indexed) {
6503          pvr_pds_generate_draw_elements_indirect(&pds_prog,
6504                                                  0,
6505                                                  PDS_GENERATE_SIZES,
6506                                                  dev_info);
6507       } else {
6508          pvr_pds_generate_draw_arrays_indirect(&pds_prog,
6509                                                0,
6510                                                PDS_GENERATE_SIZES,
6511                                                dev_info);
6512       }
6513 
6514       pds_size = PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned +
6515                                  pds_prog.program.code_size_aligned);
6516 
6517       result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6518                                         cmd_buffer->device->heaps.pds_heap,
6519                                         pds_size,
6520                                         &pds_bo);
6521       if (result != VK_SUCCESS)
6522          return result;
6523 
6524       pds_base = pvr_bo_suballoc_get_map_addr(pds_bo);
6525       memcpy(pds_base,
6526              pds_prog.program.code,
6527              PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned));
6528 
6529       if (state->draw_state.draw_indexed) {
6530          pvr_pds_generate_draw_elements_indirect(
6531             &pds_prog,
6532             pds_base + pds_prog.program.code_size_aligned,
6533             PDS_GENERATE_DATA_SEGMENT,
6534             dev_info);
6535       } else {
6536          pvr_pds_generate_draw_arrays_indirect(
6537             &pds_prog,
6538             pds_base + pds_prog.program.code_size_aligned,
6539             PDS_GENERATE_DATA_SEGMENT,
6540             dev_info);
6541       }
6542 
6543       pvr_csb_set_relocation_mark(csb);
6544 
6545       pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
6546          state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ANY);
6547 
6548          state0.pds_temp_size =
6549             DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.temp_size_aligned),
6550                          PVRX(VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE));
6551 
6552          state0.pds_data_size =
6553             DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned),
6554                          PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
6555       }
6556 
6557       pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
6558          const uint32_t data_offset =
6559             pds_bo->dev_addr.addr +
6560             PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned) -
6561             cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6562 
6563          state1.pds_data_addr = PVR_DEV_ADDR(data_offset);
6564          state1.sd_type = PVRX(VDMCTRL_SD_TYPE_PDS);
6565          state1.sd_next_type = PVRX(VDMCTRL_SD_TYPE_NONE);
6566       }
6567 
6568       pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
6569          const uint32_t code_offset =
6570             pds_bo->dev_addr.addr -
6571             cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6572 
6573          state2.pds_code_addr = PVR_DEV_ADDR(code_offset);
6574       }
6575 
6576       pvr_csb_clear_relocation_mark(csb);
6577 
6578       /* We don't really need to set the relocation mark since the following
6579        * state update is just one emit but let's be nice and use it.
6580        */
6581       pvr_csb_set_relocation_mark(csb);
6582 
6583       /* Sync task to ensure the VDM doesn't start reading the dummy blocks
6584        * before they are ready.
6585        */
6586       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6587          list0.primitive_topology = PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
6588       }
6589 
6590       pvr_csb_clear_relocation_mark(csb);
6591 
6592       dummy_stream = pvr_bo_suballoc_get_map_addr(dummy_bo);
6593 
6594       /* For non-indexed draw cmds fill in the dummy block's header here (it
6595        * won't change based on the indirect args). In both cases advance past
6596        * the in-use size of each dummy block.
6597        */
6598       if (!state->draw_state.draw_indexed) {
6599          dummy_stream[0] = word0;
6600          dummy_stream += 4;
6601       } else {
6602          dummy_stream += 5;
6603       }
6604 
6605       /* clang-format off */
6606       pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word);
6607       /* clang-format on */
6608 
6609       pvr_csb_set_relocation_mark(csb);
6610 
6611       /* Stream link to the first dummy which forces the VDM to discard any
6612        * prefetched (dummy) control stream.
6613        */
6614       pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) {
6615          link.with_return = true;
6616          link.link_addrmsb = dummy_bo->dev_addr;
6617       }
6618 
6619       pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) {
6620          link.link_addrlsb = dummy_bo->dev_addr;
6621       }
6622 
6623       pvr_csb_clear_relocation_mark(csb);
6624 
6625       /* Point the pds program to the next argument buffer; a fresh VDM
6626        * dummy buffer is allocated at the start of the next iteration.
6627        */
6628       pds_prog.arg_buffer += stride;
6629    }
6630 
6631    return VK_SUCCESS;
6632 }
6633 
6634 #undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE
6635 
6636 static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
6637                                     struct pvr_sub_cmd_gfx *const sub_cmd,
6638                                     VkPrimitiveTopology topology,
6639                                     uint32_t index_offset,
6640                                     uint32_t first_index,
6641                                     uint32_t index_count,
6642                                     uint32_t instance_count,
6643                                     struct pvr_buffer *buffer,
6644                                     VkDeviceSize offset,
6645                                     uint32_t count,
6646                                     uint32_t stride)
6647 {
6648    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6649    const bool vertex_shader_has_side_effects =
6650       state->gfx_pipeline->shader_state.vertex.stage_state.has_side_effects;
6651    struct PVRX(VDMCTRL_INDEX_LIST0)
6652       list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) };
6653    pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
6654    struct pvr_csb *const csb = &sub_cmd->control_stream;
6655    unsigned int index_stride = 0;
6656 
6657    list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology);
6658 
6659    /* firstInstance is not handled here in the VDM state; it's implemented as
6660     * an addition in the PDS vertex fetch using the
6661     * PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type.
6662     */
6663 
6664    list_hdr.index_count_present = true;
6665 
6666    if (instance_count > 1)
6667       list_hdr.index_instance_count_present = true;
6668 
6669    if (index_offset)
6670       list_hdr.index_offset_present = true;
6671 
6672    if (state->draw_state.draw_indexed) {
6673       list_hdr.index_size =
6674          pvr_vdmctrl_index_size_from_type(state->index_buffer_binding.type);
6675       index_stride = vk_index_type_to_bytes(state->index_buffer_binding.type);
6676 
6677       index_buffer_addr = PVR_DEV_ADDR_OFFSET(
6678          state->index_buffer_binding.buffer->dev_addr,
6679          state->index_buffer_binding.offset + first_index * index_stride);
6680 
6681       list_hdr.index_addr_present = true;
6682       list_hdr.index_base_addrmsb = index_buffer_addr;
6683    }
6684 
6685    list_hdr.degen_cull_enable =
6686       PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6687                       vdm_degenerate_culling) &&
6688       !vertex_shader_has_side_effects;
6689 
6690    if (state->draw_state.draw_indirect) {
6691       assert(buffer);
6692       pvr_write_draw_indirect_vdm_stream(cmd_buffer,
6693                                          csb,
6694                                          index_buffer_addr,
6695                                          index_stride,
6696                                          &list_hdr,
6697                                          buffer,
6698                                          offset,
6699                                          count,
6700                                          stride);
6701       return;
6702    }
6703 
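   /* Emit VDMCTRL_INDEX_LIST0 followed only by the optional INDEX_LIST1..4
    * words whose corresponding *_present flags were set in the header above.
    */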
6704    pvr_csb_set_relocation_mark(csb);
6705 
6706    pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6707       list0 = list_hdr;
6708    }
6709 
6710    if (list_hdr.index_addr_present) {
6711       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
6712          list1.index_base_addrlsb = index_buffer_addr;
6713       }
6714    }
6715 
6716    if (list_hdr.index_count_present) {
6717       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
6718          list2.index_count = index_count;
6719       }
6720    }
6721 
6722    if (list_hdr.index_instance_count_present) {
6723       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
6724          list3.instance_count = instance_count - 1;
6725       }
6726    }
6727 
6728    if (list_hdr.index_offset_present) {
6729       pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
6730          list4.index_offset = index_offset;
6731       }
6732    }
6733 
6734    pvr_csb_clear_relocation_mark(csb);
6735 }
6736 
6737 void pvr_CmdDraw(VkCommandBuffer commandBuffer,
6738                  uint32_t vertexCount,
6739                  uint32_t instanceCount,
6740                  uint32_t firstVertex,
6741                  uint32_t firstInstance)
6742 {
6743    const struct pvr_cmd_buffer_draw_state draw_state = {
6744       .base_vertex = firstVertex,
6745       .base_instance = firstInstance,
6746    };
6747 
6748    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6749    struct vk_dynamic_graphics_state *const dynamic_state =
6750       &cmd_buffer->vk.dynamic_graphics_state;
6751    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6752    VkResult result;
6753 
6754    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6755 
6756    pvr_update_draw_state(state, &draw_state);
6757 
6758    result = pvr_validate_draw_state(cmd_buffer);
6759    if (result != VK_SUCCESS)
6760       return;
6761 
6762    /* Write the VDM control stream for the primitive. */
6763    pvr_emit_vdm_index_list(cmd_buffer,
6764                            &state->current_sub_cmd->gfx,
6765                            dynamic_state->ia.primitive_topology,
6766                            firstVertex,
6767                            0U,
6768                            vertexCount,
6769                            instanceCount,
6770                            NULL,
6771                            0U,
6772                            0U,
6773                            0U);
6774 }
6775 
6776 void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
6777                         uint32_t indexCount,
6778                         uint32_t instanceCount,
6779                         uint32_t firstIndex,
6780                         int32_t vertexOffset,
6781                         uint32_t firstInstance)
6782 {
6783    const struct pvr_cmd_buffer_draw_state draw_state = {
6784       .base_vertex = vertexOffset,
6785       .base_instance = firstInstance,
6786       .draw_indexed = true,
6787    };
6788 
6789    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6790    struct vk_dynamic_graphics_state *const dynamic_state =
6791       &cmd_buffer->vk.dynamic_graphics_state;
6792    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6793    VkResult result;
6794 
6795    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6796 
6797    pvr_update_draw_state(state, &draw_state);
6798 
6799    result = pvr_validate_draw_state(cmd_buffer);
6800    if (result != VK_SUCCESS)
6801       return;
6802 
6803    /* Write the VDM control stream for the primitive. */
6804    pvr_emit_vdm_index_list(cmd_buffer,
6805                            &state->current_sub_cmd->gfx,
6806                            dynamic_state->ia.primitive_topology,
6807                            vertexOffset,
6808                            firstIndex,
6809                            indexCount,
6810                            instanceCount,
6811                            NULL,
6812                            0U,
6813                            0U,
6814                            0U);
6815 }
6816 
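/* The indirect entry points below hand the application's buffer straight to
 * pvr_write_draw_indirect_vdm_stream() as the PDS arg_buffer. Per the Vulkan
 * spec the buffer contains, at `offset` and then every `stride` bytes, one of
 * the following records, which the per-draw PDS program consumes on the GPU:
 *
 *    typedef struct VkDrawIndirectCommand {
 *       uint32_t vertexCount;
 *       uint32_t instanceCount;
 *       uint32_t firstVertex;
 *       uint32_t firstInstance;
 *    } VkDrawIndirectCommand;
 *
 *    typedef struct VkDrawIndexedIndirectCommand {
 *       uint32_t indexCount;
 *       uint32_t instanceCount;
 *       uint32_t firstIndex;
 *       int32_t  vertexOffset;
 *       uint32_t firstInstance;
 *    } VkDrawIndexedIndirectCommand;
 */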
6817 void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
6818                                 VkBuffer _buffer,
6819                                 VkDeviceSize offset,
6820                                 uint32_t drawCount,
6821                                 uint32_t stride)
6822 {
6823    const struct pvr_cmd_buffer_draw_state draw_state = {
6824       .draw_indirect = true,
6825       .draw_indexed = true,
6826    };
6827 
6828    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6829    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6830    struct vk_dynamic_graphics_state *const dynamic_state =
6831       &cmd_buffer->vk.dynamic_graphics_state;
6832    PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6833    VkResult result;
6834 
6835    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6836 
6837    pvr_update_draw_state(state, &draw_state);
6838 
6839    result = pvr_validate_draw_state(cmd_buffer);
6840    if (result != VK_SUCCESS)
6841       return;
6842 
6843    /* Write the VDM control stream for the primitive. */
6844    pvr_emit_vdm_index_list(cmd_buffer,
6845                            &state->current_sub_cmd->gfx,
6846                            dynamic_state->ia.primitive_topology,
6847                            0U,
6848                            0U,
6849                            0U,
6850                            0U,
6851                            buffer,
6852                            offset,
6853                            drawCount,
6854                            stride);
6855 }
6856 
6857 void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
6858                          VkBuffer _buffer,
6859                          VkDeviceSize offset,
6860                          uint32_t drawCount,
6861                          uint32_t stride)
6862 {
6863    const struct pvr_cmd_buffer_draw_state draw_state = {
6864       .draw_indirect = true,
6865    };
6866 
6867    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6868    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6869    PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6870    struct vk_dynamic_graphics_state *const dynamic_state =
6871       &cmd_buffer->vk.dynamic_graphics_state;
6872    VkResult result;
6873 
6874    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6875 
6876    pvr_update_draw_state(state, &draw_state);
6877 
6878    result = pvr_validate_draw_state(cmd_buffer);
6879    if (result != VK_SUCCESS)
6880       return;
6881 
6882    /* Write the VDM control stream for the primitive. */
6883    pvr_emit_vdm_index_list(cmd_buffer,
6884                            &state->current_sub_cmd->gfx,
6885                            dynamic_state->ia.primitive_topology,
6886                            0U,
6887                            0U,
6888                            0U,
6889                            0U,
6890                            buffer,
6891                            offset,
6892                            drawCount,
6893                            stride);
6894 }
6895 
6896 static VkResult
6897 pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer,
6898                                           struct pvr_render_pass_info *info)
6899 {
6900    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6901    const struct pvr_renderpass_hwsetup_render *hw_render =
6902       &state->render_pass_info.pass->hw_setup->renders[info->current_hw_subpass];
6903 
6904    for (uint32_t i = 0U; i < hw_render->eot_surface_count; i++) {
6905       const struct pvr_renderpass_hwsetup_eot_surface *surface =
6906          &hw_render->eot_surfaces[i];
6907       const uint32_t color_attach_idx = surface->src_attachment_idx;
6908       const uint32_t resolve_attach_idx = surface->attachment_idx;
6909       VkImageSubresourceLayers src_subresource;
6910       VkImageSubresourceLayers dst_subresource;
6911       struct pvr_image_view *dst_view;
6912       struct pvr_image_view *src_view;
6913       VkFormat src_format;
6914       VkFormat dst_format;
6915       VkImageCopy2 region;
6916       VkResult result;
6917 
6918       if (!surface->need_resolve ||
6919           surface->resolve_type != PVR_RESOLVE_TYPE_TRANSFER)
6920          continue;
6921 
6922       dst_view = info->attachments[resolve_attach_idx];
6923       src_view = info->attachments[color_attach_idx];
6924 
6925       src_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6926       src_subresource.mipLevel = src_view->vk.base_mip_level;
6927       src_subresource.baseArrayLayer = src_view->vk.base_array_layer;
6928       src_subresource.layerCount = src_view->vk.layer_count;
6929 
6930       dst_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6931       dst_subresource.mipLevel = dst_view->vk.base_mip_level;
6932       dst_subresource.baseArrayLayer = dst_view->vk.base_array_layer;
6933       dst_subresource.layerCount = dst_view->vk.layer_count;
6934 
6935       region.srcOffset = (VkOffset3D){ info->render_area.offset.x,
6936                                        info->render_area.offset.y,
6937                                        0 };
6938       region.dstOffset = (VkOffset3D){ info->render_area.offset.x,
6939                                        info->render_area.offset.y,
6940                                        0 };
6941       region.extent = (VkExtent3D){ info->render_area.extent.width,
6942                                     info->render_area.extent.height,
6943                                     1 };
6944 
6945       region.srcSubresource = src_subresource;
6946       region.dstSubresource = dst_subresource;
6947 
6948       /* TODO: If ERN_46863 is supported, depth and stencil are sampled
6949        * separately from images with combined depth+stencil formats. Add logic
6950        * here to handle that using the appropriate format from the image view.
6951        */
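      /* Temporarily swap each image's format for its view's format so the
       * copy/resolve below operates on the view format, then restore the
       * original formats afterwards.
       */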
6952       src_format = src_view->vk.image->format;
6953       dst_format = dst_view->vk.image->format;
6954       src_view->vk.image->format = src_view->vk.format;
6955       dst_view->vk.image->format = dst_view->vk.format;
6956 
6957       result = pvr_copy_or_resolve_color_image_region(
6958          cmd_buffer,
6959          vk_to_pvr_image(src_view->vk.image),
6960          vk_to_pvr_image(dst_view->vk.image),
6961          &region);
6962 
6963       src_view->vk.image->format = src_format;
6964       dst_view->vk.image->format = dst_format;
6965 
6966       state->current_sub_cmd->transfer.serialize_with_frag = true;
6967 
6968       if (result != VK_SUCCESS)
6969          return result;
6970    }
6971 
6972    return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
6973 }
6974 
6975 void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
6976                            const VkSubpassEndInfo *pSubpassEndInfo)
6977 {
6978    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6979    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6980    struct pvr_image_view **attachments;
6981    VkClearValue *clear_values;
6982    VkResult result;
6983 
6984    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6985 
6986    assert(state->render_pass_info.pass);
6987    assert(state->render_pass_info.framebuffer);
6988 
6989    result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
6990    if (result != VK_SUCCESS)
6991       return;
6992 
6993    result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer,
6994                                                       &state->render_pass_info);
6995    if (result != VK_SUCCESS)
6996       return;
6997 
6998    /* Save the required fields before clearing render_pass_info struct. */
6999    attachments = state->render_pass_info.attachments;
7000    clear_values = state->render_pass_info.clear_values;
7001 
7002    memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));
7003 
7004    state->render_pass_info.attachments = attachments;
7005    state->render_pass_info.clear_values = clear_values;
7006 }
7007 
7008 static VkResult
7009 pvr_execute_deferred_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7010                                 const struct pvr_cmd_buffer *sec_cmd_buffer)
7011 {
7012    struct vk_dynamic_graphics_state *const dynamic_state =
7013       &cmd_buffer->vk.dynamic_graphics_state;
7014    const uint32_t prim_db_elems =
7015       util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
7016                                  struct pvr_depth_bias_state);
7017    const uint32_t prim_scissor_elems =
7018       util_dynarray_num_elements(&cmd_buffer->scissor_array,
7019                                  struct pvr_scissor_words);
7020 
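   /* Replay the secondary's deferred depth-bias/scissor (DBSC) commands,
    * rebasing their indices onto whatever this primary command buffer has
    * already accumulated in its own depth_bias_array/scissor_array
    * (prim_db_elems/prim_scissor_elems above).
    */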
7021    util_dynarray_foreach (&sec_cmd_buffer->deferred_csb_commands,
7022                           struct pvr_deferred_cs_command,
7023                           cmd) {
7024       switch (cmd->type) {
7025       case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC: {
7026          const uint32_t scissor_idx =
7027             prim_scissor_elems + cmd->dbsc.state.scissor_index;
7028          const uint32_t db_idx =
7029             prim_db_elems + cmd->dbsc.state.depthbias_index;
7030          const uint32_t num_dwords =
7031             pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPDBSC);
7032          struct pvr_suballoc_bo *suballoc_bo;
7033          uint32_t ppp_state[num_dwords];
7034          VkResult result;
7035 
7036          pvr_csb_pack (&ppp_state[0], TA_STATE_HEADER, header) {
7037             header.pres_ispctl_dbsc = true;
7038          }
7039 
7040          pvr_csb_pack (&ppp_state[1], TA_STATE_ISPDBSC, ispdbsc) {
7041             ispdbsc.dbindex = db_idx;
7042             ispdbsc.scindex = scissor_idx;
7043          }
7044 
7045          result = pvr_cmd_buffer_upload_general(cmd_buffer,
7046                                                 &ppp_state[0],
7047                                                 sizeof(ppp_state),
7048                                                 &suballoc_bo);
7049          if (result != VK_SUCCESS)
7050             return result;
7051 
7052          pvr_csb_pack (&cmd->dbsc.vdm_state[0], VDMCTRL_PPP_STATE0, state) {
7053             state.word_count = num_dwords;
7054             state.addrmsb = suballoc_bo->dev_addr;
7055          }
7056 
7057          pvr_csb_pack (&cmd->dbsc.vdm_state[1], VDMCTRL_PPP_STATE1, state) {
7058             state.addrlsb = suballoc_bo->dev_addr;
7059          }
7060 
7061          break;
7062       }
7063 
7064       case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2: {
7065          const uint32_t scissor_idx =
7066             prim_scissor_elems + cmd->dbsc2.state.scissor_index;
7067          const uint32_t db_idx =
7068             prim_db_elems + cmd->dbsc2.state.depthbias_index;
7069 
7070          uint32_t *const addr =
7071             (uint32_t *)pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo) +
7072             cmd->dbsc2.patch_offset;
7073 
7074          assert(pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo));
7075 
7076          pvr_csb_pack (addr, TA_STATE_ISPDBSC, ispdbsc) {
7077             ispdbsc.dbindex = db_idx;
7078             ispdbsc.scindex = scissor_idx;
7079          }
7080 
7081          break;
7082       }
7083 
7084       default:
7085          unreachable("Invalid deferred control stream command type.");
7086          break;
7087       }
7088    }
7089 
7090    util_dynarray_append_dynarray(&cmd_buffer->depth_bias_array,
7091                                  &sec_cmd_buffer->depth_bias_array);
7092 
7093    util_dynarray_append_dynarray(&cmd_buffer->scissor_array,
7094                                  &sec_cmd_buffer->scissor_array);
7095 
7096    BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
7097    cmd_buffer->scissor_words = (struct pvr_scissor_words){ 0 };
7098 
7099    return VK_SUCCESS;
7100 }
7101 
7102 /* Caller needs to make sure that it ends the current sub_cmd. This function
7103  * only creates a copy of sec_sub_cmd and links it to the cmd_buffer's
7104  * sub_cmd list.
7105  */
7106 static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
7107                                     struct pvr_sub_cmd *sec_sub_cmd)
7108 {
7109    struct pvr_sub_cmd *primary_sub_cmd =
7110       vk_zalloc(&cmd_buffer->vk.pool->alloc,
7111                 sizeof(*primary_sub_cmd),
7112                 8,
7113                 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7114    if (!primary_sub_cmd) {
7115       return vk_command_buffer_set_error(&cmd_buffer->vk,
7116                                          VK_ERROR_OUT_OF_HOST_MEMORY);
7117    }
7118 
7119    primary_sub_cmd->type = sec_sub_cmd->type;
7120    primary_sub_cmd->owned = false;
7121 
7122    list_addtail(&primary_sub_cmd->link, &cmd_buffer->sub_cmds);
7123 
7124    switch (sec_sub_cmd->type) {
7125    case PVR_SUB_CMD_TYPE_GRAPHICS:
7126       primary_sub_cmd->gfx = sec_sub_cmd->gfx;
7127       break;
7128 
7129    case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
7130    case PVR_SUB_CMD_TYPE_COMPUTE:
7131       primary_sub_cmd->compute = sec_sub_cmd->compute;
7132       break;
7133 
7134    case PVR_SUB_CMD_TYPE_TRANSFER:
7135       primary_sub_cmd->transfer = sec_sub_cmd->transfer;
7136       break;
7137 
7138    case PVR_SUB_CMD_TYPE_EVENT:
7139       primary_sub_cmd->event = sec_sub_cmd->event;
7140       break;
7141 
7142    default:
7143       unreachable("Unsupported sub-command type");
7144    }
7145 
7146    return VK_SUCCESS;
7147 }
7148 
7149 static VkResult
7150 pvr_execute_graphics_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7151                                 const struct pvr_cmd_buffer *sec_cmd_buffer)
7152 {
7153    const struct pvr_device_info *dev_info =
7154       &cmd_buffer->device->pdevice->dev_info;
7155    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7156    struct pvr_sub_cmd *primary_sub_cmd = state->current_sub_cmd;
7157    struct pvr_sub_cmd *first_sec_cmd;
7158    VkResult result;
7159 
7160    /* Inherited queries are not supported. */
7161    assert(!state->vis_test_enabled);
7162 
7163    if (list_is_empty(&sec_cmd_buffer->sub_cmds))
7164       return VK_SUCCESS;
7165 
7166    first_sec_cmd =
7167       list_first_entry(&sec_cmd_buffer->sub_cmds, struct pvr_sub_cmd, link);
7168 
7169    /* Kick a render if we have a new query pool (new base address). */
7170    if (primary_sub_cmd->gfx.query_pool && first_sec_cmd->gfx.query_pool &&
7171        primary_sub_cmd->gfx.query_pool != first_sec_cmd->gfx.query_pool) {
7172       state->current_sub_cmd->gfx.barrier_store = true;
7173 
7174       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7175       if (result != VK_SUCCESS)
7176          return result;
7177 
7178       result =
7179          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7180       if (result != VK_SUCCESS)
7181          return result;
7182 
7183       primary_sub_cmd = state->current_sub_cmd;
7184 
7185       /* Use existing render setup, but load color attachments from HW
7186        * Background object.
7187        */
7188       primary_sub_cmd->gfx.barrier_load = true;
7189       primary_sub_cmd->gfx.barrier_store = false;
7190    }
7191 
7192    list_for_each_entry (struct pvr_sub_cmd,
7193                         sec_sub_cmd,
7194                         &sec_cmd_buffer->sub_cmds,
7195                         link) {
7196       /* Only graphics secondary execution supported within a renderpass. */
7197       assert(sec_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7198 
7199       if (!sec_sub_cmd->gfx.empty_cmd)
7200          primary_sub_cmd->gfx.empty_cmd = false;
7201 
7202       if (sec_sub_cmd->gfx.query_pool) {
7203          primary_sub_cmd->gfx.query_pool = sec_sub_cmd->gfx.query_pool;
7204 
7205          util_dynarray_append_dynarray(&state->query_indices,
7206                                        &sec_sub_cmd->gfx.sec_query_indices);
7207       }
7208 
7209       if (pvr_cmd_uses_deferred_cs_cmds(sec_cmd_buffer)) {
7210          /* TODO: If the secondary buffer was created with
7211           * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, we patch the stream
7212           * and copy it to the primary stream using pvr_csb_copy below. This
7213           * will need locking if the same secondary command buffer is executed
7214           * in multiple primary buffers at the same time.
7215           */
7216          result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7217          if (result != VK_SUCCESS)
7218             return result;
7219 
7220          result = pvr_csb_copy(&primary_sub_cmd->gfx.control_stream,
7221                                &sec_sub_cmd->gfx.control_stream);
7222          if (result != VK_SUCCESS)
7223             return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
7224       } else {
7225          result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7226          if (result != VK_SUCCESS)
7227             return result;
7228 
7229          pvr_csb_emit_link(
7230             &primary_sub_cmd->gfx.control_stream,
7231             pvr_csb_get_start_address(&sec_sub_cmd->gfx.control_stream),
7232             true);
7233       }
7234 
7235       if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
7236                           compute_overlap)) {
7237          primary_sub_cmd->gfx.job.disable_compute_overlap |=
7238             sec_sub_cmd->gfx.job.disable_compute_overlap;
7239       }
7240 
7241       primary_sub_cmd->gfx.max_tiles_in_flight =
7242          MIN2(primary_sub_cmd->gfx.max_tiles_in_flight,
7243               sec_sub_cmd->gfx.max_tiles_in_flight);
7244 
7245       /* Pass loaded depth/stencil usage from secondary command buffer. */
7246       if (sec_sub_cmd->gfx.depth_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7247          primary_sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7248 
7249       if (sec_sub_cmd->gfx.stencil_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7250          primary_sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7251 
7252       /* Pass depth/stencil modification state from secondary command buffer. */
7253       if (sec_sub_cmd->gfx.modifies_depth)
7254          primary_sub_cmd->gfx.modifies_depth = true;
7255 
7256       if (sec_sub_cmd->gfx.modifies_stencil)
7257          primary_sub_cmd->gfx.modifies_stencil = true;
7258 
7259       if (sec_sub_cmd->gfx.barrier_store) {
7260          struct pvr_sub_cmd *sec_next =
7261             list_entry(sec_sub_cmd->link.next, struct pvr_sub_cmd, link);
7262 
7263          /* This shouldn't be the last sub cmd. There should be a barrier load
7264           * subsequent to the barrier store.
7265           */
7266          assert(list_last_entry(&sec_cmd_buffer->sub_cmds,
7267                                 struct pvr_sub_cmd,
7268                                 link) != sec_sub_cmd);
7269 
7270          /* Kick render to store stencil. */
7271          state->current_sub_cmd->gfx.barrier_store = true;
7272          state->current_sub_cmd->gfx.empty_cmd = false;
7273 
7274          result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7275          if (result != VK_SUCCESS)
7276             return result;
7277 
7278          result =
7279             pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7280          if (result != VK_SUCCESS)
7281             return result;
7282 
7283          primary_sub_cmd = state->current_sub_cmd;
7284 
7285          /* Use existing render setup, but load color attachments from HW
7286           * Background object.
7287           */
7288          primary_sub_cmd->gfx.barrier_load = sec_next->gfx.barrier_load;
7289          primary_sub_cmd->gfx.barrier_store = sec_next->gfx.barrier_store;
7290          primary_sub_cmd->gfx.empty_cmd = false;
7291       }
7292 
7293       if (!PVR_HAS_FEATURE(dev_info, gs_rta_support)) {
7294          util_dynarray_append_dynarray(&cmd_buffer->deferred_clears,
7295                                        &sec_cmd_buffer->deferred_clears);
7296       }
7297    }
7298 
7299    return VK_SUCCESS;
7300 }
7301 
7302 void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer,
7303                             uint32_t commandBufferCount,
7304                             const VkCommandBuffer *pCommandBuffers)
7305 {
7306    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7307    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7308    struct pvr_cmd_buffer *last_cmd_buffer;
7309    VkResult result;
7310 
7311    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7312 
7313    assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7314 
7315    /* Reset the CPU copy of the most recent PPP state of the primary command
7316     * buffer.
7317     *
7318     * The next draw call in the primary after CmdExecuteCommands may send
7319     * redundant state, if it all goes in the same geom job.
7320     *
7321     * Can't just copy state from the secondary because the recording state of
7322     * the secondary command buffers would have been deleted at this point.
7323     */
7324    pvr_reset_graphics_dirty_state(cmd_buffer, false);
7325 
7326    if (state->current_sub_cmd &&
7327        state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
7328       for (uint32_t i = 0; i < commandBufferCount; i++) {
7329          PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7330 
7331          assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7332 
7333          result = pvr_execute_graphics_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7334          if (result != VK_SUCCESS)
7335             return;
7336       }
7337 
7338       last_cmd_buffer =
7339          pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7340 
7341       /* Set barriers from the final secondary command buffer. */
7342       for (uint32_t i = 0; i != PVR_NUM_SYNC_PIPELINE_STAGES; i++) {
7343          state->barriers_needed[i] |=
7344             last_cmd_buffer->state.barriers_needed[i] &
7345             PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS;
7346       }
7347    } else {
7348       for (uint32_t i = 0; i < commandBufferCount; i++) {
7349          PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7350 
7351          assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7352 
7353          result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7354          if (result != VK_SUCCESS)
7355             return;
7356 
7357          list_for_each_entry_safe (struct pvr_sub_cmd,
7358                                    sec_sub_cmd,
7359                                    &sec_cmd_buffer->sub_cmds,
7360                                    link) {
7361             result = pvr_execute_sub_cmd(cmd_buffer, sec_sub_cmd);
7362             if (result != VK_SUCCESS)
7363                return;
7364          }
7365       }
7366 
7367       last_cmd_buffer =
7368          pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7369 
7370       memcpy(state->barriers_needed,
7371              last_cmd_buffer->state.barriers_needed,
7372              sizeof(state->barriers_needed));
7373    }
7374 }
7375 
7376 static void pvr_insert_transparent_obj(struct pvr_cmd_buffer *const cmd_buffer,
7377                                        struct pvr_sub_cmd_gfx *const sub_cmd)
7378 {
7379    struct pvr_device *const device = cmd_buffer->device;
7380    /* Yes we want a copy. The user could be recording multiple command buffers
7381     * in parallel so writing the template in place could cause problems.
7382     */
7383    struct pvr_static_clear_ppp_template clear =
7384       device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
7385    uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT] = { 0 };
7386    struct pvr_csb *csb = &sub_cmd->control_stream;
7387    struct pvr_suballoc_bo *ppp_bo;
7388 
7389    assert(clear.requires_pds_state);
7390 
7391    /* Patch the template. */
7392 
7393    pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
7394                  TA_STATE_PDS_SHADERBASE,
7395                  shaderbase) {
7396       shaderbase.addr = PVR_DEV_ADDR(device->nop_program.pds.data_offset);
7397    }
7398 
7399    clear.config.pds_state = &pds_state;
7400 
7401    clear.config.ispctl.upass = cmd_buffer->state.render_pass_info.isp_userpass;
7402 
7403    /* Emit PPP state from template. */
7404 
7405    pvr_emit_ppp_from_template(csb, &clear, &ppp_bo);
7406    list_add(&ppp_bo->link, &cmd_buffer->bo_list);
7407 
7408    /* Emit VDM state. */
7409 
7410    pvr_emit_clear_words(cmd_buffer, sub_cmd);
7411 
7412    /* Reset graphics state. */
7413    pvr_reset_graphics_dirty_state(cmd_buffer, false);
7414 }
7415 
7416 static inline struct pvr_render_subpass *
7417 pvr_get_current_subpass(const struct pvr_cmd_buffer_state *const state)
7418 {
7419    const uint32_t subpass_idx = state->render_pass_info.subpass_idx;
7420 
7421    return &state->render_pass_info.pass->subpasses[subpass_idx];
7422 }
7423 
7424 void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer,
7425                          const VkSubpassBeginInfo *pSubpassBeginInfo,
7426                          const VkSubpassEndInfo *pSubpassEndInfo)
7427 {
7428    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7429    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7430    struct pvr_render_pass_info *rp_info = &state->render_pass_info;
7431    const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
7432    struct pvr_renderpass_hwsetup_render *next_hw_render;
7433    const struct pvr_render_pass *pass = rp_info->pass;
7434    const struct pvr_renderpass_hw_map *current_map;
7435    const struct pvr_renderpass_hw_map *next_map;
7436    struct pvr_load_op *hw_subpass_load_op;
7437    VkResult result;
7438 
7439    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7440 
7441    current_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx];
7442    next_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx + 1];
7443    next_hw_render = &pass->hw_setup->renders[next_map->render];
7444 
7445    if (current_map->render != next_map->render) {
7446       result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7447       if (result != VK_SUCCESS)
7448          return;
7449 
7450       result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, rp_info);
7451       if (result != VK_SUCCESS)
7452          return;
7453 
7454       rp_info->current_hw_subpass = next_map->render;
7455 
7456       result =
7457          pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7458       if (result != VK_SUCCESS)
7459          return;
7460 
7461       rp_info->enable_bg_tag = false;
7462       rp_info->process_empty_tiles = false;
7463 
7464       /* If this subpass contains any load ops the HW Background Object must be
7465        * run to do the clears/loads.
7466        */
7467       if (next_hw_render->color_init_count > 0) {
7468          rp_info->enable_bg_tag = true;
7469 
7470          for (uint32_t i = 0; i < next_hw_render->color_init_count; i++) {
7471             /* Empty tiles need to be cleared too. */
7472             if (next_hw_render->color_init[i].op ==
7473                 VK_ATTACHMENT_LOAD_OP_CLEAR) {
7474                rp_info->process_empty_tiles = true;
7475                break;
7476             }
7477          }
7478       }
7479 
7480       /* Set isp_userpass to zero for new hw_render. This will be used to set
7481        * ROGUE_CR_ISP_CTL::upass_start.
7482        */
7483       rp_info->isp_userpass = 0;
7484    }
7485 
7486    hw_subpass = &next_hw_render->subpasses[next_map->subpass];
7487    hw_subpass_load_op = hw_subpass->load_op;
7488 
7489    if (hw_subpass_load_op) {
7490       result = pvr_cs_write_load_op(cmd_buffer,
7491                                     &state->current_sub_cmd->gfx,
7492                                     hw_subpass_load_op,
7493                                     rp_info->isp_userpass);
7494    }
7495 
7496    /* Pipelines are created for a particular subpass so unbind but leave the
7497     * vertex and descriptor bindings intact as they are orthogonal to the
7498     * subpass.
7499     */
7500    state->gfx_pipeline = NULL;
7501 
7502    /* User-pass spawn is 4 bits, so if the driver has to wrap it, it emits a
7503     * full-screen transparent object to flush all tags up to this point. The
7504     * user-pass spawn value is then implicitly reset to 0 because
7505     * pvr_render_subpass::isp_userpass values are stored ANDed with
7506     * ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX.
7507     */
7508    /* If hw_subpass_load_op is valid then pvr_write_load_op_control_stream
7509     * has already done a full-screen transparent object.
7510     */
7511    if (rp_info->isp_userpass == PVRX(CR_ISP_CTL_UPASS_START_SIZE_MAX) &&
7512        !hw_subpass_load_op) {
7513       pvr_insert_transparent_obj(cmd_buffer, &state->current_sub_cmd->gfx);
7514    }
7515 
7516    rp_info->subpass_idx++;
7517 
7518    rp_info->isp_userpass = pass->subpasses[rp_info->subpass_idx].isp_userpass;
7519    state->dirty.isp_userpass = true;
7520 
7521    rp_info->pipeline_bind_point =
7522       pass->subpasses[rp_info->subpass_idx].pipeline_bind_point;
7523 
7524    pvr_stash_depth_format(state, &state->current_sub_cmd->gfx);
7525 }
7526 
7527 static bool
7528 pvr_stencil_has_self_dependency(const struct pvr_cmd_buffer_state *const state)
7529 {
7530    const struct pvr_render_subpass *const current_subpass =
7531       pvr_get_current_subpass(state);
7532    const uint32_t *const input_attachments = current_subpass->input_attachments;
7533 
7534    if (current_subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED)
7535       return false;
7536 
7537    /* We only need to check the current software subpass as we don't support
7538     * merging to/from a subpass with self-dep stencil.
7539     */
7540 
7541    for (uint32_t i = 0; i < current_subpass->input_count; i++) {
7542       if (input_attachments[i] == current_subpass->depth_stencil_attachment)
7543          return true;
7544    }
7545 
7546    return false;
7547 }
7548 
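/* Decide whether this barrier requires the in-flight stencil to be stored out
 * and reloaded (a mid-frag kick). Broadly: the bound depth/stencil
 * attachment's stencil is written and then read back as an input attachment
 * in a subpass with a self-dependency; the checks below look for that pattern
 * in the supplied memory and image barriers.
 */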
7549 static bool pvr_is_stencil_store_load_needed(
7550    const struct pvr_cmd_buffer *const cmd_buffer,
7551    VkPipelineStageFlags2 vk_src_stage_mask,
7552    VkPipelineStageFlags2 vk_dst_stage_mask,
7553    uint32_t memory_barrier_count,
7554    const VkMemoryBarrier2 *const memory_barriers,
7555    uint32_t image_barrier_count,
7556    const VkImageMemoryBarrier2 *const image_barriers)
7557 {
7558    const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7559    const uint32_t fragment_test_stages =
7560       VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
7561       VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
7562    const struct pvr_render_pass *const pass = state->render_pass_info.pass;
7563    const struct pvr_renderpass_hwsetup_render *hw_render;
7564    struct pvr_image_view **const attachments =
7565       state->render_pass_info.attachments;
7566    const struct pvr_image_view *attachment;
7567    uint32_t hw_render_idx;
7568 
7569    if (!pass)
7570       return false;
7571 
7572    hw_render_idx = state->current_sub_cmd->gfx.hw_render_idx;
7573    hw_render = &pass->hw_setup->renders[hw_render_idx];
7574 
7575    if (hw_render->ds_attach_idx == VK_ATTACHMENT_UNUSED)
7576       return false;
7577 
7578    if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
7579       attachment = attachments[hw_render->ds_attach_idx];
7580    } else {
7581       assert(!attachments);
7582       attachment = NULL;
7583    }
7584 
7585    if (!(vk_src_stage_mask & fragment_test_stages) &&
7586        vk_dst_stage_mask & VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
7587       return false;
7588 
7589    for (uint32_t i = 0; i < memory_barrier_count; i++) {
7590       const uint32_t stencil_write_bit =
7591          VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
7592       const uint32_t input_attachment_read_bit =
7593          VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
7594 
7595       if (!(memory_barriers[i].srcAccessMask & stencil_write_bit))
7596          continue;
7597 
7598       if (!(memory_barriers[i].dstAccessMask & input_attachment_read_bit))
7599          continue;
7600 
7601       return pvr_stencil_has_self_dependency(state);
7602    }
7603 
7604    for (uint32_t i = 0; i < image_barrier_count; i++) {
7605       PVR_FROM_HANDLE(pvr_image, image, image_barriers[i].image);
7606       const uint32_t stencil_bit = VK_IMAGE_ASPECT_STENCIL_BIT;
7607 
7608       if (!(image_barriers[i].subresourceRange.aspectMask & stencil_bit))
7609          continue;
7610 
7611       if (attachment && image != vk_to_pvr_image(attachment->vk.image))
7612          continue;
7613 
7614       if (!vk_format_has_stencil(image->vk.format))
7615          continue;
7616 
7617       return pvr_stencil_has_self_dependency(state);
7618    }
7619 
7620    return false;
7621 }
7622 
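/* Splits the current graphics sub-command around an in-render-pass barrier:
 * end it with barrier_store set (kicking a render that stores out stencil),
 * insert a PVR_EVENT_TYPE_BARRIER event sub-command, then start a fresh
 * graphics sub-command that reloads the attachments via barrier_load.
 */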
7623 static VkResult
7624 pvr_cmd_buffer_insert_mid_frag_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7625                                              uint32_t src_stage_mask,
7626                                              uint32_t dst_stage_mask)
7627 {
7628    VkResult result;
7629 
7630    assert(cmd_buffer->state.current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7631 
7632    cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false;
7633 
7634    /* Submit graphics job to store stencil. */
7635    cmd_buffer->state.current_sub_cmd->gfx.barrier_store = true;
7636 
7637    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7638    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7639    if (result != VK_SUCCESS)
7640       return result;
7641 
7642    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7643       .type = PVR_EVENT_TYPE_BARRIER,
7644       .barrier = {
7645          .in_render_pass = true,
7646          .wait_for_stage_mask = src_stage_mask,
7647          .wait_at_stage_mask = dst_stage_mask,
7648       },
7649    };
7650 
7651    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7652    pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7653 
7654    /* Use existing render setup, but load color attachments from HW BGOBJ */
7655    cmd_buffer->state.current_sub_cmd->gfx.barrier_load = true;
7656    cmd_buffer->state.current_sub_cmd->gfx.barrier_store = false;
7657 
7658    return VK_SUCCESS;
7659 }
7660 
7661 static VkResult
7662 pvr_cmd_buffer_insert_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7663                                     uint32_t src_stage_mask,
7664                                     uint32_t dst_stage_mask)
7665 {
7666    VkResult result;
7667 
7668    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7669    if (result != VK_SUCCESS)
7670       return result;
7671 
7672    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7673       .type = PVR_EVENT_TYPE_BARRIER,
7674       .barrier = {
7675          .wait_for_stage_mask = src_stage_mask,
7676          .wait_at_stage_mask = dst_stage_mask,
7677       },
7678    };
7679 
7680    return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7681 }
7682 
7683 /* This is just enough to handle vkCmdPipelineBarrier().
7684  * TODO: Complete?
7685  */
7686 void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
7687                              const VkDependencyInfo *pDependencyInfo)
7688 {
7689    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7690    struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7691    const struct pvr_render_pass *const render_pass =
7692       state->render_pass_info.pass;
7693    VkPipelineStageFlags vk_src_stage_mask = 0U;
7694    VkPipelineStageFlags vk_dst_stage_mask = 0U;
7695    bool is_stencil_store_load_needed;
7696    uint32_t required_stage_mask = 0U;
7697    uint32_t src_stage_mask;
7698    uint32_t dst_stage_mask;
7699    bool is_barrier_needed;
7700 
7701    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7702 
7703    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
7704       vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7705       vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
7706    }
7707 
7708    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
7709       vk_src_stage_mask |=
7710          pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7711       vk_dst_stage_mask |=
7712          pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
7713    }
7714 
7715    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
7716       vk_src_stage_mask |=
7717          pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7718       vk_dst_stage_mask |=
7719          pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
7720    }
7721 
7722    src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
7723    dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);
7724 
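   /* Gather the "barriers needed" masks of every destination stage named in
    * this barrier, restrict the requested source mask to the stages that
    * still owe a barrier, and then mark those source stages as satisfied for
    * each of those destination stages.
    */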
7725    for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7726       if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7727          continue;
7728 
7729       required_stage_mask |= state->barriers_needed[stage];
7730    }
7731 
7732    src_stage_mask &= required_stage_mask;
7733    for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7734       if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7735          continue;
7736 
7737       state->barriers_needed[stage] &= ~src_stage_mask;
7738    }
7739 
7740    if (src_stage_mask == 0 || dst_stage_mask == 0) {
7741       is_barrier_needed = false;
7742    } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
7743               dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
7744       /* This is implicit so no need to barrier. */
7745       is_barrier_needed = false;
7746    } else if (src_stage_mask == dst_stage_mask &&
7747               util_bitcount(src_stage_mask) == 1) {
7748       struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;
7749 
7750       switch (src_stage_mask) {
7751       case PVR_PIPELINE_STAGE_FRAG_BIT:
7752          is_barrier_needed = !render_pass;
7753 
7754          if (is_barrier_needed)
7755             break;
7756 
7757          assert(current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7758 
7759          /* Flush all fragment work up to this point. */
7760          pvr_insert_transparent_obj(cmd_buffer, &current_sub_cmd->gfx);
7761          break;
7762 
7763       case PVR_PIPELINE_STAGE_COMPUTE_BIT:
7764          is_barrier_needed = false;
7765 
7766          if (!current_sub_cmd ||
7767              current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
7768             break;
7769          }
7770 
7771          /* Multiple dispatches can be merged into a single job. When back to
7772           * back dispatches have a sequential dependency (Compute -> compute
7773           * pipeline barrier) we need to do the following.
7774           *   - Dispatch a kernel which fences all previous memory writes and
7775           *     flushes the MADD cache.
7776           *   - Issue a compute fence which ensures all previous tasks emitted
7777           *     by the compute data master are completed before starting
7778           *     anything new.
7779           */
7780 
7781          /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
7782           * for data.
7783           */
7784          pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);
7785 
7786          pvr_compute_generate_fence(cmd_buffer,
7787                                     &current_sub_cmd->compute,
7788                                     false);
7789          break;
7790 
7791       default:
7792          is_barrier_needed = false;
7793          break;
7794       };
7795    } else {
7796       is_barrier_needed = true;
7797    }
7798 
7799    is_stencil_store_load_needed =
7800       pvr_is_stencil_store_load_needed(cmd_buffer,
7801                                        vk_src_stage_mask,
7802                                        vk_dst_stage_mask,
7803                                        pDependencyInfo->memoryBarrierCount,
7804                                        pDependencyInfo->pMemoryBarriers,
7805                                        pDependencyInfo->imageMemoryBarrierCount,
7806                                        pDependencyInfo->pImageMemoryBarriers);
7807 
7808    if (is_stencil_store_load_needed) {
7809       VkResult result;
7810 
7811       result = pvr_cmd_buffer_insert_mid_frag_barrier_event(cmd_buffer,
7812                                                             src_stage_mask,
7813                                                             dst_stage_mask);
7814       if (result != VK_SUCCESS)
7815          mesa_loge("Failed to insert mid frag barrier event.");
7816    } else {
7817       if (is_barrier_needed) {
7818          VkResult result;
7819 
7820          result = pvr_cmd_buffer_insert_barrier_event(cmd_buffer,
7821                                                       src_stage_mask,
7822                                                       dst_stage_mask);
7823          if (result != VK_SUCCESS)
7824             mesa_loge("Failed to insert pipeline barrier event.");
7825       }
7826    }
7827 }
7828 
7829 void pvr_CmdResetEvent2(VkCommandBuffer commandBuffer,
7830                         VkEvent _event,
7831                         VkPipelineStageFlags2 stageMask)
7832 {
7833    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7834    PVR_FROM_HANDLE(pvr_event, event, _event);
7835    VkResult result;
7836 
7837    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7838 
7839    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7840    if (result != VK_SUCCESS)
7841       return;
7842 
7843    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7844       .type = PVR_EVENT_TYPE_RESET,
7845       .set_reset = {
7846          .event = event,
7847          .wait_for_stage_mask = pvr_stage_mask_src(stageMask),
7848       },
7849    };
7850 
7851    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7852 }
7853 
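/* vkCmdSetEvent2(): the union of the srcStageMask of every barrier in
 * pDependencyInfo defines the work that must complete before the event is
 * signaled; this is recorded as a set-event sub-command and resolved at
 * submission time.
 */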
7854 void pvr_CmdSetEvent2(VkCommandBuffer commandBuffer,
7855                       VkEvent _event,
7856                       const VkDependencyInfo *pDependencyInfo)
7857 {
7858    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7859    PVR_FROM_HANDLE(pvr_event, event, _event);
7860    VkPipelineStageFlags2 stage_mask = 0;
7861    VkResult result;
7862 
7863    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7864 
7865    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7866    if (result != VK_SUCCESS)
7867       return;
7868 
7869    for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
7870       stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7871 
7872    for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
7873       stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7874 
7875    for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
7876       stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7877 
7878    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7879       .type = PVR_EVENT_TYPE_SET,
7880       .set_reset = {
7881          .event = event,
7882          .wait_for_stage_mask = pvr_stage_mask_dst(stage_mask),
7883       },
7884    };
7885 
7886    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7887 }
7888 
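/* vkCmdWaitEvents2(): records a wait sub-command holding a copy of the event
 * handles and, per event, the destination stage mask at which the wait is
 * performed. Both arrays are allocated from the command pool allocator and
 * are owned by the recorded sub-command.
 */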
7889 void pvr_CmdWaitEvents2(VkCommandBuffer commandBuffer,
7890                         uint32_t eventCount,
7891                         const VkEvent *pEvents,
7892                         const VkDependencyInfo *pDependencyInfos)
7893 {
7894    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7895    struct pvr_event **events_array;
7896    uint32_t *stage_masks;
7897    VkResult result;
7898 
7899    PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7900 
7901    VK_MULTIALLOC(ma);
7902    vk_multialloc_add(&ma, &events_array, __typeof__(*events_array), eventCount);
7903    vk_multialloc_add(&ma, &stage_masks, __typeof__(*stage_masks), eventCount);
7904 
7905    if (!vk_multialloc_alloc(&ma,
7906                             &cmd_buffer->vk.pool->alloc,
7907                             VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
7908       vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
7909       return;
7910    }
7911 
7912    result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7913    if (result != VK_SUCCESS) {
7914       vk_free(&cmd_buffer->vk.pool->alloc, events_array);
7915       return;
7916    }
7917 
7918    memcpy(events_array, pEvents, sizeof(*events_array) * eventCount);
7919 
7920    for (uint32_t i = 0; i < eventCount; i++) {
7921       const VkDependencyInfo *info = &pDependencyInfos[i];
7922       VkPipelineStageFlags2 mask = 0;
7923 
7924       for (uint32_t j = 0; j < info->memoryBarrierCount; j++)
7925          mask |= info->pMemoryBarriers[j].dstStageMask;
7926 
7927       for (uint32_t j = 0; j < info->bufferMemoryBarrierCount; j++)
7928          mask |= info->pBufferMemoryBarriers[j].dstStageMask;
7929 
7930       for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++)
7931          mask |= info->pImageMemoryBarriers[j].dstStageMask;
7932 
7933       stage_masks[i] = pvr_stage_mask_dst(mask);
7934    }
7935 
7936    cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7937       .type = PVR_EVENT_TYPE_WAIT,
7938       .wait = {
7939          .count = eventCount,
7940          .events = events_array,
7941          .wait_at_stage_masks = stage_masks,
7942       },
7943    };
7944 
7945    pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7946 }
7947 
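/* Timestamp queries are not supported by this driver, so this entry point
 * should never be reached.
 */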
7948 void pvr_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
7949                             VkPipelineStageFlags2 stage,
7950                             VkQueryPool queryPool,
7951                             uint32_t query)
7952 {
7953    unreachable("Timestamp queries are not supported.");
7954 }
7955 
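/* vkEndCommandBuffer(): finishes any open sub-command and transitions the
 * command buffer out of the recording state, returning the first error (if
 * any) recorded while building it.
 */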
7956 VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
7957 {
7958    PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7959    struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7960    VkResult result;
7961 
7962    if (vk_command_buffer_has_error(&cmd_buffer->vk))
7963       return vk_command_buffer_end(&cmd_buffer->vk);
7964 
7965    /* TODO: We should be freeing all the resources, allocated for recording,
7966    /* TODO: We should be freeing all the resources allocated for recording
7967     * here.
7968     */
7969 
7970    result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7971    if (result != VK_SUCCESS)
7972       pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
7973 
7974    return vk_command_buffer_end(&cmd_buffer->vk);
7975 }
7976