1 /*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <limits.h>
26 #include <stdbool.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <string.h>
30 #include <vulkan/vulkan.h>
31
32 #include "hwdef/rogue_hw_defs.h"
33 #include "hwdef/rogue_hw_utils.h"
34 #include "pvr_blit.h"
35 #include "pvr_bo.h"
36 #include "pvr_clear.h"
37 #include "pvr_common.h"
38 #include "pvr_csb.h"
39 #include "pvr_csb_enum_helpers.h"
40 #include "pvr_device_info.h"
41 #include "pvr_formats.h"
42 #include "pvr_hardcode.h"
43 #include "pvr_hw_pass.h"
44 #include "pvr_job_common.h"
45 #include "pvr_job_render.h"
46 #include "pvr_limits.h"
47 #include "pvr_pds.h"
48 #include "pvr_private.h"
49 #include "pvr_tex_state.h"
50 #include "pvr_types.h"
51 #include "pvr_uscgen.h"
52 #include "pvr_winsys.h"
53 #include "util/bitscan.h"
54 #include "util/bitset.h"
55 #include "util/compiler.h"
56 #include "util/list.h"
57 #include "util/macros.h"
58 #include "util/u_dynarray.h"
59 #include "util/u_math.h"
60 #include "util/u_pack_color.h"
61 #include "vk_alloc.h"
62 #include "vk_command_buffer.h"
63 #include "vk_command_pool.h"
64 #include "vk_common_entrypoints.h"
65 #include "vk_format.h"
66 #include "vk_graphics_state.h"
67 #include "vk_log.h"
68 #include "vk_object.h"
69 #include "vk_util.h"
70
71 /* Structure used to pass data into pvr_compute_generate_control_stream()
72 * function.
73 */
74 struct pvr_compute_kernel_info {
75 pvr_dev_addr_t indirect_buffer_addr;
76 bool global_offsets_present;
77 uint32_t usc_common_size;
78 uint32_t usc_unified_size;
79 uint32_t pds_temp_size;
80 uint32_t pds_data_size;
81 enum PVRX(CDMCTRL_USC_TARGET) usc_target;
82 bool is_fence;
83 uint32_t pds_data_offset;
84 uint32_t pds_code_offset;
85 enum PVRX(CDMCTRL_SD_TYPE) sd_type;
86 bool usc_common_shared;
87 uint32_t local_size[PVR_WORKGROUP_DIMENSIONS];
88 uint32_t global_size[PVR_WORKGROUP_DIMENSIONS];
89 uint32_t max_instances;
90 };
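/* Note: local_size and global_size each hold PVR_WORKGROUP_DIMENSIONS entries,
 * presumably one per dispatch dimension (X, Y, Z).
 */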
91
static void pvr_cmd_buffer_free_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
93 struct pvr_sub_cmd *sub_cmd)
94 {
95 if (sub_cmd->owned) {
96 switch (sub_cmd->type) {
97 case PVR_SUB_CMD_TYPE_GRAPHICS:
98 util_dynarray_fini(&sub_cmd->gfx.sec_query_indices);
99 pvr_csb_finish(&sub_cmd->gfx.control_stream);
100 pvr_bo_free(cmd_buffer->device, sub_cmd->gfx.terminate_ctrl_stream);
101 pvr_bo_suballoc_free(sub_cmd->gfx.depth_bias_bo);
102 pvr_bo_suballoc_free(sub_cmd->gfx.scissor_bo);
103 break;
104
105 case PVR_SUB_CMD_TYPE_COMPUTE:
106 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
107 pvr_csb_finish(&sub_cmd->compute.control_stream);
108 break;
109
110 case PVR_SUB_CMD_TYPE_TRANSFER:
111 list_for_each_entry_safe (struct pvr_transfer_cmd,
112 transfer_cmd,
113 sub_cmd->transfer.transfer_cmds,
114 link) {
115 list_del(&transfer_cmd->link);
116 if (!transfer_cmd->is_deferred_clear)
117 vk_free(&cmd_buffer->vk.pool->alloc, transfer_cmd);
118 }
119 break;
120
121 case PVR_SUB_CMD_TYPE_EVENT:
122 if (sub_cmd->event.type == PVR_EVENT_TYPE_WAIT)
123 vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd->event.wait.events);
124 break;
125
126 default:
127 unreachable("Unsupported sub-command type");
128 }
129 }
130
131 list_del(&sub_cmd->link);
132 vk_free(&cmd_buffer->vk.pool->alloc, sub_cmd);
133 }
134
static void pvr_cmd_buffer_free_sub_cmds(struct pvr_cmd_buffer *cmd_buffer)
136 {
137 list_for_each_entry_safe (struct pvr_sub_cmd,
138 sub_cmd,
139 &cmd_buffer->sub_cmds,
140 link) {
141 pvr_cmd_buffer_free_sub_cmd(cmd_buffer, sub_cmd);
142 }
143 }
144
static void pvr_cmd_buffer_free_resources(struct pvr_cmd_buffer *cmd_buffer)
146 {
147 vk_free(&cmd_buffer->vk.pool->alloc,
148 cmd_buffer->state.render_pass_info.attachments);
149 vk_free(&cmd_buffer->vk.pool->alloc,
150 cmd_buffer->state.render_pass_info.clear_values);
151
152 util_dynarray_fini(&cmd_buffer->state.query_indices);
153
154 pvr_cmd_buffer_free_sub_cmds(cmd_buffer);
155
156 list_for_each_entry_safe (struct pvr_suballoc_bo,
157 suballoc_bo,
158 &cmd_buffer->bo_list,
159 link) {
160 list_del(&suballoc_bo->link);
161 pvr_bo_suballoc_free(suballoc_bo);
162 }
163
164 util_dynarray_fini(&cmd_buffer->deferred_clears);
165 util_dynarray_fini(&cmd_buffer->deferred_csb_commands);
166 util_dynarray_fini(&cmd_buffer->scissor_array);
167 util_dynarray_fini(&cmd_buffer->depth_bias_array);
168 }
169
static void pvr_cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
171 VkCommandBufferResetFlags flags)
172 {
173 struct pvr_cmd_buffer *cmd_buffer =
174 container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
175
176 /* FIXME: For now we always free all resources as if
177 * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
178 */
179 pvr_cmd_buffer_free_resources(cmd_buffer);
180
181 vk_command_buffer_reset(&cmd_buffer->vk);
182
183 memset(&cmd_buffer->state, 0, sizeof(cmd_buffer->state));
184 memset(&cmd_buffer->scissor_words, 0, sizeof(cmd_buffer->scissor_words));
185
186 cmd_buffer->usage_flags = 0;
187 }
188
static void pvr_cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
190 {
191 struct pvr_cmd_buffer *cmd_buffer =
192 container_of(vk_cmd_buffer, struct pvr_cmd_buffer, vk);
193
194 pvr_cmd_buffer_free_resources(cmd_buffer);
195 vk_command_buffer_finish(&cmd_buffer->vk);
196 vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
197 }
198
199 static const struct vk_command_buffer_ops cmd_buffer_ops = {
200 .reset = pvr_cmd_buffer_reset,
201 .destroy = pvr_cmd_buffer_destroy,
202 };
203
static VkResult pvr_cmd_buffer_create(struct pvr_device *device,
205 struct vk_command_pool *pool,
206 VkCommandBufferLevel level,
207 VkCommandBuffer *pCommandBuffer)
208 {
209 struct pvr_cmd_buffer *cmd_buffer;
210 VkResult result;
211
212 cmd_buffer = vk_zalloc(&pool->alloc,
213 sizeof(*cmd_buffer),
214 8U,
215 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
216 if (!cmd_buffer)
217 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
218
219 result =
220 vk_command_buffer_init(pool, &cmd_buffer->vk, &cmd_buffer_ops, level);
221 if (result != VK_SUCCESS) {
222 vk_free(&pool->alloc, cmd_buffer);
223 return result;
224 }
225
226 cmd_buffer->device = device;
227
228 util_dynarray_init(&cmd_buffer->depth_bias_array, NULL);
229 util_dynarray_init(&cmd_buffer->scissor_array, NULL);
230 util_dynarray_init(&cmd_buffer->deferred_csb_commands, NULL);
231 util_dynarray_init(&cmd_buffer->deferred_clears, NULL);
232
233 list_inithead(&cmd_buffer->sub_cmds);
234 list_inithead(&cmd_buffer->bo_list);
235
236 *pCommandBuffer = pvr_cmd_buffer_to_handle(cmd_buffer);
237
238 return VK_SUCCESS;
239 }
240
241 VkResult
pvr_AllocateCommandBuffers(VkDevice _device,
243 const VkCommandBufferAllocateInfo *pAllocateInfo,
244 VkCommandBuffer *pCommandBuffers)
245 {
246 VK_FROM_HANDLE(vk_command_pool, pool, pAllocateInfo->commandPool);
247 PVR_FROM_HANDLE(pvr_device, device, _device);
248 VkResult result = VK_SUCCESS;
249 uint32_t i;
250
251 for (i = 0; i < pAllocateInfo->commandBufferCount; i++) {
252 result = pvr_cmd_buffer_create(device,
253 pool,
254 pAllocateInfo->level,
255 &pCommandBuffers[i]);
256 if (result != VK_SUCCESS)
257 break;
258 }
259
260 if (result != VK_SUCCESS) {
261 while (i--) {
262 VK_FROM_HANDLE(vk_command_buffer, cmd_buffer, pCommandBuffers[i]);
263 pvr_cmd_buffer_destroy(cmd_buffer);
264 }
265
266 for (i = 0; i < pAllocateInfo->commandBufferCount; i++)
267 pCommandBuffers[i] = VK_NULL_HANDLE;
268 }
269
270 return result;
271 }
272
static void pvr_cmd_buffer_update_barriers(struct pvr_cmd_buffer *cmd_buffer,
274 enum pvr_sub_cmd_type type)
275 {
276 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
277 uint32_t barriers;
278
279 switch (type) {
280 case PVR_SUB_CMD_TYPE_GRAPHICS:
281 barriers = PVR_PIPELINE_STAGE_GEOM_BIT | PVR_PIPELINE_STAGE_FRAG_BIT;
282 break;
283
284 case PVR_SUB_CMD_TYPE_COMPUTE:
285 barriers = PVR_PIPELINE_STAGE_COMPUTE_BIT;
286 break;
287
288 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
289 case PVR_SUB_CMD_TYPE_TRANSFER:
      /* Compute jobs are used for occlusion queries, but to copy the results
       * we have to sync with transfer jobs, because the spec treats
       * vkCmdCopyQueryPoolResults() as a transfer operation.
       */
294 barriers = PVR_PIPELINE_STAGE_TRANSFER_BIT;
295 break;
296
297 case PVR_SUB_CMD_TYPE_EVENT:
298 barriers = 0;
299 break;
300
301 default:
302 unreachable("Unsupported sub-command type");
303 }
304
305 for (uint32_t i = 0; i < ARRAY_SIZE(state->barriers_needed); i++)
306 state->barriers_needed[i] |= barriers;
307 }
308
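/* Uploads the command buffer's accumulated depth-bias and scissor tables to
 * the general heap, aligned to the SLC cache line size, then clears the
 * arrays so the next graphics sub-command starts with empty tables.
 */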
309 static VkResult
pvr_cmd_buffer_upload_tables(struct pvr_device *device,
311 struct pvr_cmd_buffer *cmd_buffer,
312 struct pvr_sub_cmd_gfx *const sub_cmd)
313 {
314 const uint32_t cache_line_size =
315 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
316 VkResult result;
317
318 assert(!sub_cmd->depth_bias_bo && !sub_cmd->scissor_bo);
319
320 if (cmd_buffer->depth_bias_array.size > 0) {
321 result =
322 pvr_gpu_upload(device,
323 device->heaps.general_heap,
324 util_dynarray_begin(&cmd_buffer->depth_bias_array),
325 cmd_buffer->depth_bias_array.size,
326 cache_line_size,
327 &sub_cmd->depth_bias_bo);
328 if (result != VK_SUCCESS)
329 return result;
330 }
331
332 if (cmd_buffer->scissor_array.size > 0) {
333 result = pvr_gpu_upload(device,
334 device->heaps.general_heap,
335 util_dynarray_begin(&cmd_buffer->scissor_array),
336 cmd_buffer->scissor_array.size,
337 cache_line_size,
338 &sub_cmd->scissor_bo);
339 if (result != VK_SUCCESS)
340 goto err_free_depth_bias_bo;
341 }
342
343 util_dynarray_clear(&cmd_buffer->depth_bias_array);
344 util_dynarray_clear(&cmd_buffer->scissor_array);
345
346 return VK_SUCCESS;
347
348 err_free_depth_bias_bo:
349 pvr_bo_suballoc_free(sub_cmd->depth_bias_bo);
350 sub_cmd->depth_bias_bo = NULL;
351
352 return result;
353 }
354
355 static VkResult
pvr_cmd_buffer_emit_ppp_state(const struct pvr_cmd_buffer *const cmd_buffer,
357 struct pvr_csb *const csb)
358 {
359 const struct pvr_framebuffer *const framebuffer =
360 cmd_buffer->state.render_pass_info.framebuffer;
361
362 assert(csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS ||
363 csb->stream_type == PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED);
364
365 pvr_csb_set_relocation_mark(csb);
366
367 pvr_csb_emit (csb, VDMCTRL_PPP_STATE0, state0) {
368 state0.addrmsb = framebuffer->ppp_state_bo->dev_addr;
369 state0.word_count = framebuffer->ppp_state_size;
370 }
371
372 pvr_csb_emit (csb, VDMCTRL_PPP_STATE1, state1) {
373 state1.addrlsb = framebuffer->ppp_state_bo->dev_addr;
374 }
375
376 pvr_csb_clear_relocation_mark(csb);
377
378 return csb->status;
379 }
380
381 VkResult
pvr_cmd_buffer_upload_general(struct pvr_cmd_buffer *const cmd_buffer,
383 const void *const data,
384 const size_t size,
385 struct pvr_suballoc_bo **const pvr_bo_out)
386 {
387 struct pvr_device *const device = cmd_buffer->device;
388 const uint32_t cache_line_size =
389 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
390 struct pvr_suballoc_bo *suballoc_bo;
391 VkResult result;
392
393 result = pvr_gpu_upload(device,
394 device->heaps.general_heap,
395 data,
396 size,
397 cache_line_size,
398 &suballoc_bo);
399 if (result != VK_SUCCESS)
400 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
401
402 list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
403
404 *pvr_bo_out = suballoc_bo;
405
406 return VK_SUCCESS;
407 }
408
409 static VkResult
pvr_cmd_buffer_upload_usc(struct pvr_cmd_buffer *const cmd_buffer,
411 const void *const code,
412 const size_t code_size,
413 uint64_t code_alignment,
414 struct pvr_suballoc_bo **const pvr_bo_out)
415 {
416 struct pvr_device *const device = cmd_buffer->device;
417 const uint32_t cache_line_size =
418 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
419 struct pvr_suballoc_bo *suballoc_bo;
420 VkResult result;
421
422 code_alignment = MAX2(code_alignment, cache_line_size);
423
424 result =
425 pvr_gpu_upload_usc(device, code, code_size, code_alignment, &suballoc_bo);
426 if (result != VK_SUCCESS)
427 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
428
429 list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
430
431 *pvr_bo_out = suballoc_bo;
432
433 return VK_SUCCESS;
434 }
435
VkResult pvr_cmd_buffer_upload_pds(struct pvr_cmd_buffer *const cmd_buffer,
437 const uint32_t *data,
438 uint32_t data_size_dwords,
439 uint32_t data_alignment,
440 const uint32_t *code,
441 uint32_t code_size_dwords,
442 uint32_t code_alignment,
443 uint64_t min_alignment,
444 struct pvr_pds_upload *const pds_upload_out)
445 {
446 struct pvr_device *const device = cmd_buffer->device;
447 VkResult result;
448
449 result = pvr_gpu_upload_pds(device,
450 data,
451 data_size_dwords,
452 data_alignment,
453 code,
454 code_size_dwords,
455 code_alignment,
456 min_alignment,
457 pds_upload_out);
458 if (result != VK_SUCCESS)
459 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
460
461 list_add(&pds_upload_out->pvr_bo->link, &cmd_buffer->bo_list);
462
463 return VK_SUCCESS;
464 }
465
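/* Convenience wrapper around pvr_cmd_buffer_upload_pds() for data-only PDS
 * programs: no code segment is passed (NULL code, zero code dwords), so only
 * the data section is uploaded.
 */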
466 static inline VkResult
pvr_cmd_buffer_upload_pds_data(struct pvr_cmd_buffer *const cmd_buffer,
468 const uint32_t *data,
469 uint32_t data_size_dwords,
470 uint32_t data_alignment,
471 struct pvr_pds_upload *const pds_upload_out)
472 {
473 return pvr_cmd_buffer_upload_pds(cmd_buffer,
474 data,
475 data_size_dwords,
476 data_alignment,
477 NULL,
478 0,
479 0,
480 data_alignment,
481 pds_upload_out);
482 }
483
484 /* pbe_cs_words must be an array of length emit_count with
485 * ROGUE_NUM_PBESTATE_STATE_WORDS entries
486 */
static VkResult pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
488 struct pvr_cmd_buffer *const cmd_buffer,
489 const uint32_t emit_count,
490 const uint32_t *pbe_cs_words,
491 struct pvr_pds_upload *const pds_upload_out)
492 {
493 struct pvr_pds_event_program pixel_event_program = {
494 /* No data to DMA, just a DOUTU needed. */
495 .num_emit_word_pairs = 0,
496 };
497 const uint32_t staging_buffer_size =
498 PVR_DW_TO_BYTES(cmd_buffer->device->pixel_event_data_size_in_dwords);
499 const VkAllocationCallbacks *const allocator = &cmd_buffer->vk.pool->alloc;
500 struct pvr_device *const device = cmd_buffer->device;
501 struct pvr_suballoc_bo *usc_eot_program = NULL;
502 struct util_dynarray eot_program_bin;
503 uint32_t *staging_buffer;
504 uint32_t usc_temp_count;
505 VkResult result;
506
507 assert(emit_count > 0);
508
509 pvr_uscgen_eot("per-job EOT",
510 emit_count,
511 pbe_cs_words,
512 &usc_temp_count,
513 &eot_program_bin);
514
515 result = pvr_cmd_buffer_upload_usc(cmd_buffer,
516 eot_program_bin.data,
517 eot_program_bin.size,
518 4,
519 &usc_eot_program);
520
521 util_dynarray_fini(&eot_program_bin);
522
523 if (result != VK_SUCCESS)
524 return result;
525
526 pvr_pds_setup_doutu(&pixel_event_program.task_control,
527 usc_eot_program->dev_addr.addr,
528 usc_temp_count,
529 PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
530 false);
531
   /* TODO: We could skip allocating this and generate directly into the device
    * buffer, thus removing one allocation and memcpy() per job. Would this
    * speed things up in a noticeable way?
    */
536 staging_buffer = vk_alloc(allocator,
537 staging_buffer_size,
538 8,
539 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
540 if (!staging_buffer) {
541 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
542 goto err_free_usc_pixel_program;
543 }
544
545 /* Generate the data segment. The code segment was uploaded earlier when
546 * setting up the PDS static heap data.
547 */
548 pvr_pds_generate_pixel_event_data_segment(&pixel_event_program,
549 staging_buffer,
550 &device->pdevice->dev_info);
551
552 result = pvr_cmd_buffer_upload_pds_data(
553 cmd_buffer,
554 staging_buffer,
555 cmd_buffer->device->pixel_event_data_size_in_dwords,
556 4,
557 pds_upload_out);
558 if (result != VK_SUCCESS)
559 goto err_free_pixel_event_staging_buffer;
560
561 vk_free(allocator, staging_buffer);
562
563 return VK_SUCCESS;
564
565 err_free_pixel_event_staging_buffer:
566 vk_free(allocator, staging_buffer);
567
568 err_free_usc_pixel_program:
569 list_del(&usc_eot_program->link);
570 pvr_bo_suballoc_free(usc_eot_program);
571
572 return result;
573 }
574
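/* Builds a standalone control stream that only re-emits the framebuffer's PPP
 * state words and then terminates; the result is expected to fit in a single
 * bo (see the assert below).
 */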
static VkResult pvr_sub_cmd_gfx_build_terminate_ctrl_stream(
576 struct pvr_device *const device,
577 const struct pvr_cmd_buffer *const cmd_buffer,
578 struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
579 {
580 struct list_head bo_list;
581 struct pvr_csb csb;
582 VkResult result;
583
584 pvr_csb_init(device, PVR_CMD_STREAM_TYPE_GRAPHICS, &csb);
585
586 result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer, &csb);
587 if (result != VK_SUCCESS)
588 goto err_csb_finish;
589
590 result = pvr_csb_emit_terminate(&csb);
591 if (result != VK_SUCCESS)
592 goto err_csb_finish;
593
594 result = pvr_csb_bake(&csb, &bo_list);
595 if (result != VK_SUCCESS)
596 goto err_csb_finish;
597
   /* This is a trivial control stream; there's no reason it should ever
    * require more memory than a single bo can provide.
    */
601 assert(list_is_singular(&bo_list));
602 gfx_sub_cmd->terminate_ctrl_stream =
603 list_first_entry(&bo_list, struct pvr_bo, link);
604
605 return VK_SUCCESS;
606
607 err_csb_finish:
608 pvr_csb_finish(&csb);
609
610 return result;
611 }
612
static VkResult pvr_setup_texture_state_words(
614 struct pvr_device *device,
615 struct pvr_combined_image_sampler_descriptor *descriptor,
616 const struct pvr_image_view *image_view)
617 {
618 const struct pvr_image *image = vk_to_pvr_image(image_view->vk.image);
619 struct pvr_texture_state_info info = {
620 .format = image_view->vk.format,
621 .mem_layout = image->memlayout,
622 .type = image_view->vk.view_type,
623 .is_cube = image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE ||
624 image_view->vk.view_type == VK_IMAGE_VIEW_TYPE_CUBE_ARRAY,
625 .tex_state_type = PVR_TEXTURE_STATE_SAMPLE,
626 .extent = image_view->vk.extent,
627 .mip_levels = 1,
628 .sample_count = image_view->vk.image->samples,
629 .stride = image->physical_extent.width,
630 .addr = image->dev_addr,
631 };
632 const uint8_t *const swizzle = pvr_get_format_swizzle(info.format);
633 VkResult result;
634
635 memcpy(&info.swizzle, swizzle, sizeof(info.swizzle));
636
637 /* TODO: Can we use image_view->texture_state instead of generating here? */
638 result = pvr_pack_tex_state(device, &info, descriptor->image);
639 if (result != VK_SUCCESS)
640 return result;
641
642 descriptor->sampler = (union pvr_sampler_descriptor){ 0 };
643
644 pvr_csb_pack (&descriptor->sampler.data.sampler_word,
645 TEXSTATE_SAMPLER,
646 sampler) {
647 sampler.non_normalized_coords = true;
648 sampler.addrmode_v = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
649 sampler.addrmode_u = PVRX(TEXSTATE_ADDRMODE_CLAMP_TO_EDGE);
650 sampler.minfilter = PVRX(TEXSTATE_FILTER_POINT);
651 sampler.magfilter = PVRX(TEXSTATE_FILTER_POINT);
652 sampler.dadjust = PVRX(TEXSTATE_DADJUST_ZERO_UINT);
653 }
654
655 return VK_SUCCESS;
656 }
657
658 static VkResult
pvr_load_op_constants_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
660 const struct pvr_load_op *load_op,
661 pvr_dev_addr_t *const addr_out)
662 {
663 const struct pvr_render_pass_info *render_pass_info =
664 &cmd_buffer->state.render_pass_info;
665 const struct pvr_render_pass *pass = render_pass_info->pass;
666 const struct pvr_renderpass_hwsetup_render *hw_render = load_op->hw_render;
667 const struct pvr_renderpass_colorinit *color_init =
668 &hw_render->color_init[0];
669 const VkClearValue *clear_value =
670 &render_pass_info->clear_values[color_init->index];
671 struct pvr_suballoc_bo *clear_bo;
672 uint32_t attachment_count;
673 bool has_depth_clear;
674 bool has_depth_load;
675 VkResult result;
676
   /* These are only set up and never used for now. They will need to be
    * uploaded into a buffer based on some compiler info.
    */
680 /* TODO: Remove the above comment once the compiler is hooked up and we're
681 * setting up + uploading the buffer.
682 */
683 struct pvr_combined_image_sampler_descriptor
684 texture_states[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS];
685 uint32_t texture_count = 0;
686 uint32_t hw_clear_value[PVR_LOAD_OP_CLEARS_LOADS_MAX_RTS *
687 PVR_CLEAR_COLOR_ARRAY_SIZE];
688 uint32_t next_clear_consts = 0;
689
690 if (load_op->is_hw_object)
691 attachment_count = load_op->hw_render->color_init_count;
692 else
693 attachment_count = load_op->subpass->color_count;
694
695 for (uint32_t i = 0; i < attachment_count; i++) {
696 struct pvr_image_view *image_view;
697 uint32_t attachment_idx;
698
699 if (load_op->is_hw_object)
700 attachment_idx = load_op->hw_render->color_init[i].index;
701 else
702 attachment_idx = load_op->subpass->color_attachments[i];
703
704 image_view = render_pass_info->attachments[attachment_idx];
705
706 assert((load_op->clears_loads_state.rt_load_mask &
707 load_op->clears_loads_state.rt_clear_mask) == 0);
708 if (load_op->clears_loads_state.rt_load_mask & BITFIELD_BIT(i)) {
709 result = pvr_setup_texture_state_words(cmd_buffer->device,
710 &texture_states[texture_count],
711 image_view);
712 if (result != VK_SUCCESS)
713 return result;
714
715 texture_count++;
716 } else if (load_op->clears_loads_state.rt_clear_mask & BITFIELD_BIT(i)) {
717 const uint32_t accum_fmt_size =
718 pvr_get_pbe_accum_format_size_in_bytes(image_view->vk.format);
719
720 assert(next_clear_consts +
721 vk_format_get_blocksize(image_view->vk.format) <=
722 ARRAY_SIZE(hw_clear_value));
723
724 /* FIXME: do this at the point we store the clear values? */
725 pvr_get_hw_clear_color(image_view->vk.format,
726 clear_value->color,
727 &hw_clear_value[next_clear_consts]);
728
729 next_clear_consts += DIV_ROUND_UP(accum_fmt_size, sizeof(uint32_t));
730 }
731 }
732
733 has_depth_load = false;
734 for (uint32_t i = 0;
735 i < ARRAY_SIZE(load_op->clears_loads_state.dest_vk_format);
736 i++) {
737 if (load_op->clears_loads_state.dest_vk_format[i] ==
738 VK_FORMAT_D32_SFLOAT) {
739 has_depth_load = true;
740 break;
741 }
742 }
743
744 has_depth_clear = load_op->clears_loads_state.depth_clear_to_reg != -1;
745
746 assert(!(has_depth_clear && has_depth_load));
747
748 if (has_depth_load) {
749 const struct pvr_render_pass_attachment *attachment;
750 const struct pvr_image_view *image_view;
751
752 assert(load_op->subpass->depth_stencil_attachment !=
753 VK_ATTACHMENT_UNUSED);
754 assert(!load_op->is_hw_object);
755 attachment =
756 &pass->attachments[load_op->subpass->depth_stencil_attachment];
757
758 image_view = render_pass_info->attachments[attachment->index];
759
760 result = pvr_setup_texture_state_words(cmd_buffer->device,
761 &texture_states[texture_count],
762 image_view);
763 if (result != VK_SUCCESS)
764 return result;
765
766 texture_count++;
767 } else if (has_depth_clear) {
768 const struct pvr_render_pass_attachment *attachment;
769 VkClearValue clear_value;
770
771 assert(load_op->subpass->depth_stencil_attachment !=
772 VK_ATTACHMENT_UNUSED);
773 attachment =
774 &pass->attachments[load_op->subpass->depth_stencil_attachment];
775
776 clear_value = render_pass_info->clear_values[attachment->index];
777
778 assert(next_clear_consts < ARRAY_SIZE(hw_clear_value));
779 hw_clear_value[next_clear_consts++] = fui(clear_value.depthStencil.depth);
780 }
781
782 result = pvr_cmd_buffer_upload_general(cmd_buffer,
783 &hw_clear_value[0],
784 sizeof(hw_clear_value),
785 &clear_bo);
786 if (result != VK_SUCCESS)
787 return result;
788
789 *addr_out = clear_bo->dev_addr;
790
791 return VK_SUCCESS;
792 }
793
static VkResult pvr_load_op_pds_data_create_and_upload(
795 struct pvr_cmd_buffer *cmd_buffer,
796 const struct pvr_load_op *load_op,
797 pvr_dev_addr_t constants_addr,
798 struct pvr_pds_upload *const pds_upload_out)
799 {
800 struct pvr_device *device = cmd_buffer->device;
801 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
802 struct pvr_pds_pixel_shader_sa_program program = { 0 };
803 uint32_t staging_buffer_size;
804 uint32_t *staging_buffer;
805 VkResult result;
806
807 program.num_texture_dma_kicks = 1;
808
809 pvr_csb_pack (&program.texture_dma_address[0],
810 PDSINST_DOUT_FIELDS_DOUTD_SRC0,
811 value) {
812 value.sbase = constants_addr;
813 }
814
815 pvr_csb_pack (&program.texture_dma_control[0],
816 PDSINST_DOUT_FIELDS_DOUTD_SRC1,
817 value) {
818 value.dest = PVRX(PDSINST_DOUTD_DEST_COMMON_STORE);
819 value.a0 = load_op->shareds_dest_offset;
820 value.bsize = load_op->shareds_count;
821 }
822
823 pvr_pds_set_sizes_pixel_shader_sa_texture_data(&program, dev_info);
824
825 staging_buffer_size = PVR_DW_TO_BYTES(program.data_size);
826
827 staging_buffer = vk_alloc(&cmd_buffer->vk.pool->alloc,
828 staging_buffer_size,
829 8,
830 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
831 if (!staging_buffer)
832 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
833
834 pvr_pds_generate_pixel_shader_sa_texture_state_data(&program,
835 staging_buffer,
836 dev_info);
837
838 result = pvr_cmd_buffer_upload_pds_data(cmd_buffer,
839 staging_buffer,
840 program.data_size,
841 1,
842 pds_upload_out);
843 if (result != VK_SUCCESS) {
844 vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
845 return result;
846 }
847
848 vk_free(&cmd_buffer->vk.pool->alloc, staging_buffer);
849
850 return VK_SUCCESS;
851 }
852
853 /* FIXME: Should this function be specific to the HW background object, in
854 * which case its name should be changed, or should it have the load op
855 * structure passed in?
856 */
857 static VkResult
pvr_load_op_data_create_and_upload(struct pvr_cmd_buffer *cmd_buffer,
859 const struct pvr_load_op *load_op,
860 struct pvr_pds_upload *const pds_upload_out)
861 {
862 pvr_dev_addr_t constants_addr;
863 VkResult result;
864
865 result = pvr_load_op_constants_create_and_upload(cmd_buffer,
866 load_op,
867 &constants_addr);
868 if (result != VK_SUCCESS)
869 return result;
870
871 return pvr_load_op_pds_data_create_and_upload(cmd_buffer,
872 load_op,
873 constants_addr,
874 pds_upload_out);
875 }
876
static void pvr_pds_bgnd_pack_state(
878 const struct pvr_load_op *load_op,
879 const struct pvr_pds_upload *load_op_program,
880 uint64_t pds_reg_values[static const ROGUE_NUM_CR_PDS_BGRND_WORDS])
881 {
882 pvr_csb_pack (&pds_reg_values[0], CR_PDS_BGRND0_BASE, value) {
883 value.shader_addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
884 value.texunicode_addr =
885 PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
886 }
887
888 pvr_csb_pack (&pds_reg_values[1], CR_PDS_BGRND1_BASE, value) {
889 value.texturedata_addr = PVR_DEV_ADDR(load_op_program->data_offset);
890 }
891
892 pvr_csb_pack (&pds_reg_values[2], CR_PDS_BGRND3_SIZEINFO, value) {
893 value.usc_sharedsize =
894 DIV_ROUND_UP(load_op->const_shareds_count,
895 PVRX(CR_PDS_BGRND3_SIZEINFO_USC_SHAREDSIZE_UNIT_SIZE));
896 value.pds_texturestatesize = DIV_ROUND_UP(
897 load_op_program->data_size,
898 PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEXTURESTATESIZE_UNIT_SIZE));
899 value.pds_tempsize =
900 DIV_ROUND_UP(load_op->temps_count,
901 PVRX(CR_PDS_BGRND3_SIZEINFO_PDS_TEMPSIZE_UNIT_SIZE));
902 }
903 }
904
905 /**
906 * \brief Calculates the stride in pixels based on the pitch in bytes and pixel
907 * format.
908 *
909 * \param[in] pitch Width pitch in bytes.
910 * \param[in] vk_format Vulkan image format.
911 * \return Stride in pixels.
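 *
 * For example (hypothetical values): a 256-byte pitch with a 4-byte block
 * format such as VK_FORMAT_R8G8B8A8_UNORM gives a stride of 256 / 4 = 64
 * pixels.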
912 */
static inline uint32_t pvr_stride_from_pitch(uint32_t pitch, VkFormat vk_format)
914 {
915 const unsigned int cpp = vk_format_get_blocksize(vk_format);
916
917 assert(pitch % cpp == 0);
918
919 return pitch / cpp;
920 }
921
static void pvr_setup_pbe_state(
923 const struct pvr_device_info *dev_info,
924 const struct pvr_framebuffer *framebuffer,
925 uint32_t mrt_index,
926 const struct usc_mrt_resource *mrt_resource,
927 const struct pvr_image_view *const iview,
928 const VkRect2D *render_area,
929 const bool down_scale,
930 const uint32_t samples,
931 uint32_t pbe_cs_words[static const ROGUE_NUM_PBESTATE_STATE_WORDS],
932 uint64_t pbe_reg_words[static const ROGUE_NUM_PBESTATE_REG_WORDS])
933 {
934 const struct pvr_image *image = pvr_image_view_get_image(iview);
935 uint32_t level_pitch = image->mip_levels[iview->vk.base_mip_level].pitch;
936
937 struct pvr_pbe_surf_params surface_params;
938 struct pvr_pbe_render_params render_params;
939 bool with_packed_usc_channel;
940 const uint8_t *swizzle;
941 uint32_t position;
942
943 /* down_scale should be true when performing a resolve, in which case there
944 * should be more than one sample.
945 */
946 assert((down_scale && samples > 1U) || (!down_scale && samples == 1U));
947
948 /* Setup surface parameters. */
949
950 if (PVR_HAS_FEATURE(dev_info, usc_f16sop_u8)) {
951 with_packed_usc_channel = vk_format_is_unorm(iview->vk.format) ||
952 vk_format_is_snorm(iview->vk.format);
953 } else {
954 with_packed_usc_channel = false;
955 }
956
957 swizzle = pvr_get_format_swizzle(iview->vk.format);
958 memcpy(surface_params.swizzle, swizzle, sizeof(surface_params.swizzle));
959
960 pvr_pbe_get_src_format_and_gamma(iview->vk.format,
961 PVR_PBE_GAMMA_NONE,
962 with_packed_usc_channel,
963 &surface_params.source_format,
964 &surface_params.gamma);
965
966 surface_params.is_normalized = pvr_vk_format_is_fully_normalized(iview->vk.format);
967 surface_params.pbe_packmode = pvr_get_pbe_packmode(iview->vk.format);
968 surface_params.nr_components = vk_format_get_nr_components(iview->vk.format);
969
970 /* FIXME: Should we have an inline function to return the address of a mip
971 * level?
972 */
973 surface_params.addr =
974 PVR_DEV_ADDR_OFFSET(image->vma->dev_addr,
975 image->mip_levels[iview->vk.base_mip_level].offset);
976 surface_params.addr =
977 PVR_DEV_ADDR_OFFSET(surface_params.addr,
978 iview->vk.base_array_layer * image->layer_size);
979
980 surface_params.mem_layout = image->memlayout;
981 surface_params.stride = pvr_stride_from_pitch(level_pitch, iview->vk.format);
982 surface_params.depth = iview->vk.extent.depth;
983 surface_params.width = iview->vk.extent.width;
984 surface_params.height = iview->vk.extent.height;
985 surface_params.z_only_render = false;
986 surface_params.down_scale = down_scale;
987
988 /* Setup render parameters. */
989
990 if (mrt_resource->type == USC_MRT_RESOURCE_TYPE_MEMORY) {
991 position = mrt_resource->mem.offset_dw;
992 } else {
993 assert(mrt_resource->type == USC_MRT_RESOURCE_TYPE_OUTPUT_REG);
994 assert(mrt_resource->reg.offset == 0);
995
996 position = mrt_resource->reg.output_reg;
997 }
998
999 assert(position <= 3 || PVR_HAS_FEATURE(dev_info, eight_output_registers));
1000
1001 switch (position) {
1002 case 0:
1003 case 4:
1004 render_params.source_start = PVR_PBE_STARTPOS_BIT0;
1005 break;
1006 case 1:
1007 case 5:
1008 render_params.source_start = PVR_PBE_STARTPOS_BIT32;
1009 break;
1010 case 2:
1011 case 6:
1012 render_params.source_start = PVR_PBE_STARTPOS_BIT64;
1013 break;
1014 case 3:
1015 case 7:
1016 render_params.source_start = PVR_PBE_STARTPOS_BIT96;
1017 break;
1018 default:
1019 assert(!"Invalid output register");
1020 break;
1021 }
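   /* Each output register presumably holds 32 bits of pixel output, so
    * register index N maps to a source start of bit 32 * (N % 4); indices 4-7
    * are only reachable on cores with the eight_output_registers feature
    * (see the assert above).
    */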
1022
1023 #define PVR_DEC_IF_NOT_ZERO(_v) (((_v) > 0) ? (_v)-1 : 0)
1024
1025 render_params.min_x_clip = MAX2(0, render_area->offset.x);
1026 render_params.min_y_clip = MAX2(0, render_area->offset.y);
1027 render_params.max_x_clip = MIN2(
1028 framebuffer->width - 1,
1029 PVR_DEC_IF_NOT_ZERO(render_area->offset.x + render_area->extent.width));
1030 render_params.max_y_clip = MIN2(
1031 framebuffer->height - 1,
1032 PVR_DEC_IF_NOT_ZERO(render_area->offset.y + render_area->extent.height));
1033
1034 #undef PVR_DEC_IF_NOT_ZERO
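   /* The clip values above are inclusive pixel coordinates:
    * PVR_DEC_IF_NOT_ZERO turns the exclusive bound offset + extent into an
    * inclusive one while guarding against underflow when the sum is zero.
    * E.g. (hypothetical values) a 100x100 render area at offset (10, 20) in a
    * 128x128 framebuffer yields x in [10, 109] and y in [20, 119].
    */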
1035
1036 render_params.slice = 0;
1037 render_params.mrt_index = mrt_index;
1038
1039 pvr_pbe_pack_state(dev_info,
1040 &surface_params,
1041 &render_params,
1042 pbe_cs_words,
1043 pbe_reg_words);
1044 }
1045
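/* Framebuffers appear to keep one render target per supported sample count,
 * indexed by log2(sample_count): 1x -> 0, 2x -> 1, 4x -> 2 and 8x -> 3.
 */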
1046 static struct pvr_render_target *
pvr_get_render_target(const struct pvr_render_pass *pass,
1048 const struct pvr_framebuffer *framebuffer,
1049 uint32_t idx)
1050 {
1051 const struct pvr_renderpass_hwsetup_render *hw_render =
1052 &pass->hw_setup->renders[idx];
1053 uint32_t rt_idx = 0;
1054
1055 switch (hw_render->sample_count) {
1056 case 1:
1057 case 2:
1058 case 4:
1059 case 8:
1060 rt_idx = util_logbase2(hw_render->sample_count);
1061 break;
1062
1063 default:
1064 unreachable("Unsupported sample count");
1065 break;
1066 }
1067
1068 return &framebuffer->render_targets[rt_idx];
1069 }
1070
1071 static uint32_t
pvr_pass_get_pixel_output_width(const struct pvr_render_pass *pass,
1073 uint32_t idx,
1074 const struct pvr_device_info *dev_info)
1075 {
1076 const struct pvr_renderpass_hwsetup_render *hw_render =
1077 &pass->hw_setup->renders[idx];
1078 /* Default value based on the maximum value found in all existing cores. The
1079 * maximum is used as this is being treated as a lower bound, making it a
1080 * "safer" choice than the minimum value found in all existing cores.
1081 */
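   /* The returned width is the next power of two of
    * MAX2(output_regs_count, min_output_regs); e.g. (hypothetically) three
    * output registers on a core with a minimum of two rounds up to a width
    * of four.
    */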
1082 const uint32_t min_output_regs =
1083 PVR_GET_FEATURE_VALUE(dev_info, usc_min_output_registers_per_pix, 2U);
1084 const uint32_t width = MAX2(hw_render->output_regs_count, min_output_regs);
1085
1086 return util_next_power_of_two(width);
1087 }
1088
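/* ZLS (presumably the Z/stencil load-store unit) is needed whenever any depth
 * or stencil load or store has been requested for the attachment.
 */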
1089 static inline bool
pvr_ds_attachment_requires_zls(const struct pvr_ds_attachment *attachment)
1091 {
1092 bool zls_used;
1093
1094 zls_used = attachment->load.d || attachment->load.s;
1095 zls_used |= attachment->store.d || attachment->store.s;
1096
1097 return zls_used;
1098 }
1099
1100 /**
1101 * \brief If depth and/or stencil attachment dimensions are not tile-aligned,
1102 * then we may need to insert some additional transfer subcommands.
1103 *
1104 * It's worth noting that we check whether the dimensions are smaller than a
1105 * tile here, rather than checking whether they're tile-aligned - this relies
1106 * on the assumption that we can safely use any attachment with dimensions
1107 * larger than a tile. If the attachment is twiddled, it will be over-allocated
1108 * to the nearest power-of-two (which will be tile-aligned). If the attachment
1109 * is not twiddled, we don't need to worry about tile-alignment at all.
1110 */
static bool pvr_sub_cmd_gfx_requires_ds_subtile_alignment(
1112 const struct pvr_device_info *dev_info,
1113 const struct pvr_render_job *job)
1114 {
1115 const struct pvr_image *const ds_image =
1116 pvr_image_view_get_image(job->ds.iview);
1117 uint32_t zls_tile_size_x;
1118 uint32_t zls_tile_size_y;
1119
1120 rogue_get_zls_tile_size_xy(dev_info, &zls_tile_size_x, &zls_tile_size_y);
1121
1122 if (ds_image->physical_extent.width >= zls_tile_size_x &&
1123 ds_image->physical_extent.height >= zls_tile_size_y) {
1124 return false;
1125 }
1126
1127 /* If we have the zls_subtile feature, we can skip the alignment iff:
1128 * - The attachment is not multisampled, and
1129 * - The depth and stencil attachments are the same.
1130 */
1131 if (PVR_HAS_FEATURE(dev_info, zls_subtile) &&
1132 ds_image->vk.samples == VK_SAMPLE_COUNT_1_BIT &&
1133 job->has_stencil_attachment == job->has_depth_attachment) {
1134 return false;
1135 }
1136
1137 /* No ZLS functions enabled; nothing to do. */
1138 if ((!job->has_depth_attachment && !job->has_stencil_attachment) ||
1139 !pvr_ds_attachment_requires_zls(&job->ds)) {
1140 return false;
1141 }
1142
1143 return true;
1144 }
1145
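/* Works around sub-tile-sized depth/stencil attachments by allocating a
 * tile-aligned scratch buffer and bracketing the graphics sub-command with
 * transfer sub-commands: a copy from the image into the buffer when ZLS loads
 * are needed, and a copy back when ZLS stores are needed. The job is then
 * re-pointed at the scratch buffer.
 */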
1146 static VkResult
pvr_sub_cmd_gfx_align_ds_subtiles(struct pvr_cmd_buffer *const cmd_buffer,
1148 struct pvr_sub_cmd_gfx *const gfx_sub_cmd)
1149 {
1150 struct pvr_sub_cmd *const prev_sub_cmd =
1151 container_of(gfx_sub_cmd, struct pvr_sub_cmd, gfx);
1152 struct pvr_ds_attachment *const ds = &gfx_sub_cmd->job.ds;
1153 const struct pvr_image *const ds_image = pvr_image_view_get_image(ds->iview);
1154 const VkFormat copy_format = pvr_get_raw_copy_format(ds_image->vk.format);
1155
1156 struct pvr_suballoc_bo *buffer;
1157 uint32_t buffer_layer_size;
1158 VkBufferImageCopy2 region;
1159 VkExtent2D zls_tile_size;
1160 VkExtent2D rounded_size;
1161 uint32_t buffer_size;
1162 VkExtent2D scale;
1163 VkResult result;
1164
1165 /* The operations below assume the last command in the buffer was the target
1166 * gfx subcommand. Assert that this is the case.
1167 */
1168 assert(list_last_entry(&cmd_buffer->sub_cmds, struct pvr_sub_cmd, link) ==
1169 prev_sub_cmd);
1170
1171 if (!pvr_ds_attachment_requires_zls(ds))
1172 return VK_SUCCESS;
1173
1174 rogue_get_zls_tile_size_xy(&cmd_buffer->device->pdevice->dev_info,
1175 &zls_tile_size.width,
1176 &zls_tile_size.height);
1177 rogue_get_isp_scale_xy_from_samples(ds_image->vk.samples,
1178 &scale.width,
1179 &scale.height);
1180
1181 rounded_size = (VkExtent2D){
1182 .width = ALIGN_POT(ds_image->physical_extent.width, zls_tile_size.width),
1183 .height =
1184 ALIGN_POT(ds_image->physical_extent.height, zls_tile_size.height),
1185 };
1186
1187 buffer_layer_size = vk_format_get_blocksize(ds_image->vk.format) *
1188 rounded_size.width * rounded_size.height * scale.width *
1189 scale.height;
1190
1191 if (ds->iview->vk.layer_count > 1)
1192 buffer_layer_size = ALIGN_POT(buffer_layer_size, ds_image->alignment);
1193
1194 buffer_size = buffer_layer_size * ds->iview->vk.layer_count;
1195
1196 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
1197 cmd_buffer->device->heaps.general_heap,
1198 buffer_size,
1199 &buffer);
1200 if (result != VK_SUCCESS)
1201 return result;
1202
1203 region = (VkBufferImageCopy2){
1204 .sType = VK_STRUCTURE_TYPE_BUFFER_IMAGE_COPY_2,
1205 .pNext = NULL,
1206 .bufferOffset = 0,
1207 .bufferRowLength = rounded_size.width,
1208 .bufferImageHeight = 0,
1209 .imageSubresource = {
1210 .aspectMask = VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT,
1211 .mipLevel = ds->iview->vk.base_mip_level,
1212 .baseArrayLayer = ds->iview->vk.base_array_layer,
1213 .layerCount = ds->iview->vk.layer_count,
1214 },
1215 .imageOffset = { 0 },
1216 .imageExtent = {
1217 .width = ds->iview->vk.extent.width,
1218 .height = ds->iview->vk.extent.height,
1219 .depth = 1,
1220 },
1221 };
1222
1223 if (ds->load.d || ds->load.s) {
1224 cmd_buffer->state.current_sub_cmd = NULL;
1225
1226 result =
1227 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1228 if (result != VK_SUCCESS)
1229 return result;
1230
1231 result = pvr_copy_image_to_buffer_region_format(cmd_buffer,
1232 ds_image,
1233 buffer->dev_addr,
                                                      &region,
1235 copy_format,
1236 copy_format);
1237 if (result != VK_SUCCESS)
1238 return result;
1239
1240 cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1241
1242 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1243 if (result != VK_SUCCESS)
1244 return result;
1245
1246 /* Now we have to fiddle with cmd_buffer to place this transfer command
1247 * *before* the target gfx subcommand.
1248 */
1249 list_move_to(&cmd_buffer->state.current_sub_cmd->link,
1250 &prev_sub_cmd->link);
1251
1252 cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1253 }
1254
1255 if (ds->store.d || ds->store.s) {
1256 cmd_buffer->state.current_sub_cmd = NULL;
1257
1258 result =
1259 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
1260 if (result != VK_SUCCESS)
1261 return result;
1262
1263 result = pvr_copy_buffer_to_image_region_format(cmd_buffer,
1264 buffer->dev_addr,
1265 ds_image,
                                                      &region,
1267 copy_format,
1268 copy_format,
1269 0);
1270 if (result != VK_SUCCESS)
1271 return result;
1272
1273 cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
1274
1275 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
1276 if (result != VK_SUCCESS)
1277 return result;
1278
1279 cmd_buffer->state.current_sub_cmd = prev_sub_cmd;
1280 }
1281
1282 /* Finally, patch up the target graphics sub_cmd to use the correctly-strided
1283 * buffer.
1284 */
1285 ds->has_alignment_transfers = true;
1286 ds->addr = buffer->dev_addr;
1287 ds->physical_extent = rounded_size;
1288
1289 gfx_sub_cmd->wait_on_previous_transfer = true;
1290
1291 return VK_SUCCESS;
1292 }
1293
1294 struct pvr_emit_state {
1295 uint32_t pbe_cs_words[PVR_MAX_COLOR_ATTACHMENTS]
1296 [ROGUE_NUM_PBESTATE_STATE_WORDS];
1297
1298 uint64_t pbe_reg_words[PVR_MAX_COLOR_ATTACHMENTS]
1299 [ROGUE_NUM_PBESTATE_REG_WORDS];
1300
1301 uint32_t emit_count;
1302 };
1303
1304 static void
pvr_setup_emit_state(const struct pvr_device_info *dev_info,
1306 const struct pvr_renderpass_hwsetup_render *hw_render,
1307 struct pvr_render_pass_info *render_pass_info,
1308 struct pvr_emit_state *emit_state)
1309 {
1310 assert(hw_render->pbe_emits <= PVR_NUM_PBE_EMIT_REGS);
1311
1312 if (hw_render->eot_surface_count == 0) {
1313 emit_state->emit_count = 1;
1314 pvr_csb_pack (&emit_state->pbe_cs_words[0][1],
1315 PBESTATE_STATE_WORD1,
1316 state) {
1317 state.emptytile = true;
1318 }
1319 return;
1320 }
1321
1322 static_assert(USC_MRT_RESOURCE_TYPE_OUTPUT_REG + 1 ==
1323 USC_MRT_RESOURCE_TYPE_MEMORY,
1324 "The loop below needs adjusting.");
1325
1326 emit_state->emit_count = 0;
1327 for (uint32_t resource_type = USC_MRT_RESOURCE_TYPE_OUTPUT_REG;
1328 resource_type <= USC_MRT_RESOURCE_TYPE_MEMORY;
1329 resource_type++) {
1330 for (uint32_t i = 0; i < hw_render->eot_surface_count; i++) {
1331 const struct pvr_framebuffer *framebuffer =
1332 render_pass_info->framebuffer;
1333 const struct pvr_renderpass_hwsetup_eot_surface *surface =
1334 &hw_render->eot_surfaces[i];
1335 const struct pvr_image_view *iview =
1336 render_pass_info->attachments[surface->attachment_idx];
1337 const struct usc_mrt_resource *mrt_resource =
1338 &hw_render->eot_setup.mrt_resources[surface->mrt_idx];
1339 uint32_t samples = 1;
1340
1341 if (mrt_resource->type != resource_type)
1342 continue;
1343
1344 if (surface->need_resolve) {
1345 const struct pvr_image_view *resolve_src =
1346 render_pass_info->attachments[surface->src_attachment_idx];
1347
1348 /* Attachments that are the destination of resolve operations must
1349 * be loaded before their next use.
1350 */
1351 render_pass_info->enable_bg_tag = true;
1352 render_pass_info->process_empty_tiles = true;
1353
1354 if (surface->resolve_type != PVR_RESOLVE_TYPE_PBE)
1355 continue;
1356
1357 samples = (uint32_t)resolve_src->vk.image->samples;
1358 }
1359
1360 assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_cs_words));
1361 assert(emit_state->emit_count < ARRAY_SIZE(emit_state->pbe_reg_words));
1362
1363 pvr_setup_pbe_state(dev_info,
1364 framebuffer,
1365 emit_state->emit_count,
1366 mrt_resource,
1367 iview,
1368 &render_pass_info->render_area,
1369 surface->need_resolve,
1370 samples,
1371 emit_state->pbe_cs_words[emit_state->emit_count],
1372 emit_state->pbe_reg_words[emit_state->emit_count]);
1373 emit_state->emit_count += 1;
1374 }
1375 }
1376
1377 assert(emit_state->emit_count == hw_render->pbe_emits);
1378 }
1379
1380 static inline bool
pvr_is_render_area_tile_aligned(const struct pvr_cmd_buffer *cmd_buffer,
1382 const struct pvr_image_view *iview)
1383 {
1384 const VkRect2D *render_area =
1385 &cmd_buffer->state.render_pass_info.render_area;
1386
1387 return render_area->offset.x == 0 && render_area->offset.y == 0 &&
1388 render_area->extent.height == iview->vk.extent.height &&
1389 render_area->extent.width == iview->vk.extent.width;
1390 }
1391
static VkResult pvr_sub_cmd_gfx_job_init(const struct pvr_device_info *dev_info,
1393 struct pvr_cmd_buffer *cmd_buffer,
1394 struct pvr_sub_cmd_gfx *sub_cmd)
1395 {
1396 static const VkClearDepthStencilValue default_ds_clear_value = {
1397 .depth = 1.0f,
1398 .stencil = 0xFFFFFFFF,
1399 };
1400
1401 const struct vk_dynamic_graphics_state *dynamic_state =
1402 &cmd_buffer->vk.dynamic_graphics_state;
1403 struct pvr_render_pass_info *render_pass_info =
1404 &cmd_buffer->state.render_pass_info;
1405 const struct pvr_renderpass_hwsetup_render *hw_render =
1406 &render_pass_info->pass->hw_setup->renders[sub_cmd->hw_render_idx];
1407 struct pvr_render_job *job = &sub_cmd->job;
1408 struct pvr_pds_upload pds_pixel_event_program;
1409 struct pvr_framebuffer *framebuffer = render_pass_info->framebuffer;
1410 struct pvr_spm_bgobj_state *spm_bgobj_state =
1411 &framebuffer->spm_bgobj_state_per_render[sub_cmd->hw_render_idx];
1412 struct pvr_render_target *render_target;
1413 VkResult result;
1414
1415 if (sub_cmd->barrier_store) {
1416 /* There can only ever be one frag job running on the hardware at any one
1417 * time, and a context switch is not allowed mid-tile, so instead of
1418 * allocating a new scratch buffer we can reuse the SPM scratch buffer to
1419 * perform the store.
1420 * So use the SPM EOT program with the SPM PBE reg words in order to store
1421 * the render to the SPM scratch buffer.
1422 */
1423
1424 memcpy(job->pbe_reg_words,
1425 &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1426 sizeof(job->pbe_reg_words));
1427 job->pds_pixel_event_data_offset =
1428 framebuffer->spm_eot_state_per_render[0]
1429 .pixel_event_program_data_offset;
1430 } else {
1431 struct pvr_emit_state emit_state = { 0 };
1432
1433 pvr_setup_emit_state(dev_info, hw_render, render_pass_info, &emit_state);
1434
1435 memcpy(job->pbe_reg_words,
1436 emit_state.pbe_reg_words,
1437 sizeof(job->pbe_reg_words));
1438
1439 result = pvr_sub_cmd_gfx_per_job_fragment_programs_create_and_upload(
1440 cmd_buffer,
1441 emit_state.emit_count,
1442 emit_state.pbe_cs_words[0],
1443 &pds_pixel_event_program);
1444 if (result != VK_SUCCESS)
1445 return result;
1446
1447 job->pds_pixel_event_data_offset = pds_pixel_event_program.data_offset;
1448 }
1449
1450 if (sub_cmd->barrier_load) {
1451 job->enable_bg_tag = true;
1452 job->process_empty_tiles = true;
1453
1454 /* Load the previously stored render from the SPM scratch buffer. */
1455
1456 STATIC_ASSERT(ARRAY_SIZE(job->pds_bgnd_reg_values) ==
1457 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1458 typed_memcpy(job->pds_bgnd_reg_values,
1459 spm_bgobj_state->pds_reg_values,
1460 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1461 } else if (hw_render->load_op) {
1462 const struct pvr_load_op *load_op = hw_render->load_op;
1463 struct pvr_pds_upload load_op_program;
1464
1465 /* Recalculate Background Object(s). */
1466
1467 /* FIXME: Should we free the PDS pixel event data or let it be freed
1468 * when the pool gets emptied?
1469 */
1470 result = pvr_load_op_data_create_and_upload(cmd_buffer,
1471 load_op,
1472 &load_op_program);
1473 if (result != VK_SUCCESS)
1474 return result;
1475
1476 job->enable_bg_tag = render_pass_info->enable_bg_tag;
1477 job->process_empty_tiles = render_pass_info->process_empty_tiles;
1478
1479 pvr_pds_bgnd_pack_state(load_op,
1480 &load_op_program,
1481 job->pds_bgnd_reg_values);
1482 }
1483
   /* TODO: In some cases a PR can be removed by storing to the color
    * attachment and having the background object load directly from it instead
    * of using the scratch buffer. In those cases we can also set this to
    * "false" and avoid extra fw overhead.
    */
1489 /* The scratch buffer is always needed and allocated to avoid data loss in
1490 * case SPM is hit so set the flag unconditionally.
1491 */
1492 job->requires_spm_scratch_buffer = true;
1493
1494 memcpy(job->pr_pbe_reg_words,
1495 &framebuffer->spm_eot_state_per_render[0].pbe_reg_words,
1496 sizeof(job->pbe_reg_words));
1497 job->pr_pds_pixel_event_data_offset =
1498 framebuffer->spm_eot_state_per_render[0].pixel_event_program_data_offset;
1499
1500 STATIC_ASSERT(ARRAY_SIZE(job->pds_pr_bgnd_reg_values) ==
1501 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1502 typed_memcpy(job->pds_pr_bgnd_reg_values,
1503 spm_bgobj_state->pds_reg_values,
1504 ARRAY_SIZE(spm_bgobj_state->pds_reg_values));
1505
1506 render_target = pvr_get_render_target(render_pass_info->pass,
1507 framebuffer,
1508 sub_cmd->hw_render_idx);
1509 job->rt_dataset = render_target->rt_dataset;
1510
1511 job->ctrl_stream_addr = pvr_csb_get_start_address(&sub_cmd->control_stream);
1512
1513 if (sub_cmd->depth_bias_bo)
1514 job->depth_bias_table_addr = sub_cmd->depth_bias_bo->dev_addr;
1515 else
1516 job->depth_bias_table_addr = PVR_DEV_ADDR_INVALID;
1517
1518 if (sub_cmd->scissor_bo)
1519 job->scissor_table_addr = sub_cmd->scissor_bo->dev_addr;
1520 else
1521 job->scissor_table_addr = PVR_DEV_ADDR_INVALID;
1522
1523 job->pixel_output_width =
1524 pvr_pass_get_pixel_output_width(render_pass_info->pass,
1525 sub_cmd->hw_render_idx,
1526 dev_info);
1527
1528 /* Setup depth/stencil job information. */
1529 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1530 struct pvr_image_view *ds_iview =
1531 render_pass_info->attachments[hw_render->ds_attach_idx];
1532 const struct pvr_image *ds_image = pvr_image_view_get_image(ds_iview);
1533
1534 job->has_depth_attachment = vk_format_has_depth(ds_image->vk.format);
1535 job->has_stencil_attachment = vk_format_has_stencil(ds_image->vk.format);
1536
1537 if (job->has_depth_attachment || job->has_stencil_attachment) {
1538 uint32_t level_pitch =
1539 ds_image->mip_levels[ds_iview->vk.base_mip_level].pitch;
1540 const bool render_area_is_tile_aligned =
1541 pvr_is_render_area_tile_aligned(cmd_buffer, ds_iview);
1542 bool store_was_optimised_out = false;
1543 bool d_store = false, s_store = false;
1544 bool d_load = false, s_load = false;
1545
1546 job->ds.iview = ds_iview;
1547 job->ds.addr = ds_image->dev_addr;
1548
1549 job->ds.stride =
1550 pvr_stride_from_pitch(level_pitch, ds_iview->vk.format);
1551 job->ds.height = ds_iview->vk.extent.height;
1552 job->ds.physical_extent = (VkExtent2D){
1553 .width = u_minify(ds_image->physical_extent.width,
1554 ds_iview->vk.base_mip_level),
1555 .height = u_minify(ds_image->physical_extent.height,
1556 ds_iview->vk.base_mip_level),
1557 };
1558 job->ds.layer_size = ds_image->layer_size;
1559
1560 job->ds_clear_value = default_ds_clear_value;
1561
1562 if (hw_render->ds_attach_idx < render_pass_info->clear_value_count) {
1563 const VkClearDepthStencilValue *const clear_values =
1564 &render_pass_info->clear_values[hw_render->ds_attach_idx]
1565 .depthStencil;
1566
1567 if (job->has_depth_attachment)
1568 job->ds_clear_value.depth = clear_values->depth;
1569
1570 if (job->has_stencil_attachment)
1571 job->ds_clear_value.stencil = clear_values->stencil;
1572 }
1573
1574 switch (ds_iview->vk.format) {
1575 case VK_FORMAT_D16_UNORM:
1576 job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_16BITINT);
1577 break;
1578
1579 case VK_FORMAT_S8_UINT:
1580 case VK_FORMAT_D32_SFLOAT:
1581 job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_F32Z);
1582 break;
1583
1584 case VK_FORMAT_D24_UNORM_S8_UINT:
1585 job->ds.zls_format = PVRX(CR_ZLS_FORMAT_TYPE_24BITINT);
1586 break;
1587
1588 default:
1589 unreachable("Unsupported depth stencil format");
1590 }
1591
1592 job->ds.memlayout = ds_image->memlayout;
1593
1594 if (job->has_depth_attachment) {
1595 if (hw_render->depth_store || sub_cmd->barrier_store) {
1596 const bool depth_init_is_clear = hw_render->depth_init ==
1597 VK_ATTACHMENT_LOAD_OP_CLEAR;
1598
1599 d_store = true;
1600
1601 if (hw_render->depth_store && render_area_is_tile_aligned &&
1602 !(sub_cmd->modifies_depth || depth_init_is_clear)) {
1603 d_store = false;
1604 store_was_optimised_out = true;
1605 }
1606 }
1607
1608 if (d_store && !render_area_is_tile_aligned) {
1609 d_load = true;
1610 } else if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1611 enum pvr_depth_stencil_usage depth_usage = sub_cmd->depth_usage;
1612
1613 assert(depth_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1614 d_load = (depth_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1615 } else {
1616 d_load = sub_cmd->barrier_load;
1617 }
1618 }
1619
1620 if (job->has_stencil_attachment) {
1621 if (hw_render->stencil_store || sub_cmd->barrier_store) {
1622 const bool stencil_init_is_clear = hw_render->stencil_init ==
1623 VK_ATTACHMENT_LOAD_OP_CLEAR;
1624
1625 s_store = true;
1626
1627 if (hw_render->stencil_store && render_area_is_tile_aligned &&
1628 !(sub_cmd->modifies_stencil || stencil_init_is_clear)) {
1629 s_store = false;
1630 store_was_optimised_out = true;
1631 }
1632 }
1633
1634 if (s_store && !render_area_is_tile_aligned) {
1635 s_load = true;
1636 } else if (hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_LOAD) {
1637 enum pvr_depth_stencil_usage stencil_usage =
1638 sub_cmd->stencil_usage;
1639
1640 assert(stencil_usage != PVR_DEPTH_STENCIL_USAGE_UNDEFINED);
1641 s_load = (stencil_usage != PVR_DEPTH_STENCIL_USAGE_NEVER);
1642 } else {
1643 s_load = sub_cmd->barrier_load;
1644 }
1645 }
1646
1647 job->ds.load.d = d_load;
1648 job->ds.load.s = s_load;
1649 job->ds.store.d = d_store;
1650 job->ds.store.s = s_store;
1651
1652 /* ZLS can't do masked writes for packed depth stencil formats so if
1653 * we store anything, we have to store everything.
1654 */
1655 if ((job->ds.store.d || job->ds.store.s) &&
1656 pvr_zls_format_type_is_packed(job->ds.zls_format)) {
1657 job->ds.store.d = true;
1658 job->ds.store.s = true;
1659
1660 /* In case we are only operating on one aspect of the attachment, we
1661 * need to load the unused one in order to preserve its contents, since
1662 * the forced store might otherwise corrupt it.
1663 */
1664 if (hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1665 job->ds.load.d = true;
1666
1667 if (hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR)
1668 job->ds.load.s = true;
1669 }
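/* Illustrative example (assuming D24_UNORM_S8_UINT maps to a packed ZLS
 * format here): if only a stencil store was requested, the depth store is
 * forced on above, and unless depth_init is a clear the depth aspect is
 * also loaded so that the forced store writes back the original depth
 * values rather than garbage.
 */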
1670
1671 if (pvr_ds_attachment_requires_zls(&job->ds) ||
1672 store_was_optimised_out) {
1673 job->process_empty_tiles = true;
1674 }
1675
1676 if (pvr_sub_cmd_gfx_requires_ds_subtile_alignment(dev_info, job)) {
1677 result = pvr_sub_cmd_gfx_align_ds_subtiles(cmd_buffer, sub_cmd);
1678 if (result != VK_SUCCESS)
1679 return result;
1680 }
1681 }
1682 } else {
1683 job->has_depth_attachment = false;
1684 job->has_stencil_attachment = false;
1685 job->ds_clear_value = default_ds_clear_value;
1686 }
1687
1688 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
1689 struct pvr_image_view *iview =
1690 render_pass_info->attachments[hw_render->ds_attach_idx];
1691 const struct pvr_image *image = pvr_image_view_get_image(iview);
1692
1693 /* If the HW render pass has a valid depth/stencil surface, determine the
1694 * sample count from the attachment's image.
1695 */
1696 job->samples = image->vk.samples;
1697 } else if (hw_render->output_regs_count) {
1698 /* If the HW render pass has output registers, we have color attachments
1699 * to write to, so determine the sample count from the count specified for
1700 * every color attachment in this render.
1701 */
1702 job->samples = hw_render->sample_count;
1703 } else if (cmd_buffer->state.gfx_pipeline) {
1704 /* If the HW render pass has no color or depth/stencil attachments, we
1705 * determine the sample count from the count given during pipeline
1706 * creation.
1707 */
1708 job->samples = dynamic_state->ms.rasterization_samples;
1709 } else if (render_pass_info->pass->attachment_count > 0) {
1710 /* If we get here, we have a render pass with subpasses containing no
1711 * attachments. The next best thing is the largest of the sample counts
1712 * specified by the render pass attachment descriptions.
1713 */
1714 job->samples = render_pass_info->pass->max_sample_count;
1715 } else {
1716 /* No appropriate framebuffer attachment is available. */
1717 mesa_logw("Defaulting render job sample count to 1.");
1718 job->samples = VK_SAMPLE_COUNT_1_BIT;
1719 }
1720
1721 if (sub_cmd->max_tiles_in_flight ==
1722 PVR_GET_FEATURE_VALUE(dev_info, isp_max_tiles_in_flight, 1U)) {
1723 /* Use the default limit based on the partition store. */
1724 job->max_tiles_in_flight = 0U;
1725 } else {
1726 job->max_tiles_in_flight = sub_cmd->max_tiles_in_flight;
1727 }
1728
1729 job->frag_uses_atomic_ops = sub_cmd->frag_uses_atomic_ops;
1730 job->disable_compute_overlap = false;
1731 job->max_shared_registers = cmd_buffer->state.max_shared_regs;
1732 job->run_frag = true;
1733 job->geometry_terminate = true;
1734
1735 return VK_SUCCESS;
1736 }
1737
1738 static void
1739 pvr_sub_cmd_compute_job_init(const struct pvr_physical_device *pdevice,
1740 struct pvr_cmd_buffer *cmd_buffer,
1741 struct pvr_sub_cmd_compute *sub_cmd)
1742 {
1743 sub_cmd->num_shared_regs = MAX2(cmd_buffer->device->idfwdf_state.usc_shareds,
1744 cmd_buffer->state.max_shared_regs);
1745
1746 cmd_buffer->state.max_shared_regs = 0U;
1747 }
1748
1749 #define PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS \
1750 (1024 / PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE))
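/* For illustration only: assuming a USC common size unit of 64 bytes, this
 * evaluates to 16 blocks. The real value comes from the CDMCTRL hardware
 * definition used above.
 */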
1751
1752 static uint32_t
1753 pvr_compute_flat_slot_size(const struct pvr_physical_device *pdevice,
1754 uint32_t coeff_regs_count,
1755 bool use_barrier,
1756 uint32_t total_workitems)
1757 {
1758 const struct pvr_device_runtime_info *dev_runtime_info =
1759 &pdevice->dev_runtime_info;
1760 const struct pvr_device_info *dev_info = &pdevice->dev_info;
1761 uint32_t max_workgroups_per_task = ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK;
1762 uint32_t max_avail_coeff_regs =
1763 dev_runtime_info->cdm_max_local_mem_size_regs;
1764 uint32_t localstore_chunks_count =
1765 DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs_count),
1766 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1767
1768 /* Ensure that we cannot have more work-groups in a slot than the available
1769 * number of coefficient registers allows us to have.
1770 */
1771 if (coeff_regs_count > 0U) {
1772 /* If the geometry or fragment jobs can overlap with the compute job, or
1773 * if there is a vertex shader already running then we need to consider
1774 * this in calculating max allowed work-groups.
1775 */
1776 if (PVR_HAS_QUIRK(dev_info, 52354) &&
1777 (PVR_HAS_FEATURE(dev_info, compute_overlap) ||
1778 PVR_HAS_FEATURE(dev_info, gs_rta_support))) {
1779 /* Solve for n (number of work-groups per task). All values are in
1780 * size of common store alloc blocks:
1781 *
1782 * n + (2n + 7) * (local_memory_size_max - 1) =
1783 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1784 * ==>
1785 * n + 2n * (local_memory_size_max - 1) =
1786 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1787 * - (7 * (local_memory_size_max - 1))
1788 * ==>
1789 * n * (1 + 2 * (local_memory_size_max - 1)) =
1790 * (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1791 * - (7 * (local_memory_size_max - 1))
1792 * ==>
1793 * n = ((coefficient_memory_pool_size) -
1794 * (7 * pixel_allocation_size_max) -
1795 * (7 * (local_memory_size_max - 1))) /
1796 * (1 + 2 * (local_memory_size_max - 1))
1797 */
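/* Worked example with purely illustrative numbers: for a pool of 1024
 * common store blocks, pixel_allocation_size_max = 16 blocks and
 * local_memory_size_max (localstore_chunks_count) = 9 blocks:
 *
 * n = (1024 - 7 * 16 - 7 * (9 - 1)) / (1 + 2 * (9 - 1))
 * = (1024 - 112 - 56) / 17
 * = 50 work-groups per task (before the clamp applied below).
 */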
1798 uint32_t max_common_store_blocks =
1799 DIV_ROUND_UP(max_avail_coeff_regs * 4U,
1800 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
1801
1802 /* (coefficient_memory_pool_size) - (7 * pixel_allocation_size_max)
1803 */
1804 max_common_store_blocks -= ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1805 PIXEL_ALLOCATION_SIZE_MAX_IN_BLOCKS;
1806
1807 /* - (7 * (local_memory_size_max - 1)) */
1808 max_common_store_blocks -= (ROGUE_MAX_OVERLAPPED_PIXEL_TASK_INSTANCES *
1809 (localstore_chunks_count - 1U));
1810
1811 /* Divide by (1 + 2 * (local_memory_size_max - 1)) */
1812 max_workgroups_per_task = max_common_store_blocks /
1813 (1U + 2U * (localstore_chunks_count - 1U));
1814
1815 max_workgroups_per_task =
1816 MIN2(max_workgroups_per_task,
1817 ROGUE_CDM_MAX_PACKED_WORKGROUPS_PER_TASK);
1818
1819 } else {
1820 max_workgroups_per_task =
1821 MIN2((max_avail_coeff_regs / coeff_regs_count),
1822 max_workgroups_per_task);
1823 }
1824 }
1825
1826 /* max_workgroups_per_task should at least be one. */
1827 assert(max_workgroups_per_task >= 1U);
1828
1829 if (total_workitems >= ROGUE_MAX_INSTANCES_PER_TASK) {
1830 /* In this case, the work-group size will have been padded up to the
1831 * next multiple of ROGUE_MAX_INSTANCES_PER_TASK, so we just set the max
1832 * instances to ROGUE_MAX_INSTANCES_PER_TASK.
1833 */
1834 return ROGUE_MAX_INSTANCES_PER_TASK;
1835 }
1836
1837 /* In this case, the number of instances in the slot must be clamped to
1838 * accommodate whole work-groups only.
1839 */
1840 if (PVR_HAS_QUIRK(dev_info, 49032) || use_barrier) {
1841 max_workgroups_per_task =
1842 MIN2(max_workgroups_per_task,
1843 ROGUE_MAX_INSTANCES_PER_TASK / total_workitems);
1844 return total_workitems * max_workgroups_per_task;
1845 }
1846
1847 return MIN2(total_workitems * max_workgroups_per_task,
1848 ROGUE_MAX_INSTANCES_PER_TASK);
1849 }
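/* Example of the clamping behaviour of pvr_compute_flat_slot_size() above,
 * with assumed values: if ROGUE_MAX_INSTANCES_PER_TASK were 32 and a barrier
 * is used with total_workitems = 12 and max_workgroups_per_task = 8, the
 * work-group count is clamped to 32 / 12 = 2, giving a slot size of
 * 12 * 2 = 24 instances, so no work-group ever straddles a slot.
 */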
1850
1851 static void
1852 pvr_compute_generate_control_stream(struct pvr_csb *csb,
1853 struct pvr_sub_cmd_compute *sub_cmd,
1854 const struct pvr_compute_kernel_info *info)
1855 {
1856 pvr_csb_set_relocation_mark(csb);
1857
1858 /* Compute kernel 0. */
1859 pvr_csb_emit (csb, CDMCTRL_KERNEL0, kernel0) {
1860 kernel0.indirect_present = !!info->indirect_buffer_addr.addr;
1861 kernel0.global_offsets_present = info->global_offsets_present;
1862 kernel0.usc_common_size = info->usc_common_size;
1863 kernel0.usc_unified_size = info->usc_unified_size;
1864 kernel0.pds_temp_size = info->pds_temp_size;
1865 kernel0.pds_data_size = info->pds_data_size;
1866 kernel0.usc_target = info->usc_target;
1867 kernel0.fence = info->is_fence;
1868 }
1869
1870 /* Compute kernel 1. */
1871 pvr_csb_emit (csb, CDMCTRL_KERNEL1, kernel1) {
1872 kernel1.data_addr = PVR_DEV_ADDR(info->pds_data_offset);
1873 kernel1.sd_type = info->sd_type;
1874 kernel1.usc_common_shared = info->usc_common_shared;
1875 }
1876
1877 /* Compute kernel 2. */
1878 pvr_csb_emit (csb, CDMCTRL_KERNEL2, kernel2) {
1879 kernel2.code_addr = PVR_DEV_ADDR(info->pds_code_offset);
1880 }
1881
1882 if (info->indirect_buffer_addr.addr) {
1883 /* Compute kernel 6. */
1884 pvr_csb_emit (csb, CDMCTRL_KERNEL6, kernel6) {
1885 kernel6.indirect_addrmsb = info->indirect_buffer_addr;
1886 }
1887
1888 /* Compute kernel 7. */
1889 pvr_csb_emit (csb, CDMCTRL_KERNEL7, kernel7) {
1890 kernel7.indirect_addrlsb = info->indirect_buffer_addr;
1891 }
1892 } else {
1893 /* Compute kernel 3. */
1894 pvr_csb_emit (csb, CDMCTRL_KERNEL3, kernel3) {
1895 assert(info->global_size[0U] > 0U);
1896 kernel3.workgroup_x = info->global_size[0U] - 1U;
1897 }
1898
1899 /* Compute kernel 4. */
1900 pvr_csb_emit (csb, CDMCTRL_KERNEL4, kernel4) {
1901 assert(info->global_size[1U] > 0U);
1902 kernel4.workgroup_y = info->global_size[1U] - 1U;
1903 }
1904
1905 /* Compute kernel 5. */
1906 pvr_csb_emit (csb, CDMCTRL_KERNEL5, kernel5) {
1907 assert(info->global_size[2U] > 0U);
1908 kernel5.workgroup_z = info->global_size[2U] - 1U;
1909 }
1910 }
1911
1912 /* Compute kernel 8. */
1913 pvr_csb_emit (csb, CDMCTRL_KERNEL8, kernel8) {
1914 if (info->max_instances == ROGUE_MAX_INSTANCES_PER_TASK)
1915 kernel8.max_instances = 0U;
1916 else
1917 kernel8.max_instances = info->max_instances;
1918
1919 assert(info->local_size[0U] > 0U);
1920 kernel8.workgroup_size_x = info->local_size[0U] - 1U;
1921 assert(info->local_size[1U] > 0U);
1922 kernel8.workgroup_size_y = info->local_size[1U] - 1U;
1923 assert(info->local_size[2U] > 0U);
1924 kernel8.workgroup_size_z = info->local_size[2U] - 1U;
1925 }
1926
1927 pvr_csb_clear_relocation_mark(csb);
1928
1929 /* Track the highest amount of shared registers usage in this dispatch.
1930 * This is used by the FW for context switching, so must be large enough
1931 * to contain all the shared registers that might be in use for this compute
1932 * job. Coefficients don't need to be included as the context switch will not
1933 * happen within the execution of a single workgroup, thus nothing needs to
1934 * be preserved.
1935 */
1936 if (info->usc_common_shared) {
1937 sub_cmd->num_shared_regs =
1938 MAX2(sub_cmd->num_shared_regs, info->usc_common_size);
1939 }
1940 }
1941
1942 /* TODO: This can be pre-packed and uploaded directly. Would that provide any
1943 * speed up?
1944 */
1945 static void
1946 pvr_compute_generate_idfwdf(struct pvr_cmd_buffer *cmd_buffer,
1947 struct pvr_sub_cmd_compute *const sub_cmd)
1948 {
1949 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
1950 bool *const is_sw_barrier_required =
1951 &state->current_sub_cmd->compute.pds_sw_barrier_requires_clearing;
1952 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
1953 struct pvr_csb *csb = &sub_cmd->control_stream;
1954 const struct pvr_pds_upload *program;
1955
1956 if (PVR_NEED_SW_COMPUTE_PDS_BARRIER(&pdevice->dev_info) &&
1957 *is_sw_barrier_required) {
1958 *is_sw_barrier_required = false;
1959 program = &cmd_buffer->device->idfwdf_state.sw_compute_barrier_pds;
1960 } else {
1961 program = &cmd_buffer->device->idfwdf_state.pds;
1962 }
1963
1964 struct pvr_compute_kernel_info info = {
1965 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
1966 .global_offsets_present = false,
1967 .usc_common_size = DIV_ROUND_UP(
1968 PVR_DW_TO_BYTES(cmd_buffer->device->idfwdf_state.usc_shareds),
1969 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
1970 .usc_unified_size = 0U,
1971 .pds_temp_size = 0U,
1972 .pds_data_size =
1973 DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
1974 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
1975 .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
1976 .is_fence = false,
1977 .pds_data_offset = program->data_offset,
1978 .sd_type = PVRX(CDMCTRL_SD_TYPE_USC),
1979 .usc_common_shared = true,
1980 .pds_code_offset = program->code_offset,
1981 .global_size = { 1U, 1U, 1U },
1982 .local_size = { 1U, 1U, 1U },
1983 };
1984
1985 /* We don't need to pad work-group size for this case. */
1986
1987 info.max_instances =
1988 pvr_compute_flat_slot_size(pdevice,
1989 cmd_buffer->device->idfwdf_state.usc_shareds,
1990 false,
1991 1U);
1992
1993 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
1994 }
1995
1996 void pvr_compute_generate_fence(struct pvr_cmd_buffer *cmd_buffer,
1997 struct pvr_sub_cmd_compute *const sub_cmd,
1998 bool deallocate_shareds)
1999 {
2000 const struct pvr_pds_upload *program =
2001 &cmd_buffer->device->pds_compute_fence_program;
2002 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
2003 struct pvr_csb *csb = &sub_cmd->control_stream;
2004
2005 struct pvr_compute_kernel_info info = {
2006 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
2007 .global_offsets_present = false,
2008 .usc_common_size = 0U,
2009 .usc_unified_size = 0U,
2010 .pds_temp_size = 0U,
2011 .pds_data_size =
2012 DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
2013 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
2014 .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
2015 .is_fence = true,
2016 .pds_data_offset = program->data_offset,
2017 .sd_type = PVRX(CDMCTRL_SD_TYPE_PDS),
2018 .usc_common_shared = deallocate_shareds,
2019 .pds_code_offset = program->code_offset,
2020 .global_size = { 1U, 1U, 1U },
2021 .local_size = { 1U, 1U, 1U },
2022 };
2023
2024 /* We don't need to pad work-group size for this case. */
2025 /* Here we calculate the slot size. This can depend on the use of barriers,
2026 * local memory, BRNs or other factors.
2027 */
2028 info.max_instances = pvr_compute_flat_slot_size(pdevice, 0U, false, 1U);
2029
2030 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
2031 }
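/* Usage note: pvr_cmd_buffer_end_sub_cmd() below emits this fence (with
 * shared register deallocation) before terminating the compute control
 * stream, see the PVR_SUB_CMD_TYPE_COMPUTE case there.
 */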
2032
2033 static VkResult
2034 pvr_cmd_buffer_process_deferred_clears(struct pvr_cmd_buffer *cmd_buffer)
2035 {
2036 util_dynarray_foreach (&cmd_buffer->deferred_clears,
2037 struct pvr_transfer_cmd,
2038 transfer_cmd) {
2039 VkResult result;
2040
2041 result = pvr_cmd_buffer_add_transfer_cmd(cmd_buffer, transfer_cmd);
2042 if (result != VK_SUCCESS)
2043 return result;
2044
2045 cmd_buffer->state.current_sub_cmd->transfer.serialize_with_frag = true;
2046 }
2047
2048 return VK_SUCCESS;
2049 }
2050
2051 VkResult pvr_cmd_buffer_end_sub_cmd(struct pvr_cmd_buffer *cmd_buffer)
2052 {
2053 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2054 struct pvr_sub_cmd *sub_cmd = state->current_sub_cmd;
2055 struct pvr_device *device = cmd_buffer->device;
2056 const struct pvr_query_pool *query_pool = NULL;
2057 struct pvr_suballoc_bo *query_bo = NULL;
2058 size_t query_indices_size = 0;
2059 VkResult result;
2060
2061 /* FIXME: Is this NULL check required because this function is called from
2062 * pvr_resolve_unemitted_resolve_attachments()? See comment about this
2063 * function being called twice in a row in pvr_CmdEndRenderPass().
2064 */
2065 if (!sub_cmd)
2066 return VK_SUCCESS;
2067
2068 if (!sub_cmd->owned) {
2069 state->current_sub_cmd = NULL;
2070 return VK_SUCCESS;
2071 }
2072
2073 switch (sub_cmd->type) {
2074 case PVR_SUB_CMD_TYPE_GRAPHICS: {
2075 struct pvr_sub_cmd_gfx *const gfx_sub_cmd = &sub_cmd->gfx;
2076
2077 query_indices_size =
2078 util_dynarray_num_elements(&state->query_indices, char);
2079
2080 if (query_indices_size > 0) {
2081 const bool secondary_cont =
2082 cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2083 cmd_buffer->usage_flags &
2084 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
2085
2086 assert(gfx_sub_cmd->query_pool);
2087
2088 if (secondary_cont) {
2089 util_dynarray_append_dynarray(&state->query_indices,
2090 &gfx_sub_cmd->sec_query_indices);
2091 } else {
2092 const void *data = util_dynarray_begin(&state->query_indices);
2093
2094 result = pvr_cmd_buffer_upload_general(cmd_buffer,
2095 data,
2096 query_indices_size,
2097 &query_bo);
2098 if (result != VK_SUCCESS)
2099 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2100
2101 query_pool = gfx_sub_cmd->query_pool;
2102 }
2103
2104 gfx_sub_cmd->has_occlusion_query = true;
2105
2106 util_dynarray_clear(&state->query_indices);
2107 }
2108
2109 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
2110 result = pvr_csb_emit_return(&gfx_sub_cmd->control_stream);
2111 if (result != VK_SUCCESS)
2112 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2113
2114 break;
2115 }
2116
2117 /* TODO: Check if the sub_cmd can be skipped based on
2118 * sub_cmd->gfx.empty_cmd flag.
2119 */
2120
2121 /* TODO: Set the state in the functions called with the command buffer
2122 * instead of here.
2123 */
2124
2125 result = pvr_cmd_buffer_upload_tables(device, cmd_buffer, gfx_sub_cmd);
2126 if (result != VK_SUCCESS)
2127 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2128
2129 result = pvr_cmd_buffer_emit_ppp_state(cmd_buffer,
2130 &gfx_sub_cmd->control_stream);
2131 if (result != VK_SUCCESS)
2132 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2133
2134 result = pvr_csb_emit_terminate(&gfx_sub_cmd->control_stream);
2135 if (result != VK_SUCCESS)
2136 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2137
2138 result = pvr_sub_cmd_gfx_job_init(&device->pdevice->dev_info,
2139 cmd_buffer,
2140 gfx_sub_cmd);
2141 if (result != VK_SUCCESS)
2142 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2143
2144 if (pvr_sub_cmd_gfx_requires_split_submit(gfx_sub_cmd)) {
2145 result = pvr_sub_cmd_gfx_build_terminate_ctrl_stream(device,
2146 cmd_buffer,
2147 gfx_sub_cmd);
2148 if (result != VK_SUCCESS)
2149 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2150 }
2151
2152 break;
2153 }
2154
2155 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2156 case PVR_SUB_CMD_TYPE_COMPUTE: {
2157 struct pvr_sub_cmd_compute *const compute_sub_cmd = &sub_cmd->compute;
2158
2159 pvr_compute_generate_fence(cmd_buffer, compute_sub_cmd, true);
2160
2161 result = pvr_csb_emit_terminate(&compute_sub_cmd->control_stream);
2162 if (result != VK_SUCCESS)
2163 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2164
2165 pvr_sub_cmd_compute_job_init(device->pdevice,
2166 cmd_buffer,
2167 compute_sub_cmd);
2168 break;
2169 }
2170
2171 case PVR_SUB_CMD_TYPE_TRANSFER:
2172 break;
2173
2174 case PVR_SUB_CMD_TYPE_EVENT:
2175 break;
2176
2177 default:
2178 unreachable("Unsupported sub-command type");
2179 }
2180
2181 state->current_sub_cmd = NULL;
2182
2183 /* pvr_cmd_buffer_process_deferred_clears() must be called with a NULL
2184 * current_sub_cmd.
2185 *
2186 * We can start a sub_cmd of a different type from the current sub_cmd only
2187 * after having ended the current sub_cmd. However, we can't end the current
2188 * sub_cmd if this depends on starting sub_cmd(s) of a different type. Hence,
2189 * don't try to start transfer sub_cmd(s) with
2190 * pvr_cmd_buffer_process_deferred_clears() until the current one has ended.
2191 * Failing to do so will cause a circular dependency between
2192 * pvr_cmd_buffer_{end,start}_sub_cmd and blow the stack.
2193 */
2194 if (sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
2195 result = pvr_cmd_buffer_process_deferred_clears(cmd_buffer);
2196 if (result != VK_SUCCESS)
2197 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2198 }
2199
2200 if (query_pool) {
2201 struct pvr_query_info query_info;
2202
2203 assert(query_bo);
2204 assert(query_indices_size);
2205
2206 query_info.type = PVR_QUERY_TYPE_AVAILABILITY_WRITE;
2207
2208 /* sizeof(uint32_t) is the size of a single query index. */
2209 query_info.availability_write.num_query_indices =
2210 query_indices_size / sizeof(uint32_t);
2211 query_info.availability_write.index_bo = query_bo;
2212
2213 query_info.availability_write.num_queries = query_pool->query_count;
2214 query_info.availability_write.availability_bo =
2215 query_pool->availability_buffer;
2216
2217 /* Insert a barrier after the graphics sub command and before the
2218 * query sub command so that the availability write program waits for the
2219 * fragment shader to complete.
2220 */
2221
2222 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
2223 if (result != VK_SUCCESS)
2224 return result;
2225
2226 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
2227 .type = PVR_EVENT_TYPE_BARRIER,
2228 .barrier = {
2229 .wait_for_stage_mask = PVR_PIPELINE_STAGE_FRAG_BIT,
2230 .wait_at_stage_mask = PVR_PIPELINE_STAGE_OCCLUSION_QUERY_BIT,
2231 },
2232 };
2233
2234 return pvr_add_query_program(cmd_buffer, &query_info);
2235 }
2236
2237 return VK_SUCCESS;
2238 }
2239
2240 void pvr_reset_graphics_dirty_state(struct pvr_cmd_buffer *const cmd_buffer,
2241 bool start_geom)
2242 {
2243 struct vk_dynamic_graphics_state *const dynamic_state =
2244 &cmd_buffer->vk.dynamic_graphics_state;
2245
2246 if (start_geom) {
2247 /*
2248 * Initial geometry phase state.
2249 * It's the driver's responsibility to ensure that the state of the
2250 * hardware is correctly initialized at the start of every geometry
2251 * phase. This is required to prevent stale state from a previous
2252 * geometry phase erroneously affecting the next geometry phase.
2253 *
2254 * If a geometry phase does not contain any geometry, this restriction
2255 * can be ignored. If the first draw call in a geometry phase will only
2256 * update the depth or stencil buffers i.e. ISP_TAGWRITEDISABLE is set
2257 * in the ISP State Control Word, the PDS State Pointers
2258 * (TA_PRES_PDSSTATEPTR*) in the first PPP State Update do not need to
2259 * be supplied, since they will never reach the PDS in the fragment
2260 * phase.
2261 */
2262
2263 cmd_buffer->state.emit_header = (struct PVRX(TA_STATE_HEADER)){
2264 .pres_stream_out_size = true,
2265 .pres_ppp_ctrl = true,
2266 .pres_varying_word2 = true,
2267 .pres_varying_word1 = true,
2268 .pres_varying_word0 = true,
2269 .pres_outselects = true,
2270 .pres_wclamp = true,
2271 .pres_viewport = true,
2272 .pres_region_clip = true,
2273 .pres_pds_state_ptr0 = true,
2274 .pres_ispctl_fb = true,
2275 .pres_ispctl = true,
2276 };
2277 } else {
2278 struct PVRX(TA_STATE_HEADER) *const emit_header =
2279 &cmd_buffer->state.emit_header;
2280
2281 emit_header->pres_ppp_ctrl = true;
2282 emit_header->pres_varying_word1 = true;
2283 emit_header->pres_varying_word0 = true;
2284 emit_header->pres_outselects = true;
2285 emit_header->pres_viewport = true;
2286 emit_header->pres_region_clip = true;
2287 emit_header->pres_pds_state_ptr0 = true;
2288 emit_header->pres_ispctl_fb = true;
2289 emit_header->pres_ispctl = true;
2290 }
2291
2292 memset(&cmd_buffer->state.ppp_state,
2293 0U,
2294 sizeof(cmd_buffer->state.ppp_state));
2295
2296 cmd_buffer->state.dirty.vertex_bindings = true;
2297 cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2298
2299 BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2300 BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
2301 }
2302
2303 static inline bool
2304 pvr_cmd_uses_deferred_cs_cmds(const struct pvr_cmd_buffer *const cmd_buffer)
2305 {
2306 const VkCommandBufferUsageFlags deferred_control_stream_flags =
2307 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT |
2308 VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;
2309
2310 return cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
2311 (cmd_buffer->usage_flags & deferred_control_stream_flags) ==
2312 deferred_control_stream_flags;
2313 }
2314
2315 VkResult pvr_cmd_buffer_start_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
2316 enum pvr_sub_cmd_type type)
2317 {
2318 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2319 struct pvr_device *device = cmd_buffer->device;
2320 struct pvr_sub_cmd *sub_cmd;
2321 VkResult result;
2322
2323 /* Check the current status of the buffer. */
2324 if (vk_command_buffer_has_error(&cmd_buffer->vk))
2325 return vk_command_buffer_get_record_result(&cmd_buffer->vk);
2326
2327 pvr_cmd_buffer_update_barriers(cmd_buffer, type);
2328
2329 /* TODO: Add proper support for joining consecutive event sub_cmd? */
2330 if (state->current_sub_cmd) {
2331 if (state->current_sub_cmd->type == type) {
2332 /* Continue adding to the current sub command. */
2333 return VK_SUCCESS;
2334 }
2335
2336 /* End the current sub command. */
2337 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
2338 if (result != VK_SUCCESS)
2339 return result;
2340 }
2341
2342 sub_cmd = vk_zalloc(&cmd_buffer->vk.pool->alloc,
2343 sizeof(*sub_cmd),
2344 8,
2345 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2346 if (!sub_cmd) {
2347 return vk_command_buffer_set_error(&cmd_buffer->vk,
2348 VK_ERROR_OUT_OF_HOST_MEMORY);
2349 }
2350
2351 sub_cmd->type = type;
2352 sub_cmd->owned = true;
2353
2354 switch (type) {
2355 case PVR_SUB_CMD_TYPE_GRAPHICS:
2356 sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2357 sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_UNDEFINED;
2358 sub_cmd->gfx.modifies_depth = false;
2359 sub_cmd->gfx.modifies_stencil = false;
2360 sub_cmd->gfx.max_tiles_in_flight =
2361 PVR_GET_FEATURE_VALUE(&device->pdevice->dev_info,
2362 isp_max_tiles_in_flight,
2363 1);
2364 sub_cmd->gfx.hw_render_idx = state->render_pass_info.current_hw_subpass;
2365 sub_cmd->gfx.framebuffer = state->render_pass_info.framebuffer;
2366 sub_cmd->gfx.empty_cmd = true;
2367
2368 if (state->vis_test_enabled)
2369 sub_cmd->gfx.query_pool = state->query_pool;
2370
2371 pvr_reset_graphics_dirty_state(cmd_buffer, true);
2372
2373 if (pvr_cmd_uses_deferred_cs_cmds(cmd_buffer)) {
2374 pvr_csb_init(device,
2375 PVR_CMD_STREAM_TYPE_GRAPHICS_DEFERRED,
2376 &sub_cmd->gfx.control_stream);
2377 } else {
2378 pvr_csb_init(device,
2379 PVR_CMD_STREAM_TYPE_GRAPHICS,
2380 &sub_cmd->gfx.control_stream);
2381 }
2382
2383 util_dynarray_init(&sub_cmd->gfx.sec_query_indices, NULL);
2384 break;
2385
2386 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
2387 case PVR_SUB_CMD_TYPE_COMPUTE:
2388 pvr_csb_init(device,
2389 PVR_CMD_STREAM_TYPE_COMPUTE,
2390 &sub_cmd->compute.control_stream);
2391 break;
2392
2393 case PVR_SUB_CMD_TYPE_TRANSFER:
2394 sub_cmd->transfer.transfer_cmds = &sub_cmd->transfer.transfer_cmds_priv;
2395 list_inithead(sub_cmd->transfer.transfer_cmds);
2396 break;
2397
2398 case PVR_SUB_CMD_TYPE_EVENT:
2399 break;
2400
2401 default:
2402 unreachable("Unsupported sub-command type");
2403 }
2404
2405 list_addtail(&sub_cmd->link, &cmd_buffer->sub_cmds);
2406 state->current_sub_cmd = sub_cmd;
2407
2408 return VK_SUCCESS;
2409 }
2410
2411 VkResult pvr_cmd_buffer_alloc_mem(struct pvr_cmd_buffer *cmd_buffer,
2412 struct pvr_winsys_heap *heap,
2413 uint64_t size,
2414 struct pvr_suballoc_bo **const pvr_bo_out)
2415 {
2416 const uint32_t cache_line_size =
2417 rogue_get_slc_cache_line_size(&cmd_buffer->device->pdevice->dev_info);
2418 struct pvr_suballoc_bo *suballoc_bo;
2419 struct pvr_suballocator *allocator;
2420 VkResult result;
2421
2422 if (heap == cmd_buffer->device->heaps.general_heap)
2423 allocator = &cmd_buffer->device->suballoc_general;
2424 else if (heap == cmd_buffer->device->heaps.pds_heap)
2425 allocator = &cmd_buffer->device->suballoc_pds;
2426 else if (heap == cmd_buffer->device->heaps.transfer_frag_heap)
2427 allocator = &cmd_buffer->device->suballoc_transfer;
2428 else if (heap == cmd_buffer->device->heaps.usc_heap)
2429 allocator = &cmd_buffer->device->suballoc_usc;
2430 else
2431 unreachable("Unknown heap type");
2432
2433 result =
2434 pvr_bo_suballoc(allocator, size, cache_line_size, false, &suballoc_bo);
2435 if (result != VK_SUCCESS)
2436 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
2437
2438 list_add(&suballoc_bo->link, &cmd_buffer->bo_list);
2439
2440 *pvr_bo_out = suballoc_bo;
2441
2442 return VK_SUCCESS;
2443 }
2444
2445 static void pvr_cmd_bind_compute_pipeline(
2446 const struct pvr_compute_pipeline *const compute_pipeline,
2447 struct pvr_cmd_buffer *const cmd_buffer)
2448 {
2449 cmd_buffer->state.compute_pipeline = compute_pipeline;
2450 cmd_buffer->state.dirty.compute_pipeline_binding = true;
2451 }
2452
2453 static void pvr_cmd_bind_graphics_pipeline(
2454 const struct pvr_graphics_pipeline *const gfx_pipeline,
2455 struct pvr_cmd_buffer *const cmd_buffer)
2456 {
2457 cmd_buffer->state.gfx_pipeline = gfx_pipeline;
2458 cmd_buffer->state.dirty.gfx_pipeline_binding = true;
2459
2460 vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk,
2461 &gfx_pipeline->dynamic_state);
2462 }
2463
2464 void pvr_CmdBindPipeline(VkCommandBuffer commandBuffer,
2465 VkPipelineBindPoint pipelineBindPoint,
2466 VkPipeline _pipeline)
2467 {
2468 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2469 PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2470
2471 switch (pipelineBindPoint) {
2472 case VK_PIPELINE_BIND_POINT_COMPUTE:
2473 pvr_cmd_bind_compute_pipeline(to_pvr_compute_pipeline(pipeline),
2474 cmd_buffer);
2475 break;
2476
2477 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2478 pvr_cmd_bind_graphics_pipeline(to_pvr_graphics_pipeline(pipeline),
2479 cmd_buffer);
2480 break;
2481
2482 default:
2483 unreachable("Invalid bind point.");
2484 break;
2485 }
2486 }
2487
2488 #if MESA_DEBUG
2489 static void check_viewport_quirk_70165(const struct pvr_device *device,
2490 const VkViewport *pViewport)
2491 {
2492 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
2493 float min_vertex_x, max_vertex_x, min_vertex_y, max_vertex_y;
2494 float min_screen_space_value, max_screen_space_value;
2495 float sign_to_unsigned_offset, fixed_point_max;
2496 float guardband_width, guardband_height;
2497
2498 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
2499 /* Max representable value in 13.4 fixed point format.
2500 * Round-down to avoid precision issues.
2501 * Calculated as (2 ** 13) - 2*(2 ** -4)
2502 */
2503 fixed_point_max = 8192.0f - 2.0f / 16.0f;
2504
2505 if (PVR_HAS_FEATURE(dev_info, screen_size8K)) {
2506 if (pViewport->width <= 4096 && pViewport->height <= 4096) {
2507 guardband_width = pViewport->width / 4.0f;
2508 guardband_height = pViewport->height / 4.0f;
2509
2510 /* 2k of the range is negative */
2511 sign_to_unsigned_offset = 2048.0f;
2512 } else {
2513 guardband_width = 0.0f;
2514 guardband_height = 0.0f;
2515
2516 /* For > 4k renders, the entire range is positive */
2517 sign_to_unsigned_offset = 0.0f;
2518 }
2519 } else {
2520 guardband_width = pViewport->width / 4.0f;
2521 guardband_height = pViewport->height / 4.0f;
2522
2523 /* 2k of the range is negative */
2524 sign_to_unsigned_offset = 2048.0f;
2525 }
2526 } else {
2527 /* Max representable value in 16.8 fixed point format.
2528 * Calculated as (2 ** 16) - (2 ** -8).
2529 */
2530 fixed_point_max = 65535.99609375f;
2531 guardband_width = pViewport->width / 4.0f;
2532 guardband_height = pViewport->height / 4.0f;
2533
2534 /* 4k/20k of the range is negative */
2535 sign_to_unsigned_offset = (float)PVR_MAX_NEG_OFFSCREEN_OFFSET;
2536 }
2537
2538 min_screen_space_value = -sign_to_unsigned_offset;
2539 max_screen_space_value = fixed_point_max - sign_to_unsigned_offset;
2540
2541 min_vertex_x = pViewport->x - guardband_width;
2542 max_vertex_x = pViewport->x + pViewport->width + guardband_width;
2543 min_vertex_y = pViewport->y - guardband_height;
2544 max_vertex_y = pViewport->y + pViewport->height + guardband_height;
2545 if (min_vertex_x < min_screen_space_value ||
2546 max_vertex_x > max_screen_space_value ||
2547 min_vertex_y < min_screen_space_value ||
2548 max_vertex_y > max_screen_space_value) {
2549 mesa_logw("Viewport is affected by BRN70165, geometry outside "
2550 "the viewport could be corrupted");
2551 }
2552 }
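/* Illustrative check (values assumed, simple_internal_parameter_format
 * path without screen_size8K): a 1920x1080 viewport at (0, 0) gives a
 * 480x270 guardband, so vertices span [-480, 2400] x [-270, 1350], well
 * within [-2048, 8191.875 - 2048], and no warning is emitted.
 */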
2553 #endif
2554
2555 void pvr_CmdSetViewport(VkCommandBuffer commandBuffer,
2556 uint32_t firstViewport,
2557 uint32_t viewportCount,
2558 const VkViewport *pViewports)
2559 {
2560 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2561 const uint32_t total_count = firstViewport + viewportCount;
2562
2563 assert(firstViewport < PVR_MAX_VIEWPORTS && viewportCount > 0);
2564 assert(total_count >= 1 && total_count <= PVR_MAX_VIEWPORTS);
2565
2566 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2567
2568 #if MESA_DEBUG
2569 if (PVR_HAS_QUIRK(&cmd_buffer->device->pdevice->dev_info, 70165)) {
2570 for (uint32_t viewport = 0; viewport < viewportCount; viewport++) {
2571 check_viewport_quirk_70165(cmd_buffer->device, &pViewports[viewport]);
2572 }
2573 }
2574 #endif
2575
2576 vk_common_CmdSetViewport(commandBuffer,
2577 firstViewport,
2578 viewportCount,
2579 pViewports);
2580 }
2581
2582 void pvr_CmdSetDepthBounds(VkCommandBuffer commandBuffer,
2583 float minDepthBounds,
2584 float maxDepthBounds)
2585 {
2586 mesa_logd("No support for depth bounds testing.");
2587 }
2588
2589 void pvr_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
2590 VkPipelineBindPoint pipelineBindPoint,
2591 VkPipelineLayout _layout,
2592 uint32_t firstSet,
2593 uint32_t descriptorSetCount,
2594 const VkDescriptorSet *pDescriptorSets,
2595 uint32_t dynamicOffsetCount,
2596 const uint32_t *pDynamicOffsets)
2597 {
2598 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2599 struct pvr_descriptor_state *descriptor_state;
2600
2601 assert(firstSet + descriptorSetCount <= PVR_MAX_DESCRIPTOR_SETS);
2602
2603 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2604
2605 switch (pipelineBindPoint) {
2606 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2607 case VK_PIPELINE_BIND_POINT_COMPUTE:
2608 break;
2609
2610 default:
2611 unreachable("Unsupported bind point.");
2612 break;
2613 }
2614
2615 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
2616 descriptor_state = &cmd_buffer->state.gfx_desc_state;
2617 cmd_buffer->state.dirty.gfx_desc_dirty = true;
2618 } else {
2619 descriptor_state = &cmd_buffer->state.compute_desc_state;
2620 cmd_buffer->state.dirty.compute_desc_dirty = true;
2621 }
2622
2623 for (uint32_t i = 0; i < descriptorSetCount; i++) {
2624 PVR_FROM_HANDLE(pvr_descriptor_set, set, pDescriptorSets[i]);
2625 uint32_t index = firstSet + i;
2626
2627 if (descriptor_state->descriptor_sets[index] != set) {
2628 descriptor_state->descriptor_sets[index] = set;
2629 descriptor_state->valid_mask |= (1u << index);
2630 }
2631 }
2632
2633 if (dynamicOffsetCount > 0) {
2634 PVR_FROM_HANDLE(pvr_pipeline_layout, pipeline_layout, _layout);
2635 uint32_t set_offset = 0;
2636
2637 for (uint32_t set = 0; set < firstSet; set++)
2638 set_offset += pipeline_layout->set_layout[set]->dynamic_buffer_count;
2639
2640 assert(set_offset + dynamicOffsetCount <=
2641 ARRAY_SIZE(descriptor_state->dynamic_offsets));
2642
2643 /* From the Vulkan 1.3.238 spec. :
2644 *
2645 * "If any of the sets being bound include dynamic uniform or storage
2646 * buffers, then pDynamicOffsets includes one element for each array
2647 * element in each dynamic descriptor type binding in each set."
2648 *
2649 */
2650 for (uint32_t i = 0; i < dynamicOffsetCount; i++)
2651 descriptor_state->dynamic_offsets[set_offset + i] = pDynamicOffsets[i];
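/* For illustration (assumed layout): if set 0 declares two dynamic
 * buffers and this call binds from firstSet = 1 with
 * dynamicOffsetCount = 3, set_offset starts at 2 and
 * pDynamicOffsets[0..2] land in dynamic_offsets[2..4].
 */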
2652 }
2653 }
2654
2655 void pvr_CmdBindVertexBuffers(VkCommandBuffer commandBuffer,
2656 uint32_t firstBinding,
2657 uint32_t bindingCount,
2658 const VkBuffer *pBuffers,
2659 const VkDeviceSize *pOffsets)
2660 {
2661 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2662 struct pvr_vertex_binding *const vb = cmd_buffer->state.vertex_bindings;
2663
2664 /* We have to defer setting up the vertex buffers since we need the buffer
2665 * stride from the pipeline.
2666 */
2667
2668 assert(firstBinding < PVR_MAX_VERTEX_INPUT_BINDINGS &&
2669 bindingCount <= PVR_MAX_VERTEX_INPUT_BINDINGS);
2670
2671 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2672
2673 for (uint32_t i = 0; i < bindingCount; i++) {
2674 vb[firstBinding + i].buffer = pvr_buffer_from_handle(pBuffers[i]);
2675 vb[firstBinding + i].offset = pOffsets[i];
2676 }
2677
2678 cmd_buffer->state.dirty.vertex_bindings = true;
2679 }
2680
2681 void pvr_CmdBindIndexBuffer(VkCommandBuffer commandBuffer,
2682 VkBuffer buffer,
2683 VkDeviceSize offset,
2684 VkIndexType indexType)
2685 {
2686 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2687 PVR_FROM_HANDLE(pvr_buffer, index_buffer, buffer);
2688 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2689
2690 assert(offset < index_buffer->vk.size);
2691 assert(indexType == VK_INDEX_TYPE_UINT32 ||
2692 indexType == VK_INDEX_TYPE_UINT16 ||
2693 indexType == VK_INDEX_TYPE_UINT8_KHR);
2694
2695 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2696
2697 state->index_buffer_binding.buffer = index_buffer;
2698 state->index_buffer_binding.offset = offset;
2699 state->index_buffer_binding.type = indexType;
2700 state->dirty.index_buffer_binding = true;
2701 }
2702
2703 void pvr_CmdPushConstants(VkCommandBuffer commandBuffer,
2704 VkPipelineLayout layout,
2705 VkShaderStageFlags stageFlags,
2706 uint32_t offset,
2707 uint32_t size,
2708 const void *pValues)
2709 {
2710 #if MESA_DEBUG
2711 const uint64_t ending = (uint64_t)offset + (uint64_t)size;
2712 #endif
2713
2714 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
2715 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
2716
2717 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
2718
2719 pvr_assert(ending <= PVR_MAX_PUSH_CONSTANTS_SIZE);
2720
2721 memcpy(&state->push_constants.data[offset], pValues, size);
2722
2723 state->push_constants.dirty_stages |= stageFlags;
2724 state->push_constants.uploaded = false;
2725 }
2726
2727 static VkResult
2728 pvr_cmd_buffer_setup_attachments(struct pvr_cmd_buffer *cmd_buffer,
2729 const struct pvr_render_pass *pass,
2730 const struct pvr_framebuffer *framebuffer)
2731 {
2732 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2733 struct pvr_render_pass_info *info = &state->render_pass_info;
2734
2735 assert(pass->attachment_count == framebuffer->attachment_count);
2736
2737 /* Free any previously allocated attachments. */
2738 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.attachments);
2739
2740 if (pass->attachment_count == 0) {
2741 info->attachments = NULL;
2742 return VK_SUCCESS;
2743 }
2744
2745 info->attachments =
2746 vk_zalloc(&cmd_buffer->vk.pool->alloc,
2747 pass->attachment_count * sizeof(*info->attachments),
2748 8,
2749 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2750 if (!info->attachments) {
2751 return vk_command_buffer_set_error(&cmd_buffer->vk,
2752 VK_ERROR_OUT_OF_HOST_MEMORY);
2753 }
2754
2755 for (uint32_t i = 0; i < pass->attachment_count; i++)
2756 info->attachments[i] = framebuffer->attachments[i];
2757
2758 return VK_SUCCESS;
2759 }
2760
2761 static VkResult pvr_init_render_targets(struct pvr_device *device,
2762 struct pvr_render_pass *pass,
2763 struct pvr_framebuffer *framebuffer)
2764 {
2765 for (uint32_t i = 0; i < pass->hw_setup->render_count; i++) {
2766 struct pvr_render_target *render_target =
2767 pvr_get_render_target(pass, framebuffer, i);
2768
2769 pthread_mutex_lock(&render_target->mutex);
2770
2771 if (!render_target->valid) {
2772 const struct pvr_renderpass_hwsetup_render *hw_render =
2773 &pass->hw_setup->renders[i];
2774 VkResult result;
2775
2776 result = pvr_render_target_dataset_create(device,
2777 framebuffer->width,
2778 framebuffer->height,
2779 hw_render->sample_count,
2780 framebuffer->layers,
2781 &render_target->rt_dataset);
2782 if (result != VK_SUCCESS) {
2783 pthread_mutex_unlock(&render_target->mutex);
2784 return result;
2785 }
2786
2787 render_target->valid = true;
2788 }
2789
2790 pthread_mutex_unlock(&render_target->mutex);
2791 }
2792
2793 return VK_SUCCESS;
2794 }
2795
2796 const struct pvr_renderpass_hwsetup_subpass *
2797 pvr_get_hw_subpass(const struct pvr_render_pass *pass, const uint32_t subpass)
2798 {
2799 const struct pvr_renderpass_hw_map *map =
2800 &pass->hw_setup->subpass_map[subpass];
2801
2802 return &pass->hw_setup->renders[map->render].subpasses[map->subpass];
2803 }
2804
2805 static void pvr_perform_start_of_render_attachment_clear(
2806 struct pvr_cmd_buffer *cmd_buffer,
2807 const struct pvr_framebuffer *framebuffer,
2808 uint32_t index,
2809 bool is_depth_stencil,
2810 uint32_t *index_list_clear_mask)
2811 {
2812 ASSERTED static const VkImageAspectFlags dsc_aspect_flags =
2813 VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT |
2814 VK_IMAGE_ASPECT_COLOR_BIT;
2815 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2816 const struct pvr_render_pass *pass = info->pass;
2817 const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2818 const struct pvr_renderpass_hwsetup_render *hw_render =
2819 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2820 VkImageAspectFlags image_aspect;
2821 struct pvr_image_view *iview;
2822 uint32_t view_idx;
2823
2824 if (is_depth_stencil) {
2825 bool stencil_clear;
2826 bool depth_clear;
2827 bool is_stencil;
2828 bool is_depth;
2829
2830 assert(hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED);
2831 assert(index == 0);
2832
2833 view_idx = hw_render->ds_attach_idx;
2834
2835 is_depth = vk_format_has_depth(pass->attachments[view_idx].vk_format);
2836 is_stencil = vk_format_has_stencil(pass->attachments[view_idx].vk_format);
2837 depth_clear = hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2838 stencil_clear = hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR;
2839
2840 /* Attempt to clear the ds attachment. Do not erroneously discard an
2841 * attachment that has no depth clear but does have a stencil clear.
2842 */
2843 /* i.e. skip unless (is_depth ∧ depth_clear) ∨ (is_stencil ∧ stencil_clear) */
2844 if (!((is_depth && depth_clear) || (is_stencil && stencil_clear)))
2845 return;
2846 } else if (hw_render->color_init[index].op != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2847 return;
2848 } else {
2849 view_idx = hw_render->color_init[index].index;
2850 }
2851
2852 iview = info->attachments[view_idx];
2853
2854 /* FIXME: It would be nice if this function and pvr_sub_cmd_gfx_job_init()
2855 * were doing the same check (even if it's just an assert) to determine if a
2856 * clear is needed.
2857 */
2858 /* If this is single-layer fullscreen, we already do the clears in
2859 * pvr_sub_cmd_gfx_job_init().
2860 */
2861 if (pvr_is_render_area_tile_aligned(cmd_buffer, iview) &&
2862 framebuffer->layers == 1) {
2863 return;
2864 }
2865
2866 image_aspect = vk_format_aspects(pass->attachments[view_idx].vk_format);
2867 assert((image_aspect & ~dsc_aspect_flags) == 0);
2868
2869 if (image_aspect & VK_IMAGE_ASPECT_DEPTH_BIT &&
2870 hw_render->depth_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2871 image_aspect &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
2872 }
2873
2874 if (image_aspect & VK_IMAGE_ASPECT_STENCIL_BIT &&
2875 hw_render->stencil_init != VK_ATTACHMENT_LOAD_OP_CLEAR) {
2876 image_aspect &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
2877 }
2878
2879 if (image_aspect != VK_IMAGE_ASPECT_NONE) {
2880 VkClearAttachment clear_attachment = {
2881 .aspectMask = image_aspect,
2882 .colorAttachment = index,
2883 .clearValue = info->clear_values[view_idx],
2884 };
2885 VkClearRect rect = {
2886 .rect = info->render_area,
2887 .baseArrayLayer = 0,
2888 .layerCount = info->framebuffer->layers,
2889 };
2890
2891 assert(view_idx < info->clear_value_count);
2892
2893 pvr_clear_attachments_render_init(cmd_buffer, &clear_attachment, &rect);
2894
2895 *index_list_clear_mask |= (1 << index);
2896 }
2897 }
2898
2899 static void
2900 pvr_perform_start_of_render_clears(struct pvr_cmd_buffer *cmd_buffer)
2901 {
2902 struct pvr_render_pass_info *info = &cmd_buffer->state.render_pass_info;
2903 const struct pvr_framebuffer *framebuffer = info->framebuffer;
2904 const struct pvr_render_pass *pass = info->pass;
2905 const struct pvr_renderpass_hwsetup *hw_setup = pass->hw_setup;
2906 const struct pvr_renderpass_hwsetup_render *hw_render =
2907 &hw_setup->renders[hw_setup->subpass_map[info->subpass_idx].render];
2908
2909 /* Mask of attachments cleared using index lists instead of the background
2910 * object.
2911 */
2912 uint32_t index_list_clear_mask = 0;
2913
2914 for (uint32_t i = 0; i < hw_render->color_init_count; i++) {
2915 pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2916 framebuffer,
2917 i,
2918 false,
2919 &index_list_clear_mask);
2920 }
2921
2922 info->enable_bg_tag = !!hw_render->color_init_count;
2923
2924 /* If we're not using index lists for all clears/loads then we need to run
2925 * the background object on empty tiles.
2926 */
2927 if (hw_render->color_init_count &&
2928 index_list_clear_mask != ((1u << hw_render->color_init_count) - 1u)) {
2929 info->process_empty_tiles = true;
2930 } else {
2931 info->process_empty_tiles = false;
2932 }
2933
2934 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2935 uint32_t ds_index_list = 0;
2936
2937 pvr_perform_start_of_render_attachment_clear(cmd_buffer,
2938 framebuffer,
2939 0,
2940 true,
2941 &ds_index_list);
2942 }
2943
2944 if (index_list_clear_mask)
2945 pvr_finishme("Add support for generating loadops shaders!");
2946 }
2947
2948 static void pvr_stash_depth_format(struct pvr_cmd_buffer_state *state,
2949 struct pvr_sub_cmd_gfx *const sub_cmd)
2950 {
2951 const struct pvr_render_pass *pass = state->render_pass_info.pass;
2952 const struct pvr_renderpass_hwsetup_render *hw_render =
2953 &pass->hw_setup->renders[sub_cmd->hw_render_idx];
2954
2955 if (hw_render->ds_attach_idx != VK_ATTACHMENT_UNUSED) {
2956 struct pvr_image_view **iviews = state->render_pass_info.attachments;
2957
2958 state->depth_format = iviews[hw_render->ds_attach_idx]->vk.format;
2959 }
2960 }
2961
2962 static bool pvr_loadops_contain_clear(struct pvr_renderpass_hwsetup *hw_setup)
2963 {
2964 for (uint32_t i = 0; i < hw_setup->render_count; i++) {
2965 struct pvr_renderpass_hwsetup_render *hw_render = &hw_setup->renders[i];
2966 uint32_t render_targets_count = hw_render->init_setup.num_render_targets;
2967
2968 for (uint32_t j = 0;
2969 j < (hw_render->color_init_count * render_targets_count);
2970 j += render_targets_count) {
2971 for (uint32_t k = 0; k < hw_render->init_setup.num_render_targets;
2972 k++) {
2973 if (hw_render->color_init[j + k].op ==
2974 VK_ATTACHMENT_LOAD_OP_CLEAR) {
2975 return true;
2976 }
2977 }
2978 }
2979 if (hw_render->depth_init == VK_ATTACHMENT_LOAD_OP_CLEAR ||
2980 hw_render->stencil_init == VK_ATTACHMENT_LOAD_OP_CLEAR) {
2981 return true;
2982 }
2983 }
2984
2985 return false;
2986 }
2987
2988 static VkResult
2989 pvr_cmd_buffer_set_clear_values(struct pvr_cmd_buffer *cmd_buffer,
2990 const VkRenderPassBeginInfo *pRenderPassBegin)
2991 {
2992 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
2993
2994 /* Free any previously allocated clear values. */
2995 vk_free(&cmd_buffer->vk.pool->alloc, state->render_pass_info.clear_values);
2996
2997 if (pRenderPassBegin->clearValueCount) {
2998 const size_t size = pRenderPassBegin->clearValueCount *
2999 sizeof(*state->render_pass_info.clear_values);
3000
3001 state->render_pass_info.clear_values =
3002 vk_zalloc(&cmd_buffer->vk.pool->alloc,
3003 size,
3004 8,
3005 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3006 if (!state->render_pass_info.clear_values) {
3007 return vk_command_buffer_set_error(&cmd_buffer->vk,
3008 VK_ERROR_OUT_OF_HOST_MEMORY);
3009 }
3010
3011 memcpy(state->render_pass_info.clear_values,
3012 pRenderPassBegin->pClearValues,
3013 size);
3014 } else {
3015 state->render_pass_info.clear_values = NULL;
3016 }
3017
3018 state->render_pass_info.clear_value_count =
3019 pRenderPassBegin->clearValueCount;
3020
3021 return VK_SUCCESS;
3022 }
3023
3024 /**
3025 * \brief Indicates whether to use the large or normal clear state words.
3026 *
3027 * If the current render area can fit within a quarter of the max framebuffer
3028 * that the device is capable of, we can use the normal clear state words,
3029 * otherwise the large clear state words are needed.
3030 *
3031 * The requirement of a quarter of the max framebuffer comes from the index
3032 * count used in the normal clear state words and the vertices uploaded at
3033 * device creation.
3034 *
3035 * \param[in] cmd_buffer The command buffer for the clear.
3036 * \return true if large clear state words are required.
3037 */
3038 static bool
3039 pvr_is_large_clear_required(const struct pvr_cmd_buffer *const cmd_buffer)
3040 {
3041 const struct pvr_device_info *const dev_info =
3042 &cmd_buffer->device->pdevice->dev_info;
3043 const VkRect2D render_area = cmd_buffer->state.render_pass_info.render_area;
3044 const uint32_t vf_max_x = rogue_get_param_vf_max_x(dev_info);
3045 const uint32_t vf_max_y = rogue_get_param_vf_max_x(dev_info);
3046
3047 return (render_area.extent.width > (vf_max_x / 2) - 1) ||
3048 (render_area.extent.height > (vf_max_y / 2) - 1);
3049 }
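/* Illustrative example (assumed limit): if rogue_get_param_vf_max_x()
 * returned 16384, any render area wider or taller than
 * (16384 / 2) - 1 = 8191 pixels would need the large clear VDM words.
 */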
3050
3051 static void pvr_emit_clear_words(struct pvr_cmd_buffer *const cmd_buffer,
3052 struct pvr_sub_cmd_gfx *const sub_cmd)
3053 {
3054 struct pvr_device *device = cmd_buffer->device;
3055 struct pvr_csb *csb = &sub_cmd->control_stream;
3056 uint32_t vdm_state_size_in_dw;
3057 const uint32_t *vdm_state;
3058 uint32_t *stream;
3059
3060 vdm_state_size_in_dw =
3061 pvr_clear_vdm_state_get_size_in_dw(&device->pdevice->dev_info, 1);
3062
3063 pvr_csb_set_relocation_mark(csb);
3064
3065 stream = pvr_csb_alloc_dwords(csb, vdm_state_size_in_dw);
3066 if (!stream) {
3067 pvr_cmd_buffer_set_error_unwarned(cmd_buffer, csb->status);
3068 return;
3069 }
3070
3071 if (pvr_is_large_clear_required(cmd_buffer))
3072 vdm_state = device->static_clear_state.large_clear_vdm_words;
3073 else
3074 vdm_state = device->static_clear_state.vdm_words;
3075
3076 memcpy(stream, vdm_state, PVR_DW_TO_BYTES(vdm_state_size_in_dw));
3077
3078 pvr_csb_clear_relocation_mark(csb);
3079 }
3080
3081 static VkResult pvr_cs_write_load_op(struct pvr_cmd_buffer *cmd_buffer,
3082 struct pvr_sub_cmd_gfx *sub_cmd,
3083 struct pvr_load_op *load_op,
3084 uint32_t isp_userpass)
3085 {
3086 const struct pvr_device *device = cmd_buffer->device;
3087 struct pvr_static_clear_ppp_template template =
3088 device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
3089 uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT];
3090 struct pvr_pds_upload shareds_update_program;
3091 struct pvr_suballoc_bo *pvr_bo;
3092 VkResult result;
3093
3094 result = pvr_load_op_data_create_and_upload(cmd_buffer,
3095 load_op,
3096 &shareds_update_program);
3097 if (result != VK_SUCCESS)
3098 return result;
3099
3100 template.config.ispctl.upass = isp_userpass;
3101
3102 /* It might look odd that we aren't specifying the code segment's
3103 * address anywhere. This is because the hardware always assumes that the
3104 * data size is 2 128-bit words and the code segment starts after that.
3105 */
3106 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
3107 TA_STATE_PDS_SHADERBASE,
3108 shaderbase) {
3109 shaderbase.addr = PVR_DEV_ADDR(load_op->pds_frag_prog.data_offset);
3110 }
3111
3112 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXUNICODEBASE],
3113 TA_STATE_PDS_TEXUNICODEBASE,
3114 texunicodebase) {
3115 texunicodebase.addr =
3116 PVR_DEV_ADDR(load_op->pds_tex_state_prog.code_offset);
3117 }
3118
3119 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO1],
3120 TA_STATE_PDS_SIZEINFO1,
3121 sizeinfo1) {
3122 /* Dummy coefficient loading program. */
3123 sizeinfo1.pds_varyingsize = 0;
3124
3125 sizeinfo1.pds_texturestatesize = DIV_ROUND_UP(
3126 shareds_update_program.data_size,
3127 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEXTURESTATESIZE_UNIT_SIZE));
3128
3129 sizeinfo1.pds_tempsize =
3130 DIV_ROUND_UP(load_op->temps_count,
3131 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
3132 }
3133
3134 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SIZEINFO2],
3135 TA_STATE_PDS_SIZEINFO2,
3136 sizeinfo2) {
3137 sizeinfo2.usc_sharedsize =
3138 DIV_ROUND_UP(load_op->const_shareds_count,
3139 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
3140 }
3141
3142 /* Dummy coefficient loading program. */
3143 pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_VARYINGBASE] = 0;
3144
3145 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_TEXTUREDATABASE],
3146 TA_STATE_PDS_TEXTUREDATABASE,
3147 texturedatabase) {
3148 texturedatabase.addr = PVR_DEV_ADDR(shareds_update_program.data_offset);
3149 }
3150
3151 template.config.pds_state = &pds_state;
3152
3153 pvr_emit_ppp_from_template(&sub_cmd->control_stream, &template, &pvr_bo);
3154 list_add(&pvr_bo->link, &cmd_buffer->bo_list);
3155
3156 pvr_emit_clear_words(cmd_buffer, sub_cmd);
3157
3158 pvr_reset_graphics_dirty_state(cmd_buffer, false);
3159
3160 return VK_SUCCESS;
3161 }
3162
3163 void pvr_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
3164 const VkRenderPassBeginInfo *pRenderPassBeginInfo,
3165 const VkSubpassBeginInfo *pSubpassBeginInfo)
3166 {
3167 PVR_FROM_HANDLE(pvr_framebuffer,
3168 framebuffer,
3169 pRenderPassBeginInfo->framebuffer);
3170 PVR_FROM_HANDLE(pvr_render_pass, pass, pRenderPassBeginInfo->renderPass);
3171 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3172 const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
3173 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
3174 VkResult result;
3175
3176 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
3177
3178 assert(!state->render_pass_info.pass);
3179 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
3180
3181 /* FIXME: Create a separate function for everything using pass->subpasses,
3182 * look at cmd_buffer_begin_subpass() for example. */
3183 state->render_pass_info.pass = pass;
3184 state->render_pass_info.framebuffer = framebuffer;
3185 state->render_pass_info.subpass_idx = 0;
3186 state->render_pass_info.render_area = pRenderPassBeginInfo->renderArea;
3187 state->render_pass_info.current_hw_subpass = 0;
3188 state->render_pass_info.pipeline_bind_point =
3189 pass->subpasses[0].pipeline_bind_point;
3190 state->render_pass_info.isp_userpass = pass->subpasses[0].isp_userpass;
3191 state->dirty.isp_userpass = true;
3192
3193 result = pvr_cmd_buffer_setup_attachments(cmd_buffer, pass, framebuffer);
3194 if (result != VK_SUCCESS)
3195 return;
3196
3197 result = pvr_init_render_targets(cmd_buffer->device, pass, framebuffer);
3198 if (result != VK_SUCCESS) {
3199 pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
3200 return;
3201 }
3202
3203 result = pvr_cmd_buffer_set_clear_values(cmd_buffer, pRenderPassBeginInfo);
3204 if (result != VK_SUCCESS)
3205 return;
3206
3207 assert(pass->subpasses[0].pipeline_bind_point ==
3208 VK_PIPELINE_BIND_POINT_GRAPHICS);
3209
3210 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3211 if (result != VK_SUCCESS)
3212 return;
3213
3214 /* Run subpass 0 "soft" background object after the actual background
3215 * object.
3216 */
3217 hw_subpass = pvr_get_hw_subpass(pass, 0);
3218 if (hw_subpass->load_op) {
3219 result = pvr_cs_write_load_op(cmd_buffer,
3220 &cmd_buffer->state.current_sub_cmd->gfx,
3221 hw_subpass->load_op,
3222 0);
3223 if (result != VK_SUCCESS)
3224 return;
3225 }
3226
3227 pvr_perform_start_of_render_clears(cmd_buffer);
3228 pvr_stash_depth_format(&cmd_buffer->state,
3229 &cmd_buffer->state.current_sub_cmd->gfx);
3230 }
3231
3232 VkResult pvr_BeginCommandBuffer(VkCommandBuffer commandBuffer,
3233 const VkCommandBufferBeginInfo *pBeginInfo)
3234 {
3235 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
3236 struct pvr_cmd_buffer_state *state;
3237 VkResult result;
3238
3239 vk_command_buffer_begin(&cmd_buffer->vk, pBeginInfo);
3240
3241 cmd_buffer->usage_flags = pBeginInfo->flags;
3242 state = &cmd_buffer->state;
3243
3244 /* VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT must be ignored for
3245 * primary level command buffers.
3246 *
3247 * From the Vulkan 1.0 spec:
3248 *
3249 * VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT specifies that a
3250 * secondary command buffer is considered to be entirely inside a render
3251 * pass. If this is a primary command buffer, then this bit is ignored.
3252 */
3253 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
3254 cmd_buffer->usage_flags &=
3255 ~VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT;
3256 }
3257
3258 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
3259 if (cmd_buffer->usage_flags &
3260 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
3261 const VkCommandBufferInheritanceInfo *inheritance_info =
3262 pBeginInfo->pInheritanceInfo;
3263 struct pvr_render_pass *pass;
3264
3265 pass = pvr_render_pass_from_handle(inheritance_info->renderPass);
3266 state->render_pass_info.pass = pass;
3267 state->render_pass_info.framebuffer =
3268 pvr_framebuffer_from_handle(inheritance_info->framebuffer);
3269 state->render_pass_info.subpass_idx = inheritance_info->subpass;
3270 state->render_pass_info.isp_userpass =
3271 pass->subpasses[inheritance_info->subpass].isp_userpass;
3272
3273 result =
3274 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
3275 if (result != VK_SUCCESS)
3276 return result;
3277
3278 state->vis_test_enabled = inheritance_info->occlusionQueryEnable;
3279 }
3280
3281 state->dirty.isp_userpass = true;
3282 }
3283
3284 util_dynarray_init(&state->query_indices, NULL);
3285
3286 memset(state->barriers_needed,
3287 0xFF,
3288 sizeof(*state->barriers_needed) * ARRAY_SIZE(state->barriers_needed));
3289
3290 return VK_SUCCESS;
3291 }
3292
3293 VkResult pvr_cmd_buffer_add_transfer_cmd(struct pvr_cmd_buffer *cmd_buffer,
3294 struct pvr_transfer_cmd *transfer_cmd)
3295 {
3296 struct pvr_sub_cmd_transfer *sub_cmd;
3297 VkResult result;
3298
3299 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_TRANSFER);
3300 if (result != VK_SUCCESS)
3301 return result;
3302
3303 sub_cmd = &cmd_buffer->state.current_sub_cmd->transfer;
3304
3305 list_addtail(&transfer_cmd->link, sub_cmd->transfer_cmds);
3306
3307 return VK_SUCCESS;
3308 }
3309
3310 static VkResult
3311 pvr_setup_vertex_buffers(struct pvr_cmd_buffer *cmd_buffer,
3312 const struct pvr_graphics_pipeline *const gfx_pipeline)
3313 {
3314 const struct pvr_vertex_shader_state *const vertex_state =
3315 &gfx_pipeline->shader_state.vertex;
3316 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
3317 const struct pvr_pds_info *const pds_info = state->pds_shader.info;
3318 struct pvr_suballoc_bo *pvr_bo;
3319 const uint8_t *entries;
3320 uint32_t *dword_buffer;
3321 uint64_t *qword_buffer;
3322 VkResult result;
3323
3324 result =
3325 pvr_cmd_buffer_alloc_mem(cmd_buffer,
3326 cmd_buffer->device->heaps.pds_heap,
3327 PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3328 &pvr_bo);
3329 if (result != VK_SUCCESS)
3330 return result;
3331
3332 dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3333 qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3334
3335 entries = (uint8_t *)pds_info->entries;
3336
3337 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3338 const struct pvr_const_map_entry *const entry_header =
3339 (struct pvr_const_map_entry *)entries;
3340
3341 switch (entry_header->type) {
3342 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3343 const struct pvr_const_map_entry_literal32 *const literal =
3344 (struct pvr_const_map_entry_literal32 *)entries;
3345
3346 PVR_WRITE(dword_buffer,
3347 literal->literal_value,
3348 literal->const_offset,
3349 pds_info->data_size_in_dwords);
3350
3351 entries += sizeof(*literal);
3352 break;
3353 }
3354
3355 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DOUTU_ADDRESS: {
3356 const struct pvr_const_map_entry_doutu_address *const doutu_addr =
3357 (struct pvr_const_map_entry_doutu_address *)entries;
3358 const pvr_dev_addr_t exec_addr =
3359 PVR_DEV_ADDR_OFFSET(vertex_state->bo->dev_addr,
3360 vertex_state->entry_offset);
3361 uint64_t addr = 0ULL;
3362
3363 pvr_set_usc_execution_address64(&addr, exec_addr.addr);
3364
3365 PVR_WRITE(qword_buffer,
3366 addr | doutu_addr->doutu_control,
3367 doutu_addr->const_offset,
3368 pds_info->data_size_in_dwords);
3369
3370 entries += sizeof(*doutu_addr);
3371 break;
3372 }
3373
3374 case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE: {
3375 const struct pvr_const_map_entry_base_instance *const base_instance =
3376 (struct pvr_const_map_entry_base_instance *)entries;
3377
3378 PVR_WRITE(dword_buffer,
3379 state->draw_state.base_instance,
3380 base_instance->const_offset,
3381 pds_info->data_size_in_dwords);
3382
3383 entries += sizeof(*base_instance);
3384 break;
3385 }
3386
3387 case PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_VERTEX: {
3388 const struct pvr_const_map_entry_base_instance *const base_instance =
3389 (struct pvr_const_map_entry_base_instance *)entries;
3390
3391 PVR_WRITE(dword_buffer,
3392 state->draw_state.base_vertex,
3393 base_instance->const_offset,
3394 pds_info->data_size_in_dwords);
3395
3396 entries += sizeof(*base_instance);
3397 break;
3398 }
3399
3400 case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_ADDRESS: {
3401 const struct pvr_const_map_entry_vertex_attribute_address
3402 *const attribute =
3403 (struct pvr_const_map_entry_vertex_attribute_address *)entries;
3404 const struct pvr_vertex_binding *const binding =
3405 &state->vertex_bindings[attribute->binding_index];
3406 /* In relation to the Vulkan spec. 22.4. Vertex Input Address
3407 * Calculation:
3408 * Adding binding->offset corresponds to calculating the
3409 * `bufferBindingAddress`. Adding attribute->offset corresponds to
3410 * adding the `attribDesc.offset`. The `effectiveVertexOffset` is
3411 * taken care of by the PDS program itself with a DDMAD, which will
3412 * multiply the vertex/instance idx with the binding's stride and
3413 * add that to the address provided here.
3414 */
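/* Worked example with made-up values: buffer dev_addr 0x1000,
 * binding->offset 0x100 and attribute->offset 8 give 0x1108 below; for
 * vertex index 3 with a stride of 16 the PDS DDMAD then adds 3 * 16 = 48
 * at draw time, yielding 0x1138.
 */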
3415 const pvr_dev_addr_t addr =
3416 PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3417 binding->offset + attribute->offset);
3418
3419 PVR_WRITE(qword_buffer,
3420 addr.addr,
3421 attribute->const_offset,
3422 pds_info->data_size_in_dwords);
3423
3424 entries += sizeof(*attribute);
3425 break;
3426 }
3427
3428 case PVR_PDS_CONST_MAP_ENTRY_TYPE_ROBUST_VERTEX_ATTRIBUTE_ADDRESS: {
3429 const struct pvr_const_map_entry_robust_vertex_attribute_address
3430 *const attribute =
3431 (struct pvr_const_map_entry_robust_vertex_attribute_address *)
3432 entries;
3433 const struct pvr_vertex_binding *const binding =
3434 &state->vertex_bindings[attribute->binding_index];
3435 pvr_dev_addr_t addr;
3436
3437 if (binding->buffer->vk.size <
3438 (attribute->offset + attribute->component_size_in_bytes)) {
3439 /* Replace with a load from the robustness buffer when the attribute is
3440 * out of range.
3441 */
3442 addr = PVR_DEV_ADDR_OFFSET(
3443 cmd_buffer->device->robustness_buffer->vma->dev_addr,
3444 attribute->robustness_buffer_offset);
3445 } else {
3446 addr = PVR_DEV_ADDR_OFFSET(binding->buffer->dev_addr,
3447 binding->offset + attribute->offset);
3448 }
3449
3450 PVR_WRITE(qword_buffer,
3451 addr.addr,
3452 attribute->const_offset,
3453 pds_info->data_size_in_dwords);
3454
3455 entries += sizeof(*attribute);
3456 break;
3457 }
3458
3459 case PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX: {
3460 const struct pvr_const_map_entry_vertex_attribute_max_index *attribute =
3461 (struct pvr_const_map_entry_vertex_attribute_max_index *)entries;
3462 const struct pvr_vertex_binding *const binding =
3463 &state->vertex_bindings[attribute->binding_index];
3464 const uint64_t bound_size = binding->buffer->vk.size - binding->offset;
3465 const uint32_t attribute_end =
3466 attribute->offset + attribute->component_size_in_bytes;
3467 uint32_t max_index;
3468
3469 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
3470 pds_ddmadt)) {
3471 /* TODO: PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTRIBUTE_MAX_INDEX
3472 * has the same define value as
3473 * PVR_PDS_CONST_MAP_ENTRY_TYPE_VERTEX_ATTR_DDMADT_OOB_BUFFER_SIZE
3474 * so maybe we want to remove one of the defines or change the
3475 * values.
3476 */
3477 pvr_finishme("Unimplemented robust buffer access with DDMADT");
3478 assert(false);
3479 }
3480
3481 /* If the stride is 0 then all attributes use the same single element
3482 * from the binding, so the maximum index is 0.
3483 */
3484 if (bound_size < attribute_end || attribute->stride == 0) {
3485 max_index = 0;
3486 } else {
3487 max_index = (uint32_t)(bound_size / attribute->stride) - 1;
3488
3489 /* One more attribute still fits in the remaining bytes. */
3490 if (bound_size % attribute->stride >= attribute_end)
3491 max_index++;
3492 }
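/* Worked example with made-up values: bound_size = 100, stride = 16 and
 * attribute_end = 12 give 100 / 16 - 1 = 5, and the remainder 100 % 16 = 4
 * is too small for another attribute, so max_index stays 5. With
 * bound_size = 108 the remainder 12 fits one more and max_index becomes 6.
 */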
3493
3494 PVR_WRITE(dword_buffer,
3495 max_index,
3496 attribute->const_offset,
3497 pds_info->data_size_in_dwords);
3498
3499 entries += sizeof(*attribute);
3500 break;
3501 }
3502
3503 default:
3504 unreachable("Unsupported data section map");
3505 break;
3506 }
3507 }
3508
3509 state->pds_vertex_attrib_offset =
3510 pvr_bo->dev_addr.addr -
3511 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3512
3513 return VK_SUCCESS;
3514 }
3515
3516 static VkResult pvr_setup_descriptor_mappings_old(
3517 struct pvr_cmd_buffer *const cmd_buffer,
3518 enum pvr_stage_allocation stage,
3519 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
3520 const pvr_dev_addr_t *const num_worgroups_buff_addr,
3521 uint32_t *const descriptor_data_offset_out)
3522 {
3523 const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
3524 const struct pvr_descriptor_state *desc_state;
3525 struct pvr_suballoc_bo *pvr_bo;
3526 const uint8_t *entries;
3527 uint32_t *dword_buffer;
3528 uint64_t *qword_buffer;
3529 VkResult result;
3530
3531 if (!pds_info->data_size_in_dwords)
3532 return VK_SUCCESS;
3533
3534 result =
3535 pvr_cmd_buffer_alloc_mem(cmd_buffer,
3536 cmd_buffer->device->heaps.pds_heap,
3537 PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
3538 &pvr_bo);
3539 if (result != VK_SUCCESS)
3540 return result;
3541
3542 dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3543 qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
3544
3545 entries = (uint8_t *)pds_info->entries;
3546
3547 switch (stage) {
3548 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3549 case PVR_STAGE_ALLOCATION_FRAGMENT:
3550 desc_state = &cmd_buffer->state.gfx_desc_state;
3551 break;
3552
3553 case PVR_STAGE_ALLOCATION_COMPUTE:
3554 desc_state = &cmd_buffer->state.compute_desc_state;
3555 break;
3556
3557 default:
3558 unreachable("Unsupported stage.");
3559 break;
3560 }
3561
3562 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
3563 const struct pvr_const_map_entry *const entry_header =
3564 (struct pvr_const_map_entry *)entries;
3565
3566 switch (entry_header->type) {
3567 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
3568 const struct pvr_const_map_entry_literal32 *const literal =
3569 (struct pvr_const_map_entry_literal32 *)entries;
3570
3571 PVR_WRITE(dword_buffer,
3572 literal->literal_value,
3573 literal->const_offset,
3574 pds_info->data_size_in_dwords);
3575
3576 entries += sizeof(*literal);
3577 break;
3578 }
3579
3580 case PVR_PDS_CONST_MAP_ENTRY_TYPE_CONSTANT_BUFFER: {
3581 const struct pvr_const_map_entry_constant_buffer *const_buffer_entry =
3582 (struct pvr_const_map_entry_constant_buffer *)entries;
3583 const uint32_t desc_set = const_buffer_entry->desc_set;
3584 const uint32_t binding = const_buffer_entry->binding;
3585 const struct pvr_descriptor_set *descriptor_set;
3586 const struct pvr_descriptor *descriptor;
3587 pvr_dev_addr_t buffer_addr;
3588
3589 assert(desc_set < PVR_MAX_DESCRIPTOR_SETS);
3590 descriptor_set = desc_state->descriptor_sets[desc_set];
3591
3592 /* TODO: Handle dynamic buffers. */
3593 descriptor = &descriptor_set->descriptors[binding];
3594 assert(descriptor->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER);
3595
3596 assert(descriptor->buffer_desc_range ==
3597 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3598 assert(descriptor->buffer_whole_range ==
3599 PVR_DW_TO_BYTES(const_buffer_entry->size_in_dwords));
3600
3601 buffer_addr =
3602 PVR_DEV_ADDR_OFFSET(descriptor->buffer_dev_addr,
3603 const_buffer_entry->offset * sizeof(uint32_t));
3604
3605 PVR_WRITE(qword_buffer,
3606 buffer_addr.addr,
3607 const_buffer_entry->const_offset,
3608 pds_info->data_size_in_dwords);
3609
3610 entries += sizeof(*const_buffer_entry);
3611 break;
3612 }
3613
3614 case PVR_PDS_CONST_MAP_ENTRY_TYPE_DESCRIPTOR_SET: {
3615 const struct pvr_const_map_entry_descriptor_set *desc_set_entry =
3616 (struct pvr_const_map_entry_descriptor_set *)entries;
3617 const uint32_t desc_set_num = desc_set_entry->descriptor_set;
3618 const struct pvr_descriptor_set *descriptor_set;
3619 pvr_dev_addr_t desc_set_addr;
3620 uint64_t desc_portion_offset;
3621
3622 assert(desc_set_num < PVR_MAX_DESCRIPTOR_SETS);
3623
3624 /* TODO: Remove this when the compiler provides us with usage info?
3625 */
3626 /* We skip DMAing unbound descriptor sets. */
3627 if (!(desc_state->valid_mask & BITFIELD_BIT(desc_set_num))) {
3628 const struct pvr_const_map_entry_literal32 *literal;
3629 uint32_t zero_literal_value;
3630
3631 /* The code segment contains a DOUT instruction, so in the data
3632 * section we have to write a DOUTD_SRC0 and a DOUTD_SRC1.
3633 * We write 0 for DOUTD_SRC0 since we don't have a buffer to DMA.
3634 * We're expecting a LITERAL32 entry containing the value for
3635 * DOUTD_SRC1 next, so make sure we get it and write it with
3636 * BSIZE set to 0, disabling the DMA operation.
3637 * We don't want the LITERAL32 to be processed as normal, otherwise
3638 * we'd be DMAing from an address of 0.
3639 */
3640
3641 entries += sizeof(*desc_set_entry);
3642 literal = (struct pvr_const_map_entry_literal32 *)entries;
3643
3644 assert(literal->type == PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32);
3645
3646 zero_literal_value =
3647 literal->literal_value &
3648 PVR_ROGUE_PDSINST_DOUT_FIELDS_DOUTD_SRC1_BSIZE_CLRMSK;
3649
3650 PVR_WRITE(qword_buffer,
3651 UINT64_C(0),
3652 desc_set_entry->const_offset,
3653 pds_info->data_size_in_dwords);
3654
3655 PVR_WRITE(dword_buffer,
3656 zero_literal_value,
3657 desc_set_entry->const_offset,
3658 pds_info->data_size_in_dwords);
3659
3660 entries += sizeof(*literal);
3661 i++;
3662 continue;
3663 }
3664
3665 descriptor_set = desc_state->descriptor_sets[desc_set_num];
3666
3667 desc_set_addr = descriptor_set->pvr_bo->dev_addr;
3668
3669 if (desc_set_entry->primary) {
3670 desc_portion_offset =
3671 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3672 .primary_offset;
3673 } else {
3674 desc_portion_offset =
3675 descriptor_set->layout->memory_layout_in_dwords_per_stage[stage]
3676 .secondary_offset;
3677 }
3678 desc_portion_offset = PVR_DW_TO_BYTES(desc_portion_offset);
3679
3680 desc_set_addr =
3681 PVR_DEV_ADDR_OFFSET(desc_set_addr, desc_portion_offset);
3682
3683 desc_set_addr = PVR_DEV_ADDR_OFFSET(
3684 desc_set_addr,
3685 PVR_DW_TO_BYTES((uint64_t)desc_set_entry->offset_in_dwords));
3686
3687 PVR_WRITE(qword_buffer,
3688 desc_set_addr.addr,
3689 desc_set_entry->const_offset,
3690 pds_info->data_size_in_dwords);
3691
3692 entries += sizeof(*desc_set_entry);
3693 break;
3694 }
3695
3696 case PVR_PDS_CONST_MAP_ENTRY_TYPE_SPECIAL_BUFFER: {
3697 const struct pvr_const_map_entry_special_buffer *special_buff_entry =
3698 (struct pvr_const_map_entry_special_buffer *)entries;
3699
3700 switch (special_buff_entry->buffer_type) {
3701 case PVR_BUFFER_TYPE_COMPILE_TIME: {
3702 uint64_t addr = descriptor_state->static_consts->dev_addr.addr;
3703
3704 PVR_WRITE(qword_buffer,
3705 addr,
3706 special_buff_entry->const_offset,
3707 pds_info->data_size_in_dwords);
3708 break;
3709 }
3710
3711 case PVR_BUFFER_TYPE_BLEND_CONSTS:
3712 /* TODO: See if instead of reusing the blend constant buffer type
3713 * entry, we can setup a new buffer type specifically for
3714 * num_workgroups or other built-in variables. The mappings are
3715 * setup at pipeline creation when creating the descriptor program.
3716 */
3717 if (stage == PVR_STAGE_ALLOCATION_COMPUTE) {
3718 assert(num_worgroups_buff_addr->addr);
3719
3720 /* TODO: Check if we need to offset this (e.g. for just y and z),
3721 * or cope with any reordering?
3722 */
3723 PVR_WRITE(qword_buffer,
3724 num_worgroups_buff_addr->addr,
3725 special_buff_entry->const_offset,
3726 pds_info->data_size_in_dwords);
3727 } else {
3728 pvr_finishme("Add blend constants support.");
3729 }
3730 break;
3731
3732 default:
3733 unreachable("Unsupported special buffer type.");
3734 }
3735
3736 entries += sizeof(*special_buff_entry);
3737 break;
3738 }
3739
3740 default:
3741 unreachable("Unsupported map entry type.");
3742 }
3743 }
3744
3745 *descriptor_data_offset_out =
3746 pvr_bo->dev_addr.addr -
3747 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
3748
3749 return VK_SUCCESS;
3750 }
3751
3752 /* Note that the descriptor set doesn't have any space for dynamic buffer
3753 * descriptors so this works on the assumption that you have a buffer with space
3754 * for them at the end.
3755 */
3756 static uint16_t pvr_get_dynamic_descriptor_primary_offset(
3757 const struct pvr_device *device,
3758 const struct pvr_descriptor_set_layout *layout,
3759 const struct pvr_descriptor_set_layout_binding *binding,
3760 const uint32_t stage,
3761 const uint32_t desc_idx)
3762 {
3763 struct pvr_descriptor_size_info size_info;
3764 uint32_t offset;
3765
3766 assert(binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3767 binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC);
3768 assert(desc_idx < binding->descriptor_count);
3769
3770 pvr_descriptor_size_info_init(device, binding->type, &size_info);
3771
3772 offset = layout->total_size_in_dwords;
3773 offset += binding->per_stage_offset_in_dwords[stage].primary;
3774 offset += (desc_idx * size_info.primary);
3775
3776 /* Offset must fit in 16 bits. */
3777 assert(offset < UINT16_MAX);
3778
3779 return (uint16_t)offset;
3780 }
3781
3782 /* Note that the descriptor set doesn't have any space for dynamic buffer
3783 * descriptors so this works on the assumption that you have a buffer with space
3784 * for them at the end.
3785 */
3786 static uint16_t pvr_get_dynamic_descriptor_secondary_offset(
3787 const struct pvr_device *device,
3788 const struct pvr_descriptor_set_layout *layout,
3789 const struct pvr_descriptor_set_layout_binding *binding,
3790 const uint32_t stage,
3791 const uint32_t desc_idx)
3792 {
3793 struct pvr_descriptor_size_info size_info;
3794 uint32_t offset;
3795
3796 assert(binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC ||
3797 binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC);
3798 assert(desc_idx < binding->descriptor_count);
3799
3800 pvr_descriptor_size_info_init(device, binding->type, &size_info);
3801
3802 offset = layout->total_size_in_dwords;
3803 offset +=
3804 layout->memory_layout_in_dwords_per_stage[stage].primary_dynamic_size;
3805 offset += binding->per_stage_offset_in_dwords[stage].secondary;
3806 offset += (desc_idx * size_info.secondary);
3807
3808 /* Offset must fit in 16 bits. */
3809 assert(offset < UINT16_MAX);
3810
3811 return (uint16_t)offset;
3812 }
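/* Illustrative layout with made-up sizes: if layout->total_size_in_dwords is
 * 64, the binding's per-stage primary offset is 4 dwords and
 * size_info.primary is 2, then dynamic descriptor desc_idx = 2 gets a
 * primary offset of 64 + 4 + 2 * 2 = 72 dwords. The secondary offsets are
 * laid out the same way, shifted by the stage's primary_dynamic_size.
 */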
3813
3814 /**
3815 * \brief Upload a copy of the descriptor set with dynamic buffer offsets
3816 * applied.
3817 */
3818 /* TODO: We should probably make the compiler aware of the dynamic descriptors.
3819 * We could use push constants like Anv seems to do. This would avoid having to
3820 * duplicate all sets containing dynamic descriptors each time the offsets are
3821 * updated.
3822 */
3823 static VkResult pvr_cmd_buffer_upload_patched_desc_set(
3824 struct pvr_cmd_buffer *cmd_buffer,
3825 const struct pvr_descriptor_set *desc_set,
3826 const uint32_t *dynamic_offsets,
3827 struct pvr_suballoc_bo **const bo_out)
3828 {
3829 const struct pvr_descriptor_set_layout *layout = desc_set->layout;
3830 const uint64_t normal_desc_set_size =
3831 PVR_DW_TO_BYTES(layout->total_size_in_dwords);
3832 const uint64_t dynamic_descs_size =
3833 PVR_DW_TO_BYTES(layout->total_dynamic_size_in_dwords);
3834 struct pvr_descriptor_size_info dynamic_uniform_buffer_size_info;
3835 struct pvr_descriptor_size_info dynamic_storage_buffer_size_info;
3836 struct pvr_device *device = cmd_buffer->device;
3837 struct pvr_suballoc_bo *patched_desc_set_bo;
3838 uint32_t *src_mem_ptr, *dst_mem_ptr;
3839 uint32_t desc_idx_offset = 0;
3840 VkResult result;
3841
3842 assert(desc_set->layout->dynamic_buffer_count > 0);
3843
3844 pvr_descriptor_size_info_init(device,
3845 VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC,
3846 &dynamic_uniform_buffer_size_info);
3847 pvr_descriptor_size_info_init(device,
3848 VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC,
3849 &dynamic_storage_buffer_size_info);
3850
3851 /* TODO: In the descriptor set we don't account for dynamic buffer
3852 * descriptors and take care of them in the pipeline layout. The pipeline
3853 * layout allocates them at the beginning but let's put them at the end just
3854 * because it makes things a bit easier. Ideally we should be using the
3855 * pipeline layout and use the offsets from the pipeline layout to patch
3856 * descriptors.
3857 */
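/* The resulting buffer layout is therefore, illustratively:
 * [ patched copy of the set : total_size_in_dwords ]
 * [ dynamic buffer descriptors : total_dynamic_size_in_dwords ]
 */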
3858 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
3859 cmd_buffer->device->heaps.general_heap,
3860 normal_desc_set_size + dynamic_descs_size,
3861 &patched_desc_set_bo);
3862 if (result != VK_SUCCESS)
3863 return result;
3864
3865 src_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(desc_set->pvr_bo);
3866 dst_mem_ptr = (uint32_t *)pvr_bo_suballoc_get_map_addr(patched_desc_set_bo);
3867
3868 memcpy(dst_mem_ptr, src_mem_ptr, normal_desc_set_size);
3869
3870 for (uint32_t i = 0; i < desc_set->layout->binding_count; i++) {
3871 const struct pvr_descriptor_set_layout_binding *binding =
3872 &desc_set->layout->bindings[i];
3873 const struct pvr_descriptor *descriptors =
3874 &desc_set->descriptors[binding->descriptor_index];
3875 const struct pvr_descriptor_size_info *size_info;
3876
3877 if (binding->type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC)
3878 size_info = &dynamic_uniform_buffer_size_info;
3879 else if (binding->type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC)
3880 size_info = &dynamic_storage_buffer_size_info;
3881 else
3882 continue;
3883
3884 for (uint32_t stage = 0; stage < PVR_STAGE_ALLOCATION_COUNT; stage++) {
3885 uint32_t primary_offset;
3886 uint32_t secondary_offset;
3887
3888 if (!(binding->shader_stage_mask & BITFIELD_BIT(stage)))
3889 continue;
3890
3891 /* Get the offsets for the first dynamic descriptor in the current
3892 * binding.
3893 */
3894 primary_offset =
3895 pvr_get_dynamic_descriptor_primary_offset(device,
3896 desc_set->layout,
3897 binding,
3898 stage,
3899 0);
3900 secondary_offset =
3901 pvr_get_dynamic_descriptor_secondary_offset(device,
3902 desc_set->layout,
3903 binding,
3904 stage,
3905 0);
3906
3907 /* clang-format off */
3908 for (uint32_t desc_idx = 0;
3909 desc_idx < binding->descriptor_count;
3910 desc_idx++) {
3911 /* clang-format on */
3912 const pvr_dev_addr_t addr =
3913 PVR_DEV_ADDR_OFFSET(descriptors[desc_idx].buffer_dev_addr,
3914 dynamic_offsets[desc_idx + desc_idx_offset]);
3915 const VkDeviceSize range =
3916 MIN2(descriptors[desc_idx].buffer_desc_range,
3917 descriptors[desc_idx].buffer_whole_range -
3918 dynamic_offsets[desc_idx]);
3919
3920 #if MESA_DEBUG
3921 uint32_t desc_primary_offset;
3922 uint32_t desc_secondary_offset;
3923
3924 desc_primary_offset =
3925 pvr_get_dynamic_descriptor_primary_offset(device,
3926 desc_set->layout,
3927 binding,
3928 stage,
3929 desc_idx);
3930 desc_secondary_offset =
3931 pvr_get_dynamic_descriptor_secondary_offset(device,
3932 desc_set->layout,
3933 binding,
3934 stage,
3935 desc_idx);
3936
3937 /* Check the assumption that the descriptors within a binding, for
3938 * a particular stage, are allocated consecutively.
3939 */
3940 assert(desc_primary_offset ==
3941 primary_offset + size_info->primary * desc_idx);
3942 assert(desc_secondary_offset ==
3943 secondary_offset + size_info->secondary * desc_idx);
3944 #endif
3945
3946 assert(descriptors[desc_idx].type == binding->type);
3947
3948 memcpy(dst_mem_ptr + primary_offset + size_info->primary * desc_idx,
3949 &addr.addr,
3950 PVR_DW_TO_BYTES(size_info->primary));
3951 memcpy(dst_mem_ptr + secondary_offset +
3952 size_info->secondary * desc_idx,
3953 &range,
3954 PVR_DW_TO_BYTES(size_info->secondary));
3955 }
3956 }
3957
3958 desc_idx_offset += binding->descriptor_count;
3959 }
3960
3961 *bo_out = patched_desc_set_bo;
3962
3963 return VK_SUCCESS;
3964 }
3965
3966 #define PVR_SELECT(_geom, _frag, _compute) \
3967 (stage == PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY) \
3968 ? (_geom) \
3969 : (stage == PVR_STAGE_ALLOCATION_FRAGMENT) ? (_frag) : (_compute)
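/* For example, PVR_SELECT(a, b, c) evaluates to a for
 * PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY, to b for
 * PVR_STAGE_ALLOCATION_FRAGMENT, and to c otherwise (i.e. compute). It
 * relies on a local variable named `stage` being in scope at the use site.
 */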
3970
3971 static VkResult
3972 pvr_cmd_buffer_upload_desc_set_table(struct pvr_cmd_buffer *const cmd_buffer,
3973 enum pvr_stage_allocation stage,
3974 pvr_dev_addr_t *addr_out)
3975 {
3976 uint64_t bound_desc_sets[PVR_MAX_DESCRIPTOR_SETS];
3977 const struct pvr_descriptor_state *desc_state;
3978 struct pvr_suballoc_bo *suballoc_bo;
3979 uint32_t dynamic_offset_idx = 0;
3980 VkResult result;
3981
3982 switch (stage) {
3983 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
3984 case PVR_STAGE_ALLOCATION_FRAGMENT:
3985 case PVR_STAGE_ALLOCATION_COMPUTE:
3986 break;
3987
3988 default:
3989 unreachable("Unsupported stage.");
3990 break;
3991 }
3992
3993 desc_state = PVR_SELECT(&cmd_buffer->state.gfx_desc_state,
3994 &cmd_buffer->state.gfx_desc_state,
3995 &cmd_buffer->state.compute_desc_state);
3996
3997 for (uint32_t set = 0; set < ARRAY_SIZE(bound_desc_sets); set++)
3998 bound_desc_sets[set] = ~0;
3999
4000 assert(util_last_bit(desc_state->valid_mask) <= ARRAY_SIZE(bound_desc_sets));
4001 for (uint32_t set = 0; set < util_last_bit(desc_state->valid_mask); set++) {
4002 const struct pvr_descriptor_set *desc_set;
4003
4004 if (!(desc_state->valid_mask & BITFIELD_BIT(set))) {
4005 const struct pvr_pipeline_layout *pipeline_layout =
4006 PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4007 cmd_buffer->state.gfx_pipeline->base.layout,
4008 cmd_buffer->state.compute_pipeline->base.layout);
4009 const struct pvr_descriptor_set_layout *set_layout;
4010
4011 assert(set <= pipeline_layout->set_count);
4012
4013 set_layout = pipeline_layout->set_layout[set];
4014 dynamic_offset_idx += set_layout->dynamic_buffer_count;
4015
4016 continue;
4017 }
4018
4019 desc_set = desc_state->descriptor_sets[set];
4020
4021 /* TODO: Is it better if we don't set the valid_mask for empty sets? */
4022 if (desc_set->layout->descriptor_count == 0)
4023 continue;
4024
4025 if (desc_set->layout->dynamic_buffer_count > 0) {
4026 struct pvr_suballoc_bo *new_desc_set_bo;
4027
4028 assert(dynamic_offset_idx + desc_set->layout->dynamic_buffer_count <=
4029 ARRAY_SIZE(desc_state->dynamic_offsets));
4030
4031 result = pvr_cmd_buffer_upload_patched_desc_set(
4032 cmd_buffer,
4033 desc_set,
4034 &desc_state->dynamic_offsets[dynamic_offset_idx],
4035 &new_desc_set_bo);
4036 if (result != VK_SUCCESS)
4037 return result;
4038
4039 dynamic_offset_idx += desc_set->layout->dynamic_buffer_count;
4040
4041 bound_desc_sets[set] = new_desc_set_bo->dev_addr.addr;
4042 } else {
4043 bound_desc_sets[set] = desc_set->pvr_bo->dev_addr.addr;
4044 }
4045 }
4046
4047 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4048 bound_desc_sets,
4049 sizeof(bound_desc_sets),
4050 &suballoc_bo);
4051 if (result != VK_SUCCESS)
4052 return result;
4053
4054 *addr_out = suballoc_bo->dev_addr;
4055 return VK_SUCCESS;
4056 }
4057
4058 static VkResult
4059 pvr_process_addr_literal(struct pvr_cmd_buffer *cmd_buffer,
4060 enum pvr_pds_addr_literal_type addr_literal_type,
4061 enum pvr_stage_allocation stage,
4062 pvr_dev_addr_t *addr_out)
4063 {
4064 VkResult result;
4065
4066 switch (addr_literal_type) {
4067 case PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE: {
4068 /* TODO: Maybe we want to free pvr_bo? And only link all the BOs to
4069 * the command buffer once the data section has been written
4070 * successfully.
4071 */
4072 result =
4073 pvr_cmd_buffer_upload_desc_set_table(cmd_buffer, stage, addr_out);
4074 if (result != VK_SUCCESS)
4075 return result;
4076
4077 break;
4078 }
4079
4080 case PVR_PDS_ADDR_LITERAL_PUSH_CONSTS: {
4081 const struct pvr_pipeline_layout *layout =
4082 PVR_SELECT(cmd_buffer->state.gfx_pipeline->base.layout,
4083 cmd_buffer->state.gfx_pipeline->base.layout,
4084 cmd_buffer->state.compute_pipeline->base.layout);
4085 const uint32_t push_constants_offset =
4086 PVR_SELECT(layout->vert_push_constants_offset,
4087 layout->frag_push_constants_offset,
4088 layout->compute_push_constants_offset);
4089
4090 *addr_out = PVR_DEV_ADDR_OFFSET(cmd_buffer->state.push_constants.dev_addr,
4091 push_constants_offset);
4092 break;
4093 }
4094
4095 case PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS: {
4096 float *blend_consts =
4097 cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants;
4098 size_t size =
4099 sizeof(cmd_buffer->vk.dynamic_graphics_state.cb.blend_constants);
4100 struct pvr_suballoc_bo *blend_consts_bo;
4101
4102 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4103 blend_consts,
4104 size,
4105 &blend_consts_bo);
4106 if (result != VK_SUCCESS)
4107 return result;
4108
4109 *addr_out = blend_consts_bo->dev_addr;
4110
4111 break;
4112 }
4113
4114 default:
4115 unreachable("Invalid add literal type.");
4116 }
4117
4118 return VK_SUCCESS;
4119 }
4120
4121 #undef PVR_SELECT
4122
4123 static VkResult pvr_setup_descriptor_mappings_new(
4124 struct pvr_cmd_buffer *const cmd_buffer,
4125 enum pvr_stage_allocation stage,
4126 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4127 uint32_t *const descriptor_data_offset_out)
4128 {
4129 const struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
4130 struct pvr_suballoc_bo *pvr_bo;
4131 const uint8_t *entries;
4132 uint32_t *dword_buffer;
4133 uint64_t *qword_buffer;
4134 VkResult result;
4135
4136 if (!pds_info->data_size_in_dwords)
4137 return VK_SUCCESS;
4138
4139 result =
4140 pvr_cmd_buffer_alloc_mem(cmd_buffer,
4141 cmd_buffer->device->heaps.pds_heap,
4142 PVR_DW_TO_BYTES(pds_info->data_size_in_dwords),
4143 &pvr_bo);
4144 if (result != VK_SUCCESS)
4145 return result;
4146
4147 dword_buffer = (uint32_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4148 qword_buffer = (uint64_t *)pvr_bo_suballoc_get_map_addr(pvr_bo);
4149
4150 entries = (uint8_t *)pds_info->entries;
4151
4152 switch (stage) {
4153 case PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY:
4154 case PVR_STAGE_ALLOCATION_FRAGMENT:
4155 case PVR_STAGE_ALLOCATION_COMPUTE:
4156 break;
4157
4158 default:
4159 unreachable("Unsupported stage.");
4160 break;
4161 }
4162
4163 for (uint32_t i = 0; i < pds_info->entry_count; i++) {
4164 const struct pvr_const_map_entry *const entry_header =
4165 (struct pvr_const_map_entry *)entries;
4166
4167 switch (entry_header->type) {
4168 case PVR_PDS_CONST_MAP_ENTRY_TYPE_LITERAL32: {
4169 const struct pvr_const_map_entry_literal32 *const literal =
4170 (struct pvr_const_map_entry_literal32 *)entries;
4171
4172 PVR_WRITE(dword_buffer,
4173 literal->literal_value,
4174 literal->const_offset,
4175 pds_info->data_size_in_dwords);
4176
4177 entries += sizeof(*literal);
4178 break;
4179 }
4180
4181 case PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL_BUFFER: {
4182 const struct pvr_pds_const_map_entry_addr_literal_buffer
4183 *const addr_literal_buffer_entry =
4184 (struct pvr_pds_const_map_entry_addr_literal_buffer *)entries;
4185 struct pvr_device *device = cmd_buffer->device;
4186 struct pvr_suballoc_bo *addr_literal_buffer_bo;
4187 uint32_t addr_literal_count = 0;
4188 uint64_t *addr_literal_buffer;
4189
4190 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
4191 device->heaps.general_heap,
4192 addr_literal_buffer_entry->size,
4193 &addr_literal_buffer_bo);
4194 if (result != VK_SUCCESS)
4195 return result;
4196
4197 addr_literal_buffer =
4198 (uint64_t *)pvr_bo_suballoc_get_map_addr(addr_literal_buffer_bo);
4199
4200 entries += sizeof(*addr_literal_buffer_entry);
4201
4202 PVR_WRITE(qword_buffer,
4203 addr_literal_buffer_bo->dev_addr.addr,
4204 addr_literal_buffer_entry->const_offset,
4205 pds_info->data_size_in_dwords);
4206
4207 for (uint32_t j = i + 1; j < pds_info->entry_count; j++) {
4208 const struct pvr_const_map_entry *const entry_header =
4209 (struct pvr_const_map_entry *)entries;
4210 const struct pvr_pds_const_map_entry_addr_literal *addr_literal;
4211 pvr_dev_addr_t dev_addr;
4212
4213 if (entry_header->type != PVR_PDS_CONST_MAP_ENTRY_TYPE_ADDR_LITERAL)
4214 break;
4215
4216 addr_literal =
4217 (struct pvr_pds_const_map_entry_addr_literal *)entries;
4218
4219 result = pvr_process_addr_literal(cmd_buffer,
4220 addr_literal->addr_type,
4221 stage,
4222 &dev_addr);
4223 if (result != VK_SUCCESS)
4224 return result;
4225
4226 addr_literal_buffer[addr_literal_count++] = dev_addr.addr;
4227
4228 entries += sizeof(*addr_literal);
4229 }
4230
4231 assert(addr_literal_count * sizeof(uint64_t) ==
4232 addr_literal_buffer_entry->size);
4233
4234 i += addr_literal_count;
4235
4236 break;
4237 }
4238
4239 default:
4240 unreachable("Unsupported map entry type.");
4241 }
4242 }
4243
4244 *descriptor_data_offset_out =
4245 pvr_bo->dev_addr.addr -
4246 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
4247
4248 return VK_SUCCESS;
4249 }
4250
4251 static VkResult pvr_setup_descriptor_mappings(
4252 struct pvr_cmd_buffer *const cmd_buffer,
4253 enum pvr_stage_allocation stage,
4254 const struct pvr_stage_allocation_descriptor_state *descriptor_state,
4255 const pvr_dev_addr_t *const num_worgroups_buff_addr,
4256 uint32_t *const descriptor_data_offset_out)
4257 {
4258 const bool old_path =
4259 pvr_has_hard_coded_shaders(&cmd_buffer->device->pdevice->dev_info);
4260
4261 if (old_path) {
4262 return pvr_setup_descriptor_mappings_old(cmd_buffer,
4263 stage,
4264 descriptor_state,
4265 num_worgroups_buff_addr,
4266 descriptor_data_offset_out);
4267 }
4268
4269 return pvr_setup_descriptor_mappings_new(cmd_buffer,
4270 stage,
4271 descriptor_state,
4272 descriptor_data_offset_out);
4273 }
4274
4275 static void pvr_compute_update_shared(struct pvr_cmd_buffer *cmd_buffer,
4276 struct pvr_sub_cmd_compute *const sub_cmd)
4277 {
4278 const struct pvr_device *device = cmd_buffer->device;
4279 const struct pvr_physical_device *pdevice = device->pdevice;
4280 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4281 struct pvr_csb *csb = &sub_cmd->control_stream;
4282 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4283 const uint32_t const_shared_regs =
4284 pipeline->shader_state.const_shared_reg_count;
4285 struct pvr_compute_kernel_info info;
4286
4287 /* No shared regs, no need to use an allocation kernel. */
4288 if (!const_shared_regs)
4289 return;
4290
4291 /* Accumulate the MAX number of shared registers across the kernels in this
4292 * dispatch. This is used by the FW for context switching, so must be large
4293 * enough to contain all the shared registers that might be in use for this
4294 * compute job. Coefficients don't need to be included as the context switch
4295 * will not happen within the execution of a single workgroup, thus nothing
4296 * needs to be preserved.
4297 */
4298 state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4299
4300 info = (struct pvr_compute_kernel_info){
4301 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4302 .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4303
4304 .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
4305 .usc_common_shared = true,
4306 .usc_common_size =
4307 DIV_ROUND_UP(const_shared_regs,
4308 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
4309
4310 .local_size = { 1, 1, 1 },
4311 .global_size = { 1, 1, 1 },
4312 };
4313
4314 /* Sometimes we don't have a secondary program if there were no constants to
4315 * write, but we still need to run a PDS program to accomplish the
4316 * allocation of the local/common store shared registers. Use the
4317 * pre-uploaded empty PDS program in this instance.
4318 */
4319 if (pipeline->descriptor_state.pds_info.code_size_in_dwords) {
4320 uint32_t pds_data_size_in_dwords =
4321 pipeline->descriptor_state.pds_info.data_size_in_dwords;
4322
4323 info.pds_data_offset = state->pds_compute_descriptor_data_offset;
4324 info.pds_data_size =
4325 DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_data_size_in_dwords),
4326 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
4327
4328 /* Check that we have uploaded the code section. */
4329 assert(pipeline->descriptor_state.pds_code.code_size);
4330 info.pds_code_offset = pipeline->descriptor_state.pds_code.code_offset;
4331 } else {
4332 const struct pvr_pds_upload *program = &device->pds_compute_empty_program;
4333
4334 info.pds_data_offset = program->data_offset;
4335 info.pds_data_size =
4336 DIV_ROUND_UP(PVR_DW_TO_BYTES(program->data_size),
4337 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE));
4338 info.pds_code_offset = program->code_offset;
4339 }
4340
4341 /* We don't need to pad the workgroup size. */
4342
4343 info.max_instances =
4344 pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4345
4346 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4347 }
4348
4349 void pvr_compute_update_shared_private(
4350 struct pvr_cmd_buffer *cmd_buffer,
4351 struct pvr_sub_cmd_compute *const sub_cmd,
4352 struct pvr_private_compute_pipeline *pipeline)
4353 {
4354 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4355 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4356 const uint32_t const_shared_regs = pipeline->const_shared_regs_count;
4357 struct pvr_csb *csb = &sub_cmd->control_stream;
4358 struct pvr_compute_kernel_info info;
4359
4360 /* No shared regs, no need to use an allocation kernel. */
4361 if (!const_shared_regs)
4362 return;
4363
4364 /* See comment in pvr_compute_update_shared() for details on this. */
4365 state->max_shared_regs = MAX2(state->max_shared_regs, const_shared_regs);
4366
4367 info = (struct pvr_compute_kernel_info){
4368 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4369 .usc_common_size =
4370 DIV_ROUND_UP(const_shared_regs,
4371 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE)),
4372 .pds_data_size =
4373 DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_shared_update_data_size_dw),
4374 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4375 .usc_target = PVRX(CDMCTRL_USC_TARGET_ALL),
4376 .pds_data_offset = pipeline->pds_shared_update_data_offset,
4377 .pds_code_offset = pipeline->pds_shared_update_code_offset,
4378 .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4379 .usc_common_shared = true,
4380 .local_size = { 1, 1, 1 },
4381 .global_size = { 1, 1, 1 },
4382 };
4383
4384 /* We don't need to pad the workgroup size. */
4385
4386 info.max_instances =
4387 pvr_compute_flat_slot_size(pdevice, const_shared_regs, false, 1U);
4388
4389 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4390 }
4391
4392 static uint32_t
4393 pvr_compute_flat_pad_workgroup_size(const struct pvr_physical_device *pdevice,
4394 uint32_t workgroup_size,
4395 uint32_t coeff_regs_count)
4396 {
4397 const struct pvr_device_runtime_info *dev_runtime_info =
4398 &pdevice->dev_runtime_info;
4399 const struct pvr_device_info *dev_info = &pdevice->dev_info;
4400 uint32_t max_avail_coeff_regs =
4401 dev_runtime_info->cdm_max_local_mem_size_regs;
4402 uint32_t coeff_regs_count_aligned =
4403 ALIGN_POT(coeff_regs_count,
4404 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE) >> 2U);
4405
4406 /* If the workgroup size is greater than ROGUE_MAX_INSTANCES_PER_TASK we
4407 * *always* pad the workgroup size to the next multiple of
4408 * ROGUE_MAX_INSTANCES_PER_TASK.
4409 *
4410 * Likewise, if we use more than 1/8th of the max coefficient registers we
4411 * round the workgroup size up to the next multiple of ROGUE_MAX_INSTANCES_PER_TASK.
4412 */
4413 /* TODO: See if this can be optimized. */
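/* For illustration only, assuming ROGUE_MAX_INSTANCES_PER_TASK is 32: a
 * workgroup size of 50 would be padded to ALIGN_POT(50, 32) = 64, while a
 * size of 24 that also stays within the coefficient register budget is
 * returned unchanged.
 */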
4414 if (workgroup_size > ROGUE_MAX_INSTANCES_PER_TASK ||
4415 coeff_regs_count_aligned > (max_avail_coeff_regs / 8)) {
4416 assert(workgroup_size < rogue_get_compute_max_work_group_size(dev_info));
4417
4418 return ALIGN_POT(workgroup_size, ROGUE_MAX_INSTANCES_PER_TASK);
4419 }
4420
4421 return workgroup_size;
4422 }
4423
4424 void pvr_compute_update_kernel_private(
4425 struct pvr_cmd_buffer *cmd_buffer,
4426 struct pvr_sub_cmd_compute *const sub_cmd,
4427 struct pvr_private_compute_pipeline *pipeline,
4428 const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4429 {
4430 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4431 const struct pvr_device_runtime_info *dev_runtime_info =
4432 &pdevice->dev_runtime_info;
4433 struct pvr_csb *csb = &sub_cmd->control_stream;
4434
4435 struct pvr_compute_kernel_info info = {
4436 .indirect_buffer_addr = PVR_DEV_ADDR_INVALID,
4437 .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
4438 .pds_temp_size =
4439 DIV_ROUND_UP(pipeline->pds_temps_used << 2U,
4440 PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
4441
4442 .pds_data_size =
4443 DIV_ROUND_UP(PVR_DW_TO_BYTES(pipeline->pds_data_size_dw),
4444 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4445 .pds_data_offset = pipeline->pds_data_offset,
4446 .pds_code_offset = pipeline->pds_code_offset,
4447
4448 .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4449
4450 .usc_unified_size =
4451 DIV_ROUND_UP(pipeline->unified_store_regs_count << 2U,
4452 PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
4453
4454 /* clang-format off */
4455 .global_size = {
4456 global_workgroup_size[0],
4457 global_workgroup_size[1],
4458 global_workgroup_size[2]
4459 },
4460 /* clang-format on */
4461 };
4462
4463 uint32_t work_size = pipeline->workgroup_size.width *
4464 pipeline->workgroup_size.height *
4465 pipeline->workgroup_size.depth;
4466 uint32_t coeff_regs;
4467
4468 if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4469 /* Enforce a single workgroup per cluster through allocation starvation.
4470 */
4471 coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4472 } else {
4473 coeff_regs = pipeline->coeff_regs_count;
4474 }
4475
4476 info.usc_common_size =
4477 DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4478 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
4479
4480 /* Use a whole slot per workgroup. */
4481 work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4482
4483 coeff_regs += pipeline->const_shared_regs_count;
4484
4485 if (pipeline->const_shared_regs_count > 0)
4486 info.sd_type = PVRX(CDMCTRL_SD_TYPE_USC);
4487
4488 work_size =
4489 pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4490
4491 info.local_size[0] = work_size;
4492 info.local_size[1] = 1U;
4493 info.local_size[2] = 1U;
4494
4495 info.max_instances =
4496 pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4497
4498 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4499 }
4500
4501 /* TODO: Wire up the base_workgroup variant program when implementing
4502 * VK_KHR_device_group. The values will also need patching into the program.
4503 */
4504 static void pvr_compute_update_kernel(
4505 struct pvr_cmd_buffer *cmd_buffer,
4506 struct pvr_sub_cmd_compute *const sub_cmd,
4507 pvr_dev_addr_t indirect_addr,
4508 const uint32_t global_workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4509 {
4510 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
4511 const struct pvr_device_runtime_info *dev_runtime_info =
4512 &pdevice->dev_runtime_info;
4513 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4514 struct pvr_csb *csb = &sub_cmd->control_stream;
4515 const struct pvr_compute_pipeline *pipeline = state->compute_pipeline;
4516 const struct pvr_compute_shader_state *shader_state =
4517 &pipeline->shader_state;
4518 const struct pvr_pds_info *program_info = &pipeline->primary_program_info;
4519
4520 struct pvr_compute_kernel_info info = {
4521 .indirect_buffer_addr = indirect_addr,
4522 .usc_target = PVRX(CDMCTRL_USC_TARGET_ANY),
4523 .pds_temp_size =
4524 DIV_ROUND_UP(program_info->temps_required << 2U,
4525 PVRX(CDMCTRL_KERNEL0_PDS_TEMP_SIZE_UNIT_SIZE)),
4526
4527 .pds_data_size =
4528 DIV_ROUND_UP(PVR_DW_TO_BYTES(program_info->data_size_in_dwords),
4529 PVRX(CDMCTRL_KERNEL0_PDS_DATA_SIZE_UNIT_SIZE)),
4530 .pds_data_offset = pipeline->primary_program.data_offset,
4531 .pds_code_offset = pipeline->primary_program.code_offset,
4532
4533 .sd_type = PVRX(CDMCTRL_SD_TYPE_NONE),
4534
4535 .usc_unified_size =
4536 DIV_ROUND_UP(shader_state->input_register_count << 2U,
4537 PVRX(CDMCTRL_KERNEL0_USC_UNIFIED_SIZE_UNIT_SIZE)),
4538
4539 /* clang-format off */
4540 .global_size = {
4541 global_workgroup_size[0],
4542 global_workgroup_size[1],
4543 global_workgroup_size[2]
4544 },
4545 /* clang-format on */
4546 };
4547
4548 uint32_t work_size = shader_state->work_size;
4549 uint32_t coeff_regs;
4550
4551 if (work_size > ROGUE_MAX_INSTANCES_PER_TASK) {
4552 /* Enforce a single workgroup per cluster through allocation starvation.
4553 */
4554 coeff_regs = dev_runtime_info->cdm_max_local_mem_size_regs;
4555 } else {
4556 coeff_regs = shader_state->coefficient_register_count;
4557 }
4558
4559 info.usc_common_size =
4560 DIV_ROUND_UP(PVR_DW_TO_BYTES(coeff_regs),
4561 PVRX(CDMCTRL_KERNEL0_USC_COMMON_SIZE_UNIT_SIZE));
4562
4563 /* Use a whole slot per workgroup. */
4564 work_size = MAX2(work_size, ROGUE_MAX_INSTANCES_PER_TASK);
4565
4566 coeff_regs += shader_state->const_shared_reg_count;
4567
4568 if (shader_state->const_shared_reg_count > 0)
4569 info.sd_type = PVRX(CDMCTRL_SD_TYPE_USC);
4570
4571 work_size =
4572 pvr_compute_flat_pad_workgroup_size(pdevice, work_size, coeff_regs);
4573
4574 info.local_size[0] = work_size;
4575 info.local_size[1] = 1U;
4576 info.local_size[2] = 1U;
4577
4578 info.max_instances =
4579 pvr_compute_flat_slot_size(pdevice, coeff_regs, false, work_size);
4580
4581 pvr_compute_generate_control_stream(csb, sub_cmd, &info);
4582 }
4583
4584 static VkResult pvr_cmd_upload_push_consts(struct pvr_cmd_buffer *cmd_buffer)
4585 {
4586 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4587 struct pvr_suballoc_bo *suballoc_bo;
4588 VkResult result;
4589
4590 /* TODO: Here are some possible optimizations/things to consider:
4591 *
4592 * - Currently we upload maxPushConstantsSize. The application might only
4593 * be using a portion of that so we might end up with unused memory.
4594 * Should we be smarter about this? If we intend to upload the push
4595 * consts into shareds, we definitely want to avoid reserving unused
4596 * regs.
4597 *
4598 * - For now we have to upload to a new buffer each time since the shaders
4599 * access the push constants from memory. If we were to reuse the same
4600 * buffer we might update the contents out of sync with job submission
4601 * and the shaders will see the updated contents while the command
4602 * buffer was still being recorded and not yet submitted.
4603 * If we were to upload the push constants directly to shared regs we
4604 * could reuse the same buffer (avoiding extra allocation overhead)
4605 * since the contents will be DMAed only on job submission when the
4606 * control stream is processed and the PDS program is executed. This
4607 * approach would also allow us to avoid regenerating the PDS data
4608 * section in some cases since the buffer address will be constant.
4609 */
4610
4611 if (cmd_buffer->state.push_constants.uploaded)
4612 return VK_SUCCESS;
4613
4614 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4615 state->push_constants.data,
4616 sizeof(state->push_constants.data),
4617 &suballoc_bo);
4618 if (result != VK_SUCCESS)
4619 return result;
4620
4621 cmd_buffer->state.push_constants.dev_addr = suballoc_bo->dev_addr;
4622 cmd_buffer->state.push_constants.uploaded = true;
4623
4624 return VK_SUCCESS;
4625 }
4626
4627 static void pvr_cmd_dispatch(
4628 struct pvr_cmd_buffer *const cmd_buffer,
4629 const pvr_dev_addr_t indirect_addr,
4630 const uint32_t workgroup_size[static const PVR_WORKGROUP_DIMENSIONS])
4631 {
4632 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
4633 const struct pvr_compute_pipeline *compute_pipeline =
4634 state->compute_pipeline;
4635 struct pvr_sub_cmd_compute *sub_cmd;
4636 VkResult result;
4637
4638 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_COMPUTE);
4639
4640 sub_cmd = &state->current_sub_cmd->compute;
4641 sub_cmd->uses_atomic_ops |= compute_pipeline->shader_state.uses_atomic_ops;
4642 sub_cmd->uses_barrier |= compute_pipeline->shader_state.uses_barrier;
4643
4644 if (state->push_constants.dirty_stages & VK_SHADER_STAGE_COMPUTE_BIT) {
4645 result = pvr_cmd_upload_push_consts(cmd_buffer);
4646 if (result != VK_SUCCESS)
4647 return;
4648
4649 /* Regenerate the PDS program to use the new push consts buffer. */
4650 state->dirty.compute_desc_dirty = true;
4651
4652 state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4653 }
4654
4655 if (compute_pipeline->shader_state.uses_num_workgroups) {
4656 pvr_dev_addr_t descriptor_data_offset_out;
4657
4658 if (indirect_addr.addr) {
4659 descriptor_data_offset_out = indirect_addr;
4660 } else {
4661 struct pvr_suballoc_bo *num_workgroups_bo;
4662
4663 result = pvr_cmd_buffer_upload_general(cmd_buffer,
4664 workgroup_size,
4665 sizeof(*workgroup_size) *
4666 PVR_WORKGROUP_DIMENSIONS,
4667 &num_workgroups_bo);
4668 if (result != VK_SUCCESS)
4669 return;
4670
4671 descriptor_data_offset_out = num_workgroups_bo->dev_addr;
4672 }
4673
4674 result = pvr_setup_descriptor_mappings(
4675 cmd_buffer,
4676 PVR_STAGE_ALLOCATION_COMPUTE,
4677 &compute_pipeline->descriptor_state,
4678 &descriptor_data_offset_out,
4679 &state->pds_compute_descriptor_data_offset);
4680 if (result != VK_SUCCESS)
4681 return;
4682 } else if ((compute_pipeline->base.layout
4683 ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_COMPUTE] &&
4684 state->dirty.compute_desc_dirty) ||
4685 state->dirty.compute_pipeline_binding) {
4686 result = pvr_setup_descriptor_mappings(
4687 cmd_buffer,
4688 PVR_STAGE_ALLOCATION_COMPUTE,
4689 &compute_pipeline->descriptor_state,
4690 NULL,
4691 &state->pds_compute_descriptor_data_offset);
4692 if (result != VK_SUCCESS)
4693 return;
4694 }
4695
4696 pvr_compute_update_shared(cmd_buffer, sub_cmd);
4697 pvr_compute_update_kernel(cmd_buffer, sub_cmd, indirect_addr, workgroup_size);
4698 }
4699
4700 void pvr_CmdDispatch(VkCommandBuffer commandBuffer,
4701 uint32_t groupCountX,
4702 uint32_t groupCountY,
4703 uint32_t groupCountZ)
4704 {
4705 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4706
4707 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4708
4709 if (!groupCountX || !groupCountY || !groupCountZ)
4710 return;
4711
4712 pvr_cmd_dispatch(cmd_buffer,
4713 PVR_DEV_ADDR_INVALID,
4714 (uint32_t[]){ groupCountX, groupCountY, groupCountZ });
4715 }
4716
4717 void pvr_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4718 VkBuffer _buffer,
4719 VkDeviceSize offset)
4720 {
4721 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
4722 PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
4723
4724 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
4725
4726 pvr_cmd_dispatch(cmd_buffer,
4727 PVR_DEV_ADDR_OFFSET(buffer->dev_addr, offset),
4728 (uint32_t[]){ 1, 1, 1 });
4729 }
4730
4731 static void
4732 pvr_update_draw_state(struct pvr_cmd_buffer_state *const state,
4733 const struct pvr_cmd_buffer_draw_state *const draw_state)
4734 {
4735 /* We don't have a state to tell us that base_instance is being used so it
4736 * gets used as a boolean - 0 means we'll use a pds program that skips the
4737 * base instance addition. If the base_instance gets used (and the last
4738 * draw's base_instance was 0) then we switch to the BASE_INSTANCE attrib
4739 * program.
4740 *
4741 * If base_instance changes then we only need to update the data section.
4742 *
4743 * The only draw call state that doesn't really matter is the start vertex
4744 * as that is handled properly in the VDM state in all cases.
4745 */
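/* Added example: a draw with base_instance 0 followed by one with
 * base_instance 3 switches to the BASE_INSTANCE attrib program
 * (draw_variant is marked dirty); a further draw with base_instance 5
 * only needs the PDS data section refreshed (draw_base_instance is
 * marked dirty).
 */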
4746 if ((state->draw_state.draw_indexed != draw_state->draw_indexed) ||
4747 (state->draw_state.draw_indirect != draw_state->draw_indirect) ||
4748 (state->draw_state.base_instance == 0 &&
4749 draw_state->base_instance != 0)) {
4750 state->dirty.draw_variant = true;
4751 } else if (state->draw_state.base_instance != draw_state->base_instance) {
4752 state->dirty.draw_base_instance = true;
4753 }
4754
4755 state->draw_state = *draw_state;
4756 }
4757
4758 static uint32_t pvr_calc_shared_regs_count(
4759 const struct pvr_graphics_pipeline *const gfx_pipeline)
4760 {
4761 const struct pvr_pipeline_stage_state *const vertex_state =
4762 &gfx_pipeline->shader_state.vertex.stage_state;
4763
4764 uint32_t shared_regs = vertex_state->const_shared_reg_count +
4765 vertex_state->const_shared_reg_offset;
4766
4767 if (gfx_pipeline->shader_state.fragment.bo) {
4768 const struct pvr_pipeline_stage_state *const fragment_state =
4769 &gfx_pipeline->shader_state.fragment.stage_state;
4770
4771 uint32_t fragment_regs = fragment_state->const_shared_reg_count +
4772 fragment_state->const_shared_reg_offset;
4773
4774 shared_regs = MAX2(shared_regs, fragment_regs);
4775 }
4776
4777 return shared_regs;
4778 }
4779
4780 static void
4781 pvr_emit_dirty_pds_state(const struct pvr_cmd_buffer *const cmd_buffer,
4782 struct pvr_sub_cmd_gfx *const sub_cmd,
4783 const uint32_t pds_vertex_descriptor_data_offset)
4784 {
4785 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
4786 const struct pvr_stage_allocation_descriptor_state
4787 *const vertex_descriptor_state =
4788 &state->gfx_pipeline->shader_state.vertex.descriptor_state;
4789 const struct pvr_pipeline_stage_state *const vertex_stage_state =
4790 &state->gfx_pipeline->shader_state.vertex.stage_state;
4791 struct pvr_csb *const csb = &sub_cmd->control_stream;
4792
4793 if (!vertex_descriptor_state->pds_info.code_size_in_dwords)
4794 return;
4795
4796 pvr_csb_set_relocation_mark(csb);
4797
4798 pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
4799 state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ALL);
4800
4801 state0.usc_common_size =
4802 DIV_ROUND_UP(vertex_stage_state->const_shared_reg_count << 2,
4803 PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
4804
4805 state0.pds_data_size = DIV_ROUND_UP(
4806 PVR_DW_TO_BYTES(vertex_descriptor_state->pds_info.data_size_in_dwords),
4807 PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
4808 }
4809
4810 pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
4811 state1.pds_data_addr = PVR_DEV_ADDR(pds_vertex_descriptor_data_offset);
4812 state1.sd_type = PVRX(VDMCTRL_SD_TYPE_NONE);
4813 }
4814
4815 pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
4816 state2.pds_code_addr =
4817 PVR_DEV_ADDR(vertex_descriptor_state->pds_code.code_offset);
4818 }
4819
4820 pvr_csb_clear_relocation_mark(csb);
4821 }
4822
4823 static void pvr_setup_output_select(struct pvr_cmd_buffer *const cmd_buffer)
4824 {
4825 const struct pvr_graphics_pipeline *const gfx_pipeline =
4826 cmd_buffer->state.gfx_pipeline;
4827 const struct pvr_vertex_shader_state *const vertex_state =
4828 &gfx_pipeline->shader_state.vertex;
4829 struct vk_dynamic_graphics_state *const dynamic_state =
4830 &cmd_buffer->vk.dynamic_graphics_state;
4831 struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
4832 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4833 uint32_t output_selects;
4834
4835 /* TODO: Handle vertex and fragment shader state flags. */
4836
4837 pvr_csb_pack (&output_selects, TA_OUTPUT_SEL, state) {
4838 state.rhw_pres = true;
4839 state.vtxsize = DIV_ROUND_UP(vertex_state->vertex_output_size, 4U);
4840 state.psprite_size_pres = (dynamic_state->ia.primitive_topology ==
4841 VK_PRIMITIVE_TOPOLOGY_POINT_LIST);
4842 }
4843
4844 if (ppp_state->output_selects != output_selects) {
4845 ppp_state->output_selects = output_selects;
4846 header->pres_outselects = true;
4847 }
4848
4849 if (ppp_state->varying_word[0] != vertex_state->varying[0]) {
4850 ppp_state->varying_word[0] = vertex_state->varying[0];
4851 header->pres_varying_word0 = true;
4852 }
4853
4854 if (ppp_state->varying_word[1] != vertex_state->varying[1]) {
4855 ppp_state->varying_word[1] = vertex_state->varying[1];
4856 header->pres_varying_word1 = true;
4857 }
4858 }
4859
4860 static void
4861 pvr_setup_isp_faces_and_control(struct pvr_cmd_buffer *const cmd_buffer,
4862 struct PVRX(TA_STATE_ISPA) *const ispa_out)
4863 {
4864 struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
4865 const struct pvr_fragment_shader_state *const fragment_shader_state =
4866 &cmd_buffer->state.gfx_pipeline->shader_state.fragment;
4867 const struct pvr_render_pass_info *const pass_info =
4868 &cmd_buffer->state.render_pass_info;
4869 struct vk_dynamic_graphics_state *dynamic_state =
4870 &cmd_buffer->vk.dynamic_graphics_state;
4871 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
4872
4873 const bool rasterizer_discard = dynamic_state->rs.rasterizer_discard_enable;
4874 const uint32_t subpass_idx = pass_info->subpass_idx;
4875 const uint32_t depth_stencil_attachment_idx =
4876 pass_info->pass->subpasses[subpass_idx].depth_stencil_attachment;
4877 const struct pvr_render_pass_attachment *const attachment =
4878 depth_stencil_attachment_idx != VK_ATTACHMENT_UNUSED
4879 ? &pass_info->pass->attachments[depth_stencil_attachment_idx]
4880 : NULL;
4881
4882 const enum PVRX(TA_OBJTYPE)
4883 obj_type = pvr_ta_objtype(dynamic_state->ia.primitive_topology);
4884
4885 const VkImageAspectFlags ds_aspects =
4886 (!rasterizer_discard && attachment)
4887 ? vk_format_aspects(attachment->vk_format) &
4888 (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)
4889 : VK_IMAGE_ASPECT_NONE;
4890
4891 /* This is deliberately a full copy rather than a pointer because
4892 * vk_optimize_depth_stencil_state() can only be run once against any given
4893 * instance of vk_depth_stencil_state.
4894 */
4895 struct vk_depth_stencil_state ds_state = dynamic_state->ds;
4896
4897 uint32_t ispb_stencil_off;
4898 bool is_two_sided = false;
4899 uint32_t isp_control;
4900
4901 uint32_t line_width;
4902 uint32_t common_a;
4903 uint32_t front_a;
4904 uint32_t front_b;
4905 uint32_t back_a;
4906 uint32_t back_b;
4907
4908 vk_optimize_depth_stencil_state(&ds_state, ds_aspects, true);
4909
4910 /* Convert to 4.4 fixed point format. */
4911 line_width = util_unsigned_fixed(dynamic_state->rs.line.width, 4);
4912
4913 /* Subtract 1 to shift values from range [0=0,256=16] to [0=1/16,255=16].
4914 * If 0 it stays at 0, otherwise we subtract 1.
4915 */
4916 line_width = (!!line_width) * (line_width - 1);
4917
4918 line_width = MIN2(line_width, PVRX(TA_STATE_ISPA_POINTLINEWIDTH_SIZE_MAX));
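/* Added worked example: a 1.0 pixel width becomes 16 in 4.4 fixed point,
 * then 15 after the subtraction, which the [0=1/16,255=16] encoding maps
 * back to (15 + 1) / 16 = 1.0. A width of 0 stays 0.
 */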
4919
4920 /* TODO: Part of the logic in this function is duplicated in another part
4921 * of the code. E.g. the dcmpmode, and sop1/2/3. Could we do this earlier?
4922 */
4923
4924 pvr_csb_pack (&common_a, TA_STATE_ISPA, ispa) {
4925 ispa.pointlinewidth = line_width;
4926
4927 ispa.dcmpmode = pvr_ta_cmpmode(ds_state.depth.compare_op);
4928 ispa.dwritedisable = !ds_state.depth.write_enable;
4929
4930 ispa.passtype = fragment_shader_state->pass_type;
4931
4932 ispa.objtype = obj_type;
4933
4934 /* Return unpacked ispa structure. dcmpmode, dwritedisable, passtype and
4935 * objtype are needed by pvr_setup_triangle_merging_flag.
4936 */
4937 if (ispa_out)
4938 *ispa_out = ispa;
4939 }
4940
4941 /* TODO: Does this actually represent the ispb control word on stencil off?
4942 * If not, rename the variable.
4943 */
4944 pvr_csb_pack (&ispb_stencil_off, TA_STATE_ISPB, ispb) {
4945 ispb.sop3 = PVRX(TA_ISPB_STENCILOP_KEEP);
4946 ispb.sop2 = PVRX(TA_ISPB_STENCILOP_KEEP);
4947 ispb.sop1 = PVRX(TA_ISPB_STENCILOP_KEEP);
4948 ispb.scmpmode = PVRX(TA_CMPMODE_ALWAYS);
4949 }
4950
4951 /* FIXME: This logic should be redone and improved. Can we also get rid of
4952 * the front and back variants?
4953 */
4954
4955 front_a = common_a;
4956 back_a = common_a;
4957
4958 if (ds_state.stencil.test_enable) {
4959 uint32_t front_a_sref;
4960 uint32_t back_a_sref;
4961
4962 pvr_csb_pack (&front_a_sref, TA_STATE_ISPA, ispa) {
4963 ispa.sref = ds_state.stencil.front.reference;
4964 }
4965 front_a |= front_a_sref;
4966
4967 pvr_csb_pack (&back_a_sref, TA_STATE_ISPA, ispa) {
4968 ispa.sref = ds_state.stencil.back.reference;
4969 }
4970 back_a |= back_a_sref;
4971
4972 pvr_csb_pack (&front_b, TA_STATE_ISPB, ispb) {
4973 const struct vk_stencil_test_face_state *const front =
4974 &ds_state.stencil.front;
4975
4976 if (ds_state.stencil.write_enable)
4977 ispb.swmask = front->write_mask;
4978
4979 ispb.scmpmask = front->compare_mask;
4980
4981 ispb.sop3 = pvr_ta_stencilop(front->op.pass);
4982 ispb.sop2 = pvr_ta_stencilop(front->op.depth_fail);
4983 ispb.sop1 = pvr_ta_stencilop(front->op.fail);
4984 ispb.scmpmode = pvr_ta_cmpmode(front->op.compare);
4985 }
4986
4987 pvr_csb_pack (&back_b, TA_STATE_ISPB, ispb) {
4988 const struct vk_stencil_test_face_state *const back =
4989 &ds_state.stencil.back;
4990
4991 if (ds_state.stencil.write_enable)
4992 ispb.swmask = back->write_mask;
4993
4994 ispb.scmpmask = back->compare_mask;
4995
4996 ispb.sop3 = pvr_ta_stencilop(back->op.pass);
4997 ispb.sop2 = pvr_ta_stencilop(back->op.depth_fail);
4998 ispb.sop1 = pvr_ta_stencilop(back->op.fail);
4999 ispb.scmpmode = pvr_ta_cmpmode(back->op.compare);
5000 }
5001 } else {
5002 front_b = ispb_stencil_off;
5003 back_b = ispb_stencil_off;
5004 }
5005
5006 if (front_a != back_a || front_b != back_b) {
5007 if (dynamic_state->rs.cull_mode & VK_CULL_MODE_BACK_BIT) {
5008 /* Single face, using front state. */
5009 } else if (dynamic_state->rs.cull_mode & VK_CULL_MODE_FRONT_BIT) {
5010 /* Single face, using back state. */
5011
5012 front_a = back_a;
5013 front_b = back_b;
5014 } else {
5015 /* Both faces. */
5016
5017 header->pres_ispctl_ba = is_two_sided = true;
5018
5019 if (dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) {
5020 uint32_t tmp = front_a;
5021
5022 front_a = back_a;
5023 back_a = tmp;
5024
5025 tmp = front_b;
5026 front_b = back_b;
5027 back_b = tmp;
5028 }
5029
5030 /* HW defaults to stencil off. */
5031 if (back_b != ispb_stencil_off) {
5032 header->pres_ispctl_fb = true;
5033 header->pres_ispctl_bb = true;
5034 }
5035 }
5036 }
5037
5038 if (ds_state.stencil.test_enable && front_b != ispb_stencil_off)
5039 header->pres_ispctl_fb = true;
5040
5041 pvr_csb_pack (&isp_control, TA_STATE_ISPCTL, ispctl) {
5042 ispctl.upass = pass_info->isp_userpass;
5043
5044 /* TODO: is bo ever NULL? Figure out what to do. */
5045 ispctl.tagwritedisable = rasterizer_discard || !fragment_shader_state->bo;
5046
5047 ispctl.two_sided = is_two_sided;
5048 ispctl.bpres = header->pres_ispctl_fb || header->pres_ispctl_bb;
5049
5050 ispctl.dbenable = !rasterizer_discard &&
5051 dynamic_state->rs.depth_bias.enable &&
5052 obj_type == PVRX(TA_OBJTYPE_TRIANGLE);
5053 if (!rasterizer_discard && cmd_buffer->state.vis_test_enabled) {
5054 ispctl.vistest = true;
5055 ispctl.visreg = cmd_buffer->state.vis_reg;
5056 }
5057
5058 ispctl.scenable = !rasterizer_discard;
5059
5060 ppp_state->isp.control_struct = ispctl;
5061 }
5062
5063 header->pres_ispctl = true;
5064
5065 ppp_state->isp.control = isp_control;
5066 ppp_state->isp.front_a = front_a;
5067 ppp_state->isp.front_b = front_b;
5068 ppp_state->isp.back_a = back_a;
5069 ppp_state->isp.back_b = back_b;
5070 }
5071
5072 static float
5073 pvr_calculate_final_depth_bias_contant_factor(struct pvr_device_info *dev_info,
5074 VkFormat format,
5075 float depth_bias)
5076 {
5077 /* Information for future modifiers of these depth bias calculations.
5078 * ==================================================================
5079 * Specified depth bias equations scale the specified constant factor by a
5080 * value 'r' that is guaranteed to cause a resolvable difference in depth
5081 * across the entire range of depth values.
5082 * For floating point depth formats 'r' is calculated by taking the maximum
5083 * exponent across the triangle.
5084 * For UNORM formats 'r' is constant.
5085 * Here 'n' is the number of mantissa bits stored in the floating point
5086 * representation (23 for F32).
5087 *
5088 * UNORM Format -> z += dbcf * r + slope
5089 * FLOAT Format -> z += dbcf * 2^(e-n) + slope
5090 *
5091 * HW Variations.
5092 * ==============
5093 * The HW either always performs the F32 depth bias equation (exponent based
5094 * r), or in the case of HW that correctly supports the integer depth bias
5095 * equation for UNORM depth formats, we can select between both equations
5096 * using the ROGUE_CR_ISP_CTL.dbias_is_int flag - this is required to
5097 * correctly perform Vulkan UNORM depth bias (constant r).
5098 *
5099 * if ern42307:
5100 * if DBIAS_IS_INT_EN:
5101 * z += dbcf + slope
5102 * else:
5103 * z += dbcf * 2^(e-n) + slope
5104 * else:
5105 * z += dbcf * 2^(e-n) + slope
5106 *
5107 */
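/* Added worked example (derived from the switch below): with ERN 42307
 * and VK_FORMAT_D16_UNORM a constant factor of 4.0 is returned as
 * 4.0 / (1 << 15), while float depth formats return the factor unchanged
 * and rely on the hardware's 2^(e-n) scaling.
 */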
5108
5109 float nudge_factor;
5110
5111 if (PVR_HAS_ERN(dev_info, 42307)) {
5112 switch (format) {
5113 case VK_FORMAT_D16_UNORM:
5114 return depth_bias / (1 << 15);
5115
5116 case VK_FORMAT_D24_UNORM_S8_UINT:
5117 case VK_FORMAT_X8_D24_UNORM_PACK32:
5118 return depth_bias / (1 << 23);
5119
5120 default:
5121 return depth_bias;
5122 }
5123 }
5124
5125 /* We clamp/nudge the value here because UNORM depth formats can have
5126 * higher precision than our underlying D32F representation for some
5127 * depth ranges.
5128 *
5129 * When the HW scales the depth bias value by 2^(e-n) [the 'r' term], a
5130 * depth bias of 1 can result in a value smaller than one F32 ULP, which
5131 * will get quantized to 0 - resulting in no bias.
5132 *
5133 * Nudging small values away from zero ensures that a small depth bias of
5134 * 1 still yields a result and overcomes Z-fighting.
5135 */
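/* Added worked example: for VK_FORMAT_D16_UNORM a bias of 0.5 scales to
 * 256.0 and needs no nudge, while a bias of 0.001 scales to 0.512 and is
 * nudged up to 1.512 so it survives the later 2^(e-n) scaling.
 */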
5136 switch (format) {
5137 case VK_FORMAT_D16_UNORM:
5138 depth_bias *= 512.0f;
5139 nudge_factor = 1.0f;
5140 break;
5141
5142 case VK_FORMAT_D24_UNORM_S8_UINT:
5143 case VK_FORMAT_X8_D24_UNORM_PACK32:
5144 depth_bias *= 2.0f;
5145 nudge_factor = 2.0f;
5146 break;
5147
5148 default:
5149 nudge_factor = 0.0f;
5150 break;
5151 }
5152
5153 if (nudge_factor != 0.0f) {
5154 if (depth_bias < 0.0f && depth_bias > -nudge_factor)
5155 depth_bias -= nudge_factor;
5156 else if (depth_bias > 0.0f && depth_bias < nudge_factor)
5157 depth_bias += nudge_factor;
5158 }
5159
5160 return depth_bias;
5161 }
5162
5163 static void pvr_get_viewport_scissor_overlap(const VkViewport *const viewport,
5164 const VkRect2D *const scissor,
5165 VkRect2D *const rect_out)
5166 {
5167 /* TODO: See if we can remove this struct. */
5168 struct pvr_rect {
5169 int32_t x0, y0;
5170 int32_t x1, y1;
5171 };
5172
5173 /* TODO: Worry about overflow? */
5174 const struct pvr_rect scissor_rect = {
5175 .x0 = scissor->offset.x,
5176 .y0 = scissor->offset.y,
5177 .x1 = scissor->offset.x + scissor->extent.width,
5178 .y1 = scissor->offset.y + scissor->extent.height
5179 };
5180 struct pvr_rect viewport_rect = { 0 };
5181
5182 assert(viewport->width >= 0.0f);
5183 assert(scissor_rect.x0 >= 0);
5184 assert(scissor_rect.y0 >= 0);
5185
5186 if (scissor->extent.width == 0 || scissor->extent.height == 0) {
5187 *rect_out = (VkRect2D){ 0 };
5188 return;
5189 }
5190
5191 viewport_rect.x0 = (int32_t)viewport->x;
5192 viewport_rect.x1 = (int32_t)viewport->x + (int32_t)viewport->width;
5193
5194 /* TODO: Is there a mathematical way of doing all this and then clamping
5195 * at the end?
5196 */
5197 /* We flip the y0 and y1 when height is negative. */
5198 viewport_rect.y0 = (int32_t)viewport->y + MIN2(0, (int32_t)viewport->height);
5199 viewport_rect.y1 = (int32_t)viewport->y + MAX2(0, (int32_t)viewport->height);
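/* Added example: viewport y = 100 with height = -60 gives
 * y0 = 100 + MIN2(0, -60) = 40 and y1 = 100 + MAX2(0, -60) = 100,
 * i.e. the same span as a +60 height viewport placed at y = 40.
 */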
5200
5201 if (scissor_rect.x1 <= viewport_rect.x0 ||
5202 scissor_rect.y1 <= viewport_rect.y0 ||
5203 scissor_rect.x0 >= viewport_rect.x1 ||
5204 scissor_rect.y0 >= viewport_rect.y1) {
5205 *rect_out = (VkRect2D){ 0 };
5206 return;
5207 }
5208
5209 /* Determine the overlapping rectangle. */
5210 viewport_rect.x0 = MAX2(viewport_rect.x0, scissor_rect.x0);
5211 viewport_rect.y0 = MAX2(viewport_rect.y0, scissor_rect.y0);
5212 viewport_rect.x1 = MIN2(viewport_rect.x1, scissor_rect.x1);
5213 viewport_rect.y1 = MIN2(viewport_rect.y1, scissor_rect.y1);
5214
5215 /* TODO: Is this conversion safe? Is this logic right? */
5216 rect_out->offset.x = (uint32_t)viewport_rect.x0;
5217 rect_out->offset.y = (uint32_t)viewport_rect.y0;
5218 rect_out->extent.height = (uint32_t)(viewport_rect.y1 - viewport_rect.y0);
5219 rect_out->extent.width = (uint32_t)(viewport_rect.x1 - viewport_rect.x0);
5220 }
5221
5222 static inline uint32_t
5223 pvr_get_geom_region_clip_align_size(struct pvr_device_info *const dev_info)
5224 {
5225 /* TODO: This should come from rogue_ppp.xml. */
5226 return 16U + 16U * (!PVR_HAS_FEATURE(dev_info, tile_size_16x16));
5227 }
5228
5229 static void
5230 pvr_setup_isp_depth_bias_scissor_state(struct pvr_cmd_buffer *const cmd_buffer)
5231 {
5232 struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
5233 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5234 struct vk_dynamic_graphics_state *const dynamic_state =
5235 &cmd_buffer->vk.dynamic_graphics_state;
5236 const struct PVRX(TA_STATE_ISPCTL) *const ispctl =
5237 &ppp_state->isp.control_struct;
5238 struct pvr_device_info *const dev_info =
5239 &cmd_buffer->device->pdevice->dev_info;
5240
5241 if (ispctl->dbenable &&
5242 (BITSET_TEST(dynamic_state->dirty,
5243 MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5244 cmd_buffer->depth_bias_array.size == 0)) {
5245 struct pvr_depth_bias_state depth_bias = {
5246 .constant_factor = pvr_calculate_final_depth_bias_contant_factor(
5247 dev_info,
5248 cmd_buffer->state.depth_format,
5249 dynamic_state->rs.depth_bias.constant),
5250 .slope_factor = dynamic_state->rs.depth_bias.slope,
5251 .clamp = dynamic_state->rs.depth_bias.clamp,
5252 };
5253
5254 ppp_state->depthbias_scissor_indices.depthbias_index =
5255 util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
5256 __typeof__(depth_bias));
5257
5258 util_dynarray_append(&cmd_buffer->depth_bias_array,
5259 __typeof__(depth_bias),
5260 depth_bias);
5261
5262 header->pres_ispctl_dbsc = true;
5263 }
5264
5265 if (ispctl->scenable) {
5266 const uint32_t region_clip_align_size =
5267 pvr_get_geom_region_clip_align_size(dev_info);
5268 const VkViewport *const viewport = &dynamic_state->vp.viewports[0];
5269 const VkRect2D *const scissor = &dynamic_state->vp.scissors[0];
5270 struct pvr_scissor_words scissor_words;
5271 VkRect2D overlap_rect;
5272 uint32_t height;
5273 uint32_t width;
5274 uint32_t x;
5275 uint32_t y;
5276
5277 /* For region clip. */
5278 uint32_t bottom;
5279 uint32_t right;
5280 uint32_t left;
5281 uint32_t top;
5282
5283 /* We don't support multiple viewport calculations. */
5284 assert(dynamic_state->vp.viewport_count == 1);
5285 /* We don't support multiple scissor calculations. */
5286 assert(dynamic_state->vp.scissor_count == 1);
5287
5288 pvr_get_viewport_scissor_overlap(viewport, scissor, &overlap_rect);
5289
5290 x = overlap_rect.offset.x;
5291 y = overlap_rect.offset.y;
5292 width = overlap_rect.extent.width;
5293 height = overlap_rect.extent.height;
5294
5295 pvr_csb_pack (&scissor_words.w0, IPF_SCISSOR_WORD_0, word0) {
5296 word0.scw0_xmax = x + width;
5297 word0.scw0_xmin = x;
5298 }
5299
5300 pvr_csb_pack (&scissor_words.w1, IPF_SCISSOR_WORD_1, word1) {
5301 word1.scw1_ymax = y + height;
5302 word1.scw1_ymin = y;
5303 }
5304
5305 if (cmd_buffer->scissor_array.size &&
5306 cmd_buffer->scissor_words.w0 == scissor_words.w0 &&
5307 cmd_buffer->scissor_words.w1 == scissor_words.w1) {
5308 return;
5309 }
5310
5311 cmd_buffer->scissor_words = scissor_words;
5312
5313 /* Calculate region clip. */
5314
5315 left = x / region_clip_align_size;
5316 top = y / region_clip_align_size;
5317
5318 /* Guard against x + width == 0, which would otherwise give right = -1. */
5319 /* TODO: Is there a better way of doing this? */
5320 if ((x + width) != 0U)
5321 right = DIV_ROUND_UP(x + width, region_clip_align_size) - 1;
5322 else
5323 right = 0;
5324
5325 if ((y + height) != 0U)
5326 bottom = DIV_ROUND_UP(y + height, region_clip_align_size) - 1;
5327 else
5328 bottom = 0U;
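/* Added worked example: with a 16 pixel alignment and an overlap rect of
 * x = 10, y = 20, width = 100, height = 80, we get left = 0, top = 1,
 * right = DIV_ROUND_UP(110, 16) - 1 = 6 and
 * bottom = DIV_ROUND_UP(100, 16) - 1 = 6.
 */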
5329
5330 /* Setup region clip to clip everything outside what was calculated. */
5331
5332 /* FIXME: Should we mask to prevent writing over other words? */
5333 pvr_csb_pack (&ppp_state->region_clipping.word0, TA_REGION_CLIP0, word0) {
5334 word0.right = right;
5335 word0.left = left;
5336 word0.mode = PVRX(TA_REGION_CLIP_MODE_OUTSIDE);
5337 }
5338
5339 pvr_csb_pack (&ppp_state->region_clipping.word1, TA_REGION_CLIP1, word1) {
5340 word1.bottom = bottom;
5341 word1.top = top;
5342 }
5343
5344 ppp_state->depthbias_scissor_indices.scissor_index =
5345 util_dynarray_num_elements(&cmd_buffer->scissor_array,
5346 struct pvr_scissor_words);
5347
5348 util_dynarray_append(&cmd_buffer->scissor_array,
5349 struct pvr_scissor_words,
5350 cmd_buffer->scissor_words);
5351
5352 header->pres_ispctl_dbsc = true;
5353 header->pres_region_clip = true;
5354 }
5355 }
5356
5357 static void
5358 pvr_setup_triangle_merging_flag(struct pvr_cmd_buffer *const cmd_buffer,
5359 struct PVRX(TA_STATE_ISPA) * ispa)
5360 {
5361 struct PVRX(TA_STATE_HEADER) *const header = &cmd_buffer->state.emit_header;
5362 struct pvr_ppp_state *const ppp_state = &cmd_buffer->state.ppp_state;
5363 uint32_t merge_word;
5364 uint32_t mask;
5365
5366 pvr_csb_pack (&merge_word, TA_STATE_PDS_SIZEINFO2, size_info) {
5367 /* Disable merging for lines, for punch-through passes, or when depth
5368 * writes are disabled and the depth compare mode is ALWAYS.
5369 */
5370 if (ispa->objtype == PVRX(TA_OBJTYPE_LINE) ||
5371 ispa->passtype == PVRX(TA_PASSTYPE_PUNCH_THROUGH) ||
5372 (ispa->dwritedisable && ispa->dcmpmode == PVRX(TA_CMPMODE_ALWAYS))) {
5373 size_info.pds_tri_merge_disable = true;
5374 }
5375 }
5376
5377 pvr_csb_pack (&mask, TA_STATE_PDS_SIZEINFO2, size_info) {
5378 size_info.pds_tri_merge_disable = true;
5379 }
5380
5381 merge_word |= ppp_state->pds.size_info2 & ~mask;
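/* At this point merge_word holds the previous size_info2 with only the
 * pds_tri_merge_disable field replaced, i.e. a read-modify-write of a
 * single field using the packed mask above.
 */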
5382
5383 if (merge_word != ppp_state->pds.size_info2) {
5384 ppp_state->pds.size_info2 = merge_word;
5385 header->pres_pds_state_ptr0 = true;
5386 }
5387 }
5388
5389 static void
5390 pvr_setup_fragment_state_pointers(struct pvr_cmd_buffer *const cmd_buffer,
5391 struct pvr_sub_cmd_gfx *const sub_cmd)
5392 {
5393 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5394
5395 const struct pvr_fragment_shader_state *const fragment =
5396 &state->gfx_pipeline->shader_state.fragment;
5397 const struct pvr_stage_allocation_descriptor_state *descriptor_shader_state =
5398 &fragment->descriptor_state;
5399 const struct pvr_pipeline_stage_state *fragment_state =
5400 &fragment->stage_state;
5401 const struct pvr_pds_upload *pds_coeff_program =
5402 &fragment->pds_coeff_program;
5403
5404 const struct pvr_physical_device *pdevice = cmd_buffer->device->pdevice;
5405 struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5406 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5407
5408 const uint32_t pds_uniform_size =
5409 DIV_ROUND_UP(descriptor_shader_state->pds_info.data_size_in_dwords,
5410 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_UNIFORMSIZE_UNIT_SIZE));
5411
5412 const uint32_t pds_varying_state_size =
5413 DIV_ROUND_UP(pds_coeff_program->data_size,
5414 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_VARYINGSIZE_UNIT_SIZE));
5415
5416 const uint32_t usc_varying_size =
5417 DIV_ROUND_UP(fragment_state->coefficient_size,
5418 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
5419
5420 const uint32_t pds_temp_size =
5421 DIV_ROUND_UP(fragment_state->pds_temps_count,
5422 PVRX(TA_STATE_PDS_SIZEINFO1_PDS_TEMPSIZE_UNIT_SIZE));
5423
5424 const uint32_t usc_shared_size =
5425 DIV_ROUND_UP(fragment_state->const_shared_reg_count,
5426 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE));
5427
5428 const uint32_t max_tiles_in_flight =
5429 pvr_calc_fscommon_size_and_tiles_in_flight(
5430 &pdevice->dev_info,
5431 &pdevice->dev_runtime_info,
5432 usc_shared_size *
5433 PVRX(TA_STATE_PDS_SIZEINFO2_USC_SHAREDSIZE_UNIT_SIZE),
5434 1);
5435 uint32_t size_info_mask;
5436 uint32_t size_info2;
5437
5438 if (max_tiles_in_flight < sub_cmd->max_tiles_in_flight)
5439 sub_cmd->max_tiles_in_flight = max_tiles_in_flight;
5440
5441 pvr_csb_pack (&ppp_state->pds.pixel_shader_base,
5442 TA_STATE_PDS_SHADERBASE,
5443 shader_base) {
5444 const struct pvr_pds_upload *const pds_upload =
5445 &fragment->pds_fragment_program;
5446
5447 shader_base.addr = PVR_DEV_ADDR(pds_upload->data_offset);
5448 }
5449
5450 if (descriptor_shader_state->pds_code.pvr_bo) {
5451 pvr_csb_pack (&ppp_state->pds.texture_uniform_code_base,
5452 TA_STATE_PDS_TEXUNICODEBASE,
5453 tex_base) {
5454 tex_base.addr =
5455 PVR_DEV_ADDR(descriptor_shader_state->pds_code.code_offset);
5456 }
5457 } else {
5458 ppp_state->pds.texture_uniform_code_base = 0U;
5459 }
5460
5461 pvr_csb_pack (&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1, info1) {
5462 info1.pds_uniformsize = pds_uniform_size;
5463 info1.pds_texturestatesize = 0U;
5464 info1.pds_varyingsize = pds_varying_state_size;
5465 info1.usc_varyingsize = usc_varying_size;
5466 info1.pds_tempsize = pds_temp_size;
5467 }
5468
5469 pvr_csb_pack (&size_info_mask, TA_STATE_PDS_SIZEINFO2, mask) {
5470 mask.pds_tri_merge_disable = true;
5471 }
5472
5473 ppp_state->pds.size_info2 &= size_info_mask;
5474
5475 pvr_csb_pack (&size_info2, TA_STATE_PDS_SIZEINFO2, info2) {
5476 info2.usc_sharedsize = usc_shared_size;
5477 }
5478
5479 ppp_state->pds.size_info2 |= size_info2;
5480
5481 if (pds_coeff_program->pvr_bo) {
5482 header->pres_pds_state_ptr1 = true;
5483
5484 pvr_csb_pack (&ppp_state->pds.varying_base,
5485 TA_STATE_PDS_VARYINGBASE,
5486 base) {
5487 base.addr = PVR_DEV_ADDR(pds_coeff_program->data_offset);
5488 }
5489 } else {
5490 ppp_state->pds.varying_base = 0U;
5491 }
5492
5493 pvr_csb_pack (&ppp_state->pds.uniform_state_data_base,
5494 TA_STATE_PDS_UNIFORMDATABASE,
5495 base) {
5496 base.addr = PVR_DEV_ADDR(state->pds_fragment_descriptor_data_offset);
5497 }
5498
5499 header->pres_pds_state_ptr0 = true;
5500 header->pres_pds_state_ptr3 = true;
5501 }
5502
5503 static void pvr_setup_viewport(struct pvr_cmd_buffer *const cmd_buffer)
5504 {
5505 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5506 struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5507 struct vk_dynamic_graphics_state *const dynamic_state =
5508 &cmd_buffer->vk.dynamic_graphics_state;
5509 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5510
5511 if (ppp_state->viewport_count != dynamic_state->vp.viewport_count) {
5512 ppp_state->viewport_count = dynamic_state->vp.viewport_count;
5513 header->pres_viewport = true;
5514 }
5515
5516 if (dynamic_state->rs.rasterizer_discard_enable) {
5517 /* We don't want to emit any viewport data as it'll just get thrown
5518 * away. It's after the previous condition because we still want to
5519 * stash the viewport_count as it's our trigger for when
5520 * rasterizer discard gets disabled.
5521 */
5522 header->pres_viewport = false;
5523 return;
5524 }
5525
5526 for (uint32_t i = 0; i < ppp_state->viewport_count; i++) {
5527 VkViewport *viewport = &dynamic_state->vp.viewports[i];
5528 uint32_t x_scale = fui(viewport->width * 0.5f);
5529 uint32_t y_scale = fui(viewport->height * 0.5f);
5530 uint32_t z_scale = fui(viewport->maxDepth - viewport->minDepth);
5531 uint32_t x_center = fui(viewport->x + viewport->width * 0.5f);
5532 uint32_t y_center = fui(viewport->y + viewport->height * 0.5f);
5533 uint32_t z_center = fui(viewport->minDepth);
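/* Added note: the aN (offset) / mN (scale) pairs stored below appear to
 * implement the usual Vulkan viewport transform,
 * window_x = x_center + x_scale * ndc_x, and likewise for y and depth,
 * using the centres and half-extents computed above.
 */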
5534
5535 if (ppp_state->viewports[i].a0 != x_center ||
5536 ppp_state->viewports[i].m0 != x_scale ||
5537 ppp_state->viewports[i].a1 != y_center ||
5538 ppp_state->viewports[i].m1 != y_scale ||
5539 ppp_state->viewports[i].a2 != z_center ||
5540 ppp_state->viewports[i].m2 != z_scale) {
5541 ppp_state->viewports[i].a0 = x_center;
5542 ppp_state->viewports[i].m0 = x_scale;
5543 ppp_state->viewports[i].a1 = y_center;
5544 ppp_state->viewports[i].m1 = y_scale;
5545 ppp_state->viewports[i].a2 = z_center;
5546 ppp_state->viewports[i].m2 = z_scale;
5547
5548 header->pres_viewport = true;
5549 }
5550 }
5551 }
5552
5553 static void pvr_setup_ppp_control(struct pvr_cmd_buffer *const cmd_buffer)
5554 {
5555 struct vk_dynamic_graphics_state *const dynamic_state =
5556 &cmd_buffer->vk.dynamic_graphics_state;
5557 const VkPrimitiveTopology topology = dynamic_state->ia.primitive_topology;
5558 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5559 struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5560 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5561 uint32_t ppp_control;
5562
5563 pvr_csb_pack (&ppp_control, TA_STATE_PPP_CTRL, control) {
5564 control.drawclippededges = true;
5565 control.wclampen = true;
5566
5567 if (topology == VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN)
5568 control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_1);
5569 else
5570 control.flatshade_vtx = PVRX(TA_FLATSHADE_VTX_VERTEX_0);
5571
5572 if (dynamic_state->rs.depth_clamp_enable)
5573 control.clip_mode = PVRX(TA_CLIP_MODE_NO_FRONT_OR_REAR);
5574 else
5575 control.clip_mode = PVRX(TA_CLIP_MODE_FRONT_REAR);
5576
5577 /* +--- FrontIsCCW?
5578 * | +--- Cull Front?
5579 * v v
5580 * 0|0 CULLMODE_CULL_CCW,
5581 * 0|1 CULLMODE_CULL_CW,
5582 * 1|0 CULLMODE_CULL_CW,
5583 * 1|1 CULLMODE_CULL_CCW,
5584 */
5585 switch (dynamic_state->rs.cull_mode) {
5586 case VK_CULL_MODE_BACK_BIT:
5587 case VK_CULL_MODE_FRONT_BIT:
5588 if ((dynamic_state->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE) ^
5589 (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_BIT)) {
5590 control.cullmode = PVRX(TA_CULLMODE_CULL_CW);
5591 } else {
5592 control.cullmode = PVRX(TA_CULLMODE_CULL_CCW);
5593 }
5594
5595 break;
5596
5597 case VK_CULL_MODE_FRONT_AND_BACK:
5598 case VK_CULL_MODE_NONE:
5599 control.cullmode = PVRX(TA_CULLMODE_NO_CULLING);
5600 break;
5601
5602 default:
5603 unreachable("Unsupported cull mode!");
5604 }
5605 }
5606
5607 if (ppp_control != ppp_state->ppp_control) {
5608 ppp_state->ppp_control = ppp_control;
5609 header->pres_ppp_ctrl = true;
5610 }
5611 }
5612
5613 /* Largest valid PPP State update in words = 31
5614 * 1 - Header
5615 * 3 - Stream Out Config words 0, 1 and 2
5616 * 1 - PPP Control word
5617 * 3 - Varying Config words 0, 1 and 2
5618 * 1 - Output Select
5619 * 1 - WClamp
5620 * 6 - Viewport Transform words
5621 * 2 - Region Clip words
5622 * 3 - PDS State for fragment phase (PDSSTATEPTR 1-3)
5623 * 4 - PDS State for fragment phase (PDSSTATEPTR0)
5624 * 6 - ISP Control Words
5625 */
5626 #define PVR_MAX_PPP_STATE_DWORDS 31
5627
5628 static VkResult pvr_emit_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5629 struct pvr_sub_cmd_gfx *const sub_cmd)
5630 {
5631 const bool deferred_secondary = pvr_cmd_uses_deferred_cs_cmds(cmd_buffer);
5632 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5633 struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5634 struct pvr_csb *const control_stream = &sub_cmd->control_stream;
5635 struct pvr_ppp_state *const ppp_state = &state->ppp_state;
5636 uint32_t ppp_state_words[PVR_MAX_PPP_STATE_DWORDS];
5637 const bool emit_dbsc = header->pres_ispctl_dbsc;
5638 uint32_t *buffer_ptr = ppp_state_words;
5639 uint32_t dbsc_patching_offset = 0;
5640 uint32_t ppp_state_words_count;
5641 struct pvr_suballoc_bo *pvr_bo;
5642 VkResult result;
5643
5644 #if !defined(NDEBUG)
5645 struct PVRX(TA_STATE_HEADER) emit_mask = *header;
5646 uint32_t packed_emit_mask;
5647
5648 static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5649 "EMIT_MASK_IS_CLEAR assumes 1 dword sized header.");
5650
5651 # define EMIT_MASK_GET(field) (emit_mask.field)
5652 # define EMIT_MASK_SET(field, value) (emit_mask.field = (value))
5653 # define EMIT_MASK_IS_CLEAR \
5654 (pvr_cmd_pack(TA_STATE_HEADER)(&packed_emit_mask, &emit_mask), \
5655 packed_emit_mask == 0)
5656 #else
5657 # define EMIT_MASK_GET(field)
5658 # define EMIT_MASK_SET(field, value)
5659 #endif
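/* In debug builds emit_mask starts as a copy of the header and each
 * EMIT_MASK_SET() below clears the flag whose data has just been written,
 * so the EMIT_MASK_IS_CLEAR assert at the end catches any header flag
 * that was set without a matching write into ppp_state_words.
 */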
5660
5661 header->view_port_count =
5662 (ppp_state->viewport_count == 0) ? 0U : (ppp_state->viewport_count - 1);
5663 header->pres_ispctl_fa = header->pres_ispctl;
5664
5665 /* If deferred_secondary is true then we do a separate state update
5666 * which gets patched in vkCmdExecuteCommands().
5667 */
5668 header->pres_ispctl_dbsc &= !deferred_secondary;
5669
5670 pvr_csb_write_struct(buffer_ptr, TA_STATE_HEADER, header);
5671
5672 static_assert(pvr_cmd_length(TA_STATE_HEADER) == 1,
5673 "Following header check assumes 1 dword sized header.");
5674 /* If the header is empty we exit early and prevent a bo alloc of 0 size. */
5675 if (ppp_state_words[0] == 0)
5676 return VK_SUCCESS;
5677
5678 if (header->pres_ispctl) {
5679 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPCTL, ppp_state->isp.control);
5680
5681 assert(header->pres_ispctl_fa);
5682 /* This is not a mistake. FA, BA have the ISPA format, and FB, BB have the
5683 * ISPB format.
5684 */
5685 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.front_a);
5686 EMIT_MASK_SET(pres_ispctl_fa, false);
5687
5688 if (header->pres_ispctl_fb) {
5689 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.front_b);
5690 EMIT_MASK_SET(pres_ispctl_fb, false);
5691 }
5692
5693 if (header->pres_ispctl_ba) {
5694 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPA, ppp_state->isp.back_a);
5695 EMIT_MASK_SET(pres_ispctl_ba, false);
5696 }
5697
5698 if (header->pres_ispctl_bb) {
5699 pvr_csb_write_value(buffer_ptr, TA_STATE_ISPB, ppp_state->isp.back_b);
5700 EMIT_MASK_SET(pres_ispctl_bb, false);
5701 }
5702
5703 EMIT_MASK_SET(pres_ispctl, false);
5704 }
5705
5706 if (header->pres_ispctl_dbsc) {
5707 assert(!deferred_secondary);
5708
5709 dbsc_patching_offset = buffer_ptr - ppp_state_words;
5710
5711 pvr_csb_pack (buffer_ptr, TA_STATE_ISPDBSC, ispdbsc) {
5712 ispdbsc.dbindex = ppp_state->depthbias_scissor_indices.depthbias_index;
5713 ispdbsc.scindex = ppp_state->depthbias_scissor_indices.scissor_index;
5714 }
5715 buffer_ptr += pvr_cmd_length(TA_STATE_ISPDBSC);
5716
5717 EMIT_MASK_SET(pres_ispctl_dbsc, false);
5718 }
5719
5720 if (header->pres_pds_state_ptr0) {
5721 pvr_csb_write_value(buffer_ptr,
5722 TA_STATE_PDS_SHADERBASE,
5723 ppp_state->pds.pixel_shader_base);
5724
5725 pvr_csb_write_value(buffer_ptr,
5726 TA_STATE_PDS_TEXUNICODEBASE,
5727 ppp_state->pds.texture_uniform_code_base);
5728
5729 pvr_csb_write_value(buffer_ptr,
5730 TA_STATE_PDS_SIZEINFO1,
5731 ppp_state->pds.size_info1);
5732 pvr_csb_write_value(buffer_ptr,
5733 TA_STATE_PDS_SIZEINFO2,
5734 ppp_state->pds.size_info2);
5735
5736 EMIT_MASK_SET(pres_pds_state_ptr0, false);
5737 }
5738
5739 if (header->pres_pds_state_ptr1) {
5740 pvr_csb_write_value(buffer_ptr,
5741 TA_STATE_PDS_VARYINGBASE,
5742 ppp_state->pds.varying_base);
5743 EMIT_MASK_SET(pres_pds_state_ptr1, false);
5744 }
5745
5746 /* We don't use pds_state_ptr2 (texture state programs) control word, but
5747 * this doesn't mean we need to set it to 0. This is because the hardware
5748 * runs the texture state program only when
5749 * ROGUE_TA_STATE_PDS_SIZEINFO1.pds_texturestatesize is non-zero.
5750 */
5751 assert(pvr_csb_unpack(&ppp_state->pds.size_info1, TA_STATE_PDS_SIZEINFO1)
5752 .pds_texturestatesize == 0);
5753
5754 if (header->pres_pds_state_ptr3) {
5755 pvr_csb_write_value(buffer_ptr,
5756 TA_STATE_PDS_UNIFORMDATABASE,
5757 ppp_state->pds.uniform_state_data_base);
5758 EMIT_MASK_SET(pres_pds_state_ptr3, false);
5759 }
5760
5761 if (header->pres_region_clip) {
5762 pvr_csb_write_value(buffer_ptr,
5763 TA_REGION_CLIP0,
5764 ppp_state->region_clipping.word0);
5765 pvr_csb_write_value(buffer_ptr,
5766 TA_REGION_CLIP1,
5767 ppp_state->region_clipping.word1);
5768
5769 EMIT_MASK_SET(pres_region_clip, false);
5770 }
5771
5772 if (header->pres_viewport) {
5773 const uint32_t viewports = MAX2(1, ppp_state->viewport_count);
5774 EMIT_MASK_SET(view_port_count, viewports);
5775
5776 for (uint32_t i = 0; i < viewports; i++) {
5777 /* These don't have any definitions in the csbgen xml files and none
5778 * will be added.
5779 */
5780 *buffer_ptr++ = ppp_state->viewports[i].a0;
5781 *buffer_ptr++ = ppp_state->viewports[i].m0;
5782 *buffer_ptr++ = ppp_state->viewports[i].a1;
5783 *buffer_ptr++ = ppp_state->viewports[i].m1;
5784 *buffer_ptr++ = ppp_state->viewports[i].a2;
5785 *buffer_ptr++ = ppp_state->viewports[i].m2;
5786
5787 EMIT_MASK_SET(view_port_count, EMIT_MASK_GET(view_port_count) - 1);
5788 }
5789
5790 EMIT_MASK_SET(pres_viewport, false);
5791 }
5792
5793 if (header->pres_wclamp) {
5794 pvr_csb_pack (buffer_ptr, TA_WCLAMP, wclamp) {
5795 wclamp.val = fui(0.00001f);
5796 }
5797 buffer_ptr += pvr_cmd_length(TA_WCLAMP);
5798 EMIT_MASK_SET(pres_wclamp, false);
5799 }
5800
5801 if (header->pres_outselects) {
5802 pvr_csb_write_value(buffer_ptr, TA_OUTPUT_SEL, ppp_state->output_selects);
5803 EMIT_MASK_SET(pres_outselects, false);
5804 }
5805
5806 if (header->pres_varying_word0) {
5807 pvr_csb_write_value(buffer_ptr,
5808 TA_STATE_VARYING0,
5809 ppp_state->varying_word[0]);
5810 EMIT_MASK_SET(pres_varying_word0, false);
5811 }
5812
5813 if (header->pres_varying_word1) {
5814 pvr_csb_write_value(buffer_ptr,
5815 TA_STATE_VARYING1,
5816 ppp_state->varying_word[1]);
5817 EMIT_MASK_SET(pres_varying_word1, false);
5818 }
5819
5820 /* We only emit this on the first draw of a render job to prevent us from
5821 * inheriting a non-zero value set elsewhere.
5822 */
5823 if (header->pres_varying_word2) {
5824 pvr_csb_write_value(buffer_ptr, TA_STATE_VARYING2, 0);
5825 EMIT_MASK_SET(pres_varying_word2, false);
5826 }
5827
5828 if (header->pres_ppp_ctrl) {
5829 pvr_csb_write_value(buffer_ptr,
5830 TA_STATE_PPP_CTRL,
5831 ppp_state->ppp_control);
5832 EMIT_MASK_SET(pres_ppp_ctrl, false);
5833 }
5834
5835 /* We only emit this on the first draw of a render job to prevent us from
5836 * inheriting a non-zero value set elsewhere.
5837 */
5838 if (header->pres_stream_out_size) {
5839 pvr_csb_write_value(buffer_ptr, TA_STATE_STREAM_OUT0, 0);
5840 EMIT_MASK_SET(pres_stream_out_size, false);
5841 }
5842
5843 assert(EMIT_MASK_IS_CLEAR);
5844
5845 #undef EMIT_MASK_GET
5846 #undef EMIT_MASK_SET
5847 #if !defined(NDEBUG)
5848 # undef EMIT_MASK_IS_CLEAR
5849 #endif
5850
5851 ppp_state_words_count = buffer_ptr - ppp_state_words;
5852 assert(ppp_state_words_count <= PVR_MAX_PPP_STATE_DWORDS);
5853
5854 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
5855 cmd_buffer->device->heaps.general_heap,
5856 PVR_DW_TO_BYTES(ppp_state_words_count),
5857 &pvr_bo);
5858 if (result != VK_SUCCESS)
5859 return result;
5860
5861 memcpy(pvr_bo_suballoc_get_map_addr(pvr_bo),
5862 ppp_state_words,
5863 PVR_DW_TO_BYTES(ppp_state_words_count));
5864
5865 pvr_csb_set_relocation_mark(control_stream);
5866
5867 /* Write the VDM state update into the VDM control stream. */
5868 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE0, state0) {
5869 state0.word_count = ppp_state_words_count;
5870 state0.addrmsb = pvr_bo->dev_addr;
5871 }
5872
5873 pvr_csb_emit (control_stream, VDMCTRL_PPP_STATE1, state1) {
5874 state1.addrlsb = pvr_bo->dev_addr;
5875 }
5876
5877 pvr_csb_clear_relocation_mark(control_stream);
5878
5879 if (emit_dbsc && cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
5880 struct pvr_deferred_cs_command cmd;
5881
5882 if (deferred_secondary) {
5883 const uint32_t num_dwords = pvr_cmd_length(VDMCTRL_PPP_STATE0) +
5884 pvr_cmd_length(VDMCTRL_PPP_STATE1);
5885 uint32_t *vdm_state;
5886
5887 pvr_csb_set_relocation_mark(control_stream);
5888
5889 vdm_state = pvr_csb_alloc_dwords(control_stream, num_dwords);
5890 if (!vdm_state) {
5891 result = pvr_csb_get_status(control_stream);
5892 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
5893 }
5894
5895 pvr_csb_clear_relocation_mark(control_stream);
5896
5897 cmd = (struct pvr_deferred_cs_command){
5898 .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC,
5899 .dbsc = {
5900 .state = ppp_state->depthbias_scissor_indices,
5901 .vdm_state = vdm_state,
5902 },
5903 };
5904 } else {
5905 cmd = (struct pvr_deferred_cs_command){
5906 .type = PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2,
5907 .dbsc2 = {
5908 .state = ppp_state->depthbias_scissor_indices,
5909 .ppp_cs_bo = pvr_bo,
5910 .patch_offset = dbsc_patching_offset,
5911 },
5912 };
5913 }
5914
5915 util_dynarray_append(&cmd_buffer->deferred_csb_commands,
5916 struct pvr_deferred_cs_command,
5917 cmd);
5918 }
5919
5920 state->emit_header = (struct PVRX(TA_STATE_HEADER)){ 0 };
5921
5922 return VK_SUCCESS;
5923 }
5924
5925 static inline bool
5926 pvr_ppp_state_update_required(const struct pvr_cmd_buffer *cmd_buffer)
5927 {
5928 const BITSET_WORD *const dynamic_dirty =
5929 cmd_buffer->vk.dynamic_graphics_state.dirty;
5930 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5931 const struct PVRX(TA_STATE_HEADER) *const header = &state->emit_header;
5932
5933 /* For push constants we only need to worry if they are updated for the
5934 * fragment stage since we're only updating the pds programs used in the
5935 * fragment stage.
5936 */
5937
5938 return header->pres_ppp_ctrl || header->pres_ispctl ||
5939 header->pres_ispctl_fb || header->pres_ispctl_ba ||
5940 header->pres_ispctl_bb || header->pres_ispctl_dbsc ||
5941 header->pres_pds_state_ptr0 || header->pres_pds_state_ptr1 ||
5942 header->pres_pds_state_ptr2 || header->pres_pds_state_ptr3 ||
5943 header->pres_region_clip || header->pres_viewport ||
5944 header->pres_wclamp || header->pres_outselects ||
5945 header->pres_varying_word0 || header->pres_varying_word1 ||
5946 header->pres_varying_word2 || header->pres_stream_out_program ||
5947 state->dirty.fragment_descriptors || state->dirty.vis_test ||
5948 state->dirty.gfx_pipeline_binding || state->dirty.isp_userpass ||
5949 state->push_constants.dirty_stages & VK_SHADER_STAGE_FRAGMENT_BIT ||
5950 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5951 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5952 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5953 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
5954 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS) ||
5955 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5956 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
5957 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_SCISSOR_COUNT) ||
5958 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
5959 BITSET_TEST(dynamic_dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT);
5960 }
5961
5962 static VkResult
5963 pvr_emit_dirty_ppp_state(struct pvr_cmd_buffer *const cmd_buffer,
5964 struct pvr_sub_cmd_gfx *const sub_cmd)
5965 {
5966 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
5967 struct vk_dynamic_graphics_state *const dynamic_state =
5968 &cmd_buffer->vk.dynamic_graphics_state;
5969 VkResult result;
5970
5971 /* TODO: The emit_header will be dirty only if
5972 * pvr_reset_graphics_dirty_state() was called before this (so when command
5973 * buffer begins recording or when it's reset). Otherwise it will have been
5974 * zeroed out by the previous pvr_emit_ppp_state(). We can probably set a
5975 * flag in there and check it here instead of checking the header.
5976 * Check if this is true and implement the flag.
5977 */
5978 if (!pvr_ppp_state_update_required(cmd_buffer))
5979 return VK_SUCCESS;
5980
5981 if (state->dirty.gfx_pipeline_binding) {
5982 struct PVRX(TA_STATE_ISPA) ispa;
5983
5984 pvr_setup_output_select(cmd_buffer);
5985 pvr_setup_isp_faces_and_control(cmd_buffer, &ispa);
5986 pvr_setup_triangle_merging_flag(cmd_buffer, &ispa);
5987 } else if (BITSET_TEST(dynamic_state->dirty,
5988 MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
5989 BITSET_TEST(dynamic_state->dirty,
5990 MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
5991 BITSET_TEST(dynamic_state->dirty,
5992 MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
5993 BITSET_TEST(dynamic_state->dirty,
5994 MESA_VK_DYNAMIC_RS_LINE_WIDTH) ||
5995 state->dirty.isp_userpass || state->dirty.vis_test) {
5996 pvr_setup_isp_faces_and_control(cmd_buffer, NULL);
5997 }
5998
5999 if (!dynamic_state->rs.rasterizer_discard_enable &&
6000 state->dirty.fragment_descriptors &&
6001 state->gfx_pipeline->shader_state.fragment.bo) {
6002 pvr_setup_fragment_state_pointers(cmd_buffer, sub_cmd);
6003 }
6004
6005 pvr_setup_isp_depth_bias_scissor_state(cmd_buffer);
6006
6007 if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS) ||
6008 BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_VP_VIEWPORT_COUNT))
6009 pvr_setup_viewport(cmd_buffer);
6010
6011 pvr_setup_ppp_control(cmd_buffer);
6012
6013 /* The hardware doesn't have an explicit mode for this so we use a
6014 * negative viewport to make sure all objects are culled out early.
6015 */
6016 if (dynamic_state->rs.cull_mode == VK_CULL_MODE_FRONT_AND_BACK) {
6017 /* Shift the viewport out of the guard-band so that everything is culled. */
6018 const uint32_t negative_vp_val = fui(-2.0f);
6019
6020 state->ppp_state.viewports[0].a0 = negative_vp_val;
6021 state->ppp_state.viewports[0].m0 = 0;
6022 state->ppp_state.viewports[0].a1 = negative_vp_val;
6023 state->ppp_state.viewports[0].m1 = 0;
6024 state->ppp_state.viewports[0].a2 = negative_vp_val;
6025 state->ppp_state.viewports[0].m2 = 0;
6026
6027 state->ppp_state.viewport_count = 1;
6028
6029 state->emit_header.pres_viewport = true;
6030 }
6031
6032 result = pvr_emit_ppp_state(cmd_buffer, sub_cmd);
6033 if (result != VK_SUCCESS)
6034 return result;
6035
6036 return VK_SUCCESS;
6037 }
6038
6039 void pvr_calculate_vertex_cam_size(const struct pvr_device_info *dev_info,
6040 const uint32_t vs_output_size,
6041 const bool raster_enable,
6042 uint32_t *const cam_size_out,
6043 uint32_t *const vs_max_instances_out)
6044 {
6045 /* First work out the size of a vertex in the UVS and multiply by 4 for
6046 * column ordering.
6047 */
6048 const uint32_t uvs_vertex_vector_size_in_dwords =
6049 (vs_output_size + 1U + raster_enable * 4U) * 4U;
6050 const uint32_t vdm_cam_size =
6051 PVR_GET_FEATURE_VALUE(dev_info, vdm_cam_size, 32U);
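/* Added worked example: vs_output_size = 8 with rasterization enabled
 * gives (8 + 1 + 4) * 4 = 52 dwords. On the simple-internal-parameter
 * (8XE proxy) path below that falls in the < 14 * 4 bucket, so
 * cam_size = MIN2(31, vdm_cam_size - 1) and vs_max_instances = 16; on
 * the other path it falls in the <= 32 * 4 bucket, giving
 * cam_size = MIN2(95, vdm_cam_size - 1) and vs_max_instances = 0.
 */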
6052
6053 /* This is a proxy for 8XE. */
6054 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format) &&
6055 vdm_cam_size < 96U) {
6056 /* Comparisons are based on size including scratch per vertex vector. */
6057 if (uvs_vertex_vector_size_in_dwords < (14U * 4U)) {
6058 *cam_size_out = MIN2(31U, vdm_cam_size - 1U);
6059 *vs_max_instances_out = 16U;
6060 } else if (uvs_vertex_vector_size_in_dwords < (20U * 4U)) {
6061 *cam_size_out = 15U;
6062 *vs_max_instances_out = 16U;
6063 } else if (uvs_vertex_vector_size_in_dwords < (28U * 4U)) {
6064 *cam_size_out = 11U;
6065 *vs_max_instances_out = 12U;
6066 } else if (uvs_vertex_vector_size_in_dwords < (44U * 4U)) {
6067 *cam_size_out = 7U;
6068 *vs_max_instances_out = 8U;
6069 } else if (PVR_HAS_FEATURE(dev_info,
6070 simple_internal_parameter_format_v2) ||
6071 uvs_vertex_vector_size_in_dwords < (64U * 4U)) {
6072 *cam_size_out = 7U;
6073 *vs_max_instances_out = 4U;
6074 } else {
6075 *cam_size_out = 3U;
6076 *vs_max_instances_out = 2U;
6077 }
6078 } else {
6079 /* Comparisons are based on size including scratch per vertex vector. */
6080 if (uvs_vertex_vector_size_in_dwords <= (32U * 4U)) {
6081 /* output size <= 27 + 5 scratch. */
6082 *cam_size_out = MIN2(95U, vdm_cam_size - 1U);
6083 *vs_max_instances_out = 0U;
6084 } else if (uvs_vertex_vector_size_in_dwords <= 48U * 4U) {
6085 /* output size <= 43 + 5 scratch */
6086 *cam_size_out = 63U;
6087 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6088 *vs_max_instances_out = 16U;
6089 else
6090 *vs_max_instances_out = 0U;
6091 } else if (uvs_vertex_vector_size_in_dwords <= 64U * 4U) {
6092 /* output size <= 59 + 5 scratch. */
6093 *cam_size_out = 31U;
6094 if (PVR_GET_FEATURE_VALUE(dev_info, uvs_vtx_entries, 144U) < 288U)
6095 *vs_max_instances_out = 16U;
6096 else
6097 *vs_max_instances_out = 0U;
6098 } else {
6099 *cam_size_out = 15U;
6100 *vs_max_instances_out = 16U;
6101 }
6102 }
6103 }
6104
6105 static void pvr_emit_dirty_vdm_state(struct pvr_cmd_buffer *const cmd_buffer,
6106 struct pvr_sub_cmd_gfx *const sub_cmd)
6107 {
6108 /* FIXME: Assume all state is dirty for the moment. */
6109 struct pvr_device_info *const dev_info =
6110 &cmd_buffer->device->pdevice->dev_info;
6111 ASSERTED const uint32_t max_user_vertex_output_components =
6112 pvr_get_max_user_vertex_output_components(dev_info);
6113 struct PVRX(VDMCTRL_VDM_STATE0)
6114 header = { pvr_cmd_header(VDMCTRL_VDM_STATE0) };
6115 struct vk_dynamic_graphics_state *const dynamic_state =
6116 &cmd_buffer->vk.dynamic_graphics_state;
6117 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6118 const struct pvr_vertex_shader_state *const vertex_shader_state =
6119 &state->gfx_pipeline->shader_state.vertex;
6120 struct pvr_csb *const csb = &sub_cmd->control_stream;
6121 uint32_t vs_output_size;
6122 uint32_t max_instances;
6123 uint32_t cam_size;
6124
6125 /* CAM Calculations and HW state take vertex size aligned to DWORDS. */
6126 vs_output_size =
6127 DIV_ROUND_UP(vertex_shader_state->vertex_output_size,
6128 PVRX(VDMCTRL_VDM_STATE4_VS_OUTPUT_SIZE_UNIT_SIZE));
6129
6130 assert(vs_output_size <= max_user_vertex_output_components);
6131
6132 pvr_calculate_vertex_cam_size(dev_info,
6133 vs_output_size,
6134 true,
6135 &cam_size,
6136 &max_instances);
6137
6138 pvr_csb_set_relocation_mark(csb);
6139
6140 pvr_csb_emit (csb, VDMCTRL_VDM_STATE0, state0) {
6141 state0.cam_size = cam_size;
6142
6143 if (dynamic_state->ia.primitive_restart_enable) {
6144 state0.cut_index_enable = true;
6145 state0.cut_index_present = true;
6146 }
6147
6148 switch (dynamic_state->ia.primitive_topology) {
6149 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6150 state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_1);
6151 break;
6152
6153 default:
6154 state0.flatshade_control = PVRX(VDMCTRL_FLATSHADE_CONTROL_VERTEX_0);
6155 break;
6156 }
6157
6158 /* If we've bound a different vertex buffer, or this draw-call requires
6159 * a different PDS attrib data-section from the last draw call (changed
6160 * base_instance) then we need to specify a new data section. This is
6161 * also the case if we've switched pipeline or attrib program as the
6162 * data-section layout will be different.
6163 */
6164 state0.vs_data_addr_present =
6165 state->dirty.gfx_pipeline_binding || state->dirty.vertex_bindings ||
6166 state->dirty.draw_base_instance || state->dirty.draw_variant;
6167
6168 /* Need to specify new PDS Attrib program if we've bound a different
6169 * pipeline or we needed a different PDS Attrib variant for this
6170 * draw-call.
6171 */
6172 state0.vs_other_present = state->dirty.gfx_pipeline_binding ||
6173 state->dirty.draw_variant;
6174
6175 /* UVB_SCRATCH_SELECT_ONE with no rasterization is only valid when
6176 * stream output is enabled. We use UVB_SCRATCH_SELECT_FIVE because
6177 * Vulkan doesn't support stream output and the vertex position is
6178 * always emitted to the UVB.
6179 */
6180 state0.uvs_scratch_size_select =
6181 PVRX(VDMCTRL_UVS_SCRATCH_SIZE_SELECT_FIVE);
6182
6183 header = state0;
6184 }
6185
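/* Added note: the cut index is the primitive-restart sentinel for the
 * bound index type (e.g. 0xffff for VK_INDEX_TYPE_UINT16), as produced by
 * vk_index_to_restart().
 */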
6186 if (header.cut_index_present) {
6187 pvr_csb_emit (csb, VDMCTRL_VDM_STATE1, state1) {
6188 state1.cut_index =
6189 vk_index_to_restart(state->index_buffer_binding.type);
6190 }
6191 }
6192
6193 if (header.vs_data_addr_present) {
6194 pvr_csb_emit (csb, VDMCTRL_VDM_STATE2, state2) {
6195 state2.vs_pds_data_base_addr =
6196 PVR_DEV_ADDR(state->pds_vertex_attrib_offset);
6197 }
6198 }
6199
6200 if (header.vs_other_present) {
6201 const uint32_t usc_unified_store_size_in_bytes =
6202 vertex_shader_state->vertex_input_size << 2;
6203
6204 pvr_csb_emit (csb, VDMCTRL_VDM_STATE3, state3) {
6205 state3.vs_pds_code_base_addr =
6206 PVR_DEV_ADDR(state->pds_shader.code_offset);
6207 }
6208
6209 pvr_csb_emit (csb, VDMCTRL_VDM_STATE4, state4) {
6210 state4.vs_output_size = vs_output_size;
6211 }
6212
6213 pvr_csb_emit (csb, VDMCTRL_VDM_STATE5, state5) {
6214 state5.vs_max_instances = max_instances;
6215 state5.vs_usc_common_size = 0U;
6216 state5.vs_usc_unified_size = DIV_ROUND_UP(
6217 usc_unified_store_size_in_bytes,
6218 PVRX(VDMCTRL_VDM_STATE5_VS_USC_UNIFIED_SIZE_UNIT_SIZE));
6219 state5.vs_pds_temp_size =
6220 DIV_ROUND_UP(state->pds_shader.info->temps_required << 2,
6221 PVRX(VDMCTRL_VDM_STATE5_VS_PDS_TEMP_SIZE_UNIT_SIZE));
6222 state5.vs_pds_data_size = DIV_ROUND_UP(
6223 PVR_DW_TO_BYTES(state->pds_shader.info->data_size_in_dwords),
6224 PVRX(VDMCTRL_VDM_STATE5_VS_PDS_DATA_SIZE_UNIT_SIZE));
6225 }
6226 }
6227
6228 pvr_csb_clear_relocation_mark(csb);
6229 }
6230
6231 static VkResult pvr_validate_draw_state(struct pvr_cmd_buffer *cmd_buffer)
6232 {
6233 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
6234 struct vk_dynamic_graphics_state *const dynamic_state =
6235 &cmd_buffer->vk.dynamic_graphics_state;
6236 const struct pvr_graphics_pipeline *const gfx_pipeline = state->gfx_pipeline;
6237 const struct pvr_pipeline_stage_state *const fragment_state =
6238 &gfx_pipeline->shader_state.fragment.stage_state;
6239 const struct pvr_pipeline_stage_state *const vertex_state =
6240 &gfx_pipeline->shader_state.vertex.stage_state;
6241 const struct pvr_pipeline_layout *const pipeline_layout =
6242 gfx_pipeline->base.layout;
6243 struct pvr_sub_cmd_gfx *sub_cmd;
6244 bool fstencil_writemask_zero;
6245 bool bstencil_writemask_zero;
6246 bool fstencil_keep;
6247 bool bstencil_keep;
6248 VkResult result;
6249
6250 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
6251
6252 sub_cmd = &state->current_sub_cmd->gfx;
6253 sub_cmd->empty_cmd = false;
6254
6255 /* Determine pipeline depth/stencil usage. If a pipeline uses depth or
6256 * stencil testing, those attachments are using their loaded values, and
6257 * the loadOps cannot be optimized out.
6258 */
6259 /* Pipeline uses depth testing. */
6260 if (sub_cmd->depth_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6261 dynamic_state->ds.depth.compare_op != VK_COMPARE_OP_ALWAYS) {
6262 sub_cmd->depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6263 }
6264
6265 /* Pipeline uses stencil testing. */
6266 if (sub_cmd->stencil_usage == PVR_DEPTH_STENCIL_USAGE_UNDEFINED &&
6267 (dynamic_state->ds.stencil.front.op.compare != VK_COMPARE_OP_ALWAYS ||
6268 dynamic_state->ds.stencil.back.op.compare != VK_COMPARE_OP_ALWAYS)) {
6269 sub_cmd->stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
6270 }
6271
6272 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6273 compute_overlap)) {
6274 uint32_t coefficient_size =
6275 DIV_ROUND_UP(fragment_state->coefficient_size,
6276 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_UNIT_SIZE));
6277
6278 if (coefficient_size >
6279 PVRX(TA_STATE_PDS_SIZEINFO1_USC_VARYINGSIZE_MAX_SIZE))
6280 sub_cmd->disable_compute_overlap = true;
6281 }
6282
6283 sub_cmd->frag_uses_atomic_ops |= fragment_state->uses_atomic_ops;
6284 sub_cmd->frag_has_side_effects |= fragment_state->has_side_effects;
6285 sub_cmd->frag_uses_texture_rw |= fragment_state->uses_texture_rw;
6286 sub_cmd->vertex_uses_texture_rw |= vertex_state->uses_texture_rw;
6287
6288 sub_cmd->job.get_vis_results = state->vis_test_enabled;
6289
6290 fstencil_keep =
6291 (dynamic_state->ds.stencil.front.op.fail == VK_STENCIL_OP_KEEP) &&
6292 (dynamic_state->ds.stencil.front.op.pass == VK_STENCIL_OP_KEEP);
6293 bstencil_keep =
6294 (dynamic_state->ds.stencil.back.op.fail == VK_STENCIL_OP_KEEP) &&
6295 (dynamic_state->ds.stencil.back.op.pass == VK_STENCIL_OP_KEEP);
6296 fstencil_writemask_zero = (dynamic_state->ds.stencil.front.write_mask == 0);
6297 bstencil_writemask_zero = (dynamic_state->ds.stencil.back.write_mask == 0);
6298
6299 /* Set the stencil modified flag unless:
6300 * - both the front and back-facing fail_op and pass_op are KEEP, or
6301 * - both the front and back-facing stencil write masks are zero.
6302 */
6303 if (!(fstencil_keep && bstencil_keep) &&
6304 !(fstencil_writemask_zero && bstencil_writemask_zero)) {
6305 sub_cmd->modifies_stencil = true;
6306 }
6307
6308 /* Set depth modified flag if depth write is enabled. */
6309 if (dynamic_state->ds.depth.write_enable)
6310 sub_cmd->modifies_depth = true;
6311
6312 /* If either the data or code changes for pds vertex attribs, regenerate the
6313 * data segment.
6314 */
6315 if (state->dirty.vertex_bindings || state->dirty.gfx_pipeline_binding ||
6316 state->dirty.draw_variant || state->dirty.draw_base_instance) {
6317 enum pvr_pds_vertex_attrib_program_type prog_type;
6318 const struct pvr_pds_attrib_program *program;
6319
6320 if (state->draw_state.draw_indirect)
6321 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT;
6322 else if (state->draw_state.base_instance)
6323 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE;
6324 else
6325 prog_type = PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC;
6326
6327 program =
6328 &gfx_pipeline->shader_state.vertex.pds_attrib_programs[prog_type];
6329 state->pds_shader.info = &program->info;
6330 state->pds_shader.code_offset = program->program.code_offset;
6331
6332 state->max_shared_regs =
6333 MAX2(state->max_shared_regs, pvr_calc_shared_regs_count(gfx_pipeline));
6334
6335 pvr_setup_vertex_buffers(cmd_buffer, gfx_pipeline);
6336 }
6337
6338 if (state->push_constants.dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS) {
6339 result = pvr_cmd_upload_push_consts(cmd_buffer);
6340 if (result != VK_SUCCESS)
6341 return result;
6342 }
6343
6344 state->dirty.vertex_descriptors = state->dirty.gfx_pipeline_binding;
6345 state->dirty.fragment_descriptors = state->dirty.vertex_descriptors;
6346
6347 /* Account for dirty descriptor set. */
6348 state->dirty.vertex_descriptors |=
6349 state->dirty.gfx_desc_dirty &&
6350 pipeline_layout
6351 ->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
6352 state->dirty.fragment_descriptors |=
6353 state->dirty.gfx_desc_dirty &&
6354 pipeline_layout->per_stage_descriptor_masks[PVR_STAGE_ALLOCATION_FRAGMENT];
6355
6356 if (BITSET_TEST(dynamic_state->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
6357 state->dirty.fragment_descriptors = true;
6358
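/* Push constant updates appear to be consumed through the per-stage
 * descriptor (PDS) data sections, so a dirty push-constant stage also
 * dirties that stage's descriptor state.
 */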
6359 state->dirty.vertex_descriptors |=
6360 state->push_constants.dirty_stages &
6361 (VK_SHADER_STAGE_ALL_GRAPHICS & ~VK_SHADER_STAGE_FRAGMENT_BIT);
6362 state->dirty.fragment_descriptors |= state->push_constants.dirty_stages &
6363 VK_SHADER_STAGE_FRAGMENT_BIT;
6364
6365 if (state->dirty.fragment_descriptors) {
6366 result = pvr_setup_descriptor_mappings(
6367 cmd_buffer,
6368 PVR_STAGE_ALLOCATION_FRAGMENT,
6369 &state->gfx_pipeline->shader_state.fragment.descriptor_state,
6370 NULL,
6371 &state->pds_fragment_descriptor_data_offset);
6372 if (result != VK_SUCCESS) {
6373 mesa_loge("Could not setup fragment descriptor mappings.");
6374 return result;
6375 }
6376 }
6377
6378 if (state->dirty.vertex_descriptors) {
6379 uint32_t pds_vertex_descriptor_data_offset;
6380
6381 result = pvr_setup_descriptor_mappings(
6382 cmd_buffer,
6383 PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
6384 &state->gfx_pipeline->shader_state.vertex.descriptor_state,
6385 NULL,
6386 &pds_vertex_descriptor_data_offset);
6387 if (result != VK_SUCCESS) {
6388 mesa_loge("Could not setup vertex descriptor mappings.");
6389 return result;
6390 }
6391
6392 pvr_emit_dirty_pds_state(cmd_buffer,
6393 sub_cmd,
6394 pds_vertex_descriptor_data_offset);
6395 }
6396
6397 pvr_emit_dirty_ppp_state(cmd_buffer, sub_cmd);
6398 pvr_emit_dirty_vdm_state(cmd_buffer, sub_cmd);
6399
6400 vk_dynamic_graphics_state_clear_dirty(dynamic_state);
6401 state->dirty.gfx_desc_dirty = false;
6402 state->dirty.draw_base_instance = false;
6403 state->dirty.draw_variant = false;
6404 state->dirty.fragment_descriptors = false;
6405 state->dirty.gfx_pipeline_binding = false;
6406 state->dirty.isp_userpass = false;
6407 state->dirty.vertex_bindings = false;
6408 state->dirty.vis_test = false;
6409
6410 state->push_constants.dirty_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
6411
6412 return VK_SUCCESS;
6413 }
6414
6415 static uint32_t pvr_get_hw_primitive_topology(VkPrimitiveTopology topology)
6416 {
6417 switch (topology) {
6418 case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
6419 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_POINT_LIST);
6420 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
6421 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST);
6422 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
6423 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP);
6424 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
6425 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
6426 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
6427 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP);
6428 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
6429 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_FAN);
6430 case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
6431 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_LIST_ADJ);
6432 case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
6433 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_LINE_STRIP_ADJ);
6434 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
6435 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST_ADJ);
6436 case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
6437 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_STRIP_ADJ);
6438 case VK_PRIMITIVE_TOPOLOGY_PATCH_LIST:
6439 return PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_PATCH_LIST);
6440 default:
6441 unreachable("Undefined primitive topology");
6442 }
6443 }
6444
6445 /* TODO: Rewrite this in terms of ALIGN_POT() and pvr_cmd_length(). */
6446 /* Aligned to 128 bit for PDS loads / stores */
6447 #define DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE 8
6448
6449 static VkResult
6450 pvr_write_draw_indirect_vdm_stream(struct pvr_cmd_buffer *cmd_buffer,
6451 struct pvr_csb *const csb,
6452 pvr_dev_addr_t idx_buffer_addr,
6453 uint32_t idx_stride,
6454 struct PVRX(VDMCTRL_INDEX_LIST0) * list_hdr,
6455 struct pvr_buffer *buffer,
6456 VkDeviceSize offset,
6457 uint32_t count,
6458 uint32_t stride)
6459 {
6460 struct pvr_pds_drawindirect_program pds_prog = { 0 };
6461 uint32_t word0;
6462
6463 /* Draw indirect always has index offset and instance count. */
6464 list_hdr->index_offset_present = true;
6465 list_hdr->index_instance_count_present = true;
6466
6467 pvr_cmd_pack(VDMCTRL_INDEX_LIST0)(&word0, list_hdr);
6468
6469 pds_prog.support_base_instance = true;
6470 pds_prog.arg_buffer = buffer->dev_addr.addr + offset;
6471 pds_prog.index_buffer = idx_buffer_addr.addr;
6472 pds_prog.index_block_header = word0;
6473 pds_prog.index_stride = idx_stride;
6474 pds_prog.num_views = 1U;
6475
6476 /* TODO: See if we can pre-upload the code section of all the pds programs
6477 * and reuse them here.
6478 */
6479 /* Generate and upload the PDS programs (code + data). */
6480 for (uint32_t i = 0U; i < count; i++) {
6481 const struct pvr_device_info *dev_info =
6482 &cmd_buffer->device->pdevice->dev_info;
6483 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6484 struct pvr_suballoc_bo *dummy_bo;
6485 struct pvr_suballoc_bo *pds_bo;
6486 uint32_t *dummy_stream;
6487 uint32_t *pds_base;
6488 uint32_t pds_size;
6489 VkResult result;
6490
6491 /* TODO: Move this outside the loop and allocate all of them in one go? */
6492 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6493 cmd_buffer->device->heaps.general_heap,
6494 DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE,
6495 &dummy_bo);
6496 if (result != VK_SUCCESS)
6497 return result;
6498
6499 pds_prog.increment_draw_id = (i != 0);
6500 pds_prog.index_list_addr_buffer = dummy_bo->dev_addr.addr;
6501
6502 if (state->draw_state.draw_indexed) {
6503 pvr_pds_generate_draw_elements_indirect(&pds_prog,
6504 0,
6505 PDS_GENERATE_SIZES,
6506 dev_info);
6507 } else {
6508 pvr_pds_generate_draw_arrays_indirect(&pds_prog,
6509 0,
6510 PDS_GENERATE_SIZES,
6511 dev_info);
6512 }
6513
6514 pds_size = PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned +
6515 pds_prog.program.code_size_aligned);
6516
6517 result = pvr_cmd_buffer_alloc_mem(cmd_buffer,
6518 cmd_buffer->device->heaps.pds_heap,
6519 pds_size,
6520 &pds_bo);
6521 if (result != VK_SUCCESS)
6522 return result;
6523
6524 pds_base = pvr_bo_suballoc_get_map_addr(pds_bo);
6525 memcpy(pds_base,
6526 pds_prog.program.code,
6527 PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned));
6528
6529 if (state->draw_state.draw_indexed) {
6530 pvr_pds_generate_draw_elements_indirect(
6531 &pds_prog,
6532 pds_base + pds_prog.program.code_size_aligned,
6533 PDS_GENERATE_DATA_SEGMENT,
6534 dev_info);
6535 } else {
6536 pvr_pds_generate_draw_arrays_indirect(
6537 &pds_prog,
6538 pds_base + pds_prog.program.code_size_aligned,
6539 PDS_GENERATE_DATA_SEGMENT,
6540 dev_info);
6541 }
6542
6543 pvr_csb_set_relocation_mark(csb);
6544
6545 pvr_csb_emit (csb, VDMCTRL_PDS_STATE0, state0) {
6546 state0.usc_target = PVRX(VDMCTRL_USC_TARGET_ANY);
6547
6548 state0.pds_temp_size =
6549 DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.temp_size_aligned),
6550 PVRX(VDMCTRL_PDS_STATE0_PDS_TEMP_SIZE_UNIT_SIZE));
6551
6552 state0.pds_data_size =
6553 DIV_ROUND_UP(PVR_DW_TO_BYTES(pds_prog.program.data_size_aligned),
6554 PVRX(VDMCTRL_PDS_STATE0_PDS_DATA_SIZE_UNIT_SIZE));
6555 }
6556
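/* The VDM PDS state words take offsets relative to the PDS heap, so the
 * heap base address is subtracted from the suballocation's device-virtual
 * address below.
 */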
6557 pvr_csb_emit (csb, VDMCTRL_PDS_STATE1, state1) {
6558 const uint32_t data_offset =
6559 pds_bo->dev_addr.addr +
6560 PVR_DW_TO_BYTES(pds_prog.program.code_size_aligned) -
6561 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6562
6563 state1.pds_data_addr = PVR_DEV_ADDR(data_offset);
6564 state1.sd_type = PVRX(VDMCTRL_SD_TYPE_PDS);
6565 state1.sd_next_type = PVRX(VDMCTRL_SD_TYPE_NONE);
6566 }
6567
6568 pvr_csb_emit (csb, VDMCTRL_PDS_STATE2, state2) {
6569 const uint32_t code_offset =
6570 pds_bo->dev_addr.addr -
6571 cmd_buffer->device->heaps.pds_heap->base_addr.addr;
6572
6573 state2.pds_code_addr = PVR_DEV_ADDR(code_offset);
6574 }
6575
6576 pvr_csb_clear_relocation_mark(csb);
6577
6578 /* We don't really need to set the relocation mark since the following
6579 * state update is just one emit but let's be nice and use it.
6580 */
6581 pvr_csb_set_relocation_mark(csb);
6582
6583 /* Sync task to ensure the VDM doesn't start reading the dummy blocks
6584 * before they are ready.
6585 */
6586 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6587 list0.primitive_topology = PVRX(VDMCTRL_PRIMITIVE_TOPOLOGY_TRI_LIST);
6588 }
6589
6590 pvr_csb_clear_relocation_mark(csb);
6591
6592 dummy_stream = pvr_bo_suballoc_get_map_addr(dummy_bo);
6593
6594 /* For non-indexed draw cmds fill in the dummy's header here (it won't
6595 * change based on the indirect args; indexed draws leave it to be patched)
6596 * and advance by the in-use size of each dummy block.
6597 */
6598 if (!state->draw_state.draw_indexed) {
6599 dummy_stream[0] = word0;
6600 dummy_stream += 4;
6601 } else {
6602 dummy_stream += 5;
6603 }
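/* The indexed path reserves one extra dword in the dummy block, presumably
 * for the INDEX_LIST1 word carrying the patched index base address.
 */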
6604
6605 /* clang-format off */
6606 pvr_csb_pack (dummy_stream, VDMCTRL_STREAM_RETURN, word);
6607 /* clang-format on */
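/* Terminate the dummy block with a STREAM_RETURN so that, after following
 * the link emitted below (with with_return set), the VDM returns to the
 * main control stream.
 */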
6608
6609 pvr_csb_set_relocation_mark(csb);
6610
6611 /* Stream link to the first dummy which forces the VDM to discard any
6612 * prefetched (dummy) control stream.
6613 */
6614 pvr_csb_emit (csb, VDMCTRL_STREAM_LINK0, link) {
6615 link.with_return = true;
6616 link.link_addrmsb = dummy_bo->dev_addr;
6617 }
6618
6619 pvr_csb_emit (csb, VDMCTRL_STREAM_LINK1, link) {
6620 link.link_addrlsb = dummy_bo->dev_addr;
6621 }
6622
6623 pvr_csb_clear_relocation_mark(csb);
6624
6625 /* Point the pds program to the next argument buffer and the next VDM
6626 * dummy buffer.
6627 */
6628 pds_prog.arg_buffer += stride;
6629 }
6630
6631 return VK_SUCCESS;
6632 }
6633
6634 #undef DUMMY_VDM_CONTROL_STREAM_BLOCK_SIZE
6635
6636 static void pvr_emit_vdm_index_list(struct pvr_cmd_buffer *cmd_buffer,
6637 struct pvr_sub_cmd_gfx *const sub_cmd,
6638 VkPrimitiveTopology topology,
6639 uint32_t index_offset,
6640 uint32_t first_index,
6641 uint32_t index_count,
6642 uint32_t instance_count,
6643 struct pvr_buffer *buffer,
6644 VkDeviceSize offset,
6645 uint32_t count,
6646 uint32_t stride)
6647 {
6648 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6649 const bool vertex_shader_has_side_effects =
6650 state->gfx_pipeline->shader_state.vertex.stage_state.has_side_effects;
6651 struct PVRX(VDMCTRL_INDEX_LIST0)
6652 list_hdr = { pvr_cmd_header(VDMCTRL_INDEX_LIST0) };
6653 pvr_dev_addr_t index_buffer_addr = PVR_DEV_ADDR_INVALID;
6654 struct pvr_csb *const csb = &sub_cmd->control_stream;
6655 unsigned int index_stride = 0;
6656
6657 list_hdr.primitive_topology = pvr_get_hw_primitive_topology(topology);
6658
6659 /* firstInstance is not handled here in the VDM state, it's implemented as
6660 * an addition in the PDS vertex fetch using
6661 * PVR_PDS_CONST_MAP_ENTRY_TYPE_BASE_INSTANCE entry type.
6662 */
6663
6664 list_hdr.index_count_present = true;
6665
6666 if (instance_count > 1)
6667 list_hdr.index_instance_count_present = true;
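/* INDEX_LIST3 encodes instance_count - 1 (see below), so the word is only
 * needed when drawing more than one instance.
 */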
6668
6669 if (index_offset)
6670 list_hdr.index_offset_present = true;
6671
6672 if (state->draw_state.draw_indexed) {
6673 list_hdr.index_size =
6674 pvr_vdmctrl_index_size_from_type(state->index_buffer_binding.type);
6675 index_stride = vk_index_type_to_bytes(state->index_buffer_binding.type);
6676
6677 index_buffer_addr = PVR_DEV_ADDR_OFFSET(
6678 state->index_buffer_binding.buffer->dev_addr,
6679 state->index_buffer_binding.offset + first_index * index_stride);
6680
6681 list_hdr.index_addr_present = true;
6682 list_hdr.index_base_addrmsb = index_buffer_addr;
6683 }
6684
6685 list_hdr.degen_cull_enable =
6686 PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
6687 vdm_degenerate_culling) &&
6688 !vertex_shader_has_side_effects;
6689
6690 if (state->draw_state.draw_indirect) {
6691 assert(buffer);
6692 pvr_write_draw_indirect_vdm_stream(cmd_buffer,
6693 csb,
6694 index_buffer_addr,
6695 index_stride,
6696 &list_hdr,
6697 buffer,
6698 offset,
6699 count,
6700 stride);
6701 return;
6702 }
6703
6704 pvr_csb_set_relocation_mark(csb);
6705
6706 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST0, list0) {
6707 list0 = list_hdr;
6708 }
6709
6710 if (list_hdr.index_addr_present) {
6711 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST1, list1) {
6712 list1.index_base_addrlsb = index_buffer_addr;
6713 }
6714 }
6715
6716 if (list_hdr.index_count_present) {
6717 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST2, list2) {
6718 list2.index_count = index_count;
6719 }
6720 }
6721
6722 if (list_hdr.index_instance_count_present) {
6723 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST3, list3) {
6724 list3.instance_count = instance_count - 1;
6725 }
6726 }
6727
6728 if (list_hdr.index_offset_present) {
6729 pvr_csb_emit (csb, VDMCTRL_INDEX_LIST4, list4) {
6730 list4.index_offset = index_offset;
6731 }
6732 }
6733
6734 pvr_csb_clear_relocation_mark(csb);
6735 }
6736
6737 void pvr_CmdDraw(VkCommandBuffer commandBuffer,
6738 uint32_t vertexCount,
6739 uint32_t instanceCount,
6740 uint32_t firstVertex,
6741 uint32_t firstInstance)
6742 {
6743 const struct pvr_cmd_buffer_draw_state draw_state = {
6744 .base_vertex = firstVertex,
6745 .base_instance = firstInstance,
6746 };
6747
6748 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6749 struct vk_dynamic_graphics_state *const dynamic_state =
6750 &cmd_buffer->vk.dynamic_graphics_state;
6751 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6752 VkResult result;
6753
6754 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6755
6756 pvr_update_draw_state(state, &draw_state);
6757
6758 result = pvr_validate_draw_state(cmd_buffer);
6759 if (result != VK_SUCCESS)
6760 return;
6761
6762 /* Write the VDM control stream for the primitive. */
6763 pvr_emit_vdm_index_list(cmd_buffer,
6764 &state->current_sub_cmd->gfx,
6765 dynamic_state->ia.primitive_topology,
6766 firstVertex,
6767 0U,
6768 vertexCount,
6769 instanceCount,
6770 NULL,
6771 0U,
6772 0U,
6773 0U);
6774 }
6775
6776 void pvr_CmdDrawIndexed(VkCommandBuffer commandBuffer,
6777 uint32_t indexCount,
6778 uint32_t instanceCount,
6779 uint32_t firstIndex,
6780 int32_t vertexOffset,
6781 uint32_t firstInstance)
6782 {
6783 const struct pvr_cmd_buffer_draw_state draw_state = {
6784 .base_vertex = vertexOffset,
6785 .base_instance = firstInstance,
6786 .draw_indexed = true,
6787 };
6788
6789 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6790 struct vk_dynamic_graphics_state *const dynamic_state =
6791 &cmd_buffer->vk.dynamic_graphics_state;
6792 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6793 VkResult result;
6794
6795 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6796
6797 pvr_update_draw_state(state, &draw_state);
6798
6799 result = pvr_validate_draw_state(cmd_buffer);
6800 if (result != VK_SUCCESS)
6801 return;
6802
6803 /* Write the VDM control stream for the primitive. */
6804 pvr_emit_vdm_index_list(cmd_buffer,
6805 &state->current_sub_cmd->gfx,
6806 dynamic_state->ia.primitive_topology,
6807 vertexOffset,
6808 firstIndex,
6809 indexCount,
6810 instanceCount,
6811 NULL,
6812 0U,
6813 0U,
6814 0U);
6815 }
6816
6817 void pvr_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
6818 VkBuffer _buffer,
6819 VkDeviceSize offset,
6820 uint32_t drawCount,
6821 uint32_t stride)
6822 {
6823 const struct pvr_cmd_buffer_draw_state draw_state = {
6824 .draw_indirect = true,
6825 .draw_indexed = true,
6826 };
6827
6828 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6829 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6830 struct vk_dynamic_graphics_state *const dynamic_state =
6831 &cmd_buffer->vk.dynamic_graphics_state;
6832 PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6833 VkResult result;
6834
6835 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6836
6837 pvr_update_draw_state(state, &draw_state);
6838
6839 result = pvr_validate_draw_state(cmd_buffer);
6840 if (result != VK_SUCCESS)
6841 return;
6842
6843 /* Write the VDM control stream for the primitive. */
6844 pvr_emit_vdm_index_list(cmd_buffer,
6845 &state->current_sub_cmd->gfx,
6846 dynamic_state->ia.primitive_topology,
6847 0U,
6848 0U,
6849 0U,
6850 0U,
6851 buffer,
6852 offset,
6853 drawCount,
6854 stride);
6855 }
6856
6857 void pvr_CmdDrawIndirect(VkCommandBuffer commandBuffer,
6858 VkBuffer _buffer,
6859 VkDeviceSize offset,
6860 uint32_t drawCount,
6861 uint32_t stride)
6862 {
6863 const struct pvr_cmd_buffer_draw_state draw_state = {
6864 .draw_indirect = true,
6865 };
6866
6867 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6868 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6869 PVR_FROM_HANDLE(pvr_buffer, buffer, _buffer);
6870 struct vk_dynamic_graphics_state *const dynamic_state =
6871 &cmd_buffer->vk.dynamic_graphics_state;
6872 VkResult result;
6873
6874 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6875
6876 pvr_update_draw_state(state, &draw_state);
6877
6878 result = pvr_validate_draw_state(cmd_buffer);
6879 if (result != VK_SUCCESS)
6880 return;
6881
6882 /* Write the VDM control stream for the primitive. */
6883 pvr_emit_vdm_index_list(cmd_buffer,
6884 &state->current_sub_cmd->gfx,
6885 dynamic_state->ia.primitive_topology,
6886 0U,
6887 0U,
6888 0U,
6889 0U,
6890 buffer,
6891 offset,
6892 drawCount,
6893 stride);
6894 }
6895
6896 static VkResult
6897 pvr_resolve_unemitted_resolve_attachments(struct pvr_cmd_buffer *cmd_buffer,
6898 struct pvr_render_pass_info *info)
6899 {
6900 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6901 const struct pvr_renderpass_hwsetup_render *hw_render =
6902 &state->render_pass_info.pass->hw_setup->renders[info->current_hw_subpass];
6903
6904 for (uint32_t i = 0U; i < hw_render->eot_surface_count; i++) {
6905 const struct pvr_renderpass_hwsetup_eot_surface *surface =
6906 &hw_render->eot_surfaces[i];
6907 const uint32_t color_attach_idx = surface->src_attachment_idx;
6908 const uint32_t resolve_attach_idx = surface->attachment_idx;
6909 VkImageSubresourceLayers src_subresource;
6910 VkImageSubresourceLayers dst_subresource;
6911 struct pvr_image_view *dst_view;
6912 struct pvr_image_view *src_view;
6913 VkFormat src_format;
6914 VkFormat dst_format;
6915 VkImageCopy2 region;
6916 VkResult result;
6917
6918 if (!surface->need_resolve ||
6919 surface->resolve_type != PVR_RESOLVE_TYPE_TRANSFER)
6920 continue;
6921
6922 dst_view = info->attachments[resolve_attach_idx];
6923 src_view = info->attachments[color_attach_idx];
6924
6925 src_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6926 src_subresource.mipLevel = src_view->vk.base_mip_level;
6927 src_subresource.baseArrayLayer = src_view->vk.base_array_layer;
6928 src_subresource.layerCount = src_view->vk.layer_count;
6929
6930 dst_subresource.aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
6931 dst_subresource.mipLevel = dst_view->vk.base_mip_level;
6932 dst_subresource.baseArrayLayer = dst_view->vk.base_array_layer;
6933 dst_subresource.layerCount = dst_view->vk.layer_count;
6934
6935 region.srcOffset = (VkOffset3D){ info->render_area.offset.x,
6936 info->render_area.offset.y,
6937 0 };
6938 region.dstOffset = (VkOffset3D){ info->render_area.offset.x,
6939 info->render_area.offset.y,
6940 0 };
6941 region.extent = (VkExtent3D){ info->render_area.extent.width,
6942 info->render_area.extent.height,
6943 1 };
6944
6945 region.srcSubresource = src_subresource;
6946 region.dstSubresource = dst_subresource;
6947
6948 /* TODO: if ERN_46863 is supported, Depth and stencil are sampled
6949 * separately from images with combined depth+stencil. Add logic here to
6950 * handle it using appropriate format from image view.
6951 */
6952 src_format = src_view->vk.image->format;
6953 dst_format = dst_view->vk.image->format;
6954 src_view->vk.image->format = src_view->vk.format;
6955 dst_view->vk.image->format = dst_view->vk.format;
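/* Temporarily swap the image formats for the view formats so the
 * copy/resolve below operates in the view's format; the original formats
 * are restored afterwards.
 */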
6956
6957 result = pvr_copy_or_resolve_color_image_region(
6958 cmd_buffer,
6959 vk_to_pvr_image(src_view->vk.image),
6960 vk_to_pvr_image(dst_view->vk.image),
6961 &region);
6962
6963 src_view->vk.image->format = src_format;
6964 dst_view->vk.image->format = dst_format;
6965
6966 state->current_sub_cmd->transfer.serialize_with_frag = true;
6967
6968 if (result != VK_SUCCESS)
6969 return result;
6970 }
6971
6972 return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
6973 }
6974
6975 void pvr_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
6976 const VkSubpassEndInfo *pSubpassEndInfo)
6977 {
6978 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
6979 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
6980 struct pvr_image_view **attachments;
6981 VkClearValue *clear_values;
6982 VkResult result;
6983
6984 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
6985
6986 assert(state->render_pass_info.pass);
6987 assert(state->render_pass_info.framebuffer);
6988
6989 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
6990 if (result != VK_SUCCESS)
6991 return;
6992
6993 result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer,
6994 &state->render_pass_info);
6995 if (result != VK_SUCCESS)
6996 return;
6997
6998 /* Save the required fields before clearing render_pass_info struct. */
6999 attachments = state->render_pass_info.attachments;
7000 clear_values = state->render_pass_info.clear_values;
7001
7002 memset(&state->render_pass_info, 0, sizeof(state->render_pass_info));
7003
7004 state->render_pass_info.attachments = attachments;
7005 state->render_pass_info.clear_values = clear_values;
7006 }
7007
7008 static VkResult
7009 pvr_execute_deferred_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7010 const struct pvr_cmd_buffer *sec_cmd_buffer)
7011 {
7012 struct vk_dynamic_graphics_state *const dynamic_state =
7013 &cmd_buffer->vk.dynamic_graphics_state;
7014 const uint32_t prim_db_elems =
7015 util_dynarray_num_elements(&cmd_buffer->depth_bias_array,
7016 struct pvr_depth_bias_state);
7017 const uint32_t prim_scissor_elems =
7018 util_dynarray_num_elements(&cmd_buffer->scissor_array,
7019 struct pvr_scissor_words);
7020
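/* The secondary's depth bias/scissor indices are relative to its own
 * arrays. Those arrays are appended to the primary's below, so each index
 * is rebased by the primary's current element counts.
 */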
7021 util_dynarray_foreach (&sec_cmd_buffer->deferred_csb_commands,
7022 struct pvr_deferred_cs_command,
7023 cmd) {
7024 switch (cmd->type) {
7025 case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC: {
7026 const uint32_t scissor_idx =
7027 prim_scissor_elems + cmd->dbsc.state.scissor_index;
7028 const uint32_t db_idx =
7029 prim_db_elems + cmd->dbsc.state.depthbias_index;
7030 const uint32_t num_dwords =
7031 pvr_cmd_length(TA_STATE_HEADER) + pvr_cmd_length(TA_STATE_ISPDBSC);
7032 struct pvr_suballoc_bo *suballoc_bo;
7033 uint32_t ppp_state[num_dwords];
7034 VkResult result;
7035
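/* Pack a two dword PPP update (header + ISPDBSC) with the rebased
 * indices, upload it, and point the deferred VDM PPP-state words at the
 * uploaded buffer.
 */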
7036 pvr_csb_pack (&ppp_state[0], TA_STATE_HEADER, header) {
7037 header.pres_ispctl_dbsc = true;
7038 }
7039
7040 pvr_csb_pack (&ppp_state[1], TA_STATE_ISPDBSC, ispdbsc) {
7041 ispdbsc.dbindex = db_idx;
7042 ispdbsc.scindex = scissor_idx;
7043 }
7044
7045 result = pvr_cmd_buffer_upload_general(cmd_buffer,
7046 &ppp_state[0],
7047 sizeof(ppp_state),
7048 &suballoc_bo);
7049 if (result != VK_SUCCESS)
7050 return result;
7051
7052 pvr_csb_pack (&cmd->dbsc.vdm_state[0], VDMCTRL_PPP_STATE0, state) {
7053 state.word_count = num_dwords;
7054 state.addrmsb = suballoc_bo->dev_addr;
7055 }
7056
7057 pvr_csb_pack (&cmd->dbsc.vdm_state[1], VDMCTRL_PPP_STATE1, state) {
7058 state.addrlsb = suballoc_bo->dev_addr;
7059 }
7060
7061 break;
7062 }
7063
7064 case PVR_DEFERRED_CS_COMMAND_TYPE_DBSC2: {
7065 const uint32_t scissor_idx =
7066 prim_scissor_elems + cmd->dbsc2.state.scissor_index;
7067 const uint32_t db_idx =
7068 prim_db_elems + cmd->dbsc2.state.depthbias_index;
7069
7070 uint32_t *const addr =
7071 (uint32_t *)pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo) +
7072 cmd->dbsc2.patch_offset;
7073
7074 assert(pvr_bo_suballoc_get_map_addr(cmd->dbsc2.ppp_cs_bo));
7075
7076 pvr_csb_pack (addr, TA_STATE_ISPDBSC, ispdbsc) {
7077 ispdbsc.dbindex = db_idx;
7078 ispdbsc.scindex = scissor_idx;
7079 }
7080
7081 break;
7082 }
7083
7084 default:
7085 unreachable("Invalid deferred control stream command type.");
7086 break;
7087 }
7088 }
7089
7090 util_dynarray_append_dynarray(&cmd_buffer->depth_bias_array,
7091 &sec_cmd_buffer->depth_bias_array);
7092
7093 util_dynarray_append_dynarray(&cmd_buffer->scissor_array,
7094 &sec_cmd_buffer->scissor_array);
7095
7096 BITSET_SET(dynamic_state->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
7097 cmd_buffer->scissor_words = (struct pvr_scissor_words){ 0 };
7098
7099 return VK_SUCCESS;
7100 }
7101
7102 /* Caller needs to make sure that it ends the current sub_cmd. This function
7103 * only creates a copy of sec_sub_cmd and links it to the cmd_buffer's
7104 * sub_cmd list.
7105 */
7106 static VkResult pvr_execute_sub_cmd(struct pvr_cmd_buffer *cmd_buffer,
7107 struct pvr_sub_cmd *sec_sub_cmd)
7108 {
7109 struct pvr_sub_cmd *primary_sub_cmd =
7110 vk_zalloc(&cmd_buffer->vk.pool->alloc,
7111 sizeof(*primary_sub_cmd),
7112 8,
7113 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
7114 if (!primary_sub_cmd) {
7115 return vk_command_buffer_set_error(&cmd_buffer->vk,
7116 VK_ERROR_OUT_OF_HOST_MEMORY);
7117 }
7118
7119 primary_sub_cmd->type = sec_sub_cmd->type;
7120 primary_sub_cmd->owned = false;
7121
7122 list_addtail(&primary_sub_cmd->link, &cmd_buffer->sub_cmds);
7123
7124 switch (sec_sub_cmd->type) {
7125 case PVR_SUB_CMD_TYPE_GRAPHICS:
7126 primary_sub_cmd->gfx = sec_sub_cmd->gfx;
7127 break;
7128
7129 case PVR_SUB_CMD_TYPE_OCCLUSION_QUERY:
7130 case PVR_SUB_CMD_TYPE_COMPUTE:
7131 primary_sub_cmd->compute = sec_sub_cmd->compute;
7132 break;
7133
7134 case PVR_SUB_CMD_TYPE_TRANSFER:
7135 primary_sub_cmd->transfer = sec_sub_cmd->transfer;
7136 break;
7137
7138 case PVR_SUB_CMD_TYPE_EVENT:
7139 primary_sub_cmd->event = sec_sub_cmd->event;
7140 break;
7141
7142 default:
7143 unreachable("Unsupported sub-command type");
7144 }
7145
7146 return VK_SUCCESS;
7147 }
7148
7149 static VkResult
7150 pvr_execute_graphics_cmd_buffer(struct pvr_cmd_buffer *cmd_buffer,
7151 const struct pvr_cmd_buffer *sec_cmd_buffer)
7152 {
7153 const struct pvr_device_info *dev_info =
7154 &cmd_buffer->device->pdevice->dev_info;
7155 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7156 struct pvr_sub_cmd *primary_sub_cmd = state->current_sub_cmd;
7157 struct pvr_sub_cmd *first_sec_cmd;
7158 VkResult result;
7159
7160 /* Inherited queries are not supported. */
7161 assert(!state->vis_test_enabled);
7162
7163 if (list_is_empty(&sec_cmd_buffer->sub_cmds))
7164 return VK_SUCCESS;
7165
7166 first_sec_cmd =
7167 list_first_entry(&sec_cmd_buffer->sub_cmds, struct pvr_sub_cmd, link);
7168
7169 /* Kick a render if the secondary uses a different query pool (new results base address). */
7170 if (primary_sub_cmd->gfx.query_pool && first_sec_cmd->gfx.query_pool &&
7171 primary_sub_cmd->gfx.query_pool != first_sec_cmd->gfx.query_pool) {
7172 state->current_sub_cmd->gfx.barrier_store = true;
7173
7174 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7175 if (result != VK_SUCCESS)
7176 return result;
7177
7178 result =
7179 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7180 if (result != VK_SUCCESS)
7181 return result;
7182
7183 primary_sub_cmd = state->current_sub_cmd;
7184
7185 /* Use existing render setup, but load color attachments from HW
7186 * Background object.
7187 */
7188 primary_sub_cmd->gfx.barrier_load = true;
7189 primary_sub_cmd->gfx.barrier_store = false;
7190 }
7191
7192 list_for_each_entry (struct pvr_sub_cmd,
7193 sec_sub_cmd,
7194 &sec_cmd_buffer->sub_cmds,
7195 link) {
7196 /* Only graphics secondary execution supported within a renderpass. */
7197 assert(sec_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7198
7199 if (!sec_sub_cmd->gfx.empty_cmd)
7200 primary_sub_cmd->gfx.empty_cmd = false;
7201
7202 if (sec_sub_cmd->gfx.query_pool) {
7203 primary_sub_cmd->gfx.query_pool = sec_sub_cmd->gfx.query_pool;
7204
7205 util_dynarray_append_dynarray(&state->query_indices,
7206 &sec_sub_cmd->gfx.sec_query_indices);
7207 }
7208
7209 if (pvr_cmd_uses_deferred_cs_cmds(sec_cmd_buffer)) {
7210 /* TODO: If the secondary buffer was created with
7211 * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT then we patch the
7212 * stream and copy it to the primary stream using pvr_csb_copy() below.
7213 * This will need locking if the same secondary command buffer is
7214 * executed in multiple primary buffers at the same time.
7215 */
7216 result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7217 if (result != VK_SUCCESS)
7218 return result;
7219
7220 result = pvr_csb_copy(&primary_sub_cmd->gfx.control_stream,
7221 &sec_sub_cmd->gfx.control_stream);
7222 if (result != VK_SUCCESS)
7223 return pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);
7224 } else {
7225 result = pvr_execute_deferred_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7226 if (result != VK_SUCCESS)
7227 return result;
7228
7229 pvr_csb_emit_link(
7230 &primary_sub_cmd->gfx.control_stream,
7231 pvr_csb_get_start_address(&sec_sub_cmd->gfx.control_stream),
7232 true);
7233 }
7234
7235 if (PVR_HAS_FEATURE(&cmd_buffer->device->pdevice->dev_info,
7236 compute_overlap)) {
7237 primary_sub_cmd->gfx.job.disable_compute_overlap |=
7238 sec_sub_cmd->gfx.job.disable_compute_overlap;
7239 }
7240
7241 primary_sub_cmd->gfx.max_tiles_in_flight =
7242 MIN2(primary_sub_cmd->gfx.max_tiles_in_flight,
7243 sec_sub_cmd->gfx.max_tiles_in_flight);
7244
7245 /* Pass loaded depth/stencil usage from secondary command buffer. */
7246 if (sec_sub_cmd->gfx.depth_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7247 primary_sub_cmd->gfx.depth_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7248
7249 if (sec_sub_cmd->gfx.stencil_usage == PVR_DEPTH_STENCIL_USAGE_NEEDED)
7250 primary_sub_cmd->gfx.stencil_usage = PVR_DEPTH_STENCIL_USAGE_NEEDED;
7251
7252 /* Pass depth/stencil modification state from secondary command buffer. */
7253 if (sec_sub_cmd->gfx.modifies_depth)
7254 primary_sub_cmd->gfx.modifies_depth = true;
7255
7256 if (sec_sub_cmd->gfx.modifies_stencil)
7257 primary_sub_cmd->gfx.modifies_stencil = true;
7258
7259 if (sec_sub_cmd->gfx.barrier_store) {
7260 struct pvr_sub_cmd *sec_next =
7261 list_entry(sec_sub_cmd->link.next, struct pvr_sub_cmd, link);
7262
7263 /* This shouldn't be the last sub cmd. There should be a barrier load
7264 * subsequent to the barrier store.
7265 */
7266 assert(list_last_entry(&sec_cmd_buffer->sub_cmds,
7267 struct pvr_sub_cmd,
7268 link) != sec_sub_cmd);
7269
7270 /* Kick render to store stencil. */
7271 state->current_sub_cmd->gfx.barrier_store = true;
7272 state->current_sub_cmd->gfx.empty_cmd = false;
7273
7274 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7275 if (result != VK_SUCCESS)
7276 return result;
7277
7278 result =
7279 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7280 if (result != VK_SUCCESS)
7281 return result;
7282
7283 primary_sub_cmd = state->current_sub_cmd;
7284
7285 /* Use existing render setup, but load color attachments from HW
7286 * Background object.
7287 */
7288 primary_sub_cmd->gfx.barrier_load = sec_next->gfx.barrier_load;
7289 primary_sub_cmd->gfx.barrier_store = sec_next->gfx.barrier_store;
7290 primary_sub_cmd->gfx.empty_cmd = false;
7291 }
7292
7293 if (!PVR_HAS_FEATURE(dev_info, gs_rta_support)) {
7294 util_dynarray_append_dynarray(&cmd_buffer->deferred_clears,
7295 &sec_cmd_buffer->deferred_clears);
7296 }
7297 }
7298
7299 return VK_SUCCESS;
7300 }
7301
7302 void pvr_CmdExecuteCommands(VkCommandBuffer commandBuffer,
7303 uint32_t commandBufferCount,
7304 const VkCommandBuffer *pCommandBuffers)
7305 {
7306 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7307 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7308 struct pvr_cmd_buffer *last_cmd_buffer;
7309 VkResult result;
7310
7311 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7312
7313 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY);
7314
7315 /* Reset the CPU copy of the most recent PPP state of the primary command
7316 * buffer.
7317 *
7318 * The next draw call in the primary after CmdExecuteCommands may send
7319 * redundant state, if it all goes in the same geom job.
7320 *
7321 * Can't just copy state from the secondary because the recording state of
7322 * the secondary command buffers would have been deleted at this point.
7323 */
7324 pvr_reset_graphics_dirty_state(cmd_buffer, false);
7325
7326 if (state->current_sub_cmd &&
7327 state->current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS) {
7328 for (uint32_t i = 0; i < commandBufferCount; i++) {
7329 PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7330
7331 assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7332
7333 result = pvr_execute_graphics_cmd_buffer(cmd_buffer, sec_cmd_buffer);
7334 if (result != VK_SUCCESS)
7335 return;
7336 }
7337
7338 last_cmd_buffer =
7339 pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7340
7341 /* Set barriers from final command secondary command buffer. */
7342 for (uint32_t i = 0; i != PVR_NUM_SYNC_PIPELINE_STAGES; i++) {
7343 state->barriers_needed[i] |=
7344 last_cmd_buffer->state.barriers_needed[i] &
7345 PVR_PIPELINE_STAGE_ALL_GRAPHICS_BITS;
7346 }
7347 } else {
7348 for (uint32_t i = 0; i < commandBufferCount; i++) {
7349 PVR_FROM_HANDLE(pvr_cmd_buffer, sec_cmd_buffer, pCommandBuffers[i]);
7350
7351 assert(sec_cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
7352
7353 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7354 if (result != VK_SUCCESS)
7355 return;
7356
7357 list_for_each_entry_safe (struct pvr_sub_cmd,
7358 sec_sub_cmd,
7359 &sec_cmd_buffer->sub_cmds,
7360 link) {
7361 result = pvr_execute_sub_cmd(cmd_buffer, sec_sub_cmd);
7362 if (result != VK_SUCCESS)
7363 return;
7364 }
7365 }
7366
7367 last_cmd_buffer =
7368 pvr_cmd_buffer_from_handle(pCommandBuffers[commandBufferCount - 1]);
7369
7370 memcpy(state->barriers_needed,
7371 last_cmd_buffer->state.barriers_needed,
7372 sizeof(state->barriers_needed));
7373 }
7374 }
7375
7376 static void pvr_insert_transparent_obj(struct pvr_cmd_buffer *const cmd_buffer,
7377 struct pvr_sub_cmd_gfx *const sub_cmd)
7378 {
7379 struct pvr_device *const device = cmd_buffer->device;
7380 /* Yes we want a copy. The user could be recording multiple command buffers
7381 * in parallel so writing the template in place could cause problems.
7382 */
7383 struct pvr_static_clear_ppp_template clear =
7384 device->static_clear_state.ppp_templates[VK_IMAGE_ASPECT_COLOR_BIT];
7385 uint32_t pds_state[PVR_STATIC_CLEAR_PDS_STATE_COUNT] = { 0 };
7386 struct pvr_csb *csb = &sub_cmd->control_stream;
7387 struct pvr_suballoc_bo *ppp_bo;
7388
7389 assert(clear.requires_pds_state);
7390
7391 /* Patch the template. */
7392
7393 pvr_csb_pack (&pds_state[PVR_STATIC_CLEAR_PPP_PDS_TYPE_SHADERBASE],
7394 TA_STATE_PDS_SHADERBASE,
7395 shaderbase) {
7396 shaderbase.addr = PVR_DEV_ADDR(device->nop_program.pds.data_offset);
7397 }
7398
7399 clear.config.pds_state = &pds_state;
7400
7401 clear.config.ispctl.upass = cmd_buffer->state.render_pass_info.isp_userpass;
7402
7403 /* Emit PPP state from template. */
7404
7405 pvr_emit_ppp_from_template(csb, &clear, &ppp_bo);
7406 list_add(&ppp_bo->link, &cmd_buffer->bo_list);
7407
7408 /* Emit VDM state. */
7409
7410 pvr_emit_clear_words(cmd_buffer, sub_cmd);
7411
7412 /* Reset graphics state. */
7413 pvr_reset_graphics_dirty_state(cmd_buffer, false);
7414 }
7415
7416 static inline struct pvr_render_subpass *
7417 pvr_get_current_subpass(const struct pvr_cmd_buffer_state *const state)
7418 {
7419 const uint32_t subpass_idx = state->render_pass_info.subpass_idx;
7420
7421 return &state->render_pass_info.pass->subpasses[subpass_idx];
7422 }
7423
7424 void pvr_CmdNextSubpass2(VkCommandBuffer commandBuffer,
7425 const VkSubpassBeginInfo *pSubpassBeginInfo,
7426 const VkSubpassEndInfo *pSubpassEndInfo)
7427 {
7428 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7429 struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
7430 struct pvr_render_pass_info *rp_info = &state->render_pass_info;
7431 const struct pvr_renderpass_hwsetup_subpass *hw_subpass;
7432 struct pvr_renderpass_hwsetup_render *next_hw_render;
7433 const struct pvr_render_pass *pass = rp_info->pass;
7434 const struct pvr_renderpass_hw_map *current_map;
7435 const struct pvr_renderpass_hw_map *next_map;
7436 struct pvr_load_op *hw_subpass_load_op;
7437 VkResult result;
7438
7439 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7440
7441 current_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx];
7442 next_map = &pass->hw_setup->subpass_map[rp_info->subpass_idx + 1];
7443 next_hw_render = &pass->hw_setup->renders[next_map->render];
7444
7445 if (current_map->render != next_map->render) {
7446 result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7447 if (result != VK_SUCCESS)
7448 return;
7449
7450 result = pvr_resolve_unemitted_resolve_attachments(cmd_buffer, rp_info);
7451 if (result != VK_SUCCESS)
7452 return;
7453
7454 rp_info->current_hw_subpass = next_map->render;
7455
7456 result =
7457 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7458 if (result != VK_SUCCESS)
7459 return;
7460
7461 rp_info->enable_bg_tag = false;
7462 rp_info->process_empty_tiles = false;
7463
7464 /* If this subpass contains any load ops the HW Background Object must be
7465 * run to do the clears/loads.
7466 */
7467 if (next_hw_render->color_init_count > 0) {
7468 rp_info->enable_bg_tag = true;
7469
7470 for (uint32_t i = 0; i < next_hw_render->color_init_count; i++) {
7471 /* Empty tiles need to be cleared too. */
7472 if (next_hw_render->color_init[i].op ==
7473 VK_ATTACHMENT_LOAD_OP_CLEAR) {
7474 rp_info->process_empty_tiles = true;
7475 break;
7476 }
7477 }
7478 }
7479
7480 /* Set isp_userpass to zero for new hw_render. This will be used to set
7481 * ROGUE_CR_ISP_CTL::upass_start.
7482 */
7483 rp_info->isp_userpass = 0;
7484 }
7485
7486 hw_subpass = &next_hw_render->subpasses[next_map->subpass];
7487 hw_subpass_load_op = hw_subpass->load_op;
7488
7489 if (hw_subpass_load_op) {
7490 result = pvr_cs_write_load_op(cmd_buffer,
7491 &state->current_sub_cmd->gfx,
7492 hw_subpass_load_op,
7493 rp_info->isp_userpass);
7494 }
7495
7496 /* Pipelines are created for a particular subpass so unbind but leave the
7497 * vertex and descriptor bindings intact as they are orthogonal to the
7498 * subpass.
7499 */
7500 state->gfx_pipeline = NULL;
7501
7502 /* User-pass spawn is 4 bits. If the driver has to wrap it, it emits a
7503 * full-screen transparent object to flush all tags up to this point; the
7504 * user-pass spawn value then implicitly resets to 0 because
7505 * pvr_render_subpass::isp_userpass values are stored ANDed with
7506 * ROGUE_CR_ISP_CTL_UPASS_START_SIZE_MAX.
7507 */
7508 /* If hw_subpass_load_op is valid then the load op written above via
7509 * pvr_cs_write_load_op() has already emitted a full-screen transparent object.
7510 */
7511 if (rp_info->isp_userpass == PVRX(CR_ISP_CTL_UPASS_START_SIZE_MAX) &&
7512 !hw_subpass_load_op) {
7513 pvr_insert_transparent_obj(cmd_buffer, &state->current_sub_cmd->gfx);
7514 }
7515
7516 rp_info->subpass_idx++;
7517
7518 rp_info->isp_userpass = pass->subpasses[rp_info->subpass_idx].isp_userpass;
7519 state->dirty.isp_userpass = true;
7520
7521 rp_info->pipeline_bind_point =
7522 pass->subpasses[rp_info->subpass_idx].pipeline_bind_point;
7523
7524 pvr_stash_depth_format(state, &state->current_sub_cmd->gfx);
7525 }
7526
7527 static bool
7528 pvr_stencil_has_self_dependency(const struct pvr_cmd_buffer_state *const state)
7529 {
7530 const struct pvr_render_subpass *const current_subpass =
7531 pvr_get_current_subpass(state);
7532 const uint32_t *const input_attachments = current_subpass->input_attachments;
7533
7534 if (current_subpass->depth_stencil_attachment == VK_ATTACHMENT_UNUSED)
7535 return false;
7536
7537 /* We only need to check the current software subpass as we don't support
7538 * merging to/from a subpass with self-dep stencil.
7539 */
7540
7541 for (uint32_t i = 0; i < current_subpass->input_count; i++) {
7542 if (input_attachments[i] == current_subpass->depth_stencil_attachment)
7543 return true;
7544 }
7545
7546 return false;
7547 }
7548
7549 static bool pvr_is_stencil_store_load_needed(
7550 const struct pvr_cmd_buffer *const cmd_buffer,
7551 VkPipelineStageFlags2 vk_src_stage_mask,
7552 VkPipelineStageFlags2 vk_dst_stage_mask,
7553 uint32_t memory_barrier_count,
7554 const VkMemoryBarrier2 *const memory_barriers,
7555 uint32_t image_barrier_count,
7556 const VkImageMemoryBarrier2 *const image_barriers)
7557 {
7558 const struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7559 const uint32_t fragment_test_stages =
7560 VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT |
7561 VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT;
7562 const struct pvr_render_pass *const pass = state->render_pass_info.pass;
7563 const struct pvr_renderpass_hwsetup_render *hw_render;
7564 struct pvr_image_view **const attachments =
7565 state->render_pass_info.attachments;
7566 const struct pvr_image_view *attachment;
7567 uint32_t hw_render_idx;
7568
7569 if (!pass)
7570 return false;
7571
7572 hw_render_idx = state->current_sub_cmd->gfx.hw_render_idx;
7573 hw_render = &pass->hw_setup->renders[hw_render_idx];
7574
7575 if (hw_render->ds_attach_idx == VK_ATTACHMENT_UNUSED)
7576 return false;
7577
7578 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
7579 attachment = attachments[hw_render->ds_attach_idx];
7580 } else {
7581 assert(!attachments);
7582 attachment = NULL;
7583 }
7584
7585 if (!(vk_src_stage_mask & fragment_test_stages) &&
7586 vk_dst_stage_mask & VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT)
7587 return false;
7588
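/* A mid-frag stencil store/load is only needed when a barrier makes
 * depth/stencil attachment writes visible to input attachment reads and
 * the current subpass reads its own depth/stencil attachment.
 */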
7589 for (uint32_t i = 0; i < memory_barrier_count; i++) {
7590 const uint32_t stencil_write_bit =
7591 VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT;
7592 const uint32_t input_attachment_read_bit =
7593 VK_ACCESS_INPUT_ATTACHMENT_READ_BIT;
7594
7595 if (!(memory_barriers[i].srcAccessMask & stencil_write_bit))
7596 continue;
7597
7598 if (!(memory_barriers[i].dstAccessMask & input_attachment_read_bit))
7599 continue;
7600
7601 return pvr_stencil_has_self_dependency(state);
7602 }
7603
7604 for (uint32_t i = 0; i < image_barrier_count; i++) {
7605 PVR_FROM_HANDLE(pvr_image, image, image_barriers[i].image);
7606 const uint32_t stencil_bit = VK_IMAGE_ASPECT_STENCIL_BIT;
7607
7608 if (!(image_barriers[i].subresourceRange.aspectMask & stencil_bit))
7609 continue;
7610
7611 if (attachment && image != vk_to_pvr_image(attachment->vk.image))
7612 continue;
7613
7614 if (!vk_format_has_stencil(image->vk.format))
7615 continue;
7616
7617 return pvr_stencil_has_self_dependency(state);
7618 }
7619
7620 return false;
7621 }
7622
7623 static VkResult
7624 pvr_cmd_buffer_insert_mid_frag_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7625 uint32_t src_stage_mask,
7626 uint32_t dst_stage_mask)
7627 {
7628 VkResult result;
7629
7630 assert(cmd_buffer->state.current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7631
7632 cmd_buffer->state.current_sub_cmd->gfx.empty_cmd = false;
7633
7634 /* Submit graphics job to store stencil. */
7635 cmd_buffer->state.current_sub_cmd->gfx.barrier_store = true;
7636
7637 pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7638 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7639 if (result != VK_SUCCESS)
7640 return result;
7641
7642 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7643 .type = PVR_EVENT_TYPE_BARRIER,
7644 .barrier = {
7645 .in_render_pass = true,
7646 .wait_for_stage_mask = src_stage_mask,
7647 .wait_at_stage_mask = dst_stage_mask,
7648 },
7649 };
7650
7651 pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7652 pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_GRAPHICS);
7653
7654 /* Use existing render setup, but load color attachments from HW BGOBJ */
7655 cmd_buffer->state.current_sub_cmd->gfx.barrier_load = true;
7656 cmd_buffer->state.current_sub_cmd->gfx.barrier_store = false;
7657
7658 return VK_SUCCESS;
7659 }
7660
7661 static VkResult
7662 pvr_cmd_buffer_insert_barrier_event(struct pvr_cmd_buffer *cmd_buffer,
7663 uint32_t src_stage_mask,
7664 uint32_t dst_stage_mask)
7665 {
7666 VkResult result;
7667
7668 result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
7669 if (result != VK_SUCCESS)
7670 return result;
7671
7672 cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
7673 .type = PVR_EVENT_TYPE_BARRIER,
7674 .barrier = {
7675 .wait_for_stage_mask = src_stage_mask,
7676 .wait_at_stage_mask = dst_stage_mask,
7677 },
7678 };
7679
7680 return pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
7681 }
7682
7683 /* This is just enough to handle vkCmdPipelineBarrier().
7684 * TODO: Complete?
7685 */
7686 void pvr_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
7687 const VkDependencyInfo *pDependencyInfo)
7688 {
7689 PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
7690 struct pvr_cmd_buffer_state *const state = &cmd_buffer->state;
7691 const struct pvr_render_pass *const render_pass =
7692 state->render_pass_info.pass;
7693 VkPipelineStageFlags vk_src_stage_mask = 0U;
7694 VkPipelineStageFlags vk_dst_stage_mask = 0U;
7695 bool is_stencil_store_load_needed;
7696 uint32_t required_stage_mask = 0U;
7697 uint32_t src_stage_mask;
7698 uint32_t dst_stage_mask;
7699 bool is_barrier_needed;
7700
7701 PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);
7702
7703 for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++) {
7704 vk_src_stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;
7705 vk_dst_stage_mask |= pDependencyInfo->pMemoryBarriers[i].dstStageMask;
7706 }
7707
7708 for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++) {
7709 vk_src_stage_mask |=
7710 pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;
7711 vk_dst_stage_mask |=
7712 pDependencyInfo->pBufferMemoryBarriers[i].dstStageMask;
7713 }
7714
7715 for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++) {
7716 vk_src_stage_mask |=
7717 pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;
7718 vk_dst_stage_mask |=
7719 pDependencyInfo->pImageMemoryBarriers[i].dstStageMask;
7720 }
7721
7722 src_stage_mask = pvr_stage_mask_src(vk_src_stage_mask);
7723 dst_stage_mask = pvr_stage_mask_dst(vk_dst_stage_mask);
7724
7725 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7726 if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7727 continue;
7728
7729 required_stage_mask |= state->barriers_needed[stage];
7730 }
7731
7732 src_stage_mask &= required_stage_mask;
7733 for (uint32_t stage = 0U; stage != PVR_NUM_SYNC_PIPELINE_STAGES; stage++) {
7734 if (!(dst_stage_mask & BITFIELD_BIT(stage)))
7735 continue;
7736
7737 state->barriers_needed[stage] &= ~src_stage_mask;
7738 }
7739
7740 if (src_stage_mask == 0 || dst_stage_mask == 0) {
7741 is_barrier_needed = false;
7742 } else if (src_stage_mask == PVR_PIPELINE_STAGE_GEOM_BIT &&
7743 dst_stage_mask == PVR_PIPELINE_STAGE_FRAG_BIT) {
7744 /* This is implicit so no need to barrier. */
7745 is_barrier_needed = false;
7746 } else if (src_stage_mask == dst_stage_mask &&
7747 util_bitcount(src_stage_mask) == 1) {
7748 struct pvr_sub_cmd *const current_sub_cmd = state->current_sub_cmd;
7749
7750 switch (src_stage_mask) {
7751 case PVR_PIPELINE_STAGE_FRAG_BIT:
7752 is_barrier_needed = !render_pass;
7753
7754 if (is_barrier_needed)
7755 break;
7756
7757 assert(current_sub_cmd->type == PVR_SUB_CMD_TYPE_GRAPHICS);
7758
7759 /* Flush all fragment work up to this point. */
7760 pvr_insert_transparent_obj(cmd_buffer, &current_sub_cmd->gfx);
7761 break;
7762
7763 case PVR_PIPELINE_STAGE_COMPUTE_BIT:
7764 is_barrier_needed = false;
7765
7766 if (!current_sub_cmd ||
7767 current_sub_cmd->type != PVR_SUB_CMD_TYPE_COMPUTE) {
7768 break;
7769 }
7770
7771 /* Multiple dispatches can be merged into a single job. When back to
7772 * back dispatches have a sequential dependency (Compute -> compute
7773 * pipeline barrier) we need to do the following.
7774 * - Dispatch a kernel which fences all previous memory writes and
7775 * flushes the MADD cache.
7776 * - Issue a compute fence which ensures all previous tasks emitted
7777 * by the compute data master are completed before starting
7778 * anything new.
7779 */
7780
7781 /* Issue Data Fence, Wait for Data Fence (IDFWDF) makes the PDS wait
7782 * for data.
7783 */
7784 pvr_compute_generate_idfwdf(cmd_buffer, &current_sub_cmd->compute);
7785
7786 pvr_compute_generate_fence(cmd_buffer,
7787 &current_sub_cmd->compute,
7788 false);
7789 break;
7790
7791 default:
7792 is_barrier_needed = false;
7793 break;
7794 };
7795 } else {
7796 is_barrier_needed = true;
7797 }
7798
   is_stencil_store_load_needed =
      pvr_is_stencil_store_load_needed(cmd_buffer,
                                       vk_src_stage_mask,
                                       vk_dst_stage_mask,
                                       pDependencyInfo->memoryBarrierCount,
                                       pDependencyInfo->pMemoryBarriers,
                                       pDependencyInfo->imageMemoryBarrierCount,
                                       pDependencyInfo->pImageMemoryBarriers);

   if (is_stencil_store_load_needed) {
      VkResult result;

      result = pvr_cmd_buffer_insert_mid_frag_barrier_event(cmd_buffer,
                                                            src_stage_mask,
                                                            dst_stage_mask);
      if (result != VK_SUCCESS)
         mesa_loge("Failed to insert mid frag barrier event.");
   } else {
      if (is_barrier_needed) {
         VkResult result;

         result = pvr_cmd_buffer_insert_barrier_event(cmd_buffer,
                                                      src_stage_mask,
                                                      dst_stage_mask);
         if (result != VK_SUCCESS)
            mesa_loge("Failed to insert pipeline barrier event.");
      }
   }
}

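/* vkCmdResetEvent2: recorded as an event-type sub command which waits for the
 * given source stages to complete before resetting the event; the actual
 * reset happens when the command buffer is submitted.
 */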
void pvr_CmdResetEvent2(VkCommandBuffer commandBuffer,
                        VkEvent _event,
                        VkPipelineStageFlags2 stageMask)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_event, event, _event);
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS)
      return;

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_RESET,
      .set_reset = {
         .event = event,
         .wait_for_stage_mask = pvr_stage_mask_src(stageMask),
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

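/* vkCmdSetEvent2: the event is signalled once the work in all of the
 * barriers' source stages has completed; only the srcStageMask fields of the
 * dependency info are consulted here.
 *
 * Application-side usage looks roughly like the following (illustrative
 * only; the cmd_buf and event handles are placeholders):
 *
 *    const VkMemoryBarrier2 barrier = {
 *       .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER_2,
 *       .srcStageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT,
 *       .srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT,
 *       .dstStageMask = VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT,
 *       .dstAccessMask = VK_ACCESS_2_SHADER_READ_BIT,
 *    };
 *    const VkDependencyInfo dep_info = {
 *       .sType = VK_STRUCTURE_TYPE_DEPENDENCY_INFO,
 *       .memoryBarrierCount = 1,
 *       .pMemoryBarriers = &barrier,
 *    };
 *
 *    vkCmdSetEvent2(cmd_buf, event, &dep_info);
 */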
void pvr_CmdSetEvent2(VkCommandBuffer commandBuffer,
                      VkEvent _event,
                      const VkDependencyInfo *pDependencyInfo)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   PVR_FROM_HANDLE(pvr_event, event, _event);
   VkPipelineStageFlags2 stage_mask = 0;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS)
      return;

   for (uint32_t i = 0; i < pDependencyInfo->memoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pMemoryBarriers[i].srcStageMask;

   for (uint32_t i = 0; i < pDependencyInfo->bufferMemoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pBufferMemoryBarriers[i].srcStageMask;

   for (uint32_t i = 0; i < pDependencyInfo->imageMemoryBarrierCount; i++)
      stage_mask |= pDependencyInfo->pImageMemoryBarriers[i].srcStageMask;

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_SET,
      .set_reset = {
         .event = event,
         .wait_for_stage_mask = pvr_stage_mask_dst(stage_mask),
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

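/* vkCmdWaitEvents2: records a single wait sub command holding a copy of the
 * event handles together with, for each event, the destination stage mask at
 * which the wait has to happen (derived from the matching VkDependencyInfo).
 */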
void pvr_CmdWaitEvents2(VkCommandBuffer commandBuffer,
                        uint32_t eventCount,
                        const VkEvent *pEvents,
                        const VkDependencyInfo *pDependencyInfos)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_event **events_array;
   uint32_t *stage_masks;
   VkResult result;

   PVR_CHECK_COMMAND_BUFFER_BUILDING_STATE(cmd_buffer);

   VK_MULTIALLOC(ma);
   vk_multialloc_add(&ma, &events_array, __typeof__(*events_array), eventCount);
   vk_multialloc_add(&ma, &stage_masks, __typeof__(*stage_masks), eventCount);

   if (!vk_multialloc_alloc(&ma,
                            &cmd_buffer->vk.pool->alloc,
                            VK_SYSTEM_ALLOCATION_SCOPE_OBJECT)) {
      vk_command_buffer_set_error(&cmd_buffer->vk, VK_ERROR_OUT_OF_HOST_MEMORY);
      return;
   }

   result = pvr_cmd_buffer_start_sub_cmd(cmd_buffer, PVR_SUB_CMD_TYPE_EVENT);
   if (result != VK_SUCCESS) {
      vk_free(&cmd_buffer->vk.pool->alloc, events_array);
      return;
   }

   memcpy(events_array, pEvents, sizeof(*events_array) * eventCount);

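   /* Collapse each event's dependency info into a single destination stage
    * mask to wait at.
    */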
   for (uint32_t i = 0; i < eventCount; i++) {
      const VkDependencyInfo *info = &pDependencyInfos[i];
      VkPipelineStageFlags2 mask = 0;

      for (uint32_t j = 0; j < info->memoryBarrierCount; j++)
         mask |= info->pMemoryBarriers[j].dstStageMask;

      for (uint32_t j = 0; j < info->bufferMemoryBarrierCount; j++)
         mask |= info->pBufferMemoryBarriers[j].dstStageMask;

      for (uint32_t j = 0; j < info->imageMemoryBarrierCount; j++)
         mask |= info->pImageMemoryBarriers[j].dstStageMask;

      stage_masks[i] = pvr_stage_mask_dst(mask);
   }

   cmd_buffer->state.current_sub_cmd->event = (struct pvr_sub_cmd_event){
      .type = PVR_EVENT_TYPE_WAIT,
      .wait = {
         .count = eventCount,
         .events = events_array,
         .wait_at_stage_masks = stage_masks,
      },
   };

   pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
}

void pvr_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                            VkPipelineStageFlags2 stage,
                            VkQueryPool queryPool,
                            uint32_t query)
{
   unreachable("Timestamp queries are not supported.");
}

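/* Ends command buffer recording: if an error was recorded at any point it is
 * reported through vk_command_buffer_end(), otherwise the last open sub
 * command is finalised first.
 */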
VkResult pvr_EndCommandBuffer(VkCommandBuffer commandBuffer)
{
   PVR_FROM_HANDLE(pvr_cmd_buffer, cmd_buffer, commandBuffer);
   struct pvr_cmd_buffer_state *state = &cmd_buffer->state;
   VkResult result;

   if (vk_command_buffer_has_error(&cmd_buffer->vk))
      return vk_command_buffer_end(&cmd_buffer->vk);

   /* TODO: We should be freeing all the resources allocated for recording
    * here.
    */
   util_dynarray_fini(&state->query_indices);

   result = pvr_cmd_buffer_end_sub_cmd(cmd_buffer);
   if (result != VK_SUCCESS)
      pvr_cmd_buffer_set_error_unwarned(cmd_buffer, result);

   return vk_command_buffer_end(&cmd_buffer->vk);
}
7976