/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "broadcom/common/v3d_csd.h"
#include "v3dv_private.h"
#include "util/u_pack_color.h"
#include "vk_common_entrypoints.h"
#include "vk_util.h"

float
v3dv_get_aa_line_width(struct v3dv_pipeline *pipeline,
                       struct v3dv_cmd_buffer *buffer)
{
   float width = buffer->vk.dynamic_graphics_state.rs.line.width;

   /* If line smoothing is enabled then we want to add some extra pixels to
    * the width in order to have some semi-transparent edges.
    */
   if (pipeline->line_smooth)
      width = floorf(M_SQRT2 * width) + 3;

   return width;
}

void
v3dv_job_add_bo(struct v3dv_job *job, struct v3dv_bo *bo)
{
   if (!bo)
      return;

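   /* bo_handle_mask is a small, conservative filter over the BO handles
    * already added to the job: if the BO's bit is not set we know it is not
    * in the set and can skip the lookup, but if it is set we still need to
    * search the set, since different handles may map to the same bit.
    */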
   if (job->bo_handle_mask & bo->handle_bit) {
      if (_mesa_set_search(job->bos, bo))
         return;
   }

   _mesa_set_add(job->bos, bo);
   job->bo_count++;
   job->bo_handle_mask |= bo->handle_bit;
}

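/* Like v3dv_job_add_bo(), but skips the duplicate check. This should only be
 * used when the caller knows the BO cannot already be in the job's BO set
 * (for example, right after allocating it).
 */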
void
v3dv_job_add_bo_unchecked(struct v3dv_job *job, struct v3dv_bo *bo)
{
   assert(bo);
   _mesa_set_add(job->bos, bo);
   job->bo_count++;
   job->bo_handle_mask |= bo->handle_bit;
}

static void
cmd_buffer_init(struct v3dv_cmd_buffer *cmd_buffer,
                struct v3dv_device *device)
{
   /* Do not reset the base object! If we are calling this from a command
    * buffer reset that would reset the loader's dispatch table for the
    * command buffer, and any other relevant info from vk_object_base.
    */
   const uint32_t base_size = sizeof(struct vk_command_buffer);
   uint8_t *cmd_buffer_driver_start = ((uint8_t *) cmd_buffer) + base_size;
   memset(cmd_buffer_driver_start, 0, sizeof(*cmd_buffer) - base_size);

   cmd_buffer->device = device;

   list_inithead(&cmd_buffer->private_objs);
   list_inithead(&cmd_buffer->jobs);

   cmd_buffer->state.subpass_idx = -1;
   cmd_buffer->state.meta.subpass_idx = -1;

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_INITIALIZED;
}

static VkResult
cmd_buffer_create(struct vk_command_pool *pool, VkCommandBufferLevel level,
                  struct vk_command_buffer **cmd_buffer_out)
{
   struct v3dv_device *device =
      container_of(pool->base.device, struct v3dv_device, vk);

   struct v3dv_cmd_buffer *cmd_buffer;
   cmd_buffer = vk_zalloc(&pool->alloc,
                          sizeof(*cmd_buffer),
                          8,
                          VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (cmd_buffer == NULL)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* The level is provided through this callback by the common command
    * buffer allocation path (vk_common_AllocateCommandBuffers), so we can
    * pass it to vk_command_buffer_init directly.
    */
   VkResult result;
   result = vk_command_buffer_init(pool, &cmd_buffer->vk,
                                   &v3dv_cmd_buffer_ops, level);
   if (result != VK_SUCCESS) {
      vk_free(&pool->alloc, cmd_buffer);
      return result;
   }

   cmd_buffer_init(cmd_buffer, device);

   *cmd_buffer_out = &cmd_buffer->vk;

   return VK_SUCCESS;
}

static void
job_destroy_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
          job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);

   v3dv_cl_destroy(&job->bcl);
   v3dv_cl_destroy(&job->rcl);
   v3dv_cl_destroy(&job->indirect);

   /* Since we don't ref BOs when we add them to the command buffer, don't
    * unref them here either. BOs will be freed when their corresponding API
    * objects are destroyed.
    */
   _mesa_set_destroy(job->bos, NULL);

   v3dv_bo_free(job->device, job->tile_alloc);
   v3dv_bo_free(job->device, job->tile_state);
}

static void
job_destroy_cloned_gpu_cl_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CL);

   struct v3dv_cmd_buffer *cmd_buffer = job->cmd_buffer;
   if (job->clone_owns_bcl) {
      /* For suspending jobs in command buffers with the simultaneous use flag
       * we allocate a real copy of the BCL.
       */
      assert(job->suspending &&
             cmd_buffer &&
             (cmd_buffer->usage_flags &
              VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT));
      v3dv_cl_destroy(&job->bcl);
   } else {
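      /* Otherwise the clone only owns the cloned BO structs in the BCL's BO
       * list, not the underlying buffers, so free just those structs here.
       */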
      list_for_each_entry_safe(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
         list_del(&bo->list_link);
         vk_free(&job->device->vk.alloc, bo);
      }
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->rcl.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }

   list_for_each_entry_safe(struct v3dv_bo, bo, &job->indirect.bo_list, list_link) {
      list_del(&bo->list_link);
      vk_free(&job->device->vk.alloc, bo);
   }
}

static void
job_destroy_gpu_csd_resources(struct v3dv_job *job)
{
   assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
   assert(job->cmd_buffer);

   v3dv_cl_destroy(&job->indirect);

   _mesa_set_destroy(job->bos, NULL);

   if (job->csd.shared_memory)
      v3dv_bo_free(job->device, job->csd.shared_memory);
}

void
v3dv_job_destroy(struct v3dv_job *job)
{
   assert(job);

   list_del(&job->list_link);

   /* Cloned jobs don't make deep copies of the original jobs, so they don't
    * own any of their resources. However, they do allocate clones of BO
    * structs, so make sure we free those.
    */
   if (!job->is_clone) {
      switch (job->type) {
      case V3DV_JOB_TYPE_GPU_CL:
      case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE:
         job_destroy_gpu_cl_resources(job);
         break;
      case V3DV_JOB_TYPE_GPU_CSD:
         job_destroy_gpu_csd_resources(job);
         break;
      default:
         break;
      }
   } else {
      /* Cloned jobs */
      if (job->type == V3DV_JOB_TYPE_GPU_CL)
         job_destroy_cloned_gpu_cl_resources(job);
   }

   vk_free(&job->device->vk.alloc, job);
}

void
v3dv_cmd_buffer_add_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                                uint64_t obj,
                                v3dv_cmd_buffer_private_obj_destroy_cb destroy_cb)
{
   struct v3dv_cmd_buffer_private_obj *pobj =
      vk_alloc(&cmd_buffer->device->vk.alloc, sizeof(*pobj), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!pobj) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return;
   }

   pobj->obj = obj;
   pobj->destroy_cb = destroy_cb;

   list_addtail(&pobj->list_link, &cmd_buffer->private_objs);
}

static void
cmd_buffer_destroy_private_obj(struct v3dv_cmd_buffer *cmd_buffer,
                               struct v3dv_cmd_buffer_private_obj *pobj)
{
   assert(pobj && pobj->obj && pobj->destroy_cb);
   pobj->destroy_cb(v3dv_device_to_handle(cmd_buffer->device),
                    pobj->obj,
                    &cmd_buffer->device->vk.alloc);
   list_del(&pobj->list_link);
   vk_free(&cmd_buffer->device->vk.alloc, pobj);
}

static void
cmd_buffer_free_resources(struct v3dv_cmd_buffer *cmd_buffer)
{
   list_for_each_entry_safe(struct v3dv_job, job,
                            &cmd_buffer->jobs, list_link) {
      v3dv_job_destroy(job);
   }

   if (cmd_buffer->state.job)
      v3dv_job_destroy(cmd_buffer->state.job);

   if (cmd_buffer->state.attachments)
      vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer->state.attachments);

   if (cmd_buffer->state.query.end.alloc_count > 0)
      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.query.end.states);

   if (cmd_buffer->push_constants_resource.bo)
      v3dv_bo_free(cmd_buffer->device, cmd_buffer->push_constants_resource.bo);

   list_for_each_entry_safe(struct v3dv_cmd_buffer_private_obj, pobj,
                            &cmd_buffer->private_objs, list_link) {
      cmd_buffer_destroy_private_obj(cmd_buffer, pobj);
   }

   if (cmd_buffer->state.meta.attachments) {
      assert(cmd_buffer->state.meta.attachment_alloc_count > 0);
      vk_free(&cmd_buffer->device->vk.alloc, cmd_buffer->state.meta.attachments);
   }

   v3dv_destroy_dynamic_framebuffer(cmd_buffer);
}

static void
cmd_buffer_destroy(struct vk_command_buffer *vk_cmd_buffer)
{
   struct v3dv_cmd_buffer *cmd_buffer =
      container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk);

   cmd_buffer_free_resources(cmd_buffer);
   vk_command_buffer_finish(&cmd_buffer->vk);
   vk_free(&cmd_buffer->vk.pool->alloc, cmd_buffer);
}

static bool
cmd_buffer_can_merge_subpass(struct v3dv_cmd_buffer *cmd_buffer,
                             uint32_t subpass_idx)
{
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->pass);

   const struct v3dv_physical_device *physical_device =
      cmd_buffer->device->pdevice;

   if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
      return false;

   if (!cmd_buffer->state.job)
      return false;

   if (cmd_buffer->state.job->always_flush)
      return false;

   if (!physical_device->options.merge_jobs)
      return false;

   /* Each render pass starts a new job */
   if (subpass_idx == 0)
      return false;

   /* Two subpasses can be merged in the same job if we can emit a single RCL
    * for them (since the RCL includes the END_OF_RENDERING command that
    * triggers the "render job finished" interrupt). We can do this so long
    * as both subpasses render against the same attachments.
    */
   assert(state->subpass_idx == subpass_idx - 1);
   struct v3dv_subpass *prev_subpass = &state->pass->subpasses[state->subpass_idx];
   struct v3dv_subpass *subpass = &state->pass->subpasses[subpass_idx];

   if (subpass->ds_attachment.attachment !=
       prev_subpass->ds_attachment.attachment)
      return false;

   if (subpass->color_count != prev_subpass->color_count)
      return false;

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      if (subpass->color_attachments[i].attachment !=
          prev_subpass->color_attachments[i].attachment) {
         return false;
      }
   }

   /* Don't merge if the subpasses have different view masks, since in that
    * case the framebuffer setup is different and we need to emit different
    * RCLs.
    */
   if (subpass->view_mask != prev_subpass->view_mask)
      return false;

   /* FIXME: Since some attachment formats can't be resolved using the TLB we
    * need to emit separate resolve jobs for them and that would not be
    * compatible with subpass merges. We could fix that by testing if any of
    * the attachments to resolve doesn't support TLB resolves.
    */
   if (prev_subpass->resolve_attachments || subpass->resolve_attachments ||
       prev_subpass->resolve_depth || prev_subpass->resolve_stencil ||
       subpass->resolve_depth || subpass->resolve_stencil) {
      return false;
   }

   return true;
}

/**
 * Computes and sets the job frame tiling information required to setup frame
 * binning and rendering.
 */
static struct v3dv_frame_tiling *
job_compute_frame_tiling(struct v3dv_job *job,
                         uint32_t width,
                         uint32_t height,
                         uint32_t layers,
                         uint32_t render_target_count,
                         uint8_t max_internal_bpp,
                         uint8_t total_color_bpp,
                         bool msaa,
                         bool double_buffer)
{
   assert(job);
   struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   tiling->width = width;
   tiling->height = height;
   tiling->layers = layers;
   tiling->render_target_count = render_target_count;
   tiling->msaa = msaa;
   tiling->internal_bpp = max_internal_bpp;
   tiling->total_color_bpp = total_color_bpp;
   tiling->double_buffer = double_buffer;

   /* Double-buffer is incompatible with MSAA */
   assert(!tiling->msaa || !tiling->double_buffer);

   v3d_choose_tile_size(&job->device->devinfo,
                        render_target_count,
                        max_internal_bpp, total_color_bpp, msaa,
                        tiling->double_buffer,
                        &tiling->tile_width, &tiling->tile_height);

   tiling->draw_tiles_x = DIV_ROUND_UP(width, tiling->tile_width);
   tiling->draw_tiles_y = DIV_ROUND_UP(height, tiling->tile_height);

   /* Size up our supertiles until we get under the limit */
   const uint32_t max_supertiles = 256;
   tiling->supertile_width = 1;
   tiling->supertile_height = 1;
   for (;;) {
      tiling->frame_width_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_x, tiling->supertile_width);
      tiling->frame_height_in_supertiles =
         DIV_ROUND_UP(tiling->draw_tiles_y, tiling->supertile_height);
      const uint32_t num_supertiles = tiling->frame_width_in_supertiles *
                                      tiling->frame_height_in_supertiles;
      if (num_supertiles < max_supertiles)
         break;

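      /* Grow the smaller dimension first so supertiles stay roughly square. */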
      if (tiling->supertile_width < tiling->supertile_height)
         tiling->supertile_width++;
      else
         tiling->supertile_height++;
   }

   return tiling;
}

bool
v3dv_job_allocate_tile_state(struct v3dv_job *job)
{
   struct v3dv_frame_tiling *tiling = &job->frame_tiling;
   const uint32_t layers =
      job->allocate_tile_state_for_all_layers ? tiling->layers : 1;

   /* The PTB will request the tile alloc initial size per tile at start
    * of tile binning.
    */
   uint32_t tile_alloc_size = 64 * layers *
                              tiling->draw_tiles_x *
                              tiling->draw_tiles_y;

   /* The PTB allocates in aligned 4k chunks after the initial setup. */
   tile_alloc_size = align(tile_alloc_size, 4096);

   /* Include the first two chunk allocations that the PTB does so that
    * we definitely clear the OOM condition before triggering one (the HW
    * won't trigger OOM during the first allocations).
    */
   tile_alloc_size += 8192;

   /* For performance, allocate some extra initial memory after the PTB's
    * minimal allocations, so that we hopefully don't have to block the
    * GPU on the kernel handling an OOM signal.
    */
   tile_alloc_size += 512 * 1024;

   job->tile_alloc = v3dv_bo_alloc(job->device, tile_alloc_size,
                                   "tile_alloc", true);
   if (!job->tile_alloc) {
      v3dv_flag_oom(NULL, job);
      return false;
   }

   v3dv_job_add_bo_unchecked(job, job->tile_alloc);

   const uint32_t tsda_per_tile_size = 256;
   const uint32_t tile_state_size = layers *
                                    tiling->draw_tiles_x *
                                    tiling->draw_tiles_y *
                                    tsda_per_tile_size;
   job->tile_state = v3dv_bo_alloc(job->device, tile_state_size, "TSDA", true);
   if (!job->tile_state) {
      v3dv_flag_oom(NULL, job);
      return false;
   }

   v3dv_job_add_bo_unchecked(job, job->tile_state);
   return true;
}

void
v3dv_job_start_frame(struct v3dv_job *job,
                     uint32_t width,
                     uint32_t height,
                     uint32_t layers,
                     bool allocate_tile_state_for_all_layers,
                     bool allocate_tile_state_now,
                     uint32_t render_target_count,
                     uint8_t max_internal_bpp,
                     uint8_t total_color_bpp,
                     bool msaa)
{
   assert(job);

   /* Start by computing frame tiling spec for this job assuming that
    * double-buffer mode is disabled.
    */
   const struct v3dv_frame_tiling *tiling =
      job_compute_frame_tiling(job, width, height, layers,
                               render_target_count, max_internal_bpp,
                               total_color_bpp, msaa, false);

   v3dv_cl_ensure_space_with_branch(&job->bcl, 256);
   v3dv_return_if_oom(NULL, job);

   job->allocate_tile_state_for_all_layers = allocate_tile_state_for_all_layers;

   /* For subpass jobs we postpone tile state allocation until we are finishing
    * the job and have made a decision about double-buffer.
    */
   if (allocate_tile_state_now) {
      if (!v3dv_job_allocate_tile_state(job))
         return;
   }

   v3dv_X(job->device, job_emit_binning_prolog)(job, tiling,
      allocate_tile_state_for_all_layers ? tiling->layers : 1);

   job->ez_state = V3D_EZ_UNDECIDED;
   job->first_ez_state = V3D_EZ_UNDECIDED;
}

static bool
job_should_enable_double_buffer(struct v3dv_job *job)
{
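   /* Note: the geometry and render score cutoffs below are heuristic
    * thresholds over the scores accumulated while recording the job;
    * presumably they have been tuned empirically for this hardware.
    */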
   /* Incompatibility with double-buffer */
   if (!job->can_use_double_buffer)
      return false;

   /* Too much geometry processing */
   if (job->double_buffer_score.geom > 2000000)
      return false;

   /* Too little rendering to make up for tile store latency */
   if (job->double_buffer_score.render < 100000)
      return false;

   return true;
}

static void
cmd_buffer_end_render_pass_frame(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   assert(job);

   /* For subpass jobs we always emit the RCL here */
   assert(v3dv_cl_offset(&job->rcl) == 0);

   /* Only emit RCL for the first job in a suspend/resume chain */
   if (!job->resuming) {
      /* Decide if we want to enable double-buffer for this job. If we do, then
       * we need to rewrite the TILE_BINNING_MODE_CFG packet in the BCL.
       */
      if (job_should_enable_double_buffer(job)) {
         assert(!job->frame_tiling.double_buffer);
         job_compute_frame_tiling(job,
                                  job->frame_tiling.width,
                                  job->frame_tiling.height,
                                  job->frame_tiling.layers,
                                  job->frame_tiling.render_target_count,
                                  job->frame_tiling.internal_bpp,
                                  job->frame_tiling.total_color_bpp,
                                  job->frame_tiling.msaa,
                                  true);

         v3dv_X(job->device, job_emit_enable_double_buffer)(job);
      }

      /* At this point we have decided whether we want to use double-buffer or
       * not and the job's frame tiling represents that decision so we can
       * allocate the tile state, which we need to do before we emit the RCL.
       */
      v3dv_job_allocate_tile_state(job);

      v3dv_X(cmd_buffer->device, cmd_buffer_emit_render_pass_rcl)(cmd_buffer);
   }

   /* Only emit the binning flush for the last job in resume/suspend chain */
   if (!job->suspending)
      v3dv_X(cmd_buffer->device, job_emit_binning_flush)(job);
}

struct v3dv_job *
v3dv_cmd_buffer_create_cpu_job(struct v3dv_device *device,
                               enum v3dv_job_type type,
                               struct v3dv_cmd_buffer *cmd_buffer,
                               uint32_t subpass_idx)
{
   struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, device, cmd_buffer, subpass_idx);
   return job;
}

static void
cmd_buffer_emit_end_query_cpu(struct v3dv_cmd_buffer *cmd_buffer,
                              struct v3dv_query_pool *pool,
                              uint32_t query, uint32_t count)
{
   assert(pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
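   /* Unlike occlusion queries, which are ended with a GPU availability write
    * (see cmd_buffer_add_pending_jobs), performance queries are ended with a
    * CPU job recorded into the command buffer.
    */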

   struct v3dv_job *job =
      v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
                                     V3DV_JOB_TYPE_CPU_END_QUERY,
                                     cmd_buffer, -1);
   v3dv_return_if_oom(cmd_buffer, NULL);

   job->cpu.query_end.pool = pool;
   job->cpu.query_end.query = query;
   job->cpu.query_end.count = count;
   list_addtail(&job->list_link, &cmd_buffer->jobs);
}

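/* "Pending jobs" are the deferred query-end operations that are emitted once
 * the current GPU job has been added to the command buffer (see
 * v3dv_cmd_buffer_finish_job).
 */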
static inline bool
cmd_buffer_has_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer)
{
   return cmd_buffer->state.query.end.used_count > 0;
}

static void
cmd_buffer_add_pending_jobs(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const uint32_t count = state->query.end.used_count;
   for (uint32_t i = 0; i < count; i++) {
      assert(i < state->query.end.used_count);
      struct v3dv_end_query_info *info = &state->query.end.states[i];
      if (info->pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
         v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, info->pool,
                                                     info->query, info->count, 1);
      } else {
         cmd_buffer_emit_end_query_cpu(cmd_buffer, info->pool,
                                       info->query, info->count);
      }
   }
   state->query.end.used_count = 0;
}

void
v3dv_cmd_buffer_finish_job(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_job *job = cmd_buffer->state.job;
   if (!job)
      return;

   if (cmd_buffer->state.oom) {
      v3dv_job_destroy(job);
      cmd_buffer->state.job = NULL;
      return;
   }

   /* If we have created a job for a command buffer then we should have
    * recorded something into it: if the job was started in a render pass, it
    * should at least have the start frame commands; otherwise, it should have
    * a transfer command. The only exception is secondary command buffers
    * inside a render pass.
    *
    * With dynamic rendering there is also the possibility that we resume a
    * suspended pass with an empty job. In that case, we need to ensure the
    * empty job is still a valid command list, which we will ensure when we
    * add the binning flush right below, which only happens if this is the
    * last job in the resume/suspend chain. If it is not the last then we know
    * it must at least have the BRANCH instruction to link with a follow-up
    * resume job.
    */
   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY ||
          (job->resuming && !job->suspending) ||
          v3dv_cl_offset(&job->bcl) > 0);

   /* When we merge multiple subpasses into the same job we must only emit one
    * RCL, so we do that here, once we have decided that we need to finish the
    * job. Any rendering that happens outside a render pass is never merged, so
    * the RCL should have been emitted by the time we got here.
    */
   assert(v3dv_cl_offset(&job->rcl) != 0 || cmd_buffer->state.pass);

   if (!(cmd_buffer->state.barrier.dst_mask & V3DV_BARRIER_GRAPHICS_BIT)) {
      cmd_buffer->state.barrier.bcl_buffer_access = 0;
      cmd_buffer->state.barrier.bcl_image_access = 0;
   }

   /* If we are finishing a job inside a render pass we have two scenarios:
    *
    * 1. It is a regular CL, in which case we will submit the job to the GPU,
    *    so we may need to generate an RCL and add a binning flush.
    *
    * 2. It is a partial CL recorded in a secondary command buffer, in which
    *    case we are not submitting it directly to the GPU but rather branch to
    *    it from a primary command buffer. In this case we just want to end
    *    the BCL with a RETURN_FROM_SUB_LIST and the RCL and binning flush
    *    will be the primary job that branches to this CL.
    */
   if (cmd_buffer->state.pass) {
      if (job->type == V3DV_JOB_TYPE_GPU_CL) {
         cmd_buffer_end_render_pass_frame(cmd_buffer);
      } else {
         assert(job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
         v3dv_X(cmd_buffer->device, cmd_buffer_end_render_pass_secondary)(cmd_buffer);
      }
   }

   bool suspending = job->suspending;
   list_addtail(&job->list_link, &cmd_buffer->jobs);
   cmd_buffer->state.job = NULL;

   /* If we have recorded any state with this last GPU job that requires us to
    * emit jobs after the job is completed, add them now. The only exception
    * is secondary command buffers inside a render pass, because in that case
    * we want to defer this until we finish recording the primary job into
    * which we execute the secondary.
    */
   if (!suspending) {
      if (cmd_buffer_has_pending_jobs(cmd_buffer) &&
          (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY ||
           !cmd_buffer->state.pass)) {
         cmd_buffer_add_pending_jobs(cmd_buffer);
      }
   }
}

bool
v3dv_job_type_is_gpu(struct v3dv_job *job)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
   case V3DV_JOB_TYPE_GPU_CL_INCOMPLETE:
   case V3DV_JOB_TYPE_GPU_TFU:
   case V3DV_JOB_TYPE_GPU_CSD:
      return true;
   default:
      return false;
   }
}

static void
cmd_buffer_serialize_job_if_needed(struct v3dv_cmd_buffer *cmd_buffer,
                                   struct v3dv_job *job)
{
   assert(cmd_buffer && job);

   /* Serialization only affects GPU jobs; CPU jobs are always automatically
    * serialized.
    */
   if (!v3dv_job_type_is_gpu(job))
      return;

   uint8_t barrier_mask = cmd_buffer->state.barrier.dst_mask;
   if (barrier_mask == 0)
      return;

   uint8_t bit = 0;
   uint8_t *src_mask;
   if (job->type == V3DV_JOB_TYPE_GPU_CSD) {
      assert(!job->is_transfer);
      bit = V3DV_BARRIER_COMPUTE_BIT;
      src_mask = &cmd_buffer->state.barrier.src_mask_compute;
   } else if (job->is_transfer) {
      assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
             job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE ||
             job->type == V3DV_JOB_TYPE_GPU_TFU);
      bit = V3DV_BARRIER_TRANSFER_BIT;
      src_mask = &cmd_buffer->state.barrier.src_mask_transfer;
   } else {
      assert(job->type == V3DV_JOB_TYPE_GPU_CL ||
             job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
      bit = V3DV_BARRIER_GRAPHICS_BIT;
      src_mask = &cmd_buffer->state.barrier.src_mask_graphics;
   }

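   /* If a barrier is pending for this job's class, serialize the job against
    * the recorded source mask and consume the barrier for that class.
    */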
   if (barrier_mask & bit) {
      job->serialize = *src_mask;
      *src_mask = 0;
      cmd_buffer->state.barrier.dst_mask &= ~bit;
   }
}

void
v3dv_job_init(struct v3dv_job *job,
              enum v3dv_job_type type,
              struct v3dv_device *device,
              struct v3dv_cmd_buffer *cmd_buffer,
              int32_t subpass_idx)
{
   assert(job);

   /* Make sure we haven't made this new job current before calling here */
   assert(!cmd_buffer || cmd_buffer->state.job != job);

   job->type = type;

   job->device = device;
   job->cmd_buffer = cmd_buffer;

   list_inithead(&job->list_link);

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE ||
       type == V3DV_JOB_TYPE_GPU_CSD) {
      job->bos =
         _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal);
      job->bo_count = 0;

      v3dv_cl_init(job, &job->indirect);

      if (V3D_DBG(ALWAYS_FLUSH))
         job->always_flush = true;
   }

   if (type == V3DV_JOB_TYPE_GPU_CL ||
       type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) {
      v3dv_cl_init(job, &job->bcl);
      v3dv_cl_init(job, &job->rcl);
   }

   if (cmd_buffer) {
      /* Flag all state as dirty. Generally, we need to re-emit state for each
       * new job.
       *
       * FIXME: there may be some exceptions, in which case we could skip some
       * bits.
       */
      cmd_buffer->state.dirty = ~0;
      cmd_buffer->state.dirty_descriptor_stages = ~0;
      vk_dynamic_graphics_state_dirty_all(&cmd_buffer->vk.dynamic_graphics_state);

      /* Honor inheritance of occlusion queries in secondaries if requested */
      if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
          cmd_buffer->state.inheritance.occlusion_query_enable) {
         cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
      }

      /* Keep track of the first subpass that we are recording in this new job.
       * We will use this when we emit the RCL to decide how to emit our loads
       * and stores.
       */
      if (cmd_buffer->state.pass)
         job->first_subpass = subpass_idx;

      job->is_transfer = cmd_buffer->state.is_transfer;

      cmd_buffer_serialize_job_if_needed(cmd_buffer, job);

      job->perf = cmd_buffer->state.query.active_query.perf;
   }
}

struct v3dv_job *
v3dv_cmd_buffer_start_job(struct v3dv_cmd_buffer *cmd_buffer,
                          int32_t subpass_idx,
                          enum v3dv_job_type type)
{
   /* Don't create a new job if we can merge the current subpass into
    * the current job.
    */
   if (cmd_buffer->state.pass &&
       subpass_idx != -1 &&
       cmd_buffer_can_merge_subpass(cmd_buffer, subpass_idx)) {
      cmd_buffer->state.job->is_subpass_finish = false;
      return cmd_buffer->state.job;
   }

   /* Ensure we are not starting a new job without finishing a previous one */
   if (cmd_buffer->state.job != NULL)
      v3dv_cmd_buffer_finish_job(cmd_buffer);

   assert(cmd_buffer->state.job == NULL);
   struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
                                    sizeof(struct v3dv_job), 8,
                                    VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!job) {
      fprintf(stderr, "Error: failed to allocate CPU memory for job\n");
      v3dv_flag_oom(cmd_buffer, NULL);
      return NULL;
   }

   v3dv_job_init(job, type, cmd_buffer->device, cmd_buffer, subpass_idx);
   cmd_buffer->state.job = job;

   return job;
}

static void
cmd_buffer_reset(struct vk_command_buffer *vk_cmd_buffer,
                 VkCommandBufferResetFlags flags)
{
   struct v3dv_cmd_buffer *cmd_buffer =
      container_of(vk_cmd_buffer, struct v3dv_cmd_buffer, vk);

   vk_command_buffer_reset(&cmd_buffer->vk);
   if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_INITIALIZED) {
      struct v3dv_device *device = cmd_buffer->device;

      /* FIXME: For now we always free all resources as if
       * VK_COMMAND_BUFFER_RESET_RELEASE_RESOURCES_BIT was set.
       */
      if (cmd_buffer->status != V3DV_CMD_BUFFER_STATUS_NEW)
         cmd_buffer_free_resources(cmd_buffer);

      cmd_buffer_init(cmd_buffer, device);
   }

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);
}


static void
cmd_buffer_emit_resolve(struct v3dv_cmd_buffer *cmd_buffer,
                        uint32_t dst_attachment_idx,
                        uint32_t src_attachment_idx,
                        VkImageAspectFlagBits aspect)
{
   struct v3dv_image_view *src_iview =
      cmd_buffer->state.attachments[src_attachment_idx].image_view;
   struct v3dv_image_view *dst_iview =
      cmd_buffer->state.attachments[dst_attachment_idx].image_view;

   const VkRect2D *ra = &cmd_buffer->state.render_area;

   VkImageResolve2 region = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_RESOLVE_2,
      .srcSubresource = {
         aspect,
         src_iview->vk.base_mip_level,
         src_iview->vk.base_array_layer,
         src_iview->vk.layer_count,
      },
      .srcOffset = { ra->offset.x, ra->offset.y, 0 },
      .dstSubresource = {
         aspect,
         dst_iview->vk.base_mip_level,
         dst_iview->vk.base_array_layer,
         dst_iview->vk.layer_count,
      },
      .dstOffset = { ra->offset.x, ra->offset.y, 0 },
      .extent = { ra->extent.width, ra->extent.height, 1 },
   };

   struct v3dv_image *src_image = (struct v3dv_image *) src_iview->vk.image;
   struct v3dv_image *dst_image = (struct v3dv_image *) dst_iview->vk.image;
   VkResolveImageInfo2 resolve_info = {
      .sType = VK_STRUCTURE_TYPE_RESOLVE_IMAGE_INFO_2,
      .srcImage = v3dv_image_to_handle(src_image),
      .srcImageLayout = VK_IMAGE_LAYOUT_GENERAL,
      .dstImage = v3dv_image_to_handle(dst_image),
      .dstImageLayout = VK_IMAGE_LAYOUT_GENERAL,
      .regionCount = 1,
      .pRegions = &region,
   };

   VkCommandBuffer cmd_buffer_handle = v3dv_cmd_buffer_to_handle(cmd_buffer);
   v3dv_CmdResolveImage2(cmd_buffer_handle, &resolve_info);
}

static void
cmd_buffer_subpass_handle_pending_resolves(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
   const struct v3dv_render_pass *pass = cmd_buffer->state.pass;
   const struct v3dv_subpass *subpass =
      &pass->subpasses[cmd_buffer->state.subpass_idx];

   if (!subpass->resolve_attachments)
      return;

   /* At this point we have already ended the current subpass and now we are
    * about to emit vkCmdResolveImage calls to get the resolves we can't
    * handle in the subpass RCL.
    *
    * vkCmdResolveImage is not supposed to be called inside a render pass so
    * before we call that we need to make sure our command buffer state reflects
    * that we are no longer in a subpass by finishing the current job and
    * resetting the framebuffer and render pass state temporarily and then
    * restoring it after we are done with the resolves.
    */
   if (cmd_buffer->state.job)
      v3dv_cmd_buffer_finish_job(cmd_buffer);
   struct v3dv_framebuffer *restore_fb = cmd_buffer->state.framebuffer;
   struct v3dv_render_pass *restore_pass = cmd_buffer->state.pass;
   uint32_t restore_subpass_idx = cmd_buffer->state.subpass_idx;
   cmd_buffer->state.framebuffer = NULL;
   cmd_buffer->state.pass = NULL;
   cmd_buffer->state.subpass_idx = -1;

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      const uint32_t src_attachment_idx =
         subpass->color_attachments[i].attachment;
      if (src_attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      /* Skip if this attachment doesn't have a resolve or if it was already
       * implemented as a TLB resolve.
       */
      if (!cmd_buffer->state.attachments[src_attachment_idx].has_resolve ||
          cmd_buffer->state.attachments[src_attachment_idx].use_tlb_resolve) {
         continue;
      }

      const uint32_t dst_attachment_idx =
         subpass->resolve_attachments[i].attachment;
      assert(dst_attachment_idx != VK_ATTACHMENT_UNUSED);

      cmd_buffer_emit_resolve(cmd_buffer, dst_attachment_idx, src_attachment_idx,
                              VK_IMAGE_ASPECT_COLOR_BIT);
   }

   const uint32_t ds_src_attachment_idx =
      subpass->ds_attachment.attachment;
   if (ds_src_attachment_idx != VK_ATTACHMENT_UNUSED &&
       cmd_buffer->state.attachments[ds_src_attachment_idx].has_resolve &&
       !cmd_buffer->state.attachments[ds_src_attachment_idx].use_tlb_resolve) {
      assert(subpass->resolve_depth || subpass->resolve_stencil);
      const VkImageAspectFlags ds_aspects =
         (subpass->resolve_depth ? VK_IMAGE_ASPECT_DEPTH_BIT : 0) |
         (subpass->resolve_stencil ? VK_IMAGE_ASPECT_STENCIL_BIT : 0);
      const uint32_t ds_dst_attachment_idx =
         subpass->ds_resolve_attachment.attachment;
      assert(ds_dst_attachment_idx != VK_ATTACHMENT_UNUSED);
      cmd_buffer_emit_resolve(cmd_buffer, ds_dst_attachment_idx,
                              ds_src_attachment_idx, ds_aspects);
   }

   cmd_buffer->state.framebuffer = restore_fb;
   cmd_buffer->state.pass = restore_pass;
   cmd_buffer->state.subpass_idx = restore_subpass_idx;
}

static VkResult
cmd_buffer_begin_render_pass_secondary(
   struct v3dv_cmd_buffer *cmd_buffer,
   const VkCommandBufferInheritanceInfo *inheritance_info)
{
   assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
   assert(cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
   assert(inheritance_info);

   const VkCommandBufferInheritanceRenderingInfo *rendering_info = NULL;
   if (inheritance_info->renderPass == VK_NULL_HANDLE) {
      rendering_info = vk_find_struct_const(inheritance_info,
                                            COMMAND_BUFFER_INHERITANCE_RENDERING_INFO);
      assert(rendering_info);
      v3dv_setup_dynamic_render_pass_inheritance(cmd_buffer, rendering_info);
      cmd_buffer->state.pass = &cmd_buffer->state.dynamic_pass;
      cmd_buffer->state.subpass_idx = 0;
      cmd_buffer->state.framebuffer = NULL;
   } else {
      cmd_buffer->state.pass =
         v3dv_render_pass_from_handle(inheritance_info->renderPass);

      assert(inheritance_info->subpass < cmd_buffer->state.pass->subpass_count);
      cmd_buffer->state.subpass_idx = inheritance_info->subpass;

      cmd_buffer->state.framebuffer =
         v3dv_framebuffer_from_handle(inheritance_info->framebuffer);
   }
   assert(cmd_buffer->state.pass);

   cmd_buffer->state.inheritance.occlusion_query_enable =
      inheritance_info->occlusionQueryEnable;

   /* Secondaries that execute inside a render pass won't start subpasses
    * so we want to create a job for them here.
    */
   struct v3dv_job *job =
      v3dv_cmd_buffer_start_job(cmd_buffer, cmd_buffer->state.subpass_idx,
                                V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
   if (!job) {
      v3dv_flag_oom(cmd_buffer, NULL);
      return VK_ERROR_OUT_OF_HOST_MEMORY;
   }

   /* Secondary command buffers don't know about the render area, but our
    * scissor setup accounts for it, so let's make sure we make it large
    * enough that it doesn't actually constrain any rendering. This should
    * be fine, since the Vulkan spec states:
    *
    *    "The application must ensure (using scissor if necessary) that all
    *     rendering is contained within the render area."
    */
   const struct v3dv_framebuffer *framebuffer = cmd_buffer->state.framebuffer;
   cmd_buffer->state.render_area.offset.x = 0;
   cmd_buffer->state.render_area.offset.y = 0;
   cmd_buffer->state.render_area.extent.width =
      framebuffer ? framebuffer->width : V3D_MAX_IMAGE_DIMENSION;
   cmd_buffer->state.render_area.extent.height =
      framebuffer ? framebuffer->height : V3D_MAX_IMAGE_DIMENSION;

   /* We only really execute double-buffer mode in primary jobs, so allow this
    * mode in render pass secondaries to keep track of the double-buffer mode
    * score in them and update the primaries accordingly when the secondaries
    * are executed into them.
    */
   job->can_use_double_buffer = true;

   return VK_SUCCESS;
}

const struct vk_command_buffer_ops v3dv_cmd_buffer_ops = {
   .create = cmd_buffer_create,
   .reset = cmd_buffer_reset,
   .destroy = cmd_buffer_destroy,
};

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_BeginCommandBuffer(VkCommandBuffer commandBuffer,
                        const VkCommandBufferBeginInfo *pBeginInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   /* If this is the first vkBeginCommandBuffer, we must initialize the
    * command buffer's state. Otherwise, we must reset its state. In both
    * cases we reset it.
    */
   cmd_buffer_reset(&cmd_buffer->vk, 0);

   assert(cmd_buffer->status == V3DV_CMD_BUFFER_STATUS_INITIALIZED);

   cmd_buffer->usage_flags = pBeginInfo->flags;

   if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY) {
      if (pBeginInfo->flags & VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT) {
         VkResult result =
            cmd_buffer_begin_render_pass_secondary(cmd_buffer,
                                                   pBeginInfo->pInheritanceInfo);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_RECORDING;

   return VK_SUCCESS;
}

static void
cmd_buffer_update_tile_alignment(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* Render areas and scissor/viewport are only relevant inside render passes;
    * otherwise we are dealing with transfer operations where these elements
    * don't apply.
    */
   assert(cmd_buffer->state.pass);
   const VkRect2D *rect = &cmd_buffer->state.render_area;

   /* We should only call this at the beginning of a subpass so we should
    * always have framebuffer information available.
    */
   assert(cmd_buffer->state.framebuffer);
   cmd_buffer->state.tile_aligned_render_area =
      v3dv_subpass_area_is_tile_aligned(cmd_buffer->device, rect,
                                        cmd_buffer->state.framebuffer,
                                        cmd_buffer->state.pass,
                                        cmd_buffer->state.subpass_idx);

   if (!cmd_buffer->state.tile_aligned_render_area) {
      perf_debug("Render area for subpass %d of render pass %p doesn't "
                 "match render pass granularity.\n",
                 cmd_buffer->state.subpass_idx, cmd_buffer->state.pass);
   }
}

static void
cmd_buffer_update_attachment_resolve_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   /* NOTE: This should be called after cmd_buffer_update_tile_alignment()
    * since it relies on up-to-date information about subpass tile alignment.
    */
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];

   for (uint32_t i = 0; i < subpass->color_count; i++) {
      const uint32_t attachment_idx = subpass->color_attachments[i].attachment;
      if (attachment_idx == VK_ATTACHMENT_UNUSED)
         continue;

      state->attachments[attachment_idx].has_resolve =
         subpass->resolve_attachments &&
         subpass->resolve_attachments[i].attachment != VK_ATTACHMENT_UNUSED;

      state->attachments[attachment_idx].use_tlb_resolve =
         state->attachments[attachment_idx].has_resolve &&
         state->tile_aligned_render_area &&
         pass->attachments[attachment_idx].try_tlb_resolve;
   }

   uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
   if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
      uint32_t ds_resolve_attachment_idx =
         subpass->ds_resolve_attachment.attachment;
      state->attachments[ds_attachment_idx].has_resolve =
         ds_resolve_attachment_idx != VK_ATTACHMENT_UNUSED;

      assert(!state->attachments[ds_attachment_idx].has_resolve ||
             (subpass->resolve_depth || subpass->resolve_stencil));

      state->attachments[ds_attachment_idx].use_tlb_resolve =
         state->attachments[ds_attachment_idx].has_resolve &&
         state->tile_aligned_render_area &&
         pass->attachments[ds_attachment_idx].try_tlb_resolve;
   }
}

static void
cmd_buffer_state_set_attachment_clear_color(struct v3dv_cmd_buffer *cmd_buffer,
                                            uint32_t attachment_idx,
                                            const VkClearColorValue *color)
{
   assert(attachment_idx < cmd_buffer->state.pass->attachment_count);
   const struct v3dv_render_pass_attachment *attachment =
      &cmd_buffer->state.pass->attachments[attachment_idx];

   uint32_t internal_type, internal_bpp;
   const struct v3dv_format *format =
      v3dv_X(cmd_buffer->device, get_format)(attachment->desc.format);
   /* We don't allow multi-planar formats for render pass attachments */
   assert(format->plane_count == 1);

   v3dv_X(cmd_buffer->device, get_internal_type_bpp_for_output_format)
      (format->planes[0].rt_type, &internal_type, &internal_bpp);

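   /* internal_bpp is an enum rather than a byte count (0 corresponds to
    * 32bpp), so the per-pixel size in bytes is 4 << internal_bpp.
    */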
   uint32_t internal_size = 4 << internal_bpp;

   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   v3dv_X(cmd_buffer->device, get_hw_clear_color)
      (color, internal_type, internal_size, &attachment_state->clear_value.color[0]);

   attachment_state->vk_clear_value.color = *color;
}

static void
cmd_buffer_state_set_attachment_clear_depth_stencil(
   struct v3dv_cmd_buffer *cmd_buffer,
   uint32_t attachment_idx,
   bool clear_depth, bool clear_stencil,
   const VkClearDepthStencilValue *ds)
{
   struct v3dv_cmd_buffer_attachment_state *attachment_state =
      &cmd_buffer->state.attachments[attachment_idx];

   if (clear_depth)
      attachment_state->clear_value.z = ds->depth;

   if (clear_stencil)
      attachment_state->clear_value.s = ds->stencil;

   attachment_state->vk_clear_value.depthStencil = *ds;
}

static void
cmd_buffer_state_set_clear_values(struct v3dv_cmd_buffer *cmd_buffer,
                                  uint32_t count, const VkClearValue *values)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;

   /* There could be fewer clear values than attachments in the render pass, in
    * which case we only want to process as many as we have, or there could be
    * more, in which case we want to ignore those for which we don't have a
    * corresponding attachment.
    */
   count = MIN2(count, pass->attachment_count);
   for (uint32_t i = 0; i < count; i++) {
      const struct v3dv_render_pass_attachment *attachment =
         &pass->attachments[i];

      if (attachment->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
         continue;

      VkImageAspectFlags aspects = vk_format_aspects(attachment->desc.format);
      if (aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
         cmd_buffer_state_set_attachment_clear_color(cmd_buffer, i,
                                                     &values[i].color);
      } else if (aspects & (VK_IMAGE_ASPECT_DEPTH_BIT |
                            VK_IMAGE_ASPECT_STENCIL_BIT)) {
         cmd_buffer_state_set_attachment_clear_depth_stencil(
            cmd_buffer, i,
            aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
            aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
            &values[i].depthStencil);
      }
   }
}

static void
cmd_buffer_state_set_attachments(struct v3dv_cmd_buffer *cmd_buffer,
                                 const VkRenderPassBeginInfo *pRenderPassBegin)
{
   V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
   V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);

   const VkRenderPassAttachmentBeginInfo *attach_begin =
      vk_find_struct_const(pRenderPassBegin, RENDER_PASS_ATTACHMENT_BEGIN_INFO);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;

   for (uint32_t i = 0; i < pass->attachment_count; i++) {
      if (attach_begin && attach_begin->attachmentCount != 0) {
         state->attachments[i].image_view =
            v3dv_image_view_from_handle(attach_begin->pAttachments[i]);
      } else if (framebuffer) {
         state->attachments[i].image_view = framebuffer->attachments[i];
      } else {
         assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
         state->attachments[i].image_view = NULL;
      }
   }
}

static void
cmd_buffer_init_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer,
                                             const VkRenderPassBeginInfo *pRenderPassBegin)
{
   cmd_buffer_state_set_clear_values(cmd_buffer,
                                     pRenderPassBegin->clearValueCount,
                                     pRenderPassBegin->pClearValues);

   cmd_buffer_state_set_attachments(cmd_buffer, pRenderPassBegin);
}

static void
cmd_buffer_ensure_render_pass_attachment_state(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;

   if (state->attachment_alloc_count < pass->attachment_count) {
      if (state->attachments) {
         assert(state->attachment_alloc_count > 0);
         vk_free(&cmd_buffer->device->vk.alloc, state->attachments);
      }

      uint32_t size = sizeof(struct v3dv_cmd_buffer_attachment_state) *
                      pass->attachment_count;
      state->attachments = vk_zalloc(&cmd_buffer->device->vk.alloc, size, 8,
                                     VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
      if (!state->attachments) {
         v3dv_flag_oom(cmd_buffer, NULL);
         return;
      }
      state->attachment_alloc_count = pass->attachment_count;
   }

   assert(state->attachment_alloc_count >= pass->attachment_count);
}

/* If our render area is smaller than the current clip window we will have
 * to emit a new clip window to constrain it to the render area.
 */
static void
constraint_clip_window_to_render_area(struct v3dv_cmd_buffer *cmd_buffer)
{
   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;

   uint32_t min_render_x = state->render_area.offset.x;
   uint32_t min_render_y = state->render_area.offset.y;
   uint32_t max_render_x = min_render_x + state->render_area.extent.width - 1;
   uint32_t max_render_y = min_render_y + state->render_area.extent.height - 1;
   uint32_t min_clip_x = state->clip_window.offset.x;
   uint32_t min_clip_y = state->clip_window.offset.y;
   uint32_t max_clip_x = min_clip_x + state->clip_window.extent.width - 1;
   uint32_t max_clip_y = min_clip_y + state->clip_window.extent.height - 1;
   if (min_render_x > min_clip_x || min_render_y > min_clip_y ||
       max_render_x < max_clip_x || max_render_y < max_clip_y) {
      BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS);
   }
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdBeginRenderPass2(VkCommandBuffer commandBuffer,
                         const VkRenderPassBeginInfo *pRenderPassBegin,
                         const VkSubpassBeginInfo *pSubpassBeginInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
   V3DV_FROM_HANDLE(v3dv_render_pass, pass, pRenderPassBegin->renderPass);
   V3DV_FROM_HANDLE(v3dv_framebuffer, framebuffer, pRenderPassBegin->framebuffer);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   state->pass = pass;
   state->framebuffer = framebuffer;

   cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
   v3dv_return_if_oom(cmd_buffer, NULL);

   cmd_buffer_init_render_pass_attachment_state(cmd_buffer, pRenderPassBegin);

   state->render_area = pRenderPassBegin->renderArea;
   constraint_clip_window_to_render_area(cmd_buffer);

   /* Setup for first subpass */
   v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
}

VKAPI_ATTR void VKAPI_CALL
v3dv_CmdNextSubpass2(VkCommandBuffer commandBuffer,
                     const VkSubpassBeginInfo *pSubpassBeginInfo,
                     const VkSubpassEndInfo *pSubpassEndInfo)
{
   V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);

   struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   assert(state->subpass_idx < state->pass->subpass_count - 1);

   /* Finish the previous subpass */
   v3dv_cmd_buffer_subpass_finish(cmd_buffer);
   cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);

   /* Start the next subpass */
   v3dv_cmd_buffer_subpass_start(cmd_buffer, state->subpass_idx + 1);
}

static void
cmd_buffer_emit_subpass_clears(struct v3dv_cmd_buffer *cmd_buffer)
{
   assert(cmd_buffer->state.pass);
   assert(cmd_buffer->state.subpass_idx < cmd_buffer->state.pass->subpass_count);
   assert(!cmd_buffer->state.resuming);
   const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
   const struct v3dv_render_pass *pass = state->pass;
   const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];

   /* We only need to emit subpass clears as draw calls when the render
    * area is not aligned to tile boundaries or for GFXH-1461.
    */
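   /* GFXH-1461 refers to a hardware issue; judging by the perf_debug message
    * below, it presumably covers the case where one of depth/stencil is
    * cleared while the other is loaded, which can't be done with a TLB clear.
    */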
1440 if (cmd_buffer->state.tile_aligned_render_area &&
1441 !subpass->do_depth_clear_with_draw &&
1442 !subpass->do_stencil_clear_with_draw) {
1443 return;
1444 }
1445
1446 uint32_t att_count = 0;
1447 VkClearAttachment atts[V3D_MAX_DRAW_BUFFERS + 1]; /* +1 for D/S */
1448
1449 /* We only need to emit subpass clears as draw calls for color attachments
1450 * if the render area is not aligned to tile boundaries.
1451 */
1452 if (!cmd_buffer->state.tile_aligned_render_area) {
1453 for (uint32_t i = 0; i < subpass->color_count; i++) {
1454 const uint32_t att_idx = subpass->color_attachments[i].attachment;
1455 if (att_idx == VK_ATTACHMENT_UNUSED)
1456 continue;
1457
1458 struct v3dv_render_pass_attachment *att = &pass->attachments[att_idx];
1459 if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR)
1460 continue;
1461
1462 if (state->subpass_idx != att->first_subpass)
1463 continue;
1464
1465 atts[att_count].aspectMask = VK_IMAGE_ASPECT_COLOR_BIT;
1466 atts[att_count].colorAttachment = i;
1467 atts[att_count].clearValue = state->attachments[att_idx].vk_clear_value;
1468 att_count++;
1469 }
1470 }
1471
1472 /* For D/S we may also need to emit a subpass clear for GFXH-1461 */
1473 const uint32_t ds_att_idx = subpass->ds_attachment.attachment;
1474 if (ds_att_idx != VK_ATTACHMENT_UNUSED) {
1475 struct v3dv_render_pass_attachment *att = &pass->attachments[ds_att_idx];
1476 if (state->subpass_idx == att->first_subpass) {
1477 VkImageAspectFlags aspects = vk_format_aspects(att->desc.format);
1478 if (att->desc.loadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
1479 (cmd_buffer->state.tile_aligned_render_area &&
1480 !subpass->do_depth_clear_with_draw)) {
1481 aspects &= ~VK_IMAGE_ASPECT_DEPTH_BIT;
1482 }
1483 if (att->desc.stencilLoadOp != VK_ATTACHMENT_LOAD_OP_CLEAR ||
1484 (cmd_buffer->state.tile_aligned_render_area &&
1485 !subpass->do_stencil_clear_with_draw)) {
1486 aspects &= ~VK_IMAGE_ASPECT_STENCIL_BIT;
1487 }
1488 if (aspects) {
1489 atts[att_count].aspectMask = aspects;
1490 atts[att_count].colorAttachment = 0; /* Ignored */
1491 atts[att_count].clearValue =
1492 state->attachments[ds_att_idx].vk_clear_value;
1493 att_count++;
1494 }
1495 }
1496 }
1497
1498 if (att_count == 0)
1499 return;
1500
1501 if (!cmd_buffer->state.tile_aligned_render_area) {
1502 perf_debug("Render area doesn't match render pass granularity, falling "
1503 "back to vkCmdClearAttachments for "
1504 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
1505 } else if (subpass->do_depth_clear_with_draw ||
1506 subpass->do_stencil_clear_with_draw) {
1507 perf_debug("Subpass clears DEPTH but loads STENCIL (or vice versa), "
1508 "falling back to vkCmdClearAttachments for "
1509 "VK_ATTACHMENT_LOAD_OP_CLEAR.\n");
1510 }
1511
1512 /* From the Vulkan 1.0 spec:
1513 *
1514 * "VK_ATTACHMENT_LOAD_OP_CLEAR specifies that the contents within the
1515 * render area will be cleared to a uniform value, which is specified
1516 * when a render pass instance is begun."
1517 *
1518 * So the clear is only constrained by the render area and not by pipeline
1519 * state such as scissor or viewport; these are the semantics of
1520 * vkCmdClearAttachments as well.
1521 *
1522 * Also:
1523 *
1524 * "If the render pass instance this is recorded in uses multiview, then
1525 * baseArrayLayer must be zero and layerCount must be one."
1526 */
1527 assert(state->framebuffer);
1528 uint32_t layer_count = cmd_buffer->state.pass->multiview_enabled ?
1529 1 : state->framebuffer->layers;
1530 VkCommandBuffer _cmd_buffer = v3dv_cmd_buffer_to_handle(cmd_buffer);
1531 VkClearRect rect = {
1532 .rect = state->render_area,
1533 .baseArrayLayer = 0,
1534 .layerCount = layer_count,
1535 };
1536 v3dv_CmdClearAttachments(_cmd_buffer, att_count, atts, 1, &rect);
1537 }
1538
1539 bool
1540 v3dv_cmd_buffer_check_needs_load(const struct v3dv_cmd_buffer_state *state,
1541 VkImageAspectFlags aspect,
1542 uint32_t first_subpass_idx,
1543 VkAttachmentLoadOp load_op,
1544 uint32_t last_subpass_idx,
1545 VkAttachmentStoreOp store_op)
1546 {
1547 /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
1548 * testing does not exist in the image.
1549 */
1550 if (!aspect)
1551 return false;
1552
1553 /* Attachment (or view) load operations only apply on the first subpass that
1554 * uses the attachment (or view), otherwise we always need to load.
1555 */
1556 if (state->job->first_subpass > first_subpass_idx)
1557 return true;
1558
1559 /* If the job is continuing a subpass started in another job, we always
1560 * need to load.
1561 */
1562 if (state->job->is_subpass_continue)
1563 return true;
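   /* Illustrative example: an attachment first used in subpass 0 with
    * loadOp = VK_ATTACHMENT_LOAD_OP_DONT_CARE does not need a load in the
    * first job recorded for that subpass, but a follow-up job continuing the
    * same subpass (is_subpass_continue) must load it to preserve the results
    * of the earlier job.
    */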
1564
1565 /* If the area is not aligned to tile boundaries and we are going to store,
1566 * then we need to load to preserve contents outside the render area.
1567 */
1568 if (!state->tile_aligned_render_area &&
1569 v3dv_cmd_buffer_check_needs_store(state, aspect, last_subpass_idx,
1570 store_op)) {
1571 return true;
1572 }
1573
1574 /* The attachment load operations must be LOAD */
1575 return load_op == VK_ATTACHMENT_LOAD_OP_LOAD;
1576 }
1577
1578 bool
1579 v3dv_cmd_buffer_check_needs_store(const struct v3dv_cmd_buffer_state *state,
1580 VkImageAspectFlags aspect,
1581 uint32_t last_subpass_idx,
1582 VkAttachmentStoreOp store_op)
1583 {
1584 /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
1585 * testing does not exist in the image.
1586 */
1587 if (!aspect)
1588 return false;
1589
1590 /* Attachment (or view) store operations only apply on the last subpass
1591 * where the attachment (or view) is used, in other subpasses we always
1592 * need to store.
1593 */
1594 if (state->subpass_idx < last_subpass_idx)
1595 return true;
1596
1597 * Attachment store operations only apply on the last job we emit on the
1598 * last subpass where the attachment is used, otherwise we always need to
1599 * store.
1600 */
1601 if (!state->job->is_subpass_finish)
1602 return true;
1603
1604 /* The attachment store operation must be STORE */
1605 return store_op == VK_ATTACHMENT_STORE_OP_STORE;
1606 }
1607
1608 static void
1609 cmd_buffer_subpass_check_double_buffer_mode(struct v3dv_cmd_buffer *cmd_buffer,
1610 bool msaa)
1611 {
1612 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1613 struct v3dv_job *job = cmd_buffer->state.job;
1614 assert(job);
1615
1616 job->can_use_double_buffer = false;
1617
1618 /* Double-buffer can only be used if requested via V3D_DEBUG */
1619 if (!V3D_DBG(DOUBLE_BUFFER))
1620 return;
1621
1622 /* Double-buffer cannot be enabled for MSAA jobs */
1623 if (msaa)
1624 return;
1625
1626 const struct v3dv_render_pass *pass = state->pass;
1627 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
1628
1629 /* FIXME: For now we discard multiview jobs (which have an implicit geometry
1630 * shader) for this optimization. If we want to enable this with multiview
1631 * we would need to check if any view (layer) in any attachment used by the
1632 * job has loads and/or stores as we do below for regular attachments. Also,
1633 * we would want to have a heuristic that doesn't automatically disable
1634 * double-buffer in the presence of geometry shaders.
1635 */
1636 if (state->pass->multiview_enabled)
1637 return;
1638
1639 /* Tile loads are serialized against stores, in which case we don't get
1640 * any benefits from enabling double-buffer and would just pay the price
1641 * of a smaller tile size instead. Similarly, we only benefit from
1642 * double-buffer if we have tile stores, as the point of this mode is
1643 * to execute rendering of a new tile while we store the previous one to
1644 * hide latency on the tile store operation.
1645 */
1646 bool has_stores = false;
1647 for (uint32_t i = 0; i < subpass->color_count; i++) {
1648 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1649 if (attachment_idx == VK_ATTACHMENT_UNUSED)
1650 continue;
1651
1652 const struct v3dv_render_pass_attachment *attachment =
1653 &state->pass->attachments[attachment_idx];
1654
1655 /* FIXME: This will check 'tile_aligned_render_area' but that was
1656 * computed with a tile size without double-buffer. That is okay
1657 * because if the larger tile size is aligned then we know the smaller
1658 * tile size for double-buffer will be as well. However, we might
1659 * still benefit from doing this check with the smaller tile size
1660 * because it can happen that the smaller size is aligned and the
1661 * larger size is not.
1662 */
1663 if (v3dv_cmd_buffer_check_needs_load(state,
1664 VK_IMAGE_ASPECT_COLOR_BIT,
1665 attachment->first_subpass,
1666 attachment->desc.loadOp,
1667 attachment->last_subpass,
1668 attachment->desc.storeOp)) {
1669 return;
1670 }
1671
1672 if (v3dv_cmd_buffer_check_needs_store(state,
1673 VK_IMAGE_ASPECT_COLOR_BIT,
1674 attachment->last_subpass,
1675 attachment->desc.storeOp)) {
1676 has_stores = true;
1677 }
1678 }
1679
1680 if (subpass->ds_attachment.attachment != VK_ATTACHMENT_UNUSED) {
1681 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
1682 const struct v3dv_render_pass_attachment *ds_attachment =
1683 &state->pass->attachments[ds_attachment_idx];
1684
1685 const VkImageAspectFlags ds_aspects =
1686 vk_format_aspects(ds_attachment->desc.format);
1687
1688 if (v3dv_cmd_buffer_check_needs_load(state,
1689 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1690 ds_attachment->first_subpass,
1691 ds_attachment->desc.loadOp,
1692 ds_attachment->last_subpass,
1693 ds_attachment->desc.storeOp)) {
1694 return;
1695 }
1696
1697 if (v3dv_cmd_buffer_check_needs_load(state,
1698 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1699 ds_attachment->first_subpass,
1700 ds_attachment->desc.stencilLoadOp,
1701 ds_attachment->last_subpass,
1702 ds_attachment->desc.stencilStoreOp)) {
1703 return;
1704 }
1705
1706 has_stores |= v3dv_cmd_buffer_check_needs_store(state,
1707 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1708 ds_attachment->last_subpass,
1709 ds_attachment->desc.storeOp);
1710 has_stores |= v3dv_cmd_buffer_check_needs_store(state,
1711 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1712 ds_attachment->last_subpass,
1713 ds_attachment->desc.stencilStoreOp);
1714 }
1715
1716 job->can_use_double_buffer = has_stores;
1717 }
1718
1719 static struct v3dv_job *
1720 cmd_buffer_subpass_create_job(struct v3dv_cmd_buffer *cmd_buffer,
1721 uint32_t subpass_idx,
1722 enum v3dv_job_type type,
1723 bool is_subpass_start)
1724 {
1725 assert(type == V3DV_JOB_TYPE_GPU_CL ||
1726 type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
1727
1728 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1729 assert(subpass_idx < state->pass->subpass_count);
1730
1731 /* Starting a new job can trigger a finish of the current one, so don't
1732 * change the command buffer state for the new job until we are done creating
1733 * the new job.
1734 */
1735 struct v3dv_job *job =
1736 v3dv_cmd_buffer_start_job(cmd_buffer, subpass_idx, type);
1737 if (!job)
1738 return NULL;
1739
1740 if (is_subpass_start && cmd_buffer->state.resuming) {
1741 assert(subpass_idx == 0);
1742 job->resuming = true;
1743 }
1744
1745 state->subpass_idx = subpass_idx;
1746
1747 /* If we are starting a new job we need to set up binning. We only do this
1748 * for V3DV_JOB_TYPE_GPU_CL jobs because V3DV_JOB_TYPE_GPU_CL_INCOMPLETE
1749 * jobs are not submitted to the GPU directly, and are instead meant to be
1750 * branched to from other V3DV_JOB_TYPE_GPU_CL jobs. With dynamic rendering,
1751 * all resuming jobs work similarly to secondary command buffers, so we
1752 * apply the same treatment.
1753 */
1754 if (type == V3DV_JOB_TYPE_GPU_CL &&
1755 job->first_subpass == state->subpass_idx &&
1756 !job->resuming) {
1757 const struct v3dv_subpass *subpass =
1758 &state->pass->subpasses[state->subpass_idx];
1759
1760 const struct v3dv_framebuffer *framebuffer = state->framebuffer;
1761
1762 uint8_t max_internal_bpp, total_color_bpp;
1763 bool msaa;
1764 v3dv_X(job->device, framebuffer_compute_internal_bpp_msaa)
1765 (framebuffer, state->attachments, subpass,
1766 &max_internal_bpp, &total_color_bpp, &msaa);
1767
1768 /* From the Vulkan spec:
1769 *
1770 * "If the render pass uses multiview, then layers must be one and
1771 * each attachment requires a number of layers that is greater than
1772 * the maximum bit index set in the view mask in the subpasses in
1773 * which it is used."
1774 *
1775 * So when multiview is enabled, we take the number of layers from the
1776 * last bit set in the view mask.
1777 */
1778 uint32_t layers = framebuffer->layers;
1779 if (subpass->view_mask != 0) {
1780 assert(framebuffer->layers == 1);
1781 layers = util_last_bit(subpass->view_mask);
1782 }
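      /* Illustrative example: a subpass with view_mask = 0x6 (views 1 and 2)
       * yields util_last_bit(0x6) = 3, so the frame is set up with 3 layers.
       */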
1783
1784 v3dv_job_start_frame(job,
1785 framebuffer->width,
1786 framebuffer->height,
1787 layers,
1788 true, false,
1789 subpass->color_count,
1790 max_internal_bpp,
1791 total_color_bpp,
1792 msaa);
1793 }
1794
1795 return job;
1796 }
1797
1798 struct v3dv_job *
1799 v3dv_cmd_buffer_subpass_start(struct v3dv_cmd_buffer *cmd_buffer,
1800 uint32_t subpass_idx)
1801 {
1802 assert(cmd_buffer->state.pass);
1803 assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
1804
1805 struct v3dv_job *job =
1806 cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1807 V3DV_JOB_TYPE_GPU_CL, true);
1808 if (!job)
1809 return NULL;
1810
1811 /* FIXME: do we need all this below for resuming jobs? */
1812
1813 /* Check if our render area is aligned to tile boundaries. We have to do
1814 * this in each subpass because the subset of attachments used can change
1815 * and with that the tile size selected by the hardware can change too.
1816 */
1817 cmd_buffer_update_tile_alignment(cmd_buffer);
1818
1819 /* Decide if we can use double-buffer for this subpass job */
1820 cmd_buffer_subpass_check_double_buffer_mode(cmd_buffer, job->frame_tiling.msaa);
1821
1822 cmd_buffer_update_attachment_resolve_state(cmd_buffer);
1823
1824 /* If we can't use TLB clears then we need to emit draw clears for any
1825 * LOAD_OP_CLEAR attachments in this subpass now. We might also need to emit
1826 * Depth/Stencil clears if we hit GFXH-1461. With dynamic render passes this
1827 * should only be called when starting the render pass, not when resuming.
1828 */
1829 if (!cmd_buffer->state.resuming)
1830 cmd_buffer_emit_subpass_clears(cmd_buffer);
1831
1832 return job;
1833 }
1834
1835 struct v3dv_job *
1836 v3dv_cmd_buffer_subpass_resume(struct v3dv_cmd_buffer *cmd_buffer,
1837 uint32_t subpass_idx)
1838 {
1839 assert(cmd_buffer->state.pass);
1840 assert(subpass_idx < cmd_buffer->state.pass->subpass_count);
1841
1842 struct v3dv_job *job;
1843 if (cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_PRIMARY) {
1844 job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1845 V3DV_JOB_TYPE_GPU_CL, false);
1846 } else {
1847 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1848 job = cmd_buffer_subpass_create_job(cmd_buffer, subpass_idx,
1849 V3DV_JOB_TYPE_GPU_CL_INCOMPLETE, false);
1850 }
1851
1852 if (!job)
1853 return NULL;
1854
1855 job->is_subpass_continue = true;
1856
1857 return job;
1858 }
1859
1860 void
1861 v3dv_cmd_buffer_subpass_finish(struct v3dv_cmd_buffer *cmd_buffer)
1862 {
1863 /* We can end up here without a job if the last command recorded into the
1864 * subpass already finished the job (for example a pipeline barrier). In
1865 * that case we don't set the is_subpass_finish flag, but that is not
1866 * required for proper behavior.
1867 */
1868 struct v3dv_job *job = cmd_buffer->state.job;
1869 if (job)
1870 job->is_subpass_finish = true;
1871 }
1872
1873 VKAPI_ATTR void VKAPI_CALL
1874 v3dv_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
1875 const VkSubpassEndInfo *pSubpassEndInfo)
1876 {
1877 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1878
1879 /* Finalize last subpass */
1880 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1881 assert(state->subpass_idx == state->pass->subpass_count - 1);
1882 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
1883 v3dv_cmd_buffer_finish_job(cmd_buffer);
1884
1885 cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
1886
1887 /* We are no longer inside a render pass */
1888 state->framebuffer = NULL;
1889 state->pass = NULL;
1890 state->subpass_idx = -1;
1891 }
1892
1893 VKAPI_ATTR VkResult VKAPI_CALL
1894 v3dv_EndCommandBuffer(VkCommandBuffer commandBuffer)
1895 {
1896 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
1897
1898 if (cmd_buffer->state.oom)
1899 return VK_ERROR_OUT_OF_HOST_MEMORY;
1900
1901 /* Primaries should have ended any recording jobs by the time they hit
1902 * vkEndRenderPass (if we are inside a render pass). Commands outside
1903 * a render pass instance (for both primaries and secondaries) spawn
1904 * complete jobs too. So the only case where we can get here without
1905 * finishing a recording job is when we are recording a secondary
1906 * inside a render pass.
1907 */
1908 if (cmd_buffer->state.job) {
1909 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY &&
1910 cmd_buffer->state.pass);
1911 v3dv_cmd_buffer_finish_job(cmd_buffer);
1912 }
1913
1914 cmd_buffer->status = V3DV_CMD_BUFFER_STATUS_EXECUTABLE;
1915
1916 return VK_SUCCESS;
1917 }
1918
1919 static bool
1920 clone_bo_list(struct v3dv_device *device,
1921 struct list_head *dst,
1922 struct list_head *src)
1923 {
1924 assert(device);
1925
1926 list_inithead(dst);
1927 list_for_each_entry(struct v3dv_bo, bo, src, list_link) {
1928 struct v3dv_bo *clone_bo =
1929 vk_alloc(&device->vk.alloc, sizeof(struct v3dv_bo), 8,
1930 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1931 if (!clone_bo)
1932 return false;
1933
1934 *clone_bo = *bo;
1935 list_addtail(&clone_bo->list_link, dst);
1936 }
1937
1938 return true;
1939 }
1940
1941 struct v3dv_job *
1942 v3dv_job_clone(struct v3dv_job *job, bool skip_bcl)
1943 {
1944 struct v3dv_job *clone = vk_alloc(&job->device->vk.alloc,
1945 sizeof(struct v3dv_job), 8,
1946 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
1947 if (!clone)
1948 return NULL;
1949
1950 /* Cloned jobs don't duplicate resources; they share their CLs with the
1951 * original job, since they are typically read-only. The exception to this
1952 * is dynamic rendering suspension paired with
1953 * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, since in that case we need
1954 * to patch the BCL with the resume address and for that we need to create a
1955 * copy of the job so we avoid rewriting the resume address for another copy
1956 * of the same job that may be running in the GPU. When we create a job for
1957 * this use case skip_bcl is set to true and the caller will be responsible
1958 * for creating the BCL.
1959 */
1960 *clone = *job;
1961 clone->is_clone = true;
1962 clone->cmd_buffer = NULL;
1963
1964 /* We need to regen the BO lists so that they point to the BO list in the
1965 * cloned job. Otherwise functions like list_length() will loop forever.
1966 */
1967 if (job->type == V3DV_JOB_TYPE_GPU_CL) {
1968 assert(job->cmd_buffer);
1969 struct v3dv_device *device = job->cmd_buffer->device;
1970
1971 clone->bcl.job = clone;
1972 clone->rcl.job = clone;
1973 clone->indirect.job = clone;
1974
1975 if (!skip_bcl &&
1976 !clone_bo_list(device, &clone->bcl.bo_list, &job->bcl.bo_list)) {
1977 return NULL;
1978 }
1979 if (!clone_bo_list(device, &clone->rcl.bo_list, &job->rcl.bo_list))
1980 return NULL;
1981 if (!clone_bo_list(device, &clone->indirect.bo_list, &job->indirect.bo_list))
1982 return NULL;
1983 }
1984
1985 return clone;
1986 }
1987
1988 /* Clones a job for inclusion in the given command buffer. Note that this
1989 * doesn't make a deep copy, so the cloned job doesn't own any resources.
1990 * Useful when we need to have a job in more than one list, which happens
1991 * for jobs recorded in secondary command buffers when we want to execute
1992 * them in primaries.
1993 */
1994 struct v3dv_job *
1995 v3dv_job_clone_in_cmd_buffer(struct v3dv_job *job,
1996 struct v3dv_cmd_buffer *cmd_buffer)
1997 {
1998 struct v3dv_job *clone = v3dv_job_clone(job, false);
1999 if (!clone) {
2000 v3dv_flag_oom(cmd_buffer, NULL);
2001 return NULL;
2002 }
2003
2004 clone->cmd_buffer = cmd_buffer;
2005 list_addtail(&clone->list_link, &cmd_buffer->jobs);
2006 return clone;
2007 }
2008
2009 void
2010 v3dv_cmd_buffer_merge_barrier_state(struct v3dv_barrier_state *dst,
2011 struct v3dv_barrier_state *src)
2012 {
2013 dst->dst_mask |= src->dst_mask;
2014
2015 dst->src_mask_graphics |= src->src_mask_graphics;
2016 dst->src_mask_compute |= src->src_mask_compute;
2017 dst->src_mask_transfer |= src->src_mask_transfer;
2018
2019 dst->bcl_buffer_access |= src->bcl_buffer_access;
2020 dst->bcl_image_access |= src->bcl_image_access;
2021 }
2022
2023 static void
2024 cmd_buffer_execute_outside_pass(struct v3dv_cmd_buffer *primary,
2025 uint32_t cmd_buffer_count,
2026 const VkCommandBuffer *cmd_buffers)
2027 {
2028 struct v3dv_barrier_state pending_barrier = { 0 };
2029 for (uint32_t i = 0; i < cmd_buffer_count; i++) {
2030 V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
2031
2032 assert(!(secondary->usage_flags &
2033 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT));
2034
2035 /* Secondary command buffers that execute outside a render pass create
2036 * complete jobs with an RCL and tile setup, so we simply want to merge
2037 * their job list into the primary's. However, because they may be
2038 * executed into multiple primaries at the same time and we only have a
2039 * single list_link in each job, we can't just add them to the primary's
2040 * job list and we instead have to clone them first.
2041 *
2042 * Alternatively, we could create an "execute secondary" CPU job that
2043 * when executed in a queue, would submit all the jobs in the referenced
2044 * secondary command buffer. However, this would raise some challenges
2045 * to make it work with the implementation of wait threads in the queue
2046 * which we use for event waits, for example.
2047 */
2048 list_for_each_entry(struct v3dv_job, secondary_job,
2049 &secondary->jobs, list_link) {
2050 /* These can only happen inside a render pass */
2051 assert(secondary_job->type != V3DV_JOB_TYPE_GPU_CL_INCOMPLETE);
2052 struct v3dv_job *job = v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
2053 if (!job)
2054 return;
2055
2056 if (pending_barrier.dst_mask) {
2057 /* FIXME: do the same we do for primaries and only choose the
2058 * relevant src masks.
2059 */
2060 job->serialize = pending_barrier.src_mask_graphics |
2061 pending_barrier.src_mask_transfer |
2062 pending_barrier.src_mask_compute;
2063 if (pending_barrier.bcl_buffer_access ||
2064 pending_barrier.bcl_image_access) {
2065 job->needs_bcl_sync = true;
2066 }
2067 memset(&pending_barrier, 0, sizeof(pending_barrier));
2068 }
2069 }
2070
2071 /* If this secondary had any pending barrier state we will need that
2072 * barrier state consumed with whatever comes after it (first job in
2073 * the next secondary or the primary, if this was the last secondary).
2074 */
2075 assert(secondary->state.barrier.dst_mask ||
2076 (!secondary->state.barrier.bcl_buffer_access &&
2077 !secondary->state.barrier.bcl_image_access));
2078 pending_barrier = secondary->state.barrier;
2079 }
2080
2081 if (pending_barrier.dst_mask) {
2082 v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
2083 &pending_barrier);
2084 }
2085 }
2086
2087 VKAPI_ATTR void VKAPI_CALL
2088 v3dv_CmdExecuteCommands(VkCommandBuffer commandBuffer,
2089 uint32_t commandBufferCount,
2090 const VkCommandBuffer *pCommandBuffers)
2091 {
2092 V3DV_FROM_HANDLE(v3dv_cmd_buffer, primary, commandBuffer);
2093
2094 if (primary->state.pass != NULL) {
2095 v3dv_X(primary->device, cmd_buffer_execute_inside_pass)
2096 (primary, commandBufferCount, pCommandBuffers);
2097 } else {
2098 cmd_buffer_execute_outside_pass(primary,
2099 commandBufferCount, pCommandBuffers);
2100 }
2101 }
2102
2103 static void
2104 cmd_buffer_copy_private_dynamic_state(struct v3dv_dynamic_state *dst,
2105 struct v3dv_dynamic_state *src,
2106 struct vk_dynamic_graphics_state *src_dyn)
2107 {
2108 if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
2109 typed_memcpy(dst->viewport.scale, src->viewport.scale,
2110 MAX_VIEWPORTS);
2111 typed_memcpy(dst->viewport.translate, src->viewport.translate,
2112 MAX_VIEWPORTS);
2113 }
2114 if (BITSET_TEST(src_dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES))
2115 dst->color_write_enable = src->color_write_enable;
2116 }
2117
2118 /* This function copies relevant static state from the pipeline to the command
2119 * buffer state.
2120 *
2121 * Notice the Vulkan runtime uses the term 'dynamic' to refer to all state
2122 * that *could* be dynamic, even if it is not dynamic for a particular
2123 * pipeline, so the terminology used in the runtime may be a bit misleading.
2124 */
2125 static void
2126 cmd_buffer_bind_pipeline_static_state(struct v3dv_cmd_buffer *cmd_buffer,
2127 struct v3dv_pipeline *pipeline)
2128 {
2129 vk_cmd_set_dynamic_graphics_state(&cmd_buffer->vk, &pipeline->dynamic_graphics_state);
2130 cmd_buffer_copy_private_dynamic_state(&cmd_buffer->state.dynamic, &pipeline->dynamic,
2131 &pipeline->dynamic_graphics_state);
2132
2133 }
2134
2135 static void
2136 bind_graphics_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
2137 struct v3dv_pipeline *pipeline)
2138 {
2139 assert(pipeline && !(pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
2140
2141 /* We need to unconditionally bind the pipeline static state, as the state
2142 * could have changed (through calls to vkCmdSetXXX) between bindings of
2143 * the same pipeline.
2144 */
2145 cmd_buffer_bind_pipeline_static_state(cmd_buffer, pipeline);
2146
2147 if (cmd_buffer->state.gfx.pipeline == pipeline)
2148 return;
2149
2150 cmd_buffer->state.gfx.pipeline = pipeline;
2151 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PIPELINE;
2152 }
2153
2154 static void
2155 bind_compute_pipeline(struct v3dv_cmd_buffer *cmd_buffer,
2156 struct v3dv_pipeline *pipeline)
2157 {
2158 assert(pipeline && pipeline->active_stages == VK_SHADER_STAGE_COMPUTE_BIT);
2159
2160 if (cmd_buffer->state.compute.pipeline == pipeline)
2161 return;
2162
2163 cmd_buffer->state.compute.pipeline = pipeline;
2164 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_PIPELINE;
2165 }
2166
2167 VKAPI_ATTR void VKAPI_CALL
2168 v3dv_CmdBindPipeline(VkCommandBuffer commandBuffer,
2169 VkPipelineBindPoint pipelineBindPoint,
2170 VkPipeline _pipeline)
2171 {
2172 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2173 V3DV_FROM_HANDLE(v3dv_pipeline, pipeline, _pipeline);
2174
2175 switch (pipelineBindPoint) {
2176 case VK_PIPELINE_BIND_POINT_COMPUTE:
2177 bind_compute_pipeline(cmd_buffer, pipeline);
2178 break;
2179
2180 case VK_PIPELINE_BIND_POINT_GRAPHICS:
2181 bind_graphics_pipeline(cmd_buffer, pipeline);
2182 break;
2183
2184 default:
2185 assert(!"invalid bind point");
2186 break;
2187 }
2188 }
2189
2190 /* Considers the pipeline's negative_one_to_one state and applies it to the
2191 * current viewport transform if needed to produce the resulting Z translate
2192 * and scale parameters.
2193 */
2194 void
2195 v3dv_cmd_buffer_state_get_viewport_z_xform(struct v3dv_cmd_buffer *cmd_buffer,
2196 uint32_t vp_idx,
2197 float *translate_z, float *scale_z)
2198 {
2199 const struct v3dv_viewport_state *vp_state = &cmd_buffer->state.dynamic.viewport;
2200 const struct vk_viewport_state *vk_vp_state = &cmd_buffer->vk.dynamic_graphics_state.vp;
2201
2202 float t = vp_state->translate[vp_idx][2];
2203 float s = vp_state->scale[vp_idx][2];
2204
2205 assert(cmd_buffer->state.gfx.pipeline);
2206 if (cmd_buffer->state.gfx.pipeline->negative_one_to_one) {
2207 t = (t + vk_vp_state->viewports[vp_idx].maxDepth) * 0.5f;
2208 s *= 0.5f;
2209 }
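   /* Illustrative example, assuming viewport_compute_xform stores
    * translate[vp][2] = minDepth and scale[vp][2] = maxDepth - minDepth:
    * with minDepth = 0.0 and maxDepth = 1.0, a [0,1] clip-space pipeline
    * keeps t = 0.0, s = 1.0, while a negative_one_to_one ([-1,1]) pipeline
    * ends up with t = 0.5, s = 0.5, mapping clip Z = -1 to 0.0 and
    * clip Z = +1 to 1.0.
    */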
2210
2211 if (translate_z)
2212 *translate_z = t;
2213
2214 if (scale_z)
2215 *scale_z = s;
2216 }
2217
2218 VKAPI_ATTR void VKAPI_CALL
2219 v3dv_CmdSetColorWriteEnableEXT(VkCommandBuffer commandBuffer,
2220 uint32_t attachmentCount,
2221 const VkBool32 *pColorWriteEnables)
2222 {
2223 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2224 struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
2225 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
2226 uint32_t color_write_enable = 0;
2227
2228 /* The Vulkan runtime computes color_write_enable as an 8-bit bitset, setting a
2229 * bit per attachment. But when emitting, it is combined with the
2230 * color_write_mask, that is stored as a 32-bit mask (one bit per channel,
2231 * per attachment). So we store the color_write_enable as a 32-bit mask
2232 * ourselves.
2233 */
2234 for (uint32_t i = 0; i < attachmentCount; i++)
2235 color_write_enable |= pColorWriteEnables[i] ? (0xfu << (i * 4)) : 0;
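   /* For example, with attachmentCount = 2 and
    * pColorWriteEnables = { VK_TRUE, VK_FALSE } the loop above produces
    * color_write_enable = 0x0000000f: all four channel bits set for
    * attachment 0 and cleared for attachment 1.
    */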
2236
2237 if (v3dv_dyn->color_write_enable == color_write_enable)
2238 return;
2239
2240 v3dv_dyn->color_write_enable = color_write_enable;
2241 BITSET_SET(dyn->set, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
2242 BITSET_SET(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
2243 }
2244
2245 /* We keep a custom CmdSetViewport because we want to cache the outcome of
2246 * viewport_compute_xform, and because we need to set the viewport count. This
2247 * is especially relevant to our case because we are pushing/popping the
2248 * dynamic state as part of the meta operations.
2249 */
2250 VKAPI_ATTR void VKAPI_CALL
2251 v3dv_CmdSetViewport(VkCommandBuffer commandBuffer,
2252 uint32_t firstViewport,
2253 uint32_t viewportCount,
2254 const VkViewport *pViewports)
2255 {
2256 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
2257 struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
2258 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
2259
2260 const uint32_t total_count = firstViewport + viewportCount;
2261 assert(firstViewport < MAX_VIEWPORTS);
2262 assert(total_count >= 1 && total_count <= MAX_VIEWPORTS);
2263
2264 vk_common_CmdSetViewportWithCount(commandBuffer,
2265 total_count,
2266 pViewports);
2267
2268 for (uint32_t i = firstViewport; i < total_count; i++) {
2269 v3dv_X(cmd_buffer->device, viewport_compute_xform)
2270 (&dyn->vp.viewports[i], v3dv_dyn->viewport.scale[i],
2271 v3dv_dyn->viewport.translate[i]);
2272 }
2273 }
2274
2275 VKAPI_ATTR void VKAPI_CALL
2276 v3dv_CmdSetViewportWithCount(VkCommandBuffer commandBuffer,
2277 uint32_t viewportCount,
2278 const VkViewport *pViewports)
2279 {
2280 v3dv_CmdSetViewport(commandBuffer, 0, viewportCount, pViewports);
2281 }
2282
2283 /* We keep a custom CmdSetScissor because we need to set the scissor
2284 * count. This is especially relevant to our case because we are
2285 * pushing/popping the dynamic state as part of the meta operations.
2286 */
2287 VKAPI_ATTR void VKAPI_CALL
2288 v3dv_CmdSetScissor(VkCommandBuffer commandBuffer,
2289 uint32_t firstScissor,
2290 uint32_t scissorCount,
2291 const VkRect2D *pScissors)
2292 {
2293 assert(firstScissor < MAX_SCISSORS);
2294 assert(firstScissor + scissorCount >= 1 &&
2295 firstScissor + scissorCount <= MAX_SCISSORS);
2296
2297 vk_common_CmdSetScissorWithCount(commandBuffer,
2298 firstScissor + scissorCount,
2299 pScissors);
2300 }
2301
2302 static void
2303 emit_scissor(struct v3dv_cmd_buffer *cmd_buffer)
2304 {
2305 if (cmd_buffer->vk.dynamic_graphics_state.vp.viewport_count == 0)
2306 return;
2307
2308 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
2309
2310 /* FIXME: right now we only support one viewport. viewports[0] would work
2311 * now, but would need to change if we allow multiple viewports.
2312 */
2313 float *vptranslate = dynamic->viewport.translate[0];
2314 float *vpscale = dynamic->viewport.scale[0];
2315 assert(vpscale[0] >= 0);
2316
2317 float vp_minx = vptranslate[0] - vpscale[0];
2318 float vp_maxx = vptranslate[0] + vpscale[0];
2319
2320 /* With VK_KHR_maintenance1 the viewport may have a negative Y scale */
2321 float vp_miny = vptranslate[1] - fabsf(vpscale[1]);
2322 float vp_maxy = vptranslate[1] + fabsf(vpscale[1]);
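   /* Illustrative example, assuming viewport_compute_xform stores
    * scale[0] = width / 2 and translate[0] = x + width / 2: a viewport at
    * x = 0 with width = 1920 gives vp_minx = 0 and vp_maxx = 1920.
    */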
2323
2324 /* Quoting from v3dx_emit:
2325 * "Clip to the scissor if it's enabled, but still clip to the
2326 * drawable regardless since that controls where the binner
2327 * tries to put things.
2328 *
2329 * Additionally, always clip the rendering to the viewport,
2330 * since the hardware does guardband clipping, meaning
2331 * primitives would rasterize outside of the view volume."
2332 */
2333 uint32_t minx, miny, maxx, maxy;
2334
2335 /* From the Vulkan spec:
2336 *
2337 * "The application must ensure (using scissor if necessary) that all
2338 * rendering is contained within the render area. The render area must be
2339 * contained within the framebuffer dimensions."
2340 *
2341 * So it is the application's responsibility to ensure this. Still, we can
2342 * help by automatically restricting the scissor rect to the render area.
2343 */
2344 minx = MAX2(vp_minx, cmd_buffer->state.render_area.offset.x);
2345 miny = MAX2(vp_miny, cmd_buffer->state.render_area.offset.y);
2346 maxx = MIN2(vp_maxx, cmd_buffer->state.render_area.offset.x +
2347 cmd_buffer->state.render_area.extent.width);
2348 maxy = MIN2(vp_maxy, cmd_buffer->state.render_area.offset.y +
2349 cmd_buffer->state.render_area.extent.height);
2350
2351 /* Clip against user provided scissor if needed.
2352 *
2353 * FIXME: right now we only allow one scissor. The code below would need
2354 * to be updated if we support more than one.
2355 */
2356 struct vk_dynamic_graphics_state *vk_dyn =
2357 &cmd_buffer->vk.dynamic_graphics_state;
2358 if (vk_dyn->vp.scissor_count > 0) {
2359 VkRect2D *scissor = &vk_dyn->vp.scissors[0];
2360 minx = MAX2(minx, scissor->offset.x);
2361 miny = MAX2(miny, scissor->offset.y);
2362 maxx = MIN2(maxx, scissor->offset.x + scissor->extent.width);
2363 maxy = MIN2(maxy, scissor->offset.y + scissor->extent.height);
2364 }
2365
2366 /* If the scissor is outside the viewport area we end up with
2367 * min{x,y} > max{x,y}.
2368 */
2369 if (minx > maxx)
2370 maxx = minx;
2371 if (miny > maxy)
2372 maxy = miny;
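   /* Illustrative example: with a viewport covering [0, 1920) x [0, 1080),
    * a render area of the same size and a scissor of offset (100, 100) and
    * extent 200x200, the resulting clip window is offset (100, 100) with
    * extent 200x200.
    */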
2373
2374 cmd_buffer->state.clip_window.offset.x = minx;
2375 cmd_buffer->state.clip_window.offset.y = miny;
2376 cmd_buffer->state.clip_window.extent.width = maxx - minx;
2377 cmd_buffer->state.clip_window.extent.height = maxy - miny;
2378
2379 v3dv_X(cmd_buffer->device, job_emit_clip_window)
2380 (cmd_buffer->state.job, &cmd_buffer->state.clip_window);
2381
2382 BITSET_CLEAR(vk_dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS);
2383 }
2384
2385 static bool
2386 update_gfx_uniform_state(struct v3dv_cmd_buffer *cmd_buffer)
2387 {
2388 /* We need to update uniform streams if any piece of state that is passed
2389 * to the shader as a uniform may have changed.
2390 *
2391 * If only descriptor sets are dirty then we can safely ignore updates
2392 * for shader stages that don't access descriptors.
2393 */
2394 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2395 assert(pipeline);
2396 uint32_t dirty = cmd_buffer->state.dirty;
2397 struct vk_dynamic_graphics_state *dyn =
2398 &cmd_buffer->vk.dynamic_graphics_state;
2399
2400 const bool dirty_uniform_state =
2401 (dirty & (V3DV_CMD_DIRTY_PIPELINE |
2402 V3DV_CMD_DIRTY_PUSH_CONSTANTS |
2403 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
2404 V3DV_CMD_DIRTY_VIEW_INDEX |
2405 V3DV_CMD_DIRTY_DRAW_ID)) ||
2406 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2407
2408 if (!dirty_uniform_state)
2409 return false;
2410
2411 const bool has_new_pipeline = dirty & V3DV_CMD_DIRTY_PIPELINE;
2412 const bool has_new_viewport = BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS);
2413 const bool has_new_push_constants = dirty & V3DV_CMD_DIRTY_PUSH_CONSTANTS;
2414 const bool has_new_descriptors = dirty & V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
2415 const bool has_new_view_index = dirty & V3DV_CMD_DIRTY_VIEW_INDEX;
2416 const bool has_new_draw_id = dirty & V3DV_CMD_DIRTY_DRAW_ID;
2417
2418 /* VK_SHADER_STAGE_FRAGMENT_BIT */
2419 const bool has_new_descriptors_fs =
2420 has_new_descriptors &&
2421 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
2422
2423 const bool has_new_push_constants_fs =
2424 has_new_push_constants &&
2425 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_FRAGMENT_BIT);
2426
2427 const bool needs_fs_update = has_new_pipeline ||
2428 has_new_view_index ||
2429 has_new_push_constants_fs ||
2430 has_new_descriptors_fs;
2431
2432 if (needs_fs_update) {
2433 struct v3dv_shader_variant *fs_variant =
2434 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2435
2436 cmd_buffer->state.uniforms.fs =
2437 v3dv_write_uniforms(cmd_buffer, pipeline, fs_variant);
2438 }
2439
2440 /* VK_SHADER_STAGE_GEOMETRY_BIT */
2441 if (pipeline->has_gs) {
2442 const bool has_new_descriptors_gs =
2443 has_new_descriptors &&
2444 (cmd_buffer->state.dirty_descriptor_stages &
2445 VK_SHADER_STAGE_GEOMETRY_BIT);
2446
2447 const bool has_new_push_constants_gs =
2448 has_new_push_constants &&
2449 (cmd_buffer->state.dirty_push_constants_stages &
2450 VK_SHADER_STAGE_GEOMETRY_BIT);
2451
2452 const bool needs_gs_update = has_new_viewport ||
2453 has_new_view_index ||
2454 has_new_pipeline ||
2455 has_new_push_constants_gs ||
2456 has_new_descriptors_gs;
2457
2458 if (needs_gs_update) {
2459 struct v3dv_shader_variant *gs_variant =
2460 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2461
2462 struct v3dv_shader_variant *gs_bin_variant =
2463 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2464
2465 cmd_buffer->state.uniforms.gs =
2466 v3dv_write_uniforms(cmd_buffer, pipeline, gs_variant);
2467
2468 cmd_buffer->state.uniforms.gs_bin =
2469 v3dv_write_uniforms(cmd_buffer, pipeline, gs_bin_variant);
2470 }
2471 }
2472
2473 /* VK_SHADER_STAGE_VERTEX_BIT */
2474 const bool has_new_descriptors_vs =
2475 has_new_descriptors &&
2476 (cmd_buffer->state.dirty_descriptor_stages & VK_SHADER_STAGE_VERTEX_BIT);
2477
2478 const bool has_new_push_constants_vs =
2479 has_new_push_constants &&
2480 (cmd_buffer->state.dirty_push_constants_stages & VK_SHADER_STAGE_VERTEX_BIT);
2481
2482 const bool needs_vs_update = has_new_viewport ||
2483 has_new_view_index ||
2484 has_new_draw_id ||
2485 has_new_pipeline ||
2486 has_new_push_constants_vs ||
2487 has_new_descriptors_vs;
2488
2489 if (needs_vs_update) {
2490 struct v3dv_shader_variant *vs_variant =
2491 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2492
2493 struct v3dv_shader_variant *vs_bin_variant =
2494 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2495
2496 cmd_buffer->state.uniforms.vs =
2497 v3dv_write_uniforms(cmd_buffer, pipeline, vs_variant);
2498
2499 cmd_buffer->state.uniforms.vs_bin =
2500 v3dv_write_uniforms(cmd_buffer, pipeline, vs_bin_variant);
2501 }
2502
2503 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_VIEW_INDEX;
2504 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_DRAW_ID;
2505
2506 return true;
2507 }
2508
2509 /* This stores command buffer state that we might be about to stomp for
2510 * a meta operation.
2511 */
2512 void
2513 v3dv_cmd_buffer_meta_state_push(struct v3dv_cmd_buffer *cmd_buffer,
2514 bool push_descriptor_state)
2515 {
2516 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2517
2518 /* Attachment state.
2519 *
2520 * We store this state even if we are not currently in a subpass
2521 * (subpass_idx == -1) because we may get here to implement subpass
2522 * resolves via vkCmdResolveImage from
2523 * cmd_buffer_subpass_handle_pending_resolves. In that scenario we pretend
2524 * we are no longer in a subpass because Vulkan disallows image resolves
2525 * via vkCmdResolveImage during subpasses, but we still need to preserve
2526 * attachment state because we may have more subpasses to go through
2527 * after processing resolves in the current subpass.
2528 */
2529 const uint32_t attachment_state_item_size =
2530 sizeof(struct v3dv_cmd_buffer_attachment_state);
2531 const uint32_t attachment_state_total_size =
2532 attachment_state_item_size * state->attachment_alloc_count;
2533 if (state->meta.attachment_alloc_count < state->attachment_alloc_count) {
2534 if (state->meta.attachment_alloc_count > 0)
2535 vk_free(&cmd_buffer->device->vk.alloc, state->meta.attachments);
2536
2537 state->meta.attachments = vk_zalloc(&cmd_buffer->device->vk.alloc,
2538 attachment_state_total_size, 8,
2539 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2540 if (!state->meta.attachments) {
2541 v3dv_flag_oom(cmd_buffer, NULL);
2542 return;
2543 }
2544 state->meta.attachment_alloc_count = state->attachment_alloc_count;
2545 }
2546 state->meta.attachment_count = state->attachment_alloc_count;
2547 if (state->meta.attachments) {
2548 memcpy(state->meta.attachments, state->attachments,
2549 attachment_state_total_size);
2550 }
2551
2552 if (state->subpass_idx != -1) {
2553 state->meta.subpass_idx = state->subpass_idx;
2554 state->meta.framebuffer = v3dv_framebuffer_to_handle(state->framebuffer);
2555 state->meta.pass = v3dv_render_pass_to_handle(state->pass);
2556
2557 state->meta.tile_aligned_render_area = state->tile_aligned_render_area;
2558 memcpy(&state->meta.render_area, &state->render_area, sizeof(VkRect2D));
2559 }
2560
2561 /* We expect that meta operations are graphics-only, so we only take into
2562 * account the graphics pipeline, and the graphics state
2563 */
2564 state->meta.gfx.pipeline = state->gfx.pipeline;
2565 vk_dynamic_graphics_state_copy(&state->meta.dynamic_graphics_state,
2566 &cmd_buffer->vk.dynamic_graphics_state);
2567 memcpy(&state->meta.dynamic, &state->dynamic, sizeof(state->dynamic));
2568
2569 struct v3dv_descriptor_state *gfx_descriptor_state =
2570 &cmd_buffer->state.gfx.descriptor_state;
2571
2572 if (push_descriptor_state) {
2573 if (gfx_descriptor_state->valid != 0) {
2574 memcpy(&state->meta.gfx.descriptor_state, gfx_descriptor_state,
2575 sizeof(state->gfx.descriptor_state));
2576 }
2577 state->meta.has_descriptor_state = true;
2578 } else {
2579 state->meta.has_descriptor_state = false;
2580 }
2581
2582 if (cmd_buffer->state.push_constants_size > 0) {
2583 state->meta.push_constants_size = cmd_buffer->state.push_constants_size;
2584 memcpy(state->meta.push_constants, cmd_buffer->state.push_constants_data,
2585 cmd_buffer->state.push_constants_size);
2586 cmd_buffer->state.push_constants_size = 0;
2587 }
2588 }
2589
2590 /* This restores command buffer state after a meta operation
2591 */
2592 void
2593 v3dv_cmd_buffer_meta_state_pop(struct v3dv_cmd_buffer *cmd_buffer,
2594 bool needs_subpass_resume)
2595 {
2596 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2597
2598 /* Attachment state */
2599 assert(state->meta.attachment_count <= state->attachment_alloc_count);
2600 const uint32_t attachment_state_item_size =
2601 sizeof(struct v3dv_cmd_buffer_attachment_state);
2602 const uint32_t attachment_state_total_size =
2603 attachment_state_item_size * state->meta.attachment_count;
2604 if (attachment_state_total_size > 0) {
2605 memcpy(state->attachments, state->meta.attachments,
2606 attachment_state_total_size);
2607 }
2608
2609 if (state->meta.subpass_idx != -1) {
2610 state->pass = v3dv_render_pass_from_handle(state->meta.pass);
2611 state->framebuffer = v3dv_framebuffer_from_handle(state->meta.framebuffer);
2612
2613 state->tile_aligned_render_area = state->meta.tile_aligned_render_area;
2614 memcpy(&state->render_area, &state->meta.render_area, sizeof(VkRect2D));
2615
2616 /* If needs_subpass_resume is true it means that we emitted the meta
2617 * operation in its own job (possibly with an RT config that is
2618 * incompatible with the current subpass), so resuming subpass execution
2619 * after it requires that we create a new job with the subpass RT setup.
2620 */
2621 if (needs_subpass_resume)
2622 v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->meta.subpass_idx);
2623 } else {
2624 state->subpass_idx = -1;
2625 }
2626
2627 if (state->meta.gfx.pipeline != NULL) {
2628 struct v3dv_pipeline *pipeline = state->meta.gfx.pipeline;
2629 VkPipelineBindPoint pipeline_binding =
2630 v3dv_pipeline_get_binding_point(pipeline);
2631 v3dv_CmdBindPipeline(v3dv_cmd_buffer_to_handle(cmd_buffer),
2632 pipeline_binding,
2633 v3dv_pipeline_to_handle(state->meta.gfx.pipeline));
2634 } else {
2635 state->gfx.pipeline = NULL;
2636 }
2637
2638 /* Restore dynamic state */
2639 vk_dynamic_graphics_state_copy(&cmd_buffer->vk.dynamic_graphics_state,
2640 &state->meta.dynamic_graphics_state);
2641 memcpy(&state->dynamic, &state->meta.dynamic, sizeof(state->dynamic));
2642 state->dirty = ~0;
2643
2644 if (state->meta.has_descriptor_state) {
2645 if (state->meta.gfx.descriptor_state.valid != 0) {
2646 memcpy(&state->gfx.descriptor_state, &state->meta.gfx.descriptor_state,
2647 sizeof(state->gfx.descriptor_state));
2648 } else {
2649 state->gfx.descriptor_state.valid = 0;
2650 }
2651 }
2652
2653 /* We only need to restore push constant data if we had any data in the
2654 * original command buffer and the meta operation wrote new push constant
2655 * data.
2656 */
2657 if (state->meta.push_constants_size > 0 &&
2658 cmd_buffer->state.push_constants_size > 0) {
2659 memcpy(cmd_buffer->state.push_constants_data, state->meta.push_constants,
2660 state->meta.push_constants_size);
2661 }
2662 cmd_buffer->state.push_constants_size = state->meta.push_constants_size;
2663
2664 state->meta.gfx.pipeline = NULL;
2665 state->meta.framebuffer = VK_NULL_HANDLE;
2666 state->meta.pass = VK_NULL_HANDLE;
2667 state->meta.subpass_idx = -1;
2668 state->meta.has_descriptor_state = false;
2669 state->meta.push_constants_size = 0;
2670 }
2671
2672 static struct v3dv_job *
2673 cmd_buffer_pre_draw_split_job(struct v3dv_cmd_buffer *cmd_buffer)
2674 {
2675 struct v3dv_job *job = cmd_buffer->state.job;
2676 assert(job);
2677
2678 /* If the job has been flagged with 'always_flush' and it has already
2679 * recorded any draw calls then we need to start a new job for it.
2680 */
2681 if (job->always_flush && job->draw_count > 0) {
2682 assert(cmd_buffer->state.pass);
2683 /* First, flag the current job as not being the last in the
2684 * current subpass
2685 */
2686 job->is_subpass_finish = false;
2687
2688 /* Now start a new job in the same subpass and flag it as continuing
2689 * the current subpass.
2690 */
2691 job = v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2692 cmd_buffer->state.subpass_idx);
2693 assert(job->draw_count == 0);
2694
2695 /* Inherit the 'always flush' behavior */
2696 job->always_flush = true;
2697 }
2698
2699 assert(job->draw_count == 0 || !job->always_flush);
2700 return job;
2701 }
2702
2703 /**
2704 * The Vulkan spec states:
2705 *
2706 * "It is legal for a subpass to use no color or depth/stencil
2707 * attachments (...) This kind of subpass can use shader side effects such
2708 * as image stores and atomics to produce an output. In this case, the
2709 * subpass continues to use the width, height, and layers of the framebuffer
2710 * to define the dimensions of the rendering area, and the
2711 * rasterizationSamples from each pipeline’s
2712 * VkPipelineMultisampleStateCreateInfo to define the number of samples used
2713 * in rasterization."
2714 *
2715 * We need to enable MSAA in the TILE_BINNING_MODE_CFG packet, which we
2716 * emit when we start a new frame at the beginning of a subpass. At that point,
2717 * if the framebuffer doesn't have any attachments we won't enable MSAA and
2718 * the job won't be valid in the scenario described by the spec.
2719 *
2720 * This function is intended to be called before a draw call and will test if
2721 * we are in that scenario, in which case, it will restart the current job
2722 * with MSAA enabled.
2723 */
2724 static void
2725 cmd_buffer_restart_job_for_msaa_if_needed(struct v3dv_cmd_buffer *cmd_buffer)
2726 {
2727 assert(cmd_buffer->state.job);
2728
2729 /* We don't support variableMultisampleRate so we know that all pipelines
2730 * bound in the same subpass must have matching number of samples, so we
2731 * can do this check only on the first draw call.
2732 */
2733 if (cmd_buffer->state.job->draw_count > 0)
2734 return;
2735
2736 /* We only need to restart the frame if the pipeline requires MSAA but
2737 * our frame tiling didn't enable it.
2738 */
2739 if (!cmd_buffer->state.gfx.pipeline->msaa ||
2740 cmd_buffer->state.job->frame_tiling.msaa) {
2741 return;
2742 }
2743
2744 /* FIXME: Secondary command buffers don't start frames. Instead, they are
2745 * recorded into primary jobs that start them. For secondaries, we should
2746 * still handle this scenario, but we should do that when we record them
2747 * into primaries by testing if any of the secondaries has multisampled
2748 * draw calls in them, and then using that info to decide if we need to
2749 * restart the primary job into which they are being recorded.
2750 */
2751 if (cmd_buffer->vk.level != VK_COMMAND_BUFFER_LEVEL_PRIMARY)
2752 return;
2753
2754 /* Drop the current job and restart it with MSAA enabled */
2755 struct v3dv_job *old_job = cmd_buffer->state.job;
2756 cmd_buffer->state.job = NULL;
2757
2758 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
2759 sizeof(struct v3dv_job), 8,
2760 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
2761 if (!job) {
2762 v3dv_flag_oom(cmd_buffer, NULL);
2763 return;
2764 }
2765
2766 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CL, cmd_buffer->device, cmd_buffer,
2767 cmd_buffer->state.subpass_idx);
2768 cmd_buffer->state.job = job;
2769
2770 v3dv_job_start_frame(job,
2771 old_job->frame_tiling.width,
2772 old_job->frame_tiling.height,
2773 old_job->frame_tiling.layers,
2774 true, false,
2775 old_job->frame_tiling.render_target_count,
2776 old_job->frame_tiling.internal_bpp,
2777 old_job->frame_tiling.total_color_bpp,
2778 true /* msaa */);
2779
2780 v3dv_job_destroy(old_job);
2781 }
2782
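/* Returns true if the binning stages of the given pipeline (the vertex bin
 * shader and, if present, the geometry bin shader) may access resources
 * covered by a pending BCL barrier, in which case the draw call requires
 * binning-stage synchronization.
 */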
2783 static bool
2784 cmd_buffer_binning_sync_required(struct v3dv_cmd_buffer *cmd_buffer,
2785 struct v3dv_pipeline *pipeline,
2786 bool indexed, bool indirect)
2787 {
2788 const struct v3dv_descriptor_maps *vs_bin_maps =
2789 pipeline->shared_data->maps[BROADCOM_SHADER_VERTEX_BIN];
2790
2791 const struct v3dv_descriptor_maps *gs_bin_maps =
2792 pipeline->shared_data->maps[BROADCOM_SHADER_GEOMETRY_BIN];
2793
2794 VkAccessFlags buffer_access =
2795 cmd_buffer->state.barrier.bcl_buffer_access;
2796 if (buffer_access) {
2797 /* Index buffer read */
2798 if (indexed && (buffer_access & (VK_ACCESS_2_INDEX_READ_BIT |
2799 VK_ACCESS_2_MEMORY_READ_BIT))) {
2800 return true;
2801 }
2802
2803 /* Indirect buffer read */
2804 if (indirect && (buffer_access & (VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT |
2805 VK_ACCESS_2_MEMORY_READ_BIT))) {
2806 return true;
2807 }
2808
2809 /* Attribute read */
2810 if (buffer_access & (VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT |
2811 VK_ACCESS_2_MEMORY_READ_BIT)) {
2812 const struct v3d_vs_prog_data *prog_data =
2813 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN]->prog_data.vs;
2814
2815 for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
2816 if (prog_data->vattr_sizes[i] > 0)
2817 return true;
2818 }
2819 }
2820
2821 /* UBO / SSBO read */
2822 if (buffer_access & (VK_ACCESS_2_UNIFORM_READ_BIT |
2823 VK_ACCESS_2_SHADER_READ_BIT |
2824 VK_ACCESS_2_MEMORY_READ_BIT |
2825 VK_ACCESS_2_SHADER_STORAGE_READ_BIT)) {
2826
2827 if (vs_bin_maps->ubo_map.num_desc > 0 ||
2828 vs_bin_maps->ssbo_map.num_desc > 0) {
2829 return true;
2830 }
2831
2832 if (gs_bin_maps && (gs_bin_maps->ubo_map.num_desc > 0 ||
2833 gs_bin_maps->ssbo_map.num_desc > 0)) {
2834 return true;
2835 }
2836 }
2837
2838 /* SSBO write */
2839 if (buffer_access & (VK_ACCESS_2_SHADER_WRITE_BIT |
2840 VK_ACCESS_2_MEMORY_WRITE_BIT |
2841 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT)) {
2842 if (vs_bin_maps->ssbo_map.num_desc > 0)
2843 return true;
2844
2845 if (gs_bin_maps && gs_bin_maps->ssbo_map.num_desc > 0)
2846 return true;
2847 }
2848
2849 /* Texel Buffer read */
2850 if (buffer_access & (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
2851 VK_ACCESS_2_MEMORY_READ_BIT)) {
2852 if (vs_bin_maps->texture_map.num_desc > 0)
2853 return true;
2854
2855 if (gs_bin_maps && gs_bin_maps->texture_map.num_desc > 0)
2856 return true;
2857 }
2858 }
2859
2860 VkAccessFlags image_access =
2861 cmd_buffer->state.barrier.bcl_image_access;
2862 if (image_access) {
2863 /* Image load / store */
2864 if (image_access & (VK_ACCESS_2_SHADER_READ_BIT |
2865 VK_ACCESS_2_SHADER_WRITE_BIT |
2866 VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
2867 VK_ACCESS_2_SHADER_STORAGE_READ_BIT |
2868 VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
2869 VK_ACCESS_2_MEMORY_READ_BIT |
2870 VK_ACCESS_2_MEMORY_WRITE_BIT)) {
2871 if (vs_bin_maps->texture_map.num_desc > 0 ||
2872 vs_bin_maps->sampler_map.num_desc > 0) {
2873 return true;
2874 }
2875
2876 if (gs_bin_maps && (gs_bin_maps->texture_map.num_desc > 0 ||
2877 gs_bin_maps->sampler_map.num_desc > 0)) {
2878 return true;
2879 }
2880 }
2881 }
2882
2883 return false;
2884 }
2885
2886 void
2887 v3dv_cmd_buffer_consume_bcl_sync(struct v3dv_cmd_buffer *cmd_buffer,
2888 struct v3dv_job *job)
2889 {
2890 job->needs_bcl_sync = true;
2891 cmd_buffer->state.barrier.bcl_buffer_access = 0;
2892 cmd_buffer->state.barrier.bcl_image_access = 0;
2893 }
2894
2895 static inline uint32_t
2896 compute_prog_score(struct v3dv_shader_variant *vs)
2897 {
2898 const uint32_t inst_count = vs->qpu_insts_size / sizeof(uint64_t);
2899 const uint32_t tmu_count = vs->prog_data.base->tmu_count +
2900 vs->prog_data.base->tmu_spills +
2901 vs->prog_data.base->tmu_fills;
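   /* Illustrative example: a bin shader with 100 instructions and 8 TMU
    * operations (lookups + spills + fills) scores 100 + 4 * 8 = 132; the
    * score weighs TMU operations more heavily than plain instructions.
    */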
2902 return inst_count + 4 * tmu_count;
2903 }
2904
2905 static void
2906 job_update_double_buffer_score(struct v3dv_job *job,
2907 struct v3dv_pipeline *pipeline,
2908 uint32_t vertex_count,
2909 VkExtent2D *render_area)
2910 {
2911 /* FIXME: assume anything with GS workloads is too expensive */
2912 struct v3dv_shader_variant *gs_bin =
2913 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2914 if (gs_bin) {
2915 job->can_use_double_buffer = false;
2916 return;
2917 }
2918
2919 /* Keep track of vertex processing: too much geometry processing would not
2920 * be good for double-buffer.
2921 */
2922 struct v3dv_shader_variant *vs_bin =
2923 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2924 assert(vs_bin);
2925 uint32_t geom_score = vertex_count * compute_prog_score(vs_bin);
2926
2927 struct v3dv_shader_variant *vs =
2928 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2929 assert(vs);
2930 uint32_t vs_score = vertex_count * compute_prog_score(vs);
2931 geom_score += vs_score;
2932
2933 job->double_buffer_score.geom += geom_score;
2934
2935 /* Compute pixel rendering cost.
2936 *
2937 * We estimate that on average a draw would render 0.2% of the pixels in
2938 * the render area. That would be a 64x64 region in a 1920x1080 area.
2939 */
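   /* Illustrative arithmetic: for a 1920x1080 render area this gives
    * 0.002 * 1920 * 1080 ~= 4147 pixels, roughly a 64x64 region.
    */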
2940 struct v3dv_shader_variant *fs =
2941 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2942 assert(fs);
2943 uint32_t pixel_count = 0.002f * render_area->width * render_area->height;
2944 uint32_t render_score = vs_score + pixel_count * compute_prog_score(fs);
2945
2946 job->double_buffer_score.render += render_score;
2947 }
2948
2949 void
2950 v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer,
2951 bool indexed, bool indirect,
2952 uint32_t vertex_count)
2953 {
2954 assert(cmd_buffer->state.gfx.pipeline);
2955 assert(!(cmd_buffer->state.gfx.pipeline->active_stages & VK_SHADER_STAGE_COMPUTE_BIT));
2956
2957 /* If we emitted a pipeline barrier right before this draw we won't have
2958 * an active job. In that case, create a new job continuing the current
2959 * subpass.
2960 */
2961 if (!cmd_buffer->state.job) {
2962 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2963 cmd_buffer->state.subpass_idx);
2964 }
2965
2966 /* Restart single sample job for MSAA pipeline if needed */
2967 cmd_buffer_restart_job_for_msaa_if_needed(cmd_buffer);
2968
2969 /* If the job is configured to flush on every draw call we need to create
2970 * a new job now.
2971 */
2972 struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer);
2973 job->draw_count++;
2974
2975 /* Track VK_KHR_buffer_device_address usage in the job */
2976 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
2977 job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
2978
2979 /* If this job is serialized (has consumed a barrier) then check if we need
2980 * to sync at the binning stage by testing if the binning shaders involved
2981 * with the draw call require access to external resources.
2982 */
2983 if (job->serialize && (cmd_buffer->state.barrier.bcl_buffer_access ||
2984 cmd_buffer->state.barrier.bcl_image_access)) {
2985 assert(!job->needs_bcl_sync);
2986 if (cmd_buffer_binning_sync_required(cmd_buffer, pipeline,
2987 indexed, indirect)) {
2988 v3dv_cmd_buffer_consume_bcl_sync(cmd_buffer, job);
2989 }
2990 }
2991
2992 /* GL shader state binds shaders, uniform and vertex attribute state. The
2993 * compiler injects uniforms to handle some descriptor types (such as
2994 * textures), so we need to regenerate it when the descriptor state changes.
2995 *
2996 * We also need to emit new shader state if we have a dirty viewport since
2997 * that will require emitting new uniform state for QUNIFORM_VIEWPORT_*.
2998 */
2999 uint32_t *dirty = &cmd_buffer->state.dirty;
3000 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
3001
3002 const bool dirty_uniform_state =
3003 update_gfx_uniform_state(cmd_buffer);
3004
3005 struct v3dv_device *device = cmd_buffer->device;
3006
3007 if (dirty_uniform_state || (*dirty & V3DV_CMD_DIRTY_VERTEX_BUFFER))
3008 v3dv_X(device, cmd_buffer_emit_gl_shader_state)(cmd_buffer);
3009
3010 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE) ||
3011 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE) ||
3012 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE) ||
3013 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) ||
3014 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE) ||
3015 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE) ||
3016 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE)) {
3017 v3dv_X(device, cmd_buffer_emit_configuration_bits)(cmd_buffer);
3018 }
3019
3020 if (*dirty & (V3DV_CMD_DIRTY_PIPELINE)) {
3021 v3dv_X(device, cmd_buffer_emit_varyings_state)(cmd_buffer);
3022 }
3023
3024 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_SCISSORS) ||
3025 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS)) {
3026 emit_scissor(cmd_buffer);
3027 }
3028
3029 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VP_VIEWPORTS))
3030 v3dv_X(device, cmd_buffer_emit_viewport)(cmd_buffer);
3031
3032 if (*dirty & V3DV_CMD_DIRTY_INDEX_BUFFER)
3033 v3dv_X(device, cmd_buffer_emit_index_buffer)(cmd_buffer);
3034
3035 bool any_dynamic_stencil_dirty =
3036 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
3037 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
3038 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
3039 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP);
3040
3041 if (*dirty & V3DV_CMD_DIRTY_PIPELINE || any_dynamic_stencil_dirty)
3042 v3dv_X(device, cmd_buffer_emit_stencil)(cmd_buffer);
3043
3044 if (*dirty & V3DV_CMD_DIRTY_PIPELINE ||
3045 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS)) {
3046 v3dv_X(device, cmd_buffer_emit_depth_bias)(cmd_buffer);
3047 }
3048
3049 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS))
3050 v3dv_X(device, cmd_buffer_emit_depth_bounds)(cmd_buffer);
3051
3052 if (*dirty & V3DV_CMD_DIRTY_PIPELINE ||
3053 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS)) {
3054 v3dv_X(device, cmd_buffer_emit_blend)(cmd_buffer);
3055 }
3056
3057 if (*dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY)
3058 v3dv_X(device, cmd_buffer_emit_occlusion_query)(cmd_buffer);
3059
3060 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH))
3061 v3dv_X(device, cmd_buffer_emit_line_width)(cmd_buffer);
3062
3063 if (dyn->ia.primitive_topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST &&
3064 !job->emitted_default_point_size) {
3065 v3dv_X(device, cmd_buffer_emit_default_point_size)(cmd_buffer);
3066 }
3067
3068 if (*dirty & V3DV_CMD_DIRTY_PIPELINE)
3069 v3dv_X(device, cmd_buffer_emit_sample_state)(cmd_buffer);
3070
3071 if (*dirty & V3DV_CMD_DIRTY_PIPELINE ||
3072 BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES)) {
3073 v3dv_X(device, cmd_buffer_emit_color_write_mask)(cmd_buffer);
3074 }
3075
3076 /* We disable double-buffer mode if indirect draws are used because in that
3077 * case we don't know the vertex count.
3078 */
3079 if (indirect) {
3080 job->can_use_double_buffer = false;
3081 } else if (job->can_use_double_buffer) {
3082 job_update_double_buffer_score(job, pipeline, vertex_count,
3083 &cmd_buffer->state.render_area.extent);
3084 }
3085
3086 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_PIPELINE;
3087 }
3088
3089 static inline void
3090 cmd_buffer_set_view_index(struct v3dv_cmd_buffer *cmd_buffer,
3091 uint32_t view_index)
3092 {
3093 if (view_index != cmd_buffer->state.view_index) {
3094 cmd_buffer->state.view_index = view_index;
3095 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VIEW_INDEX;
3096 }
3097 }
3098
3099 static void
3100 cmd_buffer_draw(struct v3dv_cmd_buffer *cmd_buffer,
3101 struct v3dv_draw_info *info)
3102 {
3103 uint32_t vertex_count =
3104 info->vertex_count * info->instance_count;
3105
3106 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3107 if (likely(!pass->multiview_enabled)) {
3108 cmd_buffer_set_view_index(cmd_buffer, 0);
3109 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
3110 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
3111 return;
3112 }
3113
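/* There is no native multiview support, so replay the draw once per view in
 * the subpass view mask, updating the tracked view index so any per-view
 * state is re-emitted for each iteration.
 */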
3114 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3115 while (view_mask) {
3116 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3117 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, false, vertex_count);
3118 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw)(cmd_buffer, info);
3119 }
3120 }
3121
3122 VKAPI_ATTR void VKAPI_CALL
3123 v3dv_CmdDraw(VkCommandBuffer commandBuffer,
3124 uint32_t vertexCount,
3125 uint32_t instanceCount,
3126 uint32_t firstVertex,
3127 uint32_t firstInstance)
3128 {
3129 if (vertexCount == 0 || instanceCount == 0)
3130 return;
3131
3132 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3133 struct v3dv_draw_info info = {};
3134 info.vertex_count = vertexCount;
3135 info.instance_count = instanceCount;
3136 info.first_instance = firstInstance;
3137 info.first_vertex = firstVertex;
3138
3139 cmd_buffer_draw(cmd_buffer, &info);
3140 }
3141
3142 VKAPI_ATTR void VKAPI_CALL
3143 v3dv_CmdDrawMultiEXT(VkCommandBuffer commandBuffer,
3144 uint32_t drawCount,
3145 const VkMultiDrawInfoEXT *pVertexInfo,
3146 uint32_t instanceCount,
3147 uint32_t firstInstance,
3148 uint32_t stride)
3149
3150 {
3151 if (drawCount == 0 || instanceCount == 0)
3152 return;
3153
3154 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3155
3156 uint32_t i = 0;
3157 vk_foreach_multi_draw(draw, i, pVertexInfo, drawCount, stride) {
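/* Each sub-draw gets its own draw index, so update the tracked draw id and
 * flag it dirty so dependent uniform state is re-emitted.
 */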
3158 cmd_buffer->state.draw_id = i;
3159 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID;
3160
3161 struct v3dv_draw_info info = {};
3162 info.vertex_count = draw->vertexCount;
3163 info.instance_count = instanceCount;
3164 info.first_instance = firstInstance;
3165 info.first_vertex = draw->firstVertex;
3166
3167 cmd_buffer_draw(cmd_buffer, &info);
3168 }
3169 }
3170
3171 VKAPI_ATTR void VKAPI_CALL
3172 v3dv_CmdDrawIndexed(VkCommandBuffer commandBuffer,
3173 uint32_t indexCount,
3174 uint32_t instanceCount,
3175 uint32_t firstIndex,
3176 int32_t vertexOffset,
3177 uint32_t firstInstance)
3178 {
3179 if (indexCount == 0 || instanceCount == 0)
3180 return;
3181
3182 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3183
3184 uint32_t vertex_count = indexCount * instanceCount;
3185
3186 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3187 if (likely(!pass->multiview_enabled)) {
3188 cmd_buffer_set_view_index(cmd_buffer, 0);
3189 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3190 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
3191 (cmd_buffer, indexCount, instanceCount,
3192 firstIndex, vertexOffset, firstInstance);
3193 return;
3194 }
3195
3196 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3197 while (view_mask) {
3198 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3199 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3200 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
3201 (cmd_buffer, indexCount, instanceCount,
3202 firstIndex, vertexOffset, firstInstance);
3203 }
3204 }
3205
3206 VKAPI_ATTR void VKAPI_CALL
3207 v3dv_CmdDrawMultiIndexedEXT(VkCommandBuffer commandBuffer,
3208 uint32_t drawCount,
3209 const VkMultiDrawIndexedInfoEXT *pIndexInfo,
3210 uint32_t instanceCount,
3211 uint32_t firstInstance,
3212 uint32_t stride,
3213 const int32_t *pVertexOffset)
3214 {
3215 if (drawCount == 0 || instanceCount == 0)
3216 return;
3217
3218 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3219
3220 uint32_t i = 0;
3221 vk_foreach_multi_draw_indexed(draw, i, pIndexInfo, drawCount, stride) {
3222 uint32_t vertex_count = draw->indexCount * instanceCount;
3223 int32_t vertexOffset = pVertexOffset ? *pVertexOffset : draw->vertexOffset;
3224
3225 cmd_buffer->state.draw_id = i;
3226 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DRAW_ID;
3227
3228 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3229 if (likely(!pass->multiview_enabled)) {
3230 cmd_buffer_set_view_index(cmd_buffer, 0);
3231 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3232 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
3233 (cmd_buffer, draw->indexCount, instanceCount,
3234 draw->firstIndex, vertexOffset, firstInstance);
3235 continue;
3236 }
3237 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3238 while (view_mask) {
3239 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3240 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, false, vertex_count);
3241 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indexed)
3242 (cmd_buffer, draw->indexCount, instanceCount,
3243 draw->firstIndex, vertexOffset, firstInstance);
3244 }
3245 }
3246 }
3247
3248 VKAPI_ATTR void VKAPI_CALL
3249 v3dv_CmdDrawIndirect(VkCommandBuffer commandBuffer,
3250 VkBuffer _buffer,
3251 VkDeviceSize offset,
3252 uint32_t drawCount,
3253 uint32_t stride)
3254 {
3255 /* drawCount is the number of draws to execute, and can be zero. */
3256 if (drawCount == 0)
3257 return;
3258
3259 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3260 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
3261
3262 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3263 if (likely(!pass->multiview_enabled)) {
3264 cmd_buffer_set_view_index(cmd_buffer, 0);
3265 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
3266 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
3267 (cmd_buffer, buffer, offset, drawCount, stride);
3268 return;
3269 }
3270
3271 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3272 while (view_mask) {
3273 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3274 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, false, true, 0);
3275 v3dv_X(cmd_buffer->device, cmd_buffer_emit_draw_indirect)
3276 (cmd_buffer, buffer, offset, drawCount, stride);
3277 }
3278 }
3279
3280 VKAPI_ATTR void VKAPI_CALL
3281 v3dv_CmdDrawIndexedIndirect(VkCommandBuffer commandBuffer,
3282 VkBuffer _buffer,
3283 VkDeviceSize offset,
3284 uint32_t drawCount,
3285 uint32_t stride)
3286 {
3287 /* drawCount is the number of draws to execute, and can be zero. */
3288 if (drawCount == 0)
3289 return;
3290
3291 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3292 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
3293
3294 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
3295 if (likely(!pass->multiview_enabled)) {
3296 cmd_buffer_set_view_index(cmd_buffer, 0);
3297 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
3298 v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
3299 (cmd_buffer, buffer, offset, drawCount, stride);
3300 return;
3301 }
3302
3303 uint32_t view_mask = pass->subpasses[cmd_buffer->state.subpass_idx].view_mask;
3304 while (view_mask) {
3305 cmd_buffer_set_view_index(cmd_buffer, u_bit_scan(&view_mask));
3306 v3dv_cmd_buffer_emit_pre_draw(cmd_buffer, true, true, 0);
3307 v3dv_X(cmd_buffer->device, cmd_buffer_emit_indexed_indirect)
3308 (cmd_buffer, buffer, offset, drawCount, stride);
3309 }
3310 }
3311
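/* Translates a single Vulkan barrier into the driver's coarse barrier state:
 * which job types consume the barrier (dst_mask), which job types produced
 * the work being waited on (per-consumer src masks), and the access masks
 * relevant to binning-stage consumers.
 */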
3312 static void
3313 handle_barrier(VkPipelineStageFlags2 srcStageMask, VkAccessFlags2 srcAccessMask,
3314 VkPipelineStageFlags2 dstStageMask, VkAccessFlags2 dstAccessMask,
3315 bool is_image_barrier, bool is_buffer_barrier,
3316 struct v3dv_barrier_state *state)
3317 {
3318 /* We only care about barriers between GPU jobs */
3319 if (srcStageMask == VK_PIPELINE_STAGE_2_HOST_BIT ||
3320 dstStageMask == VK_PIPELINE_STAGE_2_HOST_BIT) {
3321 return;
3322 }
3323
3324 /* Track source of the barrier */
3325 uint8_t src_mask = 0;
3326
3327 const VkPipelineStageFlags2 compute_mask =
3328 VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
3329 if (srcStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
3330 src_mask |= V3DV_BARRIER_COMPUTE_BIT;
3331
3332 const VkPipelineStageFlags2 transfer_mask =
3333 VK_PIPELINE_STAGE_2_ALL_TRANSFER_BIT |
3334 VK_PIPELINE_STAGE_2_COPY_BIT |
3335 VK_PIPELINE_STAGE_2_BLIT_BIT |
3336 VK_PIPELINE_STAGE_2_CLEAR_BIT;
3337 if (srcStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
3338 src_mask |= V3DV_BARRIER_TRANSFER_BIT;
3339
3340 const VkPipelineStageFlags2 graphics_mask = ~(compute_mask | transfer_mask);
3341 if (srcStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT))
3342 src_mask |= V3DV_BARRIER_GRAPHICS_BIT;
3343
3344 /* Track consumer of the barrier */
3345 if (dstStageMask & (compute_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3346 state->dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
3347 state->src_mask_compute |= src_mask;
3348 }
3349
3350 if (dstStageMask & (transfer_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3351 state->dst_mask |= V3DV_BARRIER_TRANSFER_BIT;
3352 state->src_mask_transfer |= src_mask;
3353 }
3354
3355 if (dstStageMask & (graphics_mask | VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3356 state->dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
3357 state->src_mask_graphics |= src_mask;
3358
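/* If the consumer stages may be involved in binning (index/attribute/indirect
 * fetch and pre-rasterization shaders), record the destination access masks
 * so a later draw can decide whether the binning stage itself must wait on
 * the barrier rather than just the render stage.
 */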
3359 if (dstStageMask & (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
3360 VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
3361 VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
3362 VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
3363 VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
3364 VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
3365 VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
3366 VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
3367 VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
3368 VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT |
3369 VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
3370 VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT)) {
3371 if (is_image_barrier)
3372 state->bcl_image_access |= dstAccessMask;
3373
3374 if (is_buffer_barrier)
3375 state->bcl_buffer_access |= dstAccessMask;
3376 }
3377 }
3378 }
3379
3380 void
3381 v3dv_cmd_buffer_emit_pipeline_barrier(struct v3dv_cmd_buffer *cmd_buffer,
3382 const VkDependencyInfo *info)
3383 {
3384 uint32_t imageBarrierCount = info->imageMemoryBarrierCount;
3385 const VkImageMemoryBarrier2 *pImageBarriers = info->pImageMemoryBarriers;
3386
3387 uint32_t bufferBarrierCount = info->bufferMemoryBarrierCount;
3388 const VkBufferMemoryBarrier2 *pBufferBarriers = info->pBufferMemoryBarriers;
3389
3390 uint32_t memoryBarrierCount = info->memoryBarrierCount;
3391 const VkMemoryBarrier2 *pMemoryBarriers = info->pMemoryBarriers;
3392
3393 struct v3dv_barrier_state state = { 0 };
3394 for (uint32_t i = 0; i < imageBarrierCount; i++) {
3395 /* We can safely skip barriers for image layout transitions from UNDEFINED
3396 * layout.
3397 *
3398 * Notice that KHR_synchronization2 allows specifying barriers that don't
3399 * involve a layout transition by making oldLayout and newLayout the same,
3400 * including UNDEFINED.
3401 */
3402 if (pImageBarriers[i].oldLayout == VK_IMAGE_LAYOUT_UNDEFINED &&
3403 pImageBarriers[i].oldLayout != pImageBarriers[i].newLayout) {
3404 continue;
3405 }
3406
3407 handle_barrier(pImageBarriers[i].srcStageMask,
3408 pImageBarriers[i].srcAccessMask,
3409 pImageBarriers[i].dstStageMask,
3410 pImageBarriers[i].dstAccessMask,
3411 true, false, &state);
3412 }
3413
3414 for (uint32_t i = 0; i < bufferBarrierCount; i++) {
3415 handle_barrier(pBufferBarriers[i].srcStageMask,
3416 pBufferBarriers[i].srcAccessMask,
3417 pBufferBarriers[i].dstStageMask,
3418 pBufferBarriers[i].dstAccessMask,
3419 false, true, &state);
3420 }
3421
3422 for (uint32_t i = 0; i < memoryBarrierCount; i++) {
3423 handle_barrier(pMemoryBarriers[i].srcStageMask,
3424 pMemoryBarriers[i].srcAccessMask,
3425 pMemoryBarriers[i].dstStageMask,
3426 pMemoryBarriers[i].dstAccessMask,
3427 true, true, &state);
3428 }
3429
3430 /* Bail if we don't have any relevant barriers */
3431 if (!state.dst_mask)
3432 return;
3433
3434 /* If we have a recording job, finish it here */
3435 if (cmd_buffer->state.job)
3436 v3dv_cmd_buffer_finish_job(cmd_buffer);
3437
3438 /* Update barrier state in the command buffer */
3439 v3dv_cmd_buffer_merge_barrier_state(&cmd_buffer->state.barrier, &state);
3440 }
3441
3442 VKAPI_ATTR void VKAPI_CALL
3443 v3dv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer,
3444 const VkDependencyInfo *pDependencyInfo)
3445 {
3446 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3447 v3dv_cmd_buffer_emit_pipeline_barrier(cmd_buffer, pDependencyInfo);
3448 }
3449
3450 VKAPI_ATTR void VKAPI_CALL
3451 v3dv_CmdBindVertexBuffers2(VkCommandBuffer commandBuffer,
3452 uint32_t firstBinding,
3453 uint32_t bindingCount,
3454 const VkBuffer *pBuffers,
3455 const VkDeviceSize *pOffsets,
3456 const VkDeviceSize *pSizes,
3457 const VkDeviceSize *pStrides)
3458 {
3459 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3460 struct v3dv_vertex_binding *vb = cmd_buffer->state.vertex_bindings;
3461
3462 assert(firstBinding + bindingCount <= MAX_VBS);
3463 bool vb_state_changed = false;
3464 if (pStrides) {
3465 vk_cmd_set_vertex_binding_strides(&cmd_buffer->vk,
3466 firstBinding, bindingCount,
3467 pStrides);
3468 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
3469 if (BITSET_TEST(dyn->dirty, MESA_VK_DYNAMIC_VI_BINDING_STRIDES))
3470 vb_state_changed = true;
3471 }
3472
3473 for (uint32_t i = 0; i < bindingCount; i++) {
3474 struct v3dv_buffer *buffer = v3dv_buffer_from_handle(pBuffers[i]);
3475 if (vb[firstBinding + i].buffer != buffer) {
3476 vb[firstBinding + i].buffer = v3dv_buffer_from_handle(pBuffers[i]);
3477 vb_state_changed = true;
3478 }
3479
3480 if (vb[firstBinding + i].offset != pOffsets[i]) {
3481 vb[firstBinding + i].offset = pOffsets[i];
3482 vb_state_changed = true;
3483 }
3484 assert(pOffsets[i] <= buffer->size);
3485
3486 VkDeviceSize size;
3487 if (!pSizes || pSizes[i] == VK_WHOLE_SIZE)
3488 size = buffer->size - pOffsets[i];
3489 else
3490 size = pSizes[i];
3491 assert(pOffsets[i] + size <= buffer->size);
3492
3493 if (vb[firstBinding + i].size != size) {
3494 vb[firstBinding + i].size = size;
3495 vb_state_changed = true;
3496 }
3497 }
3498
3499 if (vb_state_changed)
3500 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_VERTEX_BUFFER;
3501 }
3502
3503 VKAPI_ATTR void VKAPI_CALL
3504 v3dv_CmdBindIndexBuffer2KHR(VkCommandBuffer commandBuffer,
3505 VkBuffer buffer,
3506 VkDeviceSize offset,
3507 VkDeviceSize size,
3508 VkIndexType indexType)
3509 {
3510 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3511
3512 assert(buffer != VK_NULL_HANDLE);
3513
3514 if (size == VK_WHOLE_SIZE) {
3515 assert(v3dv_buffer_from_handle(buffer)->size >= offset);
3516 size = v3dv_buffer_from_handle(buffer)->size - offset;
3517 }
3518
3519 const uint32_t index_size = vk_index_type_to_bytes(indexType);
3520 if (buffer == cmd_buffer->state.index_buffer.buffer &&
3521 offset == cmd_buffer->state.index_buffer.offset &&
3522 size == cmd_buffer->state.index_buffer.size &&
3523 index_size == cmd_buffer->state.index_buffer.index_size) {
3524 return;
3525 }
3526
3527 cmd_buffer->state.index_buffer.buffer = buffer;
3528 cmd_buffer->state.index_buffer.offset = offset;
3529 cmd_buffer->state.index_buffer.size = size;
3530 cmd_buffer->state.index_buffer.index_size = index_size;
3531 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_INDEX_BUFFER;
3532 }
3533
3534 VKAPI_ATTR void VKAPI_CALL
3535 v3dv_CmdSetLineStippleEXT(VkCommandBuffer commandBuffer,
3536 uint32_t lineStippleFactor,
3537 uint16_t lineStipplePattern)
3538 {
3539 /* We do not support stippled line rasterization so we just ignore this. */
3540 }
3541
3542 /**
3543 * This checks a descriptor set to see if we are binding any descriptors that would
3544 * involve sampling from a linear image (the hardware only supports this for
3545 * 1D images), and if so, attempts to create a tiled copy of the linear image
3546 * and rewrite the descriptor set to use that instead.
3547 *
3548 * This was added to support a scenario with Android where some part of the UI
3549 * wanted to show previews of linear swapchain images. For more details:
3550 * https://gitlab.freedesktop.org/mesa/mesa/-/issues/9712
3551 *
3552 * Currently this only supports sampling from a simple linear 2D image, but
3553 * it could be extended to support more cases if necessary.
3554 */
3555 static void
3556 handle_sample_from_linear_image(struct v3dv_cmd_buffer *cmd_buffer,
3557 struct v3dv_descriptor_set *set,
3558 bool is_compute)
3559 {
3560 for (int32_t i = 0; i < set->layout->binding_count; i++) {
3561 const struct v3dv_descriptor_set_binding_layout *blayout =
3562 &set->layout->binding[i];
3563 if (blayout->type != VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE &&
3564 blayout->type != VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER)
3565 continue;
3566
3567 struct v3dv_descriptor *desc = &set->descriptors[blayout->descriptor_index];
3568 if (!desc->image_view)
3569 continue;
3570
3571 struct v3dv_image *image = (struct v3dv_image *) desc->image_view->vk.image;
3572 struct v3dv_image_view *view = (struct v3dv_image_view *) desc->image_view;
3573 if (image->tiled || view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D ||
3574 view->vk.view_type == VK_IMAGE_VIEW_TYPE_1D_ARRAY) {
3575 continue;
3576 }
3577
3578 /* FIXME: we can probably handle most of these restrictions too with
3579 * a bit of extra effort.
3580 */
3581 if (view->vk.view_type != VK_IMAGE_VIEW_TYPE_2D ||
3582 view->vk.level_count != 1 || view->vk.layer_count != 1 ||
3583 blayout->array_size != 1) {
3584 fprintf(stderr, "Sampling from linear image is not supported. "
3585 "Expect corruption.\n");
3586 continue;
3587 }
3588
3589 /* We are sampling from a linear image. V3D doesn't support this
3590 * so we create a tiled copy of the image and rewrite the descriptor
3591 * to read from it instead.
3592 */
3593 perf_debug("Sampling from linear image is not supported natively and "
3594 "requires a copy.\n");
3595
3596 struct v3dv_device *device = cmd_buffer->device;
3597 VkDevice vk_device = v3dv_device_to_handle(device);
3598
3599 /* Allocate the shadow tiled image if needed; we only do this once for
3600 * each image, on the first sampling attempt. We need to take a lock
3601 * since we may be trying to do the same in another command buffer in
3602 * a separate thread.
3603 */
3604 mtx_lock(&device->meta.mtx);
3605 VkResult result;
3606 VkImage tiled_image;
3607 if (image->shadow) {
3608 tiled_image = v3dv_image_to_handle(image->shadow);
3609 } else {
3610 VkImageCreateInfo image_info = {
3611 .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
3612 .flags = image->vk.create_flags,
3613 .imageType = image->vk.image_type,
3614 .format = image->vk.format,
3615 .extent = {
3616 image->vk.extent.width,
3617 image->vk.extent.height,
3618 image->vk.extent.depth,
3619 },
3620 .mipLevels = image->vk.mip_levels,
3621 .arrayLayers = image->vk.array_layers,
3622 .samples = image->vk.samples,
3623 .tiling = VK_IMAGE_TILING_OPTIMAL,
3624 .usage = image->vk.usage,
3625 .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
3626 .queueFamilyIndexCount = 0,
3627 .initialLayout = VK_IMAGE_LAYOUT_GENERAL,
3628 };
3629 result = v3dv_CreateImage(vk_device, &image_info,
3630 &device->vk.alloc, &tiled_image);
3631 if (result != VK_SUCCESS) {
3632 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3633 "Expect corruption.\n");
3634 mtx_unlock(&device->meta.mtx);
3635 continue;
3636 }
3637
3638 bool disjoint = image->vk.create_flags & VK_IMAGE_CREATE_DISJOINT_BIT;
3639 VkImageMemoryRequirementsInfo2 reqs_info = {
3640 .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
3641 .image = tiled_image,
3642 };
3643
3644 assert(image->plane_count <= V3DV_MAX_PLANE_COUNT);
3645 for (int p = 0; p < (disjoint ? image->plane_count : 1); p++) {
3646 VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
3647 VkImagePlaneMemoryRequirementsInfo plane_info = {
3648 .sType = VK_STRUCTURE_TYPE_IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO,
3649 .planeAspect = plane_aspect,
3650 };
3651 if (disjoint)
3652 reqs_info.pNext = &plane_info;
3653
3654 VkMemoryRequirements2 reqs = {
3655 .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
3656 };
3657 v3dv_GetImageMemoryRequirements2(vk_device, &reqs_info, &reqs);
3658
3659 VkDeviceMemory mem;
3660 VkMemoryAllocateInfo alloc_info = {
3661 .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
3662 .allocationSize = reqs.memoryRequirements.size,
3663 .memoryTypeIndex = 0,
3664 };
3665 result = v3dv_AllocateMemory(vk_device, &alloc_info,
3666 &device->vk.alloc, &mem);
3667 if (result != VK_SUCCESS) {
3668 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3669 "Expect corruption.\n");
3670 v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
3671 mtx_unlock(&device->meta.mtx);
3672 continue;
3673 }
3674
3675 VkBindImageMemoryInfo bind_info = {
3676 .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_MEMORY_INFO,
3677 .image = tiled_image,
3678 .memory = mem,
3679 .memoryOffset = 0,
3680 };
3681 VkBindImagePlaneMemoryInfo plane_bind_info = {
3682 .sType = VK_STRUCTURE_TYPE_BIND_IMAGE_PLANE_MEMORY_INFO,
3683 .planeAspect = plane_aspect,
3684 };
3685 if (disjoint)
3686 bind_info.pNext = &plane_bind_info;
3687 result = v3dv_BindImageMemory2(vk_device, 1, &bind_info);
3688 if (result != VK_SUCCESS) {
3689 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3690 "Expect corruption.\n");
3691 v3dv_DestroyImage(vk_device, tiled_image, &device->vk.alloc);
3692 v3dv_FreeMemory(vk_device, mem, &device->vk.alloc);
3693 mtx_unlock(&device->meta.mtx);
3694 continue;
3695 }
3696 }
3697
3698 image->shadow = v3dv_image_from_handle(tiled_image);
3699 }
3700
3701 /* Create a shadow view that refers to the tiled image if needed */
3702 VkImageView tiled_view;
3703 if (view->shadow) {
3704 tiled_view = v3dv_image_view_to_handle(view->shadow);
3705 } else {
3706 VkImageViewCreateInfo view_info = {
3707 .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
3708 .flags = view->vk.create_flags,
3709 .image = tiled_image,
3710 .viewType = view->vk.view_type,
3711 .format = view->vk.format,
3712 .components = view->vk.swizzle,
3713 .subresourceRange = {
3714 .aspectMask = view->vk.aspects,
3715 .baseMipLevel = view->vk.base_mip_level,
3716 .levelCount = view->vk.level_count,
3717 .baseArrayLayer = view->vk.base_array_layer,
3718 .layerCount = view->vk.layer_count,
3719 },
3720 };
3721 result = v3dv_create_image_view(device, &view_info, &tiled_view);
3722 if (result != VK_SUCCESS) {
3723 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3724 "Expect corruption.\n");
3725 mtx_unlock(&device->meta.mtx);
3726 continue;
3727 }
3728 }
3729
3730 view->shadow = v3dv_image_view_from_handle(tiled_view);
3731
3732 mtx_unlock(&device->meta.mtx);
3733
3734 /* Rewrite the descriptor to use the shadow view */
3735 VkDescriptorImageInfo desc_image_info = {
3736 .sampler = v3dv_sampler_to_handle(desc->sampler),
3737 .imageView = tiled_view,
3738 .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
3739 };
3740 VkWriteDescriptorSet write = {
3741 .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
3742 .dstSet = v3dv_descriptor_set_to_handle(set),
3743 .dstBinding = i,
3744 .dstArrayElement = 0, /* Assumes array_size is 1 */
3745 .descriptorCount = 1,
3746 .descriptorType = desc->type,
3747 .pImageInfo = &desc_image_info,
3748 };
3749 v3dv_UpdateDescriptorSets(vk_device, 1, &write, 0, NULL);
3750
3751 /* Now we need to actually copy the pixel data from the linear image
3752 * into the tiled image storage to ensure it is up-to-date.
3753 *
3754 * FIXME: ideally we would track if the linear image is dirty and skip
3755 * this step otherwise, but that would be a bit of a pain.
3756 *
3757 * Note that we need to place the copy job *before* the current job in
3758 * the command buffer state so we have the tiled image ready to process
3759 * an upcoming draw call in the current job that samples from it.
3760 *
3761 * Also, we need to use the TFU path for this copy, as any other path
3762 * will use the tile buffer and would require a new framebuffer setup,
3763 * thus requiring extra work to stop and resume any in-flight render
3764 * pass. Since we are converting a full 2D texture here the TFU should
3765 * be able to handle this.
3766 */
3767 for (int p = 0; p < image->plane_count; p++) {
3768 VkImageAspectFlagBits plane_aspect = VK_IMAGE_ASPECT_PLANE_0_BIT << p;
3769 struct VkImageCopy2 copy_region = {
3770 .sType = VK_STRUCTURE_TYPE_IMAGE_COPY_2,
3771 .srcSubresource = {
3772 .aspectMask = image->plane_count == 1 ?
3773 view->vk.aspects : (view->vk.aspects & plane_aspect),
3774 .mipLevel = view->vk.base_mip_level,
3775 .baseArrayLayer = view->vk.base_array_layer,
3776 .layerCount = view->vk.layer_count,
3777 },
3778 .srcOffset = {0, 0, 0 },
3779 .dstSubresource = {
3780 .aspectMask = image->plane_count == 1 ?
3781 view->vk.aspects : (view->vk.aspects & plane_aspect),
3782 .mipLevel = view->vk.base_mip_level,
3783 .baseArrayLayer = view->vk.base_array_layer,
3784 .layerCount = view->vk.layer_count,
3785 },
3786 .dstOffset = { 0, 0, 0},
3787 .extent = {
3788 image->planes[p].width,
3789 image->planes[p].height,
3790 1,
3791 },
3792 };
3793 struct v3dv_image *copy_src = image;
3794 struct v3dv_image *copy_dst = v3dv_image_from_handle(tiled_image);
3795 bool ok = v3dv_cmd_buffer_copy_image_tfu(cmd_buffer, copy_dst, copy_src,
3796 &copy_region);
3797 if (ok) {
3798 /* This will emit the TFU job right before the current in-flight
3799 * job (if any), since in-flight jobs are only added to the list
3800 * when finished.
3801 */
3802 struct v3dv_job *tfu_job =
3803 list_last_entry(&cmd_buffer->jobs, struct v3dv_job, list_link);
3804 assert(tfu_job->type == V3DV_JOB_TYPE_GPU_TFU);
3805 /* Serialize the copy since we don't know who is producing the linear
3806 * image and we need the image to be ready by the time the copy
3807 * executes.
3808 */
3809 tfu_job->serialize = V3DV_BARRIER_ALL;
3810
3811 /* Also, we need to ensure the TFU copy job completes before anything
3812 * else coming after that may be using the tiled shadow copy.
3813 */
3814 if (cmd_buffer->state.job) {
3815 /* If we already had an in-flight job (i.e. we are in a render
3816 * pass) make sure the job waits for the TFU copy.
3817 */
3818 cmd_buffer->state.job->serialize |= V3DV_BARRIER_TRANSFER_BIT;
3819 } else {
3820 /* Otherwise, make sure the follow-up job syncs with the TFU
3821 * job we just added when it is created, by adding the
3822 * corresponding barrier state.
3823 */
3824 if (!is_compute) {
3825 cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_GRAPHICS_BIT;
3826 cmd_buffer->state.barrier.src_mask_graphics |= V3DV_BARRIER_TRANSFER_BIT;
3827 } else {
3828 cmd_buffer->state.barrier.dst_mask |= V3DV_BARRIER_COMPUTE_BIT;
3829 cmd_buffer->state.barrier.src_mask_compute |= V3DV_BARRIER_TRANSFER_BIT;
3830 }
3831 }
3832 } else {
3833 fprintf(stderr, "Failed to copy linear 2D image for sampling. "
3834 "TFU doesn't support copy. Expect corruption.\n");
3835 }
3836 }
3837 }
3838 }
3839
3840 VKAPI_ATTR void VKAPI_CALL
3841 v3dv_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
3842 VkPipelineBindPoint pipelineBindPoint,
3843 VkPipelineLayout _layout,
3844 uint32_t firstSet,
3845 uint32_t descriptorSetCount,
3846 const VkDescriptorSet *pDescriptorSets,
3847 uint32_t dynamicOffsetCount,
3848 const uint32_t *pDynamicOffsets)
3849 {
3850 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3851 V3DV_FROM_HANDLE(v3dv_pipeline_layout, layout, _layout);
3852
3853 uint32_t dyn_index = 0;
3854
3855 assert(firstSet + descriptorSetCount <= MAX_SETS);
3856
3857 struct v3dv_descriptor_state *descriptor_state =
3858 pipelineBindPoint == VK_PIPELINE_BIND_POINT_COMPUTE ?
3859 &cmd_buffer->state.compute.descriptor_state :
3860 &cmd_buffer->state.gfx.descriptor_state;
3861
3862 VkShaderStageFlags dirty_stages = 0;
3863 bool descriptor_state_changed = false;
3864 for (uint32_t i = 0; i < descriptorSetCount; i++) {
3865 V3DV_FROM_HANDLE(v3dv_descriptor_set, set, pDescriptorSets[i]);
3866 uint32_t index = firstSet + i;
3867
3868 descriptor_state->valid |= (1u << index);
3869 if (descriptor_state->descriptor_sets[index] != set) {
3870 descriptor_state->descriptor_sets[index] = set;
3871 dirty_stages |= set->layout->shader_stages;
3872 descriptor_state_changed = true;
3873
3874 /* Check if we are sampling from a linear 2D image. This is not
3875 * supported in hardware, but may be required for some applications
3876 * so we will transparently convert to tiled at the expense of
3877 * performance.
3878 */
3879 handle_sample_from_linear_image(cmd_buffer, set,
3880 pipelineBindPoint ==
3881 VK_PIPELINE_BIND_POINT_COMPUTE);
3882 }
3883
3884 for (uint32_t j = 0; j < set->layout->dynamic_offset_count; j++, dyn_index++) {
3885 uint32_t idx = j + layout->set[i + firstSet].dynamic_offset_start;
3886
3887 if (descriptor_state->dynamic_offsets[idx] != pDynamicOffsets[dyn_index]) {
3888 descriptor_state->dynamic_offsets[idx] = pDynamicOffsets[dyn_index];
3889 dirty_stages |= set->layout->shader_stages;
3890 descriptor_state_changed = true;
3891 }
3892 }
3893 }
3894
3895 if (descriptor_state_changed) {
3896 if (pipelineBindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) {
3897 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_DESCRIPTOR_SETS;
3898 cmd_buffer->state.dirty_descriptor_stages |= dirty_stages & VK_SHADER_STAGE_ALL_GRAPHICS;
3899 } else {
3900 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS;
3901 cmd_buffer->state.dirty_descriptor_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
3902 }
3903 }
3904 }
3905
3906 VKAPI_ATTR void VKAPI_CALL
3907 v3dv_CmdPushConstants(VkCommandBuffer commandBuffer,
3908 VkPipelineLayout layout,
3909 VkShaderStageFlags stageFlags,
3910 uint32_t offset,
3911 uint32_t size,
3912 const void *pValues)
3913 {
3914 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
3915
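/* Skip the update if the new values match what we have already recorded so
 * we don't dirty push constant state unnecessarily.
 */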
3916 if (!memcmp((uint8_t *) cmd_buffer->state.push_constants_data + offset,
3917 pValues, size)) {
3918 return;
3919 }
3920
3921 memcpy((uint8_t *) cmd_buffer->state.push_constants_data + offset,
3922 pValues, size);
3923 cmd_buffer->state.push_constants_size =
3924 MAX2(offset + size, cmd_buffer->state.push_constants_size);
3925
3926 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_PUSH_CONSTANTS |
3927 V3DV_CMD_DIRTY_PUSH_CONSTANTS_UBO;
3928 cmd_buffer->state.dirty_push_constants_stages |= stageFlags;
3929 }
3930
3931 void
3932 v3dv_cmd_buffer_ensure_array_state(struct v3dv_cmd_buffer *cmd_buffer,
3933 uint32_t slot_size,
3934 uint32_t used_count,
3935 uint32_t *alloc_count,
3936 void **ptr)
3937 {
3938 if (used_count >= *alloc_count) {
3939 const uint32_t prev_slot_count = *alloc_count;
3940 void *old_buffer = *ptr;
3941
3942 const uint32_t new_slot_count = MAX2(*alloc_count * 2, 4);
3943 const uint32_t bytes = new_slot_count * slot_size;
3944 *ptr = vk_alloc(&cmd_buffer->device->vk.alloc, bytes, 8,
3945 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
3946 if (*ptr == NULL) {
3947 fprintf(stderr, "Error: failed to allocate CPU buffer for query.\n");
3948 v3dv_flag_oom(cmd_buffer, NULL);
3949 return;
3950 }
3951
3952 if (old_buffer)
3953 memcpy(*ptr, old_buffer, prev_slot_count * slot_size);
3954 *alloc_count = new_slot_count;
3955 }
3956 assert(used_count < *alloc_count);
3957 }
3958
3959 void
3960 v3dv_cmd_buffer_begin_query(struct v3dv_cmd_buffer *cmd_buffer,
3961 struct v3dv_query_pool *pool,
3962 uint32_t query,
3963 VkQueryControlFlags flags)
3964 {
3965 assert(query < pool->query_count);
3966 switch (pool->query_type) {
3967 case VK_QUERY_TYPE_OCCLUSION:
3968 /* FIXME: we only support one active occlusion query for now */
3969 assert(cmd_buffer->state.query.active_query.bo == NULL);
3970
3971 cmd_buffer->state.query.active_query.bo = pool->occlusion.bo;
3972 cmd_buffer->state.query.active_query.offset =
3973 pool->queries[query].occlusion.offset;
3974 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
3975 break;
3976 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
3977 assert(cmd_buffer->state.query.active_query.perf == NULL);
3978 if (cmd_buffer->state.pass)
3979 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
3980
3981 cmd_buffer->state.query.active_query.perf =
3982 &pool->queries[query].perf;
3983
3984 if (cmd_buffer->state.pass) {
3985 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
3986 cmd_buffer->state.subpass_idx);
3987 }
3988 break;
3989 }
3990 default:
3991 unreachable("Unsupported query type");
3992 }
3993 }
3994
3995 void
3996 v3dv_cmd_buffer_pause_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
3997 {
3998 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
3999 struct v3dv_bo *occlusion_query_bo = state->query.active_query.bo;
4000 if (occlusion_query_bo) {
4001 assert(!state->query.active_query.paused_bo);
4002 state->query.active_query.paused_bo = occlusion_query_bo;
4003 state->query.active_query.bo = NULL;
4004 state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
4005 }
4006 }
4007
4008 void
4009 v3dv_cmd_buffer_resume_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer)
4010 {
4011 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
4012 struct v3dv_bo *occlusion_query_bo = state->query.active_query.paused_bo;
4013 if (occlusion_query_bo) {
4014 assert(!state->query.active_query.bo);
4015 state->query.active_query.bo = occlusion_query_bo;
4016 state->query.active_query.paused_bo = NULL;
4017 state->dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
4018 }
4019 }
4020
4021 static void
4022 v3dv_cmd_buffer_schedule_end_query(struct v3dv_cmd_buffer *cmd_buffer,
4023 struct v3dv_query_pool *pool,
4024 uint32_t query)
4025 {
4026 assert(query < pool->query_count);
4027 assert(pool->query_type == VK_QUERY_TYPE_OCCLUSION ||
4028 pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
4029
4030 /* For occlusion queries in the middle of a render pass we don't want to
4031 * split the current job at the EndQuery just to emit query availability,
4032 * instead we queue this state in the command buffer and we emit it when
4033 * we finish the current job.
4034 */
4035 if (cmd_buffer->state.pass &&
4036 pool->query_type == VK_QUERY_TYPE_OCCLUSION) {
4037 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
4038 v3dv_cmd_buffer_ensure_array_state(cmd_buffer,
4039 sizeof(struct v3dv_end_query_info),
4040 state->query.end.used_count,
4041 &state->query.end.alloc_count,
4042 (void **) &state->query.end.states);
4043 v3dv_return_if_oom(cmd_buffer, NULL);
4044
4045 struct v3dv_end_query_info *info =
4046 &state->query.end.states[state->query.end.used_count++];
4047
4048 info->pool = pool;
4049 info->query = query;
4050
4051 /* From the Vulkan spec:
4052 *
4053 * "If queries are used while executing a render pass instance that has
4054 * multiview enabled, the query uses N consecutive query indices in
4055 * the query pool (starting at query) where N is the number of bits set
4056 * in the view mask in the subpass the query is used in. How the
4057 * numerical results of the query are distributed among the queries is
4058 * implementation-dependent."
4059 *
4060 * In our case, only the first query is used but this means we still need
4061 * to flag the other queries as available so we don't emit errors when
4062 * the applications attempt to retrieve values from them.
4063 */
4064 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
4065 if (!pass->multiview_enabled) {
4066 info->count = 1;
4067 } else {
4068 struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
4069 info->count = util_bitcount(subpass->view_mask);
4070 }
4071 } else {
4072 /* Otherwise, schedule the end query job immediately.
4073 *
4074 * Multiview queries cannot cross subpass boundaries, so query count is
4075 * always 1.
4076 */
4077 if (pool->query_type == VK_QUERY_TYPE_OCCLUSION)
4078 v3dv_cmd_buffer_emit_set_query_availability(cmd_buffer, pool, query, 1, 1);
4079 else
4080 cmd_buffer_emit_end_query_cpu(cmd_buffer, pool, query, 1);
4081 }
4082 }
4083
4084 static void
4085 v3dv_cmd_buffer_end_occlusion_query(struct v3dv_cmd_buffer *cmd_buffer,
4086 struct v3dv_query_pool *pool,
4087 uint32_t query)
4088 {
4089 assert(query < pool->query_count);
4090 assert(cmd_buffer->state.query.active_query.bo != NULL);
4091
4092 v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
4093
4094 cmd_buffer->state.query.active_query.bo = NULL;
4095 cmd_buffer->state.dirty |= V3DV_CMD_DIRTY_OCCLUSION_QUERY;
4096 }
4097
4098 static void
4099 v3dv_cmd_buffer_end_performance_query(struct v3dv_cmd_buffer *cmd_buffer,
4100 struct v3dv_query_pool *pool,
4101 uint32_t query)
4102 {
4103 assert(query < pool->query_count);
4104 assert(cmd_buffer->state.query.active_query.perf != NULL);
4105
4106 if (cmd_buffer->state.pass)
4107 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
4108
4109 v3dv_cmd_buffer_schedule_end_query(cmd_buffer, pool, query);
4110
4111 cmd_buffer->state.query.active_query.perf = NULL;
4112
4113 if (cmd_buffer->state.pass)
4114 v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
4115 }
4116
4117 void v3dv_cmd_buffer_end_query(struct v3dv_cmd_buffer *cmd_buffer,
4118 struct v3dv_query_pool *pool,
4119 uint32_t query)
4120 {
4121 switch (pool->query_type) {
4122 case VK_QUERY_TYPE_OCCLUSION:
4123 v3dv_cmd_buffer_end_occlusion_query(cmd_buffer, pool, query);
4124 break;
4125 case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
4126 v3dv_cmd_buffer_end_performance_query(cmd_buffer, pool, query);
4127 break;
4128 default:
4129 unreachable("Unsupported query type");
4130 }
4131 }
4132
4133 void
4134 v3dv_cmd_buffer_add_tfu_job(struct v3dv_cmd_buffer *cmd_buffer,
4135 struct drm_v3d_submit_tfu *tfu)
4136 {
4137 struct v3dv_device *device = cmd_buffer->device;
4138 struct v3dv_job *job = vk_zalloc(&device->vk.alloc,
4139 sizeof(struct v3dv_job), 8,
4140 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4141 if (!job) {
4142 v3dv_flag_oom(cmd_buffer, NULL);
4143 return;
4144 }
4145
4146 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_TFU, device, cmd_buffer, -1);
4147 job->tfu = *tfu;
4148 list_addtail(&job->list_link, &cmd_buffer->jobs);
4149 }
4150
4151 VKAPI_ATTR void VKAPI_CALL
4152 v3dv_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
4153 VkPipelineStageFlags2 stage,
4154 VkQueryPool queryPool,
4155 uint32_t query)
4156 {
4157 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4158 V3DV_FROM_HANDLE(v3dv_query_pool, query_pool, queryPool);
4159
4160 /* If this is called inside a render pass we need to finish the current
4161 * job here...
4162 */
4163 struct v3dv_render_pass *pass = cmd_buffer->state.pass;
4164 if (pass)
4165 v3dv_cmd_buffer_finish_job(cmd_buffer);
4166
4167 struct v3dv_job *job =
4168 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4169 V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY,
4170 cmd_buffer, -1);
4171 v3dv_return_if_oom(cmd_buffer, NULL);
4172
4173 job->cpu.query_timestamp.pool = query_pool;
4174 job->cpu.query_timestamp.query = query;
4175
4176 if (!pass || !pass->multiview_enabled) {
4177 job->cpu.query_timestamp.count = 1;
4178 } else {
4179 struct v3dv_subpass *subpass =
4180 &pass->subpasses[cmd_buffer->state.subpass_idx];
4181 job->cpu.query_timestamp.count = util_bitcount(subpass->view_mask);
4182 }
4183
4184 list_addtail(&job->list_link, &cmd_buffer->jobs);
4185 cmd_buffer->state.job = NULL;
4186
4187 /* ...and resume the subpass after the timestamp */
4188 if (cmd_buffer->state.pass)
4189 v3dv_cmd_buffer_subpass_resume(cmd_buffer, cmd_buffer->state.subpass_idx);
4190 }
4191
4192 static void
4193 cmd_buffer_emit_pre_dispatch(struct v3dv_cmd_buffer *cmd_buffer)
4194 {
4195 assert(cmd_buffer->state.compute.pipeline);
4196 assert(cmd_buffer->state.compute.pipeline->active_stages ==
4197 VK_SHADER_STAGE_COMPUTE_BIT);
4198
4199 cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_COMPUTE_PIPELINE |
4200 V3DV_CMD_DIRTY_COMPUTE_DESCRIPTOR_SETS);
4201 cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4202 cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_COMPUTE_BIT;
4203 }
4204
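/* Invoked when the indirect dispatch CPU job runs, to patch the prerecorded
 * CSD job with the workgroup counts read from the indirect buffer: the submit
 * config registers are rewritten and, if the shader reads the dispatch
 * dimensions, so are the uniforms that were recorded with placeholder values.
 */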
4205 void
4206 v3dv_cmd_buffer_rewrite_indirect_csd_job(
4207 struct v3dv_device *device,
4208 struct v3dv_csd_indirect_cpu_job_info *info,
4209 const uint32_t *wg_counts)
4210 {
4211 assert(info->csd_job);
4212 struct v3dv_job *job = info->csd_job;
4213
4214 assert(job->type == V3DV_JOB_TYPE_GPU_CSD);
4215 assert(wg_counts[0] > 0 && wg_counts[1] > 0 && wg_counts[2] > 0);
4216
4217 struct drm_v3d_submit_csd *submit = &job->csd.submit;
4218
4219 job->csd.wg_count[0] = wg_counts[0];
4220 job->csd.wg_count[1] = wg_counts[1];
4221 job->csd.wg_count[2] = wg_counts[2];
4222
4223 submit->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4224 submit->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4225 submit->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4226
4227 uint32_t num_batches = DIV_ROUND_UP(info->wg_size, 16) *
4228 (wg_counts[0] * wg_counts[1] * wg_counts[2]);
4229 /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
4230 if (device->devinfo.ver < 71 ||
4231 (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
4232 submit->cfg[4] = num_batches - 1;
4233 } else {
4234 submit->cfg[4] = num_batches;
4235 }
4236 assert(submit->cfg[4] != ~0);
4237
4238 if (info->needs_wg_uniform_rewrite) {
4239 /* Make sure the GPU is not currently accessing the indirect CL for this
4240 * job, since we are about to overwrite some of the uniform data.
4241 */
4242 v3dv_bo_wait(job->device, job->indirect.bo, OS_TIMEOUT_INFINITE);
4243
4244 for (uint32_t i = 0; i < 3; i++) {
4245 if (info->wg_uniform_offsets[i]) {
4246 /* Sanity check that our uniform pointers are within the allocated
4247 * BO space for our indirect CL.
4248 */
4249 assert(info->wg_uniform_offsets[i] >= (uint32_t *) job->indirect.base);
4250 assert(info->wg_uniform_offsets[i] < (uint32_t *) job->indirect.next);
4251 *(info->wg_uniform_offsets[i]) = wg_counts[i];
4252 }
4253 }
4254 }
4255 }
4256
4257 static struct v3dv_job *
4258 cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
4259 uint32_t base_offset_x,
4260 uint32_t base_offset_y,
4261 uint32_t base_offset_z,
4262 uint32_t group_count_x,
4263 uint32_t group_count_y,
4264 uint32_t group_count_z,
4265 uint32_t **wg_uniform_offsets_out,
4266 uint32_t *wg_size_out)
4267 {
4268 struct v3dv_device *device = cmd_buffer->device;
4269 struct v3dv_pipeline *pipeline = cmd_buffer->state.compute.pipeline;
4270 assert(pipeline && pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE]);
4271 struct v3dv_shader_variant *cs_variant =
4272 pipeline->shared_data->variants[BROADCOM_SHADER_COMPUTE];
4273
4274 struct v3dv_job *job = vk_zalloc(&cmd_buffer->device->vk.alloc,
4275 sizeof(struct v3dv_job), 8,
4276 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4277 if (!job) {
4278 v3dv_flag_oom(cmd_buffer, NULL);
4279 return NULL;
4280 }
4281
4282 v3dv_job_init(job, V3DV_JOB_TYPE_GPU_CSD, cmd_buffer->device, cmd_buffer, -1);
4283 cmd_buffer->state.job = job;
4284
4285 struct drm_v3d_submit_csd *submit = &job->csd.submit;
4286
4287 job->csd.wg_count[0] = group_count_x;
4288 job->csd.wg_count[1] = group_count_y;
4289 job->csd.wg_count[2] = group_count_z;
4290
4291 job->csd.wg_base[0] = base_offset_x;
4292 job->csd.wg_base[1] = base_offset_y;
4293 job->csd.wg_base[2] = base_offset_z;
4294
4295 submit->cfg[0] |= group_count_x << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4296 submit->cfg[1] |= group_count_y << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4297 submit->cfg[2] |= group_count_z << V3D_CSD_CFG012_WG_COUNT_SHIFT;
4298
4299 const struct v3d_compute_prog_data *cpd =
4300 cs_variant->prog_data.cs;
4301
4302 const uint32_t num_wgs = group_count_x * group_count_y * group_count_z;
4303 const uint32_t wg_size = cpd->local_size[0] *
4304 cpd->local_size[1] *
4305 cpd->local_size[2];
4306
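/* The CSD unit packs workgroups into supergroups and issues them in batches
 * of 16 invocations, so choose a workgroups-per-supergroup count and derive
 * the batches per supergroup and the total batch count, rounding up for the
 * trailing workgroups that don't fill a whole supergroup.
 */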
4307 uint32_t wgs_per_sg =
4308 v3d_csd_choose_workgroups_per_supergroup(
4309 &cmd_buffer->device->devinfo,
4310 cs_variant->prog_data.cs->has_subgroups,
4311 cs_variant->prog_data.cs->base.has_control_barrier,
4312 cs_variant->prog_data.cs->base.threads,
4313 num_wgs, wg_size);
4314
4315 uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
4316 uint32_t whole_sgs = num_wgs / wgs_per_sg;
4317 uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;
4318 uint32_t num_batches = batches_per_sg * whole_sgs +
4319 DIV_ROUND_UP(rem_wgs * wg_size, 16);
4320
4321 submit->cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
4322 submit->cfg[3] |= (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT;
4323 submit->cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
4324 if (wg_size_out)
4325 *wg_size_out = wg_size;
4326
4327 /* V3D 7.1.6 and later don't subtract 1 from the number of batches */
4328 if (device->devinfo.ver < 71 ||
4329 (device->devinfo.ver == 71 && device->devinfo.rev < 6)) {
4330 submit->cfg[4] = num_batches - 1;
4331 } else {
4332 submit->cfg[4] = num_batches;
4333 }
4334 assert(submit->cfg[4] != ~0);
4335
4336 assert(pipeline->shared_data->assembly_bo);
4337 struct v3dv_bo *cs_assembly_bo = pipeline->shared_data->assembly_bo;
4338
4339 submit->cfg[5] = cs_assembly_bo->offset + cs_variant->assembly_offset;
4340 if (cs_variant->prog_data.base->single_seg)
4341 submit->cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
4342 if (cs_variant->prog_data.base->threads == 4)
4343 submit->cfg[5] |= V3D_CSD_CFG5_THREADING;
4344 /* V3D 7.x has made the PROPAGATE_NANS bit in CFG5 reserved */
4345 if (device->devinfo.ver < 71)
4346 submit->cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
4347
4348 if (cs_variant->prog_data.cs->shared_size > 0) {
4349 job->csd.shared_memory =
4350 v3dv_bo_alloc(cmd_buffer->device,
4351 cs_variant->prog_data.cs->shared_size * num_wgs,
4352 "shared_vars", true);
4353 if (!job->csd.shared_memory) {
4354 v3dv_flag_oom(cmd_buffer, NULL);
4355 return job;
4356 }
4357 }
4358
4359 v3dv_job_add_bo_unchecked(job, cs_assembly_bo);
4360 struct v3dv_cl_reloc uniforms =
4361 v3dv_write_uniforms_wg_offsets(cmd_buffer, pipeline,
4362 cs_variant,
4363 wg_uniform_offsets_out);
4364 submit->cfg[6] = uniforms.bo->offset + uniforms.offset;
4365
4366
4367 /* Track VK_KHR_buffer_device_address usage in the job */
4368 job->uses_buffer_device_address |= pipeline->uses_buffer_device_address;
4369
4370 v3dv_job_add_bo(job, uniforms.bo);
4371
4372 return job;
4373 }
4374
4375 static void
4376 cmd_buffer_dispatch(struct v3dv_cmd_buffer *cmd_buffer,
4377 uint32_t base_offset_x,
4378 uint32_t base_offset_y,
4379 uint32_t base_offset_z,
4380 uint32_t group_count_x,
4381 uint32_t group_count_y,
4382 uint32_t group_count_z)
4383 {
4384 if (group_count_x == 0 || group_count_y == 0 || group_count_z == 0)
4385 return;
4386
4387 struct v3dv_job *job =
4388 cmd_buffer_create_csd_job(cmd_buffer,
4389 base_offset_x,
4390 base_offset_y,
4391 base_offset_z,
4392 group_count_x,
4393 group_count_y,
4394 group_count_z,
4395 NULL, NULL);
4396
4397 list_addtail(&job->list_link, &cmd_buffer->jobs);
4398 cmd_buffer->state.job = NULL;
4399 }
4400
4401 VKAPI_ATTR void VKAPI_CALL
4402 v3dv_CmdDispatchBase(VkCommandBuffer commandBuffer,
4403 uint32_t baseGroupX,
4404 uint32_t baseGroupY,
4405 uint32_t baseGroupZ,
4406 uint32_t groupCountX,
4407 uint32_t groupCountY,
4408 uint32_t groupCountZ)
4409 {
4410 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4411
4412 cmd_buffer_emit_pre_dispatch(cmd_buffer);
4413 cmd_buffer_dispatch(cmd_buffer,
4414 baseGroupX, baseGroupY, baseGroupZ,
4415 groupCountX, groupCountY, groupCountZ);
4416 }
4417
4418
4419 static void
4420 cmd_buffer_dispatch_indirect(struct v3dv_cmd_buffer *cmd_buffer,
4421 struct v3dv_buffer *buffer,
4422 uint32_t offset)
4423 {
4424 /* We can't do indirect dispatches, so instead we record a CPU job that,
4425 * when executed in the queue, will map the indirect buffer, read the
4426 * dispatch parameters, and submit a regular dispatch.
4427 */
4428 struct v3dv_job *job =
4429 v3dv_cmd_buffer_create_cpu_job(cmd_buffer->device,
4430 V3DV_JOB_TYPE_CPU_CSD_INDIRECT,
4431 cmd_buffer, -1);
4432 v3dv_return_if_oom(cmd_buffer, NULL);
4433
4434 /* We need to create a CSD job now, even if we still don't know the actual
4435 * dispatch parameters, because the job setup needs to be done using the
4436 * current command buffer state (i.e. pipeline, descriptor sets, push
4437 * constants, etc.). So we create the job with default dispatch parameters
4438 * and we will rewrite the parts we need at submit time if the indirect
4439  * parameters don't match the ones we used to set up the job.
4440 */
4441 struct v3dv_job *csd_job =
4442 cmd_buffer_create_csd_job(cmd_buffer,
4443 0, 0, 0,
4444 1, 1, 1,
4445 &job->cpu.csd_indirect.wg_uniform_offsets[0],
4446 &job->cpu.csd_indirect.wg_size);
4447 v3dv_return_if_oom(cmd_buffer, NULL);
4448 assert(csd_job);
4449
4450 job->cpu.csd_indirect.buffer = buffer;
4451 job->cpu.csd_indirect.offset = offset;
4452 job->cpu.csd_indirect.csd_job = csd_job;
4453
4454 /* If the compute shader reads the workgroup sizes we will also need to
4455 * rewrite the corresponding uniforms.
4456 */
4457 job->cpu.csd_indirect.needs_wg_uniform_rewrite =
4458 job->cpu.csd_indirect.wg_uniform_offsets[0] ||
4459 job->cpu.csd_indirect.wg_uniform_offsets[1] ||
4460 job->cpu.csd_indirect.wg_uniform_offsets[2];
4461
4462 list_addtail(&job->list_link, &cmd_buffer->jobs);
4463
4464 /* If we have a CPU queue we submit the CPU job directly to the
4465 * queue and the CSD job will be dispatched from within the kernel
4466  * queue; otherwise, we will have to dispatch the CSD job manually
4467 * right after the CPU job by adding it to the list of jobs in the
4468 * command buffer.
4469 */
4470 if (!cmd_buffer->device->pdevice->caps.cpu_queue)
4471 list_addtail(&csd_job->list_link, &cmd_buffer->jobs);
4472
4473 cmd_buffer->state.job = NULL;
4474 }
4475
4476 VKAPI_ATTR void VKAPI_CALL
4477 v3dv_CmdDispatchIndirect(VkCommandBuffer commandBuffer,
4478 VkBuffer _buffer,
4479 VkDeviceSize offset)
4480 {
4481 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4482 V3DV_FROM_HANDLE(v3dv_buffer, buffer, _buffer);
4483
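   /* The indirect dispatch CPU job stores the buffer offset as a 32-bit
    * value, so larger offsets are not supported.
    */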
4484 assert(offset <= UINT32_MAX);
4485
4486 cmd_buffer_emit_pre_dispatch(cmd_buffer);
4487 cmd_buffer_dispatch_indirect(cmd_buffer, buffer, offset);
4488 }
4489
4490 VKAPI_ATTR void VKAPI_CALL
4491 v3dv_CmdBeginRenderingKHR(VkCommandBuffer commandBuffer,
4492 const VkRenderingInfoKHR *info)
4493 {
4494 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4495
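   /* Record whether this pass suspends or resumes a render pass chain; these
    * flags drive how v3dv_CmdEndRenderingKHR finishes and links the jobs.
    */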
4496 cmd_buffer->state.suspending = info->flags & VK_RENDERING_SUSPENDING_BIT;
4497 cmd_buffer->state.resuming = info->flags & VK_RENDERING_RESUMING_BIT;
4498
4499    /* FIXME: for resuming passes we might not need all of the setup below,
4500     * since we are mostly just recording draw calls, as in secondary command buffers.
4501     */
4502
4503 v3dv_setup_dynamic_render_pass(cmd_buffer, info);
4504 v3dv_return_if_oom(cmd_buffer, NULL);
4505
4506 v3dv_setup_dynamic_framebuffer(cmd_buffer, info);
4507 v3dv_return_if_oom(cmd_buffer, NULL);
4508
4509 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
4510 state->pass = &state->dynamic_pass;
4511 state->framebuffer = state->dynamic_framebuffer;
4512
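   /* Dynamic rendering is implemented on top of the driver's render pass
    * code: translate the VkRenderingInfo into a VkRenderPassBeginInfo for
    * the dynamic pass and framebuffer set up above.
    */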
4513 VkRenderPassBeginInfo begin_info = {
4514 .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
4515 .pNext = NULL,
4516 .renderPass = v3dv_render_pass_to_handle(state->pass),
4517 .framebuffer = v3dv_framebuffer_to_handle(state->framebuffer),
4518 .renderArea = info->renderArea,
4519 };
4520
4521 VkClearValue *clear_values = NULL;
4522 if (state->pass->attachment_count > 0) {
4523 clear_values =
4524 vk_alloc(&cmd_buffer->device->vk.alloc,
4525 state->pass->attachment_count * sizeof(VkClearValue), 8,
4526 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
4527 if (!clear_values) {
4528 v3dv_flag_oom(cmd_buffer, NULL);
4529 return;
4530 }
4531 }
4532
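   /* Gather the clear values for the color, depth and stencil attachments,
    * indexed by their attachment slot in the dynamic render pass.
    */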
4533 for (int i = 0; i < info->colorAttachmentCount; i++) {
4534 if (!info->pColorAttachments[i].imageView)
4535 continue;
4536
4537 uint32_t a = cmd_buffer->state.dynamic_subpass.color_attachments[i].attachment;
4538 assert(a < state->pass->attachment_count);
4539 clear_values[a] = info->pColorAttachments[i].clearValue;
4540 }
4541
4542 if (info->pDepthAttachment &&
4543 info->pDepthAttachment->imageView != VK_NULL_HANDLE) {
4544 uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
4545 assert(a < state->pass->attachment_count);
4546 clear_values[a].depthStencil.depth =
4547 info->pDepthAttachment->clearValue.depthStencil.depth;
4548 }
4549
4550 if (info->pStencilAttachment &&
4551 info->pStencilAttachment->imageView != VK_NULL_HANDLE) {
4552 uint32_t a = cmd_buffer->state.dynamic_subpass.ds_attachment.attachment;
4553 assert(a < state->pass->attachment_count);
4554 clear_values[a].depthStencil.stencil =
4555 info->pStencilAttachment->clearValue.depthStencil.stencil;
4556 }
4557
4558 begin_info.clearValueCount = state->pass->attachment_count;
4559 begin_info.pClearValues = clear_values;
4560
4561 cmd_buffer_ensure_render_pass_attachment_state(cmd_buffer);
4562 v3dv_return_if_oom(cmd_buffer, NULL);
4563 cmd_buffer_init_render_pass_attachment_state(cmd_buffer, &begin_info);
4564
4565 if (clear_values)
4566       vk_free(&cmd_buffer->device->vk.alloc, clear_values);
4567
4568 state->render_area = info->renderArea;
4569 constraint_clip_window_to_render_area(cmd_buffer);
4570 v3dv_cmd_buffer_subpass_start(cmd_buffer, 0);
4571 }
4572
4573 VKAPI_ATTR void VKAPI_CALL
4574 v3dv_CmdEndRenderingKHR(VkCommandBuffer commandBuffer)
4575 {
4576 V3DV_FROM_HANDLE(v3dv_cmd_buffer, cmd_buffer, commandBuffer);
4577
4578 v3dv_return_if_oom(cmd_buffer, NULL);
4579
4580 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
4581 assert(state->subpass_idx == state->pass->subpass_count - 1);
4582
4583 /* If we have any pending jobs that were waiting for the current job
4584 * to finish and we are suspending the pass here, we need to finish the
4585 * job completely and ensure we emit the pending jobs immediately.
4586 *
4587 * FIXME: this is not optimal but since the resuming command buffer won't
4588 * have the pending state we can't do it after the resuming chain completes
4589 * without some extra work: we would have to generate the pending jobs
4590  * now but not add them to this command buffer's job list; instead, they
4591 * should be added to a separate list of "pending jobs" and at submit time
4592 * we would accumulate these jobs during the suspend/resume chain and emit
4593 * them all after the last job in the chain.
4594 */
4595 if (state->suspending && cmd_buffer_has_pending_jobs(cmd_buffer))
4596 v3dv_cmd_buffer_finish_job(cmd_buffer);
4597
4598 /* If we don't have a job and we are suspending we will need to create one
4599  * so we can link to a follow-up resume job. Because we would be starting a new
4600 * job, we should ensure the command buffer state is not flagged as resuming
4601 * from a previous suspend. The new job will consume any pending barrier
4602 * state if necessary.
4603 */
4604 struct v3dv_job *job = cmd_buffer->state.job;
4605 if (!job && state->suspending) {
4606 state->resuming = false;
4607 job = v3dv_cmd_buffer_subpass_resume(cmd_buffer, state->subpass_idx);
4608 if (!job)
4609 return;
4610 }
4611
4612 /* If this job is suspending it means it will continue execution in another
4613 * job (with the same RCL spec). We implement this by branching the BCL and
4614 * we will patch the branch address when we know the resuming job.
4615 */
4616 if (state->suspending)
4617 v3dv_X(cmd_buffer->device, cmd_buffer_suspend)(cmd_buffer);
4618
4619 v3dv_cmd_buffer_subpass_finish(cmd_buffer);
4620 v3dv_cmd_buffer_finish_job(cmd_buffer);
4621
4622 /* This must be done after the resume/suspend chain completed. */
4623 if (!state->suspending)
4624 cmd_buffer_subpass_handle_pending_resolves(cmd_buffer);
4625
4626 state->framebuffer = NULL;
4627 state->pass = NULL;
4628 state->subpass_idx = -1;
4629 state->suspending = false;
4630 state->resuming = false;
4631 }
4632