/*
 * Copyright © 2019 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "drm-uapi/v3d_drm.h"

#include "broadcom/clif/clif_dump.h"
#include "util/libsync.h"
#include "util/os_time.h"
#include "vk_drm_syncobj.h"

#include <errno.h>
#include <time.h>

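/* Dumps the CL contents of a job to stderr in CLIF format when any of the
 * V3D_DEBUG cl/clif flags are enabled. Every BO in the job is mapped so the
 * dumper can read its contents.
 */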
static void
v3dv_clif_dump(struct v3dv_device *device,
               struct v3dv_job *job,
               struct drm_v3d_submit_cl *submit)
{
   if (!(V3D_DBG(CL) ||
         V3D_DBG(CL_NO_BIN) ||
         V3D_DBG(CLIF)))
      return;

   struct clif_dump *clif = clif_dump_init(&device->devinfo,
                                           stderr,
                                           V3D_DBG(CL) ||
                                           V3D_DBG(CL_NO_BIN),
                                           V3D_DBG(CL_NO_BIN));

   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (void *)entry->key;
      char *name = ralloc_asprintf(NULL, "%s_0x%x",
                                   bo->name, bo->offset);

      bool ok = v3dv_bo_map(device, bo, bo->size);
      if (!ok) {
         fprintf(stderr, "failed to map BO for clif_dump.\n");
         ralloc_free(name);
         goto free_clif;
      }
      clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);

      ralloc_free(name);
   }

   clif_dump(clif, submit);

free_clif:
   clif_dump_destroy(clif);
}

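/* Waits on the last job submitted to each of the per-queue syncobjs and, if
 * no job has been submitted to any queue in this batch yet, also waits on
 * the batch's wait semaphores directly.
 */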
static VkResult
queue_wait_idle(struct v3dv_queue *queue,
                struct v3dv_submit_sync_info *sync_info)
{
   int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
                            queue->last_job_syncs.syncs, 4,
                            INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
                            NULL);
   if (ret)
      return vk_errorf(queue, VK_ERROR_DEVICE_LOST, "syncobj wait failed: %m");

   bool first = true;
   for (int i = 0; i < 4; i++) {
      if (!queue->last_job_syncs.first[i])
         first = false;
   }

   /* If we're not the first job, that means we're waiting on some
    * per-queue-type syncobj which transitively waited on the semaphores,
    * so we can skip the semaphore wait.
    */
   if (first) {
      VkResult result = vk_sync_wait_many(&queue->device->vk,
                                          sync_info->wait_count,
                                          sync_info->waits,
                                          VK_SYNC_WAIT_COMPLETE,
                                          UINT64_MAX);
      if (result != VK_SUCCESS)
         return result;
   }

   for (int i = 0; i < 4; i++)
      queue->last_job_syncs.first[i] = false;

   return VK_SUCCESS;
}

static void
multisync_free(struct v3dv_device *device,
               struct drm_v3d_multi_sync *ms)
{
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
   vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
}

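/* Builds the array of input syncobjs a kernel job must wait on: the submit's
 * wait semaphores (only for the first job sent to a given GPU queue in the
 * batch), any extra waits provided by the caller, and the last-job syncobjs
 * of the queues this job needs to serialize against.
 */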
static struct drm_v3d_sem *
set_in_syncs(struct v3dv_queue *queue,
             struct v3dv_job *job,
             enum v3dv_queue_type queue_sync,
             uint32_t *count,
             struct vk_sync_wait *waits,
             unsigned wait_count,
             struct v3dv_submit_sync_info *sync_info)
{
   struct v3dv_device *device = queue->device;
   uint32_t n_syncs = 0;

   /* If this is the first job submitted to a given GPU queue in this cmd buf
    * batch, it has to wait on wait semaphores (if any) before running.
    */
   if (queue->last_job_syncs.first[queue_sync])
      n_syncs = sync_info->wait_count;

   /* If the serialize flag is set, the job needs to be serialized against the
    * corresponding queues. Note that we may implement transfer operations as
    * either CL or TFU jobs.
    *
    * FIXME: maybe we could track more precisely if the source of a transfer
    * barrier is a CL and/or a TFU job.
    */
   bool sync_csd = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
   bool sync_tfu = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
   bool sync_cl = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
                                    V3DV_BARRIER_TRANSFER_BIT);
   bool sync_cpu = job->serialize & V3DV_BARRIER_CPU_BIT;

   *count = n_syncs;
   if (sync_cl)
      (*count)++;
   if (sync_tfu)
      (*count)++;
   if (sync_csd)
      (*count)++;
   if (sync_cpu)
      (*count)++;

   *count += wait_count;

   if (!*count)
      return NULL;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   for (int i = 0; i < n_syncs; i++) {
      syncs[i].handle =
         vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
   }

   for (int i = 0; i < wait_count; i++) {
      syncs[n_syncs++].handle =
         vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
   }

   if (sync_cl)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];

   if (sync_csd)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];

   if (sync_tfu)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];

   if (sync_cpu)
      syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU];

   assert(n_syncs == *count);
   return syncs;
}

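/* Builds the array of syncobjs a kernel job will signal on completion: the
 * submit's signal semaphores (when signal_syncs is set) plus the per-queue
 * last-job syncobj we always signal to track the last job submitted to each
 * queue type.
 */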
static struct drm_v3d_sem *
set_out_syncs(struct v3dv_queue *queue,
              struct v3dv_job *job,
              enum v3dv_queue_type queue_sync,
              uint32_t *count,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0;

   /* We always signal the syncobj from `queue->last_job_syncs` related to
    * this v3dv_queue_type to track the last job submitted to this queue.
    */
   (*count) = n_vk_syncs + 1;

   struct drm_v3d_sem *syncs =
      vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
                8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);

   if (!syncs)
      return NULL;

   if (n_vk_syncs) {
      for (unsigned i = 0; i < n_vk_syncs; i++) {
         syncs[i].handle =
            vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
      }
   }

   syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync];

   return syncs;
}

static void
set_ext(struct drm_v3d_extension *ext,
        struct drm_v3d_extension *next,
        uint32_t id,
        uintptr_t flags)
{
   ext->next = (uintptr_t)(void *)next;
   ext->id = id;
   ext->flags = flags;
}

/* This function sets the extension for multiple in/out syncobjs. When it is
 * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
 * Otherwise, the extension id is 0, which means an out-of-memory error.
 */
static void
set_multisync(struct drm_v3d_multi_sync *ms,
              struct v3dv_submit_sync_info *sync_info,
              struct vk_sync_wait *waits,
              unsigned wait_count,
              struct drm_v3d_extension *next,
              struct v3dv_device *device,
              struct v3dv_job *job,
              enum v3dv_queue_type in_queue_sync,
              enum v3dv_queue_type out_queue_sync,
              enum v3d_queue wait_stage,
              bool signal_syncs)
{
   struct v3dv_queue *queue = &device->queue;
   uint32_t out_sync_count = 0, in_sync_count = 0;
   struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;

   in_syncs = set_in_syncs(queue, job, in_queue_sync,
                           &in_sync_count, waits, wait_count, sync_info);
   if (!in_syncs && in_sync_count)
      goto fail;

   out_syncs = set_out_syncs(queue, job, out_queue_sync,
                             &out_sync_count, sync_info, signal_syncs);

   assert(out_sync_count > 0);

   if (!out_syncs)
      goto fail;

   set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
   ms->wait_stage = wait_stage;
   ms->out_sync_count = out_sync_count;
   ms->out_syncs = (uintptr_t)(void *)out_syncs;
   ms->in_sync_count = in_sync_count;
   ms->in_syncs = (uintptr_t)(void *)in_syncs;

   return;

fail:
   if (in_syncs)
      vk_free(&device->vk.alloc, in_syncs);
   assert(!out_syncs);

   return;
}

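/* Resets the queries in a pool. If the kernel supports the CPU queue, this is
 * submitted as a CPU job with the corresponding reset extension; otherwise we
 * wait until the GPU is done using the affected queries and reset them from
 * user-space.
 */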
static VkResult
handle_reset_query_cpu_job(struct v3dv_queue *queue,
                           struct v3dv_job *job,
                           struct v3dv_submit_sync_info *sync_info,
                           bool signal_syncs)
{
   struct v3dv_device *device = queue->device;
   struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
   assert(info->pool);

   assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);

   if (device->pdevice->caps.cpu_queue) {
      assert(info->first + info->count <= info->pool->query_count);

      struct drm_v3d_submit_cpu submit = {0};
      struct drm_v3d_multi_sync ms = {0};

      uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
      uintptr_t *kperfmon_ids = NULL;

      if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
         submit.bo_handle_count = 1;
         submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;

         struct drm_v3d_reset_timestamp_query reset = {0};

         set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);

         reset.count = info->count;
         reset.offset = info->pool->queries[info->first].timestamp.offset;

         for (uint32_t i = 0; i < info->count; i++) {
            struct v3dv_query *query = &info->pool->queries[info->first + i];
            syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
         }

         reset.syncs = (uintptr_t)(void *)syncs;

         set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id)
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      } else {
         assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
         struct drm_v3d_reset_performance_query reset = {0};

         set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);

         struct vk_sync_wait waits[info->count];
         unsigned wait_count = 0;
         for (int i = 0; i < info->count; i++) {
            struct v3dv_query *query = &info->pool->queries[info->first + i];
            /* Only wait for a query if we've used it, otherwise we will be
             * waiting forever for the fence to become signaled.
             */
            if (query->maybe_available) {
               waits[wait_count] = (struct vk_sync_wait){
                  .sync = query->perf.last_job_sync
               };
               wait_count++;
            }
         }

         reset.count = info->count;
         reset.nperfmons = info->pool->perfmon.nperfmons;

         kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);

         for (uint32_t i = 0; i < info->count; i++) {
            struct v3dv_query *query = &info->pool->queries[info->first + i];

            syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
            kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
         }

         reset.syncs = (uintptr_t)(void *)syncs;
         reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;

         set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id)
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;

      /* From the Vulkan spec for vkCmdResetQueryPool:
       *
       *    "This command defines an execution dependency between other query
       *     commands that reference the same query.
       *     ...
       *     The second synchronization scope includes all commands which
       *     reference the queries in queryPool indicated by firstQuery and
       *     queryCount that occur later in submission order."
       *
       * This means we should ensure that any timestamps after a reset don't
       * execute before the reset. However, for timestamp queries in particular
       * we don't have to do anything special, because timestamp queries have
       * to wait for all previously submitted work to complete before executing
       * (which we accomplish by using V3DV_BARRIER_ALL on them) and that
       * includes reset jobs submitted to the CPU queue.
       */
      int ret = v3dv_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

      free(syncs);
      free(kperfmon_ids);
      multisync_free(device, &ms);

      queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

      if (ret)
         return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

      return VK_SUCCESS;
   }

   /* We are about to reset query counters in user-space so we need to make
    * sure that the GPU is not using them.
    */
   if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
      VkResult result = queue_wait_idle(queue, sync_info);
      if (result != VK_SUCCESS)
         return result;

      v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
   }

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      struct vk_sync_wait waits[info->count];
      unsigned wait_count = 0;
      for (int i = 0; i < info->count; i++) {
         struct v3dv_query *query = &info->pool->queries[info->first + i];
         /* Only wait for a query if we've used it, otherwise we will be
          * waiting forever for the fence to become signaled.
          */
         if (query->maybe_available) {
            waits[wait_count] = (struct vk_sync_wait){
               .sync = query->perf.last_job_sync
            };
            wait_count++;
         }
      }

      VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
                                          VK_SYNC_WAIT_COMPLETE, UINT64_MAX);

      if (result != VK_SUCCESS)
         return result;
   }

   v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);

   return VK_SUCCESS;
}

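/* Exports the last-job syncobjs of the CL and CSD queues as sync files and
 * accumulates them into a single fd, which can then be imported into the
 * per-query syncobjs of a performance query.
 */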
static VkResult
export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
{
   int err;
   static const enum v3dv_queue_type queues_to_sync[] = {
      V3DV_QUEUE_CL,
      V3DV_QUEUE_CSD,
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
      enum v3dv_queue_type queue_type = queues_to_sync[i];
      int tmp_fd = -1;

      err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
                                     queue->last_job_syncs.syncs[queue_type],
                                     &tmp_fd);

      if (err) {
         close(*fd);
         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                          "sync file export failed: %m");
      }

      err = sync_accumulate("v3dv", fd, tmp_fd);

      if (err) {
         close(tmp_fd);
         close(*fd);
         return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
                          "failed to accumulate sync files: %m");
      }
   }

   return VK_SUCCESS;
}

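/* For performance queries, imports the combined CL/CSD last-job sync file
 * into each query's syncobj and flags the queries as potentially available,
 * so later waits and result copies know which jobs to wait for.
 */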
static VkResult
handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
{
   VkResult result = VK_SUCCESS;

   mtx_lock(&job->device->query_mutex);

   struct v3dv_end_query_info *info = &job->cpu.query_end;
   struct v3dv_queue *queue = &job->device->queue;

   int err = 0;
   int fd = -1;

   assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      result = export_perfmon_last_job_sync(queue, job, &fd);

      if (result != VK_SUCCESS)
         goto fail;

      assert(fd >= 0);
   }

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];

      if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
         uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
         err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
                                        syncobj, fd);

         if (err) {
            result = vk_errorf(queue, VK_ERROR_UNKNOWN,
                               "sync file import failed: %m");
            goto fail;
         }
      }

      query->maybe_available = true;
   }

fail:
   if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
      close(fd);

   cnd_broadcast(&job->device->query_ended);
   mtx_unlock(&job->device->query_mutex);

   return result;
}

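/* Copies query results into the destination buffer. If the kernel supports
 * the CPU queue, the copy is submitted as a CPU job with the corresponding
 * copy extension; otherwise the results are gathered in user-space with
 * v3dv_get_query_pool_results_cpu.
 */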
static VkResult
handle_copy_query_results_cpu_job(struct v3dv_queue *queue,
                                  struct v3dv_job *job,
                                  struct v3dv_submit_sync_info *sync_info,
                                  bool signal_syncs)
{
   struct v3dv_device *device = queue->device;
   struct v3dv_copy_query_results_cpu_job_info *info =
      &job->cpu.query_copy_results;

   assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
          info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);

   assert(info->dst && info->dst->mem && info->dst->mem->bo);
   struct v3dv_bo *bo = info->dst->mem->bo;

   if (device->pdevice->caps.cpu_queue) {
      struct drm_v3d_submit_cpu submit = {0};
      struct drm_v3d_multi_sync ms = {0};

      uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
      uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
      uint32_t *bo_handles = NULL;
      uintptr_t *kperfmon_ids = NULL;

      if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
         submit.bo_handle_count = 2;

         bo_handles = (uint32_t *)
            malloc(sizeof(uint32_t) * submit.bo_handle_count);

         bo_handles[0] = bo->handle;
         bo_handles[1] = info->pool->timestamp.bo->handle;
         submit.bo_handles = (uintptr_t)(void *)bo_handles;

         struct drm_v3d_copy_timestamp_query copy = {0};

         set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0);

         copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT;
         copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
         copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
         copy.offset = info->offset + info->dst->mem_offset;
         copy.stride = info->stride;
         copy.count = info->count;

         for (uint32_t i = 0; i < info->count; i++) {
            assert(info->first < info->pool->query_count);
            assert(info->first + info->count <= info->pool->query_count);
            struct v3dv_query *query = &info->pool->queries[info->first + i];

            offsets[i] = query->timestamp.offset;
            syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
         }

         copy.offsets = (uintptr_t)(void *)offsets;
         copy.syncs = (uintptr_t)(void *)syncs;

         set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id)
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      } else {
         assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);

         submit.bo_handle_count = 1;
         submit.bo_handles = (uintptr_t)(void *)&bo->handle;

         struct drm_v3d_copy_performance_query copy = {0};

         set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0);

         /* If the queryPool was created with VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
          * results for each query are written as an array of the type indicated
          * by VkPerformanceCounterKHR::storage for the counter being queried.
          * For v3dv, VkPerformanceCounterKHR::storage is
          * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR.
          */
         copy.do_64bit = true;
         copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
         copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
         copy.offset = info->offset + info->dst->mem_offset;
         copy.stride = info->stride;
         copy.count = info->count;
         copy.nperfmons = info->pool->perfmon.nperfmons;
         copy.ncounters = info->pool->perfmon.ncounters;

         kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);

         struct vk_sync_wait waits[info->count];
         unsigned wait_count = 0;

         for (uint32_t i = 0; i < info->count; i++) {
            assert(info->first < info->pool->query_count);
            assert(info->first + info->count <= info->pool->query_count);
            struct v3dv_query *query = &info->pool->queries[info->first + i];

            syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
            kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;

            if (info->flags & VK_QUERY_RESULT_WAIT_BIT) {
               waits[wait_count] = (struct vk_sync_wait){
                  .sync = query->perf.last_job_sync
               };
               wait_count++;
            }
         }

         copy.syncs = (uintptr_t)(void *)syncs;
         copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;

         set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, device, job,
                       V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
         if (!ms.base.id)
            return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
      }

      submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
      submit.extensions = (uintptr_t)(void *)&ms;

      int ret = v3dv_ioctl(device->pdevice->render_fd,
                           DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

      free(kperfmon_ids);
      free(bo_handles);
      free(offsets);
      free(syncs);
      multisync_free(device, &ms);

      queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

      if (ret)
         return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

      return VK_SUCCESS;
   }

   /* Map the entire dst buffer for the CPU copy if needed */
   assert(!bo->map || bo->map_size == bo->size);
   if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
      return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);

   uint8_t *offset = ((uint8_t *) bo->map) +
                     info->offset + info->dst->mem_offset;
   v3dv_get_query_pool_results_cpu(job->device,
                                   info->pool,
                                   info->first,
                                   info->count,
                                   offset,
                                   info->stride,
                                   info->flags);

   return VK_SUCCESS;
}

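/* Writes timestamp query values into the query pool BO. Without kernel CPU
 * queue support we wait for the queue to go idle and write the timestamp
 * from user-space; with it, we submit a CPU job with the timestamp query
 * extension so the kernel writes it after all serialized work completes.
 */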
static VkResult
handle_timestamp_query_cpu_job(struct v3dv_queue *queue,
                               struct v3dv_job *job,
                               struct v3dv_submit_sync_info *sync_info,
                               bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
   struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;

   if (!device->pdevice->caps.cpu_queue) {
      /* Wait for completion of all work queued before the timestamp query */
      VkResult result = queue_wait_idle(queue, sync_info);
      if (result != VK_SUCCESS)
         return result;

      mtx_lock(&job->device->query_mutex);

      /* Compute timestamp */
      struct timespec t;
      clock_gettime(CLOCK_MONOTONIC, &t);

      for (uint32_t i = 0; i < info->count; i++) {
         assert(info->query + i < info->pool->query_count);
         struct v3dv_query *query = &info->pool->queries[info->query + i];
         query->maybe_available = true;

         /* Value */
         uint8_t *value_addr =
            ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset;
         *((uint64_t *)value_addr) = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull;

         /* Availability */
         result = vk_sync_signal(&job->device->vk, query->timestamp.sync, 0);
      }

      cnd_broadcast(&job->device->query_ended);
      mtx_unlock(&job->device->query_mutex);

      return result;
   }

   struct drm_v3d_submit_cpu submit = {0};

   submit.bo_handle_count = 1;
   submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;

   struct drm_v3d_timestamp_query timestamp = {0};

   set_ext(&timestamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0);

   timestamp.count = info->count;

   uint32_t *offsets =
      (uint32_t *) malloc(sizeof(uint32_t) * info->count);
   uint32_t *syncs =
      (uint32_t *) malloc(sizeof(uint32_t) * info->count);

   for (uint32_t i = 0; i < info->count; i++) {
      assert(info->query + i < info->pool->query_count);
      struct v3dv_query *query = &info->pool->queries[info->query + i];
      query->maybe_available = true;

      offsets[i] = query->timestamp.offset;
      syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
   }

   timestamp.offsets = (uintptr_t)(void *)offsets;
   timestamp.syncs = (uintptr_t)(void *)syncs;

   struct drm_v3d_multi_sync ms = {0};

   /* The CPU job should be serialized so it only executes after all
    * previously submitted work has completed.
    */
   job->serialize = V3DV_BARRIER_ALL;

   set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, device, job,
                 V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit.extensions = (uintptr_t)(void *)&ms;

   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

   free(offsets);
   free(syncs);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

   return VK_SUCCESS;
}

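/* Handles an indirect compute dispatch: reads the workgroup counts from the
 * indirect buffer and patches the associated CSD job before it runs. With
 * kernel CPU queue support this is done by the kernel via the indirect CSD
 * extension; otherwise we wait for the buffer and patch it in user-space.
 */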
static VkResult
handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
                            struct v3dv_job *job,
                            struct v3dv_submit_sync_info *sync_info,
                            bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
   struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
   assert(info->csd_job);

   assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
   struct v3dv_bo *bo = info->buffer->mem->bo;

   if (!device->pdevice->caps.cpu_queue) {
      /* Make sure the GPU is no longer using the indirect buffer */
      v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);

      /* Map the indirect buffer and read the dispatch parameters */
      if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
         return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
      assert(bo->map);

      const uint32_t offset = info->buffer->mem_offset + info->offset;
      const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
      if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
         return VK_SUCCESS;

      if (memcmp(group_counts, info->csd_job->csd.wg_count,
                 sizeof(info->csd_job->csd.wg_count)) != 0) {
         v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
      }

      return VK_SUCCESS;
   }

   struct v3dv_job *csd_job = info->csd_job;

   struct drm_v3d_submit_cpu submit = {0};

   submit.bo_handle_count = 1;
   submit.bo_handles = (uintptr_t)(void *)&bo->handle;

   csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
   uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * csd_job->bo_count);
   uint32_t bo_idx = 0;
   set_foreach (csd_job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;

   struct drm_v3d_indirect_csd indirect = {0};

   set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);

   indirect.submit = csd_job->csd.submit;
   indirect.offset = info->buffer->mem_offset + info->offset;
   indirect.wg_size = info->wg_size;

   for (int i = 0; i < 3; i++) {
      if (info->wg_uniform_offsets[i]) {
         assert(info->wg_uniform_offsets[i] >= (uint32_t *) csd_job->indirect.base);
         indirect.wg_uniform_offsets[i] = info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
      } else {
         indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
      }
   }

   indirect.indirect = csd_job->indirect.bo->handle;

   struct drm_v3d_multi_sync ms = {0};

   /* We need to configure the semaphores of this job with the indirect
    * CSD job, as the CPU job must obey the CSD job's synchronization
    * demands, such as barriers.
    */
   set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
                 V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit.extensions = (uintptr_t)(void *)&ms;

   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CPU, &submit);

   free(bo_handles);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
   queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");

   return VK_SUCCESS;
}

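/* Submits a binning/rendering (CL) job to the kernel, deciding whether the
 * binner or only the renderer needs to wait on previous work, and wiring up
 * the multisync extension for semaphores and cross-queue serialization.
 */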
static VkResult
handle_cl_job(struct v3dv_queue *queue,
              struct v3dv_job *job,
              uint32_t counter_pass_idx,
              struct v3dv_submit_sync_info *sync_info,
              bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_cl submit = { 0 };

   /* Sanity check: we should only flag a bcl sync on a job that needs to be
    * serialized.
    */
   assert(job->serialize || !job->needs_bcl_sync);

   /* We expect to have just one RCL per job, which should fit in just one BO.
    * Our BCL, however, could chain multiple BOs together.
    */
   assert(list_length(&job->rcl.bo_list) == 1);
   assert(list_length(&job->bcl.bo_list) >= 1);
   struct v3dv_bo *bcl_first_bo =
      list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
   submit.bcl_start = bcl_first_bo->offset;
   submit.bcl_end = job->suspending ? job->suspended_bcl_end :
                    job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
   submit.rcl_start = job->rcl.bo->offset;
   submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);

   submit.qma = job->tile_alloc->offset;
   submit.qms = job->tile_alloc->size;
   submit.qts = job->tile_state->offset;

   submit.flags = 0;
   if (job->tmu_dirty_rcl)
      submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit.bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit.bo_handle_count);
   submit.bo_handles = (uintptr_t)(void *)bo_handles;

   submit.perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
   queue->last_perfmon_id = submit.perfmon_id;

   /* We need a binning sync if we are the first CL job waiting on a semaphore
    * with a wait stage that involves the geometry pipeline, or if the job
    * comes after a pipeline barrier that involves geometry stages
    * (needs_bcl_sync), or when performance queries are in use.
    *
    * We need a render sync if the job doesn't need a binning sync but has
    * still been flagged for serialization. It should be noted that RCL jobs
    * don't start until the previous RCL job has finished, so we don't really
    * need to add a fence for those; however, we might need to wait on a CSD
    * or TFU job, which are not automatically serialized with CL jobs.
    */
   bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
   if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
      for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
         needs_bcl_sync = sync_info->waits[i].stage_mask &
            (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
             VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
             VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
             VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
             VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
             VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
             VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
             VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
             VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
             VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT);
      }
   }

   bool needs_rcl_sync = job->serialize && !needs_bcl_sync;

   /* Replace the legacy single-semaphore interface with the multisync
    * extension, which our kernel driver supports.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
   set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                 V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit.extensions = (uintptr_t)(void *)&ms;

   /* We are using multisync so disable the legacy single-sync interface */
   submit.in_sync_rcl = 0;
   submit.in_sync_bcl = 0;
   submit.out_sync = 0;

   v3dv_clif_dump(device, job, &submit);
   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CL, &submit);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);
   multisync_free(device, &ms);

   queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");

   return VK_SUCCESS;
}

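/* Submits a TFU job to the kernel, attaching the multisync extension to
 * handle its semaphores and serialization.
 */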
static VkResult
handle_tfu_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   assert(!V3D_DBG(DISABLE_TFU));

   struct v3dv_device *device = queue->device;

   /* Replace the legacy single-semaphore interface with the multisync
    * extension, which our kernel driver supports.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                 V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
   job->tfu.extensions = (uintptr_t)(void *)&ms;

   /* We are using multisync so disable the legacy single-sync interface */
   job->tfu.in_sync = 0;
   job->tfu.out_sync = 0;

   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;

   if (ret != 0)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");

   return VK_SUCCESS;
}

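/* Submits a compute (CSD) job to the kernel, including any BOs required for
 * buffer device address support and the multisync extension for semaphores.
 */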
static VkResult
handle_csd_job(struct v3dv_queue *queue,
               struct v3dv_job *job,
               uint32_t counter_pass_idx,
               struct v3dv_submit_sync_info *sync_info,
               bool signal_syncs)
{
   struct v3dv_device *device = queue->device;

   struct drm_v3d_submit_csd *submit = &job->csd.submit;

   /* If the job uses VK_KHR_buffer_device_address we need to ensure all
    * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
    * are included.
    */
   if (job->uses_buffer_device_address) {
      util_dynarray_foreach(&queue->device->device_address_bo_list,
                            struct v3dv_bo *, bo) {
         v3dv_job_add_bo(job, *bo);
      }
   }

   submit->bo_handle_count = job->bo_count;
   uint32_t *bo_handles =
      (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
   uint32_t bo_idx = 0;
   set_foreach(job->bos, entry) {
      struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
      bo_handles[bo_idx++] = bo->handle;
   }
   assert(bo_idx == submit->bo_handle_count);
   submit->bo_handles = (uintptr_t)(void *)bo_handles;

   /* Replace the legacy single-semaphore interface with the multisync
    * extension, which our kernel driver supports.
    */
   struct drm_v3d_multi_sync ms = { 0 };
   set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
                 V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
   if (!ms.base.id)
      return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
   submit->extensions = (uintptr_t)(void *)&ms;

   /* We are using multisync so disable the legacy single-sync interface */
   submit->in_sync = 0;
   submit->out_sync = 0;

   submit->perfmon_id = job->perf ?
      job->perf->kperfmon_ids[counter_pass_idx] : 0;
   queue->last_perfmon_id = submit->perfmon_id;

   int ret = v3dv_ioctl(device->pdevice->render_fd,
                        DRM_IOCTL_V3D_SUBMIT_CSD, submit);

   static bool warned = false;
   if (ret && !warned) {
      fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
              strerror(errno));
      warned = true;
   }

   free(bo_handles);

   multisync_free(device, &ms);
   queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;

   if (ret)
      return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");

   return VK_SUCCESS;
}

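/* Dispatches a job to the handler that matches its type (GPU CL/TFU/CSD or
 * one of the CPU job types).
 */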
static VkResult
queue_handle_job(struct v3dv_queue *queue,
                 struct v3dv_job *job,
                 uint32_t counter_pass_idx,
                 struct v3dv_submit_sync_info *sync_info,
                 bool signal_syncs)
{
   switch (job->type) {
   case V3DV_JOB_TYPE_GPU_CL:
      return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_TFU:
      return handle_tfu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_GPU_CSD:
      return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
      return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_END_QUERY:
      return handle_end_query_cpu_job(job, counter_pass_idx);
   case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
      return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
      return handle_csd_indirect_cpu_job(queue, job, sync_info, signal_syncs);
   case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
      return handle_timestamp_query_cpu_job(queue, job, sync_info, signal_syncs);
   default:
      unreachable("Unhandled job type");
   }
}

static VkResult
queue_create_noop_job(struct v3dv_queue *queue)
{
   struct v3dv_device *device = queue->device;
   queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
                               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!queue->noop_job)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
   v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);

   v3dv_X(device, job_emit_noop)(queue->noop_job);

   /* We use no-op jobs to signal semaphores/fences. These jobs need to be
    * serialized across all hw queues to comply with Vulkan's signal operation
    * order requirements, which basically require that signal operations occur
    * in submission order.
    */
   queue->noop_job->serialize = V3DV_BARRIER_ALL;

   return VK_SUCCESS;
}

static VkResult
queue_submit_noop_job(struct v3dv_queue *queue,
                      uint32_t counter_pass_idx,
                      struct v3dv_submit_sync_info *sync_info,
                      bool signal_syncs)
{
   if (!queue->noop_job) {
      VkResult result = queue_create_noop_job(queue);
      if (result != VK_SUCCESS)
         return result;
   }

   assert(queue->noop_job);
   return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
                           sync_info, signal_syncs);
}

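/* Entry point for queue submissions from the common Vulkan runtime. Walks all
 * jobs in the submitted command buffers, chaining suspending/resuming jobs,
 * consuming any trailing command buffer barrier, and, when there are
 * semaphores to signal, submitting a final no-op job serialized across all
 * queues.
 */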
VkResult
v3dv_queue_driver_submit(struct vk_queue *vk_queue,
                         struct vk_queue_submit *submit)
{
   struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
   VkResult result;

   struct v3dv_submit_sync_info sync_info = {
      .wait_count = submit->wait_count,
      .waits = submit->waits,
      .signal_count = submit->signal_count,
      .signals = submit->signals,
   };

   for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
      queue->last_job_syncs.first[i] = true;

   struct v3dv_job *first_suspend_job = NULL;
   struct v3dv_job *current_suspend_job = NULL;
   for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
      struct v3dv_cmd_buffer *cmd_buffer =
         container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
      list_for_each_entry_safe(struct v3dv_job, job,
                               &cmd_buffer->jobs, list_link) {
         if (job->suspending) {
            job = v3dv_X(job->device,
                         cmd_buffer_prepare_suspend_job_for_submit)(job);
            if (!job)
               return VK_ERROR_OUT_OF_DEVICE_MEMORY;
         }

         if (job->suspending && !job->resuming) {
            assert(!first_suspend_job);
            assert(!current_suspend_job);
            first_suspend_job = job;
         }

         if (job->resuming) {
            assert(first_suspend_job);
            assert(current_suspend_job);
            v3dv_X(job->device, job_patch_resume_address)(first_suspend_job,
                                                          current_suspend_job,
                                                          job);
            current_suspend_job = NULL;
         }

         if (job->suspending) {
            current_suspend_job = job;
         } else {
            assert(!current_suspend_job);
            struct v3dv_job *submit_job = first_suspend_job ?
                                          first_suspend_job : job;
            result =
               queue_handle_job(queue, submit_job, submit->perf_pass_index,
                                &sync_info, false);

            if (result != VK_SUCCESS)
               return result;

            first_suspend_job = NULL;
         }
      }

      /* If the command buffer ends with a barrier we need to consume it now.
       *
       * FIXME: this will drain all hw queues. Instead, we could use the
       * pending barrier state to limit the queues we serialize against.
       */
      if (cmd_buffer->state.barrier.dst_mask) {
         result = queue_submit_noop_job(queue, submit->perf_pass_index,
                                        &sync_info, false);
         if (result != VK_SUCCESS)
            return result;
      }
   }

   assert(!first_suspend_job);
   assert(!current_suspend_job);

   /* Handle signaling now */
   if (submit->signal_count > 0) {
      /* Finish by submitting a no-op job that synchronizes across all queues.
       * This will ensure that the signal semaphores don't get triggered until
       * all work on any queue completes. See Vulkan's signal operation order
       * requirements.
       */
      return queue_submit_noop_job(queue, submit->perf_pass_index,
                                   &sync_info, true);
   }

   return VK_SUCCESS;
}

VKAPI_ATTR VkResult VKAPI_CALL
v3dv_QueueBindSparse(VkQueue _queue,
                     uint32_t bindInfoCount,
                     const VkBindSparseInfo *pBindInfo,
                     VkFence fence)
{
   V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
   return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
}