xref: /aosp_15_r20/external/mesa3d/src/broadcom/vulkan/v3dv_queue.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2019 Raspberry Pi Ltd
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #include "v3dv_private.h"
25 #include "drm-uapi/v3d_drm.h"
26 
27 #include "broadcom/clif/clif_dump.h"
28 #include "util/libsync.h"
29 #include "util/os_time.h"
30 #include "vk_drm_syncobj.h"
31 
32 #include <errno.h>
33 #include <time.h>
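
/* This file implements the driver side of queue submission for v3dv: each
 * v3dv_job recorded in a command buffer is translated into the matching V3D
 * kernel submit ioctl (CL, TFU, CSD or CPU job), and cross-job ordering is
 * expressed through the DRM_V3D_EXT_ID_MULTI_SYNC extension together with one
 * per-queue-type "last job" syncobj kept in queue->last_job_syncs.
 */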
34 
35 static void
36 v3dv_clif_dump(struct v3dv_device *device,
37                struct v3dv_job *job,
38                struct drm_v3d_submit_cl *submit)
39 {
40    if (!(V3D_DBG(CL) ||
41          V3D_DBG(CL_NO_BIN) ||
42          V3D_DBG(CLIF)))
43       return;
44 
45    struct clif_dump *clif = clif_dump_init(&device->devinfo,
46                                            stderr,
47                                            V3D_DBG(CL) ||
48                                            V3D_DBG(CL_NO_BIN),
49                                            V3D_DBG(CL_NO_BIN));
50 
51    set_foreach(job->bos, entry) {
52       struct v3dv_bo *bo = (void *)entry->key;
53       char *name = ralloc_asprintf(NULL, "%s_0x%x",
54                                    bo->name, bo->offset);
55 
56       bool ok = v3dv_bo_map(device, bo, bo->size);
57       if (!ok) {
58          fprintf(stderr, "failed to map BO for clif_dump.\n");
59          ralloc_free(name);
60          goto free_clif;
61       }
62       clif_dump_add_bo(clif, name, bo->offset, bo->size, bo->map);
63 
64       ralloc_free(name);
65    }
66 
67    clif_dump(clif, submit);
68 
69  free_clif:
70    clif_dump_destroy(clif);
71 }
72 
73 static VkResult
74 queue_wait_idle(struct v3dv_queue *queue,
75                 struct v3dv_submit_sync_info *sync_info)
76 {
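   /* Wait on the four per-queue-type syncobjs (CL, CSD, TFU and CPU) in
    * queue->last_job_syncs; the hard-coded 4 is expected to match
    * V3DV_QUEUE_COUNT.
    */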
77    int ret = drmSyncobjWait(queue->device->pdevice->render_fd,
78                             queue->last_job_syncs.syncs, 4,
79                             INT64_MAX, DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL,
80                             NULL);
81    if (ret)
82       return vk_errorf(queue, VK_ERROR_DEVICE_LOST, "syncobj wait failed: %m");
83 
84    bool first = true;
85    for (int i = 0; i < 4; i++) {
86       if (!queue->last_job_syncs.first[i])
87          first = false;
88    }
89 
90    /* If we're not the first job, that means we're waiting on some
91     * per-queue-type syncobj which has transitively waited on the semaphores,
92     * so we can skip the semaphore wait.
93     */
94    if (first) {
95       VkResult result = vk_sync_wait_many(&queue->device->vk,
96                                           sync_info->wait_count,
97                                           sync_info->waits,
98                                           VK_SYNC_WAIT_COMPLETE,
99                                           UINT64_MAX);
100       if (result != VK_SUCCESS)
101          return result;
102    }
103 
104    for (int i = 0; i < 4; i++)
105       queue->last_job_syncs.first[i] = false;
106 
107    return VK_SUCCESS;
108 }
109 
110 static void
111 multisync_free(struct v3dv_device *device,
112                struct drm_v3d_multi_sync *ms)
113 {
114    vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->out_syncs);
115    vk_free(&device->vk.alloc, (void *)(uintptr_t)ms->in_syncs);
116 }
117 
118 static struct drm_v3d_sem *
119 set_in_syncs(struct v3dv_queue *queue,
120              struct v3dv_job *job,
121              enum v3dv_queue_type queue_sync,
122              uint32_t *count,
123              struct vk_sync_wait *waits,
124              unsigned wait_count,
125              struct v3dv_submit_sync_info *sync_info)
126 {
127    struct v3dv_device *device = queue->device;
128    uint32_t n_syncs = 0;
129 
130    /* If this is the first job submitted to a given GPU queue in this cmd buf
131     * batch, it has to wait on wait semaphores (if any) before running.
132     */
133    if (queue->last_job_syncs.first[queue_sync])
134       n_syncs = sync_info->wait_count;
135 
136    /* If the serialize flag is set, the job needs to be serialized against the
137     * corresponding queues. Notice that we may implement transfer operations
138     * as either CL or TFU jobs.
139     *
140     * FIXME: maybe we could track more precisely if the source of a transfer
141     * barrier is a CL and/or a TFU job.
142     */
143    bool sync_csd  = job->serialize & V3DV_BARRIER_COMPUTE_BIT;
144    bool sync_tfu  = job->serialize & V3DV_BARRIER_TRANSFER_BIT;
145    bool sync_cl   = job->serialize & (V3DV_BARRIER_GRAPHICS_BIT |
146                                       V3DV_BARRIER_TRANSFER_BIT);
147    bool sync_cpu  = job->serialize & V3DV_BARRIER_CPU_BIT;
148 
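   /* The in-sync array is built from up to three sources: the submit's wait
    * semaphores (only when this is the first job for this queue type in the
    * batch), any extra waits passed in by the caller (e.g. performance query
    * availability syncs), and the last-job syncobjs of the queue types this
    * job has to serialize against.
    */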
149    *count = n_syncs;
150    if (sync_cl)
151       (*count)++;
152    if (sync_tfu)
153       (*count)++;
154    if (sync_csd)
155       (*count)++;
156    if (sync_cpu)
157       (*count)++;
158 
159    *count += wait_count;
160 
161    if (!*count)
162       return NULL;
163 
164    struct drm_v3d_sem *syncs =
165       vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
166                 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
167 
168    if (!syncs)
169       return NULL;
170 
171    for (int i = 0; i < n_syncs; i++) {
172       syncs[i].handle =
173          vk_sync_as_drm_syncobj(sync_info->waits[i].sync)->syncobj;
174    }
175 
176    for (int i = 0; i < wait_count; i++) {
177       syncs[n_syncs++].handle =
178          vk_sync_as_drm_syncobj(waits[i].sync)->syncobj;
179    }
180 
181    if (sync_cl)
182       syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CL];
183 
184    if (sync_csd)
185       syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CSD];
186 
187    if (sync_tfu)
188       syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_TFU];
189 
190    if (sync_cpu)
191       syncs[n_syncs++].handle = queue->last_job_syncs.syncs[V3DV_QUEUE_CPU];
192 
193    assert(n_syncs == *count);
194    return syncs;
195 }
196 
197 static struct drm_v3d_sem *
198 set_out_syncs(struct v3dv_queue *queue,
199               struct v3dv_job *job,
200               enum v3dv_queue_type queue_sync,
201               uint32_t *count,
202               struct v3dv_submit_sync_info *sync_info,
203               bool signal_syncs)
204 {
205    struct v3dv_device *device = queue->device;
206 
207    uint32_t n_vk_syncs = signal_syncs ? sync_info->signal_count : 0;
208 
209    /* We always signal the syncobj from `queue->last_job_syncs` related to
210     * this v3dv_queue_type to track the last job submitted to this queue.
211     */
212    (*count) = n_vk_syncs + 1;
213 
214    struct drm_v3d_sem *syncs =
215       vk_zalloc(&device->vk.alloc, *count * sizeof(struct drm_v3d_sem),
216                 8, VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
217 
218    if (!syncs)
219       return NULL;
220 
221    if (n_vk_syncs) {
222       for (unsigned i = 0; i < n_vk_syncs; i++) {
223          syncs[i].handle =
224             vk_sync_as_drm_syncobj(sync_info->signals[i].sync)->syncobj;
225       }
226    }
227 
228    syncs[n_vk_syncs].handle = queue->last_job_syncs.syncs[queue_sync];
229 
230    return syncs;
231 }
232 
233 static void
234 set_ext(struct drm_v3d_extension *ext,
235 	struct drm_v3d_extension *next,
236 	uint32_t id,
237 	uintptr_t flags)
238 {
239    ext->next = (uintptr_t)(void *)next;
240    ext->id = id;
241    ext->flags = flags;
242 }
243 
244 /* This function sets the extension for multiple in/out syncobjs. When it is
245  * successful, it sets the extension id to DRM_V3D_EXT_ID_MULTI_SYNC.
246  * Otherwise, the extension id is 0, which means an out-of-memory error.
247  */
248 static void
249 set_multisync(struct drm_v3d_multi_sync *ms,
250               struct v3dv_submit_sync_info *sync_info,
251               struct vk_sync_wait *waits,
252               unsigned wait_count,
253               struct drm_v3d_extension *next,
254               struct v3dv_device *device,
255               struct v3dv_job *job,
256               enum v3dv_queue_type in_queue_sync,
257               enum v3dv_queue_type out_queue_sync,
258               enum v3d_queue wait_stage,
259               bool signal_syncs)
260 {
261    struct v3dv_queue *queue = &device->queue;
262    uint32_t out_sync_count = 0, in_sync_count = 0;
263    struct drm_v3d_sem *out_syncs = NULL, *in_syncs = NULL;
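   /* Both sync arrays are allocated here and released with multisync_free()
    * by the caller once the submit ioctl has consumed them. On failure the
    * extension id is left as 0 so callers can detect the out-of-memory
    * condition.
    */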
264 
265    in_syncs = set_in_syncs(queue, job, in_queue_sync,
266                            &in_sync_count, waits, wait_count, sync_info);
267    if (!in_syncs && in_sync_count)
268       goto fail;
269 
270    out_syncs = set_out_syncs(queue, job, out_queue_sync,
271                              &out_sync_count, sync_info, signal_syncs);
272 
273    assert(out_sync_count > 0);
274 
275    if (!out_syncs)
276       goto fail;
277 
278    set_ext(&ms->base, next, DRM_V3D_EXT_ID_MULTI_SYNC, 0);
279    ms->wait_stage = wait_stage;
280    ms->out_sync_count = out_sync_count;
281    ms->out_syncs = (uintptr_t)(void *)out_syncs;
282    ms->in_sync_count = in_sync_count;
283    ms->in_syncs = (uintptr_t)(void *)in_syncs;
284 
285    return;
286 
287 fail:
288    if (in_syncs)
289       vk_free(&device->vk.alloc, in_syncs);
290    assert(!out_syncs);
291 
292    return;
293 }
294 
295 static VkResult
296 handle_reset_query_cpu_job(struct v3dv_queue *queue,
297                            struct v3dv_job *job,
298                            struct v3dv_submit_sync_info *sync_info,
299                            bool signal_syncs)
300 {
301    struct v3dv_device *device = queue->device;
302    struct v3dv_reset_query_cpu_job_info *info = &job->cpu.query_reset;
303    assert(info->pool);
304 
305    assert(info->pool->query_type != VK_QUERY_TYPE_OCCLUSION);
306 
307    if (device->pdevice->caps.cpu_queue) {
308       assert(info->first + info->count <= info->pool->query_count);
309 
310       struct drm_v3d_submit_cpu submit = {0};
311       struct drm_v3d_multi_sync ms = {0};
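      /* With a CPU queue in the kernel, the reset is submitted as a CPU job
       * carrying a reset-timestamp or reset-performance extension instead of
       * being done on the host, so the kernel orders it against GPU work
       * rather than this thread blocking on it.
       */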
312 
313       uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
314       uintptr_t *kperfmon_ids = NULL;
315 
316       if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
317          submit.bo_handle_count = 1;
318          submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
319 
320          struct drm_v3d_reset_timestamp_query reset = {0};
321 
322          set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_TIMESTAMP_QUERY, 0);
323 
324          reset.count = info->count;
325          reset.offset = info->pool->queries[info->first].timestamp.offset;
326 
327          for (uint32_t i = 0; i < info->count; i++) {
328             struct v3dv_query *query = &info->pool->queries[info->first + i];
329             syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
330          }
331 
332          reset.syncs = (uintptr_t)(void *)syncs;
333 
334          set_multisync(&ms, sync_info, NULL, 0, (void *)&reset, device, job,
335                        V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
336          if (!ms.base.id) {
                free(syncs);
337             return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
             }
338       } else {
339          assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
340          struct drm_v3d_reset_performance_query reset = {0};
341 
342          set_ext(&reset.base, NULL, DRM_V3D_EXT_ID_CPU_RESET_PERFORMANCE_QUERY, 0);
343 
344          struct vk_sync_wait waits[info->count];
345          unsigned wait_count = 0;
346          for (int i = 0; i < info->count; i++) {
347             struct v3dv_query *query = &info->pool->queries[info->first + i];
348             /* Only wait for a query if we've used it; otherwise we would be
349              * waiting forever for the fence to become signaled.
350              */
351             if (query->maybe_available) {
352                waits[wait_count] = (struct vk_sync_wait){
353                   .sync = query->perf.last_job_sync
354                };
355                wait_count++;
356             }
357          }
358 
359          reset.count = info->count;
360          reset.nperfmons = info->pool->perfmon.nperfmons;
361 
362          kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
363 
364          for (uint32_t i = 0; i < info->count; i++) {
365             struct v3dv_query *query = &info->pool->queries[info->first + i];
366 
367             syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
368             kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
369          }
370 
371          reset.syncs = (uintptr_t)(void *)syncs;
372          reset.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
373 
374          set_multisync(&ms, sync_info, waits, wait_count, (void *)&reset, device, job,
375                        V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
376          if (!ms.base.id) {
                free(syncs);
                free(kperfmon_ids);
377             return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
             }
378       }
379 
380       submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
381       submit.extensions = (uintptr_t)(void *)&ms;
382 
383       /* From the Vulkan spec for vkCmdResetQueryPool:
384        *
385        *    "This command defines an execution dependency between other query commands
386        *     that reference the same query.
387        *     ...
388        *     The second synchronization scope includes all commands which reference the
389        *     queries in queryPool indicated by firstQuery and queryCount that occur later
390        *     in submission order."
391        *
392        * This means we should ensure that any timestamps after a reset don't execute before
393        * the reset; however, for timestamp queries in particular we don't have to do
394        * anything special because timestamp queries have to wait for all previously
395        * submitted work to complete before executing (which we accomplish by using
396        * V3DV_BARRIER_ALL on them) and that includes reset jobs submitted to the CPU queue.
397        */
398       int ret = v3dv_ioctl(device->pdevice->render_fd,
399                            DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
400 
401       free(syncs);
402       free(kperfmon_ids);
403       multisync_free(device, &ms);
404 
405       queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
406 
407       if (ret)
408          return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
409 
410       return VK_SUCCESS;
411    }
412 
413    /* We are about to reset query counters in user-space so we need to make
414     * sure that the GPU is not using them.
415     */
416    if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
417       VkResult result = queue_wait_idle(queue, sync_info);
418       if (result != VK_SUCCESS)
419          return result;
420 
421       v3dv_bo_wait(job->device, info->pool->timestamp.bo, OS_TIMEOUT_INFINITE);
422    }
423 
424    if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
425       struct vk_sync_wait waits[info->count];
426       unsigned wait_count = 0;
427       for (int i = 0; i < info->count; i++) {
428          struct v3dv_query *query = &info->pool->queries[info->first + i];
429          /* Only wait for a query if we've used it; otherwise we would be
430           * waiting forever for the fence to become signaled.
431           */
432          if (query->maybe_available) {
433             waits[wait_count] = (struct vk_sync_wait){
434                .sync = query->perf.last_job_sync
435             };
436             wait_count++;
437          }
438       }
439 
440       VkResult result = vk_sync_wait_many(&job->device->vk, wait_count, waits,
441                                           VK_SYNC_WAIT_COMPLETE, UINT64_MAX);
442 
443       if (result != VK_SUCCESS)
444          return result;
445    }
446 
447    v3dv_reset_query_pool_cpu(job->device, info->pool, info->first, info->count);
448 
449    return VK_SUCCESS;
450 }
451 
452 static VkResult
453 export_perfmon_last_job_sync(struct v3dv_queue *queue, struct v3dv_job *job, int *fd)
454 {
455    int err;
456    static const enum v3dv_queue_type queues_to_sync[] = {
457       V3DV_QUEUE_CL,
458       V3DV_QUEUE_CSD,
459    };
460 
461    for (uint32_t i = 0; i < ARRAY_SIZE(queues_to_sync); i++) {
462       enum v3dv_queue_type queue_type = queues_to_sync[i];
463       int tmp_fd = -1;
464 
465       err = drmSyncobjExportSyncFile(job->device->pdevice->render_fd,
466                                      queue->last_job_syncs.syncs[queue_type],
467                                      &tmp_fd);
468 
469       if (err) {
470          close(*fd);
471          return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
472                           "sync file export failed: %m");
473       }
474 
475       err = sync_accumulate("v3dv", fd, tmp_fd);
476 
477       if (err) {
478          close(tmp_fd);
479          close(*fd);
480          return vk_errorf(&job->device->queue, VK_ERROR_UNKNOWN,
481                           "failed to accumulate sync files: %m");
482       }
483    }
484 
485    return VK_SUCCESS;
486 }
487 
488 static VkResult
489 handle_end_query_cpu_job(struct v3dv_job *job, uint32_t counter_pass_idx)
490 {
491    VkResult result = VK_SUCCESS;
492 
493    mtx_lock(&job->device->query_mutex);
494 
495    struct v3dv_end_query_info *info = &job->cpu.query_end;
496    struct v3dv_queue *queue = &job->device->queue;
497 
498    int err = 0;
499    int fd = -1;
500 
501    assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
502 
503    if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
504       result = export_perfmon_last_job_sync(queue, job, &fd);
505 
506       if (result != VK_SUCCESS)
507          goto fail;
508 
509       assert(fd >= 0);
510    }
511 
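   /* The sync file accumulated above tracks the last CL and CSD jobs; by
    * importing it into each query's syncobj below, waiting on a query becomes
    * equivalent to waiting for the GPU work that produced its results.
    */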
512    for (uint32_t i = 0; i < info->count; i++) {
513       assert(info->query + i < info->pool->query_count);
514       struct v3dv_query *query = &info->pool->queries[info->query + i];
515 
516       if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
517          uint32_t syncobj = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
518          err = drmSyncobjImportSyncFile(job->device->pdevice->render_fd,
519                                         syncobj, fd);
520 
521          if (err) {
522             result = vk_errorf(queue, VK_ERROR_UNKNOWN,
523                                "sync file import failed: %m");
524             goto fail;
525          }
526       }
527 
528       query->maybe_available = true;
529    }
530 
531 fail:
532    if (info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR)
533       close(fd);
534 
535    cnd_broadcast(&job->device->query_ended);
536    mtx_unlock(&job->device->query_mutex);
537 
538    return result;
539 }
540 
541 static VkResult
542 handle_copy_query_results_cpu_job(struct v3dv_queue *queue,
543                                   struct v3dv_job *job,
544                                   struct v3dv_submit_sync_info *sync_info,
545                                   bool signal_syncs)
546 {
547    struct v3dv_device *device = queue->device;
548    struct v3dv_copy_query_results_cpu_job_info *info =
549       &job->cpu.query_copy_results;
550 
551    assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
552           info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP);
553 
554    assert(info->dst && info->dst->mem && info->dst->mem->bo);
555    struct v3dv_bo *bo = info->dst->mem->bo;
556 
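   /* Two paths: if the kernel exposes a CPU queue we submit the copy as a CPU
    * job with a copy-timestamp or copy-performance extension; otherwise we map
    * the destination BO and resolve the results on the host further below.
    */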
557    if (device->pdevice->caps.cpu_queue) {
558       struct drm_v3d_submit_cpu submit = {0};
559       struct drm_v3d_multi_sync ms = {0};
560 
561       uint32_t *offsets = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
562       uint32_t *syncs = (uint32_t *) malloc(sizeof(uint32_t) * info->count);
563       uint32_t *bo_handles = NULL;
564       uintptr_t *kperfmon_ids = NULL;
565 
566       if (info->pool->query_type == VK_QUERY_TYPE_TIMESTAMP) {
567          submit.bo_handle_count = 2;
568 
569          bo_handles = (uint32_t *)
570             malloc(sizeof(uint32_t) * submit.bo_handle_count);
571 
572          bo_handles[0] = bo->handle;
573          bo_handles[1] = info->pool->timestamp.bo->handle;
574          submit.bo_handles = (uintptr_t)(void *)bo_handles;
575 
576          struct drm_v3d_copy_timestamp_query copy = {0};
577 
578          set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_TIMESTAMP_QUERY, 0);
579 
580          copy.do_64bit = info->flags & VK_QUERY_RESULT_64_BIT;
581          copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
582          copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
583          copy.offset = info->offset + info->dst->mem_offset;
584          copy.stride = info->stride;
585          copy.count = info->count;
586 
587          for (uint32_t i = 0; i < info->count; i++) {
588             assert(info->first < info->pool->query_count);
589             assert(info->first + info->count <= info->pool->query_count);
590             struct v3dv_query *query = &info->pool->queries[info->first + i];
591 
592             offsets[i] = query->timestamp.offset;
593             syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
594          }
595 
596          copy.offsets = (uintptr_t)(void *)offsets;
597          copy.syncs = (uintptr_t)(void *)syncs;
598 
599          set_multisync(&ms, sync_info, NULL, 0, (void *)&copy, device, job,
600                        V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
601          if (!ms.base.id) {
                free(bo_handles);
                free(offsets);
                free(syncs);
602             return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
             }
603       } else {
604          assert(info->pool->query_type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR);
605 
606          submit.bo_handle_count = 1;
607          submit.bo_handles = (uintptr_t)(void *)&bo->handle;
608 
609          struct drm_v3d_copy_performance_query copy = {0};
610 
611          set_ext(&copy.base, NULL, DRM_V3D_EXT_ID_CPU_COPY_PERFORMANCE_QUERY, 0);
612 
613 	 /* If the queryPool was created with VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR,
614 	  * results for each query are written as an array of the type indicated
615 	  * by VkPerformanceCounterKHR::storage for the counter being queried.
616 	  * For v3dv, VkPerformanceCounterKHR::storage is
617 	  * VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR.
618 	  */
619          copy.do_64bit = true;
620          copy.do_partial = info->flags & VK_QUERY_RESULT_PARTIAL_BIT;
621          copy.availability_bit = info->flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT;
622          copy.offset = info->offset + info->dst->mem_offset;
623          copy.stride = info->stride;
624          copy.count = info->count;
625          copy.nperfmons = info->pool->perfmon.nperfmons;
626          copy.ncounters = info->pool->perfmon.ncounters;
627 
628          kperfmon_ids = (uintptr_t *) malloc(sizeof(uintptr_t) * info->count);
629 
630          struct vk_sync_wait waits[info->count];
631          unsigned wait_count = 0;
632 
633          for (uint32_t i = 0; i < info->count; i++) {
634             assert(info->first < info->pool->query_count);
635             assert(info->first + info->count <= info->pool->query_count);
636             struct v3dv_query *query = &info->pool->queries[info->first + i];
637 
638             syncs[i] = vk_sync_as_drm_syncobj(query->perf.last_job_sync)->syncobj;
639             kperfmon_ids[i] = (uintptr_t)(void *)query->perf.kperfmon_ids;
640 
641             if (info->flags & VK_QUERY_RESULT_WAIT_BIT) {
642                 waits[wait_count] = (struct vk_sync_wait){
643                    .sync = query->perf.last_job_sync
644                 };
645                 wait_count++;
646             }
647          }
648 
649          copy.syncs = (uintptr_t)(void *)syncs;
650          copy.kperfmon_ids = (uintptr_t)(void *)kperfmon_ids;
651 
652          set_multisync(&ms, sync_info, waits, wait_count, (void *)&copy, device, job,
653                        V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
654          if (!ms.base.id) {
                free(kperfmon_ids);
                free(offsets);
                free(syncs);
655             return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
             }
656       }
657 
658       submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
659       submit.extensions = (uintptr_t)(void *)&ms;
660 
661       int ret = v3dv_ioctl(device->pdevice->render_fd,
662                            DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
663 
664       free(kperfmon_ids);
665       free(bo_handles);
666       free(offsets);
667       free(syncs);
668       multisync_free(device, &ms);
669 
670       queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
671 
672       if (ret)
673          return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
674 
675       return VK_SUCCESS;
676    }
677 
678    /* Map the entire dst buffer for the CPU copy if needed */
679    assert(!bo->map || bo->map_size == bo->size);
680    if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
681       return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
682 
683    uint8_t *offset = ((uint8_t *) bo->map) +
684                      info->offset + info->dst->mem_offset;
685    v3dv_get_query_pool_results_cpu(job->device,
686                                    info->pool,
687                                    info->first,
688                                    info->count,
689                                    offset,
690                                    info->stride,
691                                    info->flags);
692 
693    return VK_SUCCESS;
694 }
695 
696 static VkResult
697 handle_timestamp_query_cpu_job(struct v3dv_queue *queue,
698                                struct v3dv_job *job,
699                                struct v3dv_submit_sync_info *sync_info,
700                                bool signal_syncs)
701 {
702    struct v3dv_device *device = queue->device;
703 
704    assert(job->type == V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY);
705    struct v3dv_timestamp_query_cpu_job_info *info = &job->cpu.query_timestamp;
706 
707    if (!device->pdevice->caps.cpu_queue) {
708       /* Wait for completion of all work queued before the timestamp query */
709       VkResult result = queue_wait_idle(queue, sync_info);
710       if (result != VK_SUCCESS)
711          return result;
712 
713       mtx_lock(&job->device->query_mutex);
714 
715       /* Compute timestamp */
716       struct timespec t;
717       clock_gettime(CLOCK_MONOTONIC, &t);
718 
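      /* Without a CPU queue we take a single CLOCK_MONOTONIC timestamp on the
       * host: only the first query in the range receives the value and the
       * remaining ones (presumably the extra per-view queries used with
       * multiview) are written as zero, then each query's syncobj is signaled
       * to flag availability.
       */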
719       for (uint32_t i = 0; i < info->count; i++) {
720          assert(info->query + i < info->pool->query_count);
721 	 struct v3dv_query *query = &info->pool->queries[info->query + i];
722          query->maybe_available = true;
723 
724          /* Value */
725          uint8_t *value_addr =
726             ((uint8_t *) info->pool->timestamp.bo->map) + query->timestamp.offset;
727          *((uint64_t*)value_addr) = (i == 0) ? t.tv_sec * 1000000000ull + t.tv_nsec : 0ull;
728 
729          /* Availability */
730          result = vk_sync_signal(&job->device->vk, query->timestamp.sync, 0);
731       }
732 
733       cnd_broadcast(&job->device->query_ended);
734       mtx_unlock(&job->device->query_mutex);
735 
736       return result;
737    }
738 
739    struct drm_v3d_submit_cpu submit = {0};
740 
741    submit.bo_handle_count = 1;
742    submit.bo_handles = (uintptr_t)(void *)&info->pool->timestamp.bo->handle;
743 
744    struct drm_v3d_timestamp_query timestamp = {0};
745 
746    set_ext(&timestamp.base, NULL, DRM_V3D_EXT_ID_CPU_TIMESTAMP_QUERY, 0);
747 
748    timestamp.count = info->count;
749 
750    uint32_t *offsets =
751       (uint32_t *) malloc(sizeof(uint32_t) * info->count);
752    uint32_t *syncs =
753       (uint32_t *) malloc(sizeof(uint32_t) * info->count);
754 
755    for (uint32_t i = 0; i < info->count; i++) {
756       assert(info->query + i < info->pool->query_count);
757       struct v3dv_query *query = &info->pool->queries[info->query + i];
758       query->maybe_available = true;
759 
760       offsets[i] = query->timestamp.offset;
761       syncs[i] = vk_sync_as_drm_syncobj(query->timestamp.sync)->syncobj;
762    }
763 
764    timestamp.offsets = (uintptr_t)(void *)offsets;
765    timestamp.syncs = (uintptr_t)(void *)syncs;
766 
767    struct drm_v3d_multi_sync ms = {0};
768 
769    /* The CPU job should be serialized so it only executes after all previously
770     * submitted work has completed.
771     */
772    job->serialize = V3DV_BARRIER_ALL;
773 
774    set_multisync(&ms, sync_info, NULL, 0, (void *)&timestamp, device, job,
775 	         V3DV_QUEUE_CPU, V3DV_QUEUE_CPU, V3D_CPU, signal_syncs);
776    if (!ms.base.id) {
          free(offsets);
          free(syncs);
777       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
       }
778 
779    submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
780    submit.extensions = (uintptr_t)(void *)&ms;
781 
782    int ret = v3dv_ioctl(device->pdevice->render_fd,
783 			DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
784 
785    free(offsets);
786    free(syncs);
787    multisync_free(device, &ms);
788 
789    queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
790 
791    if (ret)
792       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
793 
794    return VK_SUCCESS;
795 }
796 
797 static VkResult
798 handle_csd_indirect_cpu_job(struct v3dv_queue *queue,
799                             struct v3dv_job *job,
800                             struct v3dv_submit_sync_info *sync_info,
801                             bool signal_syncs)
802 {
803    struct v3dv_device *device = queue->device;
804 
805    assert(job->type == V3DV_JOB_TYPE_CPU_CSD_INDIRECT);
806    struct v3dv_csd_indirect_cpu_job_info *info = &job->cpu.csd_indirect;
807    assert(info->csd_job);
808 
809    assert(info->buffer && info->buffer->mem && info->buffer->mem->bo);
810    struct v3dv_bo *bo = info->buffer->mem->bo;
811 
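   /* An indirect dispatch reads its workgroup counts from a GPU buffer: if the
    * kernel has no CPU queue we stall until that buffer is idle, read the
    * counts on the host and rewrite the CSD job when they changed; with a CPU
    * queue the kernel patches the CSD job itself via the indirect-CSD
    * extension submitted below.
    */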
812    if (!device->pdevice->caps.cpu_queue) {
813       /* Make sure the GPU is no longer using the indirect buffer */
814       v3dv_bo_wait(queue->device, bo, OS_TIMEOUT_INFINITE);
815 
816       /* Map the indirect buffer and read the dispatch parameters */
817       if (!bo->map && !v3dv_bo_map(job->device, bo, bo->size))
818          return vk_error(job->device, VK_ERROR_OUT_OF_HOST_MEMORY);
819       assert(bo->map);
820 
821       const uint32_t offset = info->buffer->mem_offset + info->offset;
822       const uint32_t *group_counts = (uint32_t *) (bo->map + offset);
823       if (group_counts[0] == 0 || group_counts[1] == 0 || group_counts[2] == 0)
824          return VK_SUCCESS;
825 
826       if (memcmp(group_counts, info->csd_job->csd.wg_count,
827 		 sizeof(info->csd_job->csd.wg_count)) != 0) {
828          v3dv_cmd_buffer_rewrite_indirect_csd_job(queue->device, info, group_counts);
829       }
830 
831       return VK_SUCCESS;
832    }
833 
834    struct v3dv_job *csd_job = info->csd_job;
835 
836    struct drm_v3d_submit_cpu submit = {0};
837 
838    submit.bo_handle_count = 1;
839    submit.bo_handles = (uintptr_t)(void *)&bo->handle;
840 
841    csd_job->csd.submit.bo_handle_count = csd_job->bo_count;
842    uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * csd_job->bo_count);
843    uint32_t bo_idx = 0;
844    set_foreach (csd_job->bos, entry) {
845       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
846       bo_handles[bo_idx++] = bo->handle;
847    }
848    csd_job->csd.submit.bo_handles = (uintptr_t)(void *)bo_handles;
849 
850    struct drm_v3d_indirect_csd indirect = {0};
851 
852    set_ext(&indirect.base, NULL, DRM_V3D_EXT_ID_CPU_INDIRECT_CSD, 0);
853 
854    indirect.submit = csd_job->csd.submit;
855    indirect.offset = info->buffer->mem_offset + info->offset;
856    indirect.wg_size = info->wg_size;
857 
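   /* The uniform offsets are expressed in 32-bit words relative to the start
    * of the job's indirect uniforms BO; 0xffffffff marks a dispatch dimension
    * whose uniform does not need to be rewritten by the kernel.
    */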
858    for (int i = 0; i < 3; i++) {
859       if (info->wg_uniform_offsets[i]) {
860          assert(info->wg_uniform_offsets[i] >= (uint32_t *) csd_job->indirect.base);
861          indirect.wg_uniform_offsets[i] = info->wg_uniform_offsets[i] - (uint32_t *) csd_job->indirect.base;
862       } else {
863          indirect.wg_uniform_offsets[i] = 0xffffffff; /* No rewrite */
864       }
865    }
866 
867    indirect.indirect = csd_job->indirect.bo->handle;
868 
869    struct drm_v3d_multi_sync ms = {0};
870 
871    /* We need to configure the semaphores of this job with the indirect
872     * CSD job, as the CPU job must obey the CSD job's synchronization
873     * demands, such as barriers.
874     */
875    set_multisync(&ms, sync_info, NULL, 0, (void *)&indirect, device, csd_job,
876 	         V3DV_QUEUE_CPU, V3DV_QUEUE_CSD, V3D_CPU, signal_syncs);
877    if (!ms.base.id) {
          free(bo_handles);
878       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
       }
879 
880    submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
881    submit.extensions = (uintptr_t)(void *)&ms;
882 
883    int ret = v3dv_ioctl(device->pdevice->render_fd,
884 			DRM_IOCTL_V3D_SUBMIT_CPU, &submit);
885 
886    free(bo_handles);
887    multisync_free(device, &ms);
888 
889    queue->last_job_syncs.first[V3DV_QUEUE_CPU] = false;
890    queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;
891 
892    if (ret)
893       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CPU failed: %m");
894 
895    return VK_SUCCESS;
896 }
897 
898 static VkResult
899 handle_cl_job(struct v3dv_queue *queue,
900               struct v3dv_job *job,
901               uint32_t counter_pass_idx,
902               struct v3dv_submit_sync_info *sync_info,
903               bool signal_syncs)
904 {
905    struct v3dv_device *device = queue->device;
906 
907    struct drm_v3d_submit_cl submit = { 0 };
908 
909    /* Sanity check: we should only flag a bcl sync on a job that needs to be
910     * serialized.
911     */
912    assert(job->serialize || !job->needs_bcl_sync);
913 
914    /* We expect to have just one RCL per job, which should fit in just one BO.
915     * Our BCL, however, could chain multiple BOs together.
916     */
917    assert(list_length(&job->rcl.bo_list) == 1);
918    assert(list_length(&job->bcl.bo_list) >= 1);
919    struct v3dv_bo *bcl_first_bo =
920       list_first_entry(&job->bcl.bo_list, struct v3dv_bo, list_link);
921    submit.bcl_start = bcl_first_bo->offset;
922    submit.bcl_end = job->suspending ? job->suspended_bcl_end :
923                                       job->bcl.bo->offset + v3dv_cl_offset(&job->bcl);
924    submit.rcl_start = job->rcl.bo->offset;
925    submit.rcl_end = job->rcl.bo->offset + v3dv_cl_offset(&job->rcl);
926 
927    submit.qma = job->tile_alloc->offset;
928    submit.qms = job->tile_alloc->size;
929    submit.qts = job->tile_state->offset;
930 
931    submit.flags = 0;
932    if (job->tmu_dirty_rcl)
933       submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE;
934 
935    /* If the job uses VK_KHR_buffer_device_address we need to ensure all
936     * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
937     * are included.
938     */
939    if (job->uses_buffer_device_address) {
940       util_dynarray_foreach(&queue->device->device_address_bo_list,
941                             struct v3dv_bo *, bo) {
942          v3dv_job_add_bo(job, *bo);
943       }
944    }
945 
946    submit.bo_handle_count = job->bo_count;
947    uint32_t *bo_handles =
948       (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count);
949    uint32_t bo_idx = 0;
950    set_foreach(job->bos, entry) {
951       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
952       bo_handles[bo_idx++] = bo->handle;
953    }
954    assert(bo_idx == submit.bo_handle_count);
955    submit.bo_handles = (uintptr_t)(void *)bo_handles;
956 
957    submit.perfmon_id = job->perf ?
958       job->perf->kperfmon_ids[counter_pass_idx] : 0;
959    const bool needs_perf_sync = queue->last_perfmon_id != submit.perfmon_id;
960    queue->last_perfmon_id = submit.perfmon_id;
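   /* Switching to a different perfmon forces a binning sync, presumably so
    * that counters from different passes are not sampled while the previous
    * job is still in flight.
    */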
961 
962    /* We need a binning sync if we are the first CL job waiting on a semaphore
963     * with a wait stage that involves the geometry pipeline, or if the job
964     * comes after a pipeline barrier that involves geometry stages
965     * (needs_bcl_sync) or when performance queries are in use.
966     *
967     * We need a render sync if the job doesn't need a binning sync but has
968     * still been flagged for serialization. It should be noted that RCL jobs
969     * don't start until the previous RCL job has finished so we don't really
970     * need to add a fence for those, however, we might need to wait on a CSD or
971     * TFU job, which are not automatically serialized with CL jobs.
972     */
973    bool needs_bcl_sync = job->needs_bcl_sync || needs_perf_sync;
974    if (queue->last_job_syncs.first[V3DV_QUEUE_CL]) {
975       for (int i = 0; !needs_bcl_sync && i < sync_info->wait_count; i++) {
976          needs_bcl_sync = sync_info->waits[i].stage_mask &
977              (VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT |
978               VK_PIPELINE_STAGE_2_ALL_GRAPHICS_BIT |
979               VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT |
980               VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
981               VK_PIPELINE_STAGE_2_INDEX_INPUT_BIT |
982               VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
983               VK_PIPELINE_STAGE_2_VERTEX_ATTRIBUTE_INPUT_BIT |
984               VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT |
985               VK_PIPELINE_STAGE_2_TESSELLATION_CONTROL_SHADER_BIT |
986               VK_PIPELINE_STAGE_2_TESSELLATION_EVALUATION_SHADER_BIT |
987               VK_PIPELINE_STAGE_2_GEOMETRY_SHADER_BIT |
988               VK_PIPELINE_STAGE_2_PRE_RASTERIZATION_SHADERS_BIT);
989       }
990    }
991 
992    bool needs_rcl_sync = job->serialize && !needs_bcl_sync;
993 
994    /* Replace the single-semaphore settings whenever our kernel driver supports
995     * the multiple semaphores extension.
996     */
997    struct drm_v3d_multi_sync ms = { 0 };
998    enum v3d_queue wait_stage = needs_rcl_sync ? V3D_RENDER : V3D_BIN;
999    set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
1000                  V3DV_QUEUE_CL, V3DV_QUEUE_CL, wait_stage, signal_syncs);
1001    if (!ms.base.id) {
           free(bo_handles);
1002       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
        }
1003 
1004    submit.flags |= DRM_V3D_SUBMIT_EXTENSION;
1005    submit.extensions = (uintptr_t)(void *)&ms;
1006 
1007    /* We are using multisync so disable legacy single-sync interface */
1008    submit.in_sync_rcl = 0;
1009    submit.in_sync_bcl = 0;
1010    submit.out_sync = 0;
1011 
1012    v3dv_clif_dump(device, job, &submit);
1013    int ret = v3dv_ioctl(device->pdevice->render_fd,
1014                         DRM_IOCTL_V3D_SUBMIT_CL, &submit);
1015 
1016    static bool warned = false;
1017    if (ret && !warned) {
1018       fprintf(stderr, "Draw call returned %s. Expect corruption.\n",
1019               strerror(errno));
1020       warned = true;
1021    }
1022 
1023    free(bo_handles);
1024    multisync_free(device, &ms);
1025 
1026    queue->last_job_syncs.first[V3DV_QUEUE_CL] = false;
1027 
1028    if (ret)
1029       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CL failed: %m");
1030 
1031    return VK_SUCCESS;
1032 }
1033 
1034 static VkResult
1035 handle_tfu_job(struct v3dv_queue *queue,
1036                struct v3dv_job *job,
1037                struct v3dv_submit_sync_info *sync_info,
1038                bool signal_syncs)
1039 {
1040    assert(!V3D_DBG(DISABLE_TFU));
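   /* The TFU submit struct (job->tfu) was filled at command buffer recording
    * time; here we only attach the multisync extension and hand it to the
    * kernel.
    */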
1041 
1042    struct v3dv_device *device = queue->device;
1043 
1044    /* Replace the single-semaphore settings whenever our kernel driver supports
1045     * the multiple semaphores extension.
1046     */
1047    struct drm_v3d_multi_sync ms = { 0 };
1048    set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
1049                  V3DV_QUEUE_TFU, V3DV_QUEUE_TFU, V3D_TFU, signal_syncs);
1050    if (!ms.base.id)
1051       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
1052 
1053    job->tfu.flags |= DRM_V3D_SUBMIT_EXTENSION;
1054    job->tfu.extensions = (uintptr_t)(void *)&ms;
1055 
1056    /* We are using multisync so disable legacy single-sync interface */
1057    job->tfu.in_sync = 0;
1058    job->tfu.out_sync = 0;
1059 
1060    int ret = v3dv_ioctl(device->pdevice->render_fd,
1061                         DRM_IOCTL_V3D_SUBMIT_TFU, &job->tfu);
1062 
1063    multisync_free(device, &ms);
1064    queue->last_job_syncs.first[V3DV_QUEUE_TFU] = false;
1065 
1066    if (ret != 0)
1067       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_TFU failed: %m");
1068 
1069    return VK_SUCCESS;
1070 }
1071 
1072 static VkResult
1073 handle_csd_job(struct v3dv_queue *queue,
1074                struct v3dv_job *job,
1075                uint32_t counter_pass_idx,
1076                struct v3dv_submit_sync_info *sync_info,
1077                bool signal_syncs)
1078 {
1079    struct v3dv_device *device = queue->device;
1080 
1081    struct drm_v3d_submit_csd *submit = &job->csd.submit;
1082 
1083    /* If the job uses VK_KHR_buffer_device_address we need to ensure all
1084     * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT
1085     * are included.
1086     */
1087    if (job->uses_buffer_device_address) {
1088       util_dynarray_foreach(&queue->device->device_address_bo_list,
1089                             struct v3dv_bo *, bo) {
1090          v3dv_job_add_bo(job, *bo);
1091       }
1092    }
1093 
1094    submit->bo_handle_count = job->bo_count;
1095    uint32_t *bo_handles =
1096       (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));
1097    uint32_t bo_idx = 0;
1098    set_foreach(job->bos, entry) {
1099       struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
1100       bo_handles[bo_idx++] = bo->handle;
1101    }
1102    assert(bo_idx == submit->bo_handle_count);
1103    submit->bo_handles = (uintptr_t)(void *)bo_handles;
1104 
1105    /* Replace the single-semaphore settings whenever our kernel driver supports
1106     * the multiple semaphores extension.
1107     */
1108    struct drm_v3d_multi_sync ms = { 0 };
1109    set_multisync(&ms, sync_info, NULL, 0, NULL, device, job,
1110                  V3DV_QUEUE_CSD, V3DV_QUEUE_CSD, V3D_CSD, signal_syncs);
1111    if (!ms.base.id) {
           free(bo_handles);
1112       return vk_error(device->instance, VK_ERROR_OUT_OF_HOST_MEMORY);
        }
1113 
1114    submit->flags |= DRM_V3D_SUBMIT_EXTENSION;
1115    submit->extensions = (uintptr_t)(void *)&ms;
1116 
1117    /* We are using multisync so disable legacy single-sync interface */
1118    submit->in_sync = 0;
1119    submit->out_sync = 0;
1120 
1121    submit->perfmon_id = job->perf ?
1122       job->perf->kperfmon_ids[counter_pass_idx] : 0;
1123    queue->last_perfmon_id = submit->perfmon_id;
1124 
1125    int ret = v3dv_ioctl(device->pdevice->render_fd,
1126                         DRM_IOCTL_V3D_SUBMIT_CSD, submit);
1127 
1128    static bool warned = false;
1129    if (ret && !warned) {
1130       fprintf(stderr, "Compute dispatch returned %s. Expect corruption.\n",
1131               strerror(errno));
1132       warned = true;
1133    }
1134 
1135    free(bo_handles);
1136 
1137    multisync_free(device, &ms);
1138    queue->last_job_syncs.first[V3DV_QUEUE_CSD] = false;
1139 
1140    if (ret)
1141       return vk_queue_set_lost(&queue->vk, "V3D_SUBMIT_CSD failed: %m");
1142 
1143    return VK_SUCCESS;
1144 }
1145 
1146 static VkResult
1147 queue_handle_job(struct v3dv_queue *queue,
1148                  struct v3dv_job *job,
1149                  uint32_t counter_pass_idx,
1150                  struct v3dv_submit_sync_info *sync_info,
1151                  bool signal_syncs)
1152 {
1153    switch (job->type) {
1154    case V3DV_JOB_TYPE_GPU_CL:
1155       return handle_cl_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
1156    case V3DV_JOB_TYPE_GPU_TFU:
1157       return handle_tfu_job(queue, job, sync_info, signal_syncs);
1158    case V3DV_JOB_TYPE_GPU_CSD:
1159       return handle_csd_job(queue, job, counter_pass_idx, sync_info, signal_syncs);
1160    case V3DV_JOB_TYPE_CPU_RESET_QUERIES:
1161       return handle_reset_query_cpu_job(queue, job, sync_info, signal_syncs);
1162    case V3DV_JOB_TYPE_CPU_END_QUERY:
1163       return handle_end_query_cpu_job(job, counter_pass_idx);
1164    case V3DV_JOB_TYPE_CPU_COPY_QUERY_RESULTS:
1165       return handle_copy_query_results_cpu_job(queue, job, sync_info, signal_syncs);
1166    case V3DV_JOB_TYPE_CPU_CSD_INDIRECT:
1167       return handle_csd_indirect_cpu_job(queue, job, sync_info, signal_syncs);
1168    case V3DV_JOB_TYPE_CPU_TIMESTAMP_QUERY:
1169       return handle_timestamp_query_cpu_job(queue, job, sync_info, signal_syncs);
1170    default:
1171       unreachable("Unhandled job type");
1172    }
1173 }
1174 
1175 static VkResult
1176 queue_create_noop_job(struct v3dv_queue *queue)
1177 {
1178    struct v3dv_device *device = queue->device;
1179    queue->noop_job = vk_zalloc(&device->vk.alloc, sizeof(struct v3dv_job), 8,
1180                                VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1181    if (!queue->noop_job)
1182       return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1183    v3dv_job_init(queue->noop_job, V3DV_JOB_TYPE_GPU_CL, device, NULL, -1);
1184 
1185    v3dv_X(device, job_emit_noop)(queue->noop_job);
1186 
1187    /* We use no-op jobs to signal semaphores/fences. These jobs need to be
1188     * serialized across all hw queues to comply with Vulkan's signal operation
1189     * order requirements, which basically require that signal operations occur
1190     * in submission order.
1191     */
1192    queue->noop_job->serialize = V3DV_BARRIER_ALL;
1193 
1194    return VK_SUCCESS;
1195 }
1196 
1197 static VkResult
1198 queue_submit_noop_job(struct v3dv_queue *queue,
1199                       uint32_t counter_pass_idx,
1200                       struct v3dv_submit_sync_info *sync_info,
1201                       bool signal_syncs)
1202 {
1203    if (!queue->noop_job) {
1204       VkResult result = queue_create_noop_job(queue);
1205       if (result != VK_SUCCESS)
1206          return result;
1207    }
1208 
1209    assert(queue->noop_job);
1210    return queue_handle_job(queue, queue->noop_job, counter_pass_idx,
1211                            sync_info, signal_syncs);
1212 }
1213 
1214 VkResult
1215 v3dv_queue_driver_submit(struct vk_queue *vk_queue,
1216                          struct vk_queue_submit *submit)
1217 {
1218    struct v3dv_queue *queue = container_of(vk_queue, struct v3dv_queue, vk);
1219    VkResult result;
1220 
1221    struct v3dv_submit_sync_info sync_info = {
1222       .wait_count = submit->wait_count,
1223       .waits = submit->waits,
1224       .signal_count = submit->signal_count,
1225       .signals = submit->signals,
1226    };
1227 
1228    for (int i = 0; i < V3DV_QUEUE_COUNT; i++)
1229       queue->last_job_syncs.first[i] = true;
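   /* last_job_syncs.first[] tracks, per queue type, whether any job has been
    * submitted yet in this vkQueueSubmit: only the first job on each queue
    * type waits on the submit's wait semaphores, later jobs rely on the
    * per-queue syncobjs instead (see set_in_syncs()).
    */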
1230 
1231    struct v3dv_job *first_suspend_job = NULL;
1232    struct v3dv_job *current_suspend_job = NULL;
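   /* Jobs flagged as suspending/resuming (e.g. dynamic rendering suspend and
    * resume across command buffers) are chained together: each resuming job is
    * patched to continue from the previous piece and only the first job of the
    * chain is submitted, once a non-suspending job closes it.
    */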
1233    for (uint32_t i = 0; i < submit->command_buffer_count; i++) {
1234       struct v3dv_cmd_buffer *cmd_buffer =
1235          container_of(submit->command_buffers[i], struct v3dv_cmd_buffer, vk);
1236       list_for_each_entry_safe(struct v3dv_job, job,
1237                                &cmd_buffer->jobs, list_link) {
1238          if (job->suspending) {
1239             job = v3dv_X(job->device,
1240                          cmd_buffer_prepare_suspend_job_for_submit)(job);
1241             if (!job)
1242                return VK_ERROR_OUT_OF_DEVICE_MEMORY;
1243          }
1244 
1245          if (job->suspending && !job->resuming) {
1246             assert(!first_suspend_job);
1247             assert(!current_suspend_job);
1248             first_suspend_job = job;
1249          }
1250 
1251          if (job->resuming) {
1252             assert(first_suspend_job);
1253             assert(current_suspend_job);
1254             v3dv_X(job->device, job_patch_resume_address)(first_suspend_job,
1255                                                           current_suspend_job,
1256                                                           job);
1257             current_suspend_job = NULL;
1258          }
1259 
1260          if (job->suspending) {
1261             current_suspend_job = job;
1262          } else {
1263             assert(!current_suspend_job);
1264             struct v3dv_job *submit_job = first_suspend_job ?
1265                                           first_suspend_job : job;
1266             result =
1267                queue_handle_job(queue, submit_job, submit->perf_pass_index,
1268                                 &sync_info, false);
1269 
1270             if (result != VK_SUCCESS)
1271                return result;
1272 
1273             first_suspend_job = NULL;
1274          }
1275       }
1276 
1277       /* If the command buffer ends with a barrier we need to consume it now.
1278        *
1279        * FIXME: this will drain all hw queues. Instead, we could use the pending
1280        * barrier state to limit the queues we serialize against.
1281        */
1282       if (cmd_buffer->state.barrier.dst_mask) {
1283          result = queue_submit_noop_job(queue, submit->perf_pass_index,
1284                                         &sync_info, false);
1285          if (result != VK_SUCCESS)
1286             return result;
1287       }
1288    }
1289 
1290    assert(!first_suspend_job);
1291    assert(!current_suspend_job);
1292 
1293    /* Handle signaling now */
1294    if (submit->signal_count > 0) {
1295       /* Finish by submitting a no-op job that synchronizes across all queues.
1296        * This will ensure that the signal semaphores don't get triggered until
1297        * all work on any queue completes. See Vulkan's signal operation order
1298        * requirements.
1299        */
1300       return queue_submit_noop_job(queue, submit->perf_pass_index,
1301                                    &sync_info, true);
1302    }
1303 
1304    return VK_SUCCESS;
1305 }
1306 
1307 VKAPI_ATTR VkResult VKAPI_CALL
1308 v3dv_QueueBindSparse(VkQueue _queue,
1309                      uint32_t bindInfoCount,
1310                      const VkBindSparseInfo *pBindInfo,
1311                      VkFence fence)
1312 {
1313    V3DV_FROM_HANDLE(v3dv_queue, queue, _queue);
1314    return vk_error(queue, VK_ERROR_FEATURE_NOT_PRESENT);
1315 }
1316