xref: /aosp_15_r20/external/mesa3d/src/amd/vulkan/winsys/amdgpu/radv_amdgpu_cs.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2016 Red Hat.
3  * Copyright © 2016 Bas Nieuwenhuizen
4  *
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include <amdgpu.h>
9 #include <assert.h>
10 #include <libsync.h>
11 #include <pthread.h>
12 #include <stdlib.h>
13 #include "drm-uapi/amdgpu_drm.h"
14 
15 #include "util/detect_os.h"
16 #include "util/os_time.h"
17 #include "util/u_memory.h"
18 #include "ac_debug.h"
19 #include "radv_amdgpu_bo.h"
20 #include "radv_amdgpu_cs.h"
21 #include "radv_amdgpu_winsys.h"
22 #include "radv_debug.h"
23 #include "radv_radeon_winsys.h"
24 #include "sid.h"
25 #include "vk_alloc.h"
26 #include "vk_drm_syncobj.h"
27 #include "vk_sync.h"
28 #include "vk_sync_dummy.h"
29 
30 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
31  * codes in the kernel).
32  */
33 #if DETECT_OS_OPENBSD
34 #define ENODATA ENOTSUP
35 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
36 #define ENODATA ECONNREFUSED
37 #endif
38 
39 /* Maximum allowed total number of submitted IBs. */
40 #define RADV_MAX_IBS_PER_SUBMIT 192
41 
42 enum { VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024 };
43 
44 struct radv_amdgpu_ib {
45    struct radeon_winsys_bo *bo; /* NULL when not owned by the current CS object */
46    uint64_t va;
47    unsigned cdw;
48 };
49 
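/* Description of a single IB as it will be passed to the kernel: AMDGPU_IB_FLAG_* bits,
 * GPU virtual address, size in dwords and the target IP block.
 */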
50 struct radv_amdgpu_cs_ib_info {
51    int64_t flags;
52    uint64_t ib_mc_address;
53    uint32_t size;
54    enum amd_ip_type ip_type;
55 };
56 
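/* A winsys command stream: the radeon_cmdbuf base, the BO backing the IB currently being
 * recorded, the list of all IB BOs recorded so far, the BO list handed to the kernel at
 * submit time, and the chaining state.
 */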
57 struct radv_amdgpu_cs {
58    struct radeon_cmdbuf base;
59    struct radv_amdgpu_winsys *ws;
60 
61    struct radv_amdgpu_cs_ib_info ib;
62 
63    struct radeon_winsys_bo *ib_buffer;
64    uint8_t *ib_mapped;
65    unsigned max_num_buffers;
66    unsigned num_buffers;
67    struct drm_amdgpu_bo_list_entry *handles;
68 
69    struct radv_amdgpu_ib *ib_buffers;
70    unsigned num_ib_buffers;
71    unsigned max_num_ib_buffers;
72    unsigned *ib_size_ptr;
73    VkResult status;
74    struct radv_amdgpu_cs *chained_to;
75    bool use_ib;
76    bool is_secondary;
77 
78    int buffer_hash_table[1024];
79    unsigned hw_ip;
80 
81    unsigned num_virtual_buffers;
82    unsigned max_num_virtual_buffers;
83    struct radeon_winsys_bo **virtual_buffers;
84    int *virtual_buffer_hash_table;
85 
86    struct hash_table *annotations;
87 };
88 
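/* Syncobj handles for one direction (wait or signal). The syncobj array stores the
 * syncobj_count binary syncobjs first, followed by the timeline_syncobj_count timeline
 * syncobjs; points[] holds the timeline points for the latter.
 */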
89 struct radv_winsys_sem_counts {
90    uint32_t syncobj_count;
91    uint32_t timeline_syncobj_count;
92    uint32_t *syncobj;
93    uint64_t *points;
94 };
95 
96 struct radv_winsys_sem_info {
97    bool cs_emit_signal;
98    bool cs_emit_wait;
99    struct radv_winsys_sem_counts wait;
100    struct radv_winsys_sem_counts signal;
101 };
102 
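/* Emit a dword without checking the CS size; callers (padding/chaining code) are expected
 * to have already reserved the space.
 */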
103 static void
104 radeon_emit_unchecked(struct radeon_cmdbuf *cs, uint32_t value)
105 {
106    cs->buf[cs->cdw++] = value;
107 }
108 
109 static uint32_t radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring);
110 
111 static inline struct radv_amdgpu_cs *
112 radv_amdgpu_cs(struct radeon_cmdbuf *base)
113 {
114    return (struct radv_amdgpu_cs *)base;
115 }
116 
117 static bool
118 ring_can_use_ib_bos(const struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
119 {
120    return ws->use_ib_bos && (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
121 }
122 
123 struct radv_amdgpu_cs_request {
124    /** Specify HW IP block type to which to send the IB. */
125    unsigned ip_type;
126 
127    /** IP instance index if there are several IPs of the same type. */
128    unsigned ip_instance;
129 
130    /**
131     * Specify ring index of the IP. We could have several rings
132     * in the same IP. E.g. 0 for SDMA0 and 1 for SDMA1.
133     */
134    uint32_t ring;
135 
136    /**
137     * BO list handles used by this request.
138     */
139    struct drm_amdgpu_bo_list_entry *handles;
140    uint32_t num_handles;
141 
142    /** Number of IBs to submit in the field ibs. */
143    uint32_t number_of_ibs;
144 
145    /**
146     * IBs to submit. These IBs will be submitted together as a single entity.
147     */
148    struct radv_amdgpu_cs_ib_info *ibs;
149 
150    /**
151     * The returned sequence number for the command submission
152     */
153    uint64_t seq_no;
154 };
155 
156 static VkResult radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
157                                       struct radv_winsys_sem_info *sem_info);
158 
159 static void
160 radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_fence *fence,
161                              struct radv_amdgpu_cs_request *req)
162 {
163    fence->fence.context = ctx->ctx;
164    fence->fence.ip_type = req->ip_type;
165    fence->fence.ip_instance = req->ip_instance;
166    fence->fence.ring = req->ring;
167    fence->fence.fence = req->seq_no;
168 }
169 
170 static struct radv_amdgpu_cs_ib_info
171 radv_amdgpu_cs_ib_to_info(struct radv_amdgpu_cs *cs, struct radv_amdgpu_ib ib)
172 {
173    struct radv_amdgpu_cs_ib_info info = {
174       .flags = 0,
175       .ip_type = cs->hw_ip,
176       .ib_mc_address = ib.va,
177       .size = ib.cdw,
178    };
179    return info;
180 }
181 
182 static void
183 radv_amdgpu_cs_free_annotation(struct hash_entry *entry)
184 {
185    free(entry->data);
186 }
187 
188 static void
189 radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
190 {
191    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
192 
193    _mesa_hash_table_destroy(cs->annotations, radv_amdgpu_cs_free_annotation);
194 
195    if (cs->ib_buffer)
196       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
197 
198    for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
199       if (!cs->ib_buffers[i].bo)
200          continue;
201 
202       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
203    }
204 
205    free(cs->ib_buffers);
206    free(cs->virtual_buffers);
207    free(cs->virtual_buffer_hash_table);
208    free(cs->handles);
209    free(cs);
210 }
211 
212 static void
213 radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs, enum amd_ip_type ip_type)
214 {
215    for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
216       cs->buffer_hash_table[i] = -1;
217 
218    cs->hw_ip = ip_type;
219 }
220 
221 static enum radeon_bo_domain
222 radv_amdgpu_cs_domain(const struct radeon_winsys *_ws)
223 {
224    const struct radv_amdgpu_winsys *ws = (const struct radv_amdgpu_winsys *)_ws;
225 
226    bool enough_vram = ws->info.all_vram_visible ||
227                       p_atomic_read_relaxed(&ws->allocated_vram_vis) * 2 <= (uint64_t)ws->info.vram_vis_size_kb * 1024;
228 
229    /* Bandwidth should be equivalent to at least PCIe 3.0 x8.
230     * If there is no PCIe info, assume there is enough bandwidth.
231     */
232    bool enough_bandwidth = !ws->info.has_pcie_bandwidth_info || ws->info.pcie_bandwidth_mbps >= 8 * 0.985 * 1024;
233 
234    bool use_sam =
235       (enough_vram && enough_bandwidth && ws->info.has_dedicated_vram && !(ws->perftest & RADV_PERFTEST_NO_SAM)) ||
236       (ws->perftest & RADV_PERFTEST_SAM);
237    return use_sam ? RADEON_DOMAIN_VRAM : RADEON_DOMAIN_GTT;
238 }
239 
240 static VkResult
241 radv_amdgpu_cs_bo_create(struct radv_amdgpu_cs *cs, uint32_t ib_size)
242 {
243    struct radeon_winsys *ws = &cs->ws->base;
244 
245    /* Avoid memcpy from VRAM when a secondary cmdbuf can't always rely on IB2. */
246    const bool can_always_use_ib2 = cs->ws->info.gfx_level >= GFX8 && cs->hw_ip == AMD_IP_GFX;
247    const bool avoid_vram = cs->is_secondary && !can_always_use_ib2;
248    const enum radeon_bo_domain domain = avoid_vram ? RADEON_DOMAIN_GTT : radv_amdgpu_cs_domain(ws);
249    const enum radeon_bo_flag gtt_wc_flag = avoid_vram ? 0 : RADEON_FLAG_GTT_WC;
250    const enum radeon_bo_flag flags =
251       RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | gtt_wc_flag;
252 
253    return ws->buffer_create(ws, ib_size, cs->ws->info.ip[cs->hw_ip].ib_alignment, domain, flags, RADV_BO_PRIORITY_CS, 0,
254                             &cs->ib_buffer);
255 }
256 
257 static VkResult
258 radv_amdgpu_cs_get_new_ib(struct radeon_cmdbuf *_cs, uint32_t ib_size)
259 {
260    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
261    VkResult result;
262 
263    result = radv_amdgpu_cs_bo_create(cs, ib_size);
264    if (result != VK_SUCCESS)
265       return result;
266 
267    cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
268    if (!cs->ib_mapped) {
269       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
270       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
271    }
272 
273    cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
274    cs->base.buf = (uint32_t *)cs->ib_mapped;
275    cs->base.cdw = 0;
276    cs->base.reserved_dw = 0;
277    cs->base.max_dw = ib_size / 4 - 4;
278    cs->ib.size = 0;
279    cs->ib.ip_type = cs->hw_ip;
280 
281    if (cs->use_ib)
282       cs->ib_size_ptr = &cs->ib.size;
283 
284    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
285 
286    return VK_SUCCESS;
287 }
288 
289 static unsigned
290 radv_amdgpu_cs_get_initial_size(struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
291 {
292    const uint32_t ib_alignment = ws->info.ip[ip_type].ib_alignment;
293    assert(util_is_power_of_two_nonzero(ib_alignment));
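   /* Start with 20K dwords (80 KiB), rounded up to the IP's required IB alignment. */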
294    return align(20 * 1024 * 4, ib_alignment);
295 }
296 
297 static struct radeon_cmdbuf *
298 radv_amdgpu_cs_create(struct radeon_winsys *ws, enum amd_ip_type ip_type, bool is_secondary)
299 {
300    struct radv_amdgpu_cs *cs;
301    uint32_t ib_size = radv_amdgpu_cs_get_initial_size(radv_amdgpu_winsys(ws), ip_type);
302 
303    cs = calloc(1, sizeof(struct radv_amdgpu_cs));
304    if (!cs)
305       return NULL;
306 
307    cs->is_secondary = is_secondary;
308    cs->ws = radv_amdgpu_winsys(ws);
309    radv_amdgpu_init_cs(cs, ip_type);
310 
311    cs->use_ib = ring_can_use_ib_bos(cs->ws, ip_type);
312 
313    VkResult result = radv_amdgpu_cs_get_new_ib(&cs->base, ib_size);
314    if (result != VK_SUCCESS) {
315       free(cs);
316       return NULL;
317    }
318 
319    return &cs->base;
320 }
321 
322 static uint32_t
323 get_nop_packet(struct radv_amdgpu_cs *cs)
324 {
325    switch (cs->hw_ip) {
326    case AMDGPU_HW_IP_GFX:
327    case AMDGPU_HW_IP_COMPUTE:
328       return cs->ws->info.gfx_ib_pad_with_type2 ? PKT2_NOP_PAD : PKT3_NOP_PAD;
329    case AMDGPU_HW_IP_DMA:
330       return cs->ws->info.gfx_level == GFX6 ? 0xF0000000 : SDMA_NOP_PAD;
331    case AMDGPU_HW_IP_UVD:
332    case AMDGPU_HW_IP_UVD_ENC:
333       return PKT2_NOP_PAD;
334    case AMDGPU_HW_IP_VCN_DEC:
335       return 0x81FF;
336    case AMDGPU_HW_IP_VCN_ENC:
337       return 0; /* NOPs are illegal in encode, so don't pad */
338    default:
339       unreachable("Unknown IP type");
340    }
341 }
342 
343 static void
344 radv_amdgpu_cs_add_ib_buffer(struct radv_amdgpu_cs *cs, struct radeon_winsys_bo *bo, uint64_t va, uint32_t cdw)
345 {
346    if (cs->num_ib_buffers == cs->max_num_ib_buffers) {
347       unsigned max_num_ib_buffers = MAX2(1, cs->max_num_ib_buffers * 2);
348       struct radv_amdgpu_ib *ib_buffers = realloc(cs->ib_buffers, max_num_ib_buffers * sizeof(*ib_buffers));
349       if (!ib_buffers) {
350          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
351          return;
352       }
353       cs->max_num_ib_buffers = max_num_ib_buffers;
354       cs->ib_buffers = ib_buffers;
355    }
356 
357    cs->ib_buffers[cs->num_ib_buffers].bo = bo;
358    cs->ib_buffers[cs->num_ib_buffers].va = va;
359    cs->ib_buffers[cs->num_ib_buffers++].cdw = cdw;
360 }
361 
362 static void
363 radv_amdgpu_restore_last_ib(struct radv_amdgpu_cs *cs)
364 {
365    struct radv_amdgpu_ib *ib = &cs->ib_buffers[--cs->num_ib_buffers];
366    assert(ib->bo);
367    cs->ib_buffer = ib->bo;
368 }
369 
370 static void
371 radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
372 {
373    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
374 
375    if (cs->status != VK_SUCCESS) {
376       cs->base.cdw = 0;
377       return;
378    }
379 
380    const uint32_t ib_alignment = cs->ws->info.ip[cs->hw_ip].ib_alignment;
381 
382    cs->ws->base.cs_finalize(_cs);
383 
384    uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);
385 
386    /* max that fits in the chain size field. */
387    ib_size = align(MIN2(ib_size, 0xfffff), ib_alignment);
388 
389    VkResult result = radv_amdgpu_cs_bo_create(cs, ib_size);
390 
391    if (result != VK_SUCCESS) {
392       cs->base.cdw = 0;
393       cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
394       radv_amdgpu_restore_last_ib(cs);
395    }
396 
397    cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
398    if (!cs->ib_mapped) {
399       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
400       cs->base.cdw = 0;
401 
402       /* VK_ERROR_MEMORY_MAP_FAILED is not valid for vkEndCommandBuffer. */
403       cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
404       radv_amdgpu_restore_last_ib(cs);
405    }
406 
407    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
408 
409    if (cs->use_ib) {
410       cs->base.buf[cs->base.cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
411       cs->base.buf[cs->base.cdw - 3] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
412       cs->base.buf[cs->base.cdw - 2] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32;
413       cs->base.buf[cs->base.cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1);
414 
415       cs->ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
416    }
417 
418    cs->base.buf = (uint32_t *)cs->ib_mapped;
419    cs->base.cdw = 0;
420    cs->base.reserved_dw = 0;
421    cs->base.max_dw = ib_size / 4 - 4;
422 }
423 
424 static void
425 radv_amdgpu_winsys_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space)
426 {
427    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
428    const enum amd_ip_type ip_type = cs->hw_ip;
429    const uint32_t pad_dw_mask = cs->ws->info.ip[ip_type].ib_pad_dw_mask;
430    const uint32_t unaligned_dw = (cs->base.cdw + leave_dw_space) & pad_dw_mask;
431 
432    if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
433       if (unaligned_dw) {
434          const int remaining = pad_dw_mask + 1 - unaligned_dw;
435 
436          /* Only pad by 1 dword with the type-2 NOP if necessary. */
437          if (remaining == 1 && cs->ws->info.gfx_ib_pad_with_type2) {
438             radeon_emit_unchecked(&cs->base, PKT2_NOP_PAD);
439          } else {
440             /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
441              * packet. The size of the packet body after the header is always count + 1.
442              * If count == -1, there is no packet body. NOP is the only packet that can have
443              * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
444              */
445             radeon_emit_unchecked(&cs->base, PKT3(PKT3_NOP, remaining - 2, 0));
446             cs->base.cdw += remaining - 1;
447          }
448       }
449    } else {
450       /* Don't pad on VCN encode/unified since it has no NOP packet. */
451       if (ip_type == AMDGPU_HW_IP_VCN_ENC)
452          return;
453 
454       /* Don't pad a 0-length UVD IB because of a kernel limitation. */
455       if (ip_type == AMDGPU_HW_IP_UVD && cs->base.cdw == 0)
456          return;
457 
458       const uint32_t nop_packet = get_nop_packet(cs);
459 
460       while (!cs->base.cdw || (cs->base.cdw & pad_dw_mask))
461          radeon_emit_unchecked(&cs->base, nop_packet);
462    }
463 
464    assert(((cs->base.cdw + leave_dw_space) & pad_dw_mask) == 0);
465 }
466 
467 static VkResult
468 radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
469 {
470    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
471 
472    assert(cs->base.cdw <= cs->base.reserved_dw);
473 
474    if (cs->use_ib) {
475       const uint32_t nop_packet = get_nop_packet(cs);
476 
477       /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
478       radv_amdgpu_winsys_cs_pad(_cs, 4);
479 
480       radeon_emit_unchecked(&cs->base, nop_packet);
481       radeon_emit_unchecked(&cs->base, nop_packet);
482       radeon_emit_unchecked(&cs->base, nop_packet);
483       radeon_emit_unchecked(&cs->base, nop_packet);
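      /* These four trailing NOPs reserve the space that radv_amdgpu_cs_chain() (or a later
       * radv_amdgpu_cs_grow()) overwrites with an INDIRECT_BUFFER packet when this CS is
       * chained to another one.
       */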
484 
485       *cs->ib_size_ptr |= cs->base.cdw;
486    } else {
487       radv_amdgpu_winsys_cs_pad(_cs, 0);
488    }
489 
490    /* Append the current (last) IB to the array of IB buffers. */
491    radv_amdgpu_cs_add_ib_buffer(cs, cs->ib_buffer, cs->ib_buffer->va,
492                                 cs->use_ib ? G_3F2_IB_SIZE(*cs->ib_size_ptr) : cs->base.cdw);
493 
494    /* Prevent freeing this BO twice. */
495    cs->ib_buffer = NULL;
496 
497    cs->chained_to = NULL;
498 
499    assert(cs->base.cdw <= cs->base.max_dw + 4);
500 
501    return cs->status;
502 }
503 
504 static void
505 radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs)
506 {
507    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
508    cs->base.cdw = 0;
509    cs->base.reserved_dw = 0;
510    cs->status = VK_SUCCESS;
511 
512    for (unsigned i = 0; i < cs->num_buffers; ++i) {
513       unsigned hash = cs->handles[i].bo_handle & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
514       cs->buffer_hash_table[hash] = -1;
515    }
516 
517    for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
518       unsigned hash = ((uintptr_t)cs->virtual_buffers[i] >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
519       cs->virtual_buffer_hash_table[hash] = -1;
520    }
521 
522    cs->num_buffers = 0;
523    cs->num_virtual_buffers = 0;
524 
525    /* When the CS is finalized and IBs are not allowed, use last IB. */
526    assert(cs->ib_buffer || cs->num_ib_buffers);
527    if (!cs->ib_buffer)
528       radv_amdgpu_restore_last_ib(cs);
529 
530    cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
531 
532    for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
533       if (!cs->ib_buffers[i].bo)
534          continue;
535 
536       cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
537    }
538 
539    cs->num_ib_buffers = 0;
540    cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
541 
542    cs->ib.size = 0;
543 
544    if (cs->use_ib)
545       cs->ib_size_ptr = &cs->ib.size;
546 
547    _mesa_hash_table_destroy(cs->annotations, radv_amdgpu_cs_free_annotation);
548    cs->annotations = NULL;
549 }
550 
551 static bool
552 radv_amdgpu_cs_has_external_ib(const struct radv_amdgpu_cs *cs)
553 {
554    for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
555       if (!cs->ib_buffers[i].bo)
556          return true;
557    }
558 
559    return false;
560 }
561 
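/* Undo a previous radv_amdgpu_cs_chain(): turn the trailing INDIRECT_BUFFER packet back
 * into NOPs so the CS can be submitted on its own again.
 */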
562 static void
563 radv_amdgpu_cs_unchain(struct radeon_cmdbuf *cs)
564 {
565    struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
566 
567    if (!acs->chained_to)
568       return;
569 
570    assert(cs->cdw <= cs->max_dw + 4);
571 
572    acs->chained_to = NULL;
573    cs->buf[cs->cdw - 4] = PKT3_NOP_PAD;
574    cs->buf[cs->cdw - 3] = PKT3_NOP_PAD;
575    cs->buf[cs->cdw - 2] = PKT3_NOP_PAD;
576    cs->buf[cs->cdw - 1] = PKT3_NOP_PAD;
577 }
578 
579 static bool
580 radv_amdgpu_cs_chain(struct radeon_cmdbuf *cs, struct radeon_cmdbuf *next_cs, bool pre_ena)
581 {
582    /* Chains together two CS (command stream) objects by editing
583     * the end of the first CS to add a command that jumps to the
584     * second CS.
585     *
586     * After this, it is enough to submit the first CS to the GPU
587     * and not necessary to submit the second CS because it is already
588     * executed by the first.
589     */
590 
591    struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
592    struct radv_amdgpu_cs *next_acs = radv_amdgpu_cs(next_cs);
593 
594    /* Only some HW IP types have packets that we can use for chaining. */
595    if (!acs->use_ib)
596       return false;
597 
598    /* Do not chain if the next CS has external IBs because it would chain to the newly
599     * created IB instead of the first one.
600     */
601    if (radv_amdgpu_cs_has_external_ib(next_acs))
602       return false;
603 
604    assert(cs->cdw <= cs->max_dw + 4);
605 
606    acs->chained_to = next_acs;
607 
608    cs->buf[cs->cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
609    cs->buf[cs->cdw - 3] = next_acs->ib.ib_mc_address;
610    cs->buf[cs->cdw - 2] = next_acs->ib.ib_mc_address >> 32;
611    cs->buf[cs->cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(pre_ena) | next_acs->ib.size;
612 
613    return true;
614 }
615 
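/* Return the index of the BO handle in cs->handles, or -1 if not present. Uses the
 * direct-mapped buffer_hash_table as a cache and falls back to a linear search,
 * refreshing the cached slot on a hit.
 */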
616 static int
617 radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs, uint32_t bo)
618 {
619    unsigned hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
620    int index = cs->buffer_hash_table[hash];
621 
622    if (index == -1)
623       return -1;
624 
625    if (cs->handles[index].bo_handle == bo)
626       return index;
627 
628    for (unsigned i = 0; i < cs->num_buffers; ++i) {
629       if (cs->handles[i].bo_handle == bo) {
630          cs->buffer_hash_table[hash] = i;
631          return i;
632       }
633    }
634 
635    return -1;
636 }
637 
638 static void
639 radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, uint32_t bo, uint8_t priority)
640 {
641    unsigned hash;
642    int index = radv_amdgpu_cs_find_buffer(cs, bo);
643 
644    if (index != -1)
645       return;
646 
647    if (cs->num_buffers == cs->max_num_buffers) {
648       unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
649       struct drm_amdgpu_bo_list_entry *new_entries =
650          realloc(cs->handles, new_count * sizeof(struct drm_amdgpu_bo_list_entry));
651       if (new_entries) {
652          cs->max_num_buffers = new_count;
653          cs->handles = new_entries;
654       } else {
655          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
656          return;
657       }
658    }
659 
660    cs->handles[cs->num_buffers].bo_handle = bo;
661    cs->handles[cs->num_buffers].bo_priority = priority;
662 
663    hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
664    cs->buffer_hash_table[hash] = cs->num_buffers;
665 
666    ++cs->num_buffers;
667 }
668 
669 static void
670 radv_amdgpu_cs_add_virtual_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo)
671 {
672    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
673    unsigned hash = ((uintptr_t)bo >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
674 
675    if (!cs->virtual_buffer_hash_table) {
676       int *virtual_buffer_hash_table = malloc(VIRTUAL_BUFFER_HASH_TABLE_SIZE * sizeof(int));
677       if (!virtual_buffer_hash_table) {
678          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
679          return;
680       }
681       cs->virtual_buffer_hash_table = virtual_buffer_hash_table;
682 
683       for (int i = 0; i < VIRTUAL_BUFFER_HASH_TABLE_SIZE; ++i)
684          cs->virtual_buffer_hash_table[i] = -1;
685    }
686 
687    if (cs->virtual_buffer_hash_table[hash] >= 0) {
688       int idx = cs->virtual_buffer_hash_table[hash];
689       if (cs->virtual_buffers[idx] == bo) {
690          return;
691       }
692       for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
693          if (cs->virtual_buffers[i] == bo) {
694             cs->virtual_buffer_hash_table[hash] = i;
695             return;
696          }
697       }
698    }
699 
700    if (cs->max_num_virtual_buffers <= cs->num_virtual_buffers) {
701       unsigned max_num_virtual_buffers = MAX2(2, cs->max_num_virtual_buffers * 2);
702       struct radeon_winsys_bo **virtual_buffers =
703          realloc(cs->virtual_buffers, sizeof(struct radeon_winsys_bo *) * max_num_virtual_buffers);
704       if (!virtual_buffers) {
705          cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
706          return;
707       }
708       cs->max_num_virtual_buffers = max_num_virtual_buffers;
709       cs->virtual_buffers = virtual_buffers;
710    }
711 
712    cs->virtual_buffers[cs->num_virtual_buffers] = bo;
713 
714    cs->virtual_buffer_hash_table[hash] = cs->num_virtual_buffers;
715    ++cs->num_virtual_buffers;
716 }
717 
718 static void
719 radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *_bo)
720 {
721    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
722    struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
723 
724    if (cs->status != VK_SUCCESS)
725       return;
726 
727    if (bo->is_virtual) {
728       radv_amdgpu_cs_add_virtual_buffer(_cs, _bo);
729       return;
730    }
731 
732    radv_amdgpu_cs_add_buffer_internal(cs, bo->bo_handle, bo->priority);
733 }
734 
735 static void
736 radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child, bool allow_ib2)
737 {
738    struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
739    struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
740    struct radv_amdgpu_winsys *ws = parent->ws;
741    const bool use_ib2 = parent->use_ib && !parent->is_secondary && allow_ib2 && parent->hw_ip == AMD_IP_GFX;
742 
743    if (parent->status != VK_SUCCESS || child->status != VK_SUCCESS)
744       return;
745 
746    for (unsigned i = 0; i < child->num_buffers; ++i) {
747       radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i].bo_handle, child->handles[i].bo_priority);
748    }
749 
750    for (unsigned i = 0; i < child->num_virtual_buffers; ++i) {
751       radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]);
752    }
753 
754    if (use_ib2) {
755       if (parent->base.cdw + 4 > parent->base.max_dw)
756          radv_amdgpu_cs_grow(&parent->base, 4);
757 
758       parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + 4);
759 
760       /* Not setting the CHAIN bit will launch an IB2. */
761       radeon_emit(&parent->base, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
762       radeon_emit(&parent->base, child->ib.ib_mc_address);
763       radeon_emit(&parent->base, child->ib.ib_mc_address >> 32);
764       radeon_emit(&parent->base, child->ib.size);
765    } else {
766       assert(parent->use_ib == child->use_ib);
767 
768       /* Grow the current CS and copy the contents of the secondary CS. */
769       for (unsigned i = 0; i < child->num_ib_buffers; i++) {
770          struct radv_amdgpu_ib *ib = &child->ib_buffers[i];
771          uint32_t cdw = ib->cdw;
772          uint8_t *mapped;
773 
774          /* Do not copy the original chain link for IBs. */
775          if (child->use_ib)
776             cdw -= 4;
777 
778          assert(ib->bo);
779 
780          if (parent->base.cdw + cdw > parent->base.max_dw)
781             radv_amdgpu_cs_grow(&parent->base, cdw);
782 
783          parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + cdw);
784 
785          mapped = radv_buffer_map(&ws->base, ib->bo);
786          if (!mapped) {
787             parent->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
788             return;
789          }
790 
791          memcpy(parent->base.buf + parent->base.cdw, mapped, 4 * cdw);
792          parent->base.cdw += cdw;
793       }
794    }
795 }
796 
797 static void
798 radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo, uint64_t va, const uint32_t cdw,
799                           const bool predicate)
800 {
801    struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
802    const uint64_t ib_va = bo ? bo->va : va;
803 
804    if (cs->status != VK_SUCCESS)
805       return;
806 
807    assert(ib_va && ib_va % cs->ws->info.ip[cs->hw_ip].ib_alignment == 0);
808 
809    if (cs->hw_ip == AMD_IP_GFX && cs->use_ib) {
810       radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER, 2, predicate));
811       radeon_emit(&cs->base, ib_va);
812       radeon_emit(&cs->base, ib_va >> 32);
813       radeon_emit(&cs->base, cdw);
814    } else {
815       const uint32_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
816       VkResult result;
817 
818       /* Finalize the current CS without chaining to execute the external IB. */
819       radv_amdgpu_cs_finalize(_cs);
820 
821       radv_amdgpu_cs_add_ib_buffer(cs, bo, ib_va, cdw);
822 
823       /* Start a new CS which isn't chained to any previous CS. */
824       result = radv_amdgpu_cs_get_new_ib(_cs, ib_size);
825       if (result != VK_SUCCESS) {
826          cs->base.cdw = 0;
827          cs->status = result;
828       }
829    }
830 }
831 
832 static unsigned
833 radv_amdgpu_count_cs_bo(struct radv_amdgpu_cs *start_cs)
834 {
835    unsigned num_bo = 0;
836 
837    for (struct radv_amdgpu_cs *cs = start_cs; cs; cs = cs->chained_to) {
838       num_bo += cs->num_buffers;
839       for (unsigned j = 0; j < cs->num_virtual_buffers; ++j)
840          num_bo += radv_amdgpu_winsys_bo(cs->virtual_buffers[j])->bo_count;
841    }
842 
843    return num_bo;
844 }
845 
846 static unsigned
847 radv_amdgpu_count_cs_array_bo(struct radeon_cmdbuf **cs_array, unsigned num_cs)
848 {
849    unsigned num_bo = 0;
850 
851    for (unsigned i = 0; i < num_cs; ++i) {
852       num_bo += radv_amdgpu_count_cs_bo(radv_amdgpu_cs(cs_array[i]));
853    }
854 
855    return num_bo;
856 }
857 
858 static unsigned
859 radv_amdgpu_add_cs_to_bo_list(struct radv_amdgpu_cs *cs, struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
860 {
861    if (!cs->num_buffers)
862       return num_handles;
863 
864    if (num_handles == 0 && !cs->num_virtual_buffers) {
865       memcpy(handles, cs->handles, cs->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
866       return cs->num_buffers;
867    }
868 
869    int unique_bo_so_far = num_handles;
870    for (unsigned j = 0; j < cs->num_buffers; ++j) {
871       bool found = false;
872       for (unsigned k = 0; k < unique_bo_so_far; ++k) {
873          if (handles[k].bo_handle == cs->handles[j].bo_handle) {
874             found = true;
875             break;
876          }
877       }
878       if (!found) {
879          handles[num_handles] = cs->handles[j];
880          ++num_handles;
881       }
882    }
883    for (unsigned j = 0; j < cs->num_virtual_buffers; ++j) {
884       struct radv_amdgpu_winsys_bo *virtual_bo = radv_amdgpu_winsys_bo(cs->virtual_buffers[j]);
885       u_rwlock_rdlock(&virtual_bo->lock);
886       for (unsigned k = 0; k < virtual_bo->bo_count; ++k) {
887          struct radv_amdgpu_winsys_bo *bo = virtual_bo->bos[k];
888          bool found = false;
889          for (unsigned m = 0; m < num_handles; ++m) {
890             if (handles[m].bo_handle == bo->bo_handle) {
891                found = true;
892                break;
893             }
894          }
895          if (!found) {
896             handles[num_handles].bo_handle = bo->bo_handle;
897             handles[num_handles].bo_priority = bo->priority;
898             ++num_handles;
899          }
900       }
901       u_rwlock_rdunlock(&virtual_bo->lock);
902    }
903 
904    return num_handles;
905 }
906 
907 static unsigned
908 radv_amdgpu_add_cs_array_to_bo_list(struct radeon_cmdbuf **cs_array, unsigned num_cs,
909                                     struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
910 {
911    for (unsigned i = 0; i < num_cs; ++i) {
912       for (struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]); cs; cs = cs->chained_to) {
913          num_handles = radv_amdgpu_add_cs_to_bo_list(cs, handles, num_handles);
914       }
915    }
916 
917    return num_handles;
918 }
919 
920 static unsigned
921 radv_amdgpu_copy_global_bo_list(struct radv_amdgpu_winsys *ws, struct drm_amdgpu_bo_list_entry *handles)
922 {
923    for (uint32_t i = 0; i < ws->global_bo_list.count; i++) {
924       handles[i].bo_handle = ws->global_bo_list.bos[i]->bo_handle;
925       handles[i].bo_priority = ws->global_bo_list.bos[i]->priority;
926    }
927 
928    return ws->global_bo_list.count;
929 }
930 
931 static VkResult
932 radv_amdgpu_get_bo_list(struct radv_amdgpu_winsys *ws, struct radeon_cmdbuf **cs_array, unsigned count,
933                         struct radeon_cmdbuf **initial_preamble_array, unsigned num_initial_preambles,
934                         struct radeon_cmdbuf **continue_preamble_array, unsigned num_continue_preambles,
935                         struct radeon_cmdbuf **postamble_array, unsigned num_postambles, unsigned *rnum_handles,
936                         struct drm_amdgpu_bo_list_entry **rhandles)
937 {
938    struct drm_amdgpu_bo_list_entry *handles = NULL;
939    unsigned num_handles = 0;
940 
941    if (ws->debug_all_bos) {
942       handles = malloc(sizeof(handles[0]) * ws->global_bo_list.count);
943       if (!handles)
944          return VK_ERROR_OUT_OF_HOST_MEMORY;
945 
946       num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
947    } else if (count == 1 && !num_initial_preambles && !num_continue_preambles && !num_postambles &&
948               !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers && !radv_amdgpu_cs(cs_array[0])->chained_to &&
949               !ws->global_bo_list.count) {
950       struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)cs_array[0];
951       if (cs->num_buffers == 0)
952          return VK_SUCCESS;
953 
954       handles = malloc(sizeof(handles[0]) * cs->num_buffers);
955       if (!handles)
956          return VK_ERROR_OUT_OF_HOST_MEMORY;
957 
958       memcpy(handles, cs->handles, sizeof(handles[0]) * cs->num_buffers);
959       num_handles = cs->num_buffers;
960    } else {
961       unsigned total_buffer_count = ws->global_bo_list.count;
962       total_buffer_count += radv_amdgpu_count_cs_array_bo(cs_array, count);
963       total_buffer_count += radv_amdgpu_count_cs_array_bo(initial_preamble_array, num_initial_preambles);
964       total_buffer_count += radv_amdgpu_count_cs_array_bo(continue_preamble_array, num_continue_preambles);
965       total_buffer_count += radv_amdgpu_count_cs_array_bo(postamble_array, num_postambles);
966 
967       if (total_buffer_count == 0)
968          return VK_SUCCESS;
969 
970       handles = malloc(sizeof(handles[0]) * total_buffer_count);
971       if (!handles)
972          return VK_ERROR_OUT_OF_HOST_MEMORY;
973 
974       num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
975       num_handles = radv_amdgpu_add_cs_array_to_bo_list(cs_array, count, handles, num_handles);
976       num_handles =
977          radv_amdgpu_add_cs_array_to_bo_list(initial_preamble_array, num_initial_preambles, handles, num_handles);
978       num_handles =
979          radv_amdgpu_add_cs_array_to_bo_list(continue_preamble_array, num_continue_preambles, handles, num_handles);
980       num_handles = radv_amdgpu_add_cs_array_to_bo_list(postamble_array, num_postambles, handles, num_handles);
981    }
982 
983    *rhandles = handles;
984    *rnum_handles = num_handles;
985 
986    return VK_SUCCESS;
987 }
988 
989 static void
990 radv_assign_last_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request)
991 {
992    radv_amdgpu_request_to_fence(ctx, &ctx->last_submission[request->ip_type][request->ring], request);
993 }
994 
995 static unsigned
996 radv_amdgpu_get_num_ibs_per_cs(const struct radv_amdgpu_cs *cs)
997 {
998    unsigned num_ibs = 0;
999 
1000    if (cs->use_ib) {
1001       unsigned num_external_ibs = 0;
1002 
1003       for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
1004          if (!cs->ib_buffers[i].bo)
1005             num_external_ibs++;
1006       }
1007 
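      /* Each external IB breaks the chain: N external IBs give up to N + 1 chained segments
       * owned by this CS plus the N external IBs themselves, i.e. at most 2N + 1 IBs.
       */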
1008       num_ibs = num_external_ibs * 2 + 1;
1009    } else {
1010       num_ibs = cs->num_ib_buffers;
1011    }
1012 
1013    return num_ibs;
1014 }
1015 
1016 static unsigned
1017 radv_amdgpu_count_ibs(struct radeon_cmdbuf **cs_array, unsigned cs_count, unsigned initial_preamble_count,
1018                       unsigned continue_preamble_count, unsigned postamble_count)
1019 {
1020    unsigned num_ibs = 0;
1021 
1022    for (unsigned i = 0; i < cs_count; i++) {
1023       struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
1024 
1025       num_ibs += radv_amdgpu_get_num_ibs_per_cs(cs);
1026    }
1027 
1028    return MAX2(initial_preamble_count, continue_preamble_count) + num_ibs + postamble_count;
1029 }
1030 
1031 static VkResult
1032 radv_amdgpu_winsys_cs_submit_internal(struct radv_amdgpu_ctx *ctx, int queue_idx, struct radv_winsys_sem_info *sem_info,
1033                                       struct radeon_cmdbuf **cs_array, unsigned cs_count,
1034                                       struct radeon_cmdbuf **initial_preamble_cs, unsigned initial_preamble_count,
1035                                       struct radeon_cmdbuf **continue_preamble_cs, unsigned continue_preamble_count,
1036                                       struct radeon_cmdbuf **postamble_cs, unsigned postamble_count,
1037                                       bool uses_shadow_regs)
1038 {
1039    VkResult result;
1040 
1041    /* The last CS is "the gang leader"; its IP type determines which fence to signal. */
1042    struct radv_amdgpu_cs *last_cs = radv_amdgpu_cs(cs_array[cs_count - 1]);
1043    struct radv_amdgpu_winsys *ws = last_cs->ws;
1044 
1045    const unsigned num_ibs =
1046       radv_amdgpu_count_ibs(cs_array, cs_count, initial_preamble_count, continue_preamble_count, postamble_count);
1047    const unsigned ib_array_size = MIN2(RADV_MAX_IBS_PER_SUBMIT, num_ibs);
1048 
1049    STACK_ARRAY(struct radv_amdgpu_cs_ib_info, ibs, ib_array_size);
1050 
1051    struct drm_amdgpu_bo_list_entry *handles = NULL;
1052    unsigned num_handles = 0;
1053 
1054    u_rwlock_rdlock(&ws->global_bo_list.lock);
1055 
1056    result = radv_amdgpu_get_bo_list(ws, &cs_array[0], cs_count, initial_preamble_cs, initial_preamble_count,
1057                                     continue_preamble_cs, continue_preamble_count, postamble_cs, postamble_count,
1058                                     &num_handles, &handles);
1059    if (result != VK_SUCCESS)
1060       goto fail;
1061 
1062    /* Configure the CS request. */
1063    const uint32_t *max_ib_per_ip = ws->info.max_submitted_ibs;
1064    struct radv_amdgpu_cs_request request = {
1065       .ip_type = last_cs->hw_ip,
1066       .ip_instance = 0,
1067       .ring = queue_idx,
1068       .handles = handles,
1069       .num_handles = num_handles,
1070       .ibs = ibs,
1071       .number_of_ibs = 0, /* set below */
1072    };
1073 
1074    for (unsigned cs_idx = 0, cs_ib_idx = 0; cs_idx < cs_count;) {
1075       struct radeon_cmdbuf **preambles = cs_idx ? continue_preamble_cs : initial_preamble_cs;
1076       const unsigned preamble_count = cs_idx ? continue_preamble_count : initial_preamble_count;
1077       const unsigned ib_per_submit = RADV_MAX_IBS_PER_SUBMIT - preamble_count - postamble_count;
1078       unsigned num_submitted_ibs = 0;
1079       unsigned ibs_per_ip[AMD_NUM_IP_TYPES] = {0};
1080 
1081       /* Copy preambles to the submission. */
1082       for (unsigned i = 0; i < preamble_count; ++i) {
1083          /* Assume that the full preamble fits into 1 IB. */
1084          struct radv_amdgpu_cs *cs = radv_amdgpu_cs(preambles[i]);
1085          struct radv_amdgpu_cs_ib_info ib;
1086 
1087          assert(cs->num_ib_buffers == 1);
1088          ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1089 
1090          ibs[num_submitted_ibs++] = ib;
1091          ibs_per_ip[cs->hw_ip]++;
1092       }
1093 
1094       for (unsigned i = 0; i < ib_per_submit && cs_idx < cs_count; ++i) {
1095          struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[cs_idx]);
1096          struct radv_amdgpu_cs_ib_info ib;
1097 
1098          if (cs_ib_idx == 0) {
1099             /* Make sure the whole CS fits into the same submission. */
1100             unsigned cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(cs);
1101             if (i + cs_num_ib > ib_per_submit || ibs_per_ip[cs->hw_ip] + cs_num_ib > max_ib_per_ip[cs->hw_ip])
1102                break;
1103 
1104             if (cs->hw_ip != request.ip_type) {
1105                /* Found a "follower" CS in a gang submission.
1106                 * Make sure to submit this together with its "leader", the next CS.
1107                 * We rely on the caller to order each "follower" before its "leader."
1108                 */
1109                assert(cs_idx != cs_count - 1);
1110                struct radv_amdgpu_cs *next_cs = radv_amdgpu_cs(cs_array[cs_idx + 1]);
1111                assert(next_cs->hw_ip == request.ip_type);
1112                unsigned next_cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(next_cs);
1113                if (i + cs_num_ib + next_cs_num_ib > ib_per_submit ||
1114                    ibs_per_ip[next_cs->hw_ip] + next_cs_num_ib > max_ib_per_ip[next_cs->hw_ip])
1115                   break;
1116             }
1117          }
1118 
1119          /* When IBs are used, we only need to submit the main IB of this CS, because everything
1120           * else is chained to the first IB. Except when the CS has external IBs because they need
1121           * to be submitted separately. Otherwise we must submit all IBs in the ib_buffers array.
1122           */
1123          if (cs->use_ib) {
1124             if (radv_amdgpu_cs_has_external_ib(cs)) {
1125                const unsigned cur_ib_idx = cs_ib_idx;
1126 
1127                ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[cs_ib_idx++]);
1128 
1129                /* Loop until the next external IB is found. */
1130                while (cs->ib_buffers[cur_ib_idx].bo && cs->ib_buffers[cs_ib_idx].bo && cs_ib_idx < cs->num_ib_buffers) {
1131                   cs_ib_idx++;
1132                }
1133 
1134                if (cs_ib_idx == cs->num_ib_buffers) {
1135                   cs_idx++;
1136                   cs_ib_idx = 0;
1137                }
1138             } else {
1139                ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1140                cs_idx++;
1141             }
1142          } else {
1143             assert(cs_ib_idx < cs->num_ib_buffers);
1144             ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[cs_ib_idx++]);
1145 
1146             if (cs_ib_idx == cs->num_ib_buffers) {
1147                cs_idx++;
1148                cs_ib_idx = 0;
1149             }
1150          }
1151 
1152          if (uses_shadow_regs && ib.ip_type == AMDGPU_HW_IP_GFX)
1153             ib.flags |= AMDGPU_IB_FLAG_PREEMPT;
1154 
1155          assert(num_submitted_ibs < ib_array_size);
1156          ibs[num_submitted_ibs++] = ib;
1157          ibs_per_ip[cs->hw_ip]++;
1158       }
1159 
1160       assert(num_submitted_ibs > preamble_count);
1161 
1162       /* Copy postambles to the submission. */
1163       for (unsigned i = 0; i < postamble_count; ++i) {
1164          /* Assume that the full postamble fits into 1 IB. */
1165          struct radv_amdgpu_cs *cs = radv_amdgpu_cs(postamble_cs[i]);
1166          struct radv_amdgpu_cs_ib_info ib;
1167 
1168          assert(cs->num_ib_buffers == 1);
1169          ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1170 
1171          ibs[num_submitted_ibs++] = ib;
1172          ibs_per_ip[cs->hw_ip]++;
1173       }
1174 
1175       /* Submit the CS. */
1176       request.number_of_ibs = num_submitted_ibs;
1177       result = radv_amdgpu_cs_submit(ctx, &request, sem_info);
1178       if (result != VK_SUCCESS)
1179          goto fail;
1180    }
1181 
1182    free(request.handles);
1183 
1184    if (result != VK_SUCCESS)
1185       goto fail;
1186 
1187    radv_assign_last_submit(ctx, &request);
1188 
1189 fail:
1190    u_rwlock_rdunlock(&ws->global_bo_list.lock);
1191    STACK_ARRAY_FINISH(ibs);
1192    return result;
1193 }
1194 
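/* Handle a submission with no command buffers: only forward the wait/signal semaphores
 * through the per-queue syncobj without submitting any IBs to the kernel.
 */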
1195 static VkResult
1196 radv_amdgpu_cs_submit_zero(struct radv_amdgpu_ctx *ctx, enum amd_ip_type ip_type, int queue_idx,
1197                            struct radv_winsys_sem_info *sem_info)
1198 {
1199    unsigned hw_ip = ip_type;
1200    unsigned queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, hw_ip, queue_idx);
1201    int ret;
1202 
1203    if (!queue_syncobj)
1204       return VK_ERROR_OUT_OF_HOST_MEMORY;
1205 
1206    if (sem_info->wait.syncobj_count || sem_info->wait.timeline_syncobj_count) {
1207       int fd;
1208       ret = amdgpu_cs_syncobj_export_sync_file(ctx->ws->dev, queue_syncobj, &fd);
1209       if (ret < 0)
1210          return VK_ERROR_DEVICE_LOST;
1211 
1212       for (unsigned i = 0; i < sem_info->wait.syncobj_count; ++i) {
1213          int fd2;
1214          ret = amdgpu_cs_syncobj_export_sync_file(ctx->ws->dev, sem_info->wait.syncobj[i], &fd2);
1215          if (ret < 0) {
1216             close(fd);
1217             return VK_ERROR_DEVICE_LOST;
1218          }
1219 
1220          sync_accumulate("radv", &fd, fd2);
1221          close(fd2);
1222       }
1223       for (unsigned i = 0; i < sem_info->wait.timeline_syncobj_count; ++i) {
1224          int fd2;
1225          ret = amdgpu_cs_syncobj_export_sync_file2(
1226             ctx->ws->dev, sem_info->wait.syncobj[i + sem_info->wait.syncobj_count], sem_info->wait.points[i], 0, &fd2);
1227          if (ret < 0) {
1228             /* This works around a kernel bug where the fence isn't copied if it is already
1229              * signalled. Since it is already signalled it is totally fine to not wait on it.
1230              *
1231              * kernel patch: https://patchwork.freedesktop.org/patch/465583/ */
1232             uint64_t point;
1233             ret = amdgpu_cs_syncobj_query2(ctx->ws->dev, &sem_info->wait.syncobj[i + sem_info->wait.syncobj_count],
1234                                            &point, 1, 0);
1235             if (!ret && point >= sem_info->wait.points[i])
1236                continue;
1237 
1238             close(fd);
1239             return VK_ERROR_DEVICE_LOST;
1240          }
1241 
1242          sync_accumulate("radv", &fd, fd2);
1243          close(fd2);
1244       }
1245       ret = amdgpu_cs_syncobj_import_sync_file(ctx->ws->dev, queue_syncobj, fd);
1246       close(fd);
1247       if (ret < 0)
1248          return VK_ERROR_DEVICE_LOST;
1249 
1250       ctx->queue_syncobj_wait[hw_ip][queue_idx] = true;
1251    }
1252 
1253    for (unsigned i = 0; i < sem_info->signal.syncobj_count; ++i) {
1254       uint32_t dst_handle = sem_info->signal.syncobj[i];
1255       uint32_t src_handle = queue_syncobj;
1256 
1257       if (ctx->ws->info.has_timeline_syncobj) {
1258          ret = amdgpu_cs_syncobj_transfer(ctx->ws->dev, dst_handle, 0, src_handle, 0, 0);
1259          if (ret < 0)
1260             return VK_ERROR_DEVICE_LOST;
1261       } else {
1262          int fd;
1263          ret = amdgpu_cs_syncobj_export_sync_file(ctx->ws->dev, src_handle, &fd);
1264          if (ret < 0)
1265             return VK_ERROR_DEVICE_LOST;
1266 
1267          ret = amdgpu_cs_syncobj_import_sync_file(ctx->ws->dev, dst_handle, fd);
1268          close(fd);
1269          if (ret < 0)
1270             return VK_ERROR_DEVICE_LOST;
1271       }
1272    }
1273    for (unsigned i = 0; i < sem_info->signal.timeline_syncobj_count; ++i) {
1274       ret = amdgpu_cs_syncobj_transfer(ctx->ws->dev, sem_info->signal.syncobj[i + sem_info->signal.syncobj_count],
1275                                        sem_info->signal.points[i], queue_syncobj, 0, 0);
1276       if (ret < 0)
1277          return VK_ERROR_DEVICE_LOST;
1278    }
1279    return VK_SUCCESS;
1280 }
1281 
1282 static VkResult
1283 radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx, const struct radv_winsys_submit_info *submit,
1284                              uint32_t wait_count, const struct vk_sync_wait *waits, uint32_t signal_count,
1285                              const struct vk_sync_signal *signals)
1286 {
1287    struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
1288    struct radv_amdgpu_winsys *ws = ctx->ws;
1289    VkResult result;
1290    unsigned wait_idx = 0, signal_idx = 0;
1291 
1292    STACK_ARRAY(uint64_t, wait_points, wait_count);
1293    STACK_ARRAY(uint32_t, wait_syncobj, wait_count);
1294    STACK_ARRAY(uint64_t, signal_points, signal_count);
1295    STACK_ARRAY(uint32_t, signal_syncobj, signal_count);
1296 
1297    if (!wait_points || !wait_syncobj || !signal_points || !signal_syncobj) {
1298       result = VK_ERROR_OUT_OF_HOST_MEMORY;
1299       goto out;
1300    }
1301 
1302    for (uint32_t i = 0; i < wait_count; ++i) {
1303       if (waits[i].sync->type == &vk_sync_dummy_type)
1304          continue;
1305 
1306       assert(waits[i].sync->type == &ws->syncobj_sync_type);
1307       wait_syncobj[wait_idx] = ((struct vk_drm_syncobj *)waits[i].sync)->syncobj;
1308       wait_points[wait_idx] = waits[i].wait_value;
1309       ++wait_idx;
1310    }
1311 
1312    for (uint32_t i = 0; i < signal_count; ++i) {
1313       if (signals[i].sync->type == &vk_sync_dummy_type)
1314          continue;
1315 
1316       assert(signals[i].sync->type == &ws->syncobj_sync_type);
1317       signal_syncobj[signal_idx] = ((struct vk_drm_syncobj *)signals[i].sync)->syncobj;
1318       signal_points[signal_idx] = signals[i].signal_value;
1319       ++signal_idx;
1320    }
1321 
1322    assert(signal_idx <= signal_count);
1323    assert(wait_idx <= wait_count);
1324 
1325    const uint32_t wait_timeline_syncobj_count =
1326       (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? wait_idx : 0;
1327    const uint32_t signal_timeline_syncobj_count =
1328       (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? signal_idx : 0;
1329 
1330    struct radv_winsys_sem_info sem_info = {
1331       .wait =
1332          {
1333             .points = wait_points,
1334             .syncobj = wait_syncobj,
1335             .timeline_syncobj_count = wait_timeline_syncobj_count,
1336             .syncobj_count = wait_idx - wait_timeline_syncobj_count,
1337          },
1338       .signal =
1339          {
1340             .points = signal_points,
1341             .syncobj = signal_syncobj,
1342             .timeline_syncobj_count = signal_timeline_syncobj_count,
1343             .syncobj_count = signal_idx - signal_timeline_syncobj_count,
1344          },
1345       .cs_emit_wait = true,
1346       .cs_emit_signal = true,
1347    };
1348 
1349    if (!submit->cs_count) {
1350       result = radv_amdgpu_cs_submit_zero(ctx, submit->ip_type, submit->queue_index, &sem_info);
1351    } else {
1352       result = radv_amdgpu_winsys_cs_submit_internal(
1353          ctx, submit->queue_index, &sem_info, submit->cs_array, submit->cs_count, submit->initial_preamble_cs,
1354          submit->initial_preamble_count, submit->continue_preamble_cs, submit->continue_preamble_count,
1355          submit->postamble_cs, submit->postamble_count, submit->uses_shadow_regs);
1356    }
1357 
1358 out:
1359    STACK_ARRAY_FINISH(wait_points);
1360    STACK_ARRAY_FINISH(wait_syncobj);
1361    STACK_ARRAY_FINISH(signal_points);
1362    STACK_ARRAY_FINISH(signal_syncobj);
1363    return result;
1364 }
1365 
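/* Translate a GPU VA into a CPU pointer for the IB parser: first check the BO log for
 * use-after-free, then search this CS's IB BOs and finally the global BO list, mapping
 * the BO that contains the address.
 */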
1366 static void
1367 radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr, struct ac_addr_info *info)
1368 {
1369    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1370 
1371    memset(info, 0, sizeof(struct ac_addr_info));
1372 
1373    if (cs->ws->debug_log_bos) {
1374       u_rwlock_rdlock(&cs->ws->log_bo_list_lock);
1375       list_for_each_entry_rev (struct radv_amdgpu_winsys_bo_log, bo_log, &cs->ws->log_bo_list, list) {
1376          if (addr >= bo_log->va && addr - bo_log->va < bo_log->size) {
1377             info->use_after_free = bo_log->destroyed;
1378             break;
1379          }
1380       }
1381       u_rwlock_rdunlock(&cs->ws->log_bo_list_lock);
1382    }
1383 
1384    if (info->use_after_free)
1385       return;
1386 
1387    info->valid = !cs->ws->debug_all_bos;
1388 
1389    for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
1390       struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1391       struct radv_amdgpu_winsys_bo *bo = (struct radv_amdgpu_winsys_bo *)ib->bo;
1392 
1393       if (addr >= bo->base.va && addr - bo->base.va < bo->base.size) {
1394          void *map = radv_buffer_map(&cs->ws->base, &bo->base);
1395          if (map) {
1396             info->cpu_addr = (char *)map + (addr - bo->base.va);
1397             info->valid = true;
1398             return;
1399          }
1400       }
1401    }
1402    u_rwlock_rdlock(&cs->ws->global_bo_list.lock);
1403    for (uint32_t i = 0; i < cs->ws->global_bo_list.count; i++) {
1404       struct radv_amdgpu_winsys_bo *bo = cs->ws->global_bo_list.bos[i];
1405       if (addr >= bo->base.va && addr - bo->base.va < bo->base.size) {
1406          void *map = radv_buffer_map(&cs->ws->base, &bo->base);
1407          if (map) {
1408             u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1409             info->valid = true;
1410             info->cpu_addr = (char *)map + (addr - bo->base.va);
1411             return;
1412          }
1413       }
1414    }
1415    u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1416 
1417    return;
1418 }
1419 
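/* Dump the command stream: either disassemble the IBs (RADV_CS_DUMP_TYPE_IBS) or collect
 * context-roll statistics. With IB chaining and no external IBs, only the first chunk is handed
 * to the parser, which follows the chain through the address callback; otherwise every owned
 * chunk is mapped and processed individually.
 */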
1420 static void
1421 radv_amdgpu_winsys_cs_dump(struct radeon_cmdbuf *_cs, FILE *file, const int *trace_ids, int trace_id_count,
1422                            enum radv_cs_dump_type type)
1423 {
1424    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1425    struct radv_amdgpu_winsys *ws = cs->ws;
1426 
1427    if (cs->use_ib && !radv_amdgpu_cs_has_external_ib(cs)) {
1428       struct radv_amdgpu_cs_ib_info ib_info = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1429 
1430       struct ac_addr_info addr_info;
1431       radv_amdgpu_winsys_get_cpu_addr(cs, ib_info.ib_mc_address, &addr_info);
1432       assert(addr_info.cpu_addr);
1433 
1434       if (type == RADV_CS_DUMP_TYPE_IBS) {
1435          struct ac_ib_parser ib_parser = {
1436             .f = file,
1437             .ib = addr_info.cpu_addr,
1438             .num_dw = cs->ib_buffers[0].cdw,
1439             .trace_ids = trace_ids,
1440             .trace_id_count = trace_id_count,
1441             .gfx_level = ws->info.gfx_level,
1442             .family = ws->info.family,
1443             .ip_type = cs->hw_ip,
1444             .addr_callback = radv_amdgpu_winsys_get_cpu_addr,
1445             .addr_callback_data = cs,
1446             .annotations = cs->annotations,
1447          };
1448 
1449          ac_parse_ib(&ib_parser, "main IB");
1450       } else {
1451          uint32_t *ib_dw = addr_info.cpu_addr;
1452          ac_gather_context_rolls(file, &ib_dw, &cs->ib_buffers[0].cdw, 1, cs->annotations, &ws->info);
1453       }
1454    } else {
1455       uint32_t **ibs = type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t *)) : NULL;
1456       uint32_t *ib_dw_sizes =
1457          type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t)) : NULL;
1458 
1459       for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
1460          struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1461          char name[64];
1462          void *mapped;
1463 
1464          if (!ib->bo) {
1465             fprintf(file, "Chunk %d isn't owned by this CS.\n\n", i);
1466             continue;
1467          }
1468 
1469          mapped = radv_buffer_map(&ws->base, ib->bo);
1470          if (!mapped)
1471             continue;
1472 
1473          if (cs->num_ib_buffers > 1) {
1474             snprintf(name, sizeof(name), "main IB (chunk %d)", i);
1475          } else {
1476             snprintf(name, sizeof(name), "main IB");
1477          }
1478 
1479          if (type == RADV_CS_DUMP_TYPE_IBS) {
1480             struct ac_ib_parser ib_parser = {
1481                .f = file,
1482                .ib = mapped,
1483                .num_dw = ib->cdw,
1484                .trace_ids = trace_ids,
1485                .trace_id_count = trace_id_count,
1486                .gfx_level = ws->info.gfx_level,
1487                .family = ws->info.family,
1488                .ip_type = cs->hw_ip,
1489                .addr_callback = radv_amdgpu_winsys_get_cpu_addr,
1490                .addr_callback_data = cs,
1491                .annotations = cs->annotations,
1492             };
1493 
1494             ac_parse_ib(&ib_parser, name);
1495          } else {
1496             ibs[i] = mapped;
1497             ib_dw_sizes[i] = ib->cdw;
1498          }
1499       }
1500 
1501       if (type == RADV_CS_DUMP_TYPE_CTX_ROLLS) {
1502          ac_gather_context_rolls(file, ibs, ib_dw_sizes, cs->num_ib_buffers, cs->annotations, &ws->info);
1503 
1504          free(ibs);
1505          free(ib_dw_sizes);
1506       }
1507    }
1508 }
1509 
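/* Attach a human-readable marker to the current CS write position. Markers are keyed by the
 * address of the next dword to be emitted; annotating the same spot twice chains the strings
 * with " -> " so earlier markers are preserved in dumps.
 */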
1510 static void
1511 radv_amdgpu_winsys_cs_annotate(struct radeon_cmdbuf *_cs, const char *annotation)
1512 {
1513    struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1514 
1515    if (!cs->annotations) {
1516       cs->annotations = _mesa_pointer_hash_table_create(NULL);
1517       if (!cs->annotations)
1518          return;
1519    }
1520 
1521    struct hash_entry *entry = _mesa_hash_table_search(cs->annotations, _cs->buf + _cs->cdw);
1522    if (entry) {
1523       char *old_annotation = entry->data;
1524       char *new_annotation = calloc(strlen(old_annotation) + strlen(annotation) + 5, 1);
           if (!new_annotation)
              return;
1525       sprintf(new_annotation, "%s -> %s", old_annotation, annotation);
1526       free(old_annotation);
1527       _mesa_hash_table_insert(cs->annotations, _cs->buf + _cs->cdw, new_annotation);
1528    } else {
1529       _mesa_hash_table_insert(cs->annotations, _cs->buf + _cs->cdw, strdup(annotation));
1530    }
1531 }
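/* Hypothetical usage sketch (call sites live outside this file, packet values are illustrative):
 * a caller would typically label the packets it is about to emit, e.g.
 *
 *    ws->base.cs_annotate(cs, "draw");
 *    radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_AUTO, 1, 0));
 *    ...
 *
 * so that ac_parse_ib() can print the label next to the first dword of that block.
 */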
1532 
1533 static uint32_t
1534 radv_to_amdgpu_priority(enum radeon_ctx_priority radv_priority)
1535 {
1536    switch (radv_priority) {
1537    case RADEON_CTX_PRIORITY_REALTIME:
1538       return AMDGPU_CTX_PRIORITY_VERY_HIGH;
1539    case RADEON_CTX_PRIORITY_HIGH:
1540       return AMDGPU_CTX_PRIORITY_HIGH;
1541    case RADEON_CTX_PRIORITY_MEDIUM:
1542       return AMDGPU_CTX_PRIORITY_NORMAL;
1543    case RADEON_CTX_PRIORITY_LOW:
1544       return AMDGPU_CTX_PRIORITY_LOW;
1545    default:
1546       unreachable("Invalid context priority");
1547    }
1548 }
1549 
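/* Create an amdgpu submission context with the requested priority, plus a small GTT BO that
 * backs the per-queue user fences.
 */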
1550 static VkResult
1551 radv_amdgpu_ctx_create(struct radeon_winsys *_ws, enum radeon_ctx_priority priority, struct radeon_winsys_ctx **rctx)
1552 {
1553    struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1554    struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx);
1555    uint32_t amdgpu_priority = radv_to_amdgpu_priority(priority);
1556    VkResult result;
1557    int r;
1558 
1559    if (!ctx)
1560       return VK_ERROR_OUT_OF_HOST_MEMORY;
1561 
1562    r = amdgpu_cs_ctx_create2(ws->dev, amdgpu_priority, &ctx->ctx);
1563    if (r && r == -EACCES) {
1564       result = VK_ERROR_NOT_PERMITTED_KHR;
1565       goto fail_create;
1566    } else if (r) {
1567       fprintf(stderr, "radv/amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
1568       result = VK_ERROR_OUT_OF_HOST_MEMORY;
1569       goto fail_create;
1570    }
1571    ctx->ws = ws;
1572 
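   /* The fence BO must fit 4 QWORDs (completed / preempted / reset / preempted-then-reset) for
    * every (IP type, ring) pair; radv_amdgpu_cs_submit() computes its user fence offset from the
    * same layout.
    */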
1573    assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * 4 * sizeof(uint64_t) <= 4096);
1574    result = ws->base.buffer_create(&ws->base, 4096, 8, RADEON_DOMAIN_GTT,
1575                                    RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_CS, 0,
1576                                    &ctx->fence_bo);
1577    if (result != VK_SUCCESS) {
1578       goto fail_alloc;
1579    }
1580 
1581    *rctx = (struct radeon_winsys_ctx *)ctx;
1582    return VK_SUCCESS;
1583 
1584 fail_alloc:
1585    amdgpu_cs_ctx_free(ctx->ctx);
1586 fail_create:
1587    FREE(ctx);
1588    return result;
1589 }
1590 
1591 static void
1592 radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
1593 {
1594    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1595 
1596    for (unsigned ip = 0; ip < AMDGPU_HW_IP_NUM; ++ip) {
1597       for (unsigned ring = 0; ring < MAX_RINGS_PER_TYPE; ++ring) {
1598          if (ctx->queue_syncobj[ip][ring])
1599             amdgpu_cs_destroy_syncobj(ctx->ws->dev, ctx->queue_syncobj[ip][ring]);
1600       }
1601    }
1602 
1603    ctx->ws->base.buffer_destroy(&ctx->ws->base, ctx->fence_bo);
1604    amdgpu_cs_ctx_free(ctx->ctx);
1605    FREE(ctx);
1606 }
1607 
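/* Lazily create one signaled syncobj per (IP, ring) queue. It is appended to a submission's
 * signal chunk and, when *queue_syncobj_wait is set, to its wait chunk, keeping work on the
 * same queue ordered.
 */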
1608 static uint32_t
1609 radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring)
1610 {
1611    uint32_t *syncobj = &ctx->queue_syncobj[ip][ring];
1612    if (!*syncobj) {
1613       amdgpu_cs_create_syncobj2(ctx->ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED, syncobj);
1614    }
1615    return *syncobj;
1616 }
1617 
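/* Report whether the most recent submission on the given ring has completed, waiting up to one
 * second for its fence.
 */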
1618 static bool
1619 radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx, enum amd_ip_type ip_type, int ring_index)
1620 {
1621    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1622 
1623    if (ctx->last_submission[ip_type][ring_index].fence.fence) {
1624       uint32_t expired;
1625       int ret =
1626          amdgpu_cs_query_fence_status(&ctx->last_submission[ip_type][ring_index].fence, 1000000000ull, 0, &expired);
1627 
1628       if (ret || !expired)
1629          return false;
1630    }
1631 
1632    return true;
1633 }
1634 
1635 static uint32_t
1636 radv_to_amdgpu_pstate(enum radeon_ctx_pstate radv_pstate)
1637 {
1638    switch (radv_pstate) {
1639    case RADEON_CTX_PSTATE_NONE:
1640       return AMDGPU_CTX_STABLE_PSTATE_NONE;
1641    case RADEON_CTX_PSTATE_STANDARD:
1642       return AMDGPU_CTX_STABLE_PSTATE_STANDARD;
1643    case RADEON_CTX_PSTATE_MIN_SCLK:
1644       return AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK;
1645    case RADEON_CTX_PSTATE_MIN_MCLK:
1646       return AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK;
1647    case RADEON_CTX_PSTATE_PEAK:
1648       return AMDGPU_CTX_STABLE_PSTATE_PEAK;
1649    default:
1650       unreachable("Invalid pstate");
1651    }
1652 }
1653 
1654 static int
1655 radv_amdgpu_ctx_set_pstate(struct radeon_winsys_ctx *rwctx, enum radeon_ctx_pstate pstate)
1656 {
1657    struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1658    uint32_t new_pstate = radv_to_amdgpu_pstate(pstate);
1659    uint32_t current_pstate = 0;
1660    int r;
1661 
1662    r = amdgpu_cs_ctx_stable_pstate(ctx->ctx, AMDGPU_CTX_OP_GET_STABLE_PSTATE, 0, &current_pstate);
1663    if (r) {
1664       fprintf(stderr, "radv/amdgpu: failed to get current pstate\n");
1665       return r;
1666    }
1667 
1668    /* Do not try to set a new pstate when the current one is already what we want. Otherwise, the
1669     * kernel might return -EBUSY if we have multiple AMDGPU contexts in flight.
1670     */
1671    if (current_pstate == new_pstate)
1672       return 0;
1673 
1674    r = amdgpu_cs_ctx_stable_pstate(ctx->ctx, AMDGPU_CTX_OP_SET_STABLE_PSTATE, new_pstate, NULL);
1675    if (r) {
1676       fprintf(stderr, "radv/amdgpu: failed to set new pstate\n");
1677       return r;
1678    }
1679 
1680    return 0;
1681 }
1682 
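/* Fill a binary-syncobj chunk (AMDGPU_CHUNK_ID_SYNCOBJ_IN/OUT) from the given handles plus the
 * optional per-queue syncobj. The returned allocation is freed by the caller once the submit
 * ioctl has consumed the chunk.
 */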
1683 static void *
1684 radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
1685                                    struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
1686 {
1687    unsigned count = counts->syncobj_count + (queue_syncobj ? 1 : 0);
1688    struct drm_amdgpu_cs_chunk_sem *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * count);
1689    if (!syncobj)
1690       return NULL;
1691 
1692    for (unsigned i = 0; i < counts->syncobj_count; i++) {
1693       struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
1694       sem->handle = counts->syncobj[i];
1695    }
1696 
1697    if (queue_syncobj)
1698       syncobj[counts->syncobj_count].handle = queue_syncobj;
1699 
1700    chunk->chunk_id = chunk_id;
1701    chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_sem) / 4 * count;
1702    chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
1703    return syncobj;
1704 }
1705 
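/* Same as above but using drm_amdgpu_cs_chunk_syncobj entries so timeline points can be
 * attached: binary handles get point 0, timeline handles carry their point and
 * DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, and the per-queue syncobj goes last.
 */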
1706 static void *
1707 radv_amdgpu_cs_alloc_timeline_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
1708                                             struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
1709 {
1710    uint32_t count = counts->syncobj_count + counts->timeline_syncobj_count + (queue_syncobj ? 1 : 0);
1711    struct drm_amdgpu_cs_chunk_syncobj *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_syncobj) * count);
1712    if (!syncobj)
1713       return NULL;
1714 
1715    for (unsigned i = 0; i < counts->syncobj_count; i++) {
1716       struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i];
1717       sem->handle = counts->syncobj[i];
1718       sem->flags = 0;
1719       sem->point = 0;
1720    }
1721 
1722    for (unsigned i = 0; i < counts->timeline_syncobj_count; i++) {
1723       struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i + counts->syncobj_count];
1724       sem->handle = counts->syncobj[i + counts->syncobj_count];
1725       sem->flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
1726       sem->point = counts->points[i];
1727    }
1728 
1729    if (queue_syncobj) {
1730       syncobj[count - 1].handle = queue_syncobj;
1731       syncobj[count - 1].flags = 0;
1732       syncobj[count - 1].point = 0;
1733    }
1734 
1735    chunk->chunk_id = chunk_id;
1736    chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_syncobj) / 4 * count;
1737    chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
1738    return syncobj;
1739 }
1740 
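/* A user fence chunk (AMDGPU_CHUNK_ID_FENCE) is only emitted for IP types other than the
 * multimedia engines (UVD/VCE/VCN/JPEG).
 */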
1741 static bool
1742 radv_amdgpu_cs_has_user_fence(struct radv_amdgpu_cs_request *request)
1743 {
1744    return request->ip_type != AMDGPU_HW_IP_UVD && request->ip_type != AMDGPU_HW_IP_VCE &&
1745           request->ip_type != AMDGPU_HW_IP_UVD_ENC && request->ip_type != AMDGPU_HW_IP_VCN_DEC &&
1746           request->ip_type != AMDGPU_HW_IP_VCN_ENC && request->ip_type != AMDGPU_HW_IP_VCN_JPEG;
1747 }
1748 
1749 static VkResult
1750 radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
1751                       struct radv_winsys_sem_info *sem_info)
1752 {
1753    int r;
1754    int num_chunks;
1755    int size;
1756    struct drm_amdgpu_cs_chunk *chunks;
1757    struct drm_amdgpu_cs_chunk_data *chunk_data;
1758    struct drm_amdgpu_bo_list_in bo_list_in;
1759    void *wait_syncobj = NULL, *signal_syncobj = NULL;
1760    int i;
1761    VkResult result = VK_SUCCESS;
1762    bool has_user_fence = radv_amdgpu_cs_has_user_fence(request);
1763    uint32_t queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, request->ip_type, request->ring);
1764    bool *queue_syncobj_wait = &ctx->queue_syncobj_wait[request->ip_type][request->ring];
1765 
1766    if (!queue_syncobj)
1767       return VK_ERROR_OUT_OF_HOST_MEMORY;
1768 
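   /* Worst-case chunk count: one IB chunk per IB plus the user fence, BO list, and wait/signal
    * syncobj chunks (allocated with a little slack).
    */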
1769    size = request->number_of_ibs + 1 + (has_user_fence ? 1 : 0) + 1 /* bo list */ + 3;
1770 
1771    chunks = malloc(sizeof(chunks[0]) * size);
1772    if (!chunks)
1773       return VK_ERROR_OUT_OF_HOST_MEMORY;
1774 
1775    size = request->number_of_ibs + (has_user_fence ? 1 : 0);
1776 
1777    chunk_data = malloc(sizeof(chunk_data[0]) * size);
1778    if (!chunk_data) {
1779       result = VK_ERROR_OUT_OF_HOST_MEMORY;
1780       goto error_out;
1781    }
1782 
1783    num_chunks = request->number_of_ibs;
1784    for (i = 0; i < request->number_of_ibs; i++) {
1785       struct radv_amdgpu_cs_ib_info *ib;
1786       chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
1787       chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1788       chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1789 
1790       ib = &request->ibs[i];
1791       assert(ib->ib_mc_address && ib->ib_mc_address % ctx->ws->info.ip[ib->ip_type].ib_alignment == 0);
1792       assert(ib->size);
1793 
1794       chunk_data[i].ib_data._pad = 0;
1795       chunk_data[i].ib_data.va_start = ib->ib_mc_address;
1796       chunk_data[i].ib_data.ib_bytes = ib->size * 4;
1797       chunk_data[i].ib_data.ip_type = ib->ip_type;
1798       chunk_data[i].ib_data.ip_instance = request->ip_instance;
1799       chunk_data[i].ib_data.ring = request->ring;
1800       chunk_data[i].ib_data.flags = ib->flags;
1801    }
1802 
1803    assert(chunk_data[request->number_of_ibs - 1].ib_data.ip_type == request->ip_type);
1804 
1805    if (has_user_fence) {
1806       i = num_chunks++;
1807       chunks[i].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1808       chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1809       chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];
1810 
1811       struct amdgpu_cs_fence_info fence_info;
1812       fence_info.handle = radv_amdgpu_winsys_bo(ctx->fence_bo)->bo;
1813       /* Need to reserve 4 QWORDs for the user fence:
1814        *   QWORD[0]: completed fence
1815        *   QWORD[1]: preempted fence
1816        *   QWORD[2]: reset fence
1817        *   QWORD[3]: preempted then reset
1818        */
1819       fence_info.offset = (request->ip_type * MAX_RINGS_PER_TYPE + request->ring) * 4;
1820       amdgpu_cs_chunk_fence_info_to_data(&fence_info, &chunk_data[i]);
1821    }
1822 
1823    if (sem_info->cs_emit_wait &&
1824        (sem_info->wait.timeline_syncobj_count || sem_info->wait.syncobj_count || *queue_syncobj_wait)) {
1825 
1826       if (ctx->ws->info.has_timeline_syncobj) {
1827          wait_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
1828                                                                     AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT);
1829       } else {
1830          wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
1831                                                            AMDGPU_CHUNK_ID_SYNCOBJ_IN);
1832       }
1833       if (!wait_syncobj) {
1834          result = VK_ERROR_OUT_OF_HOST_MEMORY;
1835          goto error_out;
1836       }
1837       num_chunks++;
1838 
1839       sem_info->cs_emit_wait = false;
1840       *queue_syncobj_wait = false;
1841    }
1842 
1843    if (sem_info->cs_emit_signal) {
1844       if (ctx->ws->info.has_timeline_syncobj) {
1845          signal_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(
1846             &sem_info->signal, queue_syncobj, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL);
1847       } else {
1848          signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal, queue_syncobj, &chunks[num_chunks],
1849                                                              AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
1850       }
1851       if (!signal_syncobj) {
1852          result = VK_ERROR_OUT_OF_HOST_MEMORY;
1853          goto error_out;
1854       }
1855       num_chunks++;
1856    }
1857 
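   /* Describe the BO list inline via AMDGPU_CHUNK_ID_BO_HANDLES rather than referencing a
    * pre-created kernel BO list (hence operation/list_handle are set to ~0).
    */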
1858    bo_list_in.operation = ~0;
1859    bo_list_in.list_handle = ~0;
1860    bo_list_in.bo_number = request->num_handles;
1861    bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1862    bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)request->handles;
1863 
1864    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1865    chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1866    chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1867    num_chunks++;
1868 
1869    /* With many parallel processes using GDS (e.g. test suites), the kernel quite often returns
1870     * -ENOMEM, but the submission eventually succeeds after enough attempts. This happens
1871     * frequently with dEQP using NGG streamout.
1872     */
1873    uint64_t abs_timeout_ns = os_time_get_absolute_timeout(1000000000ull); /* 1s */
1874 
1875    r = 0;
1876    do {
1877       /* Wait 1 ms and try again. */
1878       if (r == -ENOMEM)
1879          os_time_sleep(1000);
1880 
1881       r = amdgpu_cs_submit_raw2(ctx->ws->dev, ctx->ctx, 0, num_chunks, chunks, &request->seq_no);
1882    } while (r == -ENOMEM && os_time_get_nano() < abs_timeout_ns);
1883 
1884    if (r) {
1885       if (r == -ENOMEM) {
1886          fprintf(stderr, "radv/amdgpu: Not enough memory for command submission.\n");
1887          result = VK_ERROR_OUT_OF_HOST_MEMORY;
1888       } else if (r == -ECANCELED) {
1889          fprintf(stderr,
1890                  "radv/amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
1891          result = VK_ERROR_DEVICE_LOST;
1892       } else if (r == -ENODATA) {
1893          fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
1894                          "of a soft recovery.\n");
1895          result = VK_ERROR_DEVICE_LOST;
1896       } else if (r == -ETIME) {
1897          fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
1898                          "of a hard recovery.\n");
1899          result = VK_ERROR_DEVICE_LOST;
1900       } else {
1901          fprintf(stderr,
1902                  "radv/amdgpu: The CS has been rejected, "
1903                  "see dmesg for more information (%i).\n",
1904                  r);
1905          result = VK_ERROR_UNKNOWN;
1906       }
1907    }
1908 
1909 error_out:
1910    free(chunks);
1911    free(chunk_data);
1912    free(wait_syncobj);
1913    free(signal_syncobj);
1914    return result;
1915 }
1916 
1917 void
1918 radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
1919 {
1920    ws->base.ctx_create = radv_amdgpu_ctx_create;
1921    ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
1922    ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
1923    ws->base.ctx_set_pstate = radv_amdgpu_ctx_set_pstate;
1924    ws->base.cs_domain = radv_amdgpu_cs_domain;
1925    ws->base.cs_create = radv_amdgpu_cs_create;
1926    ws->base.cs_destroy = radv_amdgpu_cs_destroy;
1927    ws->base.cs_grow = radv_amdgpu_cs_grow;
1928    ws->base.cs_finalize = radv_amdgpu_cs_finalize;
1929    ws->base.cs_reset = radv_amdgpu_cs_reset;
1930    ws->base.cs_chain = radv_amdgpu_cs_chain;
1931    ws->base.cs_unchain = radv_amdgpu_cs_unchain;
1932    ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
1933    ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
1934    ws->base.cs_execute_ib = radv_amdgpu_cs_execute_ib;
1935    ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
1936    ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
1937    ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate;
1938    ws->base.cs_pad = radv_amdgpu_winsys_cs_pad;
1939 }
1940