1 /*
2 * Copyright © 2016 Red Hat.
3 * Copyright © 2016 Bas Nieuwenhuizen
4 *
5 * SPDX-License-Identifier: MIT
6 */
7
8 #include <amdgpu.h>
9 #include <assert.h>
10 #include <libsync.h>
11 #include <pthread.h>
12 #include <stdlib.h>
13 #include "drm-uapi/amdgpu_drm.h"
14
15 #include "util/detect_os.h"
16 #include "util/os_time.h"
17 #include "util/u_memory.h"
18 #include "ac_debug.h"
19 #include "radv_amdgpu_bo.h"
20 #include "radv_amdgpu_cs.h"
21 #include "radv_amdgpu_winsys.h"
22 #include "radv_debug.h"
23 #include "radv_radeon_winsys.h"
24 #include "sid.h"
25 #include "vk_alloc.h"
26 #include "vk_drm_syncobj.h"
27 #include "vk_sync.h"
28 #include "vk_sync_dummy.h"
29
30 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
31 * codes in the kernel).
32 */
33 #if DETECT_OS_OPENBSD
34 #define ENODATA ENOTSUP
35 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
36 #define ENODATA ECONNREFUSED
37 #endif
38
39 /* Maximum allowed total number of submitted IBs. */
40 #define RADV_MAX_IBS_PER_SUBMIT 192
41
42 enum { VIRTUAL_BUFFER_HASH_TABLE_SIZE = 1024 };
43
44 struct radv_amdgpu_ib {
45 struct radeon_winsys_bo *bo; /* NULL when not owned by the current CS object */
46 uint64_t va;
47 unsigned cdw;
48 };
49
50 struct radv_amdgpu_cs_ib_info {
51 int64_t flags;
52 uint64_t ib_mc_address;
53 uint32_t size;
54 enum amd_ip_type ip_type;
55 };
56
57 struct radv_amdgpu_cs {
58 struct radeon_cmdbuf base;
59 struct radv_amdgpu_winsys *ws;
60
61 struct radv_amdgpu_cs_ib_info ib;
62
63 struct radeon_winsys_bo *ib_buffer;
64 uint8_t *ib_mapped;
65 unsigned max_num_buffers;
66 unsigned num_buffers;
67 struct drm_amdgpu_bo_list_entry *handles;
68
69 struct radv_amdgpu_ib *ib_buffers;
70 unsigned num_ib_buffers;
71 unsigned max_num_ib_buffers;
72 unsigned *ib_size_ptr;
73 VkResult status;
74 struct radv_amdgpu_cs *chained_to;
75 bool use_ib;
76 bool is_secondary;
77
78 int buffer_hash_table[1024];
79 unsigned hw_ip;
80
81 unsigned num_virtual_buffers;
82 unsigned max_num_virtual_buffers;
83 struct radeon_winsys_bo **virtual_buffers;
84 int *virtual_buffer_hash_table;
85
86 struct hash_table *annotations;
87 };
88
89 struct radv_winsys_sem_counts {
90 uint32_t syncobj_count;
91 uint32_t timeline_syncobj_count;
92 uint32_t *syncobj;
93 uint64_t *points;
94 };
95
96 struct radv_winsys_sem_info {
97 bool cs_emit_signal;
98 bool cs_emit_wait;
99 struct radv_winsys_sem_counts wait;
100 struct radv_winsys_sem_counts signal;
101 };
102
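/* Write a dword into the CS without bounds checking; the caller is responsible
 * for having reserved enough space beforehand.
 */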
103 static void
104 radeon_emit_unchecked(struct radeon_cmdbuf *cs, uint32_t value)
105 {
106 cs->buf[cs->cdw++] = value;
107 }
108
109 static uint32_t radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring);
110
111 static inline struct radv_amdgpu_cs *
112 radv_amdgpu_cs(struct radeon_cmdbuf *base)
113 {
114 return (struct radv_amdgpu_cs *)base;
115 }
116
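/* Chained IB BOs can only be used on the GFX and COMPUTE rings, and only when
 * the winsys has IB BOs enabled.
 */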
117 static bool
118 ring_can_use_ib_bos(const struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
119 {
120 return ws->use_ib_bos && (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
121 }
122
123 struct radv_amdgpu_cs_request {
124 /** Specify HW IP block type to which to send the IB. */
125 unsigned ip_type;
126
127 /** IP instance index if there are several IPs of the same type. */
128 unsigned ip_instance;
129
130 /**
131 * Specify ring index of the IP. We could have several rings
132 * in the same IP. E.g. 0 for SDMA0 and 1 for SDMA1.
133 */
134 uint32_t ring;
135
136 /**
137 * BO list handles used by this request.
138 */
139 struct drm_amdgpu_bo_list_entry *handles;
140 uint32_t num_handles;
141
142 /** Number of IBs to submit in the field ibs. */
143 uint32_t number_of_ibs;
144
145 /**
146     * IBs to submit. These IBs will be submitted together as a single entity.
147 */
148 struct radv_amdgpu_cs_ib_info *ibs;
149
150 /**
151 * The returned sequence number for the command submission
152 */
153 uint64_t seq_no;
154 };
155
156 static VkResult radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
157 struct radv_winsys_sem_info *sem_info);
158
159 static void
160 radv_amdgpu_request_to_fence(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_fence *fence,
161 struct radv_amdgpu_cs_request *req)
162 {
163 fence->fence.context = ctx->ctx;
164 fence->fence.ip_type = req->ip_type;
165 fence->fence.ip_instance = req->ip_instance;
166 fence->fence.ring = req->ring;
167 fence->fence.fence = req->seq_no;
168 }
169
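/* Convert a recorded IB into the info structure consumed at submission time. */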
170 static struct radv_amdgpu_cs_ib_info
171 radv_amdgpu_cs_ib_to_info(struct radv_amdgpu_cs *cs, struct radv_amdgpu_ib ib)
172 {
173 struct radv_amdgpu_cs_ib_info info = {
174 .flags = 0,
175 .ip_type = cs->hw_ip,
176 .ib_mc_address = ib.va,
177 .size = ib.cdw,
178 };
179 return info;
180 }
181
182 static void
183 radv_amdgpu_cs_free_annotation(struct hash_entry *entry)
184 {
185 free(entry->data);
186 }
187
188 static void
189 radv_amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
190 {
191 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(rcs);
192
193 _mesa_hash_table_destroy(cs->annotations, radv_amdgpu_cs_free_annotation);
194
195 if (cs->ib_buffer)
196 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
197
198 for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
199 if (!cs->ib_buffers[i].bo)
200 continue;
201
202 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
203 }
204
205 free(cs->ib_buffers);
206 free(cs->virtual_buffers);
207 free(cs->virtual_buffer_hash_table);
208 free(cs->handles);
209 free(cs);
210 }
211
212 static void
213 radv_amdgpu_init_cs(struct radv_amdgpu_cs *cs, enum amd_ip_type ip_type)
214 {
215 for (int i = 0; i < ARRAY_SIZE(cs->buffer_hash_table); ++i)
216 cs->buffer_hash_table[i] = -1;
217
218 cs->hw_ip = ip_type;
219 }
220
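/* Select the memory domain for IB allocations: prefer visible VRAM when there is
 * enough visible VRAM and PCIe bandwidth (or when SAM is forced), otherwise use GTT.
 */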
221 static enum radeon_bo_domain
222 radv_amdgpu_cs_domain(const struct radeon_winsys *_ws)
223 {
224 const struct radv_amdgpu_winsys *ws = (const struct radv_amdgpu_winsys *)_ws;
225
226 bool enough_vram = ws->info.all_vram_visible ||
227 p_atomic_read_relaxed(&ws->allocated_vram_vis) * 2 <= (uint64_t)ws->info.vram_vis_size_kb * 1024;
228
229 /* Bandwidth should be equivalent to at least PCIe 3.0 x8.
230 * If there is no PCIe info, assume there is enough bandwidth.
231 */
232 bool enough_bandwidth = !ws->info.has_pcie_bandwidth_info || ws->info.pcie_bandwidth_mbps >= 8 * 0.985 * 1024;
233
234 bool use_sam =
235 (enough_vram && enough_bandwidth && ws->info.has_dedicated_vram && !(ws->perftest & RADV_PERFTEST_NO_SAM)) ||
236 (ws->perftest & RADV_PERFTEST_SAM);
237 return use_sam ? RADEON_DOMAIN_VRAM : RADEON_DOMAIN_GTT;
238 }
239
240 static VkResult
241 radv_amdgpu_cs_bo_create(struct radv_amdgpu_cs *cs, uint32_t ib_size)
242 {
243 struct radeon_winsys *ws = &cs->ws->base;
244
245 /* Avoid memcpy from VRAM when a secondary cmdbuf can't always rely on IB2. */
246 const bool can_always_use_ib2 = cs->ws->info.gfx_level >= GFX8 && cs->hw_ip == AMD_IP_GFX;
247 const bool avoid_vram = cs->is_secondary && !can_always_use_ib2;
248 const enum radeon_bo_domain domain = avoid_vram ? RADEON_DOMAIN_GTT : radv_amdgpu_cs_domain(ws);
249 const enum radeon_bo_flag gtt_wc_flag = avoid_vram ? 0 : RADEON_FLAG_GTT_WC;
250 const enum radeon_bo_flag flags =
251 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | gtt_wc_flag;
252
253 return ws->buffer_create(ws, ib_size, cs->ws->info.ip[cs->hw_ip].ib_alignment, domain, flags, RADV_BO_PRIORITY_CS, 0,
254 &cs->ib_buffer);
255 }
256
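/* Allocate and map a fresh IB BO and reset the CS so that recording starts at the
 * beginning of the new IB.
 */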
257 static VkResult
258 radv_amdgpu_cs_get_new_ib(struct radeon_cmdbuf *_cs, uint32_t ib_size)
259 {
260 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
261 VkResult result;
262
263 result = radv_amdgpu_cs_bo_create(cs, ib_size);
264 if (result != VK_SUCCESS)
265 return result;
266
267 cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
268 if (!cs->ib_mapped) {
269 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
270 return VK_ERROR_OUT_OF_DEVICE_MEMORY;
271 }
272
273 cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
274 cs->base.buf = (uint32_t *)cs->ib_mapped;
275 cs->base.cdw = 0;
276 cs->base.reserved_dw = 0;
277 cs->base.max_dw = ib_size / 4 - 4;
278 cs->ib.size = 0;
279 cs->ib.ip_type = cs->hw_ip;
280
281 if (cs->use_ib)
282 cs->ib_size_ptr = &cs->ib.size;
283
284 cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
285
286 return VK_SUCCESS;
287 }
288
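/* Initial IB allocation: 20K dwords, rounded up to the IP's IB alignment. */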
289 static unsigned
290 radv_amdgpu_cs_get_initial_size(struct radv_amdgpu_winsys *ws, enum amd_ip_type ip_type)
291 {
292 const uint32_t ib_alignment = ws->info.ip[ip_type].ib_alignment;
293 assert(util_is_power_of_two_nonzero(ib_alignment));
294 return align(20 * 1024 * 4, ib_alignment);
295 }
296
297 static struct radeon_cmdbuf *
298 radv_amdgpu_cs_create(struct radeon_winsys *ws, enum amd_ip_type ip_type, bool is_secondary)
299 {
300 struct radv_amdgpu_cs *cs;
301 uint32_t ib_size = radv_amdgpu_cs_get_initial_size(radv_amdgpu_winsys(ws), ip_type);
302
303 cs = calloc(1, sizeof(struct radv_amdgpu_cs));
304 if (!cs)
305 return NULL;
306
307 cs->is_secondary = is_secondary;
308 cs->ws = radv_amdgpu_winsys(ws);
309 radv_amdgpu_init_cs(cs, ip_type);
310
311 cs->use_ib = ring_can_use_ib_bos(cs->ws, ip_type);
312
313 VkResult result = radv_amdgpu_cs_get_new_ib(&cs->base, ib_size);
314 if (result != VK_SUCCESS) {
315 free(cs);
316 return NULL;
317 }
318
319 return &cs->base;
320 }
321
322 static uint32_t
323 get_nop_packet(struct radv_amdgpu_cs *cs)
324 {
325 switch (cs->hw_ip) {
326 case AMDGPU_HW_IP_GFX:
327 case AMDGPU_HW_IP_COMPUTE:
328 return cs->ws->info.gfx_ib_pad_with_type2 ? PKT2_NOP_PAD : PKT3_NOP_PAD;
329 case AMDGPU_HW_IP_DMA:
330 return cs->ws->info.gfx_level == GFX6 ? 0xF0000000 : SDMA_NOP_PAD;
331 case AMDGPU_HW_IP_UVD:
332 case AMDGPU_HW_IP_UVD_ENC:
333 return PKT2_NOP_PAD;
334 case AMDGPU_HW_IP_VCN_DEC:
335 return 0x81FF;
336 case AMDGPU_HW_IP_VCN_ENC:
337 return 0; /* NOPs are illegal in encode, so don't pad */
338 default:
339 unreachable("Unknown IP type");
340 }
341 }
342
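/* Record an IB in the list used at submission time; external IBs are stored with
 * bo == NULL because they are not owned by this CS.
 */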
343 static void
344 radv_amdgpu_cs_add_ib_buffer(struct radv_amdgpu_cs *cs, struct radeon_winsys_bo *bo, uint64_t va, uint32_t cdw)
345 {
346 if (cs->num_ib_buffers == cs->max_num_ib_buffers) {
347 unsigned max_num_ib_buffers = MAX2(1, cs->max_num_ib_buffers * 2);
348 struct radv_amdgpu_ib *ib_buffers = realloc(cs->ib_buffers, max_num_ib_buffers * sizeof(*ib_buffers));
349 if (!ib_buffers) {
350 cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
351 return;
352 }
353 cs->max_num_ib_buffers = max_num_ib_buffers;
354 cs->ib_buffers = ib_buffers;
355 }
356
357 cs->ib_buffers[cs->num_ib_buffers].bo = bo;
358 cs->ib_buffers[cs->num_ib_buffers].va = va;
359 cs->ib_buffers[cs->num_ib_buffers++].cdw = cdw;
360 }
361
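/* Make the most recently recorded owned IB current again, e.g. after a failed
 * attempt to grow the CS or when resetting a finalized CS.
 */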
362 static void
363 radv_amdgpu_restore_last_ib(struct radv_amdgpu_cs *cs)
364 {
365 struct radv_amdgpu_ib *ib = &cs->ib_buffers[--cs->num_ib_buffers];
366 assert(ib->bo);
367 cs->ib_buffer = ib->bo;
368 }
369
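/* Grow the CS when it runs out of space: finalize the current IB and allocate a
 * larger one, chaining to it with INDIRECT_BUFFER when the ring supports IB BOs.
 */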
370 static void
371 radv_amdgpu_cs_grow(struct radeon_cmdbuf *_cs, size_t min_size)
372 {
373 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
374
375 if (cs->status != VK_SUCCESS) {
376 cs->base.cdw = 0;
377 return;
378 }
379
380 const uint32_t ib_alignment = cs->ws->info.ip[cs->hw_ip].ib_alignment;
381
382 cs->ws->base.cs_finalize(_cs);
383
384 uint64_t ib_size = MAX2(min_size * 4 + 16, cs->base.max_dw * 4 * 2);
385
386 /* max that fits in the chain size field. */
387 ib_size = align(MIN2(ib_size, 0xfffff), ib_alignment);
388
389 VkResult result = radv_amdgpu_cs_bo_create(cs, ib_size);
390
391 if (result != VK_SUCCESS) {
392 cs->base.cdw = 0;
393 cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
394 radv_amdgpu_restore_last_ib(cs);
395 }
396
397 cs->ib_mapped = radv_buffer_map(&cs->ws->base, cs->ib_buffer);
398 if (!cs->ib_mapped) {
399 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffer);
400 cs->base.cdw = 0;
401
402 /* VK_ERROR_MEMORY_MAP_FAILED is not valid for vkEndCommandBuffer. */
403 cs->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
404 radv_amdgpu_restore_last_ib(cs);
405 }
406
407 cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
408
409 if (cs->use_ib) {
410 cs->base.buf[cs->base.cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
411 cs->base.buf[cs->base.cdw - 3] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
412 cs->base.buf[cs->base.cdw - 2] = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va >> 32;
413 cs->base.buf[cs->base.cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1);
414
415 cs->ib_size_ptr = cs->base.buf + cs->base.cdw - 1;
416 }
417
418 cs->base.buf = (uint32_t *)cs->ib_mapped;
419 cs->base.cdw = 0;
420 cs->base.reserved_dw = 0;
421 cs->base.max_dw = ib_size / 4 - 4;
422 }
423
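/* Pad the CS with NOPs so that its size (plus leave_dw_space trailing dwords)
 * satisfies the IP's padding requirement.
 */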
424 static void
425 radv_amdgpu_winsys_cs_pad(struct radeon_cmdbuf *_cs, unsigned leave_dw_space)
426 {
427 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
428 const enum amd_ip_type ip_type = cs->hw_ip;
429 const uint32_t pad_dw_mask = cs->ws->info.ip[ip_type].ib_pad_dw_mask;
430 const uint32_t unaligned_dw = (cs->base.cdw + leave_dw_space) & pad_dw_mask;
431
432 if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
433 if (unaligned_dw) {
434 const int remaining = pad_dw_mask + 1 - unaligned_dw;
435
436 /* Only pad by 1 dword with the type-2 NOP if necessary. */
437 if (remaining == 1 && cs->ws->info.gfx_ib_pad_with_type2) {
438 radeon_emit_unchecked(&cs->base, PKT2_NOP_PAD);
439 } else {
440 /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
441 * packet. The size of the packet body after the header is always count + 1.
442 * If count == -1, there is no packet body. NOP is the only packet that can have
443 * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
444 */
445 radeon_emit_unchecked(&cs->base, PKT3(PKT3_NOP, remaining - 2, 0));
446 cs->base.cdw += remaining - 1;
447 }
448 }
449 } else {
450       /* Don't pad on VCN encode/unified; NOP packets are not supported there. */
451 if (ip_type == AMDGPU_HW_IP_VCN_ENC)
452 return;
453
454       /* Don't add padding to zero-length UVD IBs because of a kernel limitation. */
455 if (ip_type == AMDGPU_HW_IP_UVD && cs->base.cdw == 0)
456 return;
457
458 const uint32_t nop_packet = get_nop_packet(cs);
459
460 while (!cs->base.cdw || (cs->base.cdw & pad_dw_mask))
461 radeon_emit_unchecked(&cs->base, nop_packet);
462 }
463
464 assert(((cs->base.cdw + leave_dw_space) & pad_dw_mask) == 0);
465 }
466
467 static VkResult
468 radv_amdgpu_cs_finalize(struct radeon_cmdbuf *_cs)
469 {
470 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
471
472 assert(cs->base.cdw <= cs->base.reserved_dw);
473
474 if (cs->use_ib) {
475 const uint32_t nop_packet = get_nop_packet(cs);
476
477 /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
478 radv_amdgpu_winsys_cs_pad(_cs, 4);
479
480 radeon_emit_unchecked(&cs->base, nop_packet);
481 radeon_emit_unchecked(&cs->base, nop_packet);
482 radeon_emit_unchecked(&cs->base, nop_packet);
483 radeon_emit_unchecked(&cs->base, nop_packet);
484
485 *cs->ib_size_ptr |= cs->base.cdw;
486 } else {
487 radv_amdgpu_winsys_cs_pad(_cs, 0);
488 }
489
490 /* Append the current (last) IB to the array of IB buffers. */
491 radv_amdgpu_cs_add_ib_buffer(cs, cs->ib_buffer, cs->ib_buffer->va,
492 cs->use_ib ? G_3F2_IB_SIZE(*cs->ib_size_ptr) : cs->base.cdw);
493
494 /* Prevent freeing this BO twice. */
495 cs->ib_buffer = NULL;
496
497 cs->chained_to = NULL;
498
499 assert(cs->base.cdw <= cs->base.max_dw + 4);
500
501 return cs->status;
502 }
503
504 static void
505 radv_amdgpu_cs_reset(struct radeon_cmdbuf *_cs)
506 {
507 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
508 cs->base.cdw = 0;
509 cs->base.reserved_dw = 0;
510 cs->status = VK_SUCCESS;
511
512 for (unsigned i = 0; i < cs->num_buffers; ++i) {
513 unsigned hash = cs->handles[i].bo_handle & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
514 cs->buffer_hash_table[hash] = -1;
515 }
516
517 for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
518 unsigned hash = ((uintptr_t)cs->virtual_buffers[i] >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
519 cs->virtual_buffer_hash_table[hash] = -1;
520 }
521
522 cs->num_buffers = 0;
523 cs->num_virtual_buffers = 0;
524
525    /* If the CS was finalized, the current IB was handed off; make the last recorded IB current again. */
526 assert(cs->ib_buffer || cs->num_ib_buffers);
527 if (!cs->ib_buffer)
528 radv_amdgpu_restore_last_ib(cs);
529
530 cs->ws->base.cs_add_buffer(&cs->base, cs->ib_buffer);
531
532 for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
533 if (!cs->ib_buffers[i].bo)
534 continue;
535
536 cs->ws->base.buffer_destroy(&cs->ws->base, cs->ib_buffers[i].bo);
537 }
538
539 cs->num_ib_buffers = 0;
540 cs->ib.ib_mc_address = radv_amdgpu_winsys_bo(cs->ib_buffer)->base.va;
541
542 cs->ib.size = 0;
543
544 if (cs->use_ib)
545 cs->ib_size_ptr = &cs->ib.size;
546
547 _mesa_hash_table_destroy(cs->annotations, radv_amdgpu_cs_free_annotation);
548 cs->annotations = NULL;
549 }
550
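/* Returns whether this CS references IBs it doesn't own (recorded with bo == NULL
 * by cs_execute_ib); such a CS can't be chained and is split at submission.
 */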
551 static bool
552 radv_amdgpu_cs_has_external_ib(const struct radv_amdgpu_cs *cs)
553 {
554 for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
555 if (!cs->ib_buffers[i].bo)
556 return true;
557 }
558
559 return false;
560 }
561
562 static void
563 radv_amdgpu_cs_unchain(struct radeon_cmdbuf *cs)
564 {
565 struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
566
567 if (!acs->chained_to)
568 return;
569
570 assert(cs->cdw <= cs->max_dw + 4);
571
572 acs->chained_to = NULL;
573 cs->buf[cs->cdw - 4] = PKT3_NOP_PAD;
574 cs->buf[cs->cdw - 3] = PKT3_NOP_PAD;
575 cs->buf[cs->cdw - 2] = PKT3_NOP_PAD;
576 cs->buf[cs->cdw - 1] = PKT3_NOP_PAD;
577 }
578
579 static bool
580 radv_amdgpu_cs_chain(struct radeon_cmdbuf *cs, struct radeon_cmdbuf *next_cs, bool pre_ena)
581 {
582 /* Chains together two CS (command stream) objects by editing
583 * the end of the first CS to add a command that jumps to the
584 * second CS.
585 *
586 * After this, it is enough to submit the first CS to the GPU
587 * and not necessary to submit the second CS because it is already
588 * executed by the first.
589 */
590
591 struct radv_amdgpu_cs *acs = radv_amdgpu_cs(cs);
592 struct radv_amdgpu_cs *next_acs = radv_amdgpu_cs(next_cs);
593
594 /* Only some HW IP types have packets that we can use for chaining. */
595 if (!acs->use_ib)
596 return false;
597
598    /* Do not chain if the next CS has external IBs: its IB info points at the most recently created
599     * IB rather than its first one, so chaining would skip part of that CS.
600     */
601 if (radv_amdgpu_cs_has_external_ib(next_acs))
602 return false;
603
604 assert(cs->cdw <= cs->max_dw + 4);
605
606 acs->chained_to = next_acs;
607
608 cs->buf[cs->cdw - 4] = PKT3(PKT3_INDIRECT_BUFFER, 2, 0);
609 cs->buf[cs->cdw - 3] = next_acs->ib.ib_mc_address;
610 cs->buf[cs->cdw - 2] = next_acs->ib.ib_mc_address >> 32;
611 cs->buf[cs->cdw - 1] = S_3F2_CHAIN(1) | S_3F2_VALID(1) | S_3F2_PRE_ENA(pre_ena) | next_acs->ib.size;
612
613 return true;
614 }
615
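/* Return the index of a BO handle in the CS's BO list, or -1 if it isn't present.
 * The hash table caches the most recent hit per bucket.
 */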
616 static int
617 radv_amdgpu_cs_find_buffer(struct radv_amdgpu_cs *cs, uint32_t bo)
618 {
619 unsigned hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
620 int index = cs->buffer_hash_table[hash];
621
622 if (index == -1)
623 return -1;
624
625 if (cs->handles[index].bo_handle == bo)
626 return index;
627
628 for (unsigned i = 0; i < cs->num_buffers; ++i) {
629 if (cs->handles[i].bo_handle == bo) {
630 cs->buffer_hash_table[hash] = i;
631 return i;
632 }
633 }
634
635 return -1;
636 }
637
638 static void
639 radv_amdgpu_cs_add_buffer_internal(struct radv_amdgpu_cs *cs, uint32_t bo, uint8_t priority)
640 {
641 unsigned hash;
642 int index = radv_amdgpu_cs_find_buffer(cs, bo);
643
644 if (index != -1)
645 return;
646
647 if (cs->num_buffers == cs->max_num_buffers) {
648 unsigned new_count = MAX2(1, cs->max_num_buffers * 2);
649 struct drm_amdgpu_bo_list_entry *new_entries =
650 realloc(cs->handles, new_count * sizeof(struct drm_amdgpu_bo_list_entry));
651 if (new_entries) {
652 cs->max_num_buffers = new_count;
653 cs->handles = new_entries;
654 } else {
655 cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
656 return;
657 }
658 }
659
660 cs->handles[cs->num_buffers].bo_handle = bo;
661 cs->handles[cs->num_buffers].bo_priority = priority;
662
663 hash = bo & (ARRAY_SIZE(cs->buffer_hash_table) - 1);
664 cs->buffer_hash_table[hash] = cs->num_buffers;
665
666 ++cs->num_buffers;
667 }
668
669 static void
670 radv_amdgpu_cs_add_virtual_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo)
671 {
672 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
673 unsigned hash = ((uintptr_t)bo >> 6) & (VIRTUAL_BUFFER_HASH_TABLE_SIZE - 1);
674
675 if (!cs->virtual_buffer_hash_table) {
676 int *virtual_buffer_hash_table = malloc(VIRTUAL_BUFFER_HASH_TABLE_SIZE * sizeof(int));
677 if (!virtual_buffer_hash_table) {
678 cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
679 return;
680 }
681 cs->virtual_buffer_hash_table = virtual_buffer_hash_table;
682
683 for (int i = 0; i < VIRTUAL_BUFFER_HASH_TABLE_SIZE; ++i)
684 cs->virtual_buffer_hash_table[i] = -1;
685 }
686
687 if (cs->virtual_buffer_hash_table[hash] >= 0) {
688 int idx = cs->virtual_buffer_hash_table[hash];
689 if (cs->virtual_buffers[idx] == bo) {
690 return;
691 }
692 for (unsigned i = 0; i < cs->num_virtual_buffers; ++i) {
693 if (cs->virtual_buffers[i] == bo) {
694 cs->virtual_buffer_hash_table[hash] = i;
695 return;
696 }
697 }
698 }
699
700 if (cs->max_num_virtual_buffers <= cs->num_virtual_buffers) {
701 unsigned max_num_virtual_buffers = MAX2(2, cs->max_num_virtual_buffers * 2);
702 struct radeon_winsys_bo **virtual_buffers =
703 realloc(cs->virtual_buffers, sizeof(struct radeon_winsys_bo *) * max_num_virtual_buffers);
704 if (!virtual_buffers) {
705 cs->status = VK_ERROR_OUT_OF_HOST_MEMORY;
706 return;
707 }
708 cs->max_num_virtual_buffers = max_num_virtual_buffers;
709 cs->virtual_buffers = virtual_buffers;
710 }
711
712 cs->virtual_buffers[cs->num_virtual_buffers] = bo;
713
714 cs->virtual_buffer_hash_table[hash] = cs->num_virtual_buffers;
715 ++cs->num_virtual_buffers;
716 }
717
718 static void
719 radv_amdgpu_cs_add_buffer(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *_bo)
720 {
721 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
722 struct radv_amdgpu_winsys_bo *bo = radv_amdgpu_winsys_bo(_bo);
723
724 if (cs->status != VK_SUCCESS)
725 return;
726
727 if (bo->is_virtual) {
728 radv_amdgpu_cs_add_virtual_buffer(_cs, _bo);
729 return;
730 }
731
732 radv_amdgpu_cs_add_buffer_internal(cs, bo->bo_handle, bo->priority);
733 }
734
735 static void
736 radv_amdgpu_cs_execute_secondary(struct radeon_cmdbuf *_parent, struct radeon_cmdbuf *_child, bool allow_ib2)
737 {
738 struct radv_amdgpu_cs *parent = radv_amdgpu_cs(_parent);
739 struct radv_amdgpu_cs *child = radv_amdgpu_cs(_child);
740 struct radv_amdgpu_winsys *ws = parent->ws;
741 const bool use_ib2 = parent->use_ib && !parent->is_secondary && allow_ib2 && parent->hw_ip == AMD_IP_GFX;
742
743 if (parent->status != VK_SUCCESS || child->status != VK_SUCCESS)
744 return;
745
746 for (unsigned i = 0; i < child->num_buffers; ++i) {
747 radv_amdgpu_cs_add_buffer_internal(parent, child->handles[i].bo_handle, child->handles[i].bo_priority);
748 }
749
750 for (unsigned i = 0; i < child->num_virtual_buffers; ++i) {
751 radv_amdgpu_cs_add_buffer(&parent->base, child->virtual_buffers[i]);
752 }
753
754 if (use_ib2) {
755 if (parent->base.cdw + 4 > parent->base.max_dw)
756 radv_amdgpu_cs_grow(&parent->base, 4);
757
758 parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + 4);
759
760 /* Not setting the CHAIN bit will launch an IB2. */
761 radeon_emit(&parent->base, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
762 radeon_emit(&parent->base, child->ib.ib_mc_address);
763 radeon_emit(&parent->base, child->ib.ib_mc_address >> 32);
764 radeon_emit(&parent->base, child->ib.size);
765 } else {
766 assert(parent->use_ib == child->use_ib);
767
768 /* Grow the current CS and copy the contents of the secondary CS. */
769 for (unsigned i = 0; i < child->num_ib_buffers; i++) {
770 struct radv_amdgpu_ib *ib = &child->ib_buffers[i];
771 uint32_t cdw = ib->cdw;
772 uint8_t *mapped;
773
774 /* Do not copy the original chain link for IBs. */
775 if (child->use_ib)
776 cdw -= 4;
777
778 assert(ib->bo);
779
780 if (parent->base.cdw + cdw > parent->base.max_dw)
781 radv_amdgpu_cs_grow(&parent->base, cdw);
782
783 parent->base.reserved_dw = MAX2(parent->base.reserved_dw, parent->base.cdw + cdw);
784
785 mapped = radv_buffer_map(&ws->base, ib->bo);
786 if (!mapped) {
787 parent->status = VK_ERROR_OUT_OF_DEVICE_MEMORY;
788 return;
789 }
790
791 memcpy(parent->base.buf + parent->base.cdw, mapped, 4 * cdw);
792 parent->base.cdw += cdw;
793 }
794 }
795 }
796
797 static void
798 radv_amdgpu_cs_execute_ib(struct radeon_cmdbuf *_cs, struct radeon_winsys_bo *bo, uint64_t va, const uint32_t cdw,
799 const bool predicate)
800 {
801 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(_cs);
802 const uint64_t ib_va = bo ? bo->va : va;
803
804 if (cs->status != VK_SUCCESS)
805 return;
806
807 assert(ib_va && ib_va % cs->ws->info.ip[cs->hw_ip].ib_alignment == 0);
808
809 if (cs->hw_ip == AMD_IP_GFX && cs->use_ib) {
810 radeon_emit(&cs->base, PKT3(PKT3_INDIRECT_BUFFER, 2, predicate));
811 radeon_emit(&cs->base, ib_va);
812 radeon_emit(&cs->base, ib_va >> 32);
813 radeon_emit(&cs->base, cdw);
814 } else {
815 const uint32_t ib_size = radv_amdgpu_cs_get_initial_size(cs->ws, cs->hw_ip);
816 VkResult result;
817
818 /* Finalize the current CS without chaining to execute the external IB. */
819 radv_amdgpu_cs_finalize(_cs);
820
821 radv_amdgpu_cs_add_ib_buffer(cs, bo, ib_va, cdw);
822
823 /* Start a new CS which isn't chained to any previous CS. */
824 result = radv_amdgpu_cs_get_new_ib(_cs, ib_size);
825 if (result != VK_SUCCESS) {
826 cs->base.cdw = 0;
827 cs->status = result;
828 }
829 }
830 }
831
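/* Count the BOs referenced by a chain of CSes, including the BOs backing virtual
 * (sparse) buffers.
 */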
832 static unsigned
833 radv_amdgpu_count_cs_bo(struct radv_amdgpu_cs *start_cs)
834 {
835 unsigned num_bo = 0;
836
837 for (struct radv_amdgpu_cs *cs = start_cs; cs; cs = cs->chained_to) {
838 num_bo += cs->num_buffers;
839 for (unsigned j = 0; j < cs->num_virtual_buffers; ++j)
840 num_bo += radv_amdgpu_winsys_bo(cs->virtual_buffers[j])->bo_count;
841 }
842
843 return num_bo;
844 }
845
846 static unsigned
847 radv_amdgpu_count_cs_array_bo(struct radeon_cmdbuf **cs_array, unsigned num_cs)
848 {
849 unsigned num_bo = 0;
850
851 for (unsigned i = 0; i < num_cs; ++i) {
852 num_bo += radv_amdgpu_count_cs_bo(radv_amdgpu_cs(cs_array[i]));
853 }
854
855 return num_bo;
856 }
857
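/* Append this CS's BO handles (and those backing its virtual buffers) to the
 * submission BO list, skipping handles that are already present.
 */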
858 static unsigned
859 radv_amdgpu_add_cs_to_bo_list(struct radv_amdgpu_cs *cs, struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
860 {
861 if (!cs->num_buffers)
862 return num_handles;
863
864 if (num_handles == 0 && !cs->num_virtual_buffers) {
865 memcpy(handles, cs->handles, cs->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
866 return cs->num_buffers;
867 }
868
869 int unique_bo_so_far = num_handles;
870 for (unsigned j = 0; j < cs->num_buffers; ++j) {
871 bool found = false;
872 for (unsigned k = 0; k < unique_bo_so_far; ++k) {
873 if (handles[k].bo_handle == cs->handles[j].bo_handle) {
874 found = true;
875 break;
876 }
877 }
878 if (!found) {
879 handles[num_handles] = cs->handles[j];
880 ++num_handles;
881 }
882 }
883 for (unsigned j = 0; j < cs->num_virtual_buffers; ++j) {
884 struct radv_amdgpu_winsys_bo *virtual_bo = radv_amdgpu_winsys_bo(cs->virtual_buffers[j]);
885 u_rwlock_rdlock(&virtual_bo->lock);
886 for (unsigned k = 0; k < virtual_bo->bo_count; ++k) {
887 struct radv_amdgpu_winsys_bo *bo = virtual_bo->bos[k];
888 bool found = false;
889 for (unsigned m = 0; m < num_handles; ++m) {
890 if (handles[m].bo_handle == bo->bo_handle) {
891 found = true;
892 break;
893 }
894 }
895 if (!found) {
896 handles[num_handles].bo_handle = bo->bo_handle;
897 handles[num_handles].bo_priority = bo->priority;
898 ++num_handles;
899 }
900 }
901 u_rwlock_rdunlock(&virtual_bo->lock);
902 }
903
904 return num_handles;
905 }
906
907 static unsigned
908 radv_amdgpu_add_cs_array_to_bo_list(struct radeon_cmdbuf **cs_array, unsigned num_cs,
909 struct drm_amdgpu_bo_list_entry *handles, unsigned num_handles)
910 {
911 for (unsigned i = 0; i < num_cs; ++i) {
912 for (struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]); cs; cs = cs->chained_to) {
913 num_handles = radv_amdgpu_add_cs_to_bo_list(cs, handles, num_handles);
914 }
915 }
916
917 return num_handles;
918 }
919
920 static unsigned
921 radv_amdgpu_copy_global_bo_list(struct radv_amdgpu_winsys *ws, struct drm_amdgpu_bo_list_entry *handles)
922 {
923 for (uint32_t i = 0; i < ws->global_bo_list.count; i++) {
924 handles[i].bo_handle = ws->global_bo_list.bos[i]->bo_handle;
925 handles[i].bo_priority = ws->global_bo_list.bos[i]->priority;
926 }
927
928 return ws->global_bo_list.count;
929 }
930
931 static VkResult
932 radv_amdgpu_get_bo_list(struct radv_amdgpu_winsys *ws, struct radeon_cmdbuf **cs_array, unsigned count,
933 struct radeon_cmdbuf **initial_preamble_array, unsigned num_initial_preambles,
934 struct radeon_cmdbuf **continue_preamble_array, unsigned num_continue_preambles,
935 struct radeon_cmdbuf **postamble_array, unsigned num_postambles, unsigned *rnum_handles,
936 struct drm_amdgpu_bo_list_entry **rhandles)
937 {
938 struct drm_amdgpu_bo_list_entry *handles = NULL;
939 unsigned num_handles = 0;
940
941 if (ws->debug_all_bos) {
942 handles = malloc(sizeof(handles[0]) * ws->global_bo_list.count);
943 if (!handles)
944 return VK_ERROR_OUT_OF_HOST_MEMORY;
945
946 num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
947 } else if (count == 1 && !num_initial_preambles && !num_continue_preambles && !num_postambles &&
948 !radv_amdgpu_cs(cs_array[0])->num_virtual_buffers && !radv_amdgpu_cs(cs_array[0])->chained_to &&
949 !ws->global_bo_list.count) {
950 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)cs_array[0];
951 if (cs->num_buffers == 0)
952 return VK_SUCCESS;
953
954 handles = malloc(sizeof(handles[0]) * cs->num_buffers);
955 if (!handles)
956 return VK_ERROR_OUT_OF_HOST_MEMORY;
957
958 memcpy(handles, cs->handles, sizeof(handles[0]) * cs->num_buffers);
959 num_handles = cs->num_buffers;
960 } else {
961 unsigned total_buffer_count = ws->global_bo_list.count;
962 total_buffer_count += radv_amdgpu_count_cs_array_bo(cs_array, count);
963 total_buffer_count += radv_amdgpu_count_cs_array_bo(initial_preamble_array, num_initial_preambles);
964 total_buffer_count += radv_amdgpu_count_cs_array_bo(continue_preamble_array, num_continue_preambles);
965 total_buffer_count += radv_amdgpu_count_cs_array_bo(postamble_array, num_postambles);
966
967 if (total_buffer_count == 0)
968 return VK_SUCCESS;
969
970 handles = malloc(sizeof(handles[0]) * total_buffer_count);
971 if (!handles)
972 return VK_ERROR_OUT_OF_HOST_MEMORY;
973
974 num_handles = radv_amdgpu_copy_global_bo_list(ws, handles);
975 num_handles = radv_amdgpu_add_cs_array_to_bo_list(cs_array, count, handles, num_handles);
976 num_handles =
977 radv_amdgpu_add_cs_array_to_bo_list(initial_preamble_array, num_initial_preambles, handles, num_handles);
978 num_handles =
979 radv_amdgpu_add_cs_array_to_bo_list(continue_preamble_array, num_continue_preambles, handles, num_handles);
980 num_handles = radv_amdgpu_add_cs_array_to_bo_list(postamble_array, num_postambles, handles, num_handles);
981 }
982
983 *rhandles = handles;
984 *rnum_handles = num_handles;
985
986 return VK_SUCCESS;
987 }
988
989 static void
990 radv_assign_last_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request)
991 {
992 radv_amdgpu_request_to_fence(ctx, &ctx->last_submission[request->ip_type][request->ring], request);
993 }
994
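/* Number of IB chunks this CS contributes to a submission: with chaining only the
 * main IB is submitted, except that every external IB has to be submitted
 * separately, followed by the IB that continues the CS.
 */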
995 static unsigned
996 radv_amdgpu_get_num_ibs_per_cs(const struct radv_amdgpu_cs *cs)
997 {
998 unsigned num_ibs = 0;
999
1000 if (cs->use_ib) {
1001 unsigned num_external_ibs = 0;
1002
1003 for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
1004 if (!cs->ib_buffers[i].bo)
1005 num_external_ibs++;
1006 }
1007
1008 num_ibs = num_external_ibs * 2 + 1;
1009 } else {
1010 num_ibs = cs->num_ib_buffers;
1011 }
1012
1013 return num_ibs;
1014 }
1015
1016 static unsigned
1017 radv_amdgpu_count_ibs(struct radeon_cmdbuf **cs_array, unsigned cs_count, unsigned initial_preamble_count,
1018 unsigned continue_preamble_count, unsigned postamble_count)
1019 {
1020 unsigned num_ibs = 0;
1021
1022 for (unsigned i = 0; i < cs_count; i++) {
1023 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[i]);
1024
1025 num_ibs += radv_amdgpu_get_num_ibs_per_cs(cs);
1026 }
1027
1028 return MAX2(initial_preamble_count, continue_preamble_count) + num_ibs + postamble_count;
1029 }
1030
1031 static VkResult
1032 radv_amdgpu_winsys_cs_submit_internal(struct radv_amdgpu_ctx *ctx, int queue_idx, struct radv_winsys_sem_info *sem_info,
1033 struct radeon_cmdbuf **cs_array, unsigned cs_count,
1034 struct radeon_cmdbuf **initial_preamble_cs, unsigned initial_preamble_count,
1035 struct radeon_cmdbuf **continue_preamble_cs, unsigned continue_preamble_count,
1036 struct radeon_cmdbuf **postamble_cs, unsigned postamble_count,
1037 bool uses_shadow_regs)
1038 {
1039 VkResult result;
1040
1041    /* The last CS is the "gang leader"; its IP type determines which fence to signal. */
1042 struct radv_amdgpu_cs *last_cs = radv_amdgpu_cs(cs_array[cs_count - 1]);
1043 struct radv_amdgpu_winsys *ws = last_cs->ws;
1044
1045 const unsigned num_ibs =
1046 radv_amdgpu_count_ibs(cs_array, cs_count, initial_preamble_count, continue_preamble_count, postamble_count);
1047 const unsigned ib_array_size = MIN2(RADV_MAX_IBS_PER_SUBMIT, num_ibs);
1048
1049 STACK_ARRAY(struct radv_amdgpu_cs_ib_info, ibs, ib_array_size);
1050
1051 struct drm_amdgpu_bo_list_entry *handles = NULL;
1052 unsigned num_handles = 0;
1053
1054 u_rwlock_rdlock(&ws->global_bo_list.lock);
1055
1056 result = radv_amdgpu_get_bo_list(ws, &cs_array[0], cs_count, initial_preamble_cs, initial_preamble_count,
1057 continue_preamble_cs, continue_preamble_count, postamble_cs, postamble_count,
1058 &num_handles, &handles);
1059 if (result != VK_SUCCESS)
1060 goto fail;
1061
1062 /* Configure the CS request. */
1063 const uint32_t *max_ib_per_ip = ws->info.max_submitted_ibs;
1064 struct radv_amdgpu_cs_request request = {
1065 .ip_type = last_cs->hw_ip,
1066 .ip_instance = 0,
1067 .ring = queue_idx,
1068 .handles = handles,
1069 .num_handles = num_handles,
1070 .ibs = ibs,
1071 .number_of_ibs = 0, /* set below */
1072 };
1073
1074 for (unsigned cs_idx = 0, cs_ib_idx = 0; cs_idx < cs_count;) {
1075 struct radeon_cmdbuf **preambles = cs_idx ? continue_preamble_cs : initial_preamble_cs;
1076 const unsigned preamble_count = cs_idx ? continue_preamble_count : initial_preamble_count;
1077 const unsigned ib_per_submit = RADV_MAX_IBS_PER_SUBMIT - preamble_count - postamble_count;
1078 unsigned num_submitted_ibs = 0;
1079 unsigned ibs_per_ip[AMD_NUM_IP_TYPES] = {0};
1080
1081 /* Copy preambles to the submission. */
1082 for (unsigned i = 0; i < preamble_count; ++i) {
1083 /* Assume that the full preamble fits into 1 IB. */
1084 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(preambles[i]);
1085 struct radv_amdgpu_cs_ib_info ib;
1086
1087 assert(cs->num_ib_buffers == 1);
1088 ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1089
1090 ibs[num_submitted_ibs++] = ib;
1091 ibs_per_ip[cs->hw_ip]++;
1092 }
1093
1094 for (unsigned i = 0; i < ib_per_submit && cs_idx < cs_count; ++i) {
1095 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(cs_array[cs_idx]);
1096 struct radv_amdgpu_cs_ib_info ib;
1097
1098 if (cs_ib_idx == 0) {
1099 /* Make sure the whole CS fits into the same submission. */
1100 unsigned cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(cs);
1101 if (i + cs_num_ib > ib_per_submit || ibs_per_ip[cs->hw_ip] + cs_num_ib > max_ib_per_ip[cs->hw_ip])
1102 break;
1103
1104 if (cs->hw_ip != request.ip_type) {
1105 /* Found a "follower" CS in a gang submission.
1106 * Make sure to submit this together with its "leader", the next CS.
1107 * We rely on the caller to order each "follower" before its "leader."
1108 */
1109 assert(cs_idx != cs_count - 1);
1110 struct radv_amdgpu_cs *next_cs = radv_amdgpu_cs(cs_array[cs_idx + 1]);
1111 assert(next_cs->hw_ip == request.ip_type);
1112 unsigned next_cs_num_ib = radv_amdgpu_get_num_ibs_per_cs(next_cs);
1113 if (i + cs_num_ib + next_cs_num_ib > ib_per_submit ||
1114 ibs_per_ip[next_cs->hw_ip] + next_cs_num_ib > max_ib_per_ip[next_cs->hw_ip])
1115 break;
1116 }
1117 }
1118
1119 /* When IBs are used, we only need to submit the main IB of this CS, because everything
1120 * else is chained to the first IB. Except when the CS has external IBs because they need
1121 * to be submitted separately. Otherwise we must submit all IBs in the ib_buffers array.
1122 */
1123 if (cs->use_ib) {
1124 if (radv_amdgpu_cs_has_external_ib(cs)) {
1125 const unsigned cur_ib_idx = cs_ib_idx;
1126
1127 ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[cs_ib_idx++]);
1128
1129 /* Loop until the next external IB is found. */
1130 while (cs->ib_buffers[cur_ib_idx].bo && cs->ib_buffers[cs_ib_idx].bo && cs_ib_idx < cs->num_ib_buffers) {
1131 cs_ib_idx++;
1132 }
1133
1134 if (cs_ib_idx == cs->num_ib_buffers) {
1135 cs_idx++;
1136 cs_ib_idx = 0;
1137 }
1138 } else {
1139 ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1140 cs_idx++;
1141 }
1142 } else {
1143 assert(cs_ib_idx < cs->num_ib_buffers);
1144 ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[cs_ib_idx++]);
1145
1146 if (cs_ib_idx == cs->num_ib_buffers) {
1147 cs_idx++;
1148 cs_ib_idx = 0;
1149 }
1150 }
1151
1152 if (uses_shadow_regs && ib.ip_type == AMDGPU_HW_IP_GFX)
1153 ib.flags |= AMDGPU_IB_FLAG_PREEMPT;
1154
1155 assert(num_submitted_ibs < ib_array_size);
1156 ibs[num_submitted_ibs++] = ib;
1157 ibs_per_ip[cs->hw_ip]++;
1158 }
1159
1160 assert(num_submitted_ibs > preamble_count);
1161
1162 /* Copy postambles to the submission. */
1163 for (unsigned i = 0; i < postamble_count; ++i) {
1164 /* Assume that the full postamble fits into 1 IB. */
1165 struct radv_amdgpu_cs *cs = radv_amdgpu_cs(postamble_cs[i]);
1166 struct radv_amdgpu_cs_ib_info ib;
1167
1168 assert(cs->num_ib_buffers == 1);
1169 ib = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1170
1171 ibs[num_submitted_ibs++] = ib;
1172 ibs_per_ip[cs->hw_ip]++;
1173 }
1174
1175 /* Submit the CS. */
1176 request.number_of_ibs = num_submitted_ibs;
1177 result = radv_amdgpu_cs_submit(ctx, &request, sem_info);
1178 if (result != VK_SUCCESS)
1179 goto fail;
1180 }
1181
1182 free(request.handles);
1183
1184 if (result != VK_SUCCESS)
1185 goto fail;
1186
1187 radv_assign_last_submit(ctx, &request);
1188
1189 fail:
1190 u_rwlock_rdunlock(&ws->global_bo_list.lock);
1191 STACK_ARRAY_FINISH(ibs);
1192 return result;
1193 }
1194
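/* Submission with no command buffers: only forward the waits and signals through
 * the per-queue syncobj.
 */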
1195 static VkResult
1196 radv_amdgpu_cs_submit_zero(struct radv_amdgpu_ctx *ctx, enum amd_ip_type ip_type, int queue_idx,
1197 struct radv_winsys_sem_info *sem_info)
1198 {
1199 unsigned hw_ip = ip_type;
1200 unsigned queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, hw_ip, queue_idx);
1201 int ret;
1202
1203 if (!queue_syncobj)
1204 return VK_ERROR_OUT_OF_HOST_MEMORY;
1205
1206 if (sem_info->wait.syncobj_count || sem_info->wait.timeline_syncobj_count) {
1207 int fd;
1208 ret = amdgpu_cs_syncobj_export_sync_file(ctx->ws->dev, queue_syncobj, &fd);
1209 if (ret < 0)
1210 return VK_ERROR_DEVICE_LOST;
1211
1212 for (unsigned i = 0; i < sem_info->wait.syncobj_count; ++i) {
1213 int fd2;
1214 ret = amdgpu_cs_syncobj_export_sync_file(ctx->ws->dev, sem_info->wait.syncobj[i], &fd2);
1215 if (ret < 0) {
1216 close(fd);
1217 return VK_ERROR_DEVICE_LOST;
1218 }
1219
1220 sync_accumulate("radv", &fd, fd2);
1221 close(fd2);
1222 }
1223 for (unsigned i = 0; i < sem_info->wait.timeline_syncobj_count; ++i) {
1224 int fd2;
1225 ret = amdgpu_cs_syncobj_export_sync_file2(
1226 ctx->ws->dev, sem_info->wait.syncobj[i + sem_info->wait.syncobj_count], sem_info->wait.points[i], 0, &fd2);
1227 if (ret < 0) {
1228 /* This works around a kernel bug where the fence isn't copied if it is already
1229 * signalled. Since it is already signalled it is totally fine to not wait on it.
1230 *
1231 * kernel patch: https://patchwork.freedesktop.org/patch/465583/ */
1232 uint64_t point;
1233 ret = amdgpu_cs_syncobj_query2(ctx->ws->dev, &sem_info->wait.syncobj[i + sem_info->wait.syncobj_count],
1234 &point, 1, 0);
1235 if (!ret && point >= sem_info->wait.points[i])
1236 continue;
1237
1238 close(fd);
1239 return VK_ERROR_DEVICE_LOST;
1240 }
1241
1242 sync_accumulate("radv", &fd, fd2);
1243 close(fd2);
1244 }
1245 ret = amdgpu_cs_syncobj_import_sync_file(ctx->ws->dev, queue_syncobj, fd);
1246 close(fd);
1247 if (ret < 0)
1248 return VK_ERROR_DEVICE_LOST;
1249
1250 ctx->queue_syncobj_wait[hw_ip][queue_idx] = true;
1251 }
1252
1253 for (unsigned i = 0; i < sem_info->signal.syncobj_count; ++i) {
1254 uint32_t dst_handle = sem_info->signal.syncobj[i];
1255 uint32_t src_handle = queue_syncobj;
1256
1257 if (ctx->ws->info.has_timeline_syncobj) {
1258 ret = amdgpu_cs_syncobj_transfer(ctx->ws->dev, dst_handle, 0, src_handle, 0, 0);
1259 if (ret < 0)
1260 return VK_ERROR_DEVICE_LOST;
1261 } else {
1262 int fd;
1263 ret = amdgpu_cs_syncobj_export_sync_file(ctx->ws->dev, src_handle, &fd);
1264 if (ret < 0)
1265 return VK_ERROR_DEVICE_LOST;
1266
1267 ret = amdgpu_cs_syncobj_import_sync_file(ctx->ws->dev, dst_handle, fd);
1268 close(fd);
1269 if (ret < 0)
1270 return VK_ERROR_DEVICE_LOST;
1271 }
1272 }
1273 for (unsigned i = 0; i < sem_info->signal.timeline_syncobj_count; ++i) {
1274 ret = amdgpu_cs_syncobj_transfer(ctx->ws->dev, sem_info->signal.syncobj[i + sem_info->signal.syncobj_count],
1275 sem_info->signal.points[i], queue_syncobj, 0, 0);
1276 if (ret < 0)
1277 return VK_ERROR_DEVICE_LOST;
1278 }
1279 return VK_SUCCESS;
1280 }
1281
1282 static VkResult
1283 radv_amdgpu_winsys_cs_submit(struct radeon_winsys_ctx *_ctx, const struct radv_winsys_submit_info *submit,
1284 uint32_t wait_count, const struct vk_sync_wait *waits, uint32_t signal_count,
1285 const struct vk_sync_signal *signals)
1286 {
1287 struct radv_amdgpu_ctx *ctx = radv_amdgpu_ctx(_ctx);
1288 struct radv_amdgpu_winsys *ws = ctx->ws;
1289 VkResult result;
1290 unsigned wait_idx = 0, signal_idx = 0;
1291
1292 STACK_ARRAY(uint64_t, wait_points, wait_count);
1293 STACK_ARRAY(uint32_t, wait_syncobj, wait_count);
1294 STACK_ARRAY(uint64_t, signal_points, signal_count);
1295 STACK_ARRAY(uint32_t, signal_syncobj, signal_count);
1296
1297 if (!wait_points || !wait_syncobj || !signal_points || !signal_syncobj) {
1298 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1299 goto out;
1300 }
1301
1302 for (uint32_t i = 0; i < wait_count; ++i) {
1303 if (waits[i].sync->type == &vk_sync_dummy_type)
1304 continue;
1305
1306 assert(waits[i].sync->type == &ws->syncobj_sync_type);
1307 wait_syncobj[wait_idx] = ((struct vk_drm_syncobj *)waits[i].sync)->syncobj;
1308 wait_points[wait_idx] = waits[i].wait_value;
1309 ++wait_idx;
1310 }
1311
1312 for (uint32_t i = 0; i < signal_count; ++i) {
1313 if (signals[i].sync->type == &vk_sync_dummy_type)
1314 continue;
1315
1316 assert(signals[i].sync->type == &ws->syncobj_sync_type);
1317 signal_syncobj[signal_idx] = ((struct vk_drm_syncobj *)signals[i].sync)->syncobj;
1318 signal_points[signal_idx] = signals[i].signal_value;
1319 ++signal_idx;
1320 }
1321
1322 assert(signal_idx <= signal_count);
1323 assert(wait_idx <= wait_count);
1324
1325 const uint32_t wait_timeline_syncobj_count =
1326 (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? wait_idx : 0;
1327 const uint32_t signal_timeline_syncobj_count =
1328 (ws->syncobj_sync_type.features & VK_SYNC_FEATURE_TIMELINE) ? signal_idx : 0;
1329
1330 struct radv_winsys_sem_info sem_info = {
1331 .wait =
1332 {
1333 .points = wait_points,
1334 .syncobj = wait_syncobj,
1335 .timeline_syncobj_count = wait_timeline_syncobj_count,
1336 .syncobj_count = wait_idx - wait_timeline_syncobj_count,
1337 },
1338 .signal =
1339 {
1340 .points = signal_points,
1341 .syncobj = signal_syncobj,
1342 .timeline_syncobj_count = signal_timeline_syncobj_count,
1343 .syncobj_count = signal_idx - signal_timeline_syncobj_count,
1344 },
1345 .cs_emit_wait = true,
1346 .cs_emit_signal = true,
1347 };
1348
1349 if (!submit->cs_count) {
1350 result = radv_amdgpu_cs_submit_zero(ctx, submit->ip_type, submit->queue_index, &sem_info);
1351 } else {
1352 result = radv_amdgpu_winsys_cs_submit_internal(
1353 ctx, submit->queue_index, &sem_info, submit->cs_array, submit->cs_count, submit->initial_preamble_cs,
1354 submit->initial_preamble_count, submit->continue_preamble_cs, submit->continue_preamble_count,
1355 submit->postamble_cs, submit->postamble_count, submit->uses_shadow_regs);
1356 }
1357
1358 out:
1359 STACK_ARRAY_FINISH(wait_points);
1360 STACK_ARRAY_FINISH(wait_syncobj);
1361 STACK_ARRAY_FINISH(signal_points);
1362 STACK_ARRAY_FINISH(signal_syncobj);
1363 return result;
1364 }
1365
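/* Resolve a GPU address to a CPU pointer for IB parsing: search this CS's own IBs
 * first, then the global BO list.
 */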
1366 static void
1367 radv_amdgpu_winsys_get_cpu_addr(void *_cs, uint64_t addr, struct ac_addr_info *info)
1368 {
1369 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1370
1371 memset(info, 0, sizeof(struct ac_addr_info));
1372
1373 if (cs->ws->debug_log_bos) {
1374 u_rwlock_rdlock(&cs->ws->log_bo_list_lock);
1375 list_for_each_entry_rev (struct radv_amdgpu_winsys_bo_log, bo_log, &cs->ws->log_bo_list, list) {
1376 if (addr >= bo_log->va && addr - bo_log->va < bo_log->size) {
1377 info->use_after_free = bo_log->destroyed;
1378 break;
1379 }
1380 }
1381 u_rwlock_rdunlock(&cs->ws->log_bo_list_lock);
1382 }
1383
1384 if (info->use_after_free)
1385 return;
1386
1387 info->valid = !cs->ws->debug_all_bos;
1388
1389 for (unsigned i = 0; i < cs->num_ib_buffers; ++i) {
1390 struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1391 struct radv_amdgpu_winsys_bo *bo = (struct radv_amdgpu_winsys_bo *)ib->bo;
1392
1393 if (addr >= bo->base.va && addr - bo->base.va < bo->base.size) {
1394 void *map = radv_buffer_map(&cs->ws->base, &bo->base);
1395 if (map) {
1396 info->cpu_addr = (char *)map + (addr - bo->base.va);
1397 info->valid = true;
1398 return;
1399 }
1400 }
1401 }
1402 u_rwlock_rdlock(&cs->ws->global_bo_list.lock);
1403 for (uint32_t i = 0; i < cs->ws->global_bo_list.count; i++) {
1404 struct radv_amdgpu_winsys_bo *bo = cs->ws->global_bo_list.bos[i];
1405 if (addr >= bo->base.va && addr - bo->base.va < bo->base.size) {
1406 void *map = radv_buffer_map(&cs->ws->base, &bo->base);
1407 if (map) {
1408 u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1409 info->valid = true;
1410 info->cpu_addr = (char *)map + (addr - bo->base.va);
1411 return;
1412 }
1413 }
1414 }
1415 u_rwlock_rdunlock(&cs->ws->global_bo_list.lock);
1416
1417 return;
1418 }
1419
1420 static void
1421 radv_amdgpu_winsys_cs_dump(struct radeon_cmdbuf *_cs, FILE *file, const int *trace_ids, int trace_id_count,
1422 enum radv_cs_dump_type type)
1423 {
1424 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1425 struct radv_amdgpu_winsys *ws = cs->ws;
1426
1427 if (cs->use_ib && !radv_amdgpu_cs_has_external_ib(cs)) {
1428 struct radv_amdgpu_cs_ib_info ib_info = radv_amdgpu_cs_ib_to_info(cs, cs->ib_buffers[0]);
1429
1430 struct ac_addr_info addr_info;
1431 radv_amdgpu_winsys_get_cpu_addr(cs, ib_info.ib_mc_address, &addr_info);
1432 assert(addr_info.cpu_addr);
1433
1434 if (type == RADV_CS_DUMP_TYPE_IBS) {
1435 struct ac_ib_parser ib_parser = {
1436 .f = file,
1437 .ib = addr_info.cpu_addr,
1438 .num_dw = cs->ib_buffers[0].cdw,
1439 .trace_ids = trace_ids,
1440 .trace_id_count = trace_id_count,
1441 .gfx_level = ws->info.gfx_level,
1442 .family = ws->info.family,
1443 .ip_type = cs->hw_ip,
1444 .addr_callback = radv_amdgpu_winsys_get_cpu_addr,
1445 .addr_callback_data = cs,
1446 .annotations = cs->annotations,
1447 };
1448
1449 ac_parse_ib(&ib_parser, "main IB");
1450 } else {
1451 uint32_t *ib_dw = addr_info.cpu_addr;
1452 ac_gather_context_rolls(file, &ib_dw, &cs->ib_buffers[0].cdw, 1, cs->annotations, &ws->info);
1453 }
1454 } else {
1455 uint32_t **ibs = type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t *)) : NULL;
1456 uint32_t *ib_dw_sizes =
1457 type == RADV_CS_DUMP_TYPE_CTX_ROLLS ? malloc(cs->num_ib_buffers * sizeof(uint32_t)) : NULL;
1458
1459 for (unsigned i = 0; i < cs->num_ib_buffers; i++) {
1460 struct radv_amdgpu_ib *ib = &cs->ib_buffers[i];
1461 char name[64];
1462 void *mapped;
1463
1464 if (!ib->bo) {
1465 fprintf(file, "Chunk %d isn't owned by this CS.\n\n", i);
1466 continue;
1467 }
1468
1469 mapped = radv_buffer_map(&ws->base, ib->bo);
1470 if (!mapped)
1471 continue;
1472
1473 if (cs->num_ib_buffers > 1) {
1474 snprintf(name, sizeof(name), "main IB (chunk %d)", i);
1475 } else {
1476 snprintf(name, sizeof(name), "main IB");
1477 }
1478
1479 if (type == RADV_CS_DUMP_TYPE_IBS) {
1480 struct ac_ib_parser ib_parser = {
1481 .f = file,
1482 .ib = mapped,
1483 .num_dw = ib->cdw,
1484 .trace_ids = trace_ids,
1485 .trace_id_count = trace_id_count,
1486 .gfx_level = ws->info.gfx_level,
1487 .family = ws->info.family,
1488 .ip_type = cs->hw_ip,
1489 .addr_callback = radv_amdgpu_winsys_get_cpu_addr,
1490 .addr_callback_data = cs,
1491 .annotations = cs->annotations,
1492 };
1493
1494 ac_parse_ib(&ib_parser, name);
1495 } else {
1496 ibs[i] = mapped;
1497 ib_dw_sizes[i] = ib->cdw;
1498 }
1499 }
1500
1501 if (type == RADV_CS_DUMP_TYPE_CTX_ROLLS) {
1502 ac_gather_context_rolls(file, ibs, ib_dw_sizes, cs->num_ib_buffers, cs->annotations, &ws->info);
1503
1504 free(ibs);
1505 free(ib_dw_sizes);
1506 }
1507 }
1508 }
1509
1510 static void
1511 radv_amdgpu_winsys_cs_annotate(struct radeon_cmdbuf *_cs, const char *annotation)
1512 {
1513 struct radv_amdgpu_cs *cs = (struct radv_amdgpu_cs *)_cs;
1514
1515 if (!cs->annotations) {
1516 cs->annotations = _mesa_pointer_hash_table_create(NULL);
1517 if (!cs->annotations)
1518 return;
1519 }
1520
1521 struct hash_entry *entry = _mesa_hash_table_search(cs->annotations, _cs->buf + _cs->cdw);
1522 if (entry) {
1523 char *old_annotation = entry->data;
1524 char *new_annotation = calloc(strlen(old_annotation) + strlen(annotation) + 5, 1);
1525 sprintf(new_annotation, "%s -> %s", old_annotation, annotation);
1526 free(old_annotation);
1527 _mesa_hash_table_insert(cs->annotations, _cs->buf + _cs->cdw, new_annotation);
1528 } else {
1529 _mesa_hash_table_insert(cs->annotations, _cs->buf + _cs->cdw, strdup(annotation));
1530 }
1531 }
1532
1533 static uint32_t
1534 radv_to_amdgpu_priority(enum radeon_ctx_priority radv_priority)
1535 {
1536 switch (radv_priority) {
1537 case RADEON_CTX_PRIORITY_REALTIME:
1538 return AMDGPU_CTX_PRIORITY_VERY_HIGH;
1539 case RADEON_CTX_PRIORITY_HIGH:
1540 return AMDGPU_CTX_PRIORITY_HIGH;
1541 case RADEON_CTX_PRIORITY_MEDIUM:
1542 return AMDGPU_CTX_PRIORITY_NORMAL;
1543 case RADEON_CTX_PRIORITY_LOW:
1544 return AMDGPU_CTX_PRIORITY_LOW;
1545 default:
1546 unreachable("Invalid context priority");
1547 }
1548 }
1549
1550 static VkResult
1551 radv_amdgpu_ctx_create(struct radeon_winsys *_ws, enum radeon_ctx_priority priority, struct radeon_winsys_ctx **rctx)
1552 {
1553 struct radv_amdgpu_winsys *ws = radv_amdgpu_winsys(_ws);
1554 struct radv_amdgpu_ctx *ctx = CALLOC_STRUCT(radv_amdgpu_ctx);
1555 uint32_t amdgpu_priority = radv_to_amdgpu_priority(priority);
1556 VkResult result;
1557 int r;
1558
1559 if (!ctx)
1560 return VK_ERROR_OUT_OF_HOST_MEMORY;
1561
1562 r = amdgpu_cs_ctx_create2(ws->dev, amdgpu_priority, &ctx->ctx);
1563 if (r && r == -EACCES) {
1564 result = VK_ERROR_NOT_PERMITTED_KHR;
1565 goto fail_create;
1566 } else if (r) {
1567 fprintf(stderr, "radv/amdgpu: radv_amdgpu_cs_ctx_create2 failed. (%i)\n", r);
1568 result = VK_ERROR_OUT_OF_HOST_MEMORY;
1569 goto fail_create;
1570 }
1571 ctx->ws = ws;
1572
1573 assert(AMDGPU_HW_IP_NUM * MAX_RINGS_PER_TYPE * 4 * sizeof(uint64_t) <= 4096);
1574 result = ws->base.buffer_create(&ws->base, 4096, 8, RADEON_DOMAIN_GTT,
1575 RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_CS, 0,
1576 &ctx->fence_bo);
1577 if (result != VK_SUCCESS) {
1578 goto fail_alloc;
1579 }
1580
1581 *rctx = (struct radeon_winsys_ctx *)ctx;
1582 return VK_SUCCESS;
1583
1584 fail_alloc:
1585 amdgpu_cs_ctx_free(ctx->ctx);
1586 fail_create:
1587 FREE(ctx);
1588 return result;
1589 }
1590
1591 static void
1592 radv_amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
1593 {
1594 struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
1595
1596 for (unsigned ip = 0; ip <= AMDGPU_HW_IP_NUM; ++ip) {
1597 for (unsigned ring = 0; ring < MAX_RINGS_PER_TYPE; ++ring) {
1598 if (ctx->queue_syncobj[ip][ring])
1599 amdgpu_cs_destroy_syncobj(ctx->ws->dev, ctx->queue_syncobj[ip][ring]);
1600 }
1601 }
1602
1603 ctx->ws->base.buffer_destroy(&ctx->ws->base, ctx->fence_bo);
1604 amdgpu_cs_ctx_free(ctx->ctx);
1605 FREE(ctx);
1606 }
1607
1608 static uint32_t
radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx * ctx,unsigned ip,unsigned ring)1609 radv_amdgpu_ctx_queue_syncobj(struct radv_amdgpu_ctx *ctx, unsigned ip, unsigned ring)
1610 {
1611 uint32_t *syncobj = &ctx->queue_syncobj[ip][ring];
1612 if (!*syncobj) {
1613 amdgpu_cs_create_syncobj2(ctx->ws->dev, DRM_SYNCOBJ_CREATE_SIGNALED, syncobj);
1614 }
1615 return *syncobj;
1616 }
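
/* radv_amdgpu_ctx_queue_syncobj() lazily creates one DRM syncobj per (IP, ring) queue, created
 * already signaled so the first wait on it does not stall. In radv_amdgpu_cs_submit() below it is
 * appended to the signal chunk and, when queue_syncobj_wait is set, to the wait chunk, which lets
 * the winsys order a submission against earlier work on the same queue.
 */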

static bool
radv_amdgpu_ctx_wait_idle(struct radeon_winsys_ctx *rwctx, enum amd_ip_type ip_type, int ring_index)
{
   struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;

   if (ctx->last_submission[ip_type][ring_index].fence.fence) {
      uint32_t expired;
      int ret =
         amdgpu_cs_query_fence_status(&ctx->last_submission[ip_type][ring_index].fence, 1000000000ull, 0, &expired);

      if (ret || !expired)
         return false;
   }

   return true;
}
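
/* Note: radv_amdgpu_ctx_wait_idle() only checks the last submission recorded for the given queue,
 * with a fixed 1 second timeout; it returns false if that fence has not signaled in time (or if
 * querying it fails) and true when there is nothing left to wait for.
 */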

static uint32_t
radv_to_amdgpu_pstate(enum radeon_ctx_pstate radv_pstate)
{
   switch (radv_pstate) {
   case RADEON_CTX_PSTATE_NONE:
      return AMDGPU_CTX_STABLE_PSTATE_NONE;
   case RADEON_CTX_PSTATE_STANDARD:
      return AMDGPU_CTX_STABLE_PSTATE_STANDARD;
   case RADEON_CTX_PSTATE_MIN_SCLK:
      return AMDGPU_CTX_STABLE_PSTATE_MIN_SCLK;
   case RADEON_CTX_PSTATE_MIN_MCLK:
      return AMDGPU_CTX_STABLE_PSTATE_MIN_MCLK;
   case RADEON_CTX_PSTATE_PEAK:
      return AMDGPU_CTX_STABLE_PSTATE_PEAK;
   default:
      unreachable("Invalid pstate");
   }
}

static int
radv_amdgpu_ctx_set_pstate(struct radeon_winsys_ctx *rwctx, enum radeon_ctx_pstate pstate)
{
   struct radv_amdgpu_ctx *ctx = (struct radv_amdgpu_ctx *)rwctx;
   uint32_t new_pstate = radv_to_amdgpu_pstate(pstate);
   uint32_t current_pstate = 0;
   int r;

   r = amdgpu_cs_ctx_stable_pstate(ctx->ctx, AMDGPU_CTX_OP_GET_STABLE_PSTATE, 0, &current_pstate);
   if (r) {
      fprintf(stderr, "radv/amdgpu: failed to get current pstate\n");
      return r;
   }

   /* Do not try to set a new pstate when the current one is already what we want. Otherwise, the
    * kernel might return -EBUSY if we have multiple AMDGPU contexts in flight.
    */
   if (current_pstate == new_pstate)
      return 0;

   r = amdgpu_cs_ctx_stable_pstate(ctx->ctx, AMDGPU_CTX_OP_SET_STABLE_PSTATE, new_pstate, NULL);
   if (r) {
      fprintf(stderr, "radv/amdgpu: failed to set new pstate\n");
      return r;
   }

   return 0;
}
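
/* Usage sketch (not code from this file): a caller can pin the GPU to a stable power state so
 * profiling results are reproducible, then restore dynamic clock management afterwards:
 *
 *    ws->base.ctx_set_pstate(rwctx, RADEON_CTX_PSTATE_PEAK);   // pin to peak stable clocks
 *    ... capture / replay the workload ...
 *    ws->base.ctx_set_pstate(rwctx, RADEON_CTX_PSTATE_NONE);   // back to default DPM behaviour
 */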

static void *
radv_amdgpu_cs_alloc_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
                                   struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
{
   unsigned count = counts->syncobj_count + (queue_syncobj ? 1 : 0);
   struct drm_amdgpu_cs_chunk_sem *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_sem) * count);
   if (!syncobj)
      return NULL;

   for (unsigned i = 0; i < counts->syncobj_count; i++) {
      struct drm_amdgpu_cs_chunk_sem *sem = &syncobj[i];
      sem->handle = counts->syncobj[i];
   }

   if (queue_syncobj)
      syncobj[counts->syncobj_count].handle = queue_syncobj;

   chunk->chunk_id = chunk_id;
   chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_sem) / 4 * count;
   chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
   return syncobj;
}
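
/* The resulting chunk payload is a flat array of drm_amdgpu_cs_chunk_sem entries, with length_dw
 * giving its size in dwords as the CS ioctl expects. For example (hypothetical handles), two wait
 * syncobjs plus the queue syncobj end up as:
 *
 *    { .handle = syncobj[0] }, { .handle = syncobj[1] }, { .handle = queue_syncobj }
 */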

static void *
radv_amdgpu_cs_alloc_timeline_syncobj_chunk(struct radv_winsys_sem_counts *counts, uint32_t queue_syncobj,
                                            struct drm_amdgpu_cs_chunk *chunk, int chunk_id)
{
   uint32_t count = counts->syncobj_count + counts->timeline_syncobj_count + (queue_syncobj ? 1 : 0);
   struct drm_amdgpu_cs_chunk_syncobj *syncobj = malloc(sizeof(struct drm_amdgpu_cs_chunk_syncobj) * count);
   if (!syncobj)
      return NULL;

   for (unsigned i = 0; i < counts->syncobj_count; i++) {
      struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i];
      sem->handle = counts->syncobj[i];
      sem->flags = 0;
      sem->point = 0;
   }

   for (unsigned i = 0; i < counts->timeline_syncobj_count; i++) {
      struct drm_amdgpu_cs_chunk_syncobj *sem = &syncobj[i + counts->syncobj_count];
      sem->handle = counts->syncobj[i + counts->syncobj_count];
      sem->flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT;
      sem->point = counts->points[i];
   }

   if (queue_syncobj) {
      syncobj[count - 1].handle = queue_syncobj;
      syncobj[count - 1].flags = 0;
      syncobj[count - 1].point = 0;
   }

   chunk->chunk_id = chunk_id;
   chunk->length_dw = sizeof(struct drm_amdgpu_cs_chunk_syncobj) / 4 * count;
   chunk->chunk_data = (uint64_t)(uintptr_t)syncobj;
   return syncobj;
}
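
/* Layout of the timeline variant: counts->syncobj[] holds the binary syncobj handles first and the
 * timeline syncobj handles after them, so the chunk payload comes out as
 *
 *    [binary syncobjs, point = 0]
 *    [timeline syncobjs, point = points[i], flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT]
 *    [optional queue syncobj, point = 0]
 *
 * WAIT_FOR_SUBMIT asks the kernel to wait for a timeline point that has not materialized yet
 * instead of rejecting the submission outright.
 */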

static bool
radv_amdgpu_cs_has_user_fence(struct radv_amdgpu_cs_request *request)
{
   return request->ip_type != AMDGPU_HW_IP_UVD && request->ip_type != AMDGPU_HW_IP_VCE &&
          request->ip_type != AMDGPU_HW_IP_UVD_ENC && request->ip_type != AMDGPU_HW_IP_VCN_DEC &&
          request->ip_type != AMDGPU_HW_IP_VCN_ENC && request->ip_type != AMDGPU_HW_IP_VCN_JPEG;
}
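
/* The multimedia engines (UVD/VCE/VCN/JPEG) do not support the AMDGPU_CHUNK_ID_FENCE user-fence
 * write, so radv_amdgpu_cs_submit() only appends that chunk for the remaining IP types
 * (GFX, compute, SDMA, ...).
 */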

static VkResult
radv_amdgpu_cs_submit(struct radv_amdgpu_ctx *ctx, struct radv_amdgpu_cs_request *request,
                      struct radv_winsys_sem_info *sem_info)
{
   int r;
   int num_chunks;
   int size;
   struct drm_amdgpu_cs_chunk *chunks;
   struct drm_amdgpu_cs_chunk_data *chunk_data;
   struct drm_amdgpu_bo_list_in bo_list_in;
   void *wait_syncobj = NULL, *signal_syncobj = NULL;
   int i;
   VkResult result = VK_SUCCESS;
   bool has_user_fence = radv_amdgpu_cs_has_user_fence(request);
   uint32_t queue_syncobj = radv_amdgpu_ctx_queue_syncobj(ctx, request->ip_type, request->ring);
   bool *queue_syncobj_wait = &ctx->queue_syncobj_wait[request->ip_type][request->ring];

   if (!queue_syncobj)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   size = request->number_of_ibs + 1 + (has_user_fence ? 1 : 0) + 1 /* bo list */ + 3;

   chunks = malloc(sizeof(chunks[0]) * size);
   if (!chunks)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   size = request->number_of_ibs + (has_user_fence ? 1 : 0);

   chunk_data = malloc(sizeof(chunk_data[0]) * size);
   if (!chunk_data) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto error_out;
   }

   num_chunks = request->number_of_ibs;
   for (i = 0; i < request->number_of_ibs; i++) {
      struct radv_amdgpu_cs_ib_info *ib;
      chunks[i].chunk_id = AMDGPU_CHUNK_ID_IB;
      chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
      chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];

      ib = &request->ibs[i];
      assert(ib->ib_mc_address && ib->ib_mc_address % ctx->ws->info.ip[ib->ip_type].ib_alignment == 0);
      assert(ib->size);

      chunk_data[i].ib_data._pad = 0;
      chunk_data[i].ib_data.va_start = ib->ib_mc_address;
      chunk_data[i].ib_data.ib_bytes = ib->size * 4;
      chunk_data[i].ib_data.ip_type = ib->ip_type;
      chunk_data[i].ib_data.ip_instance = request->ip_instance;
      chunk_data[i].ib_data.ring = request->ring;
      chunk_data[i].ib_data.flags = ib->flags;
   }

   assert(chunk_data[request->number_of_ibs - 1].ib_data.ip_type == request->ip_type);

   if (has_user_fence) {
      i = num_chunks++;
      chunks[i].chunk_id = AMDGPU_CHUNK_ID_FENCE;
      chunks[i].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
      chunks[i].chunk_data = (uint64_t)(uintptr_t)&chunk_data[i];

      struct amdgpu_cs_fence_info fence_info;
      fence_info.handle = radv_amdgpu_winsys_bo(ctx->fence_bo)->bo;
      /* Need to reserve 4 QWORD for user fence:
       * QWORD[0]: completed fence
       * QWORD[1]: preempted fence
       * QWORD[2]: reset fence
       * QWORD[3]: preempted then reset
       */
      fence_info.offset = (request->ip_type * MAX_RINGS_PER_TYPE + request->ring) * 4;
      amdgpu_cs_chunk_fence_info_to_data(&fence_info, &chunk_data[i]);
   }

   if (sem_info->cs_emit_wait &&
       (sem_info->wait.timeline_syncobj_count || sem_info->wait.syncobj_count || *queue_syncobj_wait)) {

      if (ctx->ws->info.has_timeline_syncobj) {
         wait_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
                                                                    AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_WAIT);
      } else {
         wait_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->wait, queue_syncobj, &chunks[num_chunks],
                                                           AMDGPU_CHUNK_ID_SYNCOBJ_IN);
      }
      if (!wait_syncobj) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto error_out;
      }
      num_chunks++;

      sem_info->cs_emit_wait = false;
      *queue_syncobj_wait = false;
   }

   if (sem_info->cs_emit_signal) {
      if (ctx->ws->info.has_timeline_syncobj) {
         signal_syncobj = radv_amdgpu_cs_alloc_timeline_syncobj_chunk(
            &sem_info->signal, queue_syncobj, &chunks[num_chunks], AMDGPU_CHUNK_ID_SYNCOBJ_TIMELINE_SIGNAL);
      } else {
         signal_syncobj = radv_amdgpu_cs_alloc_syncobj_chunk(&sem_info->signal, queue_syncobj, &chunks[num_chunks],
                                                             AMDGPU_CHUNK_ID_SYNCOBJ_OUT);
      }
      if (!signal_syncobj) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto error_out;
      }
      num_chunks++;
   }

   bo_list_in.operation = ~0;
   bo_list_in.list_handle = ~0;
   bo_list_in.bo_number = request->num_handles;
   bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
   bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)request->handles;

   chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
   chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
   chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
   num_chunks++;
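
   /* At this point the chunk array contains, in order: one AMDGPU_CHUNK_ID_IB per IB, optionally
    * AMDGPU_CHUNK_ID_FENCE, optionally a syncobj wait chunk, optionally a syncobj signal chunk,
    * and finally AMDGPU_CHUNK_ID_BO_HANDLES describing every BO the submission may touch.
    */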

   /* With many processes submitting in parallel while using GDS (test suites are the typical
    * case, e.g. dEQP with NGG streamout), the kernel quite often returns -ENOMEM, but the
    * submission eventually succeeds after enough attempts, so retry for a while before giving up.
    */
   uint64_t abs_timeout_ns = os_time_get_absolute_timeout(1000000000ull); /* 1s */

   r = 0;
   do {
      /* Wait 1 ms and try again. */
      if (r == -ENOMEM)
         os_time_sleep(1000);

      r = amdgpu_cs_submit_raw2(ctx->ws->dev, ctx->ctx, 0, num_chunks, chunks, &request->seq_no);
   } while (r == -ENOMEM && os_time_get_nano() < abs_timeout_ns);

   if (r) {
      if (r == -ENOMEM) {
         fprintf(stderr, "radv/amdgpu: Not enough memory for command submission.\n");
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
      } else if (r == -ECANCELED) {
         fprintf(stderr,
                 "radv/amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
         result = VK_ERROR_DEVICE_LOST;
      } else if (r == -ENODATA) {
         fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
                         "of a soft recovery.\n");
         result = VK_ERROR_DEVICE_LOST;
      } else if (r == -ETIME) {
         fprintf(stderr, "radv/amdgpu: The CS has been cancelled because the context is lost. This context is guilty "
                         "of a hard recovery.\n");
         result = VK_ERROR_DEVICE_LOST;
      } else {
         fprintf(stderr,
                 "radv/amdgpu: The CS has been rejected, "
                 "see dmesg for more information (%i).\n",
                 r);
         result = VK_ERROR_UNKNOWN;
      }
   }

error_out:
   free(chunks);
   free(chunk_data);
   free(wait_syncobj);
   free(signal_syncobj);
   return result;
}

void
radv_amdgpu_cs_init_functions(struct radv_amdgpu_winsys *ws)
{
   ws->base.ctx_create = radv_amdgpu_ctx_create;
   ws->base.ctx_destroy = radv_amdgpu_ctx_destroy;
   ws->base.ctx_wait_idle = radv_amdgpu_ctx_wait_idle;
   ws->base.ctx_set_pstate = radv_amdgpu_ctx_set_pstate;
   ws->base.cs_domain = radv_amdgpu_cs_domain;
   ws->base.cs_create = radv_amdgpu_cs_create;
   ws->base.cs_destroy = radv_amdgpu_cs_destroy;
   ws->base.cs_grow = radv_amdgpu_cs_grow;
   ws->base.cs_finalize = radv_amdgpu_cs_finalize;
   ws->base.cs_reset = radv_amdgpu_cs_reset;
   ws->base.cs_chain = radv_amdgpu_cs_chain;
   ws->base.cs_unchain = radv_amdgpu_cs_unchain;
   ws->base.cs_add_buffer = radv_amdgpu_cs_add_buffer;
   ws->base.cs_execute_secondary = radv_amdgpu_cs_execute_secondary;
   ws->base.cs_execute_ib = radv_amdgpu_cs_execute_ib;
   ws->base.cs_submit = radv_amdgpu_winsys_cs_submit;
   ws->base.cs_dump = radv_amdgpu_winsys_cs_dump;
   ws->base.cs_annotate = radv_amdgpu_winsys_cs_annotate;
   ws->base.cs_pad = radv_amdgpu_winsys_cs_pad;
}