1 /*
2 * Copyright © 2008 Jérôme Glisse
3 * Copyright © 2010 Marek Olšák <[email protected]>
4 * Copyright © 2015 Advanced Micro Devices, Inc.
5 *
6 * SPDX-License-Identifier: MIT
7 */
8
9 #include "amdgpu_cs.h"
10 #include "util/detect_os.h"
11 #include "util/os_time.h"
12 #include <inttypes.h>
13 #include <stdio.h>
14
15 #include "amd/common/sid.h"
16
17 /* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
18 * codes in the kernel).
19 */
20 #if DETECT_OS_OPENBSD
21 #define ENODATA ENOTSUP
22 #elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
23 #define ENODATA ECONNREFUSED
24 #endif
25
26 /* FENCES */
27
28 void amdgpu_fence_destroy(struct amdgpu_fence *fence)
29 {
30 amdgpu_cs_destroy_syncobj(fence->aws->dev, fence->syncobj);
31
32 if (fence->ctx)
33 amdgpu_ctx_reference(&fence->ctx, NULL);
34
35 util_queue_fence_destroy(&fence->submitted);
36 FREE(fence);
37 }
38
39 static struct pipe_fence_handle *
40 amdgpu_fence_create(struct amdgpu_cs *cs)
41 {
42 struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
43 struct amdgpu_ctx *ctx = cs->ctx;
44
45 fence->reference.count = 1;
46 fence->aws = ctx->aws;
47 amdgpu_ctx_reference(&fence->ctx, ctx);
48 fence->ctx = ctx;
49 fence->ip_type = cs->ip_type;
50 if (amdgpu_cs_create_syncobj2(ctx->aws->dev, 0, &fence->syncobj)) {
51 FREE(fence);
52 return NULL;
53 }
54
55 util_queue_fence_init(&fence->submitted);
56 util_queue_fence_reset(&fence->submitted);
57 fence->queue_index = cs->queue_index;
58 return (struct pipe_fence_handle *)fence;
59 }
60
61 static struct pipe_fence_handle *
62 amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
63 {
64 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
65 struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
66 int r;
67
68 if (!fence)
69 return NULL;
70
71 pipe_reference_init(&fence->reference, 1);
72 fence->aws = aws;
73 fence->ip_type = 0xffffffff;
74
75 r = amdgpu_cs_import_syncobj(aws->dev, fd, &fence->syncobj);
76 if (r) {
77 FREE(fence);
78 return NULL;
79 }
80
81 util_queue_fence_init(&fence->submitted);
82 fence->imported = true;
83
84 return (struct pipe_fence_handle*)fence;
85 }
86
87 static struct pipe_fence_handle *
88 amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
89 {
90 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
91 struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
92
93 if (!fence)
94 return NULL;
95
96 pipe_reference_init(&fence->reference, 1);
97 fence->aws = aws;
98 /* fence->ctx == NULL means that the fence is syncobj-based. */
99
100 /* Convert sync_file into syncobj. */
101 int r = amdgpu_cs_create_syncobj(aws->dev, &fence->syncobj);
102 if (r) {
103 FREE(fence);
104 return NULL;
105 }
106
107 r = amdgpu_cs_syncobj_import_sync_file(aws->dev, fence->syncobj, fd);
108 if (r) {
109 amdgpu_cs_destroy_syncobj(aws->dev, fence->syncobj);
110 FREE(fence);
111 return NULL;
112 }
113
114 util_queue_fence_init(&fence->submitted);
115 fence->imported = true;
116
117 return (struct pipe_fence_handle*)fence;
118 }
119
120 static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
121 struct pipe_fence_handle *pfence)
122 {
123 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
124 struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
125 int fd, r;
126
127 util_queue_fence_wait(&fence->submitted);
128
129 /* Convert syncobj into sync_file. */
130 r = amdgpu_cs_syncobj_export_sync_file(aws->dev, fence->syncobj, &fd);
131 return r ? -1 : fd;
132 }
133
134 static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
135 {
136 struct amdgpu_winsys *aws = amdgpu_winsys(rws);
137 uint32_t syncobj;
138 int fd = -1;
139
140 int r = amdgpu_cs_create_syncobj2(aws->dev, DRM_SYNCOBJ_CREATE_SIGNALED,
141 &syncobj);
142 if (r) {
143 return -1;
144 }
145
146 r = amdgpu_cs_syncobj_export_sync_file(aws->dev, syncobj, &fd);
147 if (r) {
148 fd = -1;
149 }
150
151 amdgpu_cs_destroy_syncobj(aws->dev, syncobj);
152 return fd;
153 }
154
155 static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
156 uint64_t seq_no,
157 uint64_t *user_fence_cpu_address)
158 {
159 struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
160
161 afence->seq_no = seq_no;
162 afence->user_fence_cpu_address = user_fence_cpu_address;
163 util_queue_fence_signal(&afence->submitted);
164 }
165
166 static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
167 {
168 struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
169
170 afence->signalled = true;
171 util_queue_fence_signal(&afence->submitted);
172 }
173
174 bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
175 bool absolute)
176 {
177 struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
178 int64_t abs_timeout;
179 uint64_t *user_fence_cpu;
180
181 if (afence->signalled)
182 return true;
183
184 if (absolute)
185 abs_timeout = timeout;
186 else
187 abs_timeout = os_time_get_absolute_timeout(timeout);
188
189 /* The fence might not have a number assigned if its IB is being
190 * submitted in the other thread right now. Wait until the submission
191 * is done. */
192 if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
193 return false;
194
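/* Fast path: the user fence is a sequence-number value that the GPU writes to CPU-visible
 * memory when the IB completes. If it has already reached this fence's number, the fence
 * has signalled and no syncobj ioctl is needed.
 */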
195 user_fence_cpu = afence->user_fence_cpu_address;
196 if (user_fence_cpu) {
197 if (*user_fence_cpu >= afence->seq_no) {
198 afence->signalled = true;
199 return true;
200 }
201
202 /* No timeout, just query: no need for the ioctl. */
203 if (!absolute && !timeout)
204 return false;
205 }
206
207 if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE)
208 abs_timeout = INT64_MAX;
209
210 if (amdgpu_cs_syncobj_wait(afence->aws->dev, &afence->syncobj, 1,
211 abs_timeout, 0, NULL))
213 return false;
214
215 afence->signalled = true;
216 return true;
217 }
218
219 static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
220 struct pipe_fence_handle *fence,
221 uint64_t timeout)
222 {
223 return amdgpu_fence_wait(fence, timeout, false);
224 }
225
226 static struct pipe_fence_handle *
227 amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
228 {
229 struct amdgpu_cs *cs = amdgpu_cs(rcs);
230 struct pipe_fence_handle *fence = NULL;
231
232 if (cs->noop)
233 return NULL;
234
235 if (cs->next_fence) {
236 amdgpu_fence_reference(&fence, cs->next_fence);
237 return fence;
238 }
239
240 fence = amdgpu_fence_create(cs);
241 if (!fence)
242 return NULL;
243
244 amdgpu_fence_reference(&cs->next_fence, fence);
245 return fence;
246 }
247
248 /* CONTEXTS */
249
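/* Map winsys context priorities onto the amdgpu kernel priorities. Note that
 * RADEON_CTX_PRIORITY_REALTIME is mapped down to AMDGPU_CTX_PRIORITY_VERY_HIGH here rather
 * than to the kernel's realtime priority.
 */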
250 static uint32_t
251 radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority)
252 {
253 switch (radeon_priority) {
254 case RADEON_CTX_PRIORITY_REALTIME:
255 return AMDGPU_CTX_PRIORITY_VERY_HIGH;
256 case RADEON_CTX_PRIORITY_HIGH:
257 return AMDGPU_CTX_PRIORITY_HIGH;
258 case RADEON_CTX_PRIORITY_MEDIUM:
259 return AMDGPU_CTX_PRIORITY_NORMAL;
260 case RADEON_CTX_PRIORITY_LOW:
261 return AMDGPU_CTX_PRIORITY_LOW;
262 default:
263 unreachable("Invalid context priority");
264 }
265 }
266
267 static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *rws,
268 enum radeon_ctx_priority priority,
269 bool allow_context_lost)
270 {
271 struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
272 int r;
273 struct amdgpu_bo_alloc_request alloc_buffer = {};
274 uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority);
275 amdgpu_bo_handle buf_handle;
276
277 if (!ctx)
278 return NULL;
279
280 ctx->aws = amdgpu_winsys(rws);
281 ctx->reference.count = 1;
282 ctx->allow_context_lost = allow_context_lost;
283
284 r = amdgpu_cs_ctx_create2(ctx->aws->dev, amdgpu_priority, &ctx->ctx);
285 if (r) {
286 fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
287 goto error_create;
288 }
289
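/* Allocate one GART page to hold the user fence values. amdgpu_cs_create() assigns each
 * IP type its own slot in this buffer via fence_info.offset, so fences of different IPs
 * don't overwrite each other.
 */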
290 alloc_buffer.alloc_size = ctx->aws->info.gart_page_size;
291 alloc_buffer.phys_alignment = ctx->aws->info.gart_page_size;
292 alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;
293
294 r = amdgpu_bo_alloc(ctx->aws->dev, &alloc_buffer, &buf_handle);
295 if (r) {
296 fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
297 goto error_user_fence_alloc;
298 }
299
300 r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
301 if (r) {
302 fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
303 goto error_user_fence_map;
304 }
305
306 memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
307 ctx->user_fence_bo = buf_handle;
308
309 return (struct radeon_winsys_ctx*)ctx;
310
311 error_user_fence_map:
312 amdgpu_bo_free(buf_handle);
313 error_user_fence_alloc:
314 amdgpu_cs_ctx_free(ctx->ctx);
315 error_create:
316 FREE(ctx);
317 return NULL;
318 }
319
320 static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
321 {
322 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
323
324 amdgpu_ctx_reference(&ctx, NULL);
325 }
326
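/* Pad the IB so that (*num_dw + leave_dw_space) becomes a multiple of the IP's IB alignment
 * (pad_dw_mask + 1 dwords). Worked example: with pad_dw_mask = 7 and *num_dw = 21 (no space
 * left over), unaligned_dw = 5 and remaining = 3, so a single PKT3_NOP header with count = 1
 * followed by 2 body dwords pads the IB to 24 dwords.
 */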
327 static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *aws, enum amd_ip_type ip_type,
328 uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
329 {
330 unsigned pad_dw_mask = aws->info.ip[ip_type].ib_pad_dw_mask;
331 unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;
332
333 if (unaligned_dw) {
334 int remaining = pad_dw_mask + 1 - unaligned_dw;
335
336 /* Only pad by 1 dword with the type-2 NOP if necessary. */
337 if (remaining == 1 && aws->info.gfx_ib_pad_with_type2) {
338 ib[(*num_dw)++] = PKT2_NOP_PAD;
339 } else {
340 /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
341 * packet. The size of the packet body after the header is always count + 1.
342 * If count == -1, there is no packet body. NOP is the only packet that can have
343 * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
344 */
345 ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
346 *num_dw += remaining - 1;
347 }
348 }
349 assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
350 }
351
352 static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
353 {
354 struct amdgpu_bo_alloc_request request = {0};
355 struct drm_amdgpu_bo_list_in bo_list_in = {0};
356 struct drm_amdgpu_cs_chunk_ib ib_in = {0};
357 amdgpu_bo_handle buf_handle;
358 amdgpu_va_handle va_handle = NULL;
359 struct drm_amdgpu_cs_chunk chunks[2];
360 struct drm_amdgpu_bo_list_entry list;
361 unsigned noop_dw_size;
362 void *cpu = NULL;
363 uint64_t seq_no;
364 uint64_t va;
365 int r;
366
367 /* Older amdgpu doesn't report if the reset is complete or not. Detect
368 * it by submitting a no-op job. If it reports an error, then assume
369 * that the reset is not complete.
370 */
371 amdgpu_context_handle temp_ctx;
372 r = amdgpu_cs_ctx_create2(ctx->aws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx);
373 if (r)
374 return r;
375
376 request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
377 request.alloc_size = 4096;
378 request.phys_alignment = 4096;
379 r = amdgpu_bo_alloc(ctx->aws->dev, &request, &buf_handle);
380 if (r)
381 goto destroy_ctx;
382
383 r = amdgpu_va_range_alloc(ctx->aws->dev, amdgpu_gpu_va_range_general,
384 request.alloc_size, request.phys_alignment,
385 0, &va, &va_handle,
386 AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH);
387 if (r)
388 goto destroy_bo;
389 r = amdgpu_bo_va_op_raw(ctx->aws->dev, buf_handle, 0, request.alloc_size, va,
390 AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
391 AMDGPU_VA_OP_MAP);
392 if (r)
393 goto destroy_bo;
394
395 r = amdgpu_bo_cpu_map(buf_handle, &cpu);
396 if (r)
397 goto destroy_bo;
398
399 noop_dw_size = ctx->aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
400 ((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
401
402 amdgpu_bo_cpu_unmap(buf_handle);
403
404 amdgpu_bo_export(buf_handle, amdgpu_bo_handle_type_kms, &list.bo_handle);
405 list.bo_priority = 0;
406
407 bo_list_in.list_handle = ~0;
408 bo_list_in.bo_number = 1;
409 bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
410 bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list;
411
412 ib_in.ip_type = AMD_IP_GFX;
413 ib_in.ib_bytes = noop_dw_size * 4;
414 ib_in.va_start = va;
415
416 chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
417 chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
418 chunks[0].chunk_data = (uintptr_t)&bo_list_in;
419
420 chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
421 chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
422 chunks[1].chunk_data = (uintptr_t)&ib_in;
423
424 r = amdgpu_cs_submit_raw2(ctx->aws->dev, temp_ctx, 0, 2, chunks, &seq_no);
425
426 destroy_bo:
427 if (va_handle)
428 amdgpu_va_range_free(va_handle);
429 amdgpu_bo_free(buf_handle);
430 destroy_ctx:
431 amdgpu_cs_ctx_free(temp_ctx);
432
433 return r;
434 }
435
436 static void
437 amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
438 const char *format, ...)
439 {
440 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
441
442 /* Don't overwrite the last reset status. */
443 if (ctx->sw_status != PIPE_NO_RESET)
444 return;
445
446 ctx->sw_status = status;
447
448 if (!ctx->allow_context_lost) {
449 va_list args;
450
451 va_start(args, format);
452 vfprintf(stderr, format, args);
453 va_end(args);
454
455 /* Non-robust contexts are allowed to terminate the process. The only alternative is
456 * to skip command submission, which would look like a freeze because nothing is drawn,
457 * which looks like a hang without any reset.
458 */
459 abort();
460 }
461 }
462
463 static enum pipe_reset_status
464 amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
465 bool *needs_reset, bool *reset_completed)
466 {
467 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
468
469 if (needs_reset)
470 *needs_reset = false;
471 if (reset_completed)
472 *reset_completed = false;
473
474 /* Return a failure due to a GPU hang. */
475 uint64_t flags;
476
477 if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
478 /* If the caller is only interested in full reset (= wants to ignore soft
479 * recoveries), we can use the per-context SW reset status as a quick first check.
480 */
481 return PIPE_NO_RESET;
482 }
483
484 /*
485 * ctx->sw_status is updated on alloc/ioctl failures.
486 *
487 * We only rely on amdgpu_cs_query_reset_state2 to tell us
488 * that the context reset is complete.
489 */
490 if (ctx->sw_status != PIPE_NO_RESET) {
491 int r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags);
492 if (!r) {
493 if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
494 if (reset_completed) {
495 /* The ARB_robustness spec says:
496 *
497 * If a reset status other than NO_ERROR is returned and subsequent
498 * calls return NO_ERROR, the context reset was encountered and
499 * completed. If a reset status is repeatedly returned, the context may
500 * be in the process of resetting.
501 *
502 * Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
503 * so don't do anything special. On older kernels, submit a no-op cs. If it
504 * succeeds then assume the reset is complete.
505 */
506 if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
507 *reset_completed = true;
508
509 if (ctx->aws->info.drm_minor < 54 && ctx->aws->info.has_graphics)
510 *reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
511 }
512 }
513 } else {
514 fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
515 }
516
517 /* Return a failure due to SW issues. */
518 if (needs_reset)
519 *needs_reset = true;
520 return ctx->sw_status;
521 }
522
523 if (needs_reset)
524 *needs_reset = false;
525 return PIPE_NO_RESET;
526 }
527
528 /* COMMAND SUBMISSION */
529
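/* Only the GFX, compute and SDMA IPs provide a user fence (a GPU-written value in memory);
 * fences of the other IPs are tracked purely through syncobjs.
 */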
530 static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs)
531 {
532 return acs->ip_type == AMD_IP_GFX ||
533 acs->ip_type == AMD_IP_COMPUTE ||
534 acs->ip_type == AMD_IP_SDMA;
535 }
536
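/* Number of dwords reserved at the end of each IB. With chaining, 4 dwords are kept free for
 * the PKT3_INDIRECT_BUFFER packet that links to the next IB: header, VA low, VA high, and the
 * size dword that amdgpu_set_ib_size patches later (see amdgpu_cs_check_space).
 */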
537 static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
538 {
539 if (cs->has_chaining)
540 return 4; /* for chaining */
541
542 return 0;
543 }
544
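/* Look up a buffer in a CS buffer list using a small direct-mapped hash of the BO's unique_id.
 * A slot value of -1 means "empty"; stored indices are masked with 0x7fff so they remain
 * non-negative in the (presumably 16-bit) hash entries.
 */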
545 static struct amdgpu_cs_buffer *
546 amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
547 struct amdgpu_buffer_list *list)
548 {
549 int num_buffers = list->num_buffers;
550 struct amdgpu_cs_buffer *buffers = list->buffers;
551 unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
552 int i = cs->buffer_indices_hashlist[hash];
553
554 /* Not found if the slot is empty (negative index); found if the hashed slot points at this BO. */
555 if (i < 0)
556 return NULL;
557
558 if (i < num_buffers && buffers[i].bo == bo)
559 return &buffers[i];
560
561 /* Hash collision, look for the BO in the list of buffers linearly. */
562 for (int i = num_buffers - 1; i >= 0; i--) {
563 if (buffers[i].bo == bo) {
564 /* Put this buffer in the hash list.
565 * This will prevent additional hash collisions if there are
566 * several consecutive lookup_buffer calls for the same buffer.
567 *
568 * Example: Assuming buffers A,B,C collide in the hash list,
569 * the following sequence of buffers:
570 * AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
571 * will collide here: ^ and here: ^,
572 * meaning that we should get very few collisions in the end. */
573 cs->buffer_indices_hashlist[hash] = i & 0x7fff;
574 return &buffers[i];
575 }
576 }
577 return NULL;
578 }
579
580 struct amdgpu_cs_buffer *
581 amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
582 {
583 return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]);
584 }
585
586 static struct amdgpu_cs_buffer *
587 amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
588 struct amdgpu_buffer_list *list, bool add_ref)
589 {
590 /* New buffer, check if the backing array is large enough. */
591 if (unlikely(list->num_buffers >= list->max_buffers)) {
592 unsigned new_max =
593 MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3));
594 struct amdgpu_cs_buffer *new_buffers;
595
596 new_buffers = (struct amdgpu_cs_buffer *)
597 REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers),
598 new_max * sizeof(*new_buffers));
599 if (!new_buffers) {
600 fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
601 return NULL;
602 }
603
604 list->max_buffers = new_max;
605 list->buffers = new_buffers;
606 }
607
608 unsigned idx = list->num_buffers++;
609 struct amdgpu_cs_buffer *buffer = &list->buffers[idx];
610 if (add_ref)
611 p_atomic_inc(&bo->base.reference.count);
612 buffer->bo = bo;
613 buffer->usage = 0;
614
615 unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
616 cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
617 return buffer;
618 }
619
620 static struct amdgpu_cs_buffer *
621 amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
622 struct amdgpu_buffer_list *list, bool add_ref)
623 {
624 struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list);
625
626 return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref);
627 }
628
629 static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
630 struct pb_buffer_lean *buf,
631 unsigned usage,
632 enum radeon_bo_domain domains)
633 {
634 /* Don't use the "domains" parameter. Amdgpu doesn't support changing
635 * the buffer placement during command submission.
636 */
637 struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc;
638 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
639 struct amdgpu_cs_buffer *buffer;
640
641 /* Fast exit for no-op calls.
642 * This is very effective with suballocators and linear uploaders that
643 * are outside of the winsys.
644 */
645 if (bo == cs->last_added_bo &&
646 (usage & cs->last_added_bo_usage) == usage)
647 return 0;
648
649 buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true);
650 if (!buffer)
651 return 0;
652
653 buffer->usage |= usage;
654
655 cs->last_added_bo_usage = buffer->usage;
656 cs->last_added_bo = bo;
657 return 0;
658 }
659
660 static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *aws,
661 struct amdgpu_ib *main_ib,
662 struct amdgpu_cs *cs)
663 {
664 struct pb_buffer_lean *pb;
665 uint8_t *mapped;
666 unsigned buffer_size;
667
668 /* Always create a buffer that is at least as large as the maximum seen IB size,
669 * aligned to a power of two.
670 */
671 buffer_size = util_next_power_of_two(main_ib->max_ib_bytes);
672
673 /* Multiply by 4 to reduce internal fragmentation if chaining is not available. */
674 if (!cs->has_chaining)
675 buffer_size *= 4;
676
677 const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024);
678 /* This is the maximum size that fits into the INDIRECT_BUFFER packet. */
679 const unsigned max_size = 2 * 1024 * 1024;
680
681 buffer_size = MIN2(buffer_size, max_size);
682 buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */
683
684 /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
685 * The speed of writing to GTT WC is somewhere between no difference and very slow, while
686 * VRAM is very slow much more often.
687 *
688 * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
689 * and doesn't have to wait for cached GL2 requests to be processed.
690 */
691 enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
692 unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
693 RADEON_FLAG_GL2_BYPASS;
694
695 if (cs->ip_type == AMD_IP_GFX ||
696 cs->ip_type == AMD_IP_COMPUTE ||
697 cs->ip_type == AMD_IP_SDMA) {
698 /* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor
699 * on Navi 14
700 */
701 flags |= RADEON_FLAG_32BIT;
702 }
703
704 pb = amdgpu_bo_create(aws, buffer_size,
705 aws->info.gart_page_size,
706 domain, (radeon_bo_flag)flags);
707 if (!pb)
708 return false;
709
710 mapped = (uint8_t*)amdgpu_bo_map(&aws->dummy_sws.base, pb, NULL, PIPE_MAP_WRITE);
711 if (!mapped) {
712 radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
713 return false;
714 }
715
716 radeon_bo_reference(&aws->dummy_sws.base, &main_ib->big_buffer, pb);
717 radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
718
719 main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer);
720 main_ib->big_buffer_cpu_ptr = mapped;
721 main_ib->used_ib_space = 0;
722
723 return true;
724 }
725
726 static bool amdgpu_get_new_ib(struct amdgpu_winsys *aws,
727 struct radeon_cmdbuf *rcs,
728 struct amdgpu_ib *main_ib,
729 struct amdgpu_cs *cs)
730 {
731 struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN];
732 /* This is the minimum size of a contiguous IB. */
733 unsigned ib_size = 16 * 1024;
734
735 /* Always allocate at least the size of the biggest cs_check_space call,
736 * because precisely the last call might have requested this size.
737 */
738 ib_size = MAX2(ib_size, main_ib->max_check_space_size);
739
740 if (!cs->has_chaining) {
741 ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes),
742 IB_MAX_SUBMIT_BYTES));
743 }
744
745 /* Decay the IB buffer size over time, so that memory usage decreases after
746 * a temporary peak (max_ib_bytes shrinks by 1/32, i.e. ~3%, per new IB).
747 */
748 main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32;
749
750 rcs->prev_dw = 0;
751 rcs->num_prev = 0;
752 rcs->current.cdw = 0;
753 rcs->current.buf = NULL;
754
755 /* Allocate a new buffer for IBs if the current buffer is all used. */
756 if (!main_ib->big_buffer ||
757 main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) {
758 if (!amdgpu_ib_new_buffer(aws, main_ib, cs))
759 return false;
760 }
761
762 chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space;
763 chunk_ib->ib_bytes = 0;
764 /* ib_bytes is in dwords and the conversion to bytes will be done before
765 * the CS ioctl. */
766 main_ib->ptr_ib_size = &chunk_ib->ib_bytes;
767 main_ib->is_chained_ib = false;
768
769 amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
770 (radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB),
771 (radeon_bo_domain)0);
772
773 rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
774
775 cs->csc->ib_main_addr = rcs->current.buf;
776
777 ib_size = main_ib->big_buffer->size - main_ib->used_ib_space;
778 rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
779 return true;
780 }
781
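/* Patch the final IB size in dwords either into the CS ioctl chunk (ib_bytes) or, for a
 * chained IB, into the last dword of the previous IB's INDIRECT_BUFFER packet together with
 * the chain/valid/preamble-enable bits.
 */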
782 static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
783 {
784 if (ib->is_chained_ib) {
785 *ib->ptr_ib_size = rcs->current.cdw |
786 S_3F2_CHAIN(1) | S_3F2_VALID(1) |
787 S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL);
788 } else {
789 *ib->ptr_ib_size = rcs->current.cdw;
790 }
791 }
792
793 static void amdgpu_ib_finalize(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs,
794 struct amdgpu_ib *ib, enum amd_ip_type ip_type)
795 {
796 amdgpu_set_ib_size(rcs, ib);
797 ib->used_ib_space += rcs->current.cdw * 4;
798 ib->used_ib_space = align(ib->used_ib_space, aws->info.ip[ip_type].ib_alignment);
799 ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4);
800 }
801
802 static bool amdgpu_init_cs_context(struct amdgpu_winsys *aws,
803 struct amdgpu_cs_context *cs,
804 enum amd_ip_type ip_type)
805 {
806 for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) {
807 cs->chunk_ib[i].ip_type = ip_type;
808 cs->chunk_ib[i].flags = 0;
809
810 if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
811 /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation
812 * is the beginning of IBs because completion of an IB doesn't care about the state of
813 * GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be
814 * executed in parallel, so draw calls from the current IB can finish after the next IB
815 * starts drawing, and so the cache flush at the end of IBs is usually late and thus
816 * useless.
817 */
818 cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
819 }
820 }
821
822 cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
823 cs->last_added_bo = NULL;
824 return true;
825 }
826
827 static void cleanup_fence_list(struct amdgpu_fence_list *fences)
828 {
829 for (unsigned i = 0; i < fences->num; i++)
830 amdgpu_fence_drop_reference(fences->list[i]);
831 fences->num = 0;
832 }
833
834 static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
835 {
836 for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
837 struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers;
838 unsigned num_buffers = cs->buffer_lists[i].num_buffers;
839
840 for (unsigned j = 0; j < num_buffers; j++)
841 amdgpu_winsys_bo_drop_reference(aws, buffers[j].bo);
842
843 cs->buffer_lists[i].num_buffers = 0;
844 }
845 }
846
847 static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
848 {
849 cs->seq_no_dependencies.valid_fence_mask = 0;
850 cleanup_fence_list(&cs->syncobj_dependencies);
851 cleanup_fence_list(&cs->syncobj_to_signal);
852 amdgpu_fence_reference(&cs->fence, NULL);
853 cs->last_added_bo = NULL;
854 }
855
856 static void amdgpu_destroy_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
857 {
858 amdgpu_cs_context_cleanup_buffers(aws, cs);
859 amdgpu_cs_context_cleanup(aws, cs);
860 for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++)
861 FREE(cs->buffer_lists[i].buffers);
862 FREE(cs->syncobj_dependencies.list);
863 FREE(cs->syncobj_to_signal.list);
864 }
865
866
867 static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
868 {
869 struct amdgpu_cs *cs = amdgpu_cs(rcs);
870 return cs->ip_type;
871 }
872
873 static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
874 {
875 /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
876 return ip_type == AMD_IP_VCN_DEC ||
877 ip_type == AMD_IP_VCN_ENC ||
878 ip_type == AMD_IP_VCN_JPEG;
879 }
880
881 static bool
882 amdgpu_cs_create(struct radeon_cmdbuf *rcs,
883 struct radeon_winsys_ctx *rwctx,
884 enum amd_ip_type ip_type,
885 void (*flush)(void *ctx, unsigned flags,
886 struct pipe_fence_handle **fence),
887 void *flush_ctx)
888 {
889 struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
890 struct amdgpu_cs *cs;
891
892 cs = CALLOC_STRUCT(amdgpu_cs);
893 if (!cs) {
894 return false;
895 }
896
897 util_queue_fence_init(&cs->flush_completed);
898
899 cs->aws = ctx->aws;
900 cs->ctx = ctx;
901 cs->flush_cs = flush;
902 cs->flush_data = flush_ctx;
903 cs->ip_type = ip_type;
904 cs->noop = ctx->aws->noop_cs;
905 cs->has_chaining = ctx->aws->info.gfx_level >= GFX7 &&
906 (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);
907
908 /* Compute the queue index by counting the IPs that have queues. */
909 assert(ip_type < ARRAY_SIZE(ctx->aws->info.ip));
910 assert(ctx->aws->info.ip[ip_type].num_queues);
911
912 if (ip_uses_alt_fence(ip_type)) {
913 cs->queue_index = INT_MAX;
914 cs->uses_alt_fence = true;
915 } else {
916 cs->queue_index = 0;
917
918 for (unsigned i = 0; i < ARRAY_SIZE(ctx->aws->info.ip); i++) {
919 if (!ctx->aws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i))
920 continue;
921
922 if (i == ip_type)
923 break;
924
925 cs->queue_index++;
926 }
927 assert(cs->queue_index < AMDGPU_MAX_QUEUES);
928 }
929
930 struct amdgpu_cs_fence_info fence_info;
931 fence_info.handle = cs->ctx->user_fence_bo;
932 fence_info.offset = cs->ip_type * 4;
933 amdgpu_cs_chunk_fence_info_to_data(&fence_info,
934 (struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk);
935
936 if (!amdgpu_init_cs_context(ctx->aws, &cs->csc1, ip_type)) {
937 FREE(cs);
938 return false;
939 }
940
941 if (!amdgpu_init_cs_context(ctx->aws, &cs->csc2, ip_type)) {
942 amdgpu_destroy_cs_context(ctx->aws, &cs->csc1);
943 FREE(cs);
944 return false;
945 }
946
947 memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
948
949 /* Set the first submission context as current. */
950 rcs->csc = cs->csc = &cs->csc1;
951 cs->cst = &cs->csc2;
952
953 /* Assign to both amdgpu_cs_context; only csc will use it. */
954 cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
955 cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;
956
957 cs->csc1.aws = ctx->aws;
958 cs->csc2.aws = ctx->aws;
959
960 rcs->priv = cs;
961
962 if (!amdgpu_get_new_ib(ctx->aws, rcs, &cs->main_ib, cs)) {
963 amdgpu_destroy_cs_context(ctx->aws, &cs->csc2);
964 amdgpu_destroy_cs_context(ctx->aws, &cs->csc1);
965 FREE(cs);
966 rcs->priv = NULL;
967 return false;
968 }
969
970 p_atomic_inc(&ctx->aws->num_cs);
971 return true;
972 }
973
974 static bool
975 amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
976 unsigned preamble_num_dw)
977 {
978 struct amdgpu_cs *cs = amdgpu_cs(rcs);
979 struct amdgpu_winsys *aws = cs->aws;
980 struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
981 unsigned size = align(preamble_num_dw * 4, aws->info.ip[AMD_IP_GFX].ib_alignment);
982 struct pb_buffer_lean *preamble_bo;
983 uint32_t *map;
984
985 /* Create the preamble IB buffer. */
986 preamble_bo = amdgpu_bo_create(aws, size, aws->info.ip[AMD_IP_GFX].ib_alignment,
987 RADEON_DOMAIN_VRAM,
988 (radeon_bo_flag)
989 (RADEON_FLAG_NO_INTERPROCESS_SHARING |
990 RADEON_FLAG_GTT_WC));
991 if (!preamble_bo)
992 return false;
993
994 map = (uint32_t*)amdgpu_bo_map(&aws->dummy_sws.base, preamble_bo, NULL,
995 (pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY));
996 if (!map) {
997 radeon_bo_reference(&aws->dummy_sws.base, &preamble_bo, NULL);
998 return false;
999 }
1000
1001 /* Upload the preamble IB. */
1002 memcpy(map, preamble_ib, preamble_num_dw * 4);
1003
1004 /* Pad the IB. */
1005 amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, map, &preamble_num_dw, 0);
1006 amdgpu_bo_unmap(&aws->dummy_sws.base, preamble_bo);
1007
1008 for (unsigned i = 0; i < 2; i++) {
1009 csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo);
1010 csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;
1011
1012 csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
1013 }
1014
1015 assert(!cs->preamble_ib_bo);
1016 cs->preamble_ib_bo = preamble_bo;
1017
1018 amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
1019 RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1020 return true;
1021 }
1022
1023 static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
1024 {
1025 return true;
1026 }
1027
1028 static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
1029 {
1030 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1031 struct amdgpu_ib *main_ib = &cs->main_ib;
1032
1033 assert(rcs->current.cdw <= rcs->current.max_dw);
1034
1035 unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw;
1036
1037 if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES)
1038 return false;
1039
1040 if (rcs->current.max_dw - rcs->current.cdw >= dw)
1041 return true;
1042
1043 unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
1044 unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
1045 /* 125% of the size for IB epilog. */
1046 unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
1047 main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size);
1048 main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4);
1049
1050 if (!cs->has_chaining)
1051 return false;
1052
1053 /* Allocate a new chunk */
1054 if (rcs->num_prev >= rcs->max_prev) {
1055 unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
1056 struct radeon_cmdbuf_chunk *new_prev;
1057
1058 new_prev = (struct radeon_cmdbuf_chunk*)
1059 REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev,
1060 sizeof(*new_prev) * new_max_prev);
1061 if (!new_prev)
1062 return false;
1063
1064 rcs->prev = new_prev;
1065 rcs->max_prev = new_max_prev;
1066 }
1067
1068 if (!amdgpu_ib_new_buffer(cs->aws, main_ib, cs))
1069 return false;
1070
1071 assert(main_ib->used_ib_space == 0);
1072 uint64_t va = main_ib->gpu_address;
1073
1074 /* This space was originally reserved. */
1075 rcs->current.max_dw += cs_epilog_dw;
1076
1077 /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
1078 amdgpu_pad_gfx_compute_ib(cs->aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);
1079
1080 radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
1081 radeon_emit(rcs, va);
1082 radeon_emit(rcs, va >> 32);
1083 uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];
1084
1085 assert((rcs->current.cdw & cs->aws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0);
1086 assert(rcs->current.cdw <= rcs->current.max_dw);
1087
1088 amdgpu_set_ib_size(rcs, main_ib);
1089 main_ib->ptr_ib_size = new_ptr_ib_size;
1090 main_ib->is_chained_ib = true;
1091
1092 /* Hook up the new chunk */
1093 rcs->prev[rcs->num_prev].buf = rcs->current.buf;
1094 rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
1095 rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
1096 rcs->num_prev++;
1097
1098 rcs->prev_dw += rcs->current.cdw;
1099 rcs->current.cdw = 0;
1100
1101 rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
1102 rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw;
1103
1104 amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
1105 RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1106
1107 return true;
1108 }
1109
1110 static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs)
1111 {
1112 unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1113 struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1114
1115 for (unsigned i = 0; i < num_buffers; i++) {
1116 struct amdgpu_cs_buffer *slab_buffer = &buffers[i];
1117 struct amdgpu_cs_buffer *real_buffer =
1118 amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b,
1119 &cs->buffer_lists[AMDGPU_BO_REAL], true);
1120
1121 /* We need to set the usage because it determines the BO priority.
1122 *
1123 * Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its
1124 * BO fences to fence dependencies. Only the slab entries should do that.
1125 */
1126 real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
1127 }
1128 }
1129
1130 static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
1131 struct radeon_bo_list_item *list)
1132 {
1133 struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;
1134
1135 /* We do this in the CS thread, but since we need to return the final usage of all buffers
1136 * here, do it here too. There is no harm in doing it again in the CS thread.
1137 */
1138 amdgpu_add_slab_backing_buffers(cs);
1139
1140 struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL];
1141 unsigned num_real_buffers = real_buffers->num_buffers;
1142
1143 if (list) {
1144 for (unsigned i = 0; i < num_real_buffers; i++) {
1145 list[i].bo_size = real_buffers->buffers[i].bo->base.size;
1146 list[i].vm_address =
1147 amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
1148 list[i].priority_usage = real_buffers->buffers[i].usage;
1149 }
1150 }
1151 return num_real_buffers;
1152 }
1153
1154 static void add_fence_to_list(struct amdgpu_fence_list *fences,
1155 struct amdgpu_fence *fence)
1156 {
1157 unsigned idx = fences->num++;
1158
1159 if (idx >= fences->max) {
1160 unsigned size;
1161 const unsigned increment = 8;
1162
1163 fences->max = idx + increment;
1164 size = fences->max * sizeof(fences->list[0]);
1165 fences->list = (struct pipe_fence_handle**)realloc(fences->list, size);
1166 }
1167 amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
1168 }
1169
1170 static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
1171 struct pipe_fence_handle *pfence)
1172 {
1173 struct amdgpu_cs *acs = amdgpu_cs(rcs);
1174 struct amdgpu_cs_context *cs = acs->csc;
1175 struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
1176
1177 util_queue_fence_wait(&fence->submitted);
1178
1179 if (!fence->imported) {
1180 /* Ignore idle fences. This will only check the user fence in memory. */
1181 if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) {
1182 add_seq_no_to_list(acs->aws, &cs->seq_no_dependencies, fence->queue_index,
1183 fence->queue_seq_no);
1184 }
1185 }
1186 else
1187 add_fence_to_list(&cs->syncobj_dependencies, fence);
1188 }
1189
1190 static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
1191 struct amdgpu_cs_context *cs,
1192 unsigned queue_index_bit,
1193 struct amdgpu_seq_no_fences *dependencies,
1194 struct amdgpu_winsys_bo *bo, unsigned usage)
1195 {
1196 if (usage & RADEON_USAGE_SYNCHRONIZED) {
1197 /* Add BO fences from queues other than 'queue_index' to dependencies. */
1198 u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
1199 add_seq_no_to_list(ws, dependencies, other_queue_idx,
1200 bo->fences.seq_no[other_queue_idx]);
1201 }
1202
1203 if (bo->alt_fence)
1204 add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
1205 }
1206 }
1207
1208 static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo,
1209 uint_seq_no new_queue_seq_no)
1210 {
1211 bo->fences.seq_no[queue_index] = new_queue_seq_no;
1212 bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
1213 }
1214
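/* Convert a winsys buffer into a kernel BO list entry. The priority is derived from the
 * highest RADEON_PRIO_* bit set in the usage and compressed into the kernel's smaller
 * priority range.
 */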
1215 static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry,
1216 struct amdgpu_winsys_bo *bo, unsigned usage)
1217 {
1218 bo_entry->bo_handle = get_real_bo(bo)->kms_handle;
1219 bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2;
1220 }
1221
1222 static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
1223 struct pipe_fence_handle *fence)
1224 {
1225 struct amdgpu_cs *acs = amdgpu_cs(rws);
1226 struct amdgpu_cs_context *cs = acs->csc;
1227
1228 add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
1229 }
1230
1231 /* The template parameter determines whether the queue should skip code used by the default queue
1232 * system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
1233 * for all BOs.
1234 */
1235 template<bool QUEUE_USES_ALT_FENCE>
1236 static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
1237 {
1238 struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
1239 struct amdgpu_winsys *aws = acs->aws;
1240 struct amdgpu_cs_context *cs = acs->cst;
1241 int r;
1242 uint64_t seq_no = 0;
1243 bool has_user_fence = amdgpu_cs_has_user_fence(acs);
1244
1245 assert(QUEUE_USES_ALT_FENCE == acs->uses_alt_fence);
1246
1247 simple_mtx_lock(&aws->bo_fence_lock);
1248 unsigned queue_index;
1249 struct amdgpu_queue *queue;
1250 uint_seq_no prev_seq_no, next_seq_no;
1251
1252 if (!QUEUE_USES_ALT_FENCE) {
1253 queue_index = acs->queue_index;
1254 queue = &aws->queues[queue_index];
1255 prev_seq_no = queue->latest_seq_no;
1256
1257 /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
1258 * but the values aren't related.
1259 */
1260 next_seq_no = prev_seq_no + 1;
1261
1262 /* Wait for the oldest fence to signal. This should always check the user fence, then wait
1263 * via the ioctl. We have to do this because we are going to release the oldest fence and
1264 * replace it with the latest fence in the ring.
1265 */
1266 struct pipe_fence_handle **oldest_fence =
1267 &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
1268
1269 if (*oldest_fence) {
1270 if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
1271 /* Take the reference because the fence can be released by other threads after we
1272 * unlock the mutex.
1273 */
1274 struct pipe_fence_handle *tmp_fence = NULL;
1275 amdgpu_fence_reference(&tmp_fence, *oldest_fence);
1276
1277 /* Unlock the mutex before waiting. */
1278 simple_mtx_unlock(&aws->bo_fence_lock);
1279 amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
1280 amdgpu_fence_reference(&tmp_fence, NULL);
1281 simple_mtx_lock(&aws->bo_fence_lock);
1282 }
1283
1284 /* Remove the idle fence from the ring. */
1285 amdgpu_fence_reference(oldest_fence, NULL);
1286 }
1287 }
1288
1289 /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
1290 * sequence number per queue and removes all older ones.
1291 */
1292 struct amdgpu_seq_no_fences seq_no_dependencies;
1293 memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies));
1294
1295 if (!QUEUE_USES_ALT_FENCE) {
1296 /* Add a fence dependency on the previous IB if the IP has multiple physical queues to
1297 * make it appear as if it had only 1 queue, or if the previous IB comes from a different
1298 * context. The reasons are:
1299 * - Our BO fence tracking only supports 1 queue per IP.
1300 * - IBs from different contexts must wait for each other and can't execute in a random order.
1301 */
1302 struct amdgpu_fence *prev_fence =
1303 (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
1304
1305 if (prev_fence && (aws->info.ip[acs->ip_type].num_queues > 1 || queue->last_ctx != acs->ctx))
1306 add_seq_no_to_list(aws, &seq_no_dependencies, queue_index, prev_seq_no);
1307 }
1308
1309 /* Since the kernel driver doesn't synchronize execution between different
1310 * rings automatically, we have to add fence dependencies manually. This gathers sequence
1311 * numbers from BOs and sets the next sequence number in the BOs.
1312 */
1313
1314 /* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1315 struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1316 unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1317 unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1318 unsigned queue_index_bit = QUEUE_USES_ALT_FENCE ? 0 : BITFIELD_BIT(queue_index);
1319
1320 for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
1321 struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
1322 struct amdgpu_winsys_bo *bo = buffer->bo;
1323
1324 amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1325 buffer->usage);
1326 if (QUEUE_USES_ALT_FENCE)
1327 amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1328 else
1329 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1330
1331 /* We didn't add any slab entries into the real buffer list that will be submitted
1332 * to the kernel. Do it now.
1333 */
1334 struct amdgpu_cs_buffer *real_buffer =
1335 amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b,
1336 &cs->buffer_lists[AMDGPU_BO_REAL], false);
1337
1338 /* We need to set the usage because it determines the BO priority. */
1339 real_buffer->usage |= buffer->usage;
1340 }
1341
1342 /* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1343 unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1344 struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers;
1345 unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers;
1346 bool out_of_memory = false;
1347
1348 for (unsigned i = 0; i < num_sparse_buffers; i++) {
1349 struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
1350 struct amdgpu_winsys_bo *bo = buffer->bo;
1351
1352 amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1353 buffer->usage);
1354 if (QUEUE_USES_ALT_FENCE)
1355 amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1356 else
1357 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1358
1359 /* Add backing buffers of sparse buffers to the buffer list.
1360 *
1361 * This is done late, during submission, to keep the buffer list short before
1362 * submit, and to avoid managing fences for the backing buffers.
1363 */
1364 struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo);
1365
1366 simple_mtx_lock(&sparse_bo->commit_lock);
1367 list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) {
1368 /* We can directly add the buffer here, because we know that each
1369 * backing buffer occurs only once.
1370 */
1371 struct amdgpu_cs_buffer *real_buffer =
1372 amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true);
1373 if (!real_buffer) {
1374 fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__);
1375 r = -ENOMEM;
1376 out_of_memory = true;
1377 break; /* don't dereference the NULL buffer; the lock is released after the loop */
1378 }
1379
1380 real_buffer->usage = buffer->usage;
1381 }
1382 simple_mtx_unlock(&sparse_bo->commit_lock);
1383 }
1384
1385 /* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */
1386 unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1387 struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers;
1388 struct drm_amdgpu_bo_list_entry *bo_list =
1389 (struct drm_amdgpu_bo_list_entry *)
1390 alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1391 unsigned i;
1392
1393 for (i = 0; i < initial_num_real_buffers; i++) {
1394 struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1395 struct amdgpu_winsys_bo *bo = buffer->bo;
1396
1397 amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1398 buffer->usage);
1399 if (QUEUE_USES_ALT_FENCE)
1400 amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1401 else
1402 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1403
1404 amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1405 }
1406
1407 /* These are backing buffers of slab entries. Don't add their fence dependencies. */
1408 for (; i < num_real_buffers_except_sparse; i++) {
1409 struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1410 struct amdgpu_winsys_bo *bo = buffer->bo;
1411
1412 if (QUEUE_USES_ALT_FENCE)
1413 get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
1414 else
1415 amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1416
1417 amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1418 }
1419
1420 /* Sparse backing BOs are last. Don't update their fences because we don't use them. */
1421 for (; i < num_real_buffers; ++i) {
1422 struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1423
1424 amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage);
1425 }
1426
1427 #if 0 /* Debug code. */
1428 printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
1429
1430 /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
1431 for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
1432 if (i == acs->queue_index)
1433 continue;
1434
1435 struct pipe_fence_handle *fence = aws->queues[i].fences[aws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
1436 if (!fence) {
1437 if (i <= 1)
1438 printf(" queue %u doesn't have any fence at seq_no %u\n", i, aws->queues[i].latest_seq_no);
1439 continue;
1440 }
1441
1442 bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
1443 uint_seq_no old = seq_no_dependencies.seq_no[i];
1444 add_seq_no_to_list(aws, &seq_no_dependencies, i, aws->queues[i].latest_seq_no);
1445 uint_seq_no new = seq_no_dependencies.seq_no[i];
1446
1447 if (!valid)
1448 printf(" missing dependency on queue=%u, seq_no=%u\n", i, new);
1449 else if (old != new)
1450 printf(" too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
1451 else
1452 printf(" has dependency on queue=%u, seq_no=%u\n", i, old);
1453 }
1454 #endif
1455
1456 /* Convert the sequence numbers we gathered to fence dependencies. */
1457 u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
1458 struct pipe_fence_handle **fence = get_fence_from_ring(aws, &seq_no_dependencies, i);
1459
1460 if (fence) {
1461 /* If it's idle, don't add it to the list of dependencies. */
1462 if (amdgpu_fence_wait(*fence, 0, false))
1463 amdgpu_fence_reference(fence, NULL);
1464 else
1465 add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence);
1466 }
1467 }
1468
1469 if (!QUEUE_USES_ALT_FENCE) {
1470 /* Finally, add the IB fence into the fence ring of the queue. */
1471 amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
1472 queue->latest_seq_no = next_seq_no;
1473 ((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;
1474
1475 /* Update the last used context in the queue. */
1476 amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
1477 }
1478 simple_mtx_unlock(&aws->bo_fence_lock);
1479
1480 #if MESA_DEBUG
1481 /* Prepare the buffer list. */
1482 if (aws->debug_all_bos) {
1483 /* The buffer list contains all buffers. This is a slow path that
1484 * ensures that no buffer is missing in the BO list.
1485 */
1486 simple_mtx_lock(&aws->global_bo_list_lock);
1487 bo_list = (struct drm_amdgpu_bo_list_entry *)
1488 alloca(aws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1489 num_real_buffers = 0;
1490
1491 list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
1492 bo_list[num_real_buffers].bo_handle = bo->kms_handle;
1493 bo_list[num_real_buffers].bo_priority = 0;
1494 ++num_real_buffers;
1495 }
1496 simple_mtx_unlock(&aws->global_bo_list_lock);
1497 }
1498 #endif
1499
1500 if (acs->ip_type == AMD_IP_GFX)
1501 aws->gfx_bo_list_counter += num_real_buffers;
1502
1503 struct drm_amdgpu_cs_chunk chunks[8];
1504 unsigned num_chunks = 0;
1505
1506 /* BO list */
1507 struct drm_amdgpu_bo_list_in bo_list_in;
1508 bo_list_in.operation = ~0;
1509 bo_list_in.list_handle = ~0;
1510 bo_list_in.bo_number = num_real_buffers;
1511 bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1512 bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list;
1513
1514 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1515 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1516 chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1517 num_chunks++;
1518
1519 /* Syncobj dependencies. */
1520 unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1521 if (num_syncobj_dependencies) {
1522 struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1523 (struct drm_amdgpu_cs_chunk_sem *)
1524 alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
1525
1526 for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1527 struct amdgpu_fence *fence =
1528 (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1529
1530 assert(util_queue_fence_is_signalled(&fence->submitted));
1531 sem_chunk[i].handle = fence->syncobj;
1532 }
1533
1534 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
1535 chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
1536 chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1537 num_chunks++;
1538 }
1539
1540 /* Syncobj signals: every user-requested syncobj plus, as the last entry, this CS's own fence syncobj. */
1541 unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num;
1542 struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1543 (struct drm_amdgpu_cs_chunk_sem *)
1544 alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
1545
1546 for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) {
1547 struct amdgpu_fence *fence =
1548 (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1549
1550 sem_chunk[i].handle = fence->syncobj;
1551 }
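   /* The extra slot at the end signals this submission's own fence syncobj. */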
1552 sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj;
1553
1554 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
1555 chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal;
1556 chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1557 num_chunks++;
1558
1559 if (aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) {
1560 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW;
1561 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4;
1562 chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk;
1563 num_chunks++;
1564 }
1565
1566 /* Fence */
1567 if (has_user_fence) {
1568 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1569 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1570 chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
1571 num_chunks++;
1572 }
1573
1574 /* IB */
1575 if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) {
1576 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1577 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1578 chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE];
1579 num_chunks++;
1580 }
1581
1582 /* IB */
1583 cs->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
1584 chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1585 chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1586 chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN];
1587 num_chunks++;
1588
1589 if (cs->secure) {
1590 cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
1591 cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
1592 } else {
1593 cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1594 cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1595 }
1596
1597 bool noop = acs->noop;
1598
1599 if (noop && acs->ip_type == AMD_IP_GFX) {
1600 /* Reduce the IB size and fill it with NOP to make it like an empty IB. */
1601 unsigned noop_dw_size = aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
1602 assert(cs->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);
1603
1604 cs->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
1605 cs->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
1606 noop = false;
1607 }
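   /* 'noop' is cleared so the NOP-filled IB is still submitted below; only
    * non-GFX noop flushes skip the submit ioctl entirely. */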
1608
1609 assert(num_chunks <= ARRAY_SIZE(chunks));
1610
1611 if (out_of_memory) {
1612 r = -ENOMEM;
1613 } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
1614 r = -ECANCELED;
1615 } else if (unlikely(noop)) {
1616 r = 0;
1617 } else {
1618       /* Submit the command buffer.
1619        *
1620        * The kernel often returns -ENOMEM when many processes submit in parallel and
1621        * use GDS (e.g. test suites), but the submission eventually succeeds after
1622        * enough attempts. This happens frequently with dEQP using NGG streamout.
1623        */
1624 r = 0;
1625
1626 do {
1627 /* Wait 1 ms and try again. */
1628 if (r == -ENOMEM)
1629 os_time_sleep(1000);
1630
1631 r = amdgpu_cs_submit_raw2(aws->dev, acs->ctx->ctx, 0, num_chunks, chunks, &seq_no);
1632 } while (r == -ENOMEM);
1633
1634 if (!r) {
1635 /* Success. */
1636 uint64_t *user_fence = NULL;
1637
1638          /* Need to reserve 4 QWORDs for the user fence:
1639 * QWORD[0]: completed fence
1640 * QWORD[1]: preempted fence
1641 * QWORD[2]: reset fence
1642 * QWORD[3]: preempted then reset
1643 */
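         /* Each IP type owns its own 4-QWORD slot inside the per-context user
          * fence buffer, hence the ip_type * 4 offset below. */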
1644 if (has_user_fence)
1645 user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
1646 amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
1647 }
1648 }
1649
1650 if (unlikely(r)) {
1651 if (r == -ECANCELED) {
1652 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
1653 "amdgpu: The CS has cancelled because the context is lost. This context is innocent.\n");
1654 } else if (r == -ENODATA) {
1655 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1656 "amdgpu: The CS has cancelled because the context is lost. This context is guilty of a soft recovery.\n");
1657 } else if (r == -ETIME) {
1658 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1659 "amdgpu: The CS has cancelled because the context is lost. This context is guilty of a hard recovery.\n");
1660 } else {
1661 amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
1662 PIPE_UNKNOWN_CONTEXT_RESET,
1663 "amdgpu: The CS has been rejected, "
1664 "see dmesg for more information (%i).\n",
1665 r);
1666 }
1667 }
1668
1669    /* If there was an error or the submission was skipped (noop), signal the
1670     * fence, because it won't be signalled by the hardware. */
1671 if (r || noop)
1672 amdgpu_fence_signalled(cs->fence);
1673
1674 if (unlikely(aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
1675 acs->mcbp_fw_shadow_chunk.flags = 0;
1676
1677 cs->error_code = r;
1678
1679 /* Clear the buffer lists. */
1680 for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) {
1681 struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers;
1682 unsigned num_buffers = cs->buffer_lists[list].num_buffers;
1683
1684 if (list == AMDGPU_BO_REAL) {
1685 /* Only decrement num_active_ioctls and unref where we incremented them.
1686 * We did both for regular real BOs. We only incremented the refcount for sparse
1687 * backing BOs.
1688 */
1689 /* Regular real BOs. */
1690 for (unsigned i = 0; i < initial_num_real_buffers; i++) {
1691 p_atomic_dec(&buffers[i].bo->num_active_ioctls);
1692 amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
1693 }
1694
1695 /* Do nothing for slab BOs. */
1696
1697 /* Sparse backing BOs. */
1698 for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
1699 amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
1700 } else {
1701 for (unsigned i = 0; i < num_buffers; i++) {
1702 p_atomic_dec(&buffers[i].bo->num_active_ioctls);
1703 amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
1704 }
1705 }
1706
1707 cs->buffer_lists[list].num_buffers = 0;
1708 }
1709
1710 amdgpu_cs_context_cleanup(aws, cs);
1711 }
1712
1713 /* Make sure the previous submission is completed. */
1714 void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
1715 {
1716 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1717
1718 /* Wait for any pending ioctl of this CS to complete. */
1719 util_queue_fence_wait(&cs->flush_completed);
1720 }
1721
1722 static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
1723 unsigned flags,
1724 struct pipe_fence_handle **fence)
1725 {
1726 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1727 struct amdgpu_winsys *aws = cs->aws;
1728 int error_code = 0;
1729 uint32_t ib_pad_dw_mask = aws->info.ip[cs->ip_type].ib_pad_dw_mask;
1730
1731 rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);
1732
1733 /* Pad the IB according to the mask. */
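   /* For the mask-based cases below: e.g. with ib_pad_dw_mask = 0x7, a 13-dword
    * IB is padded with NOPs up to 16 dwords (the next multiple of 8). */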
1734 switch (cs->ip_type) {
1735 case AMD_IP_SDMA:
1736 if (aws->info.gfx_level <= GFX6) {
1737 while (rcs->current.cdw & ib_pad_dw_mask)
1738 radeon_emit(rcs, 0xf0000000); /* NOP packet */
1739 } else {
1740 while (rcs->current.cdw & ib_pad_dw_mask)
1741 radeon_emit(rcs, SDMA_NOP_PAD);
1742 }
1743 break;
1744 case AMD_IP_GFX:
1745 case AMD_IP_COMPUTE:
1746 amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
1747 if (cs->ip_type == AMD_IP_GFX)
1748 aws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
1749 break;
1750 case AMD_IP_UVD:
1751 case AMD_IP_UVD_ENC:
1752 while (rcs->current.cdw & ib_pad_dw_mask)
1753 radeon_emit(rcs, 0x80000000); /* type2 nop packet */
1754 break;
1755 case AMD_IP_VCN_JPEG:
1756       /* JPEG IBs must already contain an even number of dwords before padding. */
1757       assert(rcs->current.cdw % 2 == 0);
1758 while (rcs->current.cdw & ib_pad_dw_mask) {
1759 radeon_emit(rcs, 0x60000000); /* nop packet */
1760 radeon_emit(rcs, 0x00000000);
1761 }
1762 break;
1763 case AMD_IP_VCN_DEC:
1764 while (rcs->current.cdw & ib_pad_dw_mask)
1765 radeon_emit(rcs, 0x81ff); /* nop packet */
1766 break;
1767 default:
1768 break;
1769 }
1770
1771 if (rcs->current.cdw > rcs->current.max_dw) {
1772 fprintf(stderr, "amdgpu: command stream overflowed\n");
1773 }
1774
1775    /* Submit only if the CS is non-empty, hasn't overflowed, and this isn't a NOOP flush. */
1776 if (likely(radeon_emitted(rcs, 0) &&
1777 rcs->current.cdw <= rcs->current.max_dw &&
1778 !(flags & RADEON_FLUSH_NOOP))) {
1779 struct amdgpu_cs_context *cur = cs->csc;
1780
1781 /* Set IB sizes. */
1782 amdgpu_ib_finalize(aws, rcs, &cs->main_ib, cs->ip_type);
1783
1784 /* Create a fence. */
1785 amdgpu_fence_reference(&cur->fence, NULL);
1786 if (cs->next_fence) {
1787 /* just move the reference */
1788 cur->fence = cs->next_fence;
1789 cs->next_fence = NULL;
1790 } else {
1791 cur->fence = amdgpu_fence_create(cs);
1792 }
1793 if (fence)
1794 amdgpu_fence_reference(fence, cur->fence);
1795
1796 for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) {
1797 unsigned num_buffers = cur->buffer_lists[i].num_buffers;
1798 struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers;
1799
1800 for (unsigned j = 0; j < num_buffers; j++)
1801 p_atomic_inc(&buffers[j].bo->num_active_ioctls);
1802 }
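      /* Mark every buffer as having a submission in flight; the submit job drops
       * these num_active_ioctls counts again when it clears its buffer lists. */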
1803
1804 amdgpu_cs_sync_flush(rcs);
1805
1806 /* Swap command streams. "cst" is going to be submitted. */
1807 rcs->csc = cs->csc = cs->cst;
1808 cs->cst = cur;
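      /* From here on, "csc" is what the driver thread keeps recording into,
       * while "cst" belongs to the queued submit job. */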
1809
1810 /* Submit. */
1811 util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
1812 cs->uses_alt_fence ? amdgpu_cs_submit_ib<true>
1813 : amdgpu_cs_submit_ib<false>, NULL, 0);
1814
1815 if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
1816 cs->csc->secure = !cs->cst->secure;
1817 else
1818 cs->csc->secure = cs->cst->secure;
1819
1820 if (!(flags & PIPE_FLUSH_ASYNC)) {
1821 amdgpu_cs_sync_flush(rcs);
1822 error_code = cur->error_code;
1823 }
1824 } else {
1825 if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
1826 cs->csc->secure = !cs->csc->secure;
1827
1828 amdgpu_cs_context_cleanup_buffers(aws, cs->csc);
1829 amdgpu_cs_context_cleanup(aws, cs->csc);
1830 }
1831
1832 memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
1833
1834 amdgpu_get_new_ib(aws, rcs, &cs->main_ib, cs);
1835
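   /* Keep the preamble IB (if any) resident by adding it to the new CS's buffer
    * list right away. */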
1836 if (cs->preamble_ib_bo) {
1837 amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
1838 RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1839 }
1840
1841 if (cs->ip_type == AMD_IP_GFX)
1842 aws->num_gfx_IBs++;
1843 else if (cs->ip_type == AMD_IP_SDMA)
1844 aws->num_sdma_IBs++;
1845
1846 return error_code;
1847 }
1848
1849 static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
1850 {
1851 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1852
1853 if (!cs)
1854 return;
1855
1856 amdgpu_cs_sync_flush(rcs);
1857 util_queue_fence_destroy(&cs->flush_completed);
1858 p_atomic_dec(&cs->aws->num_cs);
1859 radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->preamble_ib_bo, NULL);
1860 radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->main_ib.big_buffer, NULL);
1861 FREE(rcs->prev);
1862 amdgpu_destroy_cs_context(cs->aws, &cs->csc1);
1863 amdgpu_destroy_cs_context(cs->aws, &cs->csc2);
1864 amdgpu_fence_reference(&cs->next_fence, NULL);
1865 FREE(cs);
1866 }
1867
1868 static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
1869 struct pb_buffer_lean *_buf,
1870 unsigned usage)
1871 {
1872 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1873 struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
1874
1875 return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
1876 }
1877
1878 static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs, uint64_t regs_va,
1879 uint64_t csa_va)
1880 {
1881 struct amdgpu_cs *cs = amdgpu_cs(rcs);
1882 cs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
1883 cs->mcbp_fw_shadow_chunk.csa_va = csa_va;
1884 cs->mcbp_fw_shadow_chunk.gds_va = 0;
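   /* INIT_SHADOW asks the firmware to initialize the shadow area on the next
    * submission; the flag is cleared once a submission succeeds (see
    * amdgpu_cs_submit_ib). */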
1885 cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
1886 }
1887
1888 static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
1889 struct pipe_fence_handle **dst,
1890 struct pipe_fence_handle *src)
1891 {
1892 amdgpu_fence_reference(dst, src);
1893 }
1894
1895 void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *sws)
1896 {
1897 sws->base.ctx_create = amdgpu_ctx_create;
1898 sws->base.ctx_destroy = amdgpu_ctx_destroy;
1899 sws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
1900 sws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
1901 sws->base.cs_create = amdgpu_cs_create;
1902 sws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
1903 sws->base.cs_destroy = amdgpu_cs_destroy;
1904 sws->base.cs_add_buffer = amdgpu_cs_add_buffer;
1905 sws->base.cs_validate = amdgpu_cs_validate;
1906 sws->base.cs_check_space = amdgpu_cs_check_space;
1907 sws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
1908 sws->base.cs_flush = amdgpu_cs_flush;
1909 sws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
1910 sws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
1911 sws->base.cs_sync_flush = amdgpu_cs_sync_flush;
1912 sws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
1913 sws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
1914 sws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
1915 sws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
1916 sws->base.fence_reference = amdgpu_winsys_fence_reference;
1917 sws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
1918 sws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
1919 sws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
1920 sws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
1921
1922 if (sws->aws->info.has_fw_based_shadowing)
1923 sws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
1924 }
1925