/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <[email protected]>
 * Copyright © 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#include "amdgpu_cs.h"
#include "util/detect_os.h"
#include "util/os_time.h"
#include <inttypes.h>
#include <stdio.h>

#include "amd/common/sid.h"

/* Some BSDs don't define ENODATA (and ENODATA is replaced with different error
 * codes in the kernel).
 */
#if DETECT_OS_OPENBSD
#define ENODATA ENOTSUP
#elif DETECT_OS_FREEBSD || DETECT_OS_DRAGONFLY
#define ENODATA ECONNREFUSED
#endif

/* FENCES */

void amdgpu_fence_destroy(struct amdgpu_fence *fence)
{
   amdgpu_cs_destroy_syncobj(fence->aws->dev, fence->syncobj);

   if (fence->ctx)
      amdgpu_ctx_reference(&fence->ctx, NULL);

   util_queue_fence_destroy(&fence->submitted);
   FREE(fence);
}

static struct pipe_fence_handle *
amdgpu_fence_create(struct amdgpu_cs *cs)
{
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
   struct amdgpu_ctx *ctx = cs->ctx;

   fence->reference.count = 1;
   fence->aws = ctx->aws;
   amdgpu_ctx_reference(&fence->ctx, ctx);
   fence->ctx = ctx;
   fence->ip_type = cs->ip_type;
   if (amdgpu_cs_create_syncobj2(ctx->aws->dev, 0, &fence->syncobj)) {
      free(fence);
      return NULL;
   }

   util_queue_fence_init(&fence->submitted);
   util_queue_fence_reset(&fence->submitted);
   fence->queue_index = cs->queue_index;
   return (struct pipe_fence_handle *)fence;
}

static struct pipe_fence_handle *
amdgpu_fence_import_syncobj(struct radeon_winsys *rws, int fd)
{
   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);
   int r;

   if (!fence)
      return NULL;

   pipe_reference_init(&fence->reference, 1);
   fence->aws = aws;
   fence->ip_type = 0xffffffff;

   r = amdgpu_cs_import_syncobj(aws->dev, fd, &fence->syncobj);
   if (r) {
      FREE(fence);
      return NULL;
   }

   util_queue_fence_init(&fence->submitted);
   fence->imported = true;

   return (struct pipe_fence_handle*)fence;
}

static struct pipe_fence_handle *
amdgpu_fence_import_sync_file(struct radeon_winsys *rws, int fd)
{
   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
   struct amdgpu_fence *fence = CALLOC_STRUCT(amdgpu_fence);

   if (!fence)
      return NULL;

   pipe_reference_init(&fence->reference, 1);
   fence->aws = aws;
   /* fence->ctx == NULL means that the fence is syncobj-based. */

   /* Convert sync_file into syncobj. */
   int r = amdgpu_cs_create_syncobj(aws->dev, &fence->syncobj);
   if (r) {
      FREE(fence);
      return NULL;
   }

   r = amdgpu_cs_syncobj_import_sync_file(aws->dev, fence->syncobj, fd);
   if (r) {
      amdgpu_cs_destroy_syncobj(aws->dev, fence->syncobj);
      FREE(fence);
      return NULL;
   }

   util_queue_fence_init(&fence->submitted);
   fence->imported = true;

   return (struct pipe_fence_handle*)fence;
}

static int amdgpu_fence_export_sync_file(struct radeon_winsys *rws,
                                         struct pipe_fence_handle *pfence)
{
   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
   struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;
   int fd, r;

   util_queue_fence_wait(&fence->submitted);

   /* Convert syncobj into sync_file. */
   r = amdgpu_cs_syncobj_export_sync_file(aws->dev, fence->syncobj, &fd);
   return r ? -1 : fd;
}

static int amdgpu_export_signalled_sync_file(struct radeon_winsys *rws)
{
   struct amdgpu_winsys *aws = amdgpu_winsys(rws);
   uint32_t syncobj;
   int fd = -1;

   int r = amdgpu_cs_create_syncobj2(aws->dev, DRM_SYNCOBJ_CREATE_SIGNALED,
                                     &syncobj);
   if (r) {
      return -1;
   }

   r = amdgpu_cs_syncobj_export_sync_file(aws->dev, syncobj, &fd);
   if (r) {
      fd = -1;
   }

   amdgpu_cs_destroy_syncobj(aws->dev, syncobj);
   return fd;
}

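/* Record the sequence number and user fence address once the IB carrying this
 * fence has been submitted, and unblock threads waiting on fence->submitted.
 */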
static void amdgpu_fence_submitted(struct pipe_fence_handle *fence,
                                   uint64_t seq_no,
                                   uint64_t *user_fence_cpu_address)
{
   struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;

   afence->seq_no = seq_no;
   afence->user_fence_cpu_address = user_fence_cpu_address;
   util_queue_fence_signal(&afence->submitted);
}

static void amdgpu_fence_signalled(struct pipe_fence_handle *fence)
{
   struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;

   afence->signalled = true;
   util_queue_fence_signal(&afence->submitted);
}

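/* Wait for the fence to signal.
 *
 * This first waits for the IB to be submitted (the fence has no sequence
 * number before that), then checks the user fence value in memory when one
 * is available, and only falls back to the syncobj wait ioctl when those
 * cheaper checks are inconclusive.
 */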
bool amdgpu_fence_wait(struct pipe_fence_handle *fence, uint64_t timeout,
                       bool absolute)
{
   struct amdgpu_fence *afence = (struct amdgpu_fence*)fence;
   int64_t abs_timeout;
   uint64_t *user_fence_cpu;

   if (afence->signalled)
      return true;

   if (absolute)
      abs_timeout = timeout;
   else
      abs_timeout = os_time_get_absolute_timeout(timeout);

   /* The fence might not have a number assigned if its IB is being
    * submitted in the other thread right now. Wait until the submission
    * is done. */
   if (!util_queue_fence_wait_timeout(&afence->submitted, abs_timeout))
      return false;

   user_fence_cpu = afence->user_fence_cpu_address;
   if (user_fence_cpu) {
      if (*user_fence_cpu >= afence->seq_no) {
         afence->signalled = true;
         return true;
      }

      /* No timeout, just query: no need for the ioctl. */
      if (!absolute && !timeout)
         return false;
   }

   if ((uint64_t)abs_timeout == OS_TIMEOUT_INFINITE)
      abs_timeout = INT64_MAX;

   if (amdgpu_cs_syncobj_wait(afence->aws->dev, &afence->syncobj, 1,
                              abs_timeout, 0, NULL))
      return false;

   afence->signalled = true;
   return true;
}

static bool amdgpu_fence_wait_rel_timeout(struct radeon_winsys *rws,
                                          struct pipe_fence_handle *fence,
                                          uint64_t timeout)
{
   return amdgpu_fence_wait(fence, timeout, false);
}

static struct pipe_fence_handle *
amdgpu_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->noop)
      return NULL;

   if (cs->next_fence) {
      amdgpu_fence_reference(&fence, cs->next_fence);
      return fence;
   }

   fence = amdgpu_fence_create(cs);
   if (!fence)
      return NULL;

   amdgpu_fence_reference(&cs->next_fence, fence);
   return fence;
}

/* CONTEXTS */

static uint32_t
radeon_to_amdgpu_priority(enum radeon_ctx_priority radeon_priority)
{
   switch (radeon_priority) {
   case RADEON_CTX_PRIORITY_REALTIME:
      return AMDGPU_CTX_PRIORITY_VERY_HIGH;
   case RADEON_CTX_PRIORITY_HIGH:
      return AMDGPU_CTX_PRIORITY_HIGH;
   case RADEON_CTX_PRIORITY_MEDIUM:
      return AMDGPU_CTX_PRIORITY_NORMAL;
   case RADEON_CTX_PRIORITY_LOW:
      return AMDGPU_CTX_PRIORITY_LOW;
   default:
      unreachable("Invalid context priority");
   }
}

static struct radeon_winsys_ctx *amdgpu_ctx_create(struct radeon_winsys *rws,
                                                   enum radeon_ctx_priority priority,
                                                   bool allow_context_lost)
{
   struct amdgpu_ctx *ctx = CALLOC_STRUCT(amdgpu_ctx);
   int r;
   struct amdgpu_bo_alloc_request alloc_buffer = {};
   uint32_t amdgpu_priority = radeon_to_amdgpu_priority(priority);
   amdgpu_bo_handle buf_handle;

   if (!ctx)
      return NULL;

   ctx->aws = amdgpu_winsys(rws);
   ctx->reference.count = 1;
   ctx->allow_context_lost = allow_context_lost;

   r = amdgpu_cs_ctx_create2(ctx->aws->dev, amdgpu_priority, &ctx->ctx);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_cs_ctx_create2 failed. (%i)\n", r);
      goto error_create;
   }

   alloc_buffer.alloc_size = ctx->aws->info.gart_page_size;
   alloc_buffer.phys_alignment = ctx->aws->info.gart_page_size;
   alloc_buffer.preferred_heap = AMDGPU_GEM_DOMAIN_GTT;

   r = amdgpu_bo_alloc(ctx->aws->dev, &alloc_buffer, &buf_handle);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_bo_alloc failed. (%i)\n", r);
      goto error_user_fence_alloc;
   }

   r = amdgpu_bo_cpu_map(buf_handle, (void**)&ctx->user_fence_cpu_address_base);
   if (r) {
      fprintf(stderr, "amdgpu: amdgpu_bo_cpu_map failed. (%i)\n", r);
      goto error_user_fence_map;
   }

   memset(ctx->user_fence_cpu_address_base, 0, alloc_buffer.alloc_size);
   ctx->user_fence_bo = buf_handle;

   return (struct radeon_winsys_ctx*)ctx;

error_user_fence_map:
   amdgpu_bo_free(buf_handle);
error_user_fence_alloc:
   amdgpu_cs_ctx_free(ctx->ctx);
error_create:
   FREE(ctx);
   return NULL;
}

static void amdgpu_ctx_destroy(struct radeon_winsys_ctx *rwctx)
{
   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;

   amdgpu_ctx_reference(&ctx, NULL);
}

static void amdgpu_pad_gfx_compute_ib(struct amdgpu_winsys *aws, enum amd_ip_type ip_type,
                                      uint32_t *ib, uint32_t *num_dw, unsigned leave_dw_space)
{
   unsigned pad_dw_mask = aws->info.ip[ip_type].ib_pad_dw_mask;
   unsigned unaligned_dw = (*num_dw + leave_dw_space) & pad_dw_mask;

   if (unaligned_dw) {
      int remaining = pad_dw_mask + 1 - unaligned_dw;

      /* Only pad by 1 dword with the type-2 NOP if necessary. */
      if (remaining == 1 && aws->info.gfx_ib_pad_with_type2) {
         ib[(*num_dw)++] = PKT2_NOP_PAD;
      } else {
         /* Pad with a single NOP packet to minimize CP overhead because NOP is a variable-sized
          * packet. The size of the packet body after the header is always count + 1.
          * If count == -1, there is no packet body. NOP is the only packet that can have
          * count == -1, which is the definition of PKT3_NOP_PAD (count == 0x3fff means -1).
          */
         ib[(*num_dw)++] = PKT3(PKT3_NOP, remaining - 2, 0);
         *num_dw += remaining - 1;
      }
   }
   assert(((*num_dw + leave_dw_space) & pad_dw_mask) == 0);
}

static int amdgpu_submit_gfx_nop(struct amdgpu_ctx *ctx)
{
   struct amdgpu_bo_alloc_request request = {0};
   struct drm_amdgpu_bo_list_in bo_list_in;
   struct drm_amdgpu_cs_chunk_ib ib_in = {0};
   amdgpu_bo_handle buf_handle;
   amdgpu_va_handle va_handle = NULL;
   struct drm_amdgpu_cs_chunk chunks[2];
   struct drm_amdgpu_bo_list_entry list;
   unsigned noop_dw_size;
   void *cpu = NULL;
   uint64_t seq_no;
   uint64_t va;
   int r;

   /* Older amdgpu doesn't report if the reset is complete or not. Detect
    * it by submitting a no-op job. If it reports an error, then assume
    * that the reset is not complete.
    */
   amdgpu_context_handle temp_ctx;
   r = amdgpu_cs_ctx_create2(ctx->aws->dev, AMDGPU_CTX_PRIORITY_NORMAL, &temp_ctx);
   if (r)
      return r;

   request.preferred_heap = AMDGPU_GEM_DOMAIN_VRAM;
   request.alloc_size = 4096;
   request.phys_alignment = 4096;
   r = amdgpu_bo_alloc(ctx->aws->dev, &request, &buf_handle);
   if (r)
      goto destroy_ctx;

   r = amdgpu_va_range_alloc(ctx->aws->dev, amdgpu_gpu_va_range_general,
                 request.alloc_size, request.phys_alignment,
                 0, &va, &va_handle,
                 AMDGPU_VA_RANGE_32_BIT | AMDGPU_VA_RANGE_HIGH);
   if (r)
      goto destroy_bo;
   r = amdgpu_bo_va_op_raw(ctx->aws->dev, buf_handle, 0, request.alloc_size, va,
                           AMDGPU_VM_PAGE_READABLE | AMDGPU_VM_PAGE_WRITEABLE | AMDGPU_VM_PAGE_EXECUTABLE,
                           AMDGPU_VA_OP_MAP);
   if (r)
      goto destroy_bo;

   r = amdgpu_bo_cpu_map(buf_handle, &cpu);
   if (r)
      goto destroy_bo;

   noop_dw_size = ctx->aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
   ((uint32_t*)cpu)[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);

   amdgpu_bo_cpu_unmap(buf_handle);

   amdgpu_bo_export(buf_handle, amdgpu_bo_handle_type_kms, &list.bo_handle);
   list.bo_priority = 0;

   bo_list_in.list_handle = ~0;
   bo_list_in.bo_number = 1;
   bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
   bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)&list;

   ib_in.ip_type = AMD_IP_GFX;
   ib_in.ib_bytes = noop_dw_size * 4;
   ib_in.va_start = va;

   chunks[0].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
   chunks[0].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
   chunks[0].chunk_data = (uintptr_t)&bo_list_in;

   chunks[1].chunk_id = AMDGPU_CHUNK_ID_IB;
   chunks[1].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
   chunks[1].chunk_data = (uintptr_t)&ib_in;

   r = amdgpu_cs_submit_raw2(ctx->aws->dev, temp_ctx, 0, 2, chunks, &seq_no);

destroy_bo:
   if (va_handle)
      amdgpu_va_range_free(va_handle);
   amdgpu_bo_free(buf_handle);
destroy_ctx:
   amdgpu_cs_ctx_free(temp_ctx);

   return r;
}

static void
amdgpu_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
                               const char *format, ...)
{
   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;

   /* Don't overwrite the last reset status. */
   if (ctx->sw_status != PIPE_NO_RESET)
      return;

   ctx->sw_status = status;

   if (!ctx->allow_context_lost) {
      va_list args;

      va_start(args, format);
      vfprintf(stderr, format, args);
      va_end(args);

      /* Non-robust contexts are allowed to terminate the process. The only alternative is
       * to skip command submission, which would look like a freeze because nothing is drawn,
       * which looks like a hang without any reset.
       */
      abort();
   }
}

static enum pipe_reset_status
amdgpu_ctx_query_reset_status(struct radeon_winsys_ctx *rwctx, bool full_reset_only,
                              bool *needs_reset, bool *reset_completed)
{
   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;

   if (needs_reset)
      *needs_reset = false;
   if (reset_completed)
      *reset_completed = false;

   /* Return a failure due to a GPU hang. */
   uint64_t flags;

   if (full_reset_only && ctx->sw_status == PIPE_NO_RESET) {
      /* If the caller is only interested in full reset (= wants to ignore soft
       * recoveries), we can use the rejected cs count as a quick first check.
       */
      return PIPE_NO_RESET;
   }

   /*
    * ctx->sw_status is updated on alloc/ioctl failures.
    *
    * We only rely on amdgpu_cs_query_reset_state2 to tell us
    * that the context reset is complete.
    */
   if (ctx->sw_status != PIPE_NO_RESET) {
      int r = amdgpu_cs_query_reset_state2(ctx->ctx, &flags);
      if (!r) {
         if (flags & AMDGPU_CTX_QUERY2_FLAGS_RESET) {
            if (reset_completed) {
               /* The ARB_robustness spec says:
                *
                *    If a reset status other than NO_ERROR is returned and subsequent
                *    calls return NO_ERROR, the context reset was encountered and
                *    completed. If a reset status is repeatedly returned, the context may
                *    be in the process of resetting.
                *
                * Starting with drm_minor >= 54 amdgpu reports if the reset is complete,
                * so don't do anything special. On older kernels, submit a no-op cs. If it
                * succeeds then assume the reset is complete.
                */
               if (!(flags & AMDGPU_CTX_QUERY2_FLAGS_RESET_IN_PROGRESS))
                  *reset_completed = true;

               if (ctx->aws->info.drm_minor < 54 && ctx->aws->info.has_graphics)
                  *reset_completed = amdgpu_submit_gfx_nop(ctx) == 0;
            }
         }
      } else {
         fprintf(stderr, "amdgpu: amdgpu_cs_query_reset_state2 failed. (%i)\n", r);
      }

      /* Return a failure due to SW issues. */
      if (needs_reset)
         *needs_reset = true;
      return ctx->sw_status;
   }

   if (needs_reset)
      *needs_reset = false;
   return PIPE_NO_RESET;
}

/* COMMAND SUBMISSION */

static bool amdgpu_cs_has_user_fence(struct amdgpu_cs *acs)
{
   return acs->ip_type == AMD_IP_GFX ||
          acs->ip_type == AMD_IP_COMPUTE ||
          acs->ip_type == AMD_IP_SDMA;
}

static inline unsigned amdgpu_cs_epilog_dws(struct amdgpu_cs *cs)
{
   if (cs->has_chaining)
      return 4; /* for chaining */

   return 0;
}

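/* Look up a buffer in the given buffer list using the per-CS hash list,
 * falling back to a linear search on a hash collision. Returns NULL if the
 * buffer isn't in the list.
 */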
static struct amdgpu_cs_buffer *
amdgpu_lookup_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
                     struct amdgpu_buffer_list *list)
{
   int num_buffers = list->num_buffers;
   struct amdgpu_cs_buffer *buffers = list->buffers;
   unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
   int i = cs->buffer_indices_hashlist[hash];

   /* not found or found */
   if (i < 0)
      return NULL;

   if (i < num_buffers && buffers[i].bo == bo)
      return &buffers[i];

   /* Hash collision, look for the BO in the list of buffers linearly. */
   for (int i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this buffer in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of buffers:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         cs->buffer_indices_hashlist[hash] = i & 0x7fff;
         return &buffers[i];
      }
   }
   return NULL;
}

struct amdgpu_cs_buffer *
amdgpu_lookup_buffer_any_type(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo)
{
   return amdgpu_lookup_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)]);
}

static struct amdgpu_cs_buffer *
amdgpu_do_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
                     struct amdgpu_buffer_list *list, bool add_ref)
{
   /* New buffer, check if the backing array is large enough. */
   if (unlikely(list->num_buffers >= list->max_buffers)) {
      unsigned new_max =
         MAX2(list->max_buffers + 16, (unsigned)(list->max_buffers * 1.3));
      struct amdgpu_cs_buffer *new_buffers;

      new_buffers = (struct amdgpu_cs_buffer *)
                    REALLOC(list->buffers, list->max_buffers * sizeof(*new_buffers),
                            new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "amdgpu_do_add_buffer: allocation failed\n");
         return NULL;
      }

      list->max_buffers = new_max;
      list->buffers = new_buffers;
   }

   unsigned idx = list->num_buffers++;
   struct amdgpu_cs_buffer *buffer = &list->buffers[idx];
   if (add_ref)
      p_atomic_inc(&bo->base.reference.count);
   buffer->bo = bo;
   buffer->usage = 0;

   unsigned hash = bo->unique_id & (BUFFER_HASHLIST_SIZE-1);
   cs->buffer_indices_hashlist[hash] = idx & 0x7fff;
   return buffer;
}

static struct amdgpu_cs_buffer *
amdgpu_lookup_or_add_buffer(struct amdgpu_cs_context *cs, struct amdgpu_winsys_bo *bo,
                            struct amdgpu_buffer_list *list, bool add_ref)
{
   struct amdgpu_cs_buffer *buffer = amdgpu_lookup_buffer(cs, bo, list);

   return buffer ? buffer : amdgpu_do_add_buffer(cs, bo, list, add_ref);
}

static unsigned amdgpu_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                     struct pb_buffer_lean *buf,
                                     unsigned usage,
                                     enum radeon_bo_domain domains)
{
   /* Don't use the "domains" parameter. Amdgpu doesn't support changing
    * the buffer placement during command submission.
    */
   struct amdgpu_cs_context *cs = (struct amdgpu_cs_context*)rcs->csc;
   struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)buf;
   struct amdgpu_cs_buffer *buffer;

   /* Fast exit for no-op calls.
    * This is very effective with suballocators and linear uploaders that
    * are outside of the winsys.
    */
   if (bo == cs->last_added_bo &&
       (usage & cs->last_added_bo_usage) == usage)
      return 0;

   buffer = amdgpu_lookup_or_add_buffer(cs, bo, &cs->buffer_lists[get_buf_list_idx(bo)], true);
   if (!buffer)
      return 0;

   buffer->usage |= usage;

   cs->last_added_bo_usage = buffer->usage;
   cs->last_added_bo = bo;
   return 0;
}

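/* Allocate and map a new buffer that backs IB data.
 *
 * The size tracks the largest IB seen so far rounded up to a power of two,
 * clamped between the largest cs_check_space request and the 2 MB limit of
 * the INDIRECT_BUFFER packet.
 */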
static bool amdgpu_ib_new_buffer(struct amdgpu_winsys *aws,
                                 struct amdgpu_ib *main_ib,
                                 struct amdgpu_cs *cs)
{
   struct pb_buffer_lean *pb;
   uint8_t *mapped;
   unsigned buffer_size;

   /* Always create a buffer that is at least as large as the maximum seen IB size,
    * aligned to a power of two.
    */
   buffer_size = util_next_power_of_two(main_ib->max_ib_bytes);

   /* Multiply by 4 to reduce internal fragmentation if chaining is not available. */
   if (!cs->has_chaining)
      buffer_size *= 4;

   const unsigned min_size = MAX2(main_ib->max_check_space_size, 32 * 1024);
   /* This is the maximum size that fits into the INDIRECT_BUFFER packet. */
   const unsigned max_size = 2 * 1024 * 1024;

   buffer_size = MIN2(buffer_size, max_size);
   buffer_size = MAX2(buffer_size, min_size); /* min_size is more important */

   /* Use cached GTT for command buffers. Writing to other heaps is very slow on the CPU.
    * The speed of writing to GTT WC is somewhere between no difference and very slow, while
    * VRAM is very slow a lot more often.
    *
    * Bypass GL2 because command buffers are read only once. Bypassing GL2 has better latency
    * and doesn't have to wait for cached GL2 requests to be processed.
    */
   enum radeon_bo_domain domain = RADEON_DOMAIN_GTT;
   unsigned flags = RADEON_FLAG_NO_INTERPROCESS_SHARING |
                    RADEON_FLAG_GL2_BYPASS;

   if (cs->ip_type == AMD_IP_GFX ||
       cs->ip_type == AMD_IP_COMPUTE ||
       cs->ip_type == AMD_IP_SDMA) {
      /* Avoids hangs with "rendercheck -t cacomposite -f a8r8g8b8" via glamor
       * on Navi 14
       */
      flags |= RADEON_FLAG_32BIT;
   }

   pb = amdgpu_bo_create(aws, buffer_size,
                         aws->info.gart_page_size,
                         domain, (radeon_bo_flag)flags);
   if (!pb)
      return false;

   mapped = (uint8_t*)amdgpu_bo_map(&aws->dummy_sws.base, pb, NULL, PIPE_MAP_WRITE);
   if (!mapped) {
      radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);
      return false;
   }

   radeon_bo_reference(&aws->dummy_sws.base, &main_ib->big_buffer, pb);
   radeon_bo_reference(&aws->dummy_sws.base, &pb, NULL);

   main_ib->gpu_address = amdgpu_bo_get_va(main_ib->big_buffer);
   main_ib->big_buffer_cpu_ptr = mapped;
   main_ib->used_ib_space = 0;

   return true;
}

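/* Point "rcs" at a fresh range of the IB buffer and reset the main IB chunk,
 * allocating a new backing buffer if the current one is used up.
 */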
static bool amdgpu_get_new_ib(struct amdgpu_winsys *aws,
                              struct radeon_cmdbuf *rcs,
                              struct amdgpu_ib *main_ib,
                              struct amdgpu_cs *cs)
{
   struct drm_amdgpu_cs_chunk_ib *chunk_ib = &cs->csc->chunk_ib[IB_MAIN];
   /* This is the minimum size of a contiguous IB. */
   unsigned ib_size = 16 * 1024;

   /* Always allocate at least the size of the biggest cs_check_space call,
    * because precisely the last call might have requested this size.
    */
   ib_size = MAX2(ib_size, main_ib->max_check_space_size);

   if (!cs->has_chaining) {
      ib_size = MAX2(ib_size, MIN2(util_next_power_of_two(main_ib->max_ib_bytes),
                                   IB_MAX_SUBMIT_BYTES));
   }

   /* Decay the IB buffer size over time, so that memory usage decreases after
    * a temporary peak.
    */
   main_ib->max_ib_bytes = main_ib->max_ib_bytes - main_ib->max_ib_bytes / 32;

   rcs->prev_dw = 0;
   rcs->num_prev = 0;
   rcs->current.cdw = 0;
   rcs->current.buf = NULL;

   /* Allocate a new buffer for IBs if the current buffer is all used. */
   if (!main_ib->big_buffer ||
       main_ib->used_ib_space + ib_size > main_ib->big_buffer->size) {
      if (!amdgpu_ib_new_buffer(aws, main_ib, cs))
         return false;
   }

   chunk_ib->va_start = main_ib->gpu_address + main_ib->used_ib_space;
   chunk_ib->ib_bytes = 0;
   /* ib_bytes is in dwords and the conversion to bytes will be done before
    * the CS ioctl. */
   main_ib->ptr_ib_size = &chunk_ib->ib_bytes;
   main_ib->is_chained_ib = false;

   amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
                        (radeon_bo_flag)(RADEON_USAGE_READ | RADEON_PRIO_IB),
                        (radeon_bo_domain)0);

   rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);

   cs->csc->ib_main_addr = rcs->current.buf;

   ib_size = main_ib->big_buffer->size - main_ib->used_ib_space;
   rcs->current.max_dw = ib_size / 4 - amdgpu_cs_epilog_dws(cs);
   return true;
}

static void amdgpu_set_ib_size(struct radeon_cmdbuf *rcs, struct amdgpu_ib *ib)
{
   if (ib->is_chained_ib) {
      *ib->ptr_ib_size = rcs->current.cdw |
                         S_3F2_CHAIN(1) | S_3F2_VALID(1) |
                         S_3F2_PRE_ENA(((struct amdgpu_cs*)ib)->preamble_ib_bo != NULL);
   } else {
      *ib->ptr_ib_size = rcs->current.cdw;
   }
}

static void amdgpu_ib_finalize(struct amdgpu_winsys *aws, struct radeon_cmdbuf *rcs,
                               struct amdgpu_ib *ib, enum amd_ip_type ip_type)
{
   amdgpu_set_ib_size(rcs, ib);
   ib->used_ib_space += rcs->current.cdw * 4;
   ib->used_ib_space = align(ib->used_ib_space, aws->info.ip[ip_type].ib_alignment);
   ib->max_ib_bytes = MAX2(ib->max_ib_bytes, (rcs->prev_dw + rcs->current.cdw) * 4);
}

static bool amdgpu_init_cs_context(struct amdgpu_winsys *aws,
                                   struct amdgpu_cs_context *cs,
                                   enum amd_ip_type ip_type)
{
   for (unsigned i = 0; i < ARRAY_SIZE(cs->chunk_ib); i++) {
      cs->chunk_ib[i].ip_type = ip_type;
      cs->chunk_ib[i].flags = 0;

      if (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE) {
         /* The kernel shouldn't invalidate L2 and vL1. The proper place for cache invalidation
          * is the beginning of IBs because completion of an IB doesn't care about the state of
          * GPU caches, only the beginning of an IB does. Draw calls from multiple IBs can be
          * executed in parallel, so draw calls from the current IB can finish after the next IB
          * starts drawing, and so the cache flush at the end of IBs is usually late and thus
          * useless.
          */
         cs->chunk_ib[i].flags |= AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE;
      }
   }

   cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAG_PREAMBLE;
   cs->last_added_bo = NULL;
   return true;
}

static void cleanup_fence_list(struct amdgpu_fence_list *fences)
{
   for (unsigned i = 0; i < fences->num; i++)
      amdgpu_fence_drop_reference(fences->list[i]);
   fences->num = 0;
}

static void amdgpu_cs_context_cleanup_buffers(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
{
   for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++) {
      struct amdgpu_cs_buffer *buffers = cs->buffer_lists[i].buffers;
      unsigned num_buffers = cs->buffer_lists[i].num_buffers;

      for (unsigned j = 0; j < num_buffers; j++)
         amdgpu_winsys_bo_drop_reference(aws, buffers[j].bo);

      cs->buffer_lists[i].num_buffers = 0;
   }
}

static void amdgpu_cs_context_cleanup(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
{
   cs->seq_no_dependencies.valid_fence_mask = 0;
   cleanup_fence_list(&cs->syncobj_dependencies);
   cleanup_fence_list(&cs->syncobj_to_signal);
   amdgpu_fence_reference(&cs->fence, NULL);
   cs->last_added_bo = NULL;
}

static void amdgpu_destroy_cs_context(struct amdgpu_winsys *aws, struct amdgpu_cs_context *cs)
{
   amdgpu_cs_context_cleanup_buffers(aws, cs);
   amdgpu_cs_context_cleanup(aws, cs);
   for (unsigned i = 0; i < ARRAY_SIZE(cs->buffer_lists); i++)
      FREE(cs->buffer_lists[i].buffers);
   FREE(cs->syncobj_dependencies.list);
   FREE(cs->syncobj_to_signal.list);
}


static enum amd_ip_type amdgpu_cs_get_ip_type(struct radeon_cmdbuf *rcs)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   return cs->ip_type;
}

static bool ip_uses_alt_fence(enum amd_ip_type ip_type)
{
   /* The alt_fence path can be tested thoroughly by enabling it for GFX here. */
   return ip_type == AMD_IP_VCN_DEC ||
          ip_type == AMD_IP_VCN_ENC ||
          ip_type == AMD_IP_VCN_JPEG;
}

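/* Create a command stream for the given context and IP. This sets up both
 * submission contexts (csc1/csc2), the user fence chunk, the queue index used
 * for sequence-number fence tracking, and the first IB.
 */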
static bool
amdgpu_cs_create(struct radeon_cmdbuf *rcs,
                 struct radeon_winsys_ctx *rwctx,
                 enum amd_ip_type ip_type,
                 void (*flush)(void *ctx, unsigned flags,
                               struct pipe_fence_handle **fence),
                 void *flush_ctx)
{
   struct amdgpu_ctx *ctx = (struct amdgpu_ctx*)rwctx;
   struct amdgpu_cs *cs;

   cs = CALLOC_STRUCT(amdgpu_cs);
   if (!cs) {
      return false;
   }

   util_queue_fence_init(&cs->flush_completed);

   cs->aws = ctx->aws;
   cs->ctx = ctx;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;
   cs->ip_type = ip_type;
   cs->noop = ctx->aws->noop_cs;
   cs->has_chaining = ctx->aws->info.gfx_level >= GFX7 &&
                      (ip_type == AMD_IP_GFX || ip_type == AMD_IP_COMPUTE);

   /* Compute the queue index by counting the IPs that have queues. */
   assert(ip_type < ARRAY_SIZE(ctx->aws->info.ip));
   assert(ctx->aws->info.ip[ip_type].num_queues);

   if (ip_uses_alt_fence(ip_type)) {
      cs->queue_index = INT_MAX;
      cs->uses_alt_fence = true;
   } else {
      cs->queue_index = 0;

      for (unsigned i = 0; i < ARRAY_SIZE(ctx->aws->info.ip); i++) {
         if (!ctx->aws->info.ip[i].num_queues || ip_uses_alt_fence((amd_ip_type)i))
            continue;

         if (i == ip_type)
            break;

         cs->queue_index++;
      }
      assert(cs->queue_index < AMDGPU_MAX_QUEUES);
   }

   struct amdgpu_cs_fence_info fence_info;
   fence_info.handle = cs->ctx->user_fence_bo;
   fence_info.offset = cs->ip_type * 4;
   amdgpu_cs_chunk_fence_info_to_data(&fence_info,
                                      (struct drm_amdgpu_cs_chunk_data*)&cs->fence_chunk);

   if (!amdgpu_init_cs_context(ctx->aws, &cs->csc1, ip_type)) {
      FREE(cs);
      return false;
   }

   if (!amdgpu_init_cs_context(ctx->aws, &cs->csc2, ip_type)) {
      amdgpu_destroy_cs_context(ctx->aws, &cs->csc1);
      FREE(cs);
      return false;
   }

   memset(cs->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));

   /* Set the first submission context as current. */
   rcs->csc = cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;

   /* Assign to both amdgpu_cs_context; only csc will use it. */
   cs->csc1.buffer_indices_hashlist = cs->buffer_indices_hashlist;
   cs->csc2.buffer_indices_hashlist = cs->buffer_indices_hashlist;

   cs->csc1.aws = ctx->aws;
   cs->csc2.aws = ctx->aws;

   rcs->priv = cs;

   if (!amdgpu_get_new_ib(ctx->aws, rcs, &cs->main_ib, cs)) {
      amdgpu_destroy_cs_context(ctx->aws, &cs->csc2);
      amdgpu_destroy_cs_context(ctx->aws, &cs->csc1);
      FREE(cs);
      rcs->priv = NULL;
      return false;
   }

   p_atomic_inc(&ctx->aws->num_cs);
   return true;
}

static bool
amdgpu_cs_setup_preemption(struct radeon_cmdbuf *rcs, const uint32_t *preamble_ib,
                           unsigned preamble_num_dw)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_winsys *aws = cs->aws;
   struct amdgpu_cs_context *csc[2] = {&cs->csc1, &cs->csc2};
   unsigned size = align(preamble_num_dw * 4, aws->info.ip[AMD_IP_GFX].ib_alignment);
   struct pb_buffer_lean *preamble_bo;
   uint32_t *map;

   /* Create the preamble IB buffer. */
   preamble_bo = amdgpu_bo_create(aws, size, aws->info.ip[AMD_IP_GFX].ib_alignment,
                                  RADEON_DOMAIN_VRAM,
                                  (radeon_bo_flag)
                                  (RADEON_FLAG_NO_INTERPROCESS_SHARING |
                                   RADEON_FLAG_GTT_WC));
   if (!preamble_bo)
      return false;

   map = (uint32_t*)amdgpu_bo_map(&aws->dummy_sws.base, preamble_bo, NULL,
                                  (pipe_map_flags)(PIPE_MAP_WRITE | RADEON_MAP_TEMPORARY));
   if (!map) {
      radeon_bo_reference(&aws->dummy_sws.base, &preamble_bo, NULL);
      return false;
   }

   /* Upload the preamble IB. */
   memcpy(map, preamble_ib, preamble_num_dw * 4);

   /* Pad the IB. */
   amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, map, &preamble_num_dw, 0);
   amdgpu_bo_unmap(&aws->dummy_sws.base, preamble_bo);

   for (unsigned i = 0; i < 2; i++) {
      csc[i]->chunk_ib[IB_PREAMBLE].va_start = amdgpu_bo_get_va(preamble_bo);
      csc[i]->chunk_ib[IB_PREAMBLE].ib_bytes = preamble_num_dw * 4;

      csc[i]->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAG_PREEMPT;
   }

   assert(!cs->preamble_ib_bo);
   cs->preamble_ib_bo = preamble_bo;

   amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
                        RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
   return true;
}

static bool amdgpu_cs_validate(struct radeon_cmdbuf *rcs)
{
   return true;
}

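/* Make sure there is enough space for "dw" more dwords in the command stream.
 *
 * Without chaining this can only report whether the current IB has room. With
 * chaining, the current chunk is closed with an INDIRECT_BUFFER packet that
 * points to a freshly allocated IB buffer, and recording continues there.
 */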
static bool amdgpu_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
{
   struct amdgpu_cs *cs = amdgpu_cs(rcs);
   struct amdgpu_ib *main_ib = &cs->main_ib;

   assert(rcs->current.cdw <= rcs->current.max_dw);

   unsigned projected_size_dw = rcs->prev_dw + rcs->current.cdw + dw;

   if (projected_size_dw * 4 > IB_MAX_SUBMIT_BYTES)
      return false;

   if (rcs->current.max_dw - rcs->current.cdw >= dw)
      return true;

   unsigned cs_epilog_dw = amdgpu_cs_epilog_dws(cs);
   unsigned need_byte_size = (dw + cs_epilog_dw) * 4;
   /* 125% of the size for IB epilog. */
   unsigned safe_byte_size = need_byte_size + need_byte_size / 4;
   main_ib->max_check_space_size = MAX2(main_ib->max_check_space_size, safe_byte_size);
   main_ib->max_ib_bytes = MAX2(main_ib->max_ib_bytes, projected_size_dw * 4);

   if (!cs->has_chaining)
      return false;

   /* Allocate a new chunk */
   if (rcs->num_prev >= rcs->max_prev) {
      unsigned new_max_prev = MAX2(1, 2 * rcs->max_prev);
      struct radeon_cmdbuf_chunk *new_prev;

      new_prev = (struct radeon_cmdbuf_chunk*)
                 REALLOC(rcs->prev, sizeof(*new_prev) * rcs->max_prev,
                         sizeof(*new_prev) * new_max_prev);
      if (!new_prev)
         return false;

      rcs->prev = new_prev;
      rcs->max_prev = new_max_prev;
   }

   if (!amdgpu_ib_new_buffer(cs->aws, main_ib, cs))
      return false;

   assert(main_ib->used_ib_space == 0);
   uint64_t va = main_ib->gpu_address;

   /* This space was originally reserved. */
   rcs->current.max_dw += cs_epilog_dw;

   /* Pad with NOPs but leave 4 dwords for INDIRECT_BUFFER. */
   amdgpu_pad_gfx_compute_ib(cs->aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 4);

   radeon_emit(rcs, PKT3(PKT3_INDIRECT_BUFFER, 2, 0));
   radeon_emit(rcs, va);
   radeon_emit(rcs, va >> 32);
   uint32_t *new_ptr_ib_size = &rcs->current.buf[rcs->current.cdw++];

   assert((rcs->current.cdw & cs->aws->info.ip[cs->ip_type].ib_pad_dw_mask) == 0);
   assert(rcs->current.cdw <= rcs->current.max_dw);

   amdgpu_set_ib_size(rcs, main_ib);
   main_ib->ptr_ib_size = new_ptr_ib_size;
   main_ib->is_chained_ib = true;

   /* Hook up the new chunk */
   rcs->prev[rcs->num_prev].buf = rcs->current.buf;
   rcs->prev[rcs->num_prev].cdw = rcs->current.cdw;
   rcs->prev[rcs->num_prev].max_dw = rcs->current.cdw; /* no modifications */
   rcs->num_prev++;

   rcs->prev_dw += rcs->current.cdw;
   rcs->current.cdw = 0;

   rcs->current.buf = (uint32_t*)(main_ib->big_buffer_cpu_ptr + main_ib->used_ib_space);
   rcs->current.max_dw = main_ib->big_buffer->size / 4 - cs_epilog_dw;

   amdgpu_cs_add_buffer(rcs, main_ib->big_buffer,
                        RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);

   return true;
}

static void amdgpu_add_slab_backing_buffers(struct amdgpu_cs_context *cs)
{
   unsigned num_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
   struct amdgpu_cs_buffer *buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;

   for (unsigned i = 0; i < num_buffers; i++) {
      struct amdgpu_cs_buffer *slab_buffer = &buffers[i];
      struct amdgpu_cs_buffer *real_buffer =
         amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(slab_buffer->bo)->b,
                                     &cs->buffer_lists[AMDGPU_BO_REAL], true);

      /* We need to set the usage because it determines the BO priority.
       *
       * Mask out the SYNCHRONIZED flag because the backing buffer of slabs shouldn't add its
       * BO fences to fence dependencies. Only the slab entries should do that.
       */
      real_buffer->usage |= slab_buffer->usage & ~RADEON_USAGE_SYNCHRONIZED;
   }
}

static unsigned amdgpu_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                          struct radeon_bo_list_item *list)
{
    struct amdgpu_cs_context *cs = amdgpu_cs(rcs)->csc;

    /* We do this in the CS thread, but since we need to return the final usage of all buffers
     * here, do it here too. There is no harm in doing it again in the CS thread.
     */
    amdgpu_add_slab_backing_buffers(cs);

    struct amdgpu_buffer_list *real_buffers = &cs->buffer_lists[AMDGPU_BO_REAL];
    unsigned num_real_buffers = real_buffers->num_buffers;

    if (list) {
        for (unsigned i = 0; i < num_real_buffers; i++) {
            list[i].bo_size = real_buffers->buffers[i].bo->base.size;
            list[i].vm_address =
               amdgpu_va_get_start_addr(get_real_bo(real_buffers->buffers[i].bo)->va_handle);
            list[i].priority_usage = real_buffers->buffers[i].usage;
        }
    }
    return num_real_buffers;
}

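/* Append a fence to a growable fence list, taking a reference. The backing
 * array grows in increments of 8 entries.
 */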
static void add_fence_to_list(struct amdgpu_fence_list *fences,
                              struct amdgpu_fence *fence)
{
   unsigned idx = fences->num++;

   if (idx >= fences->max) {
      unsigned size;
      const unsigned increment = 8;

      fences->max = idx + increment;
      size = fences->max * sizeof(fences->list[0]);
      fences->list = (struct pipe_fence_handle**)realloc(fences->list, size);
   }
   amdgpu_fence_set_reference(&fences->list[idx], (struct pipe_fence_handle*)fence);
}

static void amdgpu_cs_add_fence_dependency(struct radeon_cmdbuf *rcs,
                                           struct pipe_fence_handle *pfence)
{
   struct amdgpu_cs *acs = amdgpu_cs(rcs);
   struct amdgpu_cs_context *cs = acs->csc;
   struct amdgpu_fence *fence = (struct amdgpu_fence*)pfence;

   util_queue_fence_wait(&fence->submitted);

   if (!fence->imported) {
      /* Ignore idle fences. This will only check the user fence in memory. */
      if (!amdgpu_fence_wait((struct pipe_fence_handle *)fence, 0, false)) {
         add_seq_no_to_list(acs->aws, &cs->seq_no_dependencies, fence->queue_index,
                            fence->queue_seq_no);
      }
   }
   else
      add_fence_to_list(&cs->syncobj_dependencies, fence);
}

static void amdgpu_add_fences_to_dependencies(struct amdgpu_winsys *ws,
                                              struct amdgpu_cs_context *cs,
                                              unsigned queue_index_bit,
                                              struct amdgpu_seq_no_fences *dependencies,
                                              struct amdgpu_winsys_bo *bo, unsigned usage)
{
   if (usage & RADEON_USAGE_SYNCHRONIZED) {
      /* Add BO fences from queues other than 'queue_index' to dependencies. */
      u_foreach_bit(other_queue_idx, bo->fences.valid_fence_mask & ~queue_index_bit) {
         add_seq_no_to_list(ws, dependencies, other_queue_idx,
                            bo->fences.seq_no[other_queue_idx]);
      }

      if (bo->alt_fence)
         add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)bo->alt_fence);
   }
}

static void amdgpu_set_bo_seq_no(unsigned queue_index, struct amdgpu_winsys_bo *bo,
                                 uint_seq_no new_queue_seq_no)
{
   bo->fences.seq_no[queue_index] = new_queue_seq_no;
   bo->fences.valid_fence_mask |= BITFIELD_BIT(queue_index);
}

static void amdgpu_add_to_kernel_bo_list(struct drm_amdgpu_bo_list_entry *bo_entry,
                                         struct amdgpu_winsys_bo *bo, unsigned usage)
{
   bo_entry->bo_handle = get_real_bo(bo)->kms_handle;
   bo_entry->bo_priority = (util_last_bit(usage & RADEON_ALL_PRIORITIES) - 1) / 2;
}

static void amdgpu_cs_add_syncobj_signal(struct radeon_cmdbuf *rws,
                                         struct pipe_fence_handle *fence)
{
   struct amdgpu_cs *acs = amdgpu_cs(rws);
   struct amdgpu_cs_context *cs = acs->csc;

   add_fence_to_list(&cs->syncobj_to_signal, (struct amdgpu_fence*)fence);
}

1231 /* The template parameter determines whether the queue should skip code used by the default queue
1232  * system that's based on sequence numbers, and instead use and update amdgpu_winsys_bo::alt_fence
1233  * for all BOs.
1234  */
1235 template<bool QUEUE_USES_ALT_FENCE>
amdgpu_cs_submit_ib(void * job,void * gdata,int thread_index)1236 static void amdgpu_cs_submit_ib(void *job, void *gdata, int thread_index)
1237 {
1238    struct amdgpu_cs *acs = (struct amdgpu_cs*)job;
1239    struct amdgpu_winsys *aws = acs->aws;
1240    struct amdgpu_cs_context *cs = acs->cst;
1241    int r;
1242    uint64_t seq_no = 0;
1243    bool has_user_fence = amdgpu_cs_has_user_fence(acs);
1244 
1245    assert(QUEUE_USES_ALT_FENCE == acs->uses_alt_fence);
1246 
1247    simple_mtx_lock(&aws->bo_fence_lock);
1248    unsigned queue_index;
1249    struct amdgpu_queue *queue;
1250    uint_seq_no prev_seq_no, next_seq_no;
1251 
1252    if (!QUEUE_USES_ALT_FENCE) {
1253       queue_index = acs->queue_index;
1254       queue = &aws->queues[queue_index];
1255       prev_seq_no = queue->latest_seq_no;
1256 
1257       /* Generate a per queue sequence number. The logic is similar to the kernel side amdgpu seqno,
1258        * but the values aren't related.
1259        */
1260       next_seq_no = prev_seq_no + 1;
1261 
1262       /* Wait for the oldest fence to signal. This should always check the user fence, then wait
1263        * via the ioctl. We have to do this because we are going to release the oldest fence and
1264        * replace it with the latest fence in the ring.
1265        */
1266       struct pipe_fence_handle **oldest_fence =
1267          &queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE];
1268 
1269       if (*oldest_fence) {
1270          if (!amdgpu_fence_wait(*oldest_fence, 0, false)) {
1271             /* Take the reference because the fence can be released by other threads after we
1272              * unlock the mutex.
1273              */
1274             struct pipe_fence_handle *tmp_fence = NULL;
1275             amdgpu_fence_reference(&tmp_fence, *oldest_fence);
1276 
1277             /* Unlock the mutex before waiting. */
1278             simple_mtx_unlock(&aws->bo_fence_lock);
1279             amdgpu_fence_wait(tmp_fence, OS_TIMEOUT_INFINITE, false);
1280             amdgpu_fence_reference(&tmp_fence, NULL);
1281             simple_mtx_lock(&aws->bo_fence_lock);
1282          }
1283 
1284          /* Remove the idle fence from the ring. */
1285          amdgpu_fence_reference(oldest_fence, NULL);
1286       }
1287    }
1288 
1289    /* We'll accumulate sequence numbers in this structure. It automatically keeps only the latest
1290     * sequence number per queue and removes all older ones.
1291     */
1292    struct amdgpu_seq_no_fences seq_no_dependencies;
1293    memcpy(&seq_no_dependencies, &cs->seq_no_dependencies, sizeof(seq_no_dependencies));
1294 
1295    if (!QUEUE_USES_ALT_FENCE) {
1296       /* Add a fence dependency on the previous IB if the IP has multiple physical queues to
1297        * make it appear as if it had only 1 queue, or if the previous IB comes from a different
1298        * context. The reasons are:
1299        * - Our BO fence tracking only supports 1 queue per IP.
1300        * - IBs from different contexts must wait for each other and can't execute in a random order.
1301        */
1302       struct amdgpu_fence *prev_fence =
1303          (struct amdgpu_fence*)queue->fences[prev_seq_no % AMDGPU_FENCE_RING_SIZE];
1304 
1305       if (prev_fence && (aws->info.ip[acs->ip_type].num_queues > 1 || queue->last_ctx != acs->ctx))
1306          add_seq_no_to_list(aws, &seq_no_dependencies, queue_index, prev_seq_no);
1307    }
1308 
1309    /* Since the kernel driver doesn't synchronize execution between different
1310     * rings automatically, we have to add fence dependencies manually. This gathers sequence
1311     * numbers from BOs and sets the next sequence number in the BOs.
1312     */
1313 
1314    /* Slab entry BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1315    struct amdgpu_cs_buffer *slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].buffers;
1316    unsigned num_slab_entry_buffers = cs->buffer_lists[AMDGPU_BO_SLAB_ENTRY].num_buffers;
1317    unsigned initial_num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1318    unsigned queue_index_bit = QUEUE_USES_ALT_FENCE ? 0 : BITFIELD_BIT(queue_index);
1319 
1320    for (unsigned i = 0; i < num_slab_entry_buffers; i++) {
1321       struct amdgpu_cs_buffer *buffer = &slab_entry_buffers[i];
1322       struct amdgpu_winsys_bo *bo = buffer->bo;
1323 
1324       amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1325                                         buffer->usage);
1326       if (QUEUE_USES_ALT_FENCE)
1327          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1328       else
1329          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1330 
1331       /* We didn't add any slab entries into the real buffer list that will be submitted
1332        * to the kernel. Do it now.
1333        */
1334       struct amdgpu_cs_buffer *real_buffer =
1335          amdgpu_lookup_or_add_buffer(cs, &get_slab_entry_real_bo(buffer->bo)->b,
1336                                      &cs->buffer_lists[AMDGPU_BO_REAL], false);
1337 
1338       /* We need to set the usage because it determines the BO priority. */
1339       real_buffer->usage |= buffer->usage;
1340    }
1341 
1342    /* Sparse BOs: Add fence dependencies, update seq_no in BOs, add real buffers. */
1343    unsigned num_real_buffers_except_sparse = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1344    struct amdgpu_cs_buffer *sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].buffers;
1345    unsigned num_sparse_buffers = cs->buffer_lists[AMDGPU_BO_SPARSE].num_buffers;
1346    bool out_of_memory = false;
1347 
1348    for (unsigned i = 0; i < num_sparse_buffers; i++) {
1349       struct amdgpu_cs_buffer *buffer = &sparse_buffers[i];
1350       struct amdgpu_winsys_bo *bo = buffer->bo;
1351 
1352       amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1353                                         buffer->usage);
1354       if (QUEUE_USES_ALT_FENCE)
1355          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1356       else
1357          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1358 
1359       /* Add backing buffers of sparse buffers to the buffer list.
1360        *
1361        * This is done late, during submission, to keep the buffer list short before
1362        * submit, and to avoid managing fences for the backing buffers.
1363        */
1364       struct amdgpu_bo_sparse *sparse_bo = get_sparse_bo(buffer->bo);
1365 
1366       simple_mtx_lock(&sparse_bo->commit_lock);
1367       list_for_each_entry(struct amdgpu_sparse_backing, backing, &sparse_bo->backing, list) {
1368          /* We can directly add the buffer here, because we know that each
1369           * backing buffer occurs only once.
1370           */
1371          struct amdgpu_cs_buffer *real_buffer =
1372             amdgpu_do_add_buffer(cs, &backing->bo->b, &cs->buffer_lists[AMDGPU_BO_REAL], true);
1373          if (!real_buffer) {
1374             fprintf(stderr, "%s: failed to add sparse backing buffer\n", __func__);
1375             r = -ENOMEM;
1376             out_of_memory = true;
1377             break; /* don't dereference a NULL entry; commit_lock is unlocked after the loop */
1378          }
1379 
1380          real_buffer->usage = buffer->usage;
1381       }
1382       simple_mtx_unlock(&sparse_bo->commit_lock);
1383    }
1384 
1385    /* Real BOs: Add fence dependencies, update seq_no in BOs except sparse backing BOs. */
1386    unsigned num_real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].num_buffers;
1387    struct amdgpu_cs_buffer *real_buffers = cs->buffer_lists[AMDGPU_BO_REAL].buffers;
1388    struct drm_amdgpu_bo_list_entry *bo_list =
1389       (struct drm_amdgpu_bo_list_entry *)
1390       alloca(num_real_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1391    unsigned i;
1392 
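   /* The real buffer list ends up with three partitions:
    *   [0, initial_num_real_buffers)                              - regular real BOs
    *   [initial_num_real_buffers, num_real_buffers_except_sparse) - backing BOs of slab entries
    *   [num_real_buffers_except_sparse, num_real_buffers)         - backing BOs of sparse buffers
    */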
1393    for (i = 0; i < initial_num_real_buffers; i++) {
1394       struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1395       struct amdgpu_winsys_bo *bo = buffer->bo;
1396 
1397       amdgpu_add_fences_to_dependencies(aws, cs, queue_index_bit, &seq_no_dependencies, bo,
1398                                         buffer->usage);
1399       if (QUEUE_USES_ALT_FENCE)
1400          amdgpu_fence_reference(&bo->alt_fence, cs->fence);
1401       else
1402          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1403 
1404       amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1405    }
1406 
1407    /* These are backing buffers of slab entries. Don't add their fence dependencies. */
1408    for (; i < num_real_buffers_except_sparse; i++) {
1409       struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1410       struct amdgpu_winsys_bo *bo = buffer->bo;
1411 
1412       if (QUEUE_USES_ALT_FENCE)
1413          get_real_bo_reusable_slab(bo)->b.b.slab_has_busy_alt_fences = true;
1414       else
1415          amdgpu_set_bo_seq_no(queue_index, bo, next_seq_no);
1416 
1417       amdgpu_add_to_kernel_bo_list(&bo_list[i], bo, buffer->usage);
1418    }
1419 
1420    /* Sparse backing BOs are last. Don't update their fences because we don't use them. */
1421    for (; i < num_real_buffers; ++i) {
1422       struct amdgpu_cs_buffer *buffer = &real_buffers[i];
1423 
1424       amdgpu_add_to_kernel_bo_list(&bo_list[i], buffer->bo, buffer->usage);
1425    }
1426 
1427 #if 0 /* Debug code. */
1428    printf("submit queue=%u, seq_no=%u\n", acs->queue_index, next_seq_no);
1429 
1430    /* Wait for all previous fences. This can be used when BO fence tracking doesn't work. */
1431    for (unsigned i = 0; i < AMDGPU_MAX_QUEUES; i++) {
1432       if (i == acs->queue_index)
1433          continue;
1434 
1435       struct pipe_fence_handle *fence = aws->queues[i].fences[aws->queues[i].latest_seq_no % AMDGPU_FENCE_RING_SIZE];
1436       if (!fence) {
1437          if (i <= 1)
1438             printf("      queue %u doesn't have any fence at seq_no %u\n", i, aws->queues[i].latest_seq_no);
1439          continue;
1440       }
1441 
1442       bool valid = seq_no_dependencies.valid_fence_mask & BITFIELD_BIT(i);
1443       uint_seq_no old = seq_no_dependencies.seq_no[i];
1444       add_seq_no_to_list(aws, &seq_no_dependencies, i, aws->queues[i].latest_seq_no);
1445       uint_seq_no new = seq_no_dependencies.seq_no[i];
1446 
1447       if (!valid)
1448          printf("   missing dependency on queue=%u, seq_no=%u\n", i, new);
1449       else if (old != new)
1450          printf("   too old dependency on queue=%u, old=%u, new=%u\n", i, old, new);
1451       else
1452          printf("   has dependency on queue=%u, seq_no=%u\n", i, old);
1453    }
1454 #endif
1455 
1456    /* Convert the sequence numbers we gathered to fence dependencies. */
1457    u_foreach_bit(i, seq_no_dependencies.valid_fence_mask) {
1458       struct pipe_fence_handle **fence = get_fence_from_ring(aws, &seq_no_dependencies, i);
1459 
1460       if (fence) {
1461          /* If it's idle, don't add it to the list of dependencies. */
1462          if (amdgpu_fence_wait(*fence, 0, false))
1463             amdgpu_fence_reference(fence, NULL);
1464          else
1465             add_fence_to_list(&cs->syncobj_dependencies, (struct amdgpu_fence*)*fence);
1466       }
1467    }
1468 
1469    if (!QUEUE_USES_ALT_FENCE) {
1470       /* Finally, add the IB fence into the fence ring of the queue. */
1471       amdgpu_fence_reference(&queue->fences[next_seq_no % AMDGPU_FENCE_RING_SIZE], cs->fence);
1472       queue->latest_seq_no = next_seq_no;
1473       ((struct amdgpu_fence*)cs->fence)->queue_seq_no = next_seq_no;
1474 
1475       /* Update the last used context in the queue. */
1476       amdgpu_ctx_reference(&queue->last_ctx, acs->ctx);
1477    }
1478    simple_mtx_unlock(&aws->bo_fence_lock);
1479 
1480 #if MESA_DEBUG
1481    /* Prepare the buffer list. */
1482    if (aws->debug_all_bos) {
1483       /* The buffer list contains all buffers. This is a slow path that
1484        * ensures that no buffer is missing in the BO list.
1485        */
1486       simple_mtx_lock(&aws->global_bo_list_lock);
1487       bo_list = (struct drm_amdgpu_bo_list_entry *)
1488                 alloca(aws->num_buffers * sizeof(struct drm_amdgpu_bo_list_entry));
1489       num_real_buffers = 0;
1490 
1491       list_for_each_entry(struct amdgpu_bo_real, bo, &aws->global_bo_list, global_list_item) {
1492          bo_list[num_real_buffers].bo_handle = bo->kms_handle;
1493          bo_list[num_real_buffers].bo_priority = 0;
1494          ++num_real_buffers;
1495       }
1496       simple_mtx_unlock(&aws->global_bo_list_lock);
1497    }
1498 #endif
1499 
1500    if (acs->ip_type == AMD_IP_GFX)
1501       aws->gfx_bo_list_counter += num_real_buffers;
1502 
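   /* Build the CS ioctl chunks. At most 7 chunks are used here (BO list, syncobj
    * dependencies, syncobj signals, CP GFX shadow, user fence, preamble IB, main IB);
    * the assert below checks that the array is large enough.
    */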
1503    struct drm_amdgpu_cs_chunk chunks[8];
1504    unsigned num_chunks = 0;
1505 
1506    /* BO list */
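   /* The BO array is passed inline via the AMDGPU_CHUNK_ID_BO_HANDLES chunk instead of
    * a pre-created kernel BO list handle, so operation and list_handle are set to ~0 (unused).
    */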
1507    struct drm_amdgpu_bo_list_in bo_list_in;
1508    bo_list_in.operation = ~0;
1509    bo_list_in.list_handle = ~0;
1510    bo_list_in.bo_number = num_real_buffers;
1511    bo_list_in.bo_info_size = sizeof(struct drm_amdgpu_bo_list_entry);
1512    bo_list_in.bo_info_ptr = (uint64_t)(uintptr_t)bo_list;
1513 
1514    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_BO_HANDLES;
1515    chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_bo_list_in) / 4;
1516    chunks[num_chunks].chunk_data = (uintptr_t)&bo_list_in;
1517    num_chunks++;
1518 
1519    /* Syncobj dependencies. */
1520    unsigned num_syncobj_dependencies = cs->syncobj_dependencies.num;
1521    if (num_syncobj_dependencies) {
1522       struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1523          (struct drm_amdgpu_cs_chunk_sem *)
1524          alloca(num_syncobj_dependencies * sizeof(sem_chunk[0]));
1525 
1526       for (unsigned i = 0; i < num_syncobj_dependencies; i++) {
1527          struct amdgpu_fence *fence =
1528             (struct amdgpu_fence*)cs->syncobj_dependencies.list[i];
1529 
1530          assert(util_queue_fence_is_signalled(&fence->submitted));
1531          sem_chunk[i].handle = fence->syncobj;
1532       }
1533 
1534       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_IN;
1535       chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_dependencies;
1536       chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1537       num_chunks++;
1538    }
1539 
1540    /* Syncobj signals. */
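   /* One extra slot is reserved: the last entry is the syncobj of this CS's own fence. */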
1541    unsigned num_syncobj_to_signal = 1 + cs->syncobj_to_signal.num;
1542    struct drm_amdgpu_cs_chunk_sem *sem_chunk =
1543       (struct drm_amdgpu_cs_chunk_sem *)
1544       alloca(num_syncobj_to_signal * sizeof(sem_chunk[0]));
1545 
1546    for (unsigned i = 0; i < num_syncobj_to_signal - 1; i++) {
1547       struct amdgpu_fence *fence =
1548          (struct amdgpu_fence*)cs->syncobj_to_signal.list[i];
1549 
1550       sem_chunk[i].handle = fence->syncobj;
1551    }
1552    sem_chunk[cs->syncobj_to_signal.num].handle = ((struct amdgpu_fence*)cs->fence)->syncobj;
1553 
1554    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_SYNCOBJ_OUT;
1555    chunks[num_chunks].length_dw = sizeof(sem_chunk[0]) / 4 * num_syncobj_to_signal;
1556    chunks[num_chunks].chunk_data = (uintptr_t)sem_chunk;
1557    num_chunks++;
1558 
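   /* Firmware-based register shadowing: pass the shadow and CSA buffer addresses to the kernel. */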
1559    if (aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.shadow_va) {
1560       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_CP_GFX_SHADOW;
1561       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_cp_gfx_shadow) / 4;
1562       chunks[num_chunks].chunk_data = (uintptr_t)&acs->mcbp_fw_shadow_chunk;
1563       num_chunks++;
1564    }
1565 
1566    /* Fence */
1567    if (has_user_fence) {
1568       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_FENCE;
1569       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_fence) / 4;
1570       chunks[num_chunks].chunk_data = (uintptr_t)&acs->fence_chunk;
1571       num_chunks++;
1572    }
1573 
1574    /* Preamble IB */
1575    if (cs->chunk_ib[IB_PREAMBLE].ib_bytes) {
1576       chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1577       chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1578       chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_PREAMBLE];
1579       num_chunks++;
1580    }
1581 
1582    /* Main IB */
1583    cs->chunk_ib[IB_MAIN].ib_bytes *= 4; /* Convert from dwords to bytes. */
1584    chunks[num_chunks].chunk_id = AMDGPU_CHUNK_ID_IB;
1585    chunks[num_chunks].length_dw = sizeof(struct drm_amdgpu_cs_chunk_ib) / 4;
1586    chunks[num_chunks].chunk_data = (uintptr_t)&cs->chunk_ib[IB_MAIN];
1587    num_chunks++;
1588 
1589    if (cs->secure) {
1590       cs->chunk_ib[IB_PREAMBLE].flags |= AMDGPU_IB_FLAGS_SECURE;
1591       cs->chunk_ib[IB_MAIN].flags |= AMDGPU_IB_FLAGS_SECURE;
1592    } else {
1593       cs->chunk_ib[IB_PREAMBLE].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1594       cs->chunk_ib[IB_MAIN].flags &= ~AMDGPU_IB_FLAGS_SECURE;
1595    }
1596 
1597    bool noop = acs->noop;
1598 
1599    if (noop && acs->ip_type == AMD_IP_GFX) {
1600       /* Reduce the IB size and fill it with NOP to make it like an empty IB. */
1601       unsigned noop_dw_size = aws->info.ip[AMD_IP_GFX].ib_pad_dw_mask + 1;
1602       assert(cs->chunk_ib[IB_MAIN].ib_bytes / 4 >= noop_dw_size);
1603 
1604       cs->ib_main_addr[0] = PKT3(PKT3_NOP, noop_dw_size - 2, 0);
1605       cs->chunk_ib[IB_MAIN].ib_bytes = noop_dw_size * 4;
1606       noop = false;
1607    }
1608 
1609    assert(num_chunks <= ARRAY_SIZE(chunks));
1610 
1611    if (out_of_memory) {
1612       r = -ENOMEM;
1613    } else if (unlikely(acs->ctx->sw_status != PIPE_NO_RESET)) {
1614       r = -ECANCELED;
1615    } else if (unlikely(noop)) {
1616       r = 0;
1617    } else {
1618       /* Submit the command buffer.
1619        *
1620        * The kernel quite often returns -ENOMEM when many processes run in parallel and
1621        * use GDS (e.g. test suites), but the submission eventually succeeds after enough
1622        * attempts. This happens frequently with dEQP using NGG streamout.
1623        */
1624       r = 0;
1625 
1626       do {
1627          /* Wait 1 ms and try again. */
1628          if (r == -ENOMEM)
1629             os_time_sleep(1000);
1630 
1631          r = amdgpu_cs_submit_raw2(aws->dev, acs->ctx->ctx, 0, num_chunks, chunks, &seq_no);
1632       } while (r == -ENOMEM);
1633 
1634       if (!r) {
1635          /* Success. */
1636          uint64_t *user_fence = NULL;
1637 
1638          /* We need to reserve 4 QWORDs for the user fence:
1639           *   QWORD[0]: completed fence
1640           *   QWORD[1]: preempted fence
1641           *   QWORD[2]: reset fence
1642           *   QWORD[3]: preempted then reset
1643           */
1644          if (has_user_fence)
1645             user_fence = acs->ctx->user_fence_cpu_address_base + acs->ip_type * 4;
1646          amdgpu_fence_submitted(cs->fence, seq_no, user_fence);
1647       }
1648    }
1649 
1650    if (unlikely(r)) {
1651       if (r == -ECANCELED) {
1652          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_INNOCENT_CONTEXT_RESET,
1653                                         "amdgpu: The CS has been cancelled because the context is lost. This context is innocent.\n");
1654       } else if (r == -ENODATA) {
1655          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1656                                         "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a soft recovery.\n");
1657       } else if (r == -ETIME) {
1658          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx, PIPE_GUILTY_CONTEXT_RESET,
1659                                         "amdgpu: The CS has been cancelled because the context is lost. This context is guilty of a hard recovery.\n");
1660       } else {
1661          amdgpu_ctx_set_sw_reset_status((struct radeon_winsys_ctx*)acs->ctx,
1662                                         PIPE_UNKNOWN_CONTEXT_RESET,
1663                                         "amdgpu: The CS has been rejected, "
1664                                         "see dmesg for more information (%i).\n",
1665                                         r);
1666       }
1667    }
1668 
1669    /* If there was an error, signal the fence, because it won't be signalled
1670     * by the hardware. */
1671    if (r || noop)
1672       amdgpu_fence_signalled(cs->fence);
1673 
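   /* The shadow-init request only needs to be passed once, so clear it after a successful submit. */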
1674    if (unlikely(aws->info.has_fw_based_shadowing && acs->mcbp_fw_shadow_chunk.flags && r == 0))
1675       acs->mcbp_fw_shadow_chunk.flags = 0;
1676 
1677    cs->error_code = r;
1678 
1679    /* Clear the buffer lists. */
1680    for (unsigned list = 0; list < ARRAY_SIZE(cs->buffer_lists); list++) {
1681       struct amdgpu_cs_buffer *buffers = cs->buffer_lists[list].buffers;
1682       unsigned num_buffers = cs->buffer_lists[list].num_buffers;
1683 
1684       if (list == AMDGPU_BO_REAL) {
1685          /* Only decrement num_active_ioctls and unref where we incremented them.
1686           * We did both for regular real BOs. We only incremented the refcount for sparse
1687           * backing BOs.
1688           */
1689          /* Regular real BOs. */
1690          for (unsigned i = 0; i < initial_num_real_buffers; i++) {
1691             p_atomic_dec(&buffers[i].bo->num_active_ioctls);
1692             amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
1693          }
1694 
1695          /* Do nothing for slab BOs. */
1696 
1697          /* Sparse backing BOs. */
1698          for (unsigned i = num_real_buffers_except_sparse; i < num_buffers; i++)
1699             amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
1700       } else {
1701          for (unsigned i = 0; i < num_buffers; i++) {
1702             p_atomic_dec(&buffers[i].bo->num_active_ioctls);
1703             amdgpu_winsys_bo_drop_reference(aws, buffers[i].bo);
1704          }
1705       }
1706 
1707       cs->buffer_lists[list].num_buffers = 0;
1708    }
1709 
1710    amdgpu_cs_context_cleanup(aws, cs);
1711 }
1712 
1713 /* Make sure the previous submission is completed. */
1714 void amdgpu_cs_sync_flush(struct radeon_cmdbuf *rcs)
1715 {
1716    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1717 
1718    /* Wait for any pending ioctl of this CS to complete. */
1719    util_queue_fence_wait(&cs->flush_completed);
1720 }
1721 
1722 static int amdgpu_cs_flush(struct radeon_cmdbuf *rcs,
1723                            unsigned flags,
1724                            struct pipe_fence_handle **fence)
1725 {
1726    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1727    struct amdgpu_winsys *aws = cs->aws;
1728    int error_code = 0;
1729    uint32_t ib_pad_dw_mask = aws->info.ip[cs->ip_type].ib_pad_dw_mask;
1730 
1731    rcs->current.max_dw += amdgpu_cs_epilog_dws(cs);
1732 
1733    /* Pad the IB according to the mask. */
1734    switch (cs->ip_type) {
1735    case AMD_IP_SDMA:
1736       if (aws->info.gfx_level <= GFX6) {
1737          while (rcs->current.cdw & ib_pad_dw_mask)
1738             radeon_emit(rcs, 0xf0000000); /* NOP packet */
1739       } else {
1740          while (rcs->current.cdw & ib_pad_dw_mask)
1741             radeon_emit(rcs, SDMA_NOP_PAD);
1742       }
1743       break;
1744    case AMD_IP_GFX:
1745    case AMD_IP_COMPUTE:
1746       amdgpu_pad_gfx_compute_ib(aws, cs->ip_type, rcs->current.buf, &rcs->current.cdw, 0);
1747       if (cs->ip_type == AMD_IP_GFX)
1748          aws->gfx_ib_size_counter += (rcs->prev_dw + rcs->current.cdw) * 4;
1749       break;
1750    case AMD_IP_UVD:
1751    case AMD_IP_UVD_ENC:
1752       while (rcs->current.cdw & ib_pad_dw_mask)
1753          radeon_emit(rcs, 0x80000000); /* type2 nop packet */
1754       break;
1755    case AMD_IP_VCN_JPEG:
1756       if (rcs->current.cdw % 2)
1757          assert(0);
1758       while (rcs->current.cdw & ib_pad_dw_mask) {
1759          radeon_emit(rcs, 0x60000000); /* nop packet */
1760          radeon_emit(rcs, 0x00000000);
1761       }
1762       break;
1763    case AMD_IP_VCN_DEC:
1764       while (rcs->current.cdw & ib_pad_dw_mask)
1765          radeon_emit(rcs, 0x81ff); /* nop packet */
1766       break;
1767    default:
1768       break;
1769    }
1770 
1771    if (rcs->current.cdw > rcs->current.max_dw) {
1772       fprintf(stderr, "amdgpu: command stream overflowed\n");
1773    }
1774 
1775    /* Submit only if the CS is not empty, hasn't overflowed, and this isn't a no-op flush. */
1776    if (likely(radeon_emitted(rcs, 0) &&
1777        rcs->current.cdw <= rcs->current.max_dw &&
1778        !(flags & RADEON_FLUSH_NOOP))) {
1779       struct amdgpu_cs_context *cur = cs->csc;
1780 
1781       /* Set IB sizes. */
1782       amdgpu_ib_finalize(aws, rcs, &cs->main_ib, cs->ip_type);
1783 
1784       /* Create a fence. */
1785       amdgpu_fence_reference(&cur->fence, NULL);
1786       if (cs->next_fence) {
1787          /* just move the reference */
1788          cur->fence = cs->next_fence;
1789          cs->next_fence = NULL;
1790       } else {
1791          cur->fence = amdgpu_fence_create(cs);
1792       }
1793       if (fence)
1794          amdgpu_fence_reference(fence, cur->fence);
1795 
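      /* Mark all buffers as having a CS ioctl in flight; the submit job decrements
       * these counters once it is done with the buffer lists.
       */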
1796       for (unsigned i = 0; i < ARRAY_SIZE(cur->buffer_lists); i++) {
1797          unsigned num_buffers = cur->buffer_lists[i].num_buffers;
1798          struct amdgpu_cs_buffer *buffers = cur->buffer_lists[i].buffers;
1799 
1800          for (unsigned j = 0; j < num_buffers; j++)
1801             p_atomic_inc(&buffers[j].bo->num_active_ioctls);
1802       }
1803 
1804       amdgpu_cs_sync_flush(rcs);
1805 
1806       /* Swap command streams. "cst" is going to be submitted. */
1807       rcs->csc = cs->csc = cs->cst;
1808       cs->cst = cur;
1809 
1810       /* Submit. */
1811       util_queue_add_job(&aws->cs_queue, cs, &cs->flush_completed,
1812                          cs->uses_alt_fence ? amdgpu_cs_submit_ib<true>
1813                                             : amdgpu_cs_submit_ib<false>, NULL, 0);
1814 
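      /* Carry the secure (TMZ) submission state over to the new CS, toggling it if requested. */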
1815       if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
1816          cs->csc->secure = !cs->cst->secure;
1817       else
1818          cs->csc->secure = cs->cst->secure;
1819 
1820       if (!(flags & PIPE_FLUSH_ASYNC)) {
1821          amdgpu_cs_sync_flush(rcs);
1822          error_code = cur->error_code;
1823       }
1824    } else {
1825       if (flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)
1826          cs->csc->secure = !cs->csc->secure;
1827 
1828       amdgpu_cs_context_cleanup_buffers(aws, cs->csc);
1829       amdgpu_cs_context_cleanup(aws, cs->csc);
1830    }
1831 
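   /* Reset the buffer-index hash used for buffer lookups; -1 marks an empty entry. */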
1832    memset(cs->csc->buffer_indices_hashlist, -1, sizeof(cs->buffer_indices_hashlist));
1833 
1834    amdgpu_get_new_ib(aws, rcs, &cs->main_ib, cs);
1835 
1836    if (cs->preamble_ib_bo) {
1837       amdgpu_cs_add_buffer(rcs, cs->preamble_ib_bo,
1838                            RADEON_USAGE_READ | RADEON_PRIO_IB, (radeon_bo_domain)0);
1839    }
1840 
1841    if (cs->ip_type == AMD_IP_GFX)
1842       aws->num_gfx_IBs++;
1843    else if (cs->ip_type == AMD_IP_SDMA)
1844       aws->num_sdma_IBs++;
1845 
1846    return error_code;
1847 }
1848 
1849 static void amdgpu_cs_destroy(struct radeon_cmdbuf *rcs)
1850 {
1851    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1852 
1853    if (!cs)
1854       return;
1855 
1856    amdgpu_cs_sync_flush(rcs);
1857    util_queue_fence_destroy(&cs->flush_completed);
1858    p_atomic_dec(&cs->aws->num_cs);
1859    radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->preamble_ib_bo, NULL);
1860    radeon_bo_reference(&cs->aws->dummy_sws.base, &cs->main_ib.big_buffer, NULL);
1861    FREE(rcs->prev);
1862    amdgpu_destroy_cs_context(cs->aws, &cs->csc1);
1863    amdgpu_destroy_cs_context(cs->aws, &cs->csc2);
1864    amdgpu_fence_reference(&cs->next_fence, NULL);
1865    FREE(cs);
1866 }
1867 
1868 static bool amdgpu_bo_is_referenced(struct radeon_cmdbuf *rcs,
1869                                     struct pb_buffer_lean *_buf,
1870                                     unsigned usage)
1871 {
1872    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1873    struct amdgpu_winsys_bo *bo = (struct amdgpu_winsys_bo*)_buf;
1874 
1875    return amdgpu_bo_is_referenced_by_cs_with_usage(cs, bo, usage);
1876 }
1877 
1878 static void amdgpu_cs_set_mcbp_reg_shadowing_va(struct radeon_cmdbuf *rcs, uint64_t regs_va,
1879                                                 uint64_t csa_va)
1880 {
1881    struct amdgpu_cs *cs = amdgpu_cs(rcs);
1882    cs->mcbp_fw_shadow_chunk.shadow_va = regs_va;
1883    cs->mcbp_fw_shadow_chunk.csa_va = csa_va;
1884    cs->mcbp_fw_shadow_chunk.gds_va = 0;
1885    cs->mcbp_fw_shadow_chunk.flags = AMDGPU_CS_CHUNK_CP_GFX_SHADOW_FLAGS_INIT_SHADOW;
1886 }
1887 
1888 static void amdgpu_winsys_fence_reference(struct radeon_winsys *rws,
1889                                           struct pipe_fence_handle **dst,
1890                                           struct pipe_fence_handle *src)
1891 {
1892    amdgpu_fence_reference(dst, src);
1893 }
1894 
1895 void amdgpu_cs_init_functions(struct amdgpu_screen_winsys *sws)
1896 {
1897    sws->base.ctx_create = amdgpu_ctx_create;
1898    sws->base.ctx_destroy = amdgpu_ctx_destroy;
1899    sws->base.ctx_set_sw_reset_status = amdgpu_ctx_set_sw_reset_status;
1900    sws->base.ctx_query_reset_status = amdgpu_ctx_query_reset_status;
1901    sws->base.cs_create = amdgpu_cs_create;
1902    sws->base.cs_setup_preemption = amdgpu_cs_setup_preemption;
1903    sws->base.cs_destroy = amdgpu_cs_destroy;
1904    sws->base.cs_add_buffer = amdgpu_cs_add_buffer;
1905    sws->base.cs_validate = amdgpu_cs_validate;
1906    sws->base.cs_check_space = amdgpu_cs_check_space;
1907    sws->base.cs_get_buffer_list = amdgpu_cs_get_buffer_list;
1908    sws->base.cs_flush = amdgpu_cs_flush;
1909    sws->base.cs_get_next_fence = amdgpu_cs_get_next_fence;
1910    sws->base.cs_is_buffer_referenced = amdgpu_bo_is_referenced;
1911    sws->base.cs_sync_flush = amdgpu_cs_sync_flush;
1912    sws->base.cs_add_fence_dependency = amdgpu_cs_add_fence_dependency;
1913    sws->base.cs_add_syncobj_signal = amdgpu_cs_add_syncobj_signal;
1914    sws->base.cs_get_ip_type = amdgpu_cs_get_ip_type;
1915    sws->base.fence_wait = amdgpu_fence_wait_rel_timeout;
1916    sws->base.fence_reference = amdgpu_winsys_fence_reference;
1917    sws->base.fence_import_syncobj = amdgpu_fence_import_syncobj;
1918    sws->base.fence_import_sync_file = amdgpu_fence_import_sync_file;
1919    sws->base.fence_export_sync_file = amdgpu_fence_export_sync_file;
1920    sws->base.export_signalled_sync_file = amdgpu_export_signalled_sync_file;
1921 
1922    if (sws->aws->info.has_fw_based_shadowing)
1923       sws->base.cs_set_mcbp_reg_shadowing_va = amdgpu_cs_set_mcbp_reg_shadowing_va;
1924 }
1925