/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <[email protected]>
 *
 * SPDX-License-Identifier: MIT
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The addition is skipped if the reloc is already present in the list, but it
    still accounts for any newly referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and tries the validation again,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints a nasty message to stderr.
    (done in the pipe driver)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
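
/* A rough sketch (illustrative only, with placeholder buffer, usage and domain
 * values) of how a pipe driver is expected to drive this interface through the
 * radeon_winsys vtable that radeon_drm_cs_init_functions() fills in at the
 * bottom of this file:
 *
 *    ws->cs_add_buffer(cs, buf, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM);
 *    if (!ws->cs_validate(cs)) {
 *       (too much memory referenced: flush, then retry the same operation)
 *       ws->cs_flush(cs, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 *    }
 *    ... emit packets referencing 'buf' ...
 *    ws->cs_flush(cs, 0, &fence);
 */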

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


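/* Size of one relocation entry in dwords. struct drm_radeon_cs_reloc holds the
 * BO handle, read domains, write domain and flags that are filled in by
 * radeon_lookup_or_add_real_buffer() and radeon_drm_cs_add_buffer() below. */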
#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct radeon_winsys *ws,
                                   struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws,
                                                       enum radeon_ctx_priority priority,
                                                       bool allow_context_lost)
{
   struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
   if (!ctx)
      return NULL;

   ctx->ws = (struct radeon_drm_winsys*)ws;
   ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
   return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   FREE(ctx);
}

static void
radeon_drm_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
                                   const char *format, ...)
{
   /* TODO: we should do something better here */
   va_list args;

   va_start(args, format);
   vfprintf(stderr, format, args);
   va_end(args);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
                                  bool *needs_reset, bool *reset_completed)
{
   struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

   unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

   if (ctx->gpu_reset_counter == latest) {
      if (needs_reset)
         *needs_reset = false;
      if (reset_completed)
         *reset_completed = false;
      return PIPE_NO_RESET;
   }

   if (needs_reset)
      *needs_reset = true;
   if (reset_completed)
      *reset_completed = true;

   ctx->gpu_reset_counter = latest;
   return PIPE_UNKNOWN_CONTEXT_RESET;
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
   int i;

   csc->fd = ws->fd;

   csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
   csc->chunks[0].length_dw = 0;
   csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
   csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
   csc->chunks[1].length_dw = 0;
   csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
   csc->chunks[2].length_dw = 2;
   csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

   csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
   csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
   csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

   csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
   return true;
}

static void radeon_cs_context_cleanup(struct radeon_winsys *rws,
                                      struct radeon_cs_context *csc)
{
   unsigned i;

   for (i = 0; i < csc->num_relocs; i++) {
      p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
      radeon_ws_bo_reference(rws, &csc->relocs_bo[i].bo, NULL);
   }
   for (i = 0; i < csc->num_slab_buffers; ++i) {
      p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
      radeon_ws_bo_reference(rws, &csc->slab_buffers[i].bo, NULL);
   }

   csc->num_relocs = 0;
   csc->num_validated_relocs = 0;
   csc->num_slab_buffers = 0;
   csc->chunks[0].length_dw = 0;
   csc->chunks[1].length_dw = 0;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
}

static void radeon_destroy_cs_context(struct radeon_winsys *rws, struct radeon_cs_context *csc)
{
   radeon_cs_context_cleanup(rws, csc);
   FREE(csc->slab_buffers);
   FREE(csc->relocs_bo);
   FREE(csc->relocs);
}


static enum amd_ip_type radeon_drm_cs_get_ip_type(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   return cs->ip_type;
}


static bool
radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
                     struct radeon_winsys_ctx *ctx,
                     enum amd_ip_type ip_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
   struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
   struct radeon_drm_cs *cs;

   cs = CALLOC_STRUCT(radeon_drm_cs);
   if (!cs) {
      return false;
   }
   util_queue_fence_init(&cs->flush_completed);

   cs->ws = ws;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;

   if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
      FREE(cs);
      return false;
   }
   if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
      radeon_destroy_cs_context(&ws->base, &cs->csc1);
      FREE(cs);
      return false;
   }

   /* Set the first command buffer as current. */
   cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;
   cs->ip_type = ip_type;

   memset(rcs, 0, sizeof(*rcs));
   rcs->current.buf = cs->csc->buf;
   rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
   rcs->priv = cs;

   p_atomic_inc(&ws->num_cs);
   return true;
}

static void radeon_drm_cs_set_preamble(struct radeon_cmdbuf *cs, const uint32_t *preamble_ib,
                                       unsigned preamble_num_dw, bool preamble_changed)
{
   /* The radeon kernel driver doesn't support preambles. */
   radeon_emit_array(cs, preamble_ib, preamble_num_dw);
}

int radeon_lookup_buffer(struct radeon_winsys *rws, struct radeon_cs_context *csc,
                         struct radeon_bo *bo)
{
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   struct radeon_bo_item *buffers;
   unsigned num_buffers;
   int i = csc->reloc_indices_hashlist[hash];

   if (bo->handle) {
      buffers = csc->relocs_bo;
      num_buffers = csc->num_relocs;
   } else {
      buffers = csc->slab_buffers;
      num_buffers = csc->num_slab_buffers;
   }

   /* not found or found */
   if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of relocs linearly. */
   for (i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this reloc in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of relocs:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         csc->reloc_indices_hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}
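
/* Note on radeon_lookup_buffer above: reloc_indices_hashlist is a direct-mapped
 * table indexed by the low bits of bo->hash, where each slot holds the index of
 * the last buffer seen for that hash value. A hit is confirmed by comparing
 * buffers[i].bo == bo; a miss or collision falls back to the linear scan and
 * then refreshes the slot, so repeated lookups of the same buffer stay O(1),
 * as promised in the comment at the top of this file. */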

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   struct drm_radeon_cs_reloc *reloc;
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   int i = -1;

   i = radeon_lookup_buffer(&cs->ws->base, csc, bo);

   if (i >= 0) {
      /* For async DMA, every add_buffer call must add a buffer to the list
       * no matter how many duplicates there are. This is due to the fact
       * the DMA CS checker doesn't use NOP packets for offset patching,
       * but always uses the i-th buffer from the list to patch the i-th
       * offset. If there are N offsets in a DMA CS, there must also be N
       * buffers in the relocation list.
       *
       * This doesn't have to be done if virtual memory is enabled,
       * because there is no offset patching with virtual memory.
       */
      if (cs->ip_type != AMD_IP_SDMA || cs->ws->info.r600_has_virtual_memory) {
         return i;
      }
   }

   /* New relocation, check if the backing array is large enough. */
   if (csc->num_relocs >= csc->max_relocs) {
      uint32_t size;
      csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

      size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
      csc->relocs_bo = realloc(csc->relocs_bo, size);

      size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
      csc->relocs = realloc(csc->relocs, size);

      csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   }

   /* Initialize the new relocation. */
   csc->relocs_bo[csc->num_relocs].bo = NULL;
   csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
   radeon_ws_bo_reference(&cs->ws->base, &csc->relocs_bo[csc->num_relocs].bo, bo);
   p_atomic_inc(&bo->num_cs_references);
   reloc = &csc->relocs[csc->num_relocs];
   reloc->handle = bo->handle;
   reloc->read_domains = 0;
   reloc->write_domain = 0;
   reloc->flags = 0;

   csc->reloc_indices_hashlist[hash] = csc->num_relocs;

   csc->chunks[1].length_dw += RELOC_DWORDS;

   return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   unsigned hash;
   struct radeon_bo_item *item;
   int idx;
   int real_idx;

   idx = radeon_lookup_buffer(&cs->ws->base, csc, bo);
   if (idx >= 0)
      return idx;

   real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

   /* Check if the backing array is large enough. */
   if (csc->num_slab_buffers >= csc->max_slab_buffers) {
      unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                              (unsigned)(csc->max_slab_buffers * 1.3));
      struct radeon_bo_item *new_buffers =
            REALLOC(csc->slab_buffers,
                    csc->max_slab_buffers * sizeof(*new_buffers),
                    new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
         return -1;
      }

      csc->max_slab_buffers = new_max;
      csc->slab_buffers = new_buffers;
   }

   /* Initialize the new relocation. */
   idx = csc->num_slab_buffers++;
   item = &csc->slab_buffers[idx];

   item->bo = NULL;
   item->u.slab.real_idx = real_idx;
   radeon_ws_bo_reference(&cs->ws->base, &item->bo, bo);
   p_atomic_inc(&bo->num_cs_references);

   hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   csc->reloc_indices_hashlist[hash] = idx;

   return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer_lean *buf,
                                         unsigned usage,
                                         enum radeon_bo_domain domains)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)buf;
   enum radeon_bo_domain added_domains;

   /* If VRAM is just stolen system memory, allow both VRAM and
    * GTT, whichever has free space. If a buffer is evicted from
    * VRAM to GTT, it will stay there.
    */
   if (!cs->ws->info.has_dedicated_vram)
      domains |= RADEON_DOMAIN_GTT;

   enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
   enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
   struct drm_radeon_cs_reloc *reloc;
   int index;

   if (!bo->handle) {
      index = radeon_lookup_or_add_slab_buffer(cs, bo);
      if (index < 0)
         return 0;

      index = cs->csc->slab_buffers[index].u.slab.real_idx;
   } else {
      index = radeon_lookup_or_add_real_buffer(cs, bo);
   }

   reloc = &cs->csc->relocs[index];
   added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
   reloc->read_domains |= rd;
   reloc->write_domain |= wd;

   /* The priority must be in [0, 15]. It's used by the kernel memory management. */
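   /* For example, if the highest priority bit set in 'usage' is bit 9,
    * util_last_bit() below returns 10 and the kernel priority becomes 5. */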
   unsigned priority = usage & RADEON_ALL_PRIORITIES;
   unsigned bo_priority = util_last_bit(priority) / 2;
   reloc->flags = MAX2(reloc->flags, bo_priority);
   cs->csc->relocs_bo[index].u.real.priority_usage |= priority;

   if (added_domains & RADEON_DOMAIN_VRAM)
      rcs->used_vram_kb += bo->base.size / 1024;
   else if (added_domains & RADEON_DOMAIN_GTT)
      rcs->used_gart_kb += bo->base.size / 1024;

   return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer_lean *buf)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   return radeon_lookup_buffer(&cs->ws->base, cs->csc, (struct radeon_bo*)buf);
}

static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
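   /* Illustrative example: with 2 GiB of GART, the check below starts failing
    * once this CS references more than ~1.6 GiB of GTT buffers, which triggers
    * the flush-and-retry path in the pipe driver described at the top of this
    * file. */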
   bool status =
         rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
         rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;

   if (status) {
      cs->csc->num_validated_relocs = cs->csc->num_relocs;
   } else {
      /* Remove lately-added buffers. The validation failed with them
       * and the CS is about to be flushed because of that. Keep only
       * the already-validated buffers. */
      unsigned i;

      for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
         p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
         radeon_ws_bo_reference(&cs->ws->base, &cs->csc->relocs_bo[i].bo, NULL);
      }
      cs->csc->num_relocs = cs->csc->num_validated_relocs;

      /* Flush if there are any relocs. Clean up otherwise. */
      if (cs->csc->num_relocs) {
         cs->flush_cs(cs->flush_data,
                      RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      } else {
         radeon_cs_context_cleanup(&cs->ws->base, cs->csc);
         rcs->used_vram_kb = 0;
         rcs->used_gart_kb = 0;

         assert(rcs->current.cdw == 0);
         if (rcs->current.cdw != 0) {
            fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
         }
      }
   }
   return status;
}

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
{
   assert(rcs->current.cdw <= rcs->current.max_dw);
   return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   int i;

   if (list) {
      for (i = 0; i < cs->csc->num_relocs; i++) {
         list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
         list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
         list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
      }
   }
   return cs->csc->num_relocs;
}

void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
{
   struct radeon_drm_cs *cs = (struct radeon_drm_cs*)job;
   struct radeon_cs_context *csc = cs->cst;
   unsigned i;
   int r;

   r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                           &csc->cs, sizeof(struct drm_radeon_cs));
   if (r) {
      if (r == -ENOMEM)
         fprintf(stderr, "radeon: Not enough memory for command submission.\n");
      else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
         unsigned i;

         fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
         for (i = 0; i < csc->chunks[0].length_dw; i++) {
            fprintf(stderr, "0x%08X\n", csc->buf[i]);
         }
      } else {
         fprintf(stderr, "radeon: The kernel rejected CS, "
                         "see dmesg for more information (%i).\n", r);
      }
   }

   for (i = 0; i < csc->num_relocs; i++)
      p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
   for (i = 0; i < csc->num_slab_buffers; i++)
      p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

   radeon_cs_context_cleanup(&cs->ws->base, csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   if (util_queue_is_initialized(&cs->ws->cs_queue))
      util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when bo participates in submissions on
 * two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_winsys *rws, struct radeon_bo *bo,
                                 struct radeon_bo *fence)
{
   unsigned dst;

   assert(fence->num_cs_references);

   /* Cleanup older fences */
   dst = 0;
   for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
      if (bo->u.slab.fences[src]->num_cs_references) {
         bo->u.slab.fences[dst] = bo->u.slab.fences[src];
         dst++;
      } else {
         radeon_ws_bo_reference(rws, &bo->u.slab.fences[src], NULL);
      }
   }
   bo->u.slab.num_fences = dst;

   /* Check available space for the new fence */
   if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
      unsigned new_max_fences = bo->u.slab.max_fences + 1;
      struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                              bo->u.slab.max_fences * sizeof(*new_fences),
                                              new_max_fences * sizeof(*new_fences));
      if (!new_fences) {
         fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
         return;
      }

      bo->u.slab.fences = new_fences;
      bo->u.slab.max_fences = new_max_fences;
   }

   /* Add the new fence */
   bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
   radeon_ws_bo_reference(rws, &bo->u.slab.fences[bo->u.slab.num_fences], fence);
   bo->u.slab.num_fences++;
}

static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_cs_context *tmp;

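   /* Each ring below is padded with NOPs to its fetch alignment: for example,
    * a GFX or SDMA IB that currently holds 13 dwords gets 3 NOPs appended to
    * reach the next multiple of 8, while UVD is padded to a multiple of 16. */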
   switch (cs->ip_type) {
   case AMD_IP_SDMA:
      /* pad DMA ring to 8 DWs */
      if (cs->ws->info.gfx_level <= GFX6) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case AMD_IP_GFX:
      /* Pad the GFX ring to 8 DWs to meet CP fetch alignment requirements;
       * r6xx requires at least 4 dw alignment to avoid a hw bug.
       */
      if (cs->ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
      }
      break;
   case AMD_IP_UVD:
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "radeon: command stream overflowed\n");
   }

   if (pfence || cs->csc->num_slab_buffers) {
      struct pipe_fence_handle *fence;

      if (cs->next_fence) {
         fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         fence = radeon_cs_create_fence(rcs);
      }

      if (fence) {
         if (pfence)
            radeon_fence_reference(&cs->ws->base, pfence, fence);

         mtx_lock(&cs->ws->bo_fence_lock);
         for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
            p_atomic_inc(&bo->num_active_ioctls);
            radeon_bo_slab_fence(&cs->ws->base, bo, (struct radeon_bo *)fence);
         }
         mtx_unlock(&cs->ws->bo_fence_lock);

         radeon_fence_reference(&cs->ws->base, &fence, NULL);
      }
   } else {
      radeon_fence_reference(&cs->ws->base, &cs->next_fence, NULL);
   }

   radeon_drm_cs_sync_flush(rcs);

   /* Swap command streams. */
   tmp = cs->csc;
   cs->csc = cs->cst;
   cs->cst = tmp;

   /* If the CS is not empty or overflowed, emit it in a separate thread. */
   if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
       !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
      unsigned i, num_relocs;

      num_relocs = cs->cst->num_relocs;

      cs->cst->chunks[0].length_dw = rcs->current.cdw;

      for (i = 0; i < num_relocs; i++) {
         /* Update the number of active asynchronous CS ioctls for the buffer. */
         p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
      }

      switch (cs->ip_type) {
      case AMD_IP_SDMA:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_DMA;
         cs->cst->cs.num_chunks = 3;
         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
         }
         break;

      case AMD_IP_UVD:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_UVD;
         cs->cst->cs.num_chunks = 3;
         break;

      case AMD_IP_VCE:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_VCE;
         cs->cst->cs.num_chunks = 3;
         break;

      default:
      case AMD_IP_GFX:
      case AMD_IP_COMPUTE:
         cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
         cs->cst->flags[1] = RADEON_CS_RING_GFX;
         cs->cst->cs.num_chunks = 3;

         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
         }
         if (flags & PIPE_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
         }
         if (cs->ip_type == AMD_IP_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
         }
         break;
      }

      if (util_queue_is_initialized(&cs->ws->cs_queue)) {
         util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                            radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
         if (!(flags & PIPE_FLUSH_ASYNC))
            radeon_drm_cs_sync_flush(rcs);
      } else {
         radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
      }
   } else {
      radeon_cs_context_cleanup(&cs->ws->base, cs->cst);
   }

   /* Prepare a new CS. */
   rcs->current.buf = cs->csc->buf;
   rcs->current.cdw = 0;
   rcs->used_vram_kb = 0;
   rcs->used_gart_kb = 0;

   if (cs->ip_type == AMD_IP_GFX)
      cs->ws->num_gfx_IBs++;
   else if (cs->ip_type == AMD_IP_SDMA)
      cs->ws->num_sdma_IBs++;
   return 0;
}

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   if (!cs)
      return;

   radeon_drm_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   radeon_cs_context_cleanup(&cs->ws->base, &cs->csc1);
   radeon_cs_context_cleanup(&cs->ws->base, &cs->csc2);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_destroy_cs_context(&cs->ws->base, &cs->csc1);
   radeon_destroy_cs_context(&cs->ws->base, &cs->csc2);
   radeon_fence_reference(&cs->ws->base, &cs->next_fence, NULL);
   FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer_lean *_buf,
                                    unsigned usage)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)_buf;
   int index;

   if (!bo->num_cs_references)
      return false;

   index = radeon_lookup_buffer(&cs->ws->base, cs->csc, bo);
   if (index == -1)
      return false;

   if (!bo->handle)
      index = cs->csc->slab_buffers[index].u.slab.real_idx;

   if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
      return true;
   if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
      return true;

   return false;
}

/* FENCES */

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pb_buffer_lean *fence;

   /* Create a fence, which is a dummy BO. */
   fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                      RADEON_DOMAIN_GTT,
                                      RADEON_FLAG_NO_SUBALLOC
                                      | RADEON_FLAG_NO_INTERPROCESS_SHARING);
   if (!fence)
      return NULL;

   /* Add the fence as a dummy relocation. */
   cs->ws->base.cs_add_buffer(rcs, fence,
                              RADEON_USAGE_READWRITE | RADEON_PRIO_FENCE_TRACE, RADEON_DOMAIN_GTT);
   return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
   return ws->buffer_wait(ws, (struct pb_buffer_lean*)fence, timeout,
                          RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct radeon_winsys *ws,
                                   struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   radeon_bo_reference(ws, (struct pb_buffer_lean**)dst, (struct pb_buffer_lean*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->next_fence) {
      radeon_fence_reference(&cs->ws->base, &fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->ws->base, &cs->next_fence, fence);
   return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1                   Thread 2 / Context 2
    *  --------------------                   --------------------
    *  f = cs_get_next_fence()
    *                                         cs_add_fence_dependency(f)
    *                                         cs_flush()
    *  cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
   ws->base.ctx_create = radeon_drm_ctx_create;
   ws->base.ctx_destroy = radeon_drm_ctx_destroy;
   ws->base.ctx_set_sw_reset_status = radeon_drm_ctx_set_sw_reset_status;
   ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
   ws->base.cs_create = radeon_drm_cs_create;
   ws->base.cs_destroy = radeon_drm_cs_destroy;
   ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
   ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
   ws->base.cs_validate = radeon_drm_cs_validate;
   ws->base.cs_check_space = radeon_drm_cs_check_space;
   ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
   ws->base.cs_flush = radeon_drm_cs_flush;
   ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
   ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
   ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
   ws->base.fence_wait = radeon_fence_wait;
   ws->base.fence_reference = radeon_fence_reference;
}