/*
 * Copyright © 2008 Jérôme Glisse
 * Copyright © 2010 Marek Olšák <[email protected]>
 *
 * SPDX-License-Identifier: MIT
 */

/*
    This file replaces libdrm's radeon_cs_gem with our own implementation.
    It's optimized specifically for Radeon DRM.
    Adding buffers and space checking are faster and simpler than their
    counterparts in libdrm (the time complexity of all the functions
    is O(1) in nearly all scenarios, thanks to hashing).

    It works like this:

    cs_add_buffer(cs, buf, read_domain, write_domain) adds a new relocation and
    also adds the size of 'buf' to the used_gart and used_vram winsys variables
    based on the domains, which are simply OR'd for accounting purposes.
    The adding is skipped if the reloc is already present in the list, but it
    still accounts for any newly referenced domains.

    cs_validate is then called, which just checks:
        used_vram/gart < vram/gart_size * 0.8
    The 0.8 factor allows for some memory fragmentation. If the validation
    fails, the pipe driver flushes the CS and retries the validation,
    i.e. it validates only that one operation. If it fails again, it drops
    the operation on the floor and prints a nasty message to stderr.
    (This is done in the pipe driver.)

    cs_write_reloc(cs, buf) just writes a reloc that has been added using
    cs_add_buffer. The read_domain and write_domain parameters have been removed,
    because we already specify them in cs_add_buffer.
*/
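
/*
    Illustrative sketch only (not code from any real pipe driver): a driver is
    expected to use this interface roughly as follows, where 'ws' is the
    radeon_winsys vtable set up at the bottom of this file and 'rcs' is a
    radeon_cmdbuf. The flush helper around the retry is hypothetical.

       ws->cs_add_buffer(rcs, buf, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM);
       if (!ws->cs_validate(rcs)) {
          // Flush the current IB and retry this one operation exactly once.
          flush_current_ib();
          ws->cs_add_buffer(rcs, buf, RADEON_USAGE_READ, RADEON_DOMAIN_VRAM);
          if (!ws->cs_validate(rcs))
             fprintf(stderr, "radeon: dropping operation, buffers don't fit\n");
       }
       // ... emit packets referencing 'buf', then ws->cs_flush(rcs, ...) ...
*/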

#include "radeon_drm_cs.h"

#include "util/u_memory.h"
#include "util/os_time.h"

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <xf86drm.h>


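/* Size of one kernel relocation record in dwords, as consumed by the RELOCS
 * chunk of the CS ioctl. */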
#define RELOC_DWORDS (sizeof(struct drm_radeon_cs_reloc) / sizeof(uint32_t))

static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs);
static void radeon_fence_reference(struct radeon_winsys *ws,
                                   struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src);

static struct radeon_winsys_ctx *radeon_drm_ctx_create(struct radeon_winsys *ws,
                                                       enum radeon_ctx_priority priority,
                                                       bool allow_context_lost)
{
   struct radeon_ctx *ctx = CALLOC_STRUCT(radeon_ctx);
   if (!ctx)
      return NULL;

   ctx->ws = (struct radeon_drm_winsys*)ws;
   ctx->gpu_reset_counter = radeon_drm_get_gpu_reset_counter(ctx->ws);
   return (struct radeon_winsys_ctx*)ctx;
}

static void radeon_drm_ctx_destroy(struct radeon_winsys_ctx *ctx)
{
   FREE(ctx);
}

static void
radeon_drm_ctx_set_sw_reset_status(struct radeon_winsys_ctx *rwctx, enum pipe_reset_status status,
                                   const char *format, ...)
{
   /* TODO: we should do something better here */
   va_list args;

   va_start(args, format);
   vfprintf(stderr, format, args);
   va_end(args);
}

static enum pipe_reset_status
radeon_drm_ctx_query_reset_status(struct radeon_winsys_ctx *rctx, bool full_reset_only,
                                  bool *needs_reset, bool *reset_completed)
{
   struct radeon_ctx *ctx = (struct radeon_ctx*)rctx;

   unsigned latest = radeon_drm_get_gpu_reset_counter(ctx->ws);

   if (ctx->gpu_reset_counter == latest) {
      if (needs_reset)
         *needs_reset = false;
      if (reset_completed)
         *reset_completed = false;
      return PIPE_NO_RESET;
   }

   if (needs_reset)
      *needs_reset = true;
   if (reset_completed)
      *reset_completed = true;

   ctx->gpu_reset_counter = latest;
   return PIPE_UNKNOWN_CONTEXT_RESET;
}

static bool radeon_init_cs_context(struct radeon_cs_context *csc,
                                   struct radeon_drm_winsys *ws)
{
   int i;

   csc->fd = ws->fd;

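   /* Chunk 0 carries the IB itself, chunk 1 the relocation list and chunk 2
    * the flags dwords; all three are handed to the kernel through the
    * DRM_RADEON_CS ioctl. */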
   csc->chunks[0].chunk_id = RADEON_CHUNK_ID_IB;
   csc->chunks[0].length_dw = 0;
   csc->chunks[0].chunk_data = (uint64_t)(uintptr_t)csc->buf;
   csc->chunks[1].chunk_id = RADEON_CHUNK_ID_RELOCS;
   csc->chunks[1].length_dw = 0;
   csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   csc->chunks[2].chunk_id = RADEON_CHUNK_ID_FLAGS;
   csc->chunks[2].length_dw = 2;
   csc->chunks[2].chunk_data = (uint64_t)(uintptr_t)&csc->flags;

   csc->chunk_array[0] = (uint64_t)(uintptr_t)&csc->chunks[0];
   csc->chunk_array[1] = (uint64_t)(uintptr_t)&csc->chunks[1];
   csc->chunk_array[2] = (uint64_t)(uintptr_t)&csc->chunks[2];

   csc->cs.chunks = (uint64_t)(uintptr_t)csc->chunk_array;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
   return true;
}

static void radeon_cs_context_cleanup(struct radeon_winsys *rws,
                                      struct radeon_cs_context *csc)
{
   unsigned i;

   for (i = 0; i < csc->num_relocs; i++) {
      p_atomic_dec(&csc->relocs_bo[i].bo->num_cs_references);
      radeon_ws_bo_reference(rws, &csc->relocs_bo[i].bo, NULL);
   }
   for (i = 0; i < csc->num_slab_buffers; ++i) {
      p_atomic_dec(&csc->slab_buffers[i].bo->num_cs_references);
      radeon_ws_bo_reference(rws, &csc->slab_buffers[i].bo, NULL);
   }

   csc->num_relocs = 0;
   csc->num_validated_relocs = 0;
   csc->num_slab_buffers = 0;
   csc->chunks[0].length_dw = 0;
   csc->chunks[1].length_dw = 0;

   for (i = 0; i < ARRAY_SIZE(csc->reloc_indices_hashlist); i++) {
      csc->reloc_indices_hashlist[i] = -1;
   }
}

static void radeon_destroy_cs_context(struct radeon_winsys *rws, struct radeon_cs_context *csc)
{
   radeon_cs_context_cleanup(rws, csc);
   FREE(csc->slab_buffers);
   FREE(csc->relocs_bo);
   FREE(csc->relocs);
}


static enum amd_ip_type radeon_drm_cs_get_ip_type(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   return cs->ip_type;
}


static bool
radeon_drm_cs_create(struct radeon_cmdbuf *rcs,
                     struct radeon_winsys_ctx *ctx,
                     enum amd_ip_type ip_type,
                     void (*flush)(void *ctx, unsigned flags,
                                   struct pipe_fence_handle **fence),
                     void *flush_ctx)
{
   struct radeon_drm_winsys *ws = ((struct radeon_ctx*)ctx)->ws;
   struct radeon_drm_cs *cs;

   cs = CALLOC_STRUCT(radeon_drm_cs);
   if (!cs) {
      return false;
   }
   util_queue_fence_init(&cs->flush_completed);

   cs->ws = ws;
   cs->flush_cs = flush;
   cs->flush_data = flush_ctx;

   if (!radeon_init_cs_context(&cs->csc1, cs->ws)) {
      FREE(cs);
      return false;
   }
   if (!radeon_init_cs_context(&cs->csc2, cs->ws)) {
      radeon_destroy_cs_context(&ws->base, &cs->csc1);
      FREE(cs);
      return false;
   }

   /* Set the first command buffer as current. */
   cs->csc = &cs->csc1;
   cs->cst = &cs->csc2;
   cs->ip_type = ip_type;

   memset(rcs, 0, sizeof(*rcs));
   rcs->current.buf = cs->csc->buf;
   rcs->current.max_dw = ARRAY_SIZE(cs->csc->buf);
   rcs->priv = cs;

   p_atomic_inc(&ws->num_cs);
   return true;
}

static void radeon_drm_cs_set_preamble(struct radeon_cmdbuf *cs, const uint32_t *preamble_ib,
                                       unsigned preamble_num_dw, bool preamble_changed)
{
   /* The radeon kernel driver doesn't support preambles. */
   radeon_emit_array(cs, preamble_ib, preamble_num_dw);
}

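/* Return the index of the buffer in the relocation list of the current CS
 * (real buffers when bo->handle is set, slab entries otherwise), or -1 if
 * the buffer hasn't been added yet. */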
int radeon_lookup_buffer(struct radeon_winsys *rws, struct radeon_cs_context *csc,
                         struct radeon_bo *bo)
{
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   struct radeon_bo_item *buffers;
   unsigned num_buffers;
   int i = csc->reloc_indices_hashlist[hash];

   if (bo->handle) {
      buffers = csc->relocs_bo;
      num_buffers = csc->num_relocs;
   } else {
      buffers = csc->slab_buffers;
      num_buffers = csc->num_slab_buffers;
   }

   /* not found or found */
   if (i == -1 || (i < num_buffers && buffers[i].bo == bo))
      return i;

   /* Hash collision, look for the BO in the list of relocs linearly. */
   for (i = num_buffers - 1; i >= 0; i--) {
      if (buffers[i].bo == bo) {
         /* Put this reloc in the hash list.
          * This will prevent additional hash collisions if there are
          * several consecutive lookup_buffer calls for the same buffer.
          *
          * Example: Assuming buffers A,B,C collide in the hash list,
          * the following sequence of relocs:
          *         AAAAAAAAAAABBBBBBBBBBBBBBCCCCCCCC
          * will collide here: ^ and here:   ^,
          * meaning that we should get very few collisions in the end. */
         csc->reloc_indices_hashlist[hash] = i;
         return i;
      }
   }
   return -1;
}

static unsigned radeon_lookup_or_add_real_buffer(struct radeon_drm_cs *cs,
                                                 struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   struct drm_radeon_cs_reloc *reloc;
   unsigned hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   int i = -1;

   i = radeon_lookup_buffer(&cs->ws->base, csc, bo);

   if (i >= 0) {
      /* For async DMA, every add_buffer call must add a buffer to the list
       * no matter how many duplicates there are. This is due to the fact
       * the DMA CS checker doesn't use NOP packets for offset patching,
       * but always uses the i-th buffer from the list to patch the i-th
       * offset. If there are N offsets in a DMA CS, there must also be N
       * buffers in the relocation list.
       *
       * This doesn't have to be done if virtual memory is enabled,
       * because there is no offset patching with virtual memory.
       */
      if (cs->ip_type != AMD_IP_SDMA || cs->ws->info.r600_has_virtual_memory) {
         return i;
      }
   }

   /* New relocation, check if the backing array is large enough. */
   if (csc->num_relocs >= csc->max_relocs) {
      uint32_t size;
      csc->max_relocs = MAX2(csc->max_relocs + 16, (unsigned)(csc->max_relocs * 1.3));

      size = csc->max_relocs * sizeof(csc->relocs_bo[0]);
      csc->relocs_bo = realloc(csc->relocs_bo, size);

      size = csc->max_relocs * sizeof(struct drm_radeon_cs_reloc);
      csc->relocs = realloc(csc->relocs, size);

      csc->chunks[1].chunk_data = (uint64_t)(uintptr_t)csc->relocs;
   }

   /* Initialize the new relocation. */
   csc->relocs_bo[csc->num_relocs].bo = NULL;
   csc->relocs_bo[csc->num_relocs].u.real.priority_usage = 0;
   radeon_ws_bo_reference(&cs->ws->base, &csc->relocs_bo[csc->num_relocs].bo, bo);
   p_atomic_inc(&bo->num_cs_references);
   reloc = &csc->relocs[csc->num_relocs];
   reloc->handle = bo->handle;
   reloc->read_domains = 0;
   reloc->write_domain = 0;
   reloc->flags = 0;

   csc->reloc_indices_hashlist[hash] = csc->num_relocs;

   csc->chunks[1].length_dw += RELOC_DWORDS;

   return csc->num_relocs++;
}

static int radeon_lookup_or_add_slab_buffer(struct radeon_drm_cs *cs,
                                            struct radeon_bo *bo)
{
   struct radeon_cs_context *csc = cs->csc;
   unsigned hash;
   struct radeon_bo_item *item;
   int idx;
   int real_idx;

   idx = radeon_lookup_buffer(&cs->ws->base, csc, bo);
   if (idx >= 0)
      return idx;

   real_idx = radeon_lookup_or_add_real_buffer(cs, bo->u.slab.real);

   /* Check if the backing array is large enough. */
   if (csc->num_slab_buffers >= csc->max_slab_buffers) {
      unsigned new_max = MAX2(csc->max_slab_buffers + 16,
                              (unsigned)(csc->max_slab_buffers * 1.3));
      struct radeon_bo_item *new_buffers =
         REALLOC(csc->slab_buffers,
                 csc->max_slab_buffers * sizeof(*new_buffers),
                 new_max * sizeof(*new_buffers));
      if (!new_buffers) {
         fprintf(stderr, "radeon_lookup_or_add_slab_buffer: allocation failure\n");
         return -1;
      }

      csc->max_slab_buffers = new_max;
      csc->slab_buffers = new_buffers;
   }

   /* Initialize the new relocation. */
   idx = csc->num_slab_buffers++;
   item = &csc->slab_buffers[idx];

   item->bo = NULL;
   item->u.slab.real_idx = real_idx;
   radeon_ws_bo_reference(&cs->ws->base, &item->bo, bo);
   p_atomic_inc(&bo->num_cs_references);

   hash = bo->hash & (ARRAY_SIZE(csc->reloc_indices_hashlist)-1);
   csc->reloc_indices_hashlist[hash] = idx;

   return idx;
}

static unsigned radeon_drm_cs_add_buffer(struct radeon_cmdbuf *rcs,
                                         struct pb_buffer_lean *buf,
                                         unsigned usage,
                                         enum radeon_bo_domain domains)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)buf;
   enum radeon_bo_domain added_domains;

   /* If VRAM is just stolen system memory, allow both VRAM and
    * GTT, whichever has free space. If a buffer is evicted from
    * VRAM to GTT, it will stay there.
    */
   if (!cs->ws->info.has_dedicated_vram)
      domains |= RADEON_DOMAIN_GTT;

   enum radeon_bo_domain rd = usage & RADEON_USAGE_READ ? domains : 0;
   enum radeon_bo_domain wd = usage & RADEON_USAGE_WRITE ? domains : 0;
   struct drm_radeon_cs_reloc *reloc;
   int index;

   if (!bo->handle) {
      index = radeon_lookup_or_add_slab_buffer(cs, bo);
      if (index < 0)
         return 0;

      index = cs->csc->slab_buffers[index].u.slab.real_idx;
   } else {
      index = radeon_lookup_or_add_real_buffer(cs, bo);
   }

   reloc = &cs->csc->relocs[index];
   added_domains = (rd | wd) & ~(reloc->read_domains | reloc->write_domain);
   reloc->read_domains |= rd;
   reloc->write_domain |= wd;

   /* The priority must be in [0, 15]. It's used by the kernel memory management. */
   unsigned priority = usage & RADEON_ALL_PRIORITIES;
   unsigned bo_priority = util_last_bit(priority) / 2;
   reloc->flags = MAX2(reloc->flags, bo_priority);
   cs->csc->relocs_bo[index].u.real.priority_usage |= priority;

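   /* Account the buffer size against the GART/VRAM budgets that cs_validate
    * checks. added_domains contains only the domains that were not already
    * recorded for this relocation, so re-adding a buffer doesn't inflate the
    * counters. */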
   if (added_domains & RADEON_DOMAIN_VRAM)
      rcs->used_vram_kb += bo->base.size / 1024;
   else if (added_domains & RADEON_DOMAIN_GTT)
      rcs->used_gart_kb += bo->base.size / 1024;

   return index;
}

static int radeon_drm_cs_lookup_buffer(struct radeon_cmdbuf *rcs,
                                       struct pb_buffer_lean *buf)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   return radeon_lookup_buffer(&cs->ws->base, cs->csc, (struct radeon_bo*)buf);
}

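/* Check whether the buffers added so far still fit into 80% of GTT and VRAM.
 * On failure, the buffers added since the last successful validation are
 * dropped so that the pipe driver can flush and retry just the failing
 * operation. */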
static bool radeon_drm_cs_validate(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   bool status =
      rcs->used_gart_kb < cs->ws->info.gart_size_kb * 0.8 &&
      rcs->used_vram_kb < cs->ws->info.vram_size_kb * 0.8;

   if (status) {
      cs->csc->num_validated_relocs = cs->csc->num_relocs;
   } else {
      /* Remove the recently added buffers. The validation failed with them
       * and the CS is about to be flushed because of that. Keep only
       * the already-validated buffers. */
      unsigned i;

      for (i = cs->csc->num_validated_relocs; i < cs->csc->num_relocs; i++) {
         p_atomic_dec(&cs->csc->relocs_bo[i].bo->num_cs_references);
         radeon_ws_bo_reference(&cs->ws->base, &cs->csc->relocs_bo[i].bo, NULL);
      }
      cs->csc->num_relocs = cs->csc->num_validated_relocs;

      /* Flush if there are any relocs. Clean up otherwise. */
      if (cs->csc->num_relocs) {
         cs->flush_cs(cs->flush_data,
                      RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
      } else {
         radeon_cs_context_cleanup(&cs->ws->base, cs->csc);
         rcs->used_vram_kb = 0;
         rcs->used_gart_kb = 0;

         assert(rcs->current.cdw == 0);
         if (rcs->current.cdw != 0) {
            fprintf(stderr, "radeon: Unexpected error in %s.\n", __func__);
         }
      }
   }
   return status;
}

static bool radeon_drm_cs_check_space(struct radeon_cmdbuf *rcs, unsigned dw)
{
   assert(rcs->current.cdw <= rcs->current.max_dw);
   return rcs->current.max_dw - rcs->current.cdw >= dw;
}

static unsigned radeon_drm_cs_get_buffer_list(struct radeon_cmdbuf *rcs,
                                              struct radeon_bo_list_item *list)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   int i;

   if (list) {
      for (i = 0; i < cs->csc->num_relocs; i++) {
         list[i].bo_size = cs->csc->relocs_bo[i].bo->base.size;
         list[i].vm_address = cs->csc->relocs_bo[i].bo->va;
         list[i].priority_usage = cs->csc->relocs_bo[i].u.real.priority_usage;
      }
   }
   return cs->csc->num_relocs;
}

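/* Submit the command stream to the kernel. This runs either synchronously or
 * on the winsys CS thread; it consumes cs->cst, which the flush path swaps
 * away from the context that keeps accumulating new commands. */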
void radeon_drm_cs_emit_ioctl_oneshot(void *job, void *gdata, int thread_index)
{
   struct radeon_drm_cs *cs = (struct radeon_drm_cs*)job;
   struct radeon_cs_context *csc = cs->cst;
   unsigned i;
   int r;

   r = drmCommandWriteRead(csc->fd, DRM_RADEON_CS,
                           &csc->cs, sizeof(struct drm_radeon_cs));
   if (r) {
      if (r == -ENOMEM)
         fprintf(stderr, "radeon: Not enough memory for command submission.\n");
      else if (debug_get_bool_option("RADEON_DUMP_CS", false)) {
         unsigned i;

         fprintf(stderr, "radeon: The kernel rejected CS, dumping...\n");
         for (i = 0; i < csc->chunks[0].length_dw; i++) {
            fprintf(stderr, "0x%08X\n", csc->buf[i]);
         }
      } else {
         fprintf(stderr, "radeon: The kernel rejected CS, "
                 "see dmesg for more information (%i).\n", r);
      }
   }

   for (i = 0; i < csc->num_relocs; i++)
      p_atomic_dec(&csc->relocs_bo[i].bo->num_active_ioctls);
   for (i = 0; i < csc->num_slab_buffers; i++)
      p_atomic_dec(&csc->slab_buffers[i].bo->num_active_ioctls);

   radeon_cs_context_cleanup(&cs->ws->base, csc);
}

/*
 * Make sure previous submissions of this CS have completed.
 */
void radeon_drm_cs_sync_flush(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   /* Wait for any pending ioctl of this CS to complete. */
   if (util_queue_is_initialized(&cs->ws->cs_queue))
      util_queue_fence_wait(&cs->flush_completed);
}

/* Add the given fence to a slab buffer fence list.
 *
 * There is a potential race condition when a BO participates in submissions
 * on two or more threads simultaneously. Since we do not know which of the
 * submissions will be sent to the GPU first, we have to keep the fences
 * of all submissions.
 *
 * However, fences that belong to submissions that have already returned from
 * their respective ioctl do not have to be kept, because we know that they
 * will signal earlier.
 */
static void radeon_bo_slab_fence(struct radeon_winsys *rws, struct radeon_bo *bo,
                                 struct radeon_bo *fence)
{
   unsigned dst;

   assert(fence->num_cs_references);

   /* Cleanup older fences */
   dst = 0;
   for (unsigned src = 0; src < bo->u.slab.num_fences; ++src) {
      if (bo->u.slab.fences[src]->num_cs_references) {
         bo->u.slab.fences[dst] = bo->u.slab.fences[src];
         dst++;
      } else {
         radeon_ws_bo_reference(rws, &bo->u.slab.fences[src], NULL);
      }
   }
   bo->u.slab.num_fences = dst;

   /* Check available space for the new fence */
   if (bo->u.slab.num_fences >= bo->u.slab.max_fences) {
      unsigned new_max_fences = bo->u.slab.max_fences + 1;
      struct radeon_bo **new_fences = REALLOC(bo->u.slab.fences,
                                              bo->u.slab.max_fences * sizeof(*new_fences),
                                              new_max_fences * sizeof(*new_fences));
      if (!new_fences) {
         fprintf(stderr, "radeon_bo_slab_fence: allocation failure, dropping fence\n");
         return;
      }

      bo->u.slab.fences = new_fences;
      bo->u.slab.max_fences = new_max_fences;
   }

   /* Add the new fence */
   bo->u.slab.fences[bo->u.slab.num_fences] = NULL;
   radeon_ws_bo_reference(rws, &bo->u.slab.fences[bo->u.slab.num_fences], fence);
   bo->u.slab.num_fences++;
}

static int radeon_drm_cs_flush(struct radeon_cmdbuf *rcs,
                               unsigned flags,
                               struct pipe_fence_handle **pfence)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_cs_context *tmp;

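   /* Pad the IB with NOPs so that its final size meets the fetch/alignment
    * requirements of the target ring before submission. */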
   switch (cs->ip_type) {
   case AMD_IP_SDMA:
      /* pad DMA ring to 8 DWs */
      if (cs->ws->info.gfx_level <= GFX6) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xf0000000); /* NOP packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x00000000); /* NOP packet */
      }
      break;
   case AMD_IP_GFX:
      /* pad GFX ring to 8 DWs to meet CP fetch alignment requirements;
       * r6xx requires at least 4 DW alignment to avoid a hw bug.
       */
      if (cs->ws->info.gfx_ib_pad_with_type2) {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      } else {
         while (rcs->current.cdw & 7)
            radeon_emit(rcs, 0xffff1000); /* type3 nop packet */
      }
      break;
   case AMD_IP_UVD:
      while (rcs->current.cdw & 15)
         radeon_emit(rcs, 0x80000000); /* type2 nop packet */
      break;
   default:
      break;
   }

   if (rcs->current.cdw > rcs->current.max_dw) {
      fprintf(stderr, "radeon: command stream overflowed\n");
   }

   if (pfence || cs->csc->num_slab_buffers) {
      struct pipe_fence_handle *fence;

      if (cs->next_fence) {
         fence = cs->next_fence;
         cs->next_fence = NULL;
      } else {
         fence = radeon_cs_create_fence(rcs);
      }

      if (fence) {
         if (pfence)
            radeon_fence_reference(&cs->ws->base, pfence, fence);

         mtx_lock(&cs->ws->bo_fence_lock);
         for (unsigned i = 0; i < cs->csc->num_slab_buffers; ++i) {
            struct radeon_bo *bo = cs->csc->slab_buffers[i].bo;
            p_atomic_inc(&bo->num_active_ioctls);
            radeon_bo_slab_fence(&cs->ws->base, bo, (struct radeon_bo *)fence);
         }
         mtx_unlock(&cs->ws->bo_fence_lock);

         radeon_fence_reference(&cs->ws->base, &fence, NULL);
      }
   } else {
      radeon_fence_reference(&cs->ws->base, &cs->next_fence, NULL);
   }

   radeon_drm_cs_sync_flush(rcs);

   /* Swap command streams. */
   tmp = cs->csc;
   cs->csc = cs->cst;
   cs->cst = tmp;

   /* If the CS is not empty and hasn't overflowed, emit it in a separate thread. */
   if (rcs->current.cdw && rcs->current.cdw <= rcs->current.max_dw &&
       !cs->ws->noop_cs && !(flags & RADEON_FLUSH_NOOP)) {
      unsigned i, num_relocs;

      num_relocs = cs->cst->num_relocs;

      cs->cst->chunks[0].length_dw = rcs->current.cdw;

      for (i = 0; i < num_relocs; i++) {
         /* Update the number of active asynchronous CS ioctls for the buffer. */
         p_atomic_inc(&cs->cst->relocs_bo[i].bo->num_active_ioctls);
      }

      switch (cs->ip_type) {
      case AMD_IP_SDMA:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_DMA;
         cs->cst->cs.num_chunks = 3;
         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
         }
         break;

      case AMD_IP_UVD:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_UVD;
         cs->cst->cs.num_chunks = 3;
         break;

      case AMD_IP_VCE:
         cs->cst->flags[0] = 0;
         cs->cst->flags[1] = RADEON_CS_RING_VCE;
         cs->cst->cs.num_chunks = 3;
         break;

      default:
      case AMD_IP_GFX:
      case AMD_IP_COMPUTE:
         cs->cst->flags[0] = RADEON_CS_KEEP_TILING_FLAGS;
         cs->cst->flags[1] = RADEON_CS_RING_GFX;
         cs->cst->cs.num_chunks = 3;

         if (cs->ws->info.r600_has_virtual_memory) {
            cs->cst->flags[0] |= RADEON_CS_USE_VM;
            cs->cst->cs.num_chunks = 3;
         }
         if (flags & PIPE_FLUSH_END_OF_FRAME) {
            cs->cst->flags[0] |= RADEON_CS_END_OF_FRAME;
            cs->cst->cs.num_chunks = 3;
         }
         if (cs->ip_type == AMD_IP_COMPUTE) {
            cs->cst->flags[1] = RADEON_CS_RING_COMPUTE;
            cs->cst->cs.num_chunks = 3;
         }
         break;
      }

      if (util_queue_is_initialized(&cs->ws->cs_queue)) {
         util_queue_add_job(&cs->ws->cs_queue, cs, &cs->flush_completed,
                            radeon_drm_cs_emit_ioctl_oneshot, NULL, 0);
         if (!(flags & PIPE_FLUSH_ASYNC))
            radeon_drm_cs_sync_flush(rcs);
      } else {
         radeon_drm_cs_emit_ioctl_oneshot(cs, NULL, 0);
      }
   } else {
      radeon_cs_context_cleanup(&cs->ws->base, cs->cst);
   }

   /* Prepare a new CS. */
   rcs->current.buf = cs->csc->buf;
   rcs->current.cdw = 0;
   rcs->used_vram_kb = 0;
   rcs->used_gart_kb = 0;

   if (cs->ip_type == AMD_IP_GFX)
      cs->ws->num_gfx_IBs++;
   else if (cs->ip_type == AMD_IP_SDMA)
      cs->ws->num_sdma_IBs++;
   return 0;
}

static void radeon_drm_cs_destroy(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);

   if (!cs)
      return;

   radeon_drm_cs_sync_flush(rcs);
   util_queue_fence_destroy(&cs->flush_completed);
   radeon_cs_context_cleanup(&cs->ws->base, &cs->csc1);
   radeon_cs_context_cleanup(&cs->ws->base, &cs->csc2);
   p_atomic_dec(&cs->ws->num_cs);
   radeon_destroy_cs_context(&cs->ws->base, &cs->csc1);
   radeon_destroy_cs_context(&cs->ws->base, &cs->csc2);
   radeon_fence_reference(&cs->ws->base, &cs->next_fence, NULL);
   FREE(cs);
}

static bool radeon_bo_is_referenced(struct radeon_cmdbuf *rcs,
                                    struct pb_buffer_lean *_buf,
                                    unsigned usage)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct radeon_bo *bo = (struct radeon_bo*)_buf;
   int index;

   if (!bo->num_cs_references)
      return false;

   index = radeon_lookup_buffer(&cs->ws->base, cs->csc, bo);
   if (index == -1)
      return false;

   if (!bo->handle)
      index = cs->csc->slab_buffers[index].u.slab.real_idx;

   if ((usage & RADEON_USAGE_WRITE) && cs->csc->relocs[index].write_domain)
      return true;
   if ((usage & RADEON_USAGE_READ) && cs->csc->relocs[index].read_domains)
      return true;

   return false;
}

/* FENCES */

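/* A fence is implemented as a dummy one-byte GTT buffer added to the CS as a
 * relocation, so waiting on the fence reduces to waiting for that buffer to
 * become idle. */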
static struct pipe_fence_handle *radeon_cs_create_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pb_buffer_lean *fence;

   /* Create a fence, which is a dummy BO. */
   fence = cs->ws->base.buffer_create(&cs->ws->base, 1, 1,
                                      RADEON_DOMAIN_GTT,
                                      RADEON_FLAG_NO_SUBALLOC
                                      | RADEON_FLAG_NO_INTERPROCESS_SHARING);
   if (!fence)
      return NULL;

   /* Add the fence as a dummy relocation. */
   cs->ws->base.cs_add_buffer(rcs, fence,
                              RADEON_USAGE_READWRITE | RADEON_PRIO_FENCE_TRACE, RADEON_DOMAIN_GTT);
   return (struct pipe_fence_handle*)fence;
}

static bool radeon_fence_wait(struct radeon_winsys *ws,
                              struct pipe_fence_handle *fence,
                              uint64_t timeout)
{
   return ws->buffer_wait(ws, (struct pb_buffer_lean*)fence, timeout,
                          RADEON_USAGE_READWRITE);
}

static void radeon_fence_reference(struct radeon_winsys *ws,
                                   struct pipe_fence_handle **dst,
                                   struct pipe_fence_handle *src)
{
   radeon_bo_reference(ws, (struct pb_buffer_lean**)dst, (struct pb_buffer_lean*)src);
}

static struct pipe_fence_handle *radeon_drm_cs_get_next_fence(struct radeon_cmdbuf *rcs)
{
   struct radeon_drm_cs *cs = radeon_drm_cs(rcs);
   struct pipe_fence_handle *fence = NULL;

   if (cs->next_fence) {
      radeon_fence_reference(&cs->ws->base, &fence, cs->next_fence);
      return fence;
   }

   fence = radeon_cs_create_fence(rcs);
   if (!fence)
      return NULL;

   radeon_fence_reference(&cs->ws->base, &cs->next_fence, fence);
   return fence;
}

static void
radeon_drm_cs_add_fence_dependency(struct radeon_cmdbuf *cs,
                                   struct pipe_fence_handle *fence)
{
   /* TODO: Handle the following unlikely multi-threaded scenario:
    *
    *  Thread 1 / Context 1            Thread 2 / Context 2
    *  --------------------            --------------------
    *  f = cs_get_next_fence()
    *                                  cs_add_fence_dependency(f)
    *                                  cs_flush()
    *  cs_flush()
    *
    * We currently assume that this does not happen because we don't support
    * asynchronous flushes on Radeon.
    */
}

void radeon_drm_cs_init_functions(struct radeon_drm_winsys *ws)
{
   ws->base.ctx_create = radeon_drm_ctx_create;
   ws->base.ctx_destroy = radeon_drm_ctx_destroy;
   ws->base.ctx_set_sw_reset_status = radeon_drm_ctx_set_sw_reset_status;
   ws->base.ctx_query_reset_status = radeon_drm_ctx_query_reset_status;
   ws->base.cs_create = radeon_drm_cs_create;
   ws->base.cs_destroy = radeon_drm_cs_destroy;
   ws->base.cs_add_buffer = radeon_drm_cs_add_buffer;
   ws->base.cs_lookup_buffer = radeon_drm_cs_lookup_buffer;
   ws->base.cs_validate = radeon_drm_cs_validate;
   ws->base.cs_check_space = radeon_drm_cs_check_space;
   ws->base.cs_get_buffer_list = radeon_drm_cs_get_buffer_list;
   ws->base.cs_flush = radeon_drm_cs_flush;
   ws->base.cs_get_next_fence = radeon_drm_cs_get_next_fence;
   ws->base.cs_is_buffer_referenced = radeon_bo_is_referenced;
   ws->base.cs_sync_flush = radeon_drm_cs_sync_flush;
   ws->base.cs_add_fence_dependency = radeon_drm_cs_add_fence_dependency;
   ws->base.fence_wait = radeon_fence_wait;
   ws->base.fence_reference = radeon_fence_reference;
}