/*
 * Copyright © 2012-2018 Rob Clark <[email protected]>
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Rob Clark <[email protected]>
 */

#ifndef FREEDRENO_PRIV_H_
#define FREEDRENO_PRIV_H_

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#include <xf86drm.h>

#include "util/hash_table.h"
#include "util/list.h"
#include "util/log.h"
#include "util/perf/cpu_trace.h"
#include "util/simple_mtx.h"
#include "util/slab.h"
#include "util/u_atomic.h"
#include "util/u_debug.h"
#include "util/u_math.h"
#include "util/vma.h"

#include "freedreno_common.h"
#include "freedreno_dev_info.h"
#include "freedreno_drmif.h"
#include "freedreno_rd_output.h"
#include "freedreno_ringbuffer.h"

extern simple_mtx_t table_lock;
extern simple_mtx_t fence_lock;
extern uint64_t os_page_size;

#define SUBALLOC_SIZE (32 * 1024)
/* Maximum known alignment requirement is a6xx's TEX_CONST at 16 dwords */
#define SUBALLOC_ALIGNMENT 64
#define RING_FLAGS (FD_BO_GPUREADONLY | FD_BO_CACHED_COHERENT | FD_BO_HINT_COMMAND)

/*
 * Stupid/simple growable array implementation:
 */

#define MAX_ARRAY_SIZE ((unsigned short)~0)

static inline void
grow(void **ptr, uint16_t nr, uint16_t *max, uint16_t sz)
{
   assert((nr + 1) < MAX_ARRAY_SIZE);
   if ((nr + 1) > *max) {
      if (*max > MAX_ARRAY_SIZE / 2)
         *max = MAX_ARRAY_SIZE;
      else if ((*max * 2) < (nr + 1))
         *max = nr + 5;
      else
         *max = *max * 2;
      *ptr = realloc(*ptr, *max * sz);
   }
}

#define DECLARE_ARRAY(type, name)                                              \
   unsigned short nr_##name, max_##name;                                       \
   type *name;

#define APPEND(x, name, ...)                                                   \
   ({                                                                          \
      grow((void **)&(x)->name, (x)->nr_##name, &(x)->max_##name,              \
           sizeof((x)->name[0]));                                              \
      (x)->name[(x)->nr_##name] = __VA_ARGS__;                                 \
      (x)->nr_##name++;                                                        \
   })
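
/* Usage sketch (illustrative only, hypothetical names): a zero-initialized
 * struct embeds the array with DECLARE_ARRAY() and pushes elements with
 * APPEND(), which grows the backing storage as needed:
 *
 *    struct cmd_list {
 *       DECLARE_ARRAY(uint32_t, cmds);
 *    };
 *
 *    static void add_cmd(struct cmd_list *l, uint32_t cmd)
 *    {
 *       APPEND(l, cmds, cmd);   // reallocs l->cmds, bumps l->nr_cmds
 *    }
 */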

#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))


struct fd_device_funcs {
   /* Create a new buffer object:
    */
   struct fd_bo *(*bo_new)(struct fd_device *dev, uint32_t size, uint32_t flags);

   /* Create a new buffer object from existing handle (ie. dma-buf or
    * flink import):
    */
   struct fd_bo *(*bo_from_handle)(struct fd_device *dev, uint32_t size,
                                   uint32_t handle);
   uint32_t (*handle_from_dmabuf)(struct fd_device *dev, int fd);
   struct fd_bo *(*bo_from_dmabuf)(struct fd_device *dev, int fd);
   void (*bo_close_handle)(struct fd_bo *bo);

   struct fd_pipe *(*pipe_new)(struct fd_device *dev, enum fd_pipe_id id,
                               unsigned prio);
   int (*flush)(struct fd_device *dev);
   void (*destroy)(struct fd_device *dev);
};
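
/* Backend sketch (hypothetical names, for illustration only): each kernel
 * backend fills in a static vtable and hands it to the common code, roughly:
 *
 *    static const struct fd_device_funcs example_device_funcs = {
 *       .bo_new = example_bo_new,
 *       .bo_from_handle = example_bo_from_handle,
 *       .pipe_new = example_pipe_new,
 *       .destroy = example_device_destroy,
 *    };
 *
 * The real backends live elsewhere in the tree; the names above are
 * placeholders.
 */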

struct fd_bo_bucket {
   uint32_t size;
   int count, hits, misses, expired;
   struct list_head list;
};

struct fd_bo_cache {
   const char *name;
   simple_mtx_t lock;
   struct fd_bo_bucket cache_bucket[14 * 4];
   int num_buckets;
   time_t time;
};

/* Probably good for the block size to be a multiple of an available
 * large-page size.  For overlap of what both the MMU (with 4kb granule)
 * and SMMU support, 2MB is that overlap.  (Well, 4kb is as well, but
 * too small to be practical ;-))
 */
#define FD_BO_HEAP_BLOCK_SIZE (4 * 1024 * 1024)

/* Zero is an invalid handle, use it to indicate buffers that have been sub-
 * allocated from a larger backing heap block buffer.
 */
#define FD_BO_SUBALLOC_HANDLE 0

static inline bool
suballoc_bo(struct fd_bo *bo)
{
   return bo->handle == FD_BO_SUBALLOC_HANDLE;
}

/**
 * A heap is a virtual range of memory that is backed by N physical buffers,
 * from which buffers can be suballocated.  This requires kernel support for
 * userspace allocated iova.
 */
struct fd_bo_heap {
   struct fd_device *dev;

   int cnt;

   /**
    * Buffer allocation flags for buffers allocated from this heap.
    */
   uint32_t flags;

   simple_mtx_t lock;

   /**
    * Ranges of the backing buffer are allocated at a granularity of
    * SUBALLOC_ALIGNMENT
    */
   struct util_vma_heap heap;

   /**
    * List of recently freed suballocated BOs from this allocator until they
    * become idle.  Backend should periodically call fd_bo_suballoc_clean()
    * to check for newly idle entries on the freelist, so that the memory can
    * be returned to the free heap.
    */
   struct list_head freelist;

   /**
    * The backing buffers.  Maximum total heap size is:
    * FD_BO_HEAP_BLOCK_SIZE * ARRAY_SIZE(heap->blocks)
    */
   struct fd_bo *blocks[256];
};

struct fd_bo_heap *fd_bo_heap_new(struct fd_device *dev, uint32_t flags);
void fd_bo_heap_destroy(struct fd_bo_heap *heap);

struct fd_bo *fd_bo_heap_block(struct fd_bo *bo);
struct fd_bo *fd_bo_heap_alloc(struct fd_bo_heap *heap, uint32_t size, uint32_t flags);

static inline uint32_t
submit_offset(struct fd_bo *bo, uint32_t offset)
{
   if (suballoc_bo(bo)) {
      offset += bo->iova - fd_bo_heap_block(bo)->iova;
   }
   return offset;
}
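
/* Suballocation sketch (illustrative): a suballocated BO carries
 * handle == FD_BO_SUBALLOC_HANDLE and an iova inside one of the heap's
 * backing blocks, so per-submit offsets are rebased onto the backing block:
 *
 *    struct fd_bo *bo = fd_bo_heap_alloc(dev->default_heap, 4096, 0);
 *    uint32_t off = submit_offset(bo, 0x40);
 *    // off == 0x40 + (bo->iova - fd_bo_heap_block(bo)->iova), ie. the
 *    // offset relative to the start of the backing heap block.
 */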

struct fd_device {
   int fd;
   enum fd_version version;
   int32_t refcnt;

   /* tables to keep track of bo's, to avoid "evil-twin" fd_bo objects:
    *
    *   handle_table: maps handle to fd_bo
    *   name_table: maps flink name to fd_bo
    *
    * We end up needing two tables, because DRM_IOCTL_GEM_OPEN always
    * returns a new handle.  So we need to figure out if the bo is already
    * open in the process first, before calling gem-open.
    */
   struct hash_table *handle_table, *name_table;

   const struct fd_device_funcs *funcs;

   struct fd_bo_cache bo_cache;
   struct fd_bo_cache ring_cache;

   /**
    * Heap for mappable + cached-coherent + gpu-readonly (ie. cmdstream)
    */
   struct fd_bo_heap *ring_heap;

   /**
    * Heap for mappable (ie. majority of small buffer allocations, etc)
    */
   struct fd_bo_heap *default_heap;

   bool has_cached_coherent;

   bool closefd; /* call close(fd) upon destruction */

   /* just for valgrind: */
   int bo_size;

   /**
    * List of deferred submits, protected by submit_lock.  The deferred
    * submits are tracked globally per-device, even if they execute in
    * different order on the kernel side (ie. due to different priority
    * submitqueues, etc) to preserve the order that they are passed off
    * to the kernel.  Once the kernel has them, it is the fences' job
    * to preserve correct order of execution.
    */
   struct list_head deferred_submits;
   struct fd_fence *deferred_submits_fence;
   unsigned deferred_cmds;
   simple_mtx_t submit_lock;

   /**
    * BO for suballocating long-lived state objects.
    *
    * Note: one would be tempted to put this in fd_pipe to avoid locking.
    * But that is a bad idea for a couple of reasons:
    *
    *  1) With TC, stateobj allocation can happen in either frontend thread
    *     (ie. most CSOs), and also driver thread (a6xx cached tex state)
    *  2) It is best for fd_pipe to not hold a reference to a BO that can
    *     be free'd to bo cache, as that can cause unexpected re-entrancy
    *     (fd_bo_cache_alloc() -> find_in_bucket() -> fd_bo_state() ->
    *     cleanup_fences() -> drop pipe ref which free's bo's).
    */
   struct fd_bo *suballoc_bo;
   uint32_t suballoc_offset;
   simple_mtx_t suballoc_lock;

   struct util_queue submit_queue;

   struct fd_rd_output rd;
};

static inline bool
fd_device_threaded_submit(struct fd_device *dev)
{
   return util_queue_is_initialized(&dev->submit_queue);
}

#define foreach_submit(name, list)                                             \
   list_for_each_entry(struct fd_submit, name, list, node)
#define foreach_submit_safe(name, list)                                        \
   list_for_each_entry_safe(struct fd_submit, name, list, node)
#define last_submit(list)                                                      \
   list_last_entry(list, struct fd_submit, node)

#define foreach_bo(name, list)                                                 \
   list_for_each_entry(struct fd_bo, name, list, node)
#define foreach_bo_safe(name, list)                                            \
   list_for_each_entry_safe(struct fd_bo, name, list, node)
#define first_bo(list)                                                         \
   list_first_entry(list, struct fd_bo, node)
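
/* Illustrative only: these wrap the util list iterators so callers can walk
 * a submit or bo list without spelling out the entry type, e.g.:
 *
 *    simple_mtx_lock(&dev->submit_lock);
 *    foreach_submit (submit, &dev->deferred_submits) {
 *       // inspect submit->fence, submit->primary, etc.
 *    }
 *    simple_mtx_unlock(&dev->submit_lock);
 *
 * (fd_dev_count_deferred_cmds() below performs a similar walk directly.)
 */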


void fd_bo_cache_init(struct fd_bo_cache *cache, int coarse, const char *name);
void fd_bo_cache_cleanup(struct fd_bo_cache *cache, time_t time);
struct fd_bo *fd_bo_cache_alloc(struct fd_bo_cache *cache, uint32_t *size,
                                uint32_t flags);
int fd_bo_cache_free(struct fd_bo_cache *cache, struct fd_bo *bo);
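
/* Typical flow (sketch, not a contract): allocation first tries to re-use an
 * idle BO from a size bucket; on a miss the caller falls back to a real
 * allocation, and on free the BO is parked in its bucket instead of being
 * closed immediately:
 *
 *    uint32_t size = 0x2000;   // may be rounded up to the bucket size
 *    struct fd_bo *bo = fd_bo_cache_alloc(&dev->bo_cache, &size, flags);
 *    if (!bo)
 *       bo = dev->funcs->bo_new(dev, size, flags);   // cache miss
 *    ...
 *    if (fd_bo_cache_free(&dev->bo_cache, bo) != 0) {
 *       // cache declined the BO, release it for real
 *    }
 */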

/* for where @fence_lock is already held: */
void fd_pipe_del_locked(struct fd_pipe *pipe);

struct fd_pipe_funcs {
   struct fd_ringbuffer *(*ringbuffer_new_object)(struct fd_pipe *pipe,
                                                  uint32_t size);
   struct fd_submit *(*submit_new)(struct fd_pipe *pipe);

   /**
    * Flush any deferred submits (if deferred submits are supported by
    * the pipe implementation)
    */
   void (*flush)(struct fd_pipe *pipe, uint32_t fence);
   void (*finish)(struct fd_pipe *pipe);

   int (*get_param)(struct fd_pipe *pipe, enum fd_param_id param,
                    uint64_t *value);
   int (*set_param)(struct fd_pipe *pipe, enum fd_param_id param,
                    uint64_t value);
   int (*wait)(struct fd_pipe *pipe, const struct fd_fence *fence,
               uint64_t timeout);
   void (*destroy)(struct fd_pipe *pipe);
};

struct fd_pipe_control {
   uint32_t fence;
};
#define control_ptr(pipe, member)                                              \
   (pipe)->control_mem, offsetof(struct fd_pipe_control, member), 0, 0
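
/* control_ptr() expands into the (bo, offset, or-value, shift) argument group
 * that the reloc-emitting helpers take; for example control_ptr(pipe, fence)
 * becomes:
 *
 *    (pipe)->control_mem, offsetof(struct fd_pipe_control, fence), 0, 0
 *
 * ie. a location inside the pipe's shared control buffer.
 */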

struct fd_pipe {
   struct fd_device *dev;
   enum fd_pipe_id id;
   struct fd_dev_id dev_id;

   /**
    * Note refcnt is *not* atomic, but protected by fence_lock, since the
    * fence_lock is held in fd_bo_add_fence(), which is the hotpath.
    */
   int32_t refcnt;

   /**
    * Previous fence seqno allocated for this pipe.  The fd_pipe represents
    * a single timeline, fences allocated by this pipe can be compared to
    * each other, but fences from different pipes are not comparable (as
    * there could be preemption of multiple priority level submitqueues at
    * play)
    */
   uint32_t last_fence;

   /**
    * The last fence seqno that was flushed to kernel (doesn't mean that it
    * is complete, just that the kernel knows about it)
    */
   uint32_t last_submit_fence;

   uint32_t last_enqueue_fence; /* just for debugging */

   /**
    * Counter for assigning each submit a unique seqno.
    */
   seqno_t submit_seqno;

   /**
    * If we *ever* see an in-fence-fd, assume that userspace is
    * not relying on implicit fences.
    */
   bool no_implicit_sync;

   bool is_64bit;

   struct fd_bo *control_mem;
   volatile struct fd_pipe_control *control;

   struct slab_parent_pool ring_pool;

   const struct fd_pipe_funcs *funcs;
};
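
/* Timeline sketch (illustrative assumption): per-pipe fence seqnos form a
 * single monotonically increasing timeline, and the most recently completed
 * seqno is visible in the shared control page, so a completion check can be
 * a simple comparison against pipe->control->fence:
 *
 *    static inline bool example_fence_completed(struct fd_pipe *pipe,
 *                                               uint32_t fence)
 *    {
 *       return fence <= pipe->control->fence;   // ignores seqno wraparound
 *    }
 *
 * example_fence_completed() is a hypothetical helper, not part of this
 * header; the real check lives in the fence/backend code and must also
 * consider wraparound.
 */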

uint32_t fd_pipe_emit_fence(struct fd_pipe *pipe, struct fd_ringbuffer *ring);

static inline void
fd_pipe_flush(struct fd_pipe *pipe, uint32_t fence)
{
   if (!pipe->funcs->flush)
      return;
   pipe->funcs->flush(pipe, fence);
}

struct fd_submit_funcs {
   struct fd_ringbuffer *(*new_ringbuffer)(struct fd_submit *submit,
                                           uint32_t size,
                                           enum fd_ringbuffer_flags flags);
   struct fd_fence *(*flush)(struct fd_submit *submit, int in_fence_fd,
                             bool use_fence_fd);
   void (*destroy)(struct fd_submit *submit);
};

struct fd_submit {
   int32_t refcnt;
   struct fd_pipe *pipe;
   struct fd_device *dev;
   const struct fd_submit_funcs *funcs;

   struct fd_ringbuffer *primary;
   uint32_t fence;
   struct list_head node; /* node in fd_pipe::deferred_submits */
};

static inline unsigned
fd_dev_count_deferred_cmds(struct fd_device *dev)
{
   unsigned nr = 0;

   simple_mtx_assert_locked(&dev->submit_lock);

   list_for_each_entry (struct fd_submit, submit, &dev->deferred_submits, node) {
      nr += fd_ringbuffer_cmd_count(submit->primary);
   }

   return nr;
}

struct fd_bo_funcs {
   int (*offset)(struct fd_bo *bo, uint64_t *offset);
   void *(*map)(struct fd_bo *bo);
   int (*cpu_prep)(struct fd_bo *bo, struct fd_pipe *pipe, uint32_t op);
   int (*madvise)(struct fd_bo *bo, int willneed);
   uint64_t (*iova)(struct fd_bo *bo);
   void (*set_name)(struct fd_bo *bo, const char *fmt, va_list ap);
   int (*dmabuf)(struct fd_bo *bo);

   /**
    * Optional hook that is called before ->destroy().  In the case of
    * batch deletes (such as BO cache cleanup or cleaning up a submit)
    * the ->finalize() hook will be called for all of the BOs being
    * destroyed followed by dev->flush() and then bo->destroy().  This
    * allows the backend to batch up processing.  (Ie. this is for
    * virtio backend to batch ccmds to the host)
    *
    * In all cases, dev->flush() will happen after bo->finalize() and
    * bo->destroy().
    */
   void (*finalize)(struct fd_bo *bo);
   void (*destroy)(struct fd_bo *bo);

   /**
    * Optional, copy data into bo, falls back to mmap+memcpy.  If not
    * implemented, it must be possible to mmap all buffers
    */
   void (*upload)(struct fd_bo *bo, void *src, unsigned off, unsigned len);

   /**
    * Optional, if upload is supported, should upload be preferred?
    */
   bool (*prefer_upload)(struct fd_bo *bo, unsigned len);

   void (*set_metadata)(struct fd_bo *bo, void *metadata, uint32_t metadata_size);
   int (*get_metadata)(struct fd_bo *bo, void *metadata, uint32_t metadata_size);
};
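
/* Upload path sketch (illustrative): ->upload and ->prefer_upload are
 * optional, so a caller copying data into a BO might do something along
 * these lines:
 *
 *    if (bo->funcs->upload &&
 *        (!bo->funcs->prefer_upload || bo->funcs->prefer_upload(bo, len))) {
 *       bo->funcs->upload(bo, src, off, len);
 *    } else {
 *       memcpy((uint8_t *)fd_bo_map(bo) + off, src, len);   // mmap fallback
 *    }
 *
 * The actual policy lives in the common bo code (see fd_bo_upload()); this
 * only shows how the two optional hooks relate.
 */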

void fd_bo_add_fence(struct fd_bo *bo, struct fd_fence *fence);
void *fd_bo_map_os_mmap(struct fd_bo *bo);
void *__fd_bo_map(struct fd_bo *bo);

enum fd_bo_state {
   FD_BO_STATE_IDLE,
   FD_BO_STATE_BUSY,
   FD_BO_STATE_UNKNOWN,
};
enum fd_bo_state fd_bo_state(struct fd_bo *bo);

void fd_bo_init_common(struct fd_bo *bo, struct fd_device *dev);
void fd_bo_fini_fences(struct fd_bo *bo);
void fd_bo_fini_common(struct fd_bo *bo);

struct fd_bo *fd_bo_new_ring(struct fd_device *dev, uint32_t size);

uint32_t fd_handle_from_dmabuf_drm(struct fd_device *dev, int fd);
struct fd_bo *fd_bo_from_dmabuf_drm(struct fd_device *dev, int fd);
void fd_bo_close_handle_drm(struct fd_bo *bo);

#define enable_debug 0 /* TODO make dynamic */

bool fd_dbg(void);

#define INFO_MSG(fmt, ...)                                                     \
   do {                                                                        \
      if (fd_dbg())                                                            \
         mesa_logi("%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__);          \
   } while (0)
#define DEBUG_MSG(fmt, ...)                                                    \
   do                                                                          \
      if (enable_debug) {                                                      \
         mesa_logd("%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__);          \
      }                                                                        \
   while (0)
#define WARN_MSG(fmt, ...)                                                     \
   do {                                                                        \
      mesa_logw("%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__);             \
   } while (0)
#define ERROR_MSG(fmt, ...)                                                    \
   do {                                                                        \
      mesa_loge("%s:%d: " fmt, __func__, __LINE__, ##__VA_ARGS__);             \
   } while (0)
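
/* Usage is printf-style, e.g. (illustrative):
 *
 *    ERROR_MSG("mmap failed: %s", strerror(errno));
 *
 * which logs through mesa_loge() with the calling function and line number
 * prepended.
 */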

#define U642VOID(x) ((void *)(unsigned long)(x))
#define VOID2U64(x) ((uint64_t)(unsigned long)(x))

#ifdef HAVE_VALGRIND
#include <memcheck.h>

/*
 * For tracking the backing memory (if valgrind enabled, we force a mmap
 * for the purposes of tracking)
 */
static inline void
VG_BO_ALLOC(struct fd_bo *bo)
{
   if (bo && RUNNING_ON_VALGRIND) {
      VALGRIND_MALLOCLIKE_BLOCK(fd_bo_map(bo), bo->size, 0, 1);
   }
}

static inline void
VG_BO_FREE(struct fd_bo *bo)
{
   VALGRIND_FREELIKE_BLOCK(bo->map, 0);
}

/*
 * For tracking bo structs that are in the buffer-cache, so that valgrind
 * doesn't attribute ownership to the first one to allocate the recycled
 * bo.
 *
 * Note that the list_head in fd_bo is used to track the buffers in cache
 * so disable error reporting on the range while they are in cache so
 * valgrind doesn't squawk about list traversal.
 *
 */
static inline void
VG_BO_RELEASE(struct fd_bo *bo)
{
   if (RUNNING_ON_VALGRIND) {
      VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(bo, bo->dev->bo_size);
      VALGRIND_MAKE_MEM_NOACCESS(bo, bo->dev->bo_size);
      VALGRIND_FREELIKE_BLOCK(bo->map, 0);
   }
}
static inline void
VG_BO_OBTAIN(struct fd_bo *bo)
{
   if (RUNNING_ON_VALGRIND) {
      VALGRIND_MAKE_MEM_DEFINED(bo, bo->dev->bo_size);
      VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(bo, bo->dev->bo_size);
      VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1);
   }
}
/* special case for fd_bo_upload */
static inline void
VG_BO_MAPPED(struct fd_bo *bo)
{
   VALGRIND_MALLOCLIKE_BLOCK(bo->map, bo->size, 0, 1);
}
#else
static inline void
VG_BO_ALLOC(struct fd_bo *bo)
{
}
static inline void
VG_BO_FREE(struct fd_bo *bo)
{
}
static inline void
VG_BO_RELEASE(struct fd_bo *bo)
{
}
static inline void
VG_BO_OBTAIN(struct fd_bo *bo)
{
}
static inline void
VG_BO_MAPPED(struct fd_bo *bo)
{
}
#endif

#define FD_DEFINE_CAST(parent, child)                                          \
   static inline struct child *to_##child(struct parent *x)                    \
   {                                                                           \
      return (struct child *)x;                                                \
   }
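
/* Backends use this to recover their derived type from the base pointer, for
 * example (hypothetical names):
 *
 *    struct example_bo {
 *       struct fd_bo base;
 *       uint32_t extra;
 *    };
 *    FD_DEFINE_CAST(fd_bo, example_bo);
 *
 *    // then: struct example_bo *ebo = to_example_bo(bo);
 *
 * This relies on the derived struct embedding the base struct as its first
 * member.
 */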

#endif /* FREEDRENO_PRIV_H_ */