/*
 * Copyright © 2009 Corbin Simpson
 * Copyright © 2015 Advanced Micro Devices, Inc.
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef AMDGPU_WINSYS_H
#define AMDGPU_WINSYS_H

#include "pipebuffer/pb_cache.h"
#include "pipebuffer/pb_slab.h"
#include "winsys/radeon_winsys.h"
#include "util/simple_mtx.h"
#include "util/u_queue.h"
#include <amdgpu.h>

struct amdgpu_cs;

/* DRM file descriptors, file descriptions and buffer sharing.
 *
 * amdgpu_device_initialize() creates one amdgpu_device_handle per GPU.
 * It does this by getting the sysfs path (e.g. /dev/dri/cardxx) for the fd.
 * It uses the sysfs path to return the amdgpu_device_handle if one was
 * already created, or to create a new one.
 *
 * Thus the amdgpu_device_handle's fd will be the one from the first time
 * the GPU was initialized by amdgpu_device_initialize().
 *
 * KMS/GEM buffer handles are specific to a DRM file description, i.e. the
 * same handle value may refer to different underlying BOs in different
 * DRM file descriptions, even for the same GPU. The diagram at
 * https://en.wikipedia.org/wiki/File:File_table_and_inode_table.svg shows
 * file descriptors and their relation to file descriptions in the file table.
 *
 * Two fds are considered different if each was obtained with a separate
 * open() call. Fds that are duplicates of an open fd (created with dup()
 * or fcntl(F_DUPFD)) all compare as equal with os_same_file_description(),
 * which uses the kcmp system call.
 *
 * amdgpu_screen_winsys's fd tracks the file description that was
 * given to amdgpu_winsys_create(). This is the fd used by the application
 * using the driver and may be used in other ioctls (e.g. drmModeAddFB).
 *
 * amdgpu_winsys's fd is the file description used to initialize the
 * device handle in libdrm_amdgpu.
 *
 * The two fds can be different, even on systems with a single GPU, e.g. if
 * radv is initialized before radeonsi.
 *
 * This fd tracking is useful for buffer sharing. As an example, if an app
 * wants to use drmModeAddFB, it needs a KMS handle valid for its
 * fd (== amdgpu_screen_winsys::fd). If both fds are identical, there's
 * nothing to do: bo->u.real.kms_handle can be used directly
 * (see amdgpu_bo_get_handle). If they're different, the BO has to be exported
 * from the device fd as a dma-buf, then imported to the app fd to get the
 * KMS handle of the buffer for that app fd (see the sketch after this
 * comment).
 *
 * Examples:
 * 1) OpenGL, then VAAPI:
 *    OpenGL                             | VAAPI (same device, != file description)
 *    -----------------------------------│-----------------------------------------
 *    fd = 5 (/dev/dri/renderD128)       │fd = 9 (/dev/dri/renderD128')
 *          │                            │       │
 *     device_handle = 0xffff0250        │ device_handle = 0xffff0250 (fd=5, re-used)
 *          │                            │       │
 *    amdgpu_screen_winsys = 0xffff0120  │amdgpu_winsys = 0xffff0470  ◄─────────────┐
 *          │   ├─ fd = dup(5) = 6       │       │   └─ sws_list = 0xffff0120       │
 *          │   └─ aws = 0xffff0470 ◄──┐ │       │                 0xffff0640 ◄───┐ │
 *          │                          │ │amdgpu_screen_winsys = 0xffff0640 ──────┘ │
 *    amdgpu_winsys = 0xffff0470    ───┘ │           └─ fd = dup(9) = 10            │
 *          │   ├─ dev = 0xffff0250      │                                          │
 *          │   ├─ sws_list = 0xffff0120 │                                          │
 *          │   └─ fd = 6                │                                          │
 *    dev_tab(0xffff0250) = 0xffff0470 ──│──────────────────────────────────────────┘
 *
 * 2) Vulkan (fd=5), then OpenGL (same device, != file description):
 *    -----------------------------
 *    fd = 9 (/dev/dri/renderD128)
 *           │
 *     device_handle = 0xffff0250 (fd=5, re-used)
 *           │
 *    amdgpu_screen_winsys = 0xffff0740
 *           │   ├─ fd = dup(9) = 10
 *           │   └─ aws = 0xffff0940 ◄───┐
 *    amdgpu_winsys = 0xffff0940 ────────┘
 *           │   ├─ dev = 0xffff0250
 *           │   ├─ sws_list = 0xffff0740
 *           │   └─ fd = 5
 *    dev_tab(0xffff0250) = 0xffff0940
 */
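
/* A minimal sketch of the export/import dance described above, for the case
 * where the two fds are different file descriptions. It assumes libdrm's
 * drmPrimeHandleToFD()/drmPrimeFDToHandle(); the function and variable names
 * are hypothetical and not part of this winsys:
 *
 *    #include <unistd.h>
 *    #include <xf86drm.h>
 *
 *    static int
 *    get_kms_handle_for_app_fd(int device_fd, uint32_t device_kms_handle,
 *                              int app_fd, uint32_t *app_kms_handle)
 *    {
 *       int dmabuf_fd, r;
 *
 *       // Export the BO from the device fd as a dma-buf...
 *       if (drmPrimeHandleToFD(device_fd, device_kms_handle, DRM_CLOEXEC,
 *                              &dmabuf_fd))
 *          return -1;
 *
 *       // ...then import it into the app's fd to get a KMS handle that is
 *       // valid for that file description.
 *       r = drmPrimeFDToHandle(app_fd, dmabuf_fd, app_kms_handle);
 *       close(dmabuf_fd);
 *       return r;
 *    }
 *
 * Compare amdgpu_screen_winsys::kms_handles below, which maps BOs to KMS
 * handles per DRM file descriptor so this conversion isn't redone every time.
 */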

/* One struct amdgpu_screen_winsys is created by amdgpu_winsys_create() for
 * one fd. For fds that are the same (see the description above for when fds
 * are considered the same), the already-created amdgpu_screen_winsys is
 * returned.
 */
struct amdgpu_screen_winsys {
   struct radeon_winsys base;
   struct amdgpu_winsys *aws;
   /* See comment above */
   int fd;
   struct pipe_reference reference;
   struct amdgpu_screen_winsys *next;

   /* Maps a BO to its KMS handle valid for this DRM file descriptor.
    * Protected by amdgpu_winsys::sws_list_lock.
    */
   struct hash_table *kms_handles;
};

/* At most this many IBs can be busy per queue. When a new IB is submitted and the oldest IB
 * ("AMDGPU_FENCE_RING_SIZE" IBs ago) is still busy, the CS thread will wait for it and will
 * also block all queues from submitting new IBs.
 */
#define AMDGPU_FENCE_RING_SIZE 32

/* The maximum number of queues that can be present. */
#define AMDGPU_MAX_QUEUES 6

/* This could use any integer type because the logic handles integer wraparound robustly.
 * However, uint8_t wraps around so quickly that some BOs might never become idle: because we
 * don't remove idle fences from BOs, they would become "busy" again after a queue sequence
 * number wraps around, and they might stay "busy" in pb_cache long enough that we run out of
 * memory.
 */
typedef uint16_t uint_seq_no;

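/* A worked example of the wraparound behavior this refers to, using the
 * in-ring test "latest_seq_no - buffer_seq_no < AMDGPU_FENCE_RING_SIZE"
 * documented with struct amdgpu_queue below: with uint16_t, if latest_seq_no
 * has wrapped around to 5 and buffer_seq_no is 65530, the unsigned
 * subtraction yields (uint_seq_no)(5 - 65530) = 11 < 32, so a BO that is
 * 11 submissions old is correctly treated as potentially busy. With uint8_t,
 * any BO that is 256..287 submissions old would alias into the ring the same
 * way and incorrectly be treated as busy again.
 */
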
struct amdgpu_queue {
   /* Ring buffer of fences.
    *
    * We only remember a certain number of the most recent fences per queue. When we add a new
    * fence, we wait for the oldest one, which implies that all older fences not present
    * in the ring are idle. This way we don't have to keep track of a million fence references
    * for a million BOs.
    *
    * We only support 1 queue per IP. If an IP has multiple queues, we always add a fence
    * dependency on the previous fence to make it behave like there is only 1 queue.
    *
    * amdgpu_winsys_bo doesn't have a list of fences. It only remembers the last sequence number
    * for every queue where it was used. We then use the BO's sequence number to look up a fence
    * in this ring (see the sketch after this struct).
    */
   struct pipe_fence_handle *fences[AMDGPU_FENCE_RING_SIZE];

   /* The sequence number of the latest fence.
    *
    * This sequence number is global per queue per device, shared by all contexts, and generated
    * by the winsys, not the kernel.
    *
    * The latest fence is: fences[latest_seq_no % AMDGPU_FENCE_RING_SIZE]
    * The oldest fence is: fences[(latest_seq_no + 1) % AMDGPU_FENCE_RING_SIZE]
    * The oldest sequence number in the ring: latest_seq_no - AMDGPU_FENCE_RING_SIZE + 1
    *
    * The sequence number is in the ring if:
    *    latest_seq_no - buffer_seq_no < AMDGPU_FENCE_RING_SIZE
    * If the sequence number is not in the ring, the fence is idle.
    *
    * Integer wraparound of the sequence number behaves as follows:
    *
    * The comparison above gives the correct answer if buffer_seq_no isn't older than UINT*_MAX.
    * If it's older than UINT*_MAX but not older than UINT*_MAX + AMDGPU_FENCE_RING_SIZE, we
    * incorrectly pick and wait for one of the fences in the ring. That's only a problem when
    * the type is so small (uint8_t) that seq_no wraps around very frequently, causing BOs to
    * never become idle in certain very unlucky scenarios, which makes us run out of memory.
    */
   uint_seq_no latest_seq_no;

   /* The last context using this queue. */
   struct amdgpu_ctx *last_ctx;
};
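
/* A minimal sketch (hypothetical helper, not part of this winsys) of how a
 * BO's per-queue sequence number is turned into a fence, including the
 * wraparound-safe busy check described above:
 *
 *    static struct pipe_fence_handle *
 *    get_bo_fence(struct amdgpu_queue *queue, uint_seq_no buffer_seq_no)
 *    {
 *       // Unsigned subtraction is wraparound-safe as long as the BO's
 *       // sequence number isn't more than UINT16_MAX submissions old.
 *       uint_seq_no age = queue->latest_seq_no - buffer_seq_no;
 *
 *       if (age >= AMDGPU_FENCE_RING_SIZE)
 *          return NULL; // not in the ring -> known idle
 *
 *       // The ring index is derived from the sequence number itself.
 *       return queue->fences[buffer_seq_no % AMDGPU_FENCE_RING_SIZE];
 *    }
 */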

/* This is part of every BO. */
struct amdgpu_seq_no_fences {
   /* A fence sequence number per queue. This number is used to look up the fence from
    * struct amdgpu_queue.
    *
    * This sequence number is global per queue per device, shared by all contexts, and generated
    * by the winsys, not the kernel.
    */
   uint_seq_no seq_no[AMDGPU_MAX_QUEUES];

   /* The mask of queues where seq_no[i] is valid. */
   uint8_t valid_fence_mask;
};

/* valid_fence_mask should have 1 bit for each queue. */
static_assert(sizeof(((struct amdgpu_seq_no_fences*)NULL)->valid_fence_mask) * 8 >= AMDGPU_MAX_QUEUES, "");

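/* A minimal sketch of how valid_fence_mask can be consumed, assuming the
 * u_foreach_bit() helper from util/bitscan.h; the function name is
 * hypothetical, and callers would hold amdgpu_winsys::bo_fence_lock:
 *
 *    static bool
 *    bo_may_be_busy(struct amdgpu_winsys *aws, struct amdgpu_seq_no_fences *fences)
 *    {
 *       // Visit only the queues whose sequence number is valid.
 *       u_foreach_bit(i, fences->valid_fence_mask) {
 *          struct amdgpu_queue *queue = &aws->queues[i];
 *          uint_seq_no age = queue->latest_seq_no - fences->seq_no[i];
 *
 *          // Still in the ring -> the fence may be pending; the real code
 *          // would additionally query the fence itself.
 *          if (age < AMDGPU_FENCE_RING_SIZE)
 *             return true;
 *       }
 *       return false;
 *    }
 */
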
/* One struct amdgpu_winsys is created per GPU in amdgpu_winsys_create(). */
struct amdgpu_winsys {
   struct pipe_reference reference;
   /* See comment above */
   int fd;

   /* Protected by bo_fence_lock. */
   struct amdgpu_queue queues[AMDGPU_MAX_QUEUES];

   struct pb_cache bo_cache;
   struct pb_slabs bo_slabs;  /* Slab allocator. */

   amdgpu_device_handle dev;

   simple_mtx_t bo_fence_lock;

   int num_cs; /* The number of command streams created. */
   uint32_t surf_index_color;
   uint32_t surf_index_fmask;
   uint32_t next_bo_unique_id;
   uint64_t allocated_vram;
   uint64_t allocated_gtt;
   uint64_t mapped_vram;
   uint64_t mapped_gtt;
   uint64_t slab_wasted_vram;
   uint64_t slab_wasted_gtt;
   uint64_t buffer_wait_time; /* Time spent in buffer_wait, in ns. */
   uint64_t num_gfx_IBs;
   uint64_t num_sdma_IBs;
   uint64_t num_mapped_buffers;
   uint64_t gfx_bo_list_counter;
   uint64_t gfx_ib_size_counter;

   struct radeon_info info;

   /* Multithreaded IB submission. */
   struct util_queue cs_queue;

   struct ac_addrlib *addrlib;

   bool check_vm;
   bool noop_cs;
   bool reserve_vmid;
   bool zero_all_vram_allocs;
#if MESA_DEBUG
   bool debug_all_bos;

   /* List of all allocated buffers. */
   simple_mtx_t global_bo_list_lock;
   struct list_head global_bo_list;
   unsigned num_buffers;
#endif

   /* Singly-linked list of all structs amdgpu_screen_winsys referencing this
    * struct amdgpu_winsys.
    */
   simple_mtx_t sws_list_lock;
   struct amdgpu_screen_winsys *sws_list;

   /* For returning the same amdgpu_winsys_bo instance for exported
    * and re-imported buffers. */
   struct hash_table *bo_export_table;
   simple_mtx_t bo_export_table_lock;

   /* Since most winsys functions take a struct radeon_winsys *, dummy_sws.base is used
    * for invoking them, because sws_list can be NULL.
    */
   struct amdgpu_screen_winsys dummy_sws;
};

static inline struct amdgpu_screen_winsys *
amdgpu_screen_winsys(struct radeon_winsys *base)
{
   return (struct amdgpu_screen_winsys*)base;
}

static inline struct amdgpu_winsys *
amdgpu_winsys(struct radeon_winsys *base)
{
   return amdgpu_screen_winsys(base)->aws;
}

void amdgpu_surface_init_functions(struct amdgpu_screen_winsys *sws);

#endif