/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_DEVICE_H
#define TU_DEVICE_H

#include "tu_common.h"

#include "vk_device_memory.h"

#include "tu_autotune.h"
#include "tu_pass.h"
#include "tu_perfetto.h"
#include "tu_suballoc.h"
#include "tu_util.h"

#include "common/freedreno_rd_output.h"
#include "util/vma.h"
#include "util/u_vector.h"

/* queue types */
#define TU_QUEUE_GENERAL 0

#define TU_MAX_QUEUE_FAMILIES 1

#define TU_BORDER_COLOR_COUNT 4096
#define TU_BORDER_COLOR_BUILTIN 6

#define TU_BLIT_SHADER_SIZE 4096

/* extra space in vsc draw/prim streams */
#define VSC_PAD 0x40

enum global_shader {
   GLOBAL_SH_VS_BLIT,
   GLOBAL_SH_VS_CLEAR,
   GLOBAL_SH_FS_BLIT,
   GLOBAL_SH_FS_BLIT_ZSCALE,
   GLOBAL_SH_FS_COPY_MS,
   GLOBAL_SH_FS_COPY_MS_HALF,
   GLOBAL_SH_FS_CLEAR0,
   GLOBAL_SH_FS_CLEAR_MAX = GLOBAL_SH_FS_CLEAR0 + MAX_RTS,
   GLOBAL_SH_COUNT,
};

struct tu_memory_heap {
   /* Standard bits passed on to the client */
   VkDeviceSize      size;
   VkMemoryHeapFlags flags;

   /** Copied from ANV:
    *
    * Driver-internal book-keeping.
    *
    * Align it to 64 bits to make atomic operations faster on 32 bit platforms.
    */
   alignas(8) VkDeviceSize used;
};
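
/* Illustrative sketch of the book-keeping the 8-byte alignment above is for
 * (hypothetical; the real accounting lives in the memory allocation path and
 * may differ):
 *
 *    uint64_t used = p_atomic_add_return(&heap->used, size);
 *    if (used > heap->size) {
 *       p_atomic_add(&heap->used, -(int64_t) size);
 *       return VK_ERROR_OUT_OF_DEVICE_MEMORY;
 *    }
 */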

enum tu_kgsl_dma_type
{
   TU_KGSL_DMA_TYPE_ION_LEGACY,
   TU_KGSL_DMA_TYPE_ION,
   TU_KGSL_DMA_TYPE_DMAHEAP,
};

extern uint64_t os_page_size;

struct tu_physical_device
{
   struct vk_physical_device vk;

   struct tu_instance *instance;

   const char *name;
   uint8_t driver_uuid[VK_UUID_SIZE];
   uint8_t device_uuid[VK_UUID_SIZE];
   uint8_t cache_uuid[VK_UUID_SIZE];

   struct wsi_device wsi_device;

   char fd_path[20];
   int local_fd;
   bool has_local;
   int64_t local_major;
   int64_t local_minor;
   int master_fd;
   bool has_master;
   int64_t master_major;
   int64_t master_minor;

   int kgsl_dma_fd;
   enum tu_kgsl_dma_type kgsl_dma_type;

   uint32_t gmem_size;
   uint64_t gmem_base;

   uint32_t usable_gmem_size_gmem;
   uint32_t ccu_offset_gmem;
   uint32_t ccu_offset_bypass;
   uint32_t ccu_depth_offset_bypass;
   uint32_t vpc_attr_buf_offset_gmem;
   uint32_t vpc_attr_buf_size_gmem;
   uint32_t vpc_attr_buf_offset_bypass;
   uint32_t vpc_attr_buf_size_bypass;

   /* Number of usable descriptor sets; this excludes any reserved set */
   uint32_t usable_sets;
   /* Index of the reserved descriptor set, may be -1 if unset */
   int32_t reserved_set_idx;

   bool has_set_iova;
   uint64_t va_start;
   uint64_t va_size;

   bool has_cached_coherent_memory;
   bool has_cached_non_coherent_memory;
   uintptr_t level1_dcache_size;

   struct {
      uint32_t type_count;
      VkMemoryPropertyFlags types[VK_MAX_MEMORY_TYPES];
   } memory;

   struct fd_dev_id dev_id;
   struct fd_dev_info dev_info;
   const struct fd_dev_info *info;

   int msm_major_version;
   int msm_minor_version;

   /* with 0 being the highest priority */
   uint32_t submitqueue_priority_count;

   struct tu_memory_heap heap;

   struct vk_sync_type syncobj_type;
   struct vk_sync_timeline_type timeline_type;
   const struct vk_sync_type *sync_types[3];

   uint32_t device_count;
};
VK_DEFINE_HANDLE_CASTS(tu_physical_device, vk.base, VkPhysicalDevice,
                       VK_OBJECT_TYPE_PHYSICAL_DEVICE)

struct tu_knl;

struct tu_instance
{
   struct vk_instance vk;

   const struct tu_knl *knl;

   uint32_t instance_idx;
   uint32_t api_version;

   struct driOptionCache dri_options;
   struct driOptionCache available_dri_options;

   bool dont_care_as_load;

   /* Conservative LRZ (default true) invalidates LRZ on draws with
    * blend and depth-write enabled, because this can lead to incorrect
    * rendering.  Driconf can be used to disable conservative LRZ for
    * games which do not have the problematic sequence of draws *and*
    * suffer a performance loss with conservative LRZ.
    */
   bool conservative_lrz;

   /* Whether to internally reserve a descriptor set for descriptor set
    * dynamic offsets. Disabling this frees a descriptor set for the
    * application, at the cost of being unable to use the feature. As the
    * feature is part of core Vulkan, it is enabled by default.
    */
   bool reserve_descriptor_set;

   /* Allow out-of-bounds UBO access by disabling the lowering of UBO loads
    * with indirect access, which relies on the UBO bounds specified in the
    * shader rather than the bound UBO size, which isn't known until draw
    * time.
    *
    * See: https://github.com/doitsujin/dxvk/issues/3861
    */
   bool allow_oob_indirect_ubo_loads;

   /* DXVK and VKD3D-Proton use customBorderColorWithoutFormat and create
    * most D24S8 images with USAGE_SAMPLED, in which case we would have to
    * disable UBWC for correctness. However, games don't use border colors
    * with depth-stencil images, so we elect to ignore this edge case and
    * force UBWC to stay enabled.
    */
   bool disable_d24s8_border_color_workaround;
};
VK_DEFINE_HANDLE_CASTS(tu_instance, vk.base, VkInstance,
                       VK_OBJECT_TYPE_INSTANCE)

struct tu_queue
{
   struct vk_queue vk;

   struct tu_device *device;

   uint32_t msm_queue_id;
   uint32_t priority;

   int fence;           /* timestamp/fence of the last queue submission */
};
VK_DEFINE_HANDLE_CASTS(tu_queue, vk.base, VkQueue, VK_OBJECT_TYPE_QUEUE)

/* This struct defines the layout of the global_bo */
struct tu6_global
{
   /* clear/blit shaders */
   uint32_t shaders[TU_BLIT_SHADER_SIZE];

   uint32_t seqno_dummy;          /* dummy seqno for CP_EVENT_WRITE */
   uint32_t _pad0;
   volatile uint32_t vsc_draw_overflow;
   uint32_t _pad1;
   volatile uint32_t vsc_prim_overflow;
   uint32_t _pad2;
   uint64_t predicate;

   /* scratch space for VPC_SO[i].FLUSH_BASE_LO/HI, start on 32 byte boundary. */
   struct {
      uint32_t offset;
      uint32_t pad[7];
   } flush_base[4];

   alignas(16) uint32_t cs_indirect_xyz[12];

   volatile uint32_t vtx_stats_query_not_running;

   /* To know when renderpass stats for autotune are valid */
   volatile uint32_t autotune_fence;

   /* For recycling command buffers for dynamic suspend/resume commands */
   volatile uint32_t dynamic_rendering_fence;

   volatile uint32_t dbg_one;
   volatile uint32_t dbg_gmem_total_loads;
   volatile uint32_t dbg_gmem_taken_loads;
   volatile uint32_t dbg_gmem_total_stores;
   volatile uint32_t dbg_gmem_taken_stores;

   /* Written from GPU */
   volatile uint32_t breadcrumb_gpu_sync_seqno;
   uint32_t _pad3;
   /* Written from CPU, acknowledges value written from GPU */
   volatile uint32_t breadcrumb_cpu_sync_seqno;
   uint32_t _pad4;

   volatile uint32_t userspace_fence;
   uint32_t _pad5;

   /* note: larger global bo will be used for customBorderColors */
   struct bcolor_entry bcolor_builtin[TU_BORDER_COLOR_BUILTIN], bcolor[];
};
#define gb_offset(member) offsetof(struct tu6_global, member)
#define global_iova(cmd, member) ((cmd)->device->global_bo->iova + gb_offset(member))
#define global_iova_arr(cmd, member, idx)                                    \
   (global_iova(cmd, member) + sizeof_field(struct tu6_global, member[0]) * (idx))
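
/* Example (illustrative): global_iova_arr(cmd, flush_base, i) yields the GPU
 * address of flush_base[i] within the global BO, i.e. the BO's iova plus
 * gb_offset(flush_base) plus i * sizeof(flush_base[0]).
 */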

struct tu_pvtmem_bo {
      mtx_t mtx;
      struct tu_bo *bo;
      uint32_t per_fiber_size, per_sp_size;
};

struct tu_virtio_device;

struct tu_device
{
   struct vk_device vk;
   struct tu_instance *instance;

   struct tu_queue *queues[TU_MAX_QUEUE_FAMILIES];
   int queue_count[TU_MAX_QUEUE_FAMILIES];

   struct tu_physical_device *physical_device;
   uint32_t device_idx;
   int fd;

   struct ir3_compiler *compiler;

   /* Backup in-memory cache to be used if the app doesn't provide one */
   struct vk_pipeline_cache *mem_cache;

#define MIN_SCRATCH_BO_SIZE_LOG2 12 /* A page */

   /* Currently the kernel driver uses a 32-bit GPU address space, but it
    * should be impossible to go beyond 48 bits.
    */
   struct {
      struct tu_bo *bo;
      mtx_t construct_mtx;
      bool initialized;
   } scratch_bos[48 - MIN_SCRATCH_BO_SIZE_LOG2];

   struct tu_pvtmem_bo fiber_pvtmem_bo, wave_pvtmem_bo;

   struct tu_bo *global_bo;
   struct tu6_global *global_bo_map;

   uint32_t implicit_sync_bo_count;

   /* Device-global BO suballocator for reducing BO management overhead for
    * (read-only) pipeline state.  Synchronized by pipeline_mutex.
    */
   struct tu_suballocator pipeline_suballoc;
   mtx_t pipeline_mutex;

   /* Device-global BO suballocator for reducing BO management for small
    * gmem/sysmem autotune result buffers.  Synchronized by autotune_mutex.
    */
   struct tu_suballocator autotune_suballoc;
   mtx_t autotune_mutex;

   /* KGSL requires a small chunk of GPU mem to retrieve raw GPU time on
    * each submission.
    */
   struct tu_suballocator kgsl_profiling_suballoc;
   mtx_t kgsl_profiling_mutex;

   /* the blob seems to always use 8K factor and 128K param sizes, copy them */
#define TU_TESS_FACTOR_SIZE (8 * 1024)
#define TU_TESS_PARAM_SIZE (128 * 1024)
#define TU_TESS_BO_SIZE (TU_TESS_FACTOR_SIZE + TU_TESS_PARAM_SIZE)
   /* Lazily allocated, protected by the device mutex. */
   struct tu_bo *tess_bo;

   struct ir3_shader_variant *global_shader_variants[GLOBAL_SH_COUNT];
   struct ir3_shader *global_shaders[GLOBAL_SH_COUNT];
   uint64_t global_shader_va[GLOBAL_SH_COUNT];

   struct tu_shader *empty_tcs, *empty_tes, *empty_gs, *empty_fs, *empty_fs_fdm;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   BITSET_DECLARE(custom_border_color, TU_BORDER_COLOR_COUNT);
   mtx_t mutex;

   mtx_t vma_mutex;
   struct util_vma_heap vma;

   /* bo list for submits: */
   struct drm_msm_gem_submit_bo *bo_list;
   /* map bo handles to bo list index: */
   uint32_t bo_count, bo_list_size;
   mtx_t bo_mutex;
   /* protects imported BOs creation/freeing */
   struct u_rwlock dma_bo_lock;

   /* Tracking of name -> size allocated for TU_DEBUG_BOS */
   struct hash_table *bo_sizes;

   /* This array holds all our 'struct tu_bo' allocations. We use this
    * so we can add a refcount to our BOs and check if a particular BO
    * was already allocated in this device using its GEM handle. This is
    * necessary to properly manage BO imports, because the kernel doesn't
    * refcount the underlying BO memory.
    *
    * Specifically, when self-importing (i.e. importing a BO into the same
    * device that created it), the kernel will give us the same BO handle
    * for both BOs and we must only free it once when both references are
    * freed. Otherwise, if we are not self-importing, we get two different BO
    * handles, and we want to free each one individually.
    *
    * The refcount is also useful for being able to maintain BOs across
    * VK object lifetimes, such as pipelines suballocating out of BOs
    * allocated on the device.
    */
   struct util_sparse_array bo_map;

   /* We cannot immediately free the VMA when freeing a BO, because the
    * kernel only truly frees the BO once it stops being busy. So we have to
    * free our VMA only after the kernel has freed the BO.
    */
   struct u_vector zombie_vmas;

   /* Command streams to set pass index to a scratch reg */
   struct tu_cs *perfcntrs_pass_cs;
   struct tu_cs_entry *perfcntrs_pass_cs_entries;

   struct tu_cs *cmdbuf_start_a725_quirk_cs;
   struct tu_cs_entry *cmdbuf_start_a725_quirk_entry;

   struct util_dynarray dynamic_rendering_pending;
   VkCommandPool dynamic_rendering_pool;
   uint32_t dynamic_rendering_fence;

   /* Condition variable for timeline semaphore to notify waiters when a
    * new submit is executed. */
   pthread_cond_t timeline_cond;
   pthread_mutex_t submit_mutex;

   struct tu_autotune autotune;

   struct breadcrumbs_context *breadcrumbs_ctx;

   struct tu_cs *dbg_cmdbuf_stomp_cs;
   struct tu_cs *dbg_renderpass_stomp_cs;

#ifdef TU_HAS_VIRTIO
   struct tu_virtio_device *vdev;
#endif

   uint32_t submit_count;

   /* Address space and global fault count for this local_fd with DRM backend */
   uint64_t fault_count;

   struct u_trace_context trace_context;

   #ifdef HAVE_PERFETTO
   struct tu_perfetto_state perfetto;
   #endif

   bool use_z24uint_s8uint;
   bool use_lrz;

   struct fd_rd_output rd_output;
};
VK_DEFINE_HANDLE_CASTS(tu_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

struct tu_device_memory
{
   struct vk_device_memory vk;

   struct tu_bo *bo;

   /* for dedicated allocations */
   struct tu_image *image;
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_device_memory, vk.base, VkDeviceMemory,
                               VK_OBJECT_TYPE_DEVICE_MEMORY)

struct tu_attachment_info
{
   struct tu_image_view *attachment;
};

struct tu_tiling_config {
   /* size of the first tile */
   VkExtent2D tile0;
   /* number of tiles */
   VkExtent2D tile_count;

   /* size of the first VSC pipe */
   VkExtent2D pipe0;
   /* number of VSC pipes */
   VkExtent2D pipe_count;

   /* Whether using GMEM is even possible with this configuration */
   bool possible;

   /* Whether binning should be used for gmem rendering using this framebuffer. */
   bool binning;

   /* Whether binning could be used for gmem rendering using this framebuffer. */
   bool binning_possible;

   /* pipe register values */
   uint32_t pipe_config[MAX_VSC_PIPES];
   uint32_t pipe_sizes[MAX_VSC_PIPES];
};

struct tu_framebuffer
{
   struct vk_object_base base;

   uint32_t width;
   uint32_t height;
   uint32_t layers;

   struct tu_tiling_config tiling[TU_GMEM_LAYOUT_COUNT];

   uint32_t attachment_count;
   struct tu_attachment_info attachments[0];
};
VK_DEFINE_NONDISP_HANDLE_CASTS(tu_framebuffer, base, VkFramebuffer,
                               VK_OBJECT_TYPE_FRAMEBUFFER)

uint64_t
tu_get_system_heap_size(struct tu_physical_device *physical_device);

VkResult
tu_physical_device_init(struct tu_physical_device *device,
                        struct tu_instance *instance);

uint64_t
tu_device_ticks_to_ns(struct tu_device *dev, uint64_t ts);

static inline struct tu_bo *
tu_device_lookup_bo(struct tu_device *device, uint32_t handle)
{
   return (struct tu_bo *) util_sparse_array_get(&device->bo_map, handle);
}
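
/* Illustrative sketch of how the bo_map refcounting described in struct
 * tu_device is consumed when importing a BO (the `refcnt` and `gem_handle`
 * field names are assumptions; struct tu_bo is defined elsewhere):
 *
 *    struct tu_bo *bo = tu_device_lookup_bo(device, gem_handle);
 *    if (bo->gem_handle) {
 *       // Self-import: the kernel handed back an existing handle, so take
 *       // another reference instead of creating a second struct tu_bo.
 *       p_atomic_inc(&bo->refcnt);
 *    }
 */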

struct u_trace_context *
tu_device_get_u_trace(struct tu_device *device);

/* Get a scratch bo for use inside a command buffer. This will always return
 * the same bo given the same size or similar sizes, so only one scratch bo
 * can be used at the same time. It's meant for short-lived things where we
 * need to write to some piece of memory, read from it, and then immediately
 * discard it.
 */
VkResult
tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo);
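
/* Illustrative usage sketch (the requested size and the consumer of the BO
 * are hypothetical):
 *
 *    struct tu_bo *scratch_bo;
 *    VkResult result = tu_get_scratch_bo(dev, size, &scratch_bo);
 *    if (result != VK_SUCCESS)
 *       return result;
 *
 * scratch_bo->iova can then be referenced from the command stream. Since
 * callers asking for similar sizes get the same BO back, its contents must
 * not be assumed to survive beyond the current, short-lived use.
 */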

void tu_setup_dynamic_framebuffer(struct tu_cmd_buffer *cmd_buffer,
                                  const VkRenderingInfo *pRenderingInfo);

void
tu_copy_buffer(struct u_trace_context *utctx, void *cmdstream,
               void *ts_from, uint64_t from_offset_B,
               void *ts_to, uint64_t to_offset_B,
               uint64_t size_B);


VkResult
tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
                            struct u_trace **trace_copy);

/* If we copy trace and timestamps we will have to free them. */
struct tu_u_trace_cmd_data
{
   struct tu_cs *timestamp_copy_cs;
   struct u_trace *trace;
};

/* Data necessary to retrieve timestamps and clean up all
 * associated resources afterwards.
 */
struct tu_u_trace_submission_data
{
   uint32_t submission_id;
   /* We have to know when timestamps are available; this sync object
    * indicates it.
    */
   struct tu_u_trace_syncobj *syncobj;

   uint32_t cmd_buffer_count;
   uint32_t last_buffer_with_tracepoints;
   struct tu_u_trace_cmd_data *cmd_trace_data;

   /* GPU time is reset on a GPU power cycle, so the GPU time offset may
    * change between submissions.
    */
   uint64_t gpu_ts_offset;

   /* KGSL needs GPU memory to write submission timestamps into */
   struct tu_suballoc_bo kgsl_timestamp_bo;
};

VkResult
tu_u_trace_submission_data_create(
   struct tu_device *device,
   struct tu_cmd_buffer **cmd_buffers,
   uint32_t cmd_buffer_count,
   struct tu_u_trace_submission_data **submission_data);

void
tu_u_trace_submission_data_finish(
   struct tu_device *device,
   struct tu_u_trace_submission_data *submission_data);

const char *
tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name);
void
tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo);
void
tu_debug_bos_print_stats(struct tu_device *dev);

#endif /* TU_DEVICE_H */