/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#ifndef RADV_DEVICE_H
#define RADV_DEVICE_H

#include "ac_descriptors.h"
#include "ac_spm.h"
#include "ac_sqtt.h"

#include "util/mesa-blake3.h"

#include "radv_pipeline.h"
#include "radv_printf.h"
#include "radv_queue.h"
#include "radv_radeon_winsys.h"
#include "radv_rra.h"
#include "radv_shader.h"

#include "vk_device.h"
#include "vk_texcompress_astc.h"
#include "vk_texcompress_etc2.h"

#define RADV_NUM_HW_CTX (RADEON_CTX_PRIORITY_REALTIME + 1)
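/* One hardware context per winsys context priority level; this assumes
 * RADEON_CTX_PRIORITY_REALTIME is the highest value of the priority enum. */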

struct radv_image_view;

enum radv_dispatch_table {
   RADV_DEVICE_DISPATCH_TABLE,
   RADV_ANNOTATE_DISPATCH_TABLE,
   RADV_APP_DISPATCH_TABLE,
   RADV_RGP_DISPATCH_TABLE,
   RADV_RRA_DISPATCH_TABLE,
   RADV_RMV_DISPATCH_TABLE,
   RADV_CTX_ROLL_DISPATCH_TABLE,
   RADV_DISPATCH_TABLE_COUNT,
};

struct radv_layer_dispatch_tables {
   struct vk_device_dispatch_table annotate;
   struct vk_device_dispatch_table app;
   struct vk_device_dispatch_table rgp;
   struct vk_device_dispatch_table rra;
   struct vk_device_dispatch_table rmv;
   struct vk_device_dispatch_table ctx_roll;
};

struct radv_device_cache_key {
   uint32_t disable_trunc_coord : 1;
   uint32_t image_2d_view_of_3d : 1;
   uint32_t mesh_shader_queries : 1;
   uint32_t primitives_generated_query : 1;
};

enum radv_force_vrs {
   RADV_FORCE_VRS_1x1 = 0,
   RADV_FORCE_VRS_2x2,
   RADV_FORCE_VRS_2x1,
   RADV_FORCE_VRS_1x2,
};

struct radv_notifier {
   int fd;
   int watch;
   bool quit;
   thrd_t thread;
};

struct radv_meta_state {
   VkAllocationCallbacks alloc;

   VkPipelineCache cache;
   uint32_t initial_cache_entries;

   /*
    * For on-demand pipeline creation: ensures that only one thread
    * tries to build a pipeline at a time.
    */
   mtx_t mtx;
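
   /* Typical on-demand creation pattern (an illustrative sketch only;
    * the pipeline field and helper below are hypothetical names):
    *
    *    mtx_lock(&state->mtx);
    *    if (state->some_pipeline == VK_NULL_HANDLE)
    *       result = create_some_pipeline(device, &state->some_pipeline);
    *    mtx_unlock(&state->mtx);
    */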

   /**
    * Use array element `i` for images with `2^i` samples.
    */
   struct {
      VkPipeline color_pipelines[NUM_META_FS_KEYS];
   } color_clear[MAX_SAMPLES_LOG2][MAX_RTS];
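   /* Example: clearing render target `rt` of a 4-sample (2^2) image uses
    * color_clear[2][rt].color_pipelines[fs_key], where fs_key selects one
    * of the NUM_META_FS_KEYS per-format variants. */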

   struct {
      VkPipeline depth_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
      VkPipeline stencil_only_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
      VkPipeline depthstencil_pipeline[NUM_DEPTH_CLEAR_PIPELINES];

      VkPipeline depth_only_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
      VkPipeline stencil_only_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
      VkPipeline depthstencil_unrestricted_pipeline[NUM_DEPTH_CLEAR_PIPELINES];
   } ds_clear[MAX_SAMPLES_LOG2];

   VkPipelineLayout clear_color_p_layout;
   VkPipelineLayout clear_depth_p_layout;
   VkPipelineLayout clear_depth_unrestricted_p_layout;

   /* Optimized compute fast HTILE clear for stencil or depth only. */
   VkPipeline clear_htile_mask_pipeline;
   VkPipelineLayout clear_htile_mask_p_layout;
   VkDescriptorSetLayout clear_htile_mask_ds_layout;

   /* Copy VRS into HTILE. */
   VkPipeline copy_vrs_htile_pipeline;
   VkPipelineLayout copy_vrs_htile_p_layout;
   VkDescriptorSetLayout copy_vrs_htile_ds_layout;

   /* Clear DCC with comp-to-single. */
   VkPipeline clear_dcc_comp_to_single_pipeline[2]; /* 0: 1x, 1: 2x/4x/8x */
   VkPipelineLayout clear_dcc_comp_to_single_p_layout;
   VkDescriptorSetLayout clear_dcc_comp_to_single_ds_layout;

   struct {
      /** Pipeline that blits from a 1D image. */
      VkPipeline pipeline_1d_src[NUM_META_FS_KEYS];

      /** Pipeline that blits from a 2D image. */
      VkPipeline pipeline_2d_src[NUM_META_FS_KEYS];

      /** Pipeline that blits from a 3D image. */
      VkPipeline pipeline_3d_src[NUM_META_FS_KEYS];

      VkPipeline depth_only_1d_pipeline;
      VkPipeline depth_only_2d_pipeline;
      VkPipeline depth_only_3d_pipeline;

      VkPipeline stencil_only_1d_pipeline;
      VkPipeline stencil_only_2d_pipeline;
      VkPipeline stencil_only_3d_pipeline;
      VkPipelineLayout pipeline_layout;
      VkDescriptorSetLayout ds_layout;
   } blit;

   struct {
      VkPipelineLayout p_layouts[5];
      VkDescriptorSetLayout ds_layouts[5];
      VkPipeline pipelines[5][NUM_META_FS_KEYS];

      VkPipeline depth_only_pipeline[5];

      VkPipeline stencil_only_pipeline[5];
   } blit2d[MAX_SAMPLES_LOG2];

   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
      VkPipeline pipeline_3d;
   } itob;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
      VkPipeline pipeline_3d;
   } btoi;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
   } btoi_r32g32b32;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline[MAX_SAMPLES_LOG2];
      VkPipeline pipeline_2d_3d;
      VkPipeline pipeline_3d_2d;
      VkPipeline pipeline_3d_3d;
   } itoi;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
   } itoi_r32g32b32;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline[MAX_SAMPLES_LOG2];
      VkPipeline pipeline_3d;
   } cleari;
   struct {
      VkPipelineLayout img_p_layout;
      VkDescriptorSetLayout img_ds_layout;
      VkPipeline pipeline;
   } cleari_r32g32b32;
   struct {
      VkPipelineLayout p_layout;
      VkDescriptorSetLayout ds_layout;
      VkPipeline pipeline[MAX_SAMPLES_LOG2];
   } fmask_copy;

   struct {
      VkPipelineLayout p_layout;
      VkPipeline pipeline[NUM_META_FS_KEYS];
   } resolve;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
      struct {
         VkPipeline pipeline;
         VkPipeline i_pipeline;
         VkPipeline srgb_pipeline;
      } rc[MAX_SAMPLES_LOG2];

      VkPipeline depth_zero_pipeline;
      struct {
         VkPipeline average_pipeline;
         VkPipeline max_pipeline;
         VkPipeline min_pipeline;
      } depth[MAX_SAMPLES_LOG2];

      VkPipeline stencil_zero_pipeline;
      struct {
         VkPipeline max_pipeline;
         VkPipeline min_pipeline;
      } stencil[MAX_SAMPLES_LOG2];
   } resolve_compute;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;

      struct {
         VkPipeline pipeline[NUM_META_FS_KEYS];
      } rc[MAX_SAMPLES_LOG2];

      VkPipeline depth_zero_pipeline;
      struct {
         VkPipeline average_pipeline;
         VkPipeline max_pipeline;
         VkPipeline min_pipeline;
      } depth[MAX_SAMPLES_LOG2];

      VkPipeline stencil_zero_pipeline;
      struct {
         VkPipeline max_pipeline;
         VkPipeline min_pipeline;
      } stencil[MAX_SAMPLES_LOG2];
   } resolve_fragment;

   struct {
      VkPipelineLayout p_layout;
      VkPipeline decompress_pipeline[MAX_SAMPLES_LOG2];
   } depth_decomp;

   VkDescriptorSetLayout expand_depth_stencil_compute_ds_layout;
   VkPipelineLayout expand_depth_stencil_compute_p_layout;
   VkPipeline expand_depth_stencil_compute_pipeline;

   struct {
      VkPipelineLayout p_layout;
      VkPipeline cmask_eliminate_pipeline;
      VkPipeline fmask_decompress_pipeline;
      VkPipeline dcc_decompress_pipeline;

      VkDescriptorSetLayout dcc_decompress_compute_ds_layout;
      VkPipelineLayout dcc_decompress_compute_p_layout;
      VkPipeline dcc_decompress_compute_pipeline;
   } fast_clear_flush;

   struct {
      VkPipelineLayout fill_p_layout;
      VkPipelineLayout copy_p_layout;
      VkPipeline fill_pipeline;
      VkPipeline copy_pipeline;
   } buffer;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
      VkPipeline occlusion_query_pipeline;
      VkPipeline pipeline_statistics_query_pipeline;
      VkPipeline tfb_query_pipeline;
      VkPipeline timestamp_query_pipeline;
      VkPipeline pg_query_pipeline;
      VkPipeline ms_prim_gen_query_pipeline;
   } query;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
      VkPipeline pipeline[MAX_SAMPLES_LOG2];
   } fmask_expand;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
      VkPipeline pipeline[32];
   } dcc_retile;

   struct {
      VkPipelineLayout leaf_p_layout;
      VkPipeline leaf_pipeline;
      VkPipeline leaf_updateable_pipeline;
      VkPipelineLayout morton_p_layout;
      VkPipeline morton_pipeline;
      VkPipelineLayout lbvh_main_p_layout;
      VkPipeline lbvh_main_pipeline;
      VkPipelineLayout lbvh_generate_ir_p_layout;
      VkPipeline lbvh_generate_ir_pipeline;
      VkPipelineLayout ploc_p_layout;
      VkPipeline ploc_pipeline;
      VkPipelineLayout encode_p_layout;
      VkPipeline encode_pipeline;
      VkPipeline encode_compact_pipeline;
      VkPipelineLayout header_p_layout;
      VkPipeline header_pipeline;
      VkPipelineLayout update_p_layout;
      VkPipeline update_pipeline;
      VkPipelineLayout copy_p_layout;
      VkPipeline copy_pipeline;

      struct radix_sort_vk *radix_sort;

      struct {
         VkBuffer buffer;
         VkDeviceMemory memory;
         VkAccelerationStructureKHR accel_struct;
      } null;
   } accel_struct_build;

   struct vk_texcompress_etc2_state etc_decode;

   struct vk_texcompress_astc_state *astc_decode;

   struct {
      VkDescriptorSetLayout ds_layout;
      VkPipelineLayout p_layout;
   } dgc_prepare;
};

struct radv_memory_trace_data {
   /* ID of the PTE update event in ftrace data. */
   uint16_t ftrace_update_ptes_id;

   uint32_t num_cpus;
   int *pipe_fds;
};

struct radv_sqtt_timestamp {
   uint8_t *map;
   unsigned offset;
   uint64_t size;
   struct radeon_winsys_bo *bo;
   struct list_head list;
};

#define RADV_BORDER_COLOR_COUNT 4096
#define RADV_BORDER_COLOR_BUFFER_SIZE (sizeof(VkClearColorValue) * RADV_BORDER_COLOR_COUNT)
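/* With sizeof(VkClearColorValue) == 16 bytes, this works out to
 * 4096 * 16 = 64 KiB of GPU memory for the border color buffer. */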

struct radv_device_border_color_data {
   bool used[RADV_BORDER_COLOR_COUNT];

   struct radeon_winsys_bo *bo;
   VkClearColorValue *colors_gpu_ptr;

   /* The mutex is required to guarantee vkCreateSampler thread safety,
    * given that we are writing to a buffer and checking which slots are
    * occupied. */
   mtx_t mutex;
};
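
/* Illustrative allocation flow for a custom border color slot (a minimal
 * sketch; the real helper lives in the sampler code and differs in detail):
 *
 *    mtx_lock(&data->mutex);
 *    for (uint32_t i = 0; i < RADV_BORDER_COLOR_COUNT; i++) {
 *       if (!data->used[i]) {
 *          data->used[i] = true;
 *          data->colors_gpu_ptr[i] = value;
 *          slot = i;
 *          break;
 *       }
 *    }
 *    mtx_unlock(&data->mutex);
 */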

struct radv_pso_cache_stats {
   uint32_t hits;
   uint32_t misses;
};

struct radv_device {
   struct vk_device vk;

   struct radeon_winsys *ws;

   struct radv_layer_dispatch_tables layer_dispatch;

   struct radeon_winsys_ctx *hw_ctx[RADV_NUM_HW_CTX];
   struct radv_meta_state meta_state;

   struct radv_queue *queues[RADV_MAX_QUEUE_FAMILIES];
   int queue_count[RADV_MAX_QUEUE_FAMILIES];

   bool pbb_allowed;
   uint32_t scratch_waves;
   uint32_t dispatch_initiator;
   uint32_t dispatch_initiator_task;

   /* MSAA sample locations.
    * The first index is the sample index.
    * The second index is the coordinate: X, Y. */
   float sample_locations_1x[1][2];
   float sample_locations_2x[2][2];
   float sample_locations_4x[4][2];
   float sample_locations_8x[8][2];
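   /* Example: sample_locations_4x[2][1] is the Y coordinate of sample 2
    * in the default 4x pattern. */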

   /* GFX7 and later */
   uint32_t gfx_init_size_dw;
   struct radeon_winsys_bo *gfx_init;

   struct radeon_winsys_bo *trace_bo;
   struct radv_trace_data *trace_data;

   /* Whether to keep shader debug info, for debugging. */
   bool keep_shader_info;

   /* Backup in-memory cache to be used if the app doesn't provide one. */
   struct vk_pipeline_cache *mem_cache;

   /*
    * Use different counters so MSAA MRTs get consecutive surface indices,
    * even if MASK is allocated in between.
    */
   uint32_t image_mrt_offset_counter;
   uint32_t fmask_mrt_offset_counter;

   struct list_head shader_arenas;
   struct hash_table_u64 *capture_replay_arena_vas;
   unsigned shader_arena_shift;
   uint8_t shader_free_list_mask;
   struct radv_shader_free_list shader_free_list;
   struct radv_shader_free_list capture_replay_free_list;
   struct list_head shader_block_obj_pool;
   mtx_t shader_arena_mutex;

   mtx_t shader_upload_hw_ctx_mutex;
   struct radeon_winsys_ctx *shader_upload_hw_ctx;
   VkSemaphore shader_upload_sem;
   uint64_t shader_upload_seq;
   struct list_head shader_dma_submissions;
   mtx_t shader_dma_submission_list_mutex;
   cnd_t shader_dma_submission_list_cond;

   /* Whether to DMA shaders to invisible VRAM or to upload directly through BAR. */
   bool shader_use_invisible_vram;

   /* Whether to inline the compute dispatch size in user SGPRs. */
   bool load_grid_size_from_user_sgpr;

   /* Whether the driver uses a global BO list. */
   bool use_global_bo_list;

   /* Whether anisotropy is forced with RADV_TEX_ANISO (-1 is disabled). */
   int force_aniso;

   /* Always disable TRUNC_COORD. */
   bool disable_trunc_coord;

   struct radv_device_border_color_data border_color_data;

   /* Thread trace. */
   struct ac_sqtt sqtt;
   bool sqtt_enabled;
   bool sqtt_triggered;

   /* SQTT timestamps for queue events. */
   simple_mtx_t sqtt_timestamp_mtx;
   struct radv_sqtt_timestamp sqtt_timestamp;

   /* SQTT timed cmd buffers. */
   simple_mtx_t sqtt_command_pool_mtx;
   struct vk_command_pool *sqtt_command_pool[2];

   /* Memory trace. */
   struct radv_memory_trace_data memory_trace;

   /* SPM. */
   struct ac_spm spm;

   /* Radeon Raytracing Analyzer trace. */
   struct radv_rra_trace_data rra_trace;

   FILE *ctx_roll_file;
   simple_mtx_t ctx_roll_mtx;

   /* Trap handler. */
   struct radv_shader *trap_handler_shader;
   struct radeon_winsys_bo *tma_bo; /* Trap Memory Address */
   uint32_t *tma_ptr;

   /* Overallocation. */
   bool overallocation_disallowed;
   uint64_t allocated_memory_size[VK_MAX_MEMORY_HEAPS];
   mtx_t overallocation_mutex;

   /* RADV_FORCE_VRS. */
   struct radv_notifier notifier;
   enum radv_force_vrs force_vrs;

   /* Depth image for VRS when not bound by the app. */
   struct {
      struct radv_image *image;
      struct radv_buffer *buffer; /* HTILE */
      struct radv_device_memory *mem;
   } vrs;

   /* PRIME blit SDMA queue. */
   struct radv_queue *private_sdma_queue;

   struct radv_shader_part_cache vs_prologs;
   struct radv_shader_part *simple_vs_prologs[MAX_VERTEX_ATTRIBS];
   struct radv_shader_part *instance_rate_vs_prologs[816];

   struct radv_shader_part_cache ps_epilogs;

   simple_mtx_t trace_mtx;

   /* Whether per-vertex VRS is forced. */
   bool force_vrs_enabled;

   simple_mtx_t pstate_mtx;
   unsigned pstate_cnt;

   /* BO to contain some performance counter helpers:
    * - a lock for profiling command buffers,
    * - a temporary fence for end-of-query synchronization,
    * - the pass to use for profiling (as an array of bools).
    */
   struct radeon_winsys_bo *perf_counter_bo;

   /* Interleaved lock/unlock command buffers for perfcounter passes. */
   struct radeon_cmdbuf **perf_counter_lock_cs;

   bool uses_shadow_regs;

   struct hash_table *rt_handles;
   simple_mtx_t rt_handles_mtx;

   struct radv_printf_data printf;

   struct radv_device_cache_key cache_key;
   blake3_hash cache_hash;

   /* Not NULL if a GPU hang report has been generated for VK_EXT_device_fault. */
   char *gpu_hang_report;

   /* For indirect compute pipeline binds with DGC only. */
   simple_mtx_t compute_scratch_mtx;
   uint32_t compute_scratch_size_per_wave;
   uint32_t compute_scratch_waves;

   /* PSO cache stats. */
   simple_mtx_t pso_cache_stats_mtx;
   struct radv_pso_cache_stats pso_cache_stats[RADV_PIPELINE_TYPE_COUNT];
};

VK_DEFINE_HANDLE_CASTS(radv_device, vk.base, VkDevice, VK_OBJECT_TYPE_DEVICE)

static inline struct radv_physical_device *
radv_device_physical(const struct radv_device *dev)
{
   return (struct radv_physical_device *)dev->vk.physical;
}
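
/* Illustrative entrypoint usage (a sketch; VK_FROM_HANDLE is provided by the
 * common Vulkan runtime, and `_device` is a hypothetical parameter name):
 *
 *    VK_FROM_HANDLE(radv_device, device, _device);
 *    struct radv_physical_device *pdev = radv_device_physical(device);
 */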

static inline bool
radv_uses_device_generated_commands(const struct radv_device *device)
{
   return device->vk.enabled_features.deviceGeneratedCommandsNV || device->vk.enabled_features.deviceGeneratedCompute;
}

static inline bool
radv_uses_primitives_generated_query(const struct radv_device *device)
{
   return device->vk.enabled_features.primitivesGeneratedQuery ||
          device->vk.enabled_features.primitivesGeneratedQueryWithRasterizerDiscard ||
          device->vk.enabled_features.primitivesGeneratedQueryWithNonZeroStreams;
}

static inline bool
radv_uses_image_float32_atomics(const struct radv_device *device)
{
   return device->vk.enabled_features.shaderImageFloat32Atomics ||
          device->vk.enabled_features.sparseImageFloat32Atomics ||
          device->vk.enabled_features.shaderImageFloat32AtomicMinMax ||
          device->vk.enabled_features.sparseImageFloat32AtomicMinMax;
}

VkResult radv_device_init_vrs_state(struct radv_device *device);

unsigned radv_get_default_max_sample_dist(int log_samples);

void radv_emit_default_sample_locations(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs,
                                        int nr_samples);

bool radv_get_memory_fd(struct radv_device *device, struct radv_device_memory *memory, int *pFD);

unsigned radv_get_dcc_max_uncompressed_block_size(const struct radv_device *device, const struct radv_image *image);

struct radv_color_buffer_info {
   struct ac_cb_surface ac;
};

struct radv_ds_buffer_info {
   struct ac_ds_surface ac;

   uint32_t db_render_override2;
   uint32_t db_render_control;
};

void radv_initialise_color_surface(struct radv_device *device, struct radv_color_buffer_info *cb,
                                   struct radv_image_view *iview);

void radv_initialise_vrs_surface(struct radv_image *image, struct radv_buffer *htile_buffer,
                                 struct radv_ds_buffer_info *ds);

void radv_initialise_ds_surface(const struct radv_device *device, struct radv_ds_buffer_info *ds,
                                struct radv_image_view *iview, VkImageAspectFlags ds_aspects);

void radv_gfx11_set_db_render_control(const struct radv_device *device, unsigned num_samples,
                                      unsigned *db_render_control);

bool radv_device_set_pstate(struct radv_device *device, bool enable);

bool radv_device_acquire_performance_counters(struct radv_device *device);

void radv_device_release_performance_counters(struct radv_device *device);

#endif /* RADV_DEVICE_H */