/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 */

#include <fcntl.h>
#include <stdbool.h>
#include <string.h>

#ifdef __FreeBSD__
#include <sys/types.h>
#endif
#ifdef MAJOR_IN_MKDEV
#include <sys/mkdev.h>
#endif
#ifdef MAJOR_IN_SYSMACROS
#include <sys/sysmacros.h>
#endif

#ifdef __linux__
#include <sys/inotify.h>
#endif

#include "meta/radv_meta.h"
#include "util/disk_cache.h"
#include "util/u_debug.h"
#include "radv_cs.h"
#include "radv_debug.h"
#include "radv_entrypoints.h"
#include "radv_formats.h"
#include "radv_physical_device.h"
#include "radv_printf.h"
#include "radv_rmv.h"
#include "radv_shader.h"
#include "radv_spm.h"
#include "radv_sqtt.h"
#include "vk_common_entrypoints.h"
#include "vk_pipeline_cache.h"
#include "vk_semaphore.h"
#include "vk_util.h"
#ifdef _WIN32
typedef void *drmDevicePtr;
#include <io.h>
#else
#include <amdgpu.h>
#include <xf86drm.h>
#include "drm-uapi/amdgpu_drm.h"
#include "winsys/amdgpu/radv_amdgpu_winsys_public.h"
#endif
#include "util/build_id.h"
#include "util/driconf.h"
#include "util/mesa-sha1.h"
#include "util/os_time.h"
#include "util/timespec.h"
#include "util/u_atomic.h"
#include "util/u_process.h"
#include "vulkan/vk_icd.h"
#include "winsys/null/radv_null_winsys_public.h"
#include "git_sha1.h"
#include "sid.h"
#include "vk_format.h"
#include "vk_sync.h"
#include "vk_sync_dummy.h"

#if AMD_LLVM_AVAILABLE
#include "ac_llvm_util.h"
#endif

#include "ac_descriptors.h"
#include "ac_formats.h"

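/* SPM (streaming perf monitor) counters are only captured together with an
 * RGP trace; RADV_THREAD_TRACE_CACHE_COUNTERS (default: true) opts out of the
 * cache counters.
 */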
static bool
radv_spm_trace_enabled(const struct radv_instance *instance)
{
   return (instance->vk.trace_mode & RADV_TRACE_MODE_RGP) &&
          debug_get_bool_option("RADV_THREAD_TRACE_CACHE_COUNTERS", true);
}

static bool
radv_trap_handler_enabled()
{
   return !!getenv("RADV_TRAP_HANDLER");
}

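/* VK_EXT_external_memory_host: host allocations can only be placed in the
 * first GTT memory type without the write-combined flag, i.e. cacheable
 * system memory.
 */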
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetMemoryHostPointerPropertiesEXT(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType,
                                       const void *pHostPointer,
                                       VkMemoryHostPointerPropertiesEXT *pMemoryHostPointerProperties)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   const struct radv_physical_device *pdev = radv_device_physical(device);

   switch (handleType) {
   case VK_EXTERNAL_MEMORY_HANDLE_TYPE_HOST_ALLOCATION_BIT_EXT: {
      uint32_t memoryTypeBits = 0;
      for (int i = 0; i < pdev->memory_properties.memoryTypeCount; i++) {
         if (pdev->memory_domains[i] == RADEON_DOMAIN_GTT && !(pdev->memory_flags[i] & RADEON_FLAG_GTT_WC)) {
            memoryTypeBits = (1 << i);
            break;
         }
      }
      pMemoryHostPointerProperties->memoryTypeBits = memoryTypeBits;
      return VK_SUCCESS;
   }
   default:
      return VK_ERROR_INVALID_EXTERNAL_HANDLE;
   }
}

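/* Allocate and persistently map the palette BO for
 * VK_EXT_custom_border_color; it is kept resident for the device lifetime so
 * samplers can reference it.
 */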
static VkResult
radv_device_init_border_color(struct radv_device *device)
{
   VkResult result;

   result = radv_bo_create(device, NULL, RADV_BORDER_COLOR_BUFFER_SIZE, 4096, RADEON_DOMAIN_VRAM,
                           RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_READ_ONLY | RADEON_FLAG_NO_INTERPROCESS_SHARING,
                           RADV_BO_PRIORITY_SHADER, 0, true, &device->border_color_data.bo);

   if (result != VK_SUCCESS)
      return vk_error(device, result);

   radv_rmv_log_border_color_palette_create(device, device->border_color_data.bo);

   result = device->ws->buffer_make_resident(device->ws, device->border_color_data.bo, true);
   if (result != VK_SUCCESS)
      return vk_error(device, result);

   device->border_color_data.colors_gpu_ptr = radv_buffer_map(device->ws, device->border_color_data.bo);
   if (!device->border_color_data.colors_gpu_ptr)
      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   mtx_init(&device->border_color_data.mutex, mtx_plain);

   return VK_SUCCESS;
}

static void
radv_device_finish_border_color(struct radv_device *device)
{
   if (device->border_color_data.bo) {
      radv_rmv_log_border_color_palette_destroy(device, device->border_color_data.bo);
      device->ws->buffer_make_resident(device->ws, device->border_color_data.bo, false);
      radv_bo_destroy(device, NULL, device->border_color_data.bo);

      mtx_destroy(&device->border_color_data.mutex);
   }
}

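/* VS prologs are cached by hashing/comparing the raw bytes of
 * radv_vs_prolog_key, so keys must be fully zero-initialized (including
 * padding) before use.
 */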
static struct radv_shader_part *
_radv_create_vs_prolog(struct radv_device *device, const void *_key)
{
   struct radv_vs_prolog_key *key = (struct radv_vs_prolog_key *)_key;
   return radv_create_vs_prolog(device, key);
}

static uint32_t
radv_hash_vs_prolog(const void *key_)
{
   const struct radv_vs_prolog_key *key = key_;
   return _mesa_hash_data(key, sizeof(*key));
}

static bool
radv_cmp_vs_prolog(const void *a_, const void *b_)
{
   const struct radv_vs_prolog_key *a = a_;
   const struct radv_vs_prolog_key *b = b_;

   return memcmp(a, b, sizeof(*a)) == 0;
}

static struct radv_shader_part_cache_ops vs_prolog_ops = {
   .create = _radv_create_vs_prolog,
   .hash = radv_hash_vs_prolog,
   .equals = radv_cmp_vs_prolog,
};

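/* Pre-compile the common VS prologs: one "simple" prolog per attribute count,
 * plus one variant for every contiguous instance-rate input range with up to
 * 16 attributes (sum of n * (n + 1) / 2 for n = 1..16, i.e. 816 entries).
 */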
static VkResult
radv_device_init_vs_prologs(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);

   if (!radv_shader_part_cache_init(&device->vs_prologs, &vs_prolog_ops))
      return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* don't pre-compile prologs if we want to print them */
   if (instance->debug_flags & RADV_DEBUG_DUMP_PROLOGS)
      return VK_SUCCESS;

   struct radv_vs_prolog_key key;
   memset(&key, 0, sizeof(key));
   key.as_ls = false;
   key.is_ngg = pdev->use_ngg;
   key.next_stage = MESA_SHADER_VERTEX;
   key.wave32 = pdev->ge_wave_size == 32;

   for (unsigned i = 1; i <= MAX_VERTEX_ATTRIBS; i++) {
      key.instance_rate_inputs = 0;
      key.num_attributes = i;

      device->simple_vs_prologs[i - 1] = radv_create_vs_prolog(device, &key);
      if (!device->simple_vs_prologs[i - 1])
         return vk_error(instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   }

   unsigned idx = 0;
   for (unsigned num_attributes = 1; num_attributes <= 16; num_attributes++) {
      for (unsigned count = 1; count <= num_attributes; count++) {
         for (unsigned start = 0; start <= (num_attributes - count); start++) {
            key.instance_rate_inputs = u_bit_consecutive(start, count);
            key.num_attributes = num_attributes;

            struct radv_shader_part *prolog = radv_create_vs_prolog(device, &key);
            if (!prolog)
               return vk_error(instance, VK_ERROR_OUT_OF_DEVICE_MEMORY);

            assert(idx == radv_instance_rate_prolog_index(num_attributes, key.instance_rate_inputs));
            device->instance_rate_vs_prologs[idx++] = prolog;
         }
      }
   }
   assert(idx == ARRAY_SIZE(device->instance_rate_vs_prologs));

   return VK_SUCCESS;
}

static void
radv_device_finish_vs_prologs(struct radv_device *device)
{
   if (device->vs_prologs.ops)
      radv_shader_part_cache_finish(device, &device->vs_prologs);

   for (unsigned i = 0; i < ARRAY_SIZE(device->simple_vs_prologs); i++) {
      if (!device->simple_vs_prologs[i])
         continue;

      radv_shader_part_unref(device, device->simple_vs_prologs[i]);
   }

   for (unsigned i = 0; i < ARRAY_SIZE(device->instance_rate_vs_prologs); i++) {
      if (!device->instance_rate_vs_prologs[i])
         continue;

      radv_shader_part_unref(device, device->instance_rate_vs_prologs[i]);
   }
}

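/* PS epilogs use the same byte-wise key hashing scheme as the VS prolog cache
 * above.
 */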
static struct radv_shader_part *
_radv_create_ps_epilog(struct radv_device *device, const void *_key)
{
   struct radv_ps_epilog_key *key = (struct radv_ps_epilog_key *)_key;
   return radv_create_ps_epilog(device, key, NULL);
}

static uint32_t
radv_hash_ps_epilog(const void *key_)
{
   const struct radv_ps_epilog_key *key = key_;
   return _mesa_hash_data(key, sizeof(*key));
}

static bool
radv_cmp_ps_epilog(const void *a_, const void *b_)
{
   const struct radv_ps_epilog_key *a = a_;
   const struct radv_ps_epilog_key *b = b_;

   return memcmp(a, b, sizeof(*a)) == 0;
}

static struct radv_shader_part_cache_ops ps_epilog_ops = {
   .create = _radv_create_ps_epilog,
   .hash = radv_hash_ps_epilog,
   .equals = radv_cmp_ps_epilog,
};

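/* Internal resources used when VRS rates are forced: a framebuffer-sized D16
 * depth image, plus a storage buffer sized from that image's depth metadata
 * size (surface.meta_size).
 */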
VkResult
radv_device_init_vrs_state(struct radv_device *device)
{
   VkDeviceMemory mem;
   VkBuffer buffer;
   VkResult result;
   VkImage image;

   VkImageCreateInfo image_create_info = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
      .imageType = VK_IMAGE_TYPE_2D,
      .format = VK_FORMAT_D16_UNORM,
      .extent = {MAX_FRAMEBUFFER_WIDTH, MAX_FRAMEBUFFER_HEIGHT, 1},
      .mipLevels = 1,
      .arrayLayers = 1,
      .samples = VK_SAMPLE_COUNT_1_BIT,
      .tiling = VK_IMAGE_TILING_OPTIMAL,
      .usage = VK_IMAGE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
      .queueFamilyIndexCount = 0,
      .pQueueFamilyIndices = NULL,
      .initialLayout = VK_IMAGE_LAYOUT_UNDEFINED,
   };

   result =
      radv_image_create(radv_device_to_handle(device), &(struct radv_image_create_info){.vk_info = &image_create_info},
                        &device->meta_state.alloc, &image, true);
   if (result != VK_SUCCESS)
      return result;

   VkBufferCreateInfo buffer_create_info = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
      .pNext =
         &(VkBufferUsageFlags2CreateInfoKHR){
            .sType = VK_STRUCTURE_TYPE_BUFFER_USAGE_FLAGS_2_CREATE_INFO_KHR,
            .usage = VK_BUFFER_USAGE_2_STORAGE_BUFFER_BIT_KHR,
         },
      .size = radv_image_from_handle(image)->planes[0].surface.meta_size,
      .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
   };

   result = radv_create_buffer(device, &buffer_create_info, &device->meta_state.alloc, &buffer, true);
   if (result != VK_SUCCESS)
      goto fail_create;

   VkBufferMemoryRequirementsInfo2 info = {
      .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_REQUIREMENTS_INFO_2,
      .buffer = buffer,
   };
   VkMemoryRequirements2 mem_req = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_REQUIREMENTS_2,
   };
   vk_common_GetBufferMemoryRequirements2(radv_device_to_handle(device), &info, &mem_req);

   VkMemoryAllocateInfo alloc_info = {
      .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
      .allocationSize = mem_req.memoryRequirements.size,
   };

   result = radv_alloc_memory(device, &alloc_info, &device->meta_state.alloc, &mem, true);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   VkBindBufferMemoryInfo bind_info = {.sType = VK_STRUCTURE_TYPE_BIND_BUFFER_MEMORY_INFO,
                                       .buffer = buffer,
                                       .memory = mem,
                                       .memoryOffset = 0};

   result = radv_BindBufferMemory2(radv_device_to_handle(device), 1, &bind_info);
   if (result != VK_SUCCESS)
      goto fail_bind;

   device->vrs.image = radv_image_from_handle(image);
   device->vrs.buffer = radv_buffer_from_handle(buffer);
   device->vrs.mem = radv_device_memory_from_handle(mem);

   return VK_SUCCESS;

fail_bind:
   radv_FreeMemory(radv_device_to_handle(device), mem, &device->meta_state.alloc);
fail_alloc:
   radv_DestroyBuffer(radv_device_to_handle(device), buffer, &device->meta_state.alloc);
fail_create:
   radv_DestroyImage(radv_device_to_handle(device), image, &device->meta_state.alloc);

   return result;
}

static void
radv_device_finish_vrs_image(struct radv_device *device)
{
   if (!device->vrs.image)
      return;

   radv_FreeMemory(radv_device_to_handle(device), radv_device_memory_to_handle(device->vrs.mem),
                   &device->meta_state.alloc);
   radv_DestroyBuffer(radv_device_to_handle(device), radv_buffer_to_handle(device->vrs.buffer),
                      &device->meta_state.alloc);
   radv_DestroyImage(radv_device_to_handle(device), radv_image_to_handle(device->vrs.image), &device->meta_state.alloc);
}

static enum radv_force_vrs
radv_parse_vrs_rates(const char *str)
{
   if (!strcmp(str, "2x2")) {
      return RADV_FORCE_VRS_2x2;
   } else if (!strcmp(str, "2x1")) {
      return RADV_FORCE_VRS_2x1;
   } else if (!strcmp(str, "1x2")) {
      return RADV_FORCE_VRS_1x2;
   } else if (!strcmp(str, "1x1")) {
      return RADV_FORCE_VRS_1x1;
   }

   fprintf(stderr, "radv: Invalid VRS rates specified (valid values are 2x2, 2x1, 1x2 and 1x1)\n");
   return RADV_FORCE_VRS_1x1;
}

static const char *
radv_get_force_vrs_config_file(void)
{
   return getenv("RADV_FORCE_VRS_CONFIG_FILE");
}

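/* Parse the forced VRS rate from the config file. The file must contain at
 * least sizeof(buf) = 4 bytes, e.g. "2x2\n"; only the first three characters
 * are interpreted as the rate.
 */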
static enum radv_force_vrs
radv_parse_force_vrs_config_file(const char *config_file)
{
   enum radv_force_vrs force_vrs = RADV_FORCE_VRS_1x1;
   char buf[4];
   FILE *f;

   f = fopen(config_file, "r");
   if (!f) {
      fprintf(stderr, "radv: Can't open file: '%s'.\n", config_file);
      return force_vrs;
   }

   if (fread(buf, sizeof(buf), 1, f) == 1) {
      buf[3] = '\0';
      force_vrs = radv_parse_vrs_rates(buf);
   }

   fclose(f);
   return force_vrs;
}

#ifdef __linux__

#define BUF_LEN ((10 * (sizeof(struct inotify_event) + NAME_MAX + 1)))

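/* Polling thread that watches RADV_FORCE_VRS_CONFIG_FILE through inotify and
 * re-parses it on modification; a self-delete event (editors replacing the
 * file) re-arms the watch.
 */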
static int
radv_notifier_thread_run(void *data)
{
   struct radv_device *device = data;
   struct radv_notifier *notifier = &device->notifier;
   char buf[BUF_LEN];

   while (!notifier->quit) {
      const char *file = radv_get_force_vrs_config_file();
      struct timespec tm = {.tv_nsec = 100000000}; /* 100ms */
      int length, i = 0;

      length = read(notifier->fd, buf, BUF_LEN);
      while (i < length) {
         struct inotify_event *event = (struct inotify_event *)&buf[i];

         i += sizeof(struct inotify_event) + event->len;
         if (event->mask & IN_MODIFY || event->mask & IN_DELETE_SELF) {
            /* Sleep 100ms for editors that use a temporary file and delete the original. */
            thrd_sleep(&tm, NULL);
            device->force_vrs = radv_parse_force_vrs_config_file(file);

            fprintf(stderr, "radv: Updated the per-vertex VRS rate to '%d'.\n", device->force_vrs);

            if (event->mask & IN_DELETE_SELF) {
               inotify_rm_watch(notifier->fd, notifier->watch);
               notifier->watch = inotify_add_watch(notifier->fd, file, IN_MODIFY | IN_DELETE_SELF);
            }
         }
      }

      thrd_sleep(&tm, NULL);
   }

   return 0;
}

#endif

static int
radv_device_init_notifier(struct radv_device *device)
{
#ifndef __linux__
   return true;
#else
   struct radv_notifier *notifier = &device->notifier;
   const char *file = radv_get_force_vrs_config_file();
   int ret;

   notifier->fd = inotify_init1(IN_NONBLOCK);
   if (notifier->fd < 0)
      return false;

   notifier->watch = inotify_add_watch(notifier->fd, file, IN_MODIFY | IN_DELETE_SELF);
   if (notifier->watch < 0)
      goto fail_watch;

   ret = thrd_create(&notifier->thread, radv_notifier_thread_run, device);
   if (ret)
      goto fail_thread;

   return true;

fail_thread:
   inotify_rm_watch(notifier->fd, notifier->watch);
fail_watch:
   close(notifier->fd);

   return false;
#endif
}

static void
radv_device_finish_notifier(struct radv_device *device)
{
#ifdef __linux__
   struct radv_notifier *notifier = &device->notifier;

   if (!notifier->thread)
      return;

   notifier->quit = true;
   thrd_join(notifier->thread, NULL);
   inotify_rm_watch(notifier->fd, notifier->watch);
   close(notifier->fd);
#endif
}

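/* The perf counter BO reserves one 64-bit slot per pass starting at
 * PERF_CTR_BO_PASS_OFFSET, and perf_counter_lock_cs holds two command streams
 * per pass (created lazily elsewhere, destroyed in
 * radv_device_finish_perf_counter() below).
 */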
static VkResult
radv_device_init_perf_counter(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const size_t bo_size = PERF_CTR_BO_PASS_OFFSET + sizeof(uint64_t) * PERF_CTR_MAX_PASSES;
   VkResult result;

   result = radv_bo_create(device, NULL, bo_size, 4096, RADEON_DOMAIN_GTT,
                           RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING, RADV_BO_PRIORITY_UPLOAD_BUFFER,
                           0, true, &device->perf_counter_bo);
   if (result != VK_SUCCESS)
      return result;

   device->perf_counter_lock_cs = calloc(sizeof(struct radeon_winsys_cs *), 2 * PERF_CTR_MAX_PASSES);
   if (!device->perf_counter_lock_cs)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   if (!pdev->ac_perfcounters.blocks)
      return VK_ERROR_INITIALIZATION_FAILED;

   return VK_SUCCESS;
}

static void
radv_device_finish_perf_counter(struct radv_device *device)
{
   if (device->perf_counter_bo)
      radv_bo_destroy(device, NULL, device->perf_counter_bo);

   if (!device->perf_counter_lock_cs)
      return;

   for (unsigned i = 0; i < 2 * PERF_CTR_MAX_PASSES; ++i) {
      if (device->perf_counter_lock_cs[i])
         device->ws->cs_destroy(device->perf_counter_lock_cs[i]);
   }

   free(device->perf_counter_lock_cs);
}

static VkResult
radv_device_init_memory_cache(struct radv_device *device)
{
   struct vk_pipeline_cache_create_info info = {.weak_ref = true};

   device->mem_cache = vk_pipeline_cache_create(&device->vk, &info, NULL);
   if (!device->mem_cache)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   return VK_SUCCESS;
}

static void
radv_device_finish_memory_cache(struct radv_device *device)
{
   if (device->mem_cache)
      vk_pipeline_cache_destroy(device->mem_cache, NULL);
}

static VkResult
radv_device_init_rgp(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);

   if (!(instance->vk.trace_mode & RADV_TRACE_MODE_RGP))
      return VK_SUCCESS;

   if (pdev->info.gfx_level < GFX8 || pdev->info.gfx_level > GFX11_5) {
      fprintf(stderr, "GPU hardware not supported: refer to "
                      "the RGP documentation for the list of "
                      "supported GPUs!\n");
      abort();
   }

   if (!radv_sqtt_init(device))
      return VK_ERROR_INITIALIZATION_FAILED;

   fprintf(stderr,
           "radv: Thread trace support is enabled (initial buffer size: %u MiB, "
           "instruction timing: %s, cache counters: %s, queue events: %s).\n",
           device->sqtt.buffer_size / (1024 * 1024), radv_is_instruction_timing_enabled() ? "enabled" : "disabled",
           radv_spm_trace_enabled(instance) ? "enabled" : "disabled",
           radv_sqtt_queue_events_enabled() ? "enabled" : "disabled");

   if (radv_spm_trace_enabled(instance)) {
      if (pdev->info.gfx_level >= GFX10 && pdev->info.gfx_level < GFX11_5) {
         if (!radv_spm_init(device))
            return VK_ERROR_INITIALIZATION_FAILED;
      } else {
         fprintf(stderr, "radv: SPM isn't supported for this GPU (%s)!\n", pdev->name);
      }
   }

   return VK_SUCCESS;
}

static void
radv_device_finish_rgp(struct radv_device *device)
{
   radv_sqtt_finish(device);
   radv_spm_finish(device);
}

static void
radv_device_init_rmv(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);

   if (!(instance->vk.trace_mode & VK_TRACE_MODE_RMV))
      return;

   struct vk_rmv_device_info info;
   memset(&info, 0, sizeof(struct vk_rmv_device_info));
   radv_rmv_fill_device_info(pdev, &info);
   vk_memory_trace_init(&device->vk, &info);
   radv_memory_trace_init(device);
}

static VkResult
radv_device_init_trap_handler(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (!radv_trap_handler_enabled())
      return VK_SUCCESS;

   /* TODO: Add support for more hardware. */
   assert(pdev->info.gfx_level == GFX8);

   fprintf(stderr, "**********************************************************************\n");
   fprintf(stderr, "* WARNING: RADV_TRAP_HANDLER is experimental and only for debugging! *\n");
   fprintf(stderr, "**********************************************************************\n");

   if (!radv_trap_handler_init(device))
      return VK_ERROR_INITIALIZATION_FAILED;

   return VK_SUCCESS;
}

static VkResult
radv_device_init_device_fault_detection(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_instance *instance = radv_physical_device_instance(pdev);

   if (!radv_device_fault_detection_enabled(device))
      return VK_SUCCESS;

   if (!radv_init_trace(device))
      return VK_ERROR_INITIALIZATION_FAILED;

   fprintf(stderr, "*****************************************************************************\n");
   fprintf(stderr, "* WARNING: RADV_DEBUG=hang is costly and should only be used for debugging! *\n");
   fprintf(stderr, "*****************************************************************************\n");

   /* Wait for idle after every draw/dispatch to identify the
    * first bad call.
    */
   instance->debug_flags |= RADV_DEBUG_SYNC_SHADERS;

   radv_dump_enabled_options(device, stderr);

   return VK_SUCCESS;
}

static void
radv_device_finish_device_fault_detection(struct radv_device *device)
{
   radv_finish_trace(device);
   ralloc_free(device->gpu_hang_report);
}

static VkResult
radv_device_init_tools(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_instance *instance = radv_physical_device_instance(pdev);
   VkResult result;

   result = radv_device_init_device_fault_detection(device);
   if (result != VK_SUCCESS)
      return result;

   result = radv_device_init_rgp(device);
   if (result != VK_SUCCESS)
      return result;

   radv_device_init_rmv(device);

   result = radv_device_init_trap_handler(device);
   if (result != VK_SUCCESS)
      return result;

   if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev, false)) {
      result = radv_rra_trace_init(device);
      if (result != VK_SUCCESS)
         return result;
   }

   result = radv_printf_data_init(device);
   if (result != VK_SUCCESS)
      return result;

   return VK_SUCCESS;
}

static void
radv_device_finish_tools(struct radv_device *device)
{
   radv_printf_data_finish(device);
   radv_rra_trace_finish(radv_device_to_handle(device), &device->rra_trace);
   radv_trap_handler_finish(device);
   radv_memory_trace_finish(device);
   radv_device_finish_rgp(device);
   radv_device_finish_device_fault_detection(device);
}

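/* Layered device dispatch: optional layers (annotate, app workarounds, RGP,
 * RRA, RMV, ctx rolls) stack on top of the base device table.
 * add_entrypoints() copies the given entrypoints into every lower table that
 * is in use without overwriting already-filled slots, so each layer's table
 * falls through to the next enabled layer below it.
 */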
struct dispatch_table_builder {
   struct vk_device_dispatch_table *tables[RADV_DISPATCH_TABLE_COUNT];
   bool used[RADV_DISPATCH_TABLE_COUNT];
   bool initialized[RADV_DISPATCH_TABLE_COUNT];
};

static void
add_entrypoints(struct dispatch_table_builder *b, const struct vk_device_entrypoint_table *entrypoints,
                enum radv_dispatch_table table)
{
   for (int32_t i = table - 1; i >= RADV_DEVICE_DISPATCH_TABLE; i--) {
      if (i == RADV_DEVICE_DISPATCH_TABLE || b->used[i]) {
         vk_device_dispatch_table_from_entrypoints(b->tables[i], entrypoints, !b->initialized[i]);
         b->initialized[i] = true;
      }
   }

   if (table < RADV_DISPATCH_TABLE_COUNT)
      b->used[table] = true;
}

static void
init_dispatch_tables(struct radv_device *device, struct radv_physical_device *pdev)
{
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   struct dispatch_table_builder b = {0};
   b.tables[RADV_DEVICE_DISPATCH_TABLE] = &device->vk.dispatch_table;
   b.tables[RADV_ANNOTATE_DISPATCH_TABLE] = &device->layer_dispatch.annotate;
   b.tables[RADV_APP_DISPATCH_TABLE] = &device->layer_dispatch.app;
   b.tables[RADV_RGP_DISPATCH_TABLE] = &device->layer_dispatch.rgp;
   b.tables[RADV_RRA_DISPATCH_TABLE] = &device->layer_dispatch.rra;
   b.tables[RADV_RMV_DISPATCH_TABLE] = &device->layer_dispatch.rmv;
   b.tables[RADV_CTX_ROLL_DISPATCH_TABLE] = &device->layer_dispatch.ctx_roll;

   bool gather_ctx_rolls = instance->vk.trace_mode & RADV_TRACE_MODE_CTX_ROLLS;
   if (radv_device_fault_detection_enabled(device) || gather_ctx_rolls)
      add_entrypoints(&b, &annotate_device_entrypoints, RADV_ANNOTATE_DISPATCH_TABLE);

   if (!strcmp(instance->drirc.app_layer, "metroexodus")) {
      add_entrypoints(&b, &metro_exodus_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   } else if (!strcmp(instance->drirc.app_layer, "rage2")) {
      add_entrypoints(&b, &rage2_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   } else if (!strcmp(instance->drirc.app_layer, "quanticdream")) {
      add_entrypoints(&b, &quantic_dream_device_entrypoints, RADV_APP_DISPATCH_TABLE);
   }

   if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
      add_entrypoints(&b, &sqtt_device_entrypoints, RADV_RGP_DISPATCH_TABLE);

   if ((instance->vk.trace_mode & RADV_TRACE_MODE_RRA) && radv_enable_rt(pdev, false))
      add_entrypoints(&b, &rra_device_entrypoints, RADV_RRA_DISPATCH_TABLE);

#ifndef _WIN32
   if (instance->vk.trace_mode & VK_TRACE_MODE_RMV)
      add_entrypoints(&b, &rmv_device_entrypoints, RADV_RMV_DISPATCH_TABLE);
#endif

   if (gather_ctx_rolls)
      add_entrypoints(&b, &ctx_roll_device_entrypoints, RADV_CTX_ROLL_DISPATCH_TABLE);

   add_entrypoints(&b, &radv_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
   add_entrypoints(&b, &wsi_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
   add_entrypoints(&b, &vk_common_device_entrypoints, RADV_DISPATCH_TABLE_COUNT);
}

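/* Installed as device->vk.capture_trace: flags RRA/RGP captures as triggered
 * (consumed later by the queue submission code), dumps any in-memory RMV
 * trace right away and opens the context-roll output file.
 */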
static VkResult
capture_trace(VkQueue _queue)
{
   VK_FROM_HANDLE(radv_queue, queue, _queue);
   struct radv_device *device = radv_queue_device(queue);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);

   VkResult result = VK_SUCCESS;

   if (instance->vk.trace_mode & RADV_TRACE_MODE_RRA)
      device->rra_trace.triggered = true;

   if (device->vk.memory_trace_data.is_enabled) {
      simple_mtx_lock(&device->vk.memory_trace_data.token_mtx);
      radv_rmv_collect_trace_events(device);
      vk_dump_rmv_capture(&device->vk.memory_trace_data);
      simple_mtx_unlock(&device->vk.memory_trace_data.token_mtx);
   }

   if (instance->vk.trace_mode & RADV_TRACE_MODE_RGP)
      device->sqtt_triggered = true;

   if (instance->vk.trace_mode & RADV_TRACE_MODE_CTX_ROLLS) {
      char filename[2048];
      time_t t = time(NULL);
      struct tm now = *localtime(&t);
      snprintf(filename, sizeof(filename), "/tmp/%s_%04d.%02d.%02d_%02d.%02d.%02d.ctxroll", util_get_process_name(),
               1900 + now.tm_year, now.tm_mon + 1, now.tm_mday, now.tm_hour, now.tm_min, now.tm_sec);

      simple_mtx_lock(&device->ctx_roll_mtx);

      device->ctx_roll_file = fopen(filename, "w");
      if (device->ctx_roll_file)
         fprintf(stderr, "radv: Writing context rolls to '%s'...\n", filename);

      simple_mtx_unlock(&device->ctx_roll_mtx);
   }

   return result;
}

static void
radv_device_init_cache_key(struct radv_device *device)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   struct radv_device_cache_key *key = &device->cache_key;

   key->disable_trunc_coord = device->disable_trunc_coord;
   key->image_2d_view_of_3d = device->vk.enabled_features.image2DViewOf3D && pdev->info.gfx_level == GFX9;
   key->mesh_shader_queries = device->vk.enabled_features.meshShaderQueries;
   key->primitives_generated_query = radv_uses_primitives_generated_query(device);

   /* The Vulkan spec says:
    *  "Binary shaders retrieved from a physical device with a certain shaderBinaryUUID are
    *   guaranteed to be compatible with all other physical devices reporting the same
    *   shaderBinaryUUID and the same or higher shaderBinaryVersion."
    *
    * That means the driver should compile shaders for the "worst" case of all features being
    * enabled, regardless of what features are actually enabled on the logical device.
    */
   if (device->vk.enabled_features.shaderObject) {
      key->image_2d_view_of_3d = pdev->info.gfx_level == GFX9;
      key->primitives_generated_query = true;
   }

   _mesa_blake3_compute(key, sizeof(*key), device->cache_hash);
}

static void
radv_create_gfx_preamble(struct radv_device *device)
{
   struct radeon_cmdbuf *cs = device->ws->cs_create(device->ws, AMD_IP_GFX, false);
   if (!cs)
      return;

   radeon_check_space(device->ws, cs, 512);

   radv_emit_graphics(device, cs);

   device->ws->cs_pad(cs, 0);

   VkResult result = radv_bo_create(
      device, NULL, cs->cdw * 4, 4096, device->ws->cs_domain(device->ws),
      RADEON_FLAG_CPU_ACCESS | RADEON_FLAG_NO_INTERPROCESS_SHARING | RADEON_FLAG_READ_ONLY | RADEON_FLAG_GTT_WC,
      RADV_BO_PRIORITY_CS, 0, true, &device->gfx_init);
   if (result != VK_SUCCESS)
      goto fail;

   void *map = radv_buffer_map(device->ws, device->gfx_init);
   if (!map) {
      radv_bo_destroy(device, NULL, device->gfx_init);
      device->gfx_init = NULL;
      goto fail;
   }
   memcpy(map, cs->buf, cs->cdw * 4);

   device->ws->buffer_unmap(device->ws, device->gfx_init, false);
   device->gfx_init_size_dw = cs->cdw;
fail:
   device->ws->cs_destroy(cs);
}

/* For MSAA sample positions. */
#define FILL_SREG(s0x, s0y, s1x, s1y, s2x, s2y, s3x, s3y)                                                              \
   ((((unsigned)(s0x)&0xf) << 0) | (((unsigned)(s0y)&0xf) << 4) | (((unsigned)(s1x)&0xf) << 8) |                       \
    (((unsigned)(s1y)&0xf) << 12) | (((unsigned)(s2x)&0xf) << 16) | (((unsigned)(s2y)&0xf) << 20) |                    \
    (((unsigned)(s3x)&0xf) << 24) | (((unsigned)(s3y)&0xf) << 28))

/* For obtaining location coordinates from registers */
#define SEXT4(x)               ((int)((x) | ((x)&0x8 ? 0xfffffff0 : 0)))
#define GET_SFIELD(reg, index) SEXT4(((reg) >> ((index)*4)) & 0xf)
#define GET_SX(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2)
#define GET_SY(reg, index)     GET_SFIELD((reg)[(index) / 4], ((index) % 4) * 2 + 1)

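/* Sample locations are signed 4-bit offsets in 1/16th-pixel units relative to
 * the pixel center; radv_get_sample_position() below maps them to [0, 1), so
 * e.g. sample 0 of the 2x pattern, (4, 4), becomes (4 + 8) / 16.0 = 0.75.
 */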
/* 1x MSAA */
static const uint32_t sample_locs_1x = FILL_SREG(0, 0, 0, 0, 0, 0, 0, 0);
static const unsigned max_dist_1x = 0;
static const uint64_t centroid_priority_1x = 0x0000000000000000ull;

/* 2x MSAA */
static const uint32_t sample_locs_2x = FILL_SREG(4, 4, -4, -4, 0, 0, 0, 0);
static const unsigned max_dist_2x = 4;
static const uint64_t centroid_priority_2x = 0x1010101010101010ull;

/* 4x MSAA */
static const uint32_t sample_locs_4x = FILL_SREG(-2, -6, 6, -2, -6, 2, 2, 6);
static const unsigned max_dist_4x = 6;
static const uint64_t centroid_priority_4x = 0x3210321032103210ull;

/* 8x MSAA */
static const uint32_t sample_locs_8x[] = {
   FILL_SREG(1, -3, -1, 3, 5, 1, -3, -5),
   FILL_SREG(-5, 5, -7, -1, 3, 7, 7, -7),
   /* The following are unused by hardware, but we emit them to IBs
    * instead of multiple SET_CONTEXT_REG packets. */
   0,
   0,
};
static const unsigned max_dist_8x = 7;
static const uint64_t centroid_priority_8x = 0x7654321076543210ull;

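/* Indexed by log2(sample count): 0 -> 1x, 1 -> 2x, 2 -> 4x, 3 -> 8x. */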
unsigned
radv_get_default_max_sample_dist(int log_samples)
{
   unsigned max_dist[] = {
      max_dist_1x,
      max_dist_2x,
      max_dist_4x,
      max_dist_8x,
   };
   return max_dist[log_samples];
}

void
radv_emit_default_sample_locations(const struct radv_physical_device *pdev, struct radeon_cmdbuf *cs, int nr_samples)
{
   uint64_t centroid_priority;

   switch (nr_samples) {
   default:
   case 1:
      centroid_priority = centroid_priority_1x;

      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_1x);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_1x);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_1x);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_1x);
      break;
   case 2:
      centroid_priority = centroid_priority_2x;

      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_2x);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_2x);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_2x);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_2x);
      break;
   case 4:
      centroid_priority = centroid_priority_4x;

      radeon_set_context_reg(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, sample_locs_4x);
      radeon_set_context_reg(cs, R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, sample_locs_4x);
      radeon_set_context_reg(cs, R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, sample_locs_4x);
      radeon_set_context_reg(cs, R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, sample_locs_4x);
      break;
   case 8:
      centroid_priority = centroid_priority_8x;

      radeon_set_context_reg_seq(cs, R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 14);
      radeon_emit_array(cs, sample_locs_8x, 4);
      radeon_emit_array(cs, sample_locs_8x, 4);
      radeon_emit_array(cs, sample_locs_8x, 4);
      radeon_emit_array(cs, sample_locs_8x, 2);
      break;
   }

   if (pdev->info.gfx_level >= GFX12) {
      radeon_set_context_reg_seq(cs, R_028BF0_PA_SC_CENTROID_PRIORITY_0, 2);
   } else {
      radeon_set_context_reg_seq(cs, R_028BD4_PA_SC_CENTROID_PRIORITY_0, 2);
   }
   radeon_emit(cs, centroid_priority);
   radeon_emit(cs, centroid_priority >> 32);
}

static void
radv_get_sample_position(struct radv_device *device, unsigned sample_count, unsigned sample_index, float *out_value)
{
   const uint32_t *sample_locs;

   switch (sample_count) {
   case 1:
   default:
      sample_locs = &sample_locs_1x;
      break;
   case 2:
      sample_locs = &sample_locs_2x;
      break;
   case 4:
      sample_locs = &sample_locs_4x;
      break;
   case 8:
      sample_locs = sample_locs_8x;
      break;
   }

   out_value[0] = (GET_SX(sample_locs, sample_index) + 8) / 16.0f;
   out_value[1] = (GET_SY(sample_locs, sample_index) + 8) / 16.0f;
}

static void
radv_device_init_msaa(struct radv_device *device)
{
   int i;

   radv_get_sample_position(device, 1, 0, device->sample_locations_1x[0]);

   for (i = 0; i < 2; i++)
      radv_get_sample_position(device, 2, i, device->sample_locations_2x[i]);
   for (i = 0; i < 4; i++)
      radv_get_sample_position(device, 4, i, device->sample_locations_4x[i]);
   for (i = 0; i < 8; i++)
      radv_get_sample_position(device, 8, i, device->sample_locations_8x[i]);
}

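/* Device creation in rough order: pNext parsing, vk_device_init, dispatch
 * tables and locks, one hardware context per requested queue priority, the
 * queues, the shader upload queue, debug/trace tooling, meta shaders, then
 * feature-gated extras (border colors, VS prologs, PS epilogs, caches, perf
 * counters). Failure paths unwind through the fail_* labels in reverse.
 */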
VKAPI_ATTR VkResult VKAPI_CALL
radv_CreateDevice(VkPhysicalDevice physicalDevice, const VkDeviceCreateInfo *pCreateInfo,
                  const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
{
   VK_FROM_HANDLE(radv_physical_device, pdev, physicalDevice);
   struct radv_instance *instance = radv_physical_device_instance(pdev);
   VkResult result;
   struct radv_device *device;

   bool overallocation_disallowed = false;

   vk_foreach_struct_const (ext, pCreateInfo->pNext) {
      switch (ext->sType) {
      case VK_STRUCTURE_TYPE_DEVICE_MEMORY_OVERALLOCATION_CREATE_INFO_AMD: {
         const VkDeviceMemoryOverallocationCreateInfoAMD *overallocation = (const void *)ext;
         if (overallocation->overallocationBehavior == VK_MEMORY_OVERALLOCATION_BEHAVIOR_DISALLOWED_AMD)
            overallocation_disallowed = true;
         break;
      }
      default:
         break;
      }
   }

   device = vk_zalloc2(&instance->vk.alloc, pAllocator, sizeof(*device), 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!device)
      return vk_error(instance, VK_ERROR_OUT_OF_HOST_MEMORY);

   result = vk_device_init(&device->vk, &pdev->vk, NULL, pCreateInfo, pAllocator);
   if (result != VK_SUCCESS) {
      vk_free(&device->vk.alloc, device);
      return result;
   }

   device->vk.capture_trace = capture_trace;

   device->vk.command_buffer_ops = &radv_cmd_buffer_ops;

   init_dispatch_tables(device, pdev);

   simple_mtx_init(&device->ctx_roll_mtx, mtx_plain);
   simple_mtx_init(&device->trace_mtx, mtx_plain);
   simple_mtx_init(&device->pstate_mtx, mtx_plain);
   simple_mtx_init(&device->rt_handles_mtx, mtx_plain);
   simple_mtx_init(&device->compute_scratch_mtx, mtx_plain);
   simple_mtx_init(&device->pso_cache_stats_mtx, mtx_plain);

   device->rt_handles = _mesa_hash_table_create(NULL, _mesa_hash_u32, _mesa_key_u32_equal);

   device->ws = pdev->ws;
   vk_device_set_drm_fd(&device->vk, device->ws->get_fd(device->ws));

   /* With update after bind we can't attach BOs to the command buffer
    * from the descriptor set anymore, so we have to use a global BO list.
    */
   device->use_global_bo_list =
      (instance->perftest_flags & RADV_PERFTEST_BO_LIST) || device->vk.enabled_features.bufferDeviceAddress ||
      device->vk.enabled_features.descriptorIndexing || device->vk.enabled_extensions.EXT_descriptor_indexing ||
      device->vk.enabled_extensions.EXT_buffer_device_address ||
      device->vk.enabled_extensions.KHR_buffer_device_address ||
      device->vk.enabled_extensions.KHR_ray_tracing_pipeline ||
      device->vk.enabled_extensions.KHR_acceleration_structure ||
      device->vk.enabled_extensions.VALVE_descriptor_set_host_mapping;

   radv_init_shader_arenas(device);

   device->overallocation_disallowed = overallocation_disallowed;
   mtx_init(&device->overallocation_mutex, mtx_plain);

   if (pdev->info.register_shadowing_required || instance->debug_flags & RADV_DEBUG_SHADOW_REGS)
      device->uses_shadow_regs = true;

   /* Create one context per queue priority. */
   for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
      const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
      const VkDeviceQueueGlobalPriorityCreateInfoKHR *global_priority =
         vk_find_struct_const(queue_create->pNext, DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);
      enum radeon_ctx_priority priority = radv_get_queue_global_priority(global_priority);

      if (device->hw_ctx[priority])
         continue;

      result = device->ws->ctx_create(device->ws, priority, &device->hw_ctx[priority]);
      if (result != VK_SUCCESS)
         goto fail_queue;
   }

   for (unsigned i = 0; i < pCreateInfo->queueCreateInfoCount; i++) {
      const VkDeviceQueueCreateInfo *queue_create = &pCreateInfo->pQueueCreateInfos[i];
      uint32_t qfi = queue_create->queueFamilyIndex;
      const VkDeviceQueueGlobalPriorityCreateInfoKHR *global_priority =
         vk_find_struct_const(queue_create->pNext, DEVICE_QUEUE_GLOBAL_PRIORITY_CREATE_INFO_KHR);

      device->queues[qfi] = vk_zalloc(&device->vk.alloc, queue_create->queueCount * sizeof(struct radv_queue), 8,
                                      VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
      if (!device->queues[qfi]) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto fail_queue;
      }

      device->queue_count[qfi] = queue_create->queueCount;

      for (unsigned q = 0; q < queue_create->queueCount; q++) {
         result = radv_queue_init(device, &device->queues[qfi][q], q, queue_create, global_priority);
         if (result != VK_SUCCESS)
            goto fail_queue;
      }
   }
   device->private_sdma_queue = VK_NULL_HANDLE;

   device->shader_use_invisible_vram = (instance->perftest_flags & RADV_PERFTEST_DMA_SHADERS) &&
                                       /* SDMA buffer copy is only implemented for GFX7+. */
                                       pdev->info.gfx_level >= GFX7;
   result = radv_init_shader_upload_queue(device);
   if (result != VK_SUCCESS)
      goto fail;

   device->pbb_allowed = pdev->info.gfx_level >= GFX9 && !(instance->debug_flags & RADV_DEBUG_NOBINNING);

   device->disable_trunc_coord = instance->drirc.disable_trunc_coord;

   if (instance->vk.app_info.engine_name && !strcmp(instance->vk.app_info.engine_name, "DXVK")) {
      /* For DXVK 2.3.0 and older, use dualSrcBlend to determine if this is D3D9. */
      bool is_d3d9 = !device->vk.enabled_features.dualSrcBlend;
      if (instance->vk.app_info.engine_version > VK_MAKE_VERSION(2, 3, 0))
         is_d3d9 = instance->vk.app_info.app_version & 0x1;

      device->disable_trunc_coord &= !is_d3d9;
   }

   /* The maximum number of scratch waves. Scratch space isn't divided
    * evenly between CUs. The number is only a function of the number of CUs.
    * We can decrease the constant to decrease the scratch buffer size.
    *
    * sctx->scratch_waves must be >= the maximum possible size of
    * 1 threadgroup, so that the hw doesn't hang from being unable
    * to start any.
    *
    * The recommended value is 4 per CU at most. Higher numbers don't
    * bring much benefit, but they still occupy chip resources (think
    * async compute). I've seen ~2% performance difference between 4 and 32.
    */
   uint32_t max_threads_per_block = 2048;
   device->scratch_waves = MAX2(32 * pdev->info.num_cu, max_threads_per_block / 64);

   device->dispatch_initiator = S_00B800_COMPUTE_SHADER_EN(1);

   if (pdev->info.gfx_level >= GFX7) {
      /* If the KMD allows it (there is a KMD hw register for it),
       * allow launching waves out-of-order.
       */
      device->dispatch_initiator |= S_00B800_ORDER_MODE(1);
   }
   if (pdev->info.gfx_level >= GFX10) {
      /* Enable asynchronous compute tunneling. The KMD restricts this feature
       * to high-priority compute queues, so setting the bit on any other queue
       * is a no-op. PAL always sets this bit as well.
       */
      device->dispatch_initiator |= S_00B800_TUNNEL_ENABLE(1);
   }

   /* Disable partial preemption for task shaders.
    * The kernel may not support preemption, but PAL always sets this bit,
    * so let's also set it here for consistency.
    */
   device->dispatch_initiator_task = device->dispatch_initiator | S_00B800_DISABLE_DISP_PREMPT_EN(1);

   if (pdev->info.gfx_level == GFX10_3) {
      if (getenv("RADV_FORCE_VRS_CONFIG_FILE")) {
         const char *file = radv_get_force_vrs_config_file();

         device->force_vrs = radv_parse_force_vrs_config_file(file);

         if (radv_device_init_notifier(device)) {
            device->force_vrs_enabled = true;
         } else {
            fprintf(stderr, "radv: Failed to initialize the notifier for RADV_FORCE_VRS_CONFIG_FILE!\n");
         }
      } else if (getenv("RADV_FORCE_VRS")) {
         const char *vrs_rates = getenv("RADV_FORCE_VRS");

         device->force_vrs = radv_parse_vrs_rates(vrs_rates);
         device->force_vrs_enabled = device->force_vrs != RADV_FORCE_VRS_1x1;
      }
   }

   /* PKT3_LOAD_SH_REG_INDEX is supported on GFX8+, but it hangs with compute queues until GFX10.3. */
   device->load_grid_size_from_user_sgpr = pdev->info.gfx_level >= GFX10_3;

   /* Keep shader info for GPU hangs debugging. */
   device->keep_shader_info = radv_device_fault_detection_enabled(device) || radv_trap_handler_enabled();

   /* Initialize the per-device cache key before compiling meta shaders. */
   radv_device_init_cache_key(device);

   result = radv_device_init_tools(device);
   if (result != VK_SUCCESS)
      goto fail;

   result = radv_device_init_meta(device);
   if (result != VK_SUCCESS)
      goto fail;

   radv_device_init_msaa(device);

   /* If the border color extension is enabled, let's create the buffer we need. */
   if (device->vk.enabled_features.customBorderColors) {
      result = radv_device_init_border_color(device);
      if (result != VK_SUCCESS)
         goto fail;
   }

   if (device->vk.enabled_features.vertexInputDynamicState || device->vk.enabled_features.graphicsPipelineLibrary ||
       device->vk.enabled_features.shaderObject) {
      result = radv_device_init_vs_prologs(device);
      if (result != VK_SUCCESS)
         goto fail;
   }

   if (device->vk.enabled_features.graphicsPipelineLibrary || device->vk.enabled_features.shaderObject ||
       device->vk.enabled_features.extendedDynamicState3ColorBlendEnable ||
       device->vk.enabled_features.extendedDynamicState3ColorWriteMask ||
       device->vk.enabled_features.extendedDynamicState3AlphaToCoverageEnable ||
       device->vk.enabled_features.extendedDynamicState3ColorBlendEquation) {
      if (!radv_shader_part_cache_init(&device->ps_epilogs, &ps_epilog_ops)) {
         result = VK_ERROR_OUT_OF_HOST_MEMORY;
         goto fail;
      }
   }

   if (!(instance->debug_flags & RADV_DEBUG_NO_IBS))
      radv_create_gfx_preamble(device);

   if (!device->vk.disable_internal_cache) {
      result = radv_device_init_memory_cache(device);
      if (result != VK_SUCCESS)
         goto fail_meta;
   }

   device->force_aniso = MIN2(16, (int)debug_get_num_option("RADV_TEX_ANISO", -1));
   if (device->force_aniso >= 0) {
      fprintf(stderr, "radv: Forcing anisotropy filter to %ix\n", 1 << util_logbase2(device->force_aniso));
   }

   if (device->vk.enabled_features.performanceCounterQueryPools) {
      result = radv_device_init_perf_counter(device);
      if (result != VK_SUCCESS)
         goto fail_cache;
   }

   if (device->vk.enabled_features.rayTracingPipelineShaderGroupHandleCaptureReplay) {
      device->capture_replay_arena_vas = _mesa_hash_table_u64_create(NULL);
   }

   if (pdev->info.gfx_level == GFX11 && pdev->info.has_dedicated_vram && instance->drirc.force_pstate_peak_gfx11_dgpu) {
      if (!radv_device_acquire_performance_counters(device))
         fprintf(stderr, "radv: failed to set pstate to profile_peak.\n");
   }

   *pDevice = radv_device_to_handle(device);
   return VK_SUCCESS;

fail_cache:
   radv_device_finish_memory_cache(device);
fail_meta:
   radv_device_finish_meta(device);
fail:
   radv_device_finish_perf_counter(device);

   radv_device_finish_tools(device);

   if (device->gfx_init)
      radv_bo_destroy(device, NULL, device->gfx_init);

   radv_device_finish_notifier(device);
   radv_device_finish_vs_prologs(device);
   if (device->ps_epilogs.ops)
      radv_shader_part_cache_finish(device, &device->ps_epilogs);
   radv_device_finish_border_color(device);

   radv_destroy_shader_upload_queue(device);

fail_queue:
   for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
      for (unsigned q = 0; q < device->queue_count[i]; q++)
         radv_queue_finish(&device->queues[i][q]);
      if (device->queue_count[i])
         vk_free(&device->vk.alloc, device->queues[i]);
   }

   for (unsigned i = 0; i < RADV_NUM_HW_CTX; i++) {
      if (device->hw_ctx[i])
         device->ws->ctx_destroy(device->hw_ctx[i]);
   }

   radv_destroy_shader_arenas(device);

   _mesa_hash_table_destroy(device->rt_handles, NULL);

   simple_mtx_destroy(&device->ctx_roll_mtx);
   simple_mtx_destroy(&device->pstate_mtx);
   simple_mtx_destroy(&device->trace_mtx);
   simple_mtx_destroy(&device->rt_handles_mtx);
   simple_mtx_destroy(&device->compute_scratch_mtx);
   simple_mtx_destroy(&device->pso_cache_stats_mtx);
   mtx_destroy(&device->overallocation_mutex);

   vk_device_finish(&device->vk);
   vk_free(&device->vk.alloc, device);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
radv_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(radv_device, device, _device);

   if (!device)
      return;

   radv_device_finish_perf_counter(device);

   if (device->gfx_init)
      radv_bo_destroy(device, NULL, device->gfx_init);

   radv_device_finish_notifier(device);
   radv_device_finish_vs_prologs(device);
   if (device->ps_epilogs.ops)
      radv_shader_part_cache_finish(device, &device->ps_epilogs);
   radv_device_finish_border_color(device);
   radv_device_finish_vrs_image(device);

   for (unsigned i = 0; i < RADV_MAX_QUEUE_FAMILIES; i++) {
      for (unsigned q = 0; q < device->queue_count[i]; q++)
         radv_queue_finish(&device->queues[i][q]);
      if (device->queue_count[i])
         vk_free(&device->vk.alloc, device->queues[i]);
   }
   if (device->private_sdma_queue != VK_NULL_HANDLE) {
      radv_queue_finish(device->private_sdma_queue);
      vk_free(&device->vk.alloc, device->private_sdma_queue);
   }

   _mesa_hash_table_destroy(device->rt_handles, NULL);

   radv_device_finish_meta(device);

   radv_device_finish_memory_cache(device);

   radv_destroy_shader_upload_queue(device);

   for (unsigned i = 0; i < RADV_NUM_HW_CTX; i++) {
      if (device->hw_ctx[i])
         device->ws->ctx_destroy(device->hw_ctx[i]);
   }

   mtx_destroy(&device->overallocation_mutex);
   simple_mtx_destroy(&device->ctx_roll_mtx);
   simple_mtx_destroy(&device->pstate_mtx);
   simple_mtx_destroy(&device->trace_mtx);
   simple_mtx_destroy(&device->rt_handles_mtx);
   simple_mtx_destroy(&device->compute_scratch_mtx);
   simple_mtx_destroy(&device->pso_cache_stats_mtx);

   radv_destroy_shader_arenas(device);
   if (device->capture_replay_arena_vas)
      _mesa_hash_table_u64_destroy(device->capture_replay_arena_vas);

   vk_device_finish(&device->vk);
   vk_free(&device->vk.alloc, device);
}

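/* Export the buffer backing 'memory' as a file descriptor; used by
 * radv_GetMemoryFdKHR() below to service VK_KHR_external_memory_fd.
 * Returns false if the winsys cannot export the BO. */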
bool
radv_get_memory_fd(struct radv_device *device, struct radv_device_memory *memory, int *pFD)
{
   /* Set BO metadata for dedicated image allocations. We don't need it for import when the image
    * tiling is VK_IMAGE_TILING_DRM_FORMAT_MODIFIER_EXT, but we set it anyway for foreign consumers.
    */
   if (memory->image) {
      struct radeon_bo_metadata metadata;

      assert(memory->image->bindings[0].offset == 0);
      radv_init_metadata(device, memory->image, &metadata);
      device->ws->buffer_set_metadata(device->ws, memory->bo, &metadata);
   }

   return device->ws->buffer_get_fd(device->ws, memory->bo, pFD);
}

VKAPI_ATTR void VKAPI_CALL
radv_GetImageMemoryRequirements2(VkDevice _device, const VkImageMemoryRequirementsInfo2 *pInfo,
                                 VkMemoryRequirements2 *pMemoryRequirements)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(radv_image, image, pInfo->image);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   uint32_t alignment;
   uint64_t size;

   const VkImagePlaneMemoryRequirementsInfo *plane_info =
      vk_find_struct_const(pInfo->pNext, IMAGE_PLANE_MEMORY_REQUIREMENTS_INFO);

   if (plane_info) {
      const uint32_t plane = radv_plane_from_aspect(plane_info->planeAspect);

      size = image->planes[plane].surface.total_size;
      alignment = 1 << image->planes[plane].surface.alignment_log2;
   } else {
      size = image->size;
      alignment = image->alignment;
   }

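   /* Report every exposed memory type except the 32-bit ones, which are
    * assumed to be reserved for driver-internal allocations that need
    * 32-bit GPU addresses (see also radv_compute_valid_memory_types()). */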
   pMemoryRequirements->memoryRequirements.memoryTypeBits =
      ((1u << pdev->memory_properties.memoryTypeCount) - 1u) & ~pdev->memory_types_32bit;

   pMemoryRequirements->memoryRequirements.size = size;
   pMemoryRequirements->memoryRequirements.alignment = alignment;

   vk_foreach_struct (ext, pMemoryRequirements->pNext) {
      switch (ext->sType) {
      case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_REQUIREMENTS: {
         VkMemoryDedicatedRequirements *req = (VkMemoryDedicatedRequirements *)ext;
         req->requiresDedicatedAllocation = image->shareable && image->vk.tiling != VK_IMAGE_TILING_LINEAR;
         req->prefersDedicatedAllocation = req->requiresDedicatedAllocation;
         break;
      }
      default:
         break;
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
radv_GetDeviceImageMemoryRequirements(VkDevice device, const VkDeviceImageMemoryRequirements *pInfo,
                                      VkMemoryRequirements2 *pMemoryRequirements)
{
   UNUSED VkResult result;
   VkImage image;

   /* Determining the image size/alignment requires creating a surface, which is complicated
    * without creating an image.
    * TODO: Avoid creating an image.
    */
   result =
      radv_image_create(device, &(struct radv_image_create_info){.vk_info = pInfo->pCreateInfo}, NULL, &image, true);
   assert(result == VK_SUCCESS);

   VkImageMemoryRequirementsInfo2 info2 = {
      .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_REQUIREMENTS_INFO_2,
      .image = image,
   };

   radv_GetImageMemoryRequirements2(device, &info2, pMemoryRequirements);

   radv_DestroyImage(device, image, NULL);
}

static uint32_t
radv_surface_max_layer_count(struct radv_image_view *iview)
{
   return iview->vk.view_type == VK_IMAGE_VIEW_TYPE_3D ? iview->extent.depth
                                                       : (iview->vk.base_array_layer + iview->vk.layer_count);
}

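/* Pick the maximum DCC uncompressed block size: pre-GFX10 MSAA surfaces
 * get smaller blocks for 8-bit (64B) and 16-bit (128B) texels; everything
 * else uses 256B. */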
unsigned
radv_get_dcc_max_uncompressed_block_size(const struct radv_device *device, const struct radv_image *image)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);

   if (pdev->info.gfx_level < GFX10 && image->vk.samples > 1) {
      if (image->planes[0].surface.bpe == 1)
         return V_028C78_MAX_BLOCK_SIZE_64B;
      else if (image->planes[0].surface.bpe == 2)
         return V_028C78_MAX_BLOCK_SIZE_128B;
   }

   return V_028C78_MAX_BLOCK_SIZE_256B;
}

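/* Fill a color buffer descriptor for 'iview'. The setup is split between
 * ac_init_cb_surface(), which derives register state from the surface and
 * format, and ac_set_mutable_cb_surface_fields(), which applies the state
 * that depends on the binding (VA, FMASK/CMASK/DCC enables). */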
void
radv_initialise_color_surface(struct radv_device *device, struct radv_color_buffer_info *cb,
                              struct radv_image_view *iview)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   uint64_t va;
   const struct radv_image_plane *plane = &iview->image->planes[iview->plane_id];
   const struct radeon_surf *surf = &plane->surface;

   memset(cb, 0, sizeof(*cb));

   const unsigned num_layers =
      iview->image->vk.image_type == VK_IMAGE_TYPE_3D ? (iview->extent.depth - 1) : (iview->image->vk.array_layers - 1);

   const struct ac_cb_state cb_state = {
      .surf = surf,
      .format = vk_format_to_pipe_format(iview->vk.format),
      .width = vk_format_get_plane_width(iview->image->vk.format, iview->plane_id, iview->extent.width),
      .height = vk_format_get_plane_height(iview->image->vk.format, iview->plane_id, iview->extent.height),
      .first_layer = iview->vk.base_array_layer,
      .last_layer = radv_surface_max_layer_count(iview) - 1,
      .num_layers = num_layers,
      .num_samples = iview->image->vk.samples,
      .num_storage_samples = iview->image->vk.samples,
      .base_level = iview->vk.base_mip_level,
      .num_levels = iview->image->vk.mip_levels,
      .gfx10 =
         {
            .nbc_view = iview->nbc_view.valid ? &iview->nbc_view : NULL,
         },
   };

   ac_init_cb_surface(&pdev->info, &cb_state, &cb->ac);

   uint32_t plane_id = iview->image->disjoint ? iview->plane_id : 0;
   va = radv_image_get_va(iview->image, plane_id);

   const struct ac_mutable_cb_state mutable_cb_state = {
      .surf = surf,
      .cb = &cb->ac,
      .va = va,
      .base_level = iview->vk.base_mip_level,
      .num_samples = iview->image->vk.samples,
      .fmask_enabled = radv_image_has_fmask(iview->image),
      .cmask_enabled = radv_image_has_cmask(iview->image),
      .fast_clear_enabled = !(instance->debug_flags & RADV_DEBUG_NO_FAST_CLEARS),
      .tc_compat_cmask_enabled = radv_image_is_tc_compat_cmask(iview->image),
      .dcc_enabled = radv_dcc_enabled(iview->image, iview->vk.base_mip_level) &&
                     (pdev->info.gfx_level >= GFX11 || !iview->disable_dcc_mrt),
      .gfx10 =
         {
            .nbc_view = iview->nbc_view.valid ? &iview->nbc_view : NULL,
         },
   };

   ac_set_mutable_cb_surface_fields(&pdev->info, &mutable_cb_state, &cb->ac);
}

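/* Describe a VRS attachment as a D16 depth surface whose HTILE encodes
 * per-tile shading rates (4-bit VRS encoding); stencil is marked invalid
 * and the HTILE comes from the separately allocated 'htile_buffer'. */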
void
radv_initialise_vrs_surface(struct radv_image *image, struct radv_buffer *htile_buffer, struct radv_ds_buffer_info *ds)
{
   const struct radeon_surf *surf = &image->planes[0].surface;

   assert(image->vk.format == VK_FORMAT_D16_UNORM);
   memset(ds, 0, sizeof(*ds));

   ds->ac.db_z_info = S_028038_FORMAT(V_028040_Z_16) | S_028038_SW_MODE(surf->u.gfx9.swizzle_mode) |
                      S_028038_ZRANGE_PRECISION(1) | S_028038_TILE_SURFACE_ENABLE(1);
   ds->ac.db_stencil_info = S_02803C_FORMAT(V_028044_STENCIL_INVALID);

   ds->ac.db_depth_size = S_02801C_X_MAX(image->vk.extent.width - 1) | S_02801C_Y_MAX(image->vk.extent.height - 1);

   ds->ac.u.gfx6.db_htile_data_base = radv_buffer_get_va(htile_buffer->bo) >> 8;
   ds->ac.u.gfx6.db_htile_surface =
      S_028ABC_FULL_CACHE(1) | S_028ABC_PIPE_ALIGNED(1) | S_028ABC_VRS_HTILE_ENCODING(V_028ABC_VRS_HTILE_4BIT_ENCODING);
}

void
radv_initialise_ds_surface(const struct radv_device *device, struct radv_ds_buffer_info *ds,
                           struct radv_image_view *iview, VkImageAspectFlags ds_aspects)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   unsigned level = iview->vk.base_mip_level;
   bool stencil_only = iview->image->vk.format == VK_FORMAT_S8_UINT;

   assert(vk_format_get_plane_count(iview->image->vk.format) == 1);

   memset(ds, 0, sizeof(*ds));

   uint32_t max_slice = radv_surface_max_layer_count(iview) - 1;

   /* Recommended value for better performance with 4x and 8x. */
   ds->db_render_override2 = S_028010_DECOMPRESS_Z_ON_FLUSH(iview->image->vk.samples >= 4) |
                             S_028010_CENTROID_COMPUTATION_MODE(pdev->info.gfx_level >= GFX10_3);

   const struct ac_ds_state ds_state = {
      .surf = &iview->image->planes[0].surface,
      .va = radv_image_get_va(iview->image, 0),
      .format = vk_format_to_pipe_format(iview->image->vk.format),
      .width = iview->image->vk.extent.width,
      .height = iview->image->vk.extent.height,
      .level = level,
      .num_levels = iview->image->vk.mip_levels,
      .num_samples = iview->image->vk.samples,
      .first_layer = iview->vk.base_array_layer,
      .last_layer = max_slice,
      .stencil_only = stencil_only,
      .z_read_only = !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT),
      .stencil_read_only = !(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT),
      .htile_enabled = radv_htile_enabled(iview->image, level),
      .htile_stencil_disabled = radv_image_tile_stencil_disabled(device, iview->image),
      .vrs_enabled = radv_image_has_vrs_htile(device, iview->image),
   };

   ac_init_ds_surface(&pdev->info, &ds_state, &ds->ac);

   const struct ac_mutable_ds_state mutable_ds_state = {
      .ds = &ds->ac,
      .format = vk_format_to_pipe_format(iview->image->vk.format),
      .tc_compat_htile_enabled = radv_htile_enabled(iview->image, level) && radv_image_is_tc_compat_htile(iview->image),
      .zrange_precision = true,
      .no_d16_compression = true,
   };

   ac_set_mutable_ds_surface_fields(&pdev->info, &mutable_ds_state, &ds->ac);

   if (pdev->info.gfx_level >= GFX11) {
      radv_gfx11_set_db_render_control(device, iview->image->vk.samples, &ds->db_render_control);
   }
}

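/* GFX11: limit how many tiles a DB wave may process when MSAA is enabled.
 * The values below are assumed to be the recommended settings for dGPUs
 * (dedicated VRAM) and APUs respectively; 0 is assumed to keep the hardware
 * default (no extra limit) for 1x/2x sampling. */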
void
radv_gfx11_set_db_render_control(const struct radv_device *device, unsigned num_samples, unsigned *db_render_control)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   unsigned max_allowed_tiles_in_wave = 0;

   if (pdev->info.has_dedicated_vram) {
      if (num_samples == 8)
         max_allowed_tiles_in_wave = 6;
      else if (num_samples == 4)
         max_allowed_tiles_in_wave = 13;
      else
         max_allowed_tiles_in_wave = 0;
   } else {
      if (num_samples == 8)
         max_allowed_tiles_in_wave = 7;
      else if (num_samples == 4)
         max_allowed_tiles_in_wave = 15;
      else
         max_allowed_tiles_in_wave = 0;
   }

   *db_render_control |= S_028000_MAX_ALLOWED_TILES_IN_WAVE(max_allowed_tiles_in_wave);
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_GetMemoryFdKHR(VkDevice _device, const VkMemoryGetFdInfoKHR *pGetFdInfo, int *pFD)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   VK_FROM_HANDLE(radv_device_memory, memory, pGetFdInfo->memory);

   assert(pGetFdInfo->sType == VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR);

   /* At the moment, we support only the below handle types. */
   assert(pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT ||
          pGetFdInfo->handleType == VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT);

   if (!radv_get_memory_fd(device, memory, pFD))
      return vk_error(device, VK_ERROR_OUT_OF_DEVICE_MEMORY);
   return VK_SUCCESS;
}

static uint32_t
radv_compute_valid_memory_types_attempt(struct radv_physical_device *pdev, enum radeon_bo_domain domains,
                                        enum radeon_bo_flag flags, enum radeon_bo_flag ignore_flags)
{
   /* Don't count GTT/CPU as relevant:
    *
    * - We're not fully consistent between the two.
    * - Sometimes VRAM gets VRAM|GTT.
    */
   const enum radeon_bo_domain relevant_domains = RADEON_DOMAIN_VRAM | RADEON_DOMAIN_GDS | RADEON_DOMAIN_OA;
   uint32_t bits = 0;
   for (unsigned i = 0; i < pdev->memory_properties.memoryTypeCount; ++i) {
      if ((domains & relevant_domains) != (pdev->memory_domains[i] & relevant_domains))
         continue;

      if ((flags & ~ignore_flags) != (pdev->memory_flags[i] & ~ignore_flags))
         continue;

      bits |= 1u << i;
   }

   return bits;
}

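/* Map an imported BO's domains/flags to exposed memory types. The first
 * attempt requires RADEON_FLAG_NO_CPU_ACCESS and RADEON_FLAG_GTT_WC to
 * match exactly; if nothing qualifies, each of the two is relaxed in turn
 * so a well-formed import always ends up with at least one usable type. */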
static uint32_t
radv_compute_valid_memory_types(struct radv_physical_device *pdev, enum radeon_bo_domain domains,
                                enum radeon_bo_flag flags)
{
   enum radeon_bo_flag ignore_flags = ~(RADEON_FLAG_NO_CPU_ACCESS | RADEON_FLAG_GTT_WC);
   uint32_t bits = radv_compute_valid_memory_types_attempt(pdev, domains, flags, ignore_flags);

   if (!bits) {
      ignore_flags |= RADEON_FLAG_GTT_WC;
      bits = radv_compute_valid_memory_types_attempt(pdev, domains, flags, ignore_flags);
   }

   if (!bits) {
      ignore_flags |= RADEON_FLAG_NO_CPU_ACCESS;
      bits = radv_compute_valid_memory_types_attempt(pdev, domains, flags, ignore_flags);
   }

   /* Avoid 32-bit memory types for shared memory. */
   bits &= ~pdev->memory_types_32bit;

   return bits;
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_GetMemoryFdPropertiesKHR(VkDevice _device, VkExternalMemoryHandleTypeFlagBits handleType, int fd,
                              VkMemoryFdPropertiesKHR *pMemoryFdProperties)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   struct radv_physical_device *pdev = radv_device_physical(device);

   switch (handleType) {
   case VK_EXTERNAL_MEMORY_HANDLE_TYPE_DMA_BUF_BIT_EXT: {
      enum radeon_bo_domain domains;
      enum radeon_bo_flag flags;
      if (!device->ws->buffer_get_flags_from_fd(device->ws, fd, &domains, &flags))
         return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);

      pMemoryFdProperties->memoryTypeBits = radv_compute_valid_memory_types(pdev, domains, flags);
      return VK_SUCCESS;
   }
   default:
      /* The valid usage section for this function says:
       *
       *    "handleType must not be one of the handle types defined as
       *    opaque."
       *
       * So opaque handle types fall into the default "unsupported" case.
       */
      return vk_error(device, VK_ERROR_INVALID_EXTERNAL_HANDLE);
   }
}

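/* Calibrated timestamps: sample the host clock immediately before and after
 * reading the requested time domains; the begin/end spread plus the largest
 * clock period involved bounds the reported maximum deviation. */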
VKAPI_ATTR VkResult VKAPI_CALL
radv_GetCalibratedTimestampsKHR(VkDevice _device, uint32_t timestampCount,
                                const VkCalibratedTimestampInfoKHR *pTimestampInfos, uint64_t *pTimestamps,
                                uint64_t *pMaxDeviation)
{
#ifndef _WIN32
   VK_FROM_HANDLE(radv_device, device, _device);
   const struct radv_physical_device *pdev = radv_device_physical(device);
   uint32_t clock_crystal_freq = pdev->info.clock_crystal_freq;
   int d;
   uint64_t begin, end;
   uint64_t max_clock_period = 0;

#ifdef CLOCK_MONOTONIC_RAW
   begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   begin = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   for (d = 0; d < timestampCount; d++) {
      switch (pTimestampInfos[d].timeDomain) {
      case VK_TIME_DOMAIN_DEVICE_KHR:
         pTimestamps[d] = device->ws->query_value(device->ws, RADEON_TIMESTAMP);
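         /* clock_crystal_freq is in kHz, so one GPU counter tick is
          * 1e6 / freq nanoseconds (rounded up). */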
         uint64_t device_period = DIV_ROUND_UP(1000000, clock_crystal_freq);
         max_clock_period = MAX2(max_clock_period, device_period);
         break;
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
         pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
         max_clock_period = MAX2(max_clock_period, 1);
         break;

#ifdef CLOCK_MONOTONIC_RAW
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
         pTimestamps[d] = begin;
         break;
#endif
      default:
         pTimestamps[d] = 0;
         break;
      }
   }

#ifdef CLOCK_MONOTONIC_RAW
   end = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   end = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);

   return VK_SUCCESS;
#else
   return VK_ERROR_FEATURE_NOT_PRESENT;
#endif
}

bool
radv_device_set_pstate(struct radv_device *device, bool enable)
{
   const struct radv_physical_device *pdev = radv_device_physical(device);
   const struct radv_instance *instance = radv_physical_device_instance(pdev);
   struct radeon_winsys *ws = device->ws;
   enum radeon_ctx_pstate pstate = enable ? instance->profile_pstate : RADEON_CTX_PSTATE_NONE;

   if (pdev->info.has_stable_pstate) {
      /* pstate is per-device; setting it for one ctx is sufficient.
       * We pick the first initialized one below. */
      for (unsigned i = 0; i < RADV_NUM_HW_CTX; i++)
         if (device->hw_ctx[i])
            return ws->ctx_set_pstate(device->hw_ctx[i], pstate) >= 0;
   }

   return true;
}

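/* The profiling lock is refcounted: the first acquire switches the device
 * to the stable profiling pstate and the last release restores the default,
 * so nested or concurrent performance-query sessions compose correctly. */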
bool
radv_device_acquire_performance_counters(struct radv_device *device)
{
   bool result = true;
   simple_mtx_lock(&device->pstate_mtx);

   if (device->pstate_cnt == 0) {
      result = radv_device_set_pstate(device, true);
      if (result)
         ++device->pstate_cnt;
   }

   simple_mtx_unlock(&device->pstate_mtx);
   return result;
}

void
radv_device_release_performance_counters(struct radv_device *device)
{
   simple_mtx_lock(&device->pstate_mtx);

   if (--device->pstate_cnt == 0)
      radv_device_set_pstate(device, false);

   simple_mtx_unlock(&device->pstate_mtx);
}

VKAPI_ATTR VkResult VKAPI_CALL
radv_AcquireProfilingLockKHR(VkDevice _device, const VkAcquireProfilingLockInfoKHR *pInfo)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   bool result = radv_device_acquire_performance_counters(device);
   return result ? VK_SUCCESS : VK_ERROR_UNKNOWN;
}

VKAPI_ATTR void VKAPI_CALL
radv_ReleaseProfilingLockKHR(VkDevice _device)
{
   VK_FROM_HANDLE(radv_device, device, _device);
   radv_device_release_performance_counters(device);
}

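/* As with radv_GetDeviceImageMemoryRequirements(), create a throwaway
 * internal image to compute the subresource layout, then destroy it. */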
VKAPI_ATTR void VKAPI_CALL
radv_GetDeviceImageSubresourceLayoutKHR(VkDevice device, const VkDeviceImageSubresourceInfoKHR *pInfo,
                                        VkSubresourceLayout2KHR *pLayout)
{
   UNUSED VkResult result;
   VkImage image;

   result =
      radv_image_create(device, &(struct radv_image_create_info){.vk_info = pInfo->pCreateInfo}, NULL, &image, true);
   assert(result == VK_SUCCESS);

   radv_GetImageSubresourceLayout2KHR(device, image, pInfo->pSubresource, pLayout);

   radv_DestroyImage(device, image, NULL);
}