/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_device.h"

#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_opcodes.h"
#include "agx_scratch.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_table.h"
#include "hk_entrypoints.h"
#include "hk_instance.h"
#include "hk_physical_device.h"
#include "hk_shader.h"

#include "asahi/genxml/agx_pack.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/lib/shaders/geometry.h"
#include "util/hash_table.h"
#include "util/os_file.h"
#include "util/ralloc.h"
#include "util/simple_mtx.h"
#include "vulkan/vulkan_core.h"
#include "vulkan/wsi/wsi_common.h"
#include "vk_cmd_enqueue_entrypoints.h"
#include "vk_common_entrypoints.h"
#include "vk_pipeline_cache.h"

#include <fcntl.h>
#include <xf86drm.h>

/*
 * We preupload some constants so we can cheaply reference them later without
 * extra allocation and copying.
 *
 * TODO: This is small, don't waste a whole BO.
 */
static VkResult
hk_upload_rodata(struct hk_device *dev)
{
   dev->rodata.bo =
      agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, 0, "Read only data");

   if (!dev->rodata.bo)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint8_t *map = dev->rodata.bo->map;
   uint32_t offs = 0;

   offs = align(offs, 8);
   agx_pack(&dev->rodata.txf_sampler, USC_SAMPLER, cfg) {
      cfg.start = 0;
      cfg.count = 1;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   agx_pack_txf_sampler((struct agx_sampler_packed *)(map + offs));
   offs += AGX_SAMPLER_LENGTH;

   /* The image heap is allocated on the device prior to the rodata. The heap
    * lives as long as the device does and has a stable address (requiring
    * sparse binding to grow dynamically). That means its address is effectively
    * rodata and can be uploaded now. agx_usc_uniform requires an indirection to
    * push the heap address, so this takes care of that indirection up front to
    * cut an alloc/upload at draw time.
    */
   offs = align(offs, sizeof(uint64_t));
   agx_pack(&dev->rodata.image_heap, USC_UNIFORM, cfg) {
      cfg.start_halfs = HK_IMAGE_HEAP_UNIFORM;
      cfg.size_halfs = 4;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   uint64_t *image_heap_ptr = dev->rodata.bo->map + offs;
   *image_heap_ptr = dev->images.bo->va->addr;
   offs += sizeof(uint64_t);

   /* The geometry state buffer isn't strictly readonly data, but we only have a
    * single instance of it device-wide and -- after initializing at heap
    * allocate time -- it is read-only from the CPU perspective. The GPU uses it
    * for scratch, but is required to reset it after use to ensure resubmitting
    * the same command buffer works.
    *
    * So, we allocate it here for convenience.
    */
   offs = align(offs, sizeof(uint64_t));
   dev->rodata.geometry_state = dev->rodata.bo->va->addr + offs;
   offs += sizeof(struct agx_geometry_state);

   /* For null readonly buffers, we need to allocate 16 bytes of zeroes for
    * robustness2 semantics on read.
    */
   offs = align(offs, 16);
   dev->rodata.zero_sink = dev->rodata.bo->va->addr + offs;
   memset(dev->rodata.bo->map + offs, 0, 16);
   offs += 16;

   /* For null storage descriptors, we need to reserve 16 bytes to catch writes.
    * No particular content is required; we cannot get robustness2 semantics
    * without more work.
    */
   offs = align(offs, 16);
   dev->rodata.null_sink = dev->rodata.bo->va->addr + offs;
   offs += 16;

   return VK_SUCCESS;
}
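
/* Layout of the rodata BO after upload, in order: the packed TXF sampler
 * (referenced through dev->rodata.txf_sampler), a uint64_t holding the image
 * heap address (pushed via the USC_UNIFORM record in dev->rodata.image_heap),
 * the device-wide geometry state buffer, a 16-byte zero sink backing null
 * readonly buffers, and a 16-byte null sink that soaks up writes through null
 * storage descriptors.
 */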

static uint32_t
internal_key_hash(const void *key_)
{
   const struct hk_internal_key *key = key_;

   return _mesa_hash_data(key, sizeof(struct hk_internal_key) + key->key_size);
}

static bool
internal_key_equal(const void *a_, const void *b_)
{
   const struct hk_internal_key *a = a_;
   const struct hk_internal_key *b = b_;

   return a->builder == b->builder && a->key_size == b->key_size &&
          memcmp(a->key, b->key, a->key_size) == 0;
}

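/* Internal shaders (prolog/epilog parts and internal compute kernels) are
 * cached device-wide in a hash table keyed by the builder function pointer
 * plus its variable-length key blob, using internal_key_hash and
 * internal_key_equal above. The table is guarded by the lock initialized here.
 */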
static VkResult
hk_init_internal_shaders(struct hk_internal_shaders *s)
{
   s->ht = _mesa_hash_table_create(NULL, internal_key_hash, internal_key_equal);
   if (!s->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   simple_mtx_init(&s->lock, mtx_plain);
   return VK_SUCCESS;
}

static void
hk_destroy_internal_shaders(struct hk_device *dev,
                            struct hk_internal_shaders *s, bool part)
{
   hash_table_foreach(s->ht, ent) {
      if (part) {
         struct agx_shader_part *part = ent->data;
         free(part->binary);

         /* The agx_shader_part itself is ralloc'd against the hash table so
          * will be freed.
          */
      } else {
         struct hk_api_shader *obj = ent->data;
         hk_api_shader_destroy(&dev->vk, &obj->vk, NULL);
      }
   }

   _mesa_hash_table_destroy(s->ht, NULL);
   simple_mtx_destroy(&s->lock);
}

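/* Derive hash table helpers keyed on the raw bits of an agx_sampler_packed
 * descriptor, so identical sampler states map to the same heap entry below.
 */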
DERIVE_HASH_TABLE(agx_sampler_packed);

static VkResult
hk_init_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   h->ht = agx_sampler_packed_table_create(NULL);
   if (!h->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   VkResult result =
      hk_descriptor_table_init(dev, &h->table, AGX_SAMPLER_LENGTH, 1024, 1024);

   if (result != VK_SUCCESS) {
      ralloc_free(h->ht);
      return result;
   }

   simple_mtx_init(&h->lock, mtx_plain);
   return VK_SUCCESS;
}

static void
hk_destroy_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   hk_descriptor_table_finish(dev, &h->table);
   ralloc_free(h->ht);
   simple_mtx_destroy(&h->lock);
}

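/* Hardware sampler descriptors are deduplicated: adding a descriptor that is
 * already resident bumps a refcount on the existing hk_rc_sampler instead of
 * burning another slot in the bounded sampler heap. Entries are freed (and
 * their table index recycled) when the refcount drops to zero in
 * hk_sampler_heap_remove_locked.
 */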
static VkResult
hk_sampler_heap_add_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                           struct agx_sampler_packed desc,
                           struct hk_rc_sampler **out)
{
   struct hash_entry *ent = _mesa_hash_table_search(h->ht, &desc);
   if (ent != NULL) {
      *out = ent->data;

      assert((*out)->refcount != 0);
      (*out)->refcount++;

      return VK_SUCCESS;
   }

   struct hk_rc_sampler *rc = ralloc(h->ht, struct hk_rc_sampler);
   if (!rc)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t index;
   VkResult result =
      hk_descriptor_table_add(dev, &h->table, &desc, sizeof(desc), &index);
   if (result != VK_SUCCESS) {
      ralloc_free(rc);
      return result;
   }

   *rc = (struct hk_rc_sampler){
      .key = desc,
      .refcount = 1,
      .index = index,
   };

   _mesa_hash_table_insert(h->ht, &rc->key, rc);
   *out = rc;

   return VK_SUCCESS;
}

VkResult
hk_sampler_heap_add(struct hk_device *dev, struct agx_sampler_packed desc,
                    struct hk_rc_sampler **out)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   VkResult result = hk_sampler_heap_add_locked(dev, h, desc, out);
   simple_mtx_unlock(&h->lock);

   return result;
}

static void
hk_sampler_heap_remove_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                              struct hk_rc_sampler *rc)
{
   assert(rc->refcount != 0);
   rc->refcount--;

   if (rc->refcount == 0) {
      hk_descriptor_table_remove(dev, &h->table, rc->index);
      _mesa_hash_table_remove_key(h->ht, &rc->key);
      ralloc_free(rc);
   }
}

void
hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   hk_sampler_heap_remove_locked(dev, h, rc);
   simple_mtx_unlock(&h->lock);
}
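
/* Typical usage, sketched: a sampler create path would pack its descriptor,
 * add it to the heap, and record both the refcounted handle and its heap
 * index for binding; destruction drops the reference.
 *
 *    struct hk_rc_sampler *rc;
 *    VkResult result = hk_sampler_heap_add(dev, desc, &rc);
 *    if (result == VK_SUCCESS) {
 *       ... use rc->index to address the sampler in the heap ...
 *       hk_sampler_heap_remove(dev, rc);
 *    }
 */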

/*
 * To implement nullDescriptor, the descriptor set code will reference
 * preuploaded null descriptors at fixed offsets in the image heap. Here we
 * upload those descriptors, initializing the image heap.
 */
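/* Both descriptors occupy the first two slots of the heap; the asserts below
 * check that the freshly created image table hands back exactly the statically
 * known HK_NULL_TEX_OFFSET and HK_NULL_PBE_OFFSET offsets.
 */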
static void
hk_upload_null_descriptors(struct hk_device *dev)
{
   struct agx_texture_packed null_tex;
   struct agx_pbe_packed null_pbe;
   uint32_t offset_tex, offset_pbe;

   agx_set_null_texture(&null_tex, dev->rodata.null_sink);
   agx_set_null_pbe(&null_pbe, dev->rodata.null_sink);

   hk_descriptor_table_add(dev, &dev->images, &null_tex, sizeof(null_tex),
                           &offset_tex);

   hk_descriptor_table_add(dev, &dev->images, &null_pbe, sizeof(null_pbe),
                           &offset_pbe);

   assert((offset_tex * HK_IMAGE_STRIDE) == HK_NULL_TEX_OFFSET && "static");
   assert((offset_pbe * HK_IMAGE_STRIDE) == HK_NULL_PBE_OFFSET && "static");
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_CreateDevice(VkPhysicalDevice physicalDevice,
                const VkDeviceCreateInfo *pCreateInfo,
                const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
{
   VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct hk_device *dev;

   dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator, sizeof(*dev), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;

   /* For secondary command buffer support, overwrite any command entrypoints
    * in the main device-level dispatch table with
    * vk_cmd_enqueue_unless_primary_Cmd*.
    */
   vk_device_dispatch_table_from_entrypoints(
      &dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true);

   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &hk_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   /* Populate primary cmd_dispatch table */
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &hk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &wsi_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(
      &dev->cmd_dispatch, &vk_common_device_entrypoints, false);
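
   /* The enqueue-unless-primary table built above becomes the device's main
    * dispatch table, while dev->cmd_dispatch keeps the real Cmd* entrypoints.
    * The latter is installed as vk.command_dispatch_table below so commands
    * recorded into secondary command buffers via vk_cmd_enqueue can later be
    * replayed against it.
    */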

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table, pCreateInfo,
                           pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &hk_device_shader_ops;
   dev->vk.command_dispatch_table = &dev->cmd_dispatch;

   drmDevicePtr drm_device = NULL;
   int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device);
   if (ret != 0) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to get DRM device: %m");
      goto fail_init;
   }

   const char *path = drm_device->nodes[DRM_NODE_RENDER];
   dev->dev.fd = open(path, O_RDWR | O_CLOEXEC);
   if (dev->dev.fd < 0) {
      drmFreeDevice(&drm_device);
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "failed to open device %s", path);
      goto fail_init;
   }

   bool succ = agx_open_device(NULL, &dev->dev);
   drmFreeDevice(&drm_device);
   if (!succ) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "failed to open AGX device %s", path);
      goto fail_fd;
   }

   vk_device_set_drm_fd(&dev->vk, dev->dev.fd);
   dev->vk.command_buffer_ops = &hk_cmd_buffer_ops;

   result = hk_descriptor_table_init(dev, &dev->images, AGX_TEXTURE_LENGTH,
                                     1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_dev;

   result = hk_init_sampler_heap(dev, &dev->samplers);
   if (result != VK_SUCCESS)
      goto fail_images;

   result = hk_descriptor_table_init(
      dev, &dev->occlusion_queries, sizeof(uint64_t), AGX_MAX_OCCLUSION_QUERIES,
      AGX_MAX_OCCLUSION_QUERIES);
   if (result != VK_SUCCESS)
      goto fail_samplers;

   result = hk_upload_rodata(dev);
   if (result != VK_SUCCESS)
      goto fail_queries;

   /* Depends on rodata */
   hk_upload_null_descriptors(dev);

   /* XXX: error handling, and should this even go on the device? */
   agx_bg_eot_init(&dev->bg_eot, &dev->dev);
   if (!dev->bg_eot.ht) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_rodata;
   }

   result = hk_init_internal_shaders(&dev->prolog_epilog);
   if (result != VK_SUCCESS)
      goto fail_bg_eot;

   result = hk_init_internal_shaders(&dev->kernels);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders;

   result =
      hk_queue_init(dev, &dev->queue, &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders_2;

   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = hk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = hk_device_to_handle(dev);

   simple_mtx_init(&dev->scratch.lock, mtx_plain);
   agx_scratch_init(&dev->dev, &dev->scratch.vs);
   agx_scratch_init(&dev->dev, &dev->scratch.fs);
   agx_scratch_init(&dev->dev, &dev->scratch.cs);

   return VK_SUCCESS;

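/* Unwind strictly in reverse initialization order, so each label only tears
 * down state that was successfully created before its corresponding goto.
 */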
fail_mem_cache:
   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
fail_queue:
   hk_queue_finish(dev, &dev->queue);
fail_internal_shaders_2:
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
fail_internal_shaders:
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);
fail_bg_eot:
   agx_bg_eot_cleanup(&dev->bg_eot);
fail_rodata:
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
fail_queries:
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
fail_samplers:
   hk_destroy_sampler_heap(dev, &dev->samplers);
fail_images:
   hk_descriptor_table_finish(dev, &dev->images);
fail_dev:
   agx_close_device(&dev->dev);
fail_fd:
   close(dev->dev.fd);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(hk_device, dev, _device);

   if (!dev)
      return;

   hk_device_finish_meta(dev);
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);

   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
   hk_queue_finish(dev, &dev->queue);
   vk_device_finish(&dev->vk);

   agx_scratch_fini(&dev->scratch.vs);
   agx_scratch_fini(&dev->scratch.fs);
   agx_scratch_fini(&dev->scratch.cs);
   simple_mtx_destroy(&dev->scratch.lock);

   hk_destroy_sampler_heap(dev, &dev->samplers);
   hk_descriptor_table_finish(dev, &dev->images);
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
   agx_bo_unreference(&dev->dev, dev->heap);
   agx_bg_eot_cleanup(&dev->bg_eot);
   agx_close_device(&dev->dev);
   vk_free(&dev->vk.alloc, dev);
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_GetCalibratedTimestampsKHR(
   VkDevice _device, uint32_t timestampCount,
   const VkCalibratedTimestampInfoKHR *pTimestampInfos, uint64_t *pTimestamps,
   uint64_t *pMaxDeviation)
{
   // VK_FROM_HANDLE(hk_device, dev, _device);
   // struct hk_physical_device *pdev = hk_device_physical(dev);
   uint64_t max_clock_period = 0;
   uint64_t begin, end;
   int d;

#ifdef CLOCK_MONOTONIC_RAW
   begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   begin = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   for (d = 0; d < timestampCount; d++) {
      switch (pTimestampInfos[d].timeDomain) {
      case VK_TIME_DOMAIN_DEVICE_KHR:
         unreachable("todo");
         // pTimestamps[d] = agx_get_gpu_timestamp(&pdev->dev);
         max_clock_period = MAX2(
            max_clock_period, 1); /* FIXME: Is timestamp period actually 1? */
         break;
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
         pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
         max_clock_period = MAX2(max_clock_period, 1);
         break;

#ifdef CLOCK_MONOTONIC_RAW
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
         pTimestamps[d] = begin;
         break;
#endif
      default:
         pTimestamps[d] = 0;
         break;
      }
   }

#ifdef CLOCK_MONOTONIC_RAW
   end = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   end = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);

   return VK_SUCCESS;
}
543