/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_device.h"

#include "agx_bg_eot.h"
#include "agx_helpers.h"
#include "agx_opcodes.h"
#include "agx_scratch.h"
#include "hk_cmd_buffer.h"
#include "hk_descriptor_table.h"
#include "hk_entrypoints.h"
#include "hk_instance.h"
#include "hk_physical_device.h"
#include "hk_shader.h"

#include "asahi/genxml/agx_pack.h"
#include "asahi/lib/agx_bo.h"
#include "asahi/lib/agx_device.h"
#include "asahi/lib/shaders/geometry.h"
#include "util/hash_table.h"
#include "util/os_file.h"
#include "util/ralloc.h"
#include "util/simple_mtx.h"
#include "vulkan/vulkan_core.h"
#include "vulkan/wsi/wsi_common.h"
#include "vk_cmd_enqueue_entrypoints.h"
#include "vk_common_entrypoints.h"
#include "vk_pipeline_cache.h"

#include <fcntl.h>
#include <xf86drm.h>
/*
 * We preupload some constants so we can cheaply reference them later without
 * extra allocation and copying.
 *
 * TODO: This is small, don't waste a whole BO.
 */
static VkResult
hk_upload_rodata(struct hk_device *dev)
{
   dev->rodata.bo =
      agx_bo_create(&dev->dev, AGX_SAMPLER_LENGTH, 0, 0, "Read only data");

   if (!dev->rodata.bo)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint8_t *map = dev->rodata.bo->map;
   uint32_t offs = 0;

   offs = align(offs, 8);
   agx_pack(&dev->rodata.txf_sampler, USC_SAMPLER, cfg) {
      cfg.start = 0;
      cfg.count = 1;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   agx_pack_txf_sampler((struct agx_sampler_packed *)(map + offs));
   offs += AGX_SAMPLER_LENGTH;

   /* The image heap is allocated on the device prior to the rodata. The heap
    * lives as long as the device does and has a stable address (requiring
    * sparse binding to grow dynamically). That means its address is
    * effectively rodata and can be uploaded now. agx_usc_uniform requires an
    * indirection to push the heap address, so this takes care of that
    * indirection up front to cut an alloc/upload at draw time.
    */
   offs = align(offs, sizeof(uint64_t));
   agx_pack(&dev->rodata.image_heap, USC_UNIFORM, cfg) {
      cfg.start_halfs = HK_IMAGE_HEAP_UNIFORM;
      cfg.size_halfs = 4;
      cfg.buffer = dev->rodata.bo->va->addr + offs;
   }

   uint64_t *image_heap_ptr = dev->rodata.bo->map + offs;
   *image_heap_ptr = dev->images.bo->va->addr;
   offs += sizeof(uint64_t);

   /* The geometry state buffer isn't strictly read-only data, but we only
    * have a single instance of it device-wide and -- after being initialized
    * at heap allocation time -- it is read-only from the CPU's perspective.
    * The GPU uses it for scratch, but is required to reset it after use so
    * that resubmitting the same command buffer works.
    *
    * So, we allocate it here for convenience.
    */
   offs = align(offs, sizeof(uint64_t));
   dev->rodata.geometry_state = dev->rodata.bo->va->addr + offs;
   offs += sizeof(struct agx_geometry_state);

   /* For null readonly buffers, we need to allocate 16 bytes of zeroes for
    * robustness2 semantics on read.
    */
   offs = align(offs, 16);
   dev->rodata.zero_sink = dev->rodata.bo->va->addr + offs;
   memset(dev->rodata.bo->map + offs, 0, 16);
   offs += 16;

   /* For null storage descriptors, we need to reserve 16 bytes to catch
    * writes. No particular content is required; we cannot get robustness2
    * semantics without more work.
    */
   offs = align(offs, 16);
   dev->rodata.null_sink = dev->rodata.bo->va->addr + offs;
   offs += 16;

   return VK_SUCCESS;
}
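
/* A sketch of how the preuploaded words above are meant to be consumed (the
 * real call sites live in the command buffer code; the snippet below is
 * illustrative only): because dev->rodata.txf_sampler and
 * dev->rodata.image_heap are already-packed USC words pointing at stable
 * addresses, a draw-time USC builder can copy them directly, something like
 *
 *    agx_usc_push_packed(&b, SAMPLER, &dev->rodata.txf_sampler);
 *    agx_usc_push_packed(&b, UNIFORM, &dev->rodata.image_heap);
 *
 * instead of packing and uploading fresh words for every draw.
 */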

static uint32_t
internal_key_hash(const void *key_)
{
   const struct hk_internal_key *key = key_;

   return _mesa_hash_data(key, sizeof(struct hk_internal_key) + key->key_size);
}

static bool
internal_key_equal(const void *a_, const void *b_)
{
   const struct hk_internal_key *a = a_;
   const struct hk_internal_key *b = b_;

   return a->builder == b->builder && a->key_size == b->key_size &&
          memcmp(a->key, b->key, a->key_size) == 0;
}
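
/* The hash/equality callbacks above treat keys as variable length: a fixed
 * header identifying the builder, followed by key_size bytes of
 * builder-specific data, roughly (a sketch of the assumed layout; see the
 * header for the real definition):
 *
 *    struct hk_internal_key {
 *       void *builder;
 *       size_t key_size;
 *       uint8_t key[];
 *    };
 *
 * which is why the hash covers sizeof(struct hk_internal_key) + key_size
 * bytes.
 */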

static VkResult
hk_init_internal_shaders(struct hk_internal_shaders *s)
{
   s->ht = _mesa_hash_table_create(NULL, internal_key_hash, internal_key_equal);
   if (!s->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   simple_mtx_init(&s->lock, mtx_plain);
   return VK_SUCCESS;
}

static void
hk_destroy_internal_shaders(struct hk_device *dev,
                            struct hk_internal_shaders *s, bool part)
{
   hash_table_foreach(s->ht, ent) {
      if (part) {
         struct agx_shader_part *part = ent->data;
         free(part->binary);

         /* The agx_shader_part itself is ralloc'd against the hash table, so
          * it will be freed along with the table.
          */
      } else {
         struct hk_api_shader *obj = ent->data;
         hk_api_shader_destroy(&dev->vk, &obj->vk, NULL);
      }
   }

   _mesa_hash_table_destroy(s->ht, NULL);
   simple_mtx_destroy(&s->lock);
}

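/* Derives hash table helpers keyed on the raw packed sampler words; in
 * particular, this provides the agx_sampler_packed_table_create() used by the
 * deduplicating sampler heap below.
 */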
DERIVE_HASH_TABLE(agx_sampler_packed);

static VkResult
hk_init_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   h->ht = agx_sampler_packed_table_create(NULL);
   if (!h->ht)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   VkResult result =
      hk_descriptor_table_init(dev, &h->table, AGX_SAMPLER_LENGTH, 1024, 1024);

   if (result != VK_SUCCESS) {
      ralloc_free(h->ht);
      return result;
   }

   simple_mtx_init(&h->lock, mtx_plain);
   return VK_SUCCESS;
}

static void
hk_destroy_sampler_heap(struct hk_device *dev, struct hk_sampler_heap *h)
{
   hk_descriptor_table_finish(dev, &h->table);
   ralloc_free(h->ht);
   simple_mtx_destroy(&h->lock);
}

static VkResult
hk_sampler_heap_add_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                           struct agx_sampler_packed desc,
                           struct hk_rc_sampler **out)
{
   struct hash_entry *ent = _mesa_hash_table_search(h->ht, &desc);
   if (ent != NULL) {
      *out = ent->data;

      assert((*out)->refcount != 0);
      (*out)->refcount++;

      return VK_SUCCESS;
   }

   struct hk_rc_sampler *rc = ralloc(h->ht, struct hk_rc_sampler);
   if (!rc)
      return VK_ERROR_OUT_OF_HOST_MEMORY;

   uint32_t index;
   VkResult result =
      hk_descriptor_table_add(dev, &h->table, &desc, sizeof(desc), &index);
   if (result != VK_SUCCESS) {
      ralloc_free(rc);
      return result;
   }

   *rc = (struct hk_rc_sampler){
      .key = desc,
      .refcount = 1,
      .index = index,
   };

   _mesa_hash_table_insert(h->ht, &rc->key, rc);
   *out = rc;

   return VK_SUCCESS;
}

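/*
 * Map a packed sampler descriptor to an index in the device-wide sampler
 * heap, deduplicating identical samplers through the hash table and
 * reference counting each entry. A sketch of the expected usage (caller-side
 * code, illustrative only):
 *
 *    struct hk_rc_sampler *rc;
 *    VkResult res = hk_sampler_heap_add(dev, desc, &rc);
 *    if (res != VK_SUCCESS)
 *       return res;
 *
 *    ... write rc->index into descriptors ...
 *
 *    hk_sampler_heap_remove(dev, rc);   // when the sampler is destroyed
 */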
VkResult
hk_sampler_heap_add(struct hk_device *dev, struct agx_sampler_packed desc,
                    struct hk_rc_sampler **out)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   VkResult result = hk_sampler_heap_add_locked(dev, h, desc, out);
   simple_mtx_unlock(&h->lock);

   return result;
}

static void
hk_sampler_heap_remove_locked(struct hk_device *dev, struct hk_sampler_heap *h,
                              struct hk_rc_sampler *rc)
{
   assert(rc->refcount != 0);
   rc->refcount--;

   if (rc->refcount == 0) {
      hk_descriptor_table_remove(dev, &h->table, rc->index);
      _mesa_hash_table_remove_key(h->ht, &rc->key);
      ralloc_free(rc);
   }
}

void
hk_sampler_heap_remove(struct hk_device *dev, struct hk_rc_sampler *rc)
{
   struct hk_sampler_heap *h = &dev->samplers;

   simple_mtx_lock(&h->lock);
   hk_sampler_heap_remove_locked(dev, h, rc);
   simple_mtx_unlock(&h->lock);
}

/*
 * To implement nullDescriptor, the descriptor set code will reference
 * preuploaded null descriptors at fixed offsets in the image heap. Here we
 * upload those descriptors, initializing the image heap.
 */
static void
hk_upload_null_descriptors(struct hk_device *dev)
{
   struct agx_texture_packed null_tex;
   struct agx_pbe_packed null_pbe;
   uint32_t offset_tex, offset_pbe;

   agx_set_null_texture(&null_tex, dev->rodata.null_sink);
   agx_set_null_pbe(&null_pbe, dev->rodata.null_sink);

   hk_descriptor_table_add(dev, &dev->images, &null_tex, sizeof(null_tex),
                           &offset_tex);

   hk_descriptor_table_add(dev, &dev->images, &null_pbe, sizeof(null_pbe),
                           &offset_pbe);

   assert((offset_tex * HK_IMAGE_STRIDE) == HK_NULL_TEX_OFFSET && "static");
   assert((offset_pbe * HK_IMAGE_STRIDE) == HK_NULL_PBE_OFFSET && "static");
}
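
/* With the heap seeded, descriptor set code can implement nullDescriptor by
 * storing the fixed offsets rather than packing anything per-write, e.g.
 * (hypothetical field names, for illustration only):
 *
 *    desc->tex = HK_NULL_TEX_OFFSET;
 *    desc->pbe = HK_NULL_PBE_OFFSET;
 */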

VKAPI_ATTR VkResult VKAPI_CALL
hk_CreateDevice(VkPhysicalDevice physicalDevice,
                const VkDeviceCreateInfo *pCreateInfo,
                const VkAllocationCallbacks *pAllocator, VkDevice *pDevice)
{
   VK_FROM_HANDLE(hk_physical_device, pdev, physicalDevice);
   VkResult result = VK_ERROR_OUT_OF_HOST_MEMORY;
   struct hk_device *dev;

   dev = vk_zalloc2(&pdev->vk.instance->alloc, pAllocator, sizeof(*dev), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
   if (!dev)
      return vk_error(pdev, VK_ERROR_OUT_OF_HOST_MEMORY);

   struct vk_device_dispatch_table dispatch_table;

   /* For secondary command buffer support, overwrite any command entrypoints
    * in the main device-level dispatch table with
    * vk_cmd_enqueue_unless_primary_Cmd*.
    */
   vk_device_dispatch_table_from_entrypoints(
      &dispatch_table, &vk_cmd_enqueue_unless_primary_device_entrypoints, true);

   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &hk_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(&dispatch_table,
                                             &wsi_device_entrypoints, false);

   /* Populate primary cmd_dispatch table */
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &hk_device_entrypoints, true);
   vk_device_dispatch_table_from_entrypoints(&dev->cmd_dispatch,
                                             &wsi_device_entrypoints, false);
   vk_device_dispatch_table_from_entrypoints(
      &dev->cmd_dispatch, &vk_common_device_entrypoints, false);
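
   /* The resulting layering (a sketch of the common vk_cmd_enqueue mechanism,
    * not hk-specific behaviour): secondary command buffers record through the
    * vk_cmd_enqueue_* entrypoints into a replayable command list, while
    * primary command buffers go straight to dev->cmd_dispatch, which is also
    * the table used when a recorded secondary is replayed into a primary.
    */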

   result = vk_device_init(&dev->vk, &pdev->vk, &dispatch_table, pCreateInfo,
                           pAllocator);
   if (result != VK_SUCCESS)
      goto fail_alloc;

   dev->vk.shader_ops = &hk_device_shader_ops;
   dev->vk.command_dispatch_table = &dev->cmd_dispatch;

   drmDevicePtr drm_device = NULL;
   int ret = drmGetDeviceFromDevId(pdev->render_dev, 0, &drm_device);
   if (ret != 0) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "Failed to get DRM device: %m");
      goto fail_init;
   }

   const char *path = drm_device->nodes[DRM_NODE_RENDER];
   dev->dev.fd = open(path, O_RDWR | O_CLOEXEC);
   if (dev->dev.fd < 0) {
      drmFreeDevice(&drm_device);
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "failed to open device %s", path);
      goto fail_init;
   }

   bool succ = agx_open_device(NULL, &dev->dev);
   drmFreeDevice(&drm_device);
   if (!succ) {
      result = vk_errorf(dev, VK_ERROR_INITIALIZATION_FAILED,
                         "failed to initialize AGX device");
      goto fail_fd;
   }

   vk_device_set_drm_fd(&dev->vk, dev->dev.fd);
   dev->vk.command_buffer_ops = &hk_cmd_buffer_ops;

   result = hk_descriptor_table_init(dev, &dev->images, AGX_TEXTURE_LENGTH,
                                     1024, 1024 * 1024);
   if (result != VK_SUCCESS)
      goto fail_dev;

   result = hk_init_sampler_heap(dev, &dev->samplers);
   if (result != VK_SUCCESS)
      goto fail_images;

   result = hk_descriptor_table_init(
      dev, &dev->occlusion_queries, sizeof(uint64_t), AGX_MAX_OCCLUSION_QUERIES,
      AGX_MAX_OCCLUSION_QUERIES);
   if (result != VK_SUCCESS)
      goto fail_samplers;

   result = hk_upload_rodata(dev);
   if (result != VK_SUCCESS)
      goto fail_queries;

   /* Depends on rodata */
   hk_upload_null_descriptors(dev);

   /* XXX: error handling, and should this even go on the device? */
   agx_bg_eot_init(&dev->bg_eot, &dev->dev);
   if (!dev->bg_eot.ht) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_rodata;
   }

   result = hk_init_internal_shaders(&dev->prolog_epilog);
   if (result != VK_SUCCESS)
      goto fail_bg_eot;

   result = hk_init_internal_shaders(&dev->kernels);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders;

   result =
      hk_queue_init(dev, &dev->queue, &pCreateInfo->pQueueCreateInfos[0], 0);
   if (result != VK_SUCCESS)
      goto fail_internal_shaders_2;

   struct vk_pipeline_cache_create_info cache_info = {
      .weak_ref = true,
   };
   dev->mem_cache = vk_pipeline_cache_create(&dev->vk, &cache_info, NULL);
   if (dev->mem_cache == NULL) {
      result = VK_ERROR_OUT_OF_HOST_MEMORY;
      goto fail_queue;
   }

   result = hk_device_init_meta(dev);
   if (result != VK_SUCCESS)
      goto fail_mem_cache;

   *pDevice = hk_device_to_handle(dev);

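   /* Per-stage scratch allocators for register spilling, shared device-wide
    * and guarded by scratch.lock. They start empty here and are expected to
    * grow on demand as shaders that need scratch are encountered.
    */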
   simple_mtx_init(&dev->scratch.lock, mtx_plain);
   agx_scratch_init(&dev->dev, &dev->scratch.vs);
   agx_scratch_init(&dev->dev, &dev->scratch.fs);
   agx_scratch_init(&dev->dev, &dev->scratch.cs);

   return VK_SUCCESS;

fail_mem_cache:
   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
fail_queue:
   hk_queue_finish(dev, &dev->queue);
fail_internal_shaders_2:
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
fail_internal_shaders:
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);
fail_bg_eot:
   agx_bg_eot_cleanup(&dev->bg_eot);
fail_rodata:
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
fail_queries:
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
fail_samplers:
   hk_destroy_sampler_heap(dev, &dev->samplers);
fail_images:
   hk_descriptor_table_finish(dev, &dev->images);
fail_dev:
   agx_close_device(&dev->dev);
fail_fd:
   close(dev->dev.fd);
fail_init:
   vk_device_finish(&dev->vk);
fail_alloc:
   vk_free(&dev->vk.alloc, dev);
   return result;
}

VKAPI_ATTR void VKAPI_CALL
hk_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(hk_device, dev, _device);

   if (!dev)
      return;

   hk_device_finish_meta(dev);
   hk_destroy_internal_shaders(dev, &dev->kernels, false);
   hk_destroy_internal_shaders(dev, &dev->prolog_epilog, true);

   vk_pipeline_cache_destroy(dev->mem_cache, NULL);
   hk_queue_finish(dev, &dev->queue);
   vk_device_finish(&dev->vk);

   agx_scratch_fini(&dev->scratch.vs);
   agx_scratch_fini(&dev->scratch.fs);
   agx_scratch_fini(&dev->scratch.cs);
   simple_mtx_destroy(&dev->scratch.lock);

   hk_destroy_sampler_heap(dev, &dev->samplers);
   hk_descriptor_table_finish(dev, &dev->images);
   hk_descriptor_table_finish(dev, &dev->occlusion_queries);
   agx_bo_unreference(&dev->dev, dev->rodata.bo);
   agx_bo_unreference(&dev->dev, dev->heap);
   agx_bg_eot_cleanup(&dev->bg_eot);
   agx_close_device(&dev->dev);
   vk_free(&dev->vk.alloc, dev);
}

VKAPI_ATTR VkResult VKAPI_CALL
hk_GetCalibratedTimestampsKHR(
   VkDevice _device, uint32_t timestampCount,
   const VkCalibratedTimestampInfoKHR *pTimestampInfos, uint64_t *pTimestamps,
   uint64_t *pMaxDeviation)
{
   // VK_FROM_HANDLE(hk_device, dev, _device);
   // struct hk_physical_device *pdev = hk_device_physical(dev);
   uint64_t max_clock_period = 0;
   uint64_t begin, end;
   int d;

#ifdef CLOCK_MONOTONIC_RAW
   begin = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   begin = vk_clock_gettime(CLOCK_MONOTONIC);
#endif
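
   /* Calibration brackets the per-domain samples between two host clock
    * reads: every domain sampled in the loop below is read somewhere in
    * [begin, end], so the width of that window plus the largest clock period
    * bounds the deviation reported to the application.
    */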

   for (d = 0; d < timestampCount; d++) {
      switch (pTimestampInfos[d].timeDomain) {
      case VK_TIME_DOMAIN_DEVICE_KHR:
         unreachable("todo");
         // pTimestamps[d] = agx_get_gpu_timestamp(&pdev->dev);
         max_clock_period = MAX2(
            max_clock_period, 1); /* FIXME: Is timestamp period actually 1? */
         break;
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_KHR:
         pTimestamps[d] = vk_clock_gettime(CLOCK_MONOTONIC);
         max_clock_period = MAX2(max_clock_period, 1);
         break;

#ifdef CLOCK_MONOTONIC_RAW
      case VK_TIME_DOMAIN_CLOCK_MONOTONIC_RAW_KHR:
         pTimestamps[d] = begin;
         break;
#endif
      default:
         pTimestamps[d] = 0;
         break;
      }
   }

#ifdef CLOCK_MONOTONIC_RAW
   end = vk_clock_gettime(CLOCK_MONOTONIC_RAW);
#else
   end = vk_clock_gettime(CLOCK_MONOTONIC);
#endif

   *pMaxDeviation = vk_time_max_deviation(begin, end, max_clock_period);

   return VK_SUCCESS;
}
543