/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */
#include "hk_query_pool.h"

#include "agx_compile.h"
#include "agx_pack.h"
#include "hk_buffer.h"
#include "hk_cmd_buffer.h"
#include "hk_device.h"
#include "hk_entrypoints.h"
#include "hk_event.h"
#include "hk_physical_device.h"
#include "hk_shader.h"

#include "shader_enums.h"
#include "vk_common_entrypoints.h"
#include "vk_meta.h"
#include "vk_pipeline.h"

#include "asahi/lib/agx_bo.h"
#include "asahi/lib/libagx_shaders.h"
#include "asahi/lib/shaders/query.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"

#include "util/os_time.h"
#include "vulkan/vulkan_core.h"

struct hk_query_report {
   /* TODO: do we want this to be legit u64? */
   uint32_t value;
   uint32_t padding;
};

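/*
 * Pool BO layout: one 32-bit availability word per query at the start, then,
 * at query_start, either the 16-bit occlusion-query index remap table (for
 * occlusion pools) or the packed hk_query_report data (all other pool types).
 */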
static uint16_t *
hk_pool_oq_index_ptr(const struct hk_query_pool *pool)
{
   return (uint16_t *)(pool->bo->map + pool->query_start);
}

static uint32_t
hk_reports_per_query(struct hk_query_pool *pool)
{
   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT:
      return 1;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return util_bitcount(pool->vk.pipeline_statistics);
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      // Primitives succeeded and primitives needed
      return 2;
   default:
      unreachable("Unsupported query type");
   }
}

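/*
 * For occlusion pools, each query is backed by a slot in the device-wide
 * occlusion query heap (dev->occlusion_queries); the pool's u16 remap table
 * maps pool-local query indices to heap indices. Other query types store
 * their reports directly in the pool BO.
 */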
VKAPI_ATTR VkResult VKAPI_CALL
hk_CreateQueryPool(VkDevice device, const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   VK_FROM_HANDLE(hk_device, dev, device);
   struct hk_query_pool *pool;

   bool occlusion = pCreateInfo->queryType == VK_QUERY_TYPE_OCCLUSION;
   unsigned occlusion_queries = occlusion ? pCreateInfo->queryCount : 0;

   pool =
      vk_query_pool_create(&dev->vk, pCreateInfo, pAllocator, sizeof(*pool));
   if (!pool)
      return vk_error(dev, VK_ERROR_OUT_OF_HOST_MEMORY);

   /* We place the availability first and then data */
   pool->query_start = align(pool->vk.query_count * sizeof(uint32_t),
                             sizeof(struct hk_query_report));

   uint32_t reports_per_query = hk_reports_per_query(pool);
   pool->query_stride = reports_per_query * sizeof(struct hk_query_report);

   if (pool->vk.query_count > 0) {
      uint32_t bo_size = pool->query_start;

      /* For occlusion queries, we stick the query index remapping here */
      if (occlusion_queries)
         bo_size += sizeof(uint16_t) * pool->vk.query_count;
      else
         bo_size += pool->query_stride * pool->vk.query_count;

      pool->bo =
         agx_bo_create(&dev->dev, bo_size, 0, AGX_BO_WRITEBACK, "Query pool");
      if (!pool->bo) {
         hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator);
         return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
      }
   }

   uint16_t *oq_index = hk_pool_oq_index_ptr(pool);

   for (unsigned i = 0; i < occlusion_queries; ++i) {
      uint64_t zero = 0;
      unsigned index;

      VkResult result = hk_descriptor_table_add(
         dev, &dev->occlusion_queries, &zero, sizeof(uint64_t), &index);

      if (result != VK_SUCCESS) {
         hk_DestroyQueryPool(device, hk_query_pool_to_handle(pool), pAllocator);
         return vk_error(dev, VK_ERROR_OUT_OF_DEVICE_MEMORY);
      }

      /* We increment as we go so we can clean up properly if we run out */
      assert(pool->oq_queries < occlusion_queries);
      oq_index[pool->oq_queries++] = index;
   }

   *pQueryPool = hk_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
hk_DestroyQueryPool(VkDevice device, VkQueryPool queryPool,
                    const VkAllocationCallbacks *pAllocator)
{
   VK_FROM_HANDLE(hk_device, dev, device);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);

   if (!pool)
      return;

   uint16_t *oq_index = hk_pool_oq_index_ptr(pool);

   for (unsigned i = 0; i < pool->oq_queries; ++i) {
      hk_descriptor_table_remove(dev, &dev->occlusion_queries, oq_index[i]);
   }

   agx_bo_unreference(&dev->dev, pool->bo);
   vk_query_pool_destroy(&dev->vk, pAllocator, &pool->vk);
}

static uint64_t
hk_query_available_addr(struct hk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return pool->bo->va->addr + query * sizeof(uint32_t);
}

static uint32_t *
hk_query_available_map(struct hk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return (uint32_t *)pool->bo->map + query;
}

static uint64_t
hk_query_offset(struct hk_query_pool *pool, uint32_t query)
{
   assert(query < pool->vk.query_count);
   return pool->query_start + query * pool->query_stride;
}

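/*
 * Report location helpers: occlusion queries resolve through the remap table
 * into the device-wide occlusion heap, while all other query types index
 * their reports within the pool BO by query_stride.
 */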
static uint64_t
hk_query_report_addr(struct hk_device *dev, struct hk_query_pool *pool,
                     uint32_t query)
{
   if (pool->oq_queries) {
      uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
      return dev->occlusion_queries.bo->va->addr +
             (oq_index[query] * sizeof(uint64_t));
   } else {
      return pool->bo->va->addr + hk_query_offset(pool, query);
   }
}

static struct hk_query_report *
hk_query_report_map(struct hk_device *dev, struct hk_query_pool *pool,
                    uint32_t query)
{
   if (pool->oq_queries) {
      uint64_t *queries = (uint64_t *)dev->occlusion_queries.bo->map;
      uint16_t *oq_index = hk_pool_oq_index_ptr(pool);

      return (struct hk_query_report *)&queries[oq_index[query]];
   } else {
      return (void *)((char *)pool->bo->map + hk_query_offset(pool, query));
   }
}

struct hk_write_params {
   uint64_t address;
   uint32_t value;
};

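/*
 * Meta kernel that stores a 32-bit value to a 64-bit address. Both parameters
 * come from preamble (uniform) registers; load_preamble bases are expressed
 * in 16-bit units, hence the offsetof(...) / 2.
 */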
static void
hk_nir_write_u32(nir_builder *b, UNUSED const void *key)
{
   nir_def *addr = nir_load_preamble(
      b, 1, 64, .base = offsetof(struct hk_write_params, address) / 2);

   nir_def *value = nir_load_preamble(
      b, 1, 32, .base = offsetof(struct hk_write_params, value) / 2);

   nir_store_global(b, addr, 4, value, nir_component_mask(1));
}

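/*
 * Queue a GPU-side write of a 32-bit value to an arbitrary address, as a
 * single-threaded compute dispatch of the kernel above. With after_gfx set,
 * the write goes on the post-graphics control stream so it executes only
 * after this command buffer's graphics work.
 */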
void
hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
               bool after_gfx)
{
   struct hk_cs *cs = hk_cmd_buffer_get_cs_general(
      cmd, after_gfx ? &cmd->current_cs.post_gfx : &cmd->current_cs.cs, true);
   if (!cs)
      return;

   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);

   /* As soon as we mark a query available, it needs to be available system
    * wide, otherwise a CPU-side get result can read a stale value. As such,
    * we cache flush before and then let coherency work its magic. Without
    * this barrier, we get flakes in
    *
    * dEQP-VK.query_pool.occlusion_query.get_results_conservative_size_64_wait_query_without_availability_draw_triangles_discard
    */
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   hk_cdm_cache_flush(dev, cs);

   struct hk_shader *s = hk_meta_kernel(dev, hk_nir_write_u32, NULL, 0);
   struct hk_write_params params = {.address = address, .value = value};
   uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &params, sizeof(params));

   hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(1, 1, 1), hk_grid(1, 1, 1));
}

/**
 * Goes through a series of consecutive query indices in the given pool,
 * zeroing all report values and writing their availability status to
 * set_available.
 */
static void
emit_zero_queries(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries,
                  bool set_available)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);

   for (uint32_t i = 0; i < num_queries; i++) {
      uint64_t available = hk_query_available_addr(pool, first_index + i);
      uint64_t report = hk_query_report_addr(dev, pool, first_index + i);
      hk_queue_write(cmd, available, set_available, false);

      /* XXX: is this supposed to happen on the begin? */
      for (unsigned j = 0; j < hk_reports_per_query(pool); ++j) {
         hk_queue_write(cmd, report + (j * sizeof(struct hk_query_report)), 0,
                        false);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_ResetQueryPool(VkDevice device, VkQueryPool queryPool, uint32_t firstQuery,
                  uint32_t queryCount)
{
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
   VK_FROM_HANDLE(hk_device, dev, device);

   uint32_t *available = hk_query_available_map(pool, firstQuery);
   struct hk_query_report *reports = hk_query_report_map(dev, pool, firstQuery);

   memset(available, 0, queryCount * sizeof(*available));
   memset(reports, 0, queryCount * pool->query_stride);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdResetQueryPool(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
                     uint32_t firstQuery, uint32_t queryCount)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);

   emit_zero_queries(cmd, pool, firstQuery, queryCount, false);
}

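/*
 * Timestamps are not implemented yet. The disabled block below is reference
 * code written against NV9097 push methods (NVK-style), which cannot run on
 * AGX as-is.
 */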
VKAPI_ATTR void VKAPI_CALL
hk_CmdWriteTimestamp2(VkCommandBuffer commandBuffer,
                      VkPipelineStageFlags2 stage, VkQueryPool queryPool,
                      uint32_t query)
{
   unreachable("todo");
#if 0
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);

   struct nv_push *p = hk_cmd_buffer_push(cmd, 10);

   uint64_t report_addr = hk_query_report_addr(pool, query);
   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
   P_NV9097_SET_REPORT_SEMAPHORE_A(p, report_addr >> 32);
   P_NV9097_SET_REPORT_SEMAPHORE_B(p, report_addr);
   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 0);
   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
      .operation = OPERATION_REPORT_ONLY,
      .pipeline_location = vk_stage_flags_to_nv9097_pipeline_location(stage),
      .structure_size = STRUCTURE_SIZE_FOUR_WORDS,
   });

   uint64_t available_addr = hk_query_available_addr(pool, query);
   P_MTHD(p, NV9097, SET_REPORT_SEMAPHORE_A);
   P_NV9097_SET_REPORT_SEMAPHORE_A(p, available_addr >> 32);
   P_NV9097_SET_REPORT_SEMAPHORE_B(p, available_addr);
   P_NV9097_SET_REPORT_SEMAPHORE_C(p, 1);
   P_NV9097_SET_REPORT_SEMAPHORE_D(p, {
      .operation = OPERATION_RELEASE,
      .release = RELEASE_AFTER_ALL_PRECEEDING_WRITES_COMPLETE,
      .pipeline_location = PIPELINE_LOCATION_ALL,
      .structure_size = STRUCTURE_SIZE_ONE_WORD,
   });

   /* From the Vulkan spec:
    *
    *   "If vkCmdWriteTimestamp2 is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:"
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries, as described in the first behavior listed in the
    * Vulkan spec:
    *
    *   "The first query is a timestamp value and (if more than one bit is set
    *   in the view mask) zero is written to the remaining queries."
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true);
   }
#endif
}

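/*
 * Common implementation of vkCmdBeginQuery and vkCmdEndQuery (indexed).
 * Occlusion queries select the visibility mode and heap index used by
 * subsequent draws; transform feedback and pipeline statistics queries point
 * the GPU-written counters at the query's report. Ending a query also queues
 * the availability write.
 */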
static void
hk_cmd_begin_end_query(struct hk_cmd_buffer *cmd, struct hk_query_pool *pool,
                       uint32_t query, uint32_t index,
                       VkQueryControlFlags flags, bool end)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   bool graphics = false;

   switch (pool->vk.query_type) {
   case VK_QUERY_TYPE_OCCLUSION: {
      assert(query < pool->oq_queries);

      if (end) {
         cmd->state.gfx.occlusion.mode = AGX_VISIBILITY_MODE_NONE;
      } else {
         cmd->state.gfx.occlusion.mode = flags & VK_QUERY_CONTROL_PRECISE_BIT
                                            ? AGX_VISIBILITY_MODE_COUNTING
                                            : AGX_VISIBILITY_MODE_BOOLEAN;
      }

      uint16_t *oq_index = hk_pool_oq_index_ptr(pool);
      cmd->state.gfx.occlusion.index = oq_index[query];
      cmd->state.gfx.dirty |= HK_DIRTY_OCCLUSION;
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      uint64_t addr = hk_query_report_addr(dev, pool, query);
      cmd->state.gfx.xfb_query[index] = end ? 0 : addr;
      break;
   }

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root;
      cmd->state.gfx.descriptors.root_dirty = true;

      root->draw.pipeline_stats = hk_query_report_addr(dev, pool, query);
      root->draw.pipeline_stats_flags = pool->vk.pipeline_statistics;

      /* XXX: I don't think this is correct... when does the query become
       * available exactly?
       */
      graphics = pool->vk.pipeline_statistics &
                 ~VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT;
      break;
   }

   default:
      unreachable("Unsupported query type");
   }

   /* We need to set available=1 after the graphics work finishes. */
   if (end) {
      hk_queue_write(cmd, hk_query_available_addr(pool, query), 1, graphics);
   }
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
                           uint32_t query, VkQueryControlFlags flags,
                           uint32_t index)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);

   hk_cmd_begin_end_query(cmd, pool, query, index, flags, false);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
                         uint32_t query, uint32_t index)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);

   hk_cmd_begin_end_query(cmd, pool, query, index, 0, true);

   /* From the Vulkan spec:
    *
    *   "If queries are used while executing a render pass instance that has
    *    multiview enabled, the query uses N consecutive query indices in
    *    the query pool (starting at query) where N is the number of bits set
    *    in the view mask in the subpass the query is used in. How the
    *    numerical results of the query are distributed among the queries is
    *    implementation-dependent."
    *
    * In our case, only the first query is used, so we emit zeros for the
    * remaining queries.
    */
   if (cmd->state.gfx.render.view_mask != 0) {
      const uint32_t num_queries =
         util_bitcount(cmd->state.gfx.render.view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd, pool, query + 1, num_queries - 1, true);
   }
}

static bool
hk_query_is_available(struct hk_query_pool *pool, uint32_t query)
{
   uint32_t *available = hk_query_available_map(pool, query);
   return p_atomic_read(available) != 0;
}

#define HK_QUERY_TIMEOUT 2000000000ull

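/*
 * Poll availability from the CPU, giving up after HK_QUERY_TIMEOUT
 * nanoseconds (2 s). Device status is rechecked on every iteration so a lost
 * device is reported promptly; hitting the timeout marks the device lost.
 */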
static VkResult
hk_query_wait_for_available(struct hk_device *dev, struct hk_query_pool *pool,
                            uint32_t query)
{
   uint64_t abs_timeout_ns = os_time_get_absolute_timeout(HK_QUERY_TIMEOUT);

   while (os_time_get_nano() < abs_timeout_ns) {
      if (hk_query_is_available(pool, query))
         return VK_SUCCESS;

      VkResult status = vk_device_check_status(&dev->vk);
      if (status != VK_SUCCESS)
         return status;
   }

   return vk_device_set_lost(&dev->vk, "query timeout");
}

static void
cpu_write_query_result(void *dst, uint32_t idx, VkQueryResultFlags flags,
                       uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst;
      dst64[idx] = result;
   } else {
      uint32_t *dst32 = dst;
      dst32[idx] = result;
   }
}

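/*
 * CPU readback path: reports are read directly from the mapped pool BO (or
 * the occlusion heap). WAIT polls availability with a timeout, PARTIAL writes
 * whatever has landed so far, and WITH_AVAILABILITY appends the availability
 * value as one extra element per query.
 */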
VKAPI_ATTR VkResult VKAPI_CALL
hk_GetQueryPoolResults(VkDevice device, VkQueryPool queryPool,
                       uint32_t firstQuery, uint32_t queryCount,
                       size_t dataSize, void *pData, VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(hk_device, dev, device);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);

   if (vk_device_is_lost(&dev->vk))
      return VK_ERROR_DEVICE_LOST;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      const uint32_t query = firstQuery + i;

      bool available = hk_query_is_available(pool, query);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = hk_query_wait_for_available(dev, pool, query);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      const struct hk_query_report *src = hk_query_report_map(dev, pool, query);
      assert(i * stride < dataSize);
      void *dst = (char *)pData + i * stride;

      uint32_t reports = hk_reports_per_query(pool);
      if (write_results) {
         for (uint32_t j = 0; j < reports; j++) {
            cpu_write_query_result(dst, j, flags, src[j].value);
         }
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(dst, reports, flags, available);
   }

   return status;
}

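/*
 * vkCmdCopyQueryPoolResults runs on the GPU: a meta kernel is dispatched with
 * one workgroup per query, and each invocation calls the libagx_copy_query
 * routine with the address of the push struct loaded from the preamble.
 */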
static void
hk_nir_copy_query(nir_builder *b, UNUSED const void *key)
{
   nir_def *id = nir_channel(b, nir_load_workgroup_id(b), 0);
   libagx_copy_query(b, nir_load_preamble(b, 1, 64), id);
}

VKAPI_ATTR void VKAPI_CALL
hk_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer, VkQueryPool queryPool,
                           uint32_t firstQuery, uint32_t queryCount,
                           VkBuffer dstBuffer, VkDeviceSize dstOffset,
                           VkDeviceSize stride, VkQueryResultFlags flags)
{
   VK_FROM_HANDLE(hk_cmd_buffer, cmd, commandBuffer);
   VK_FROM_HANDLE(hk_query_pool, pool, queryPool);
   VK_FROM_HANDLE(hk_buffer, dst_buffer, dstBuffer);

   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   struct hk_cs *cs = hk_cmd_buffer_get_cs(cmd, true);
   if (!cs)
      return;

   hk_ensure_cs_has_space(cmd, cs, 0x2000 /* TODO */);

   const struct libagx_copy_query_push info = {
      .availability = pool->bo->va->addr,
      .results = pool->oq_queries ? dev->occlusion_queries.bo->va->addr
                                  : pool->bo->va->addr + pool->query_start,
      .oq_index = pool->oq_queries ? pool->bo->va->addr + pool->query_start : 0,

      .first_query = firstQuery,
      .dst_addr = hk_buffer_address(dst_buffer, dstOffset),
      .dst_stride = stride,
      .reports_per_query = hk_reports_per_query(pool),

      .partial = flags & VK_QUERY_RESULT_PARTIAL_BIT,
      ._64 = flags & VK_QUERY_RESULT_64_BIT,
      .with_availability = flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT,
   };

   uint64_t push = hk_pool_upload(cmd, &info, sizeof(info), 8);

   struct hk_shader *s = hk_meta_kernel(dev, hk_nir_copy_query, NULL, 0);
   uint32_t usc = hk_upload_usc_words_kernel(cmd, s, &push, sizeof(push));
   hk_dispatch_with_usc(dev, cs, s, usc, hk_grid(queryCount, 1, 1),
                        hk_grid(1, 1, 1));
}
581