/*
 * Copyright 2024 Valve Corporation
 * Copyright 2024 Alyssa Rosenzweig
 * Copyright 2022-2023 Collabora Ltd. and Red Hat Inc.
 * SPDX-License-Identifier: MIT
 */

#pragma once

#include "util/macros.h"

#include "util/list.h"
#include "agx_helpers.h"
#include "agx_linker.h"
#include "agx_pack.h"
#include "agx_tilebuffer.h"
#include "agx_uvs.h"
#include "pool.h"
#include "shader_enums.h"

#include "hk_private.h"
#include "hk_shader.h"

#include "hk_cmd_pool.h"
#include "hk_descriptor_set.h"

#include "asahi/lib/agx_nir_lower_vbo.h"
#include "util/u_dynarray.h"
#include "vulkan/vulkan_core.h"

#include "vk_command_buffer.h"

#include <stdio.h>

struct hk_buffer;
struct hk_cmd_bo;
struct hk_cmd_pool;
struct hk_image_view;
struct hk_push_descriptor_set;
struct hk_shader;
struct hk_linked_shader;
struct agx_usc_builder;
struct vk_shader;

/** Root descriptor table. */
struct hk_root_descriptor_table {
   uint64_t root_desc_addr;

   union {
      struct {
         uint32_t view_index;
         uint32_t ppp_multisamplectl;

         /* Vertex input state */
         uint64_t attrib_base[AGX_MAX_VBUFS];
         uint32_t attrib_clamps[AGX_MAX_VBUFS];

         /* Pointer to the VS->TCS, VS->GS, or TES->GS buffer. */
         uint64_t vertex_output_buffer;

         /* Mask of outputs flowing VS->TCS, VS->GS, or TES->GS. */
         uint64_t vertex_outputs;

         /* Address of input assembly buffer if geom/tess is used, else 0 */
         uint64_t input_assembly;

         /* Address of tessellation param buffer if tessellation used, else 0 */
         uint64_t tess_params;

         /* Address of geometry param buffer if GS is used, else 0 */
         uint64_t geometry_params;

         /* Pipeline statistics queries. This is a base address with flags. */
         uint64_t pipeline_stats;
         VkQueryPipelineStatisticFlags pipeline_stats_flags;

         float blend_constant[4];
         uint16_t no_epilog_discard;
         uint16_t _pad1;
         uint16_t api_sample_mask;
         uint16_t _pad2;
         uint16_t force_never_in_shader;
         uint16_t _pad3;
         uint16_t provoking;
         uint16_t _pad4;

         /* Mapping from varying slots written by the last vertex stage to UVS
          * indices. This mapping must be compatible with the fragment shader.
          */
         uint8_t uvs_index[VARYING_SLOT_MAX];
      } draw;
      struct {
         uint64_t group_count_addr;
         uint32_t base_group[3];
      } cs;
   };

   /* Client push constants */
   uint8_t push[HK_MAX_PUSH_SIZE];

   /* Descriptor set base addresses */
   uint64_t sets[HK_MAX_SETS];

   /* Dynamic buffer bindings */
   struct hk_buffer_address dynamic_buffers[HK_MAX_DYNAMIC_BUFFERS];

   /* Start index in dynamic_buffers where each set starts */
   uint8_t set_dynamic_buffer_start[HK_MAX_SETS];
};

/* helper macro for computing root descriptor byte offsets */
#define hk_root_descriptor_offset(member)                                      \
   offsetof(struct hk_root_descriptor_table, member)
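/* Example (illustrative): hk_root_descriptor_offset(push) and
 * hk_root_descriptor_offset(draw.blend_constant) give the byte offsets of
 * those members within the root table.
 */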

struct hk_descriptor_state {
   bool root_dirty;
   struct hk_root_descriptor_table root;

   uint32_t set_sizes[HK_MAX_SETS];
   struct hk_descriptor_set *sets[HK_MAX_SETS];
   uint32_t sets_dirty;

   struct hk_push_descriptor_set *push[HK_MAX_SETS];
   uint32_t push_dirty;
};

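/* A single attachment of the current rendering (render pass instance), along
 * with its optional resolve target.
 */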
struct hk_attachment {
   VkFormat vk_format;
   struct hk_image_view *iview;

   VkResolveModeFlagBits resolve_mode;
   struct hk_image_view *resolve_iview;
};

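/* USC address and packed counts for a background (bg) or end-of-tile (eot)
 * program; hk_render_registers keeps a main and a partial-render variant of
 * each.
 */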
struct hk_bg_eot {
   uint64_t usc;
   struct agx_counts_packed counts;
};

struct hk_render_registers {
   uint32_t width, height, layers;
   uint32_t isp_bgobjdepth;
   uint32_t isp_bgobjvals;
   struct agx_zls_control_packed zls_control, zls_control_partial;
   uint32_t iogpu_unk_214;
   uint32_t depth_dimensions;

   struct {
      uint32_t dimensions;
      uint64_t buffer, meta;
      uint32_t stride, meta_stride;
   } depth;

   struct {
      uint64_t buffer, meta;
      uint32_t stride, meta_stride;
   } stencil;

   struct {
      struct hk_bg_eot main;
      struct hk_bg_eot partial;
   } bg;

   struct {
      struct hk_bg_eot main;
      struct hk_bg_eot partial;
   } eot;
};

struct hk_rendering_state {
   VkRenderingFlagBits flags;

   VkRect2D area;
   uint32_t layer_count;
   uint32_t view_mask;

   uint32_t color_att_count;
   struct hk_attachment color_att[HK_MAX_RTS];
   struct hk_attachment depth_att;
   struct hk_attachment stencil_att;

   struct agx_tilebuffer_layout tilebuffer;
   struct hk_render_registers cr;
};

struct hk_index_buffer_state {
   struct hk_addr_range buffer;
   enum agx_index_size size;
   uint32_t restart;
};

/* Dirty tracking bits for state not tracked by vk_dynamic_graphics_state or
 * shaders_dirty.
 */
enum hk_dirty {
   HK_DIRTY_INDEX = BITFIELD_BIT(0),
   HK_DIRTY_VB = BITFIELD_BIT(1),
   HK_DIRTY_OCCLUSION = BITFIELD_BIT(2),
   HK_DIRTY_PROVOKING = BITFIELD_BIT(3),
   HK_DIRTY_VARYINGS = BITFIELD_BIT(4),
};

struct hk_graphics_state {
   struct hk_rendering_state render;
   struct hk_descriptor_state descriptors;

   enum hk_dirty dirty;

   uint64_t root;
   uint64_t draw_params;
   uint64_t draw_id_ptr;

   uint32_t shaders_dirty;
   struct hk_api_shader *shaders[MESA_SHADER_MESH + 1];

   /* Vertex buffers */
   struct hk_addr_range vb[AGX_MAX_VBUFS];

   /* Transform feedback buffers */
   struct hk_addr_range xfb[4];

   /* Is transform feedback enabled? */
   bool xfb_enabled;

   /* Internal transform feedback offset vec4.
    *
    * TODO: Strictly could be global.
    */
   uint64_t xfb_offsets;

   /* Pointer to the GPU memory backing active transform feedback queries,
    * per-stream. Zero if no query is bound.
    */
   uint64_t xfb_query[4];

   struct hk_index_buffer_state index;
   enum agx_primitive topology;
   enum agx_object_type object_type;

   /* Provoking vertex 0, 1, or 2. Usually 0 or 2 for FIRST/LAST. 1 can only be
    * set for tri fans.
    */
   uint8_t provoking;

   struct {
      enum agx_visibility_mode mode;

      /* If enabled, index of the current occlusion query in the occlusion heap.
       * There can only be one active at a time (hardware constraint).
       */
      uint16_t index;
   } occlusion;

   /* Fast linked shader data structures */
   uint64_t varyings;
   struct agx_varyings_vs linked_varyings;

   uint32_t linked_dirty;
   struct hk_linked_shader *linked[PIPE_SHADER_TYPES];
   bool generate_primitive_id;

   /* Tessellation state */
   uint64_t tess_out_draws;

   /* Needed by vk_command_buffer::dynamic_graphics_state */
   struct vk_vertex_input_state _dynamic_vi;
   struct vk_sample_locations_state _dynamic_sl;
};

struct hk_compute_state {
   struct hk_descriptor_state descriptors;
   struct hk_api_shader *shader;
};

struct hk_cmd_push {
   void *map;
   uint64_t addr;
   uint32_t range;
   bool no_prefetch;
};

struct hk_scratch_req {
   bool main;
   bool preamble;
};

/*
 * hk_cs represents a single control stream, to be enqueued to either the CDM
 * or the VDM, for compute and 3D respectively.
 */
enum hk_cs_type {
   HK_CS_CDM,
   HK_CS_VDM,
};

struct hk_cs {
   struct list_head node;

   /* Data master */
   enum hk_cs_type type;

   /* Address of the root control stream for the job */
   uint64_t addr;

   /* Start pointer of the root control stream */
   void *start;

   /* Current pointer within the control stream */
   void *current;

   /* End pointer of the current chunk of the control stream */
   void *end;

   /* Whether there is more than just the root chunk */
   bool stream_linked;

   /* Scratch requirements */
   struct {
      union {
         struct hk_scratch_req vs;
         struct hk_scratch_req cs;
      };

      struct hk_scratch_req fs;
   } scratch;

   /* Remaining state is for graphics only, ignored for compute */
   struct agx_tilebuffer_layout tib;

   struct util_dynarray scissor, depth_bias;
   uint64_t uploaded_scissor, uploaded_zbias;

   /* We can only set ppp_multisamplectl once per batch. has_sample_locations
    * tracks whether we've committed to a set of sample locations yet. vk_meta
    * operations do not set has_sample_locations since they don't care and it
    * would interfere with the app-provided samples.
    */
   bool has_sample_locations;
   uint32_t ppp_multisamplectl;

   struct hk_render_registers cr;
};

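/* Simple suballocator over a list of hk_cmd_bo (current mapping, base address,
 * and offset) for transient data owned by the command buffer. hk_cmd_buffer
 * keeps one uploader for general data and one for USC words.
 */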
struct hk_uploader {
   /** List of hk_cmd_bo */
   struct list_head bos;

   /* Current addresses */
   uint8_t *map;
   uint64_t base;
   uint32_t offset;
};

struct hk_cmd_buffer {
   struct vk_command_buffer vk;

   struct {
      struct hk_graphics_state gfx;
      struct hk_compute_state cs;
   } state;

   struct {
      struct hk_uploader main, usc;
   } uploader;

   /* List of all recorded control streams */
   struct list_head control_streams;

   /* Current recorded control stream */
   struct {
      /* VDM stream for 3D */
      struct hk_cs *gfx;

      /* CDM stream for compute */
      struct hk_cs *cs;

      /* CDM stream that executes immediately before the current graphics
       * control stream. Used for geometry shading, tessellation, etc.
       */
      struct hk_cs *pre_gfx;

      /* CDM stream that will execute after the current graphics control stream
       * finishes. Used for queries.
       */
      struct hk_cs *post_gfx;
   } current_cs;

   /* Are we currently inside a vk_meta operation? This alters sample location
    * behaviour.
    */
   bool in_meta;

   /* XXX: move me?
    *
    * Indirect draw generated by the pre-GS for the geometry shader.
    */
   uint64_t geom_indirect;

   /* Does the command buffer use the geometry heap? */
   bool uses_heap;

   /* Owned large BOs */
   struct util_dynarray large_bos;
};

VK_DEFINE_HANDLE_CASTS(hk_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

extern const struct vk_command_buffer_ops hk_cmd_buffer_ops;

static inline struct hk_device *
hk_cmd_buffer_device(struct hk_cmd_buffer *cmd)
{
   return (struct hk_device *)cmd->vk.base.device;
}

static inline struct hk_cmd_pool *
hk_cmd_buffer_pool(struct hk_cmd_buffer *cmd)
{
   return (struct hk_cmd_pool *)cmd->vk.pool;
}

/*
 * The hardware vertex shader is supplied by the last enabled geometry stage.
 * The geometry pipeline is vertex->tess->geometry, so we search backwards.
 */
static inline struct hk_shader *
hk_bound_hw_vs(struct hk_graphics_state *gfx)
{
   struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX];
   struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];
   struct hk_api_shader *gs = gfx->shaders[MESA_SHADER_GEOMETRY];

   if (gs)
      return &gs->variants[HK_GS_VARIANT_RAST];
   else if (tes)
      return &tes->variants[HK_VS_VARIANT_HW];
   else
      return &vs->variants[HK_VS_VARIANT_HW];
}

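/* The shader bound for the API vertex stage: the hardware vertex shader when
 * the VS is the last geometry stage, otherwise the software variant that feeds
 * tessellation or geometry shading.
 */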
static inline struct hk_shader *
hk_bound_sw_vs(struct hk_graphics_state *gfx)
{
   struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX];
   struct hk_shader *hw_vs = hk_bound_hw_vs(gfx);

   if (hw_vs == &vs->variants[HK_VS_VARIANT_HW])
      return hw_vs;
   else
      return &vs->variants[HK_VS_VARIANT_SW];
}

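/* The software variant of the stage feeding the geometry shader: the TES if
 * tessellation is enabled, otherwise the VS.
 */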
static inline struct hk_shader *
hk_bound_sw_vs_before_gs(struct hk_graphics_state *gfx)
{
   struct hk_api_shader *vs = gfx->shaders[MESA_SHADER_VERTEX];
   struct hk_api_shader *tes = gfx->shaders[MESA_SHADER_TESS_EVAL];
   struct hk_api_shader *api = tes ?: vs;

   return &api->variants[HK_VS_VARIANT_SW];
}

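/* Transient GPU memory is suballocated from the command buffer's uploaders:
 * hk_pool_alloc from the general pool, hk_pool_usc_alloc from the USC pool,
 * and hk_pool_upload allocates from the general pool, copies the given data
 * into it, and returns the GPU address.
 */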
struct agx_ptr hk_pool_alloc_internal(struct hk_cmd_buffer *cmd, uint32_t size,
                                      uint32_t alignment, bool usc);

uint64_t hk_pool_upload(struct hk_cmd_buffer *cmd, const void *data,
                        uint32_t size, uint32_t alignment);

static inline struct agx_ptr
hk_pool_alloc(struct hk_cmd_buffer *cmd, uint32_t size, uint32_t alignment)
{
   return hk_pool_alloc_internal(cmd, size, alignment, false);
}

static inline struct agx_ptr
hk_pool_usc_alloc(struct hk_cmd_buffer *cmd, uint32_t size, uint32_t alignment)
{
   return hk_pool_alloc_internal(cmd, size, alignment, true);
}

void hk_cs_init_graphics(struct hk_cmd_buffer *cmd, struct hk_cs *cs);
uint32_t hk_default_sample_positions(unsigned nr_samples);

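/* Get the current control stream of the requested kind, allocating a new root
 * chunk and hk_cs on first use. Returns NULL if allocation fails.
 */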
static inline struct hk_cs *
hk_cmd_buffer_get_cs_general(struct hk_cmd_buffer *cmd, struct hk_cs **ptr,
                             bool compute)
{
   if ((*ptr) == NULL) {
      /* Allocate root control stream */
      size_t initial_size = 65536;
      struct agx_ptr root = hk_pool_alloc(cmd, initial_size, 1024);
      if (!root.cpu)
         return NULL;

      /* Allocate hk_cs for the new stream */
      struct hk_cs *cs = malloc(sizeof(*cs));
      if (!cs)
         return NULL;

      *cs = (struct hk_cs){
         .type = compute ? HK_CS_CDM : HK_CS_VDM,
         .addr = root.gpu,
         .start = root.cpu,
         .current = root.cpu,
         .end = root.cpu + initial_size,
      };

      list_inithead(&cs->node);

      bool before_gfx = (ptr == &cmd->current_cs.pre_gfx);

      /* Insert into the command buffer. We usually append to the end of the
       * command buffer, except for pre-graphics streams which go right before
       * the graphics workload. (This implies a level of out-of-order processing
       * that's allowed by Vulkan and required for efficient
       * geometry/tessellation shaders.)
       */
      if (before_gfx && cmd->current_cs.gfx) {
         list_addtail(&cs->node, &cmd->current_cs.gfx->node);
      } else {
         list_addtail(&cs->node, &cmd->control_streams);
      }

      *ptr = cs;

      if (!compute)
         hk_cs_init_graphics(cmd, cs);
   }

   assert(*ptr != NULL);
   return *ptr;
}

static inline struct hk_cs *
hk_cmd_buffer_get_cs(struct hk_cmd_buffer *cmd, bool compute)
{
   struct hk_cs **ptr = compute ? &cmd->current_cs.cs : &cmd->current_cs.gfx;
   return hk_cmd_buffer_get_cs_general(cmd, ptr, compute);
}

void hk_ensure_cs_has_space(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                            size_t space);

static inline void
hk_cmd_buffer_dirty_all(struct hk_cmd_buffer *cmd)
{
   struct vk_dynamic_graphics_state *dyn = &cmd->vk.dynamic_graphics_state;
   struct hk_graphics_state *gfx = &cmd->state.gfx;

   vk_dynamic_graphics_state_dirty_all(dyn);
   gfx->dirty = ~0;
   gfx->shaders_dirty = ~0;
   gfx->linked_dirty = ~0;
   gfx->descriptors.root_dirty = true;
}

static inline void
hk_cs_destroy(struct hk_cs *cs)
{
   if (cs->type == HK_CS_VDM) {
      util_dynarray_fini(&cs->scissor);
      util_dynarray_fini(&cs->depth_bias);
   }

   free(cs);
}

static inline void
hk_cmd_buffer_end_compute_internal(struct hk_cs **ptr)
{
   if (*ptr) {
      struct hk_cs *cs = *ptr;
      void *map = cs->current;
      agx_push(map, CDM_STREAM_TERMINATE, _)
         ;

      cs->current = map;
   }

   *ptr = NULL;
}

static inline void
hk_cmd_buffer_end_compute(struct hk_cmd_buffer *cmd)
{
   hk_cmd_buffer_end_compute_internal(&cmd->current_cs.cs);
}

static inline void
hk_cmd_buffer_end_graphics(struct hk_cmd_buffer *cmd)
{
   struct hk_cs *cs = cmd->current_cs.gfx;

   if (cs) {
      void *map = cs->current;
      agx_push(map, VDM_STREAM_TERMINATE, _)
         ;

      /* Scissor and depth bias arrays are staged in dynamic arrays on the CPU.
       * When we end the control stream, they're done growing and are ready for
       * upload.
       */
      cs->uploaded_scissor =
         hk_pool_upload(cmd, cs->scissor.data, cs->scissor.size, 64);

      cs->uploaded_zbias =
         hk_pool_upload(cmd, cs->depth_bias.data, cs->depth_bias.size, 64);

      /* TODO: maybe free scissor/depth_bias now? */

      cmd->current_cs.gfx->current = map;
      cmd->current_cs.gfx = NULL;
      hk_cmd_buffer_end_compute_internal(&cmd->current_cs.pre_gfx);
      hk_cmd_buffer_end_compute_internal(&cmd->current_cs.post_gfx);
   }

   assert(cmd->current_cs.gfx == NULL);

   /* We just flushed out the heap use. If we want to use it again, we'll need
    * to queue a free for it again.
    */
   cmd->uses_heap = false;
}

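/* Return the GPU address of the result slot for a single pipeline statistic in
 * the bound query, or 0 if that statistic is not currently being collected.
 */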
static inline uint64_t
hk_pipeline_stat_addr(struct hk_cmd_buffer *cmd,
                      VkQueryPipelineStatisticFlagBits stat)
{
   struct hk_root_descriptor_table *root = &cmd->state.gfx.descriptors.root;
   VkQueryPipelineStatisticFlags flags = root->draw.pipeline_stats_flags;

   if (flags & stat) {
      assert(!cmd->in_meta && "queries paused for meta");
      assert(util_bitcount(stat) == 1 && "by construction");

      /* Prefix sum to determine the compacted index in the query pool */
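      /* e.g. with flags = IA_VERTICES | VS_INVOCATIONS | CS_INVOCATIONS and
       * stat = VS_INVOCATIONS, util_bitcount(flags & (stat - 1)) = 1,
       * selecting the second 64-bit slot after the base address.
       */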
      uint32_t index = util_bitcount(flags & (stat - 1));

      return root->draw.pipeline_stats + (sizeof(uint64_t) * index);
   } else {
      /* Query disabled */
      return 0;
   }
}

void hk_cmd_buffer_begin_graphics(struct hk_cmd_buffer *cmd,
                                  const VkCommandBufferBeginInfo *pBeginInfo);
void hk_cmd_buffer_begin_compute(struct hk_cmd_buffer *cmd,
                                 const VkCommandBufferBeginInfo *pBeginInfo);

void hk_cmd_invalidate_graphics_state(struct hk_cmd_buffer *cmd);
void hk_cmd_invalidate_compute_state(struct hk_cmd_buffer *cmd);

void hk_cmd_bind_shaders(struct vk_command_buffer *vk_cmd, uint32_t stage_count,
                         const gl_shader_stage *stages,
                         struct vk_shader **const shaders);

void hk_cmd_bind_graphics_shader(struct hk_cmd_buffer *cmd,
                                 const gl_shader_stage stage,
                                 struct hk_api_shader *shader);

void hk_cmd_bind_compute_shader(struct hk_cmd_buffer *cmd,
                                struct hk_api_shader *shader);

void hk_cmd_bind_vertex_buffer(struct hk_cmd_buffer *cmd, uint32_t vb_idx,
                               struct hk_addr_range addr_range);

static inline struct hk_descriptor_state *
hk_get_descriptors_state(struct hk_cmd_buffer *cmd,
                         VkPipelineBindPoint bind_point)
{
   switch (bind_point) {
   case VK_PIPELINE_BIND_POINT_GRAPHICS:
      return &cmd->state.gfx.descriptors;
   case VK_PIPELINE_BIND_POINT_COMPUTE:
      return &cmd->state.cs.descriptors;
   default:
      unreachable("Unhandled bind point");
   }
}

void hk_cmd_flush_wait_dep(struct hk_cmd_buffer *cmd,
                           const VkDependencyInfo *dep, bool wait);

void hk_cmd_invalidate_deps(struct hk_cmd_buffer *cmd, uint32_t dep_count,
                            const VkDependencyInfo *deps);

void hk_cmd_buffer_flush_push_descriptors(struct hk_cmd_buffer *cmd,
                                          struct hk_descriptor_state *desc);

void hk_meta_resolve_rendering(struct hk_cmd_buffer *cmd,
                               const VkRenderingInfo *pRenderingInfo);

uint64_t hk_cmd_buffer_upload_root(struct hk_cmd_buffer *cmd,
                                   VkPipelineBindPoint bind_point);

void hk_reserve_scratch(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                        struct hk_shader *s);

uint32_t hk_upload_usc_words(struct hk_cmd_buffer *cmd, struct hk_shader *s,
                             struct hk_linked_shader *linked);

uint32_t hk_upload_usc_words_kernel(struct hk_cmd_buffer *cmd,
                                    struct hk_shader *s, void *data,
                                    size_t data_size);

void hk_usc_upload_spilled_rt_descs(struct agx_usc_builder *b,
                                    struct hk_cmd_buffer *cmd);

void hk_cdm_cache_flush(struct hk_device *dev, struct hk_cs *cs);

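/* Grid dimensions for a compute dispatch: either direct counts or a pointer to
 * GPU memory holding them (indirect dispatch).
 */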
struct hk_grid {
   bool indirect;
   union {
      uint32_t count[3];
      uint64_t ptr;
   };
};

static inline struct hk_grid
hk_grid(uint32_t x, uint32_t y, uint32_t z)
{
   return (struct hk_grid){.indirect = false, .count = {x, y, z}};
}

static inline struct hk_grid
hk_grid_indirect(uint64_t ptr)
{
   return (struct hk_grid){.indirect = true, .ptr = ptr};
}

void hk_dispatch_with_usc(struct hk_device *dev, struct hk_cs *cs,
                          struct hk_shader *s, uint32_t usc,
                          struct hk_grid grid, struct hk_grid local_size);

static inline void
hk_dispatch_with_local_size(struct hk_cmd_buffer *cmd, struct hk_cs *cs,
                            struct hk_shader *s, struct hk_grid grid,
                            struct hk_grid local_size)
{
   struct hk_device *dev = hk_cmd_buffer_device(cmd);
   uint32_t usc = hk_upload_usc_words(cmd, s, s->only_linked);

   hk_reserve_scratch(cmd, cs, s);
   hk_dispatch_with_usc(dev, cs, s, usc, grid, local_size);
}

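/* Convenience wrapper: takes the grid in workgroups and the shader's own local
 * size, scaling direct grids to thread counts before dispatch; indirect grids
 * are passed through unscaled.
 */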
static inline void
hk_dispatch(struct hk_cmd_buffer *cmd, struct hk_cs *cs, struct hk_shader *s,
            struct hk_grid grid)
{
   assert(s->info.stage == MESA_SHADER_COMPUTE);

   struct hk_grid local_size =
      hk_grid(s->info.cs.local_size[0], s->info.cs.local_size[1],
              s->info.cs.local_size[2]);

   if (!grid.indirect) {
      grid.count[0] *= local_size.count[0];
      grid.count[1] *= local_size.count[1];
      grid.count[2] *= local_size.count[2];
   }

   hk_dispatch_with_local_size(cmd, cs, s, grid, local_size);
}

void hk_queue_write(struct hk_cmd_buffer *cmd, uint64_t address, uint32_t value,
                    bool after_gfx);
768