/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_VS,
   TU_DRAW_STATE_VS_BINNING,
   TU_DRAW_STATE_HS,
   TU_DRAW_STATE_DS,
   TU_DRAW_STATE_GS,
   TU_DRAW_STATE_GS_BINNING,
   TU_DRAW_STATE_VPC,
   TU_DRAW_STATE_FS,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_FS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};
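
/* Illustrative sketch (not code from this driver): dynamic-state draw states
 * are laid out contiguously after TU_DRAW_STATE_DYNAMIC, so the group id for
 * the i'th dynamic state would be computed as:
 *
 *    uint32_t group_id = TU_DRAW_STATE_DYNAMIC + i;   // i < TU_DYNAMIC_STATE_COUNT
 *    assert(group_id < TU_DRAW_STATE_COUNT);
 */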

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
   uint64_t set_iova[MAX_SETS];
   uint32_t max_sets_bound;
   uint32_t max_dynamic_offset_size;
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_DESC_SETS = BIT(1),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(2),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(3),
   TU_CMD_DIRTY_LRZ = BIT(4),
   TU_CMD_DIRTY_VS_PARAMS = BIT(5),
   TU_CMD_DIRTY_TESS_PARAMS = BIT(6),
   TU_CMD_DIRTY_SUBPASS = BIT(7),
   TU_CMD_DIRTY_FDM = BIT(8),
   TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
   TU_CMD_DIRTY_TES = BIT(10),
   TU_CMD_DIRTY_PROGRAM = BIT(11),
   TU_CMD_DIRTY_RAST_ORDER = BIT(12),
   TU_CMD_DIRTY_FEEDBACK_LOOPS = BIT(13),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(14)
};
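
/* Illustrative sketch (hypothetical code, not from this driver): state-binding
 * entry points accumulate dirty bits in tu_cmd_state::dirty, and the draw-time
 * code re-emits only the corresponding draw states before clearing them:
 *
 *    cmd->state.dirty |= TU_CMD_DIRTY_VERTEX_BUFFERS;
 *    ...
 *    if (cmd->state.dirty & TU_CMD_DIRTY_VERTEX_BUFFERS)
 *       emit_vertex_buffer_state(cmd);   // hypothetical helper
 *    cmd->state.dirty &= ~TU_CMD_DIRTY_VERTEX_BUFFERS;
 */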

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache, or UCHE, which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_NONE = 0,
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   /* Descriptors are read through UCHE but are also prefetched via
    * CP_LOAD_STATE6, and the prefetched descriptors need to be invalidated
    * when they change.
    */
   TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,

   /* A write to a GMEM attachment made by CP_EVENT_WRITE::BLIT. */
   TU_ACCESS_BLIT_WRITE_GMEM = 1 << 14,

   /* Similar to UCHE_READ, but specifically for GMEM attachment reads. */
   TU_ACCESS_UCHE_READ_GMEM = 1 << 15,

   /* The CCHE is a write-through cache which sits behind UCHE, with multiple
    * incoherent copies. Because it's write-through we only have to worry
    * about invalidating it for reads. It's invalidated by "ccinv" in the
    * shader and CP_CCHE_INVALIDATE in the command stream.
    */
   TU_ACCESS_CCHE_READ = 1 << 16,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ |
      TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
      TU_ACCESS_CCHE_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};

/* From the driver's point of view, we only need to distinguish between things
 * which won't start until a WFI is complete and things which additionally
 * need a WAIT_FOR_ME.
 *
 * TODO: This will get more complicated with concurrent binning.
 */
enum tu_stage {
   /* As a destination stage, this is for operations on the CP which don't
    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
    * As a source stage, it is for things needing no waits.
    */
   TU_STAGE_CP,

   /* This is for most operations, which a WFI will wait on to finish and
    * which will not start until any pending WFIs are finished.
    */
   TU_STAGE_GPU,

   /* This is only used as a destination stage and is for things needing no
    * waits on the GPU (e.g. host operations).
    */
   TU_STAGE_BOTTOM,
};

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_CLEAN = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_CCHE_INVALIDATE = 1 << 6,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 8,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 9,
   TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 10,
   /* This is an unusual flush that isn't automatically executed if pending,
    * as it isn't necessary. Therefore, it's not included in ALL_FLUSH.
    */
   TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,

   TU_CMD_FLAG_ALL_CLEAN =
      TU_CMD_FLAG_CCU_CLEAN_DEPTH |
      TU_CMD_FLAG_CCU_CLEAN_COLOR |
      TU_CMD_FLAG_CACHE_CLEAN |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_FOR_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
      TU_CMD_FLAG_CCHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};
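
/* Illustrative sketch (hypothetical code, not from this driver): accumulated
 * flush bits are recorded in a tu_cache_state and later turned into actual
 * cache maintenance commands by tu_emit_cache_flush<CHIP>() (declared below):
 *
 *    cmd->state.cache.flush_bits |= TU_CMD_FLAG_ALL_INVALIDATE;
 *    tu_emit_cache_flush<CHIP>(cmd);   // from a template <chip CHIP> context
 */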

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * current state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
   /* Pending flushes */
   BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
};
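
/* Illustrative sketch (an assumption about typical usage, not code from this
 * driver): after a write, the bits needed to make it visible are parked in
 * pending_flush_bits; once a read from another cache domain is seen, the
 * relevant bits move into flush_bits and are emitted at the next flush point:
 *
 *    cache->pending_flush_bits |= TU_CMD_FLAG_CCU_CLEAN_COLOR;   // after a color write
 *    ...
 *    cache->flush_bits |= cache->pending_flush_bits & TU_CMD_FLAG_CCU_CLEAN_COLOR;
 *    cache->flush_bits |= TU_CMD_FLAG_CACHE_INVALIDATE;          // before a UCHE read
 */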

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
   uint32_t draw_id;
};

struct tu_tess_params {
   bool valid;
   enum a6xx_tess_output output_upper_left, output_lower_left;
   enum a6xx_tess_spacing spacing;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool has_zpass_done_sample_count_write_in_rp;
   bool disable_gmem;
   bool sysmem_single_prim_mode;
   bool shared_viewport;

   /* Track whether the conditional predicate for COND_REG_EXEC is changed in draw_cs */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according
    * to:
    *
    *    foreach_draw (...) {
    *      sum += pipeline->color_bandwidth_per_sample;
    *      if (depth_test_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (depth_write_enabled)
    *        sum += pipeline->depth_cpp_per_sample;
    *      if (stencil_write_enabled)
    *        sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This does ignore depth buffer traffic for samples which do not
    * pass due to depth-test fail, and some other details.  But it is
    * just intended to be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;

   const char *lrz_disable_reason;
};
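
/* Worked example (illustrative numbers only): with drawcall_count = 100 and
 * drawcall_bandwidth_per_sample_sum = 800, the average bandwidth-per-sample
 * is 800 / 100 = 8, and the total drawcall bandwidth would be estimated as
 * 8 * zpass_sample_count (in the same units as the per-sample cpp values).
 */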

/* These are the states of the suspend/resume state machine. In addition to
 * tracking whether we're in the middle of a chain of suspending and
 * resuming passes that will be merged, we need to track whether the
 * command buffer begins in the middle of such a chain, for when it gets
 * merged with other command buffers. We call such a chain that begins
 * before the command buffer starts a "pre-chain".
 *
 * Note that when this command buffer is finished, this state is untouched
 * but it gains a different meaning. For example, if we finish in state
 * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
 * there's a suspend/resume chain that extends past the end of the command
 * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
 * means that there's a suspend/resume chain that extends before the
 * beginning.
 */
enum tu_suspend_resume_state
{
   /* Either there are no suspend/resume chains, or they are entirely
    * contained in the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       // we are here
    */
   SR_NONE = 0,

   /* We are in the middle of a suspend/resume chain that starts before the
    * current command buffer. This happens when the command buffer begins
    * with a resuming render pass and all of the passes up to the current
    * one are suspending. In this state, our part of the chain is not saved
    * and is in the current draw_cs/state.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_PRE_CHAIN,

   /* We are currently outside of any suspend/resume chains, but there is a
    * chain starting before the current command buffer. It is saved in
    * pre_chain.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       // we are here
    */
   SR_AFTER_PRE_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is no chain
    * starting before the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is another,
    * separate, chain starting before the current command buffer.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *       // This part is stashed in pre_chain
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       BeginRendering(resuming) ... EndRendering() // end of chain
    *       ...
    *       BeginRendering() ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       BeginRendering(resuming) ... EndRendering(suspending)
    *       ...
    *       // we are here
    */
   SR_IN_CHAIN_AFTER_PRE_CHAIN,
};
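
/* Transition summary derived from the comments above, for orientation only;
 * the authoritative logic lives in the command-buffer implementation:
 *
 *    SR_NONE                     --EndRendering(suspending)-->     SR_IN_CHAIN
 *    SR_IN_CHAIN                 --EndRendering(not suspending)--> SR_NONE
 *    (first render pass of the command buffer is resuming)         SR_IN_PRE_CHAIN
 *    SR_IN_PRE_CHAIN             --EndRendering(not suspending)--> SR_AFTER_PRE_CHAIN
 *    SR_AFTER_PRE_CHAIN          --EndRendering(suspending)-->     SR_IN_CHAIN_AFTER_PRE_CHAIN
 *    SR_IN_CHAIN_AFTER_PRE_CHAIN --EndRendering(not suspending)--> SR_AFTER_PRE_CHAIN
 */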

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_shader *shaders[MESA_SHADER_STAGES];

   struct tu_program_state program;

   struct tu_render_pass_state rp;

   struct vk_render_pass_state vk_rp;
   struct vk_vertex_input_state vi;
   struct vk_sample_locations_state sl;

   struct tu_bandwidth bandwidth;

   /* Vertex buffers: the state for these can be updated partially, so we
    * need to save it in order to emit a complete draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
   } vb[MAX_VBS];

   uint32_t max_vbs_bound;

   bool per_view_viewport;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;
   struct tu_draw_state load_state;
   struct tu_draw_state compute_load_state;
   struct tu_draw_state prim_order_gmem;

   struct tu_draw_state vs_params;
   struct tu_draw_state fs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned, there is an extra
    * offset to deal with when it is unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the
    * CCU might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;
   VkClearValue *clear_values;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's dynamic_*
    * or another command buffer if executed on a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;
      VkClearValue *clear_values;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   bool msaa_disable;
   bool blend_reads_dest;
   bool stencil_front_write;
   bool stencil_back_write;
   bool pipeline_sysmem_single_prim_mode;
   bool pipeline_has_tess;
   bool pipeline_disable_gmem;
   bool raster_order_attachment_access;
   bool raster_order_attachment_access_valid;
   VkImageAspectFlags pipeline_feedback_loops;

   bool pipeline_blend_lrz, pipeline_bandwidth;
   uint32_t pipeline_draw_states;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run simultaneously,
    * but they use the same {START,STOP}_PRIMITIVE_CTRS control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   enum tu_suspend_resume_state suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
   bool last_draw_indexed;

   struct tu_tess_params tess_params;

   uint64_t descriptor_buffer_iova[MAX_SETS];
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   VkCommandBufferUsageFlags usage_flags;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1) + 1];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1) + 1];
   VkClearValue dynamic_clear_values[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;

      struct util_dynarray fdm_bin_patchpoints;
      void *patchpoints_ctx;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   bool vsc_initialized;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

extern const struct vk_command_buffer_ops tu_cmd_buffer_ops;

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att,
                          uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
      att->cpp;
}
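
/* Worked example (illustrative numbers only, assuming a hypothetical 96x96
 * tile size and a 4-byte-per-pixel attachment): layer 1 would start at
 * gmem_offset[layout] + 1 * 96 * 96 * 4 = gmem_offset[layout] + 36864 bytes.
 */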

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att,
                                  uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height;
}

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             const VkCommandBufferBeginInfo *pBeginInfo);

template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);
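
/* Illustrative sketch (hypothetical usage, not code from this driver): before
 * recording work that needs the CCU in a particular mode, a caller would
 * switch it explicitly, paying the flush/invalidate + WFI cost noted above:
 *
 *    tu_emit_cache_flush_ccu<CHIP>(cmd, &cmd->cs, TU_CMD_CCU_SYSMEM);
 */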

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);

enum fd_gpu_event : uint32_t;

template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs,
                    enum fd_gpu_event event);

void
tu_flush_for_access(struct tu_cache_state *cache,
                    enum tu_cmd_access_mask src_mask,
                    enum tu_cmd_access_mask dst_mask);
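
/* Illustrative sketch (hypothetical masks, not code from this driver): a
 * dependency between a renderpass color write and a subsequent texture fetch
 * might be expressed as:
 *
 *    tu_flush_for_access(&cmd->state.cache,
 *                        TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
 *                        TU_ACCESS_UCHE_READ);
 *
 * The resulting flush/invalidate bits accumulate in cmd->state.cache and are
 * emitted at the next flush point.
 */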

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}
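
/* Example (follows directly from the definition above): fetch the descriptor
 * state for the graphics bind point.
 *
 *    struct tu_descriptor_state *descriptors =
 *       tu_get_descriptors_state(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS);
 */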

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   bool msaa_disable);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1, uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   void *data,
                                   VkRect2D bin,
                                   unsigned views,
                                   VkExtent2D *frag_areas);

struct tu_fdm_bin_patchpoint {
   uint64_t iova;
   uint32_t size;
   void *data;
   tu_fdm_bin_apply_t apply;
};

void
tu_barrier(struct tu_cmd_buffer *cmd,
           uint32_t dep_count,
           const VkDependencyInfo *dep_info);

template <chip CHIP>
void
tu_write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
               VkPipelineStageFlags2 stageMask, unsigned value);

static inline void
_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              unsigned size,
                              tu_fdm_bin_apply_t apply,
                              void *state,
                              unsigned state_size)
{
   void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
   memcpy(data, state, state_size);
   assert(cs->writeable);
   tu_cs_reserve_space(cs, size);
   struct tu_fdm_bin_patchpoint patch = {
      .iova = tu_cs_get_cur_iova(cs),
      .size = size,
      .data = data,
      .apply = apply,
   };

   /* Apply the "default" setup where there is no scaling. This is used if
    * sysmem is required, and uses up the dwords that have been reserved.
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
   }
   apply(cmd, cs, state, (VkRect2D) {
         { 0, 0 },
         { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
        }, num_views, unscaled_frag_areas);
   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
                        struct tu_fdm_bin_patchpoint,
                        patch);
}

#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
   _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
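
/* Illustrative sketch (hypothetical state struct and callback, not part of
 * this driver): the apply callback emits exactly `size` dwords of
 * bin-dependent state, and the same callback is re-run later to patch the
 * commands for each bin:
 *
 *    struct my_fdm_state { uint32_t foo; };
 *
 *    static void
 *    my_apply(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
 *             VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
 *    {
 *       const struct my_fdm_state *state = (const struct my_fdm_state *) data;
 *       // emit `size` dwords here based on bin/frag_areas/state
 *    }
 *
 *    struct my_fdm_state state = { .foo = 42 };
 *    tu_create_fdm_bin_patchpoint(cmd, cs, 2, my_apply, state);
 */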

#endif /* TU_CMD_BUFFER_H */