/*
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * SPDX-License-Identifier: MIT
 *
 * based in part on anv driver which is:
 * Copyright © 2015 Intel Corporation
 */

#ifndef TU_CMD_BUFFER_H
#define TU_CMD_BUFFER_H

#include "tu_common.h"

#include "tu_cs.h"
#include "tu_descriptor_set.h"
#include "tu_device.h"
#include "tu_lrz.h"
#include "tu_pass.h"
#include "tu_pipeline.h"

enum tu_draw_state_group_id
{
   TU_DRAW_STATE_PROGRAM_CONFIG,
   TU_DRAW_STATE_VS,
   TU_DRAW_STATE_VS_BINNING,
   TU_DRAW_STATE_HS,
   TU_DRAW_STATE_DS,
   TU_DRAW_STATE_GS,
   TU_DRAW_STATE_GS_BINNING,
   TU_DRAW_STATE_VPC,
   TU_DRAW_STATE_FS,
   TU_DRAW_STATE_VB,
   TU_DRAW_STATE_CONST,
   TU_DRAW_STATE_DESC_SETS,
   TU_DRAW_STATE_DESC_SETS_LOAD,
   TU_DRAW_STATE_VS_PARAMS,
   TU_DRAW_STATE_FS_PARAMS,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM,
   TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM,
   TU_DRAW_STATE_LRZ_AND_DEPTH_PLANE,
   TU_DRAW_STATE_PRIM_MODE_GMEM,

   /* dynamic state related draw states */
   TU_DRAW_STATE_DYNAMIC,
   TU_DRAW_STATE_COUNT = TU_DRAW_STATE_DYNAMIC + TU_DYNAMIC_STATE_COUNT,
};

struct tu_descriptor_state
{
   struct tu_descriptor_set *sets[MAX_SETS];
   struct tu_descriptor_set push_set;
   uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
   uint64_t set_iova[MAX_SETS];
   uint32_t max_sets_bound;
   uint32_t max_dynamic_offset_size;
};

enum tu_cmd_dirty_bits
{
   TU_CMD_DIRTY_VERTEX_BUFFERS = BIT(0),
   TU_CMD_DIRTY_DESC_SETS = BIT(1),
   TU_CMD_DIRTY_COMPUTE_DESC_SETS = BIT(2),
   TU_CMD_DIRTY_SHADER_CONSTS = BIT(3),
   TU_CMD_DIRTY_LRZ = BIT(4),
   TU_CMD_DIRTY_VS_PARAMS = BIT(5),
   TU_CMD_DIRTY_TESS_PARAMS = BIT(6),
   TU_CMD_DIRTY_SUBPASS = BIT(7),
   TU_CMD_DIRTY_FDM = BIT(8),
   TU_CMD_DIRTY_PER_VIEW_VIEWPORT = BIT(9),
   TU_CMD_DIRTY_TES = BIT(10),
   TU_CMD_DIRTY_PROGRAM = BIT(11),
   TU_CMD_DIRTY_RAST_ORDER = BIT(12),
   TU_CMD_DIRTY_FEEDBACK_LOOPS = BIT(13),
   /* all draw states were disabled and need to be re-enabled: */
   TU_CMD_DIRTY_DRAW_STATE = BIT(14)
};

/* There are only three cache domains we have to care about: the CCU, or
 * color cache unit, which is used for color and depth/stencil attachments
 * and copy/blit destinations, and is split conceptually into color and depth,
 * and the universal cache or UCHE which is used for pretty much everything
 * else, except for the CP (uncached) and host. We need to flush whenever data
 * crosses these boundaries.
 */

enum tu_cmd_access_mask {
   TU_ACCESS_NONE = 0,
   TU_ACCESS_UCHE_READ = 1 << 0,
   TU_ACCESS_UCHE_WRITE = 1 << 1,
   TU_ACCESS_CCU_COLOR_READ = 1 << 2,
   TU_ACCESS_CCU_COLOR_WRITE = 1 << 3,
   TU_ACCESS_CCU_DEPTH_READ = 1 << 4,
   TU_ACCESS_CCU_DEPTH_WRITE = 1 << 5,

   /* Experiments have shown that while it's safe to avoid flushing the CCU
    * after each blit/renderpass, it's not safe to assume that subsequent
    * lookups with a different attachment state will hit unflushed cache
    * entries. That is, the CCU needs to be flushed and possibly invalidated
    * when accessing memory with a different attachment state. Writing to an
    * attachment under the following conditions after clearing using the
    * normal 2d engine path is known to have issues:
    *
    * - It isn't the 0'th layer.
    * - There is more than one attachment, and this isn't the 0'th attachment
    *   (this seems to also depend on the cpp of the attachments).
    *
    * Our best guess is that the layer/MRT state is used when computing
    * the location of a cache entry in CCU, to avoid conflicts. We assume that
    * any access in a renderpass after or before an access by a transfer needs
    * a flush/invalidate, and use the _INCOHERENT variants to represent access
    * by a renderpass.
    */
   TU_ACCESS_CCU_COLOR_INCOHERENT_READ = 1 << 6,
   TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE = 1 << 7,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_READ = 1 << 8,
   TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE = 1 << 9,

   /* Accesses which bypass any cache, e.g. writes via the host,
    * CP_EVENT_WRITE::BLIT, and the CP are SYSMEM_WRITE.
    */
   TU_ACCESS_SYSMEM_READ = 1 << 10,
   TU_ACCESS_SYSMEM_WRITE = 1 << 11,

   /* Memory writes from the CP start in-order with draws and event writes,
    * but execute asynchronously and hence need a CP_WAIT_MEM_WRITES if read.
    */
   TU_ACCESS_CP_WRITE = 1 << 12,

   /* Descriptors are read through UCHE but are also prefetched via
    * CP_LOAD_STATE6, and the prefetched descriptors need to be invalidated
    * when they change.
    */
   TU_ACCESS_BINDLESS_DESCRIPTOR_READ = 1 << 13,

   /* A write to a GMEM attachment made by CP_EVENT_WRITE::BLIT. */
   TU_ACCESS_BLIT_WRITE_GMEM = 1 << 14,

   /* Similar to UCHE_READ, but specifically for GMEM attachment reads. */
   TU_ACCESS_UCHE_READ_GMEM = 1 << 15,

   /* The CCHE is a write-through cache which sits behind UCHE, with multiple
    * incoherent copies. Because it's write-through we only have to worry
    * about invalidating it for reads. It's invalidated by "ccinv" in the
    * shader and CP_CCHE_INVALIDATE in the command stream.
    */
   TU_ACCESS_CCHE_READ = 1 << 16,

   TU_ACCESS_READ =
      TU_ACCESS_UCHE_READ |
      TU_ACCESS_CCU_COLOR_READ |
      TU_ACCESS_CCU_DEPTH_READ |
      TU_ACCESS_CCU_COLOR_INCOHERENT_READ |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_READ |
      TU_ACCESS_SYSMEM_READ |
      TU_ACCESS_BINDLESS_DESCRIPTOR_READ |
      TU_ACCESS_CCHE_READ,

   TU_ACCESS_WRITE =
      TU_ACCESS_UCHE_WRITE |
      TU_ACCESS_CCU_COLOR_WRITE |
      TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE |
      TU_ACCESS_CCU_DEPTH_WRITE |
      TU_ACCESS_CCU_DEPTH_INCOHERENT_WRITE |
      TU_ACCESS_SYSMEM_WRITE |
      TU_ACCESS_CP_WRITE,

   TU_ACCESS_ALL =
      TU_ACCESS_READ |
      TU_ACCESS_WRITE,
};
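
/* A rough sketch of how these access bits are meant to be consumed, with
 * tu_flush_for_access() (declared below) as the entry point: the caller maps
 * source/destination accesses onto tu_cmd_access_mask bits and the helper
 * accumulates the required clean/invalidate bits in a tu_cache_state. The
 * particular mapping below is only an illustrative example, not the driver's
 * actual barrier lowering:
 *
 *    // e.g. a blit wrote through the color CCU and a shader will now sample it
 *    tu_flush_for_access(&cmd->state.cache,
 *                        TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE, // src: blit dst
 *                        TU_ACCESS_UCHE_READ);                 // dst: texture fetch
 *    // later, tu_emit_cache_flush() turns the accumulated bits into the
 *    // corresponding CCU clean / UCHE invalidate events on the command stream.
 */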

/* From the driver's point of view, we only need to distinguish between things
 * which won't start until a WFI is complete and things which additionally
 * need a WAIT_FOR_ME.
 *
 * TODO: This will get more complicated with concurrent binning.
 */
enum tu_stage {
   /* As a destination stage, this is for operations on the CP which don't
    * wait for pending WFIs to complete and therefore need a CP_WAIT_FOR_ME.
    * As a source stage, it is for things needing no waits.
    */
   TU_STAGE_CP,

   /* This is for most operations: a WFI will wait for them to finish, and
    * they will not start until any pending WFIs are finished.
    */
   TU_STAGE_GPU,

   /* This is only used as a destination stage and is for things needing no
    * waits on the GPU (e.g. host operations).
    */
   TU_STAGE_BOTTOM,
};

enum tu_cmd_flush_bits {
   TU_CMD_FLAG_CCU_CLEAN_DEPTH = 1 << 0,
   TU_CMD_FLAG_CCU_CLEAN_COLOR = 1 << 1,
   TU_CMD_FLAG_CCU_INVALIDATE_DEPTH = 1 << 2,
   TU_CMD_FLAG_CCU_INVALIDATE_COLOR = 1 << 3,
   TU_CMD_FLAG_CACHE_CLEAN = 1 << 4,
   TU_CMD_FLAG_CACHE_INVALIDATE = 1 << 5,
   TU_CMD_FLAG_CCHE_INVALIDATE = 1 << 6,
   TU_CMD_FLAG_WAIT_MEM_WRITES = 1 << 7,
   TU_CMD_FLAG_WAIT_FOR_IDLE = 1 << 8,
   TU_CMD_FLAG_WAIT_FOR_ME = 1 << 9,
   TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE = 1 << 10,
   /* This is an unusual flush that isn't automatically executed if pending,
    * as it isn't necessary. Therefore, it's not included in the ALL_* groups
    * below.
    */
   TU_CMD_FLAG_BLIT_CACHE_CLEAN = 1 << 11,

   TU_CMD_FLAG_ALL_CLEAN =
      TU_CMD_FLAG_CCU_CLEAN_DEPTH |
      TU_CMD_FLAG_CCU_CLEAN_COLOR |
      TU_CMD_FLAG_CACHE_CLEAN |
      /* Treat the CP as a sort of "cache" which may need to be "flushed" via
       * waiting for writes to land with WAIT_MEM_WRITES.
       */
      TU_CMD_FLAG_WAIT_MEM_WRITES,

   TU_CMD_FLAG_ALL_INVALIDATE =
      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
      TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
      TU_CMD_FLAG_CACHE_INVALIDATE |
      TU_CMD_FLAG_BINDLESS_DESCRIPTOR_INVALIDATE |
      TU_CMD_FLAG_CCHE_INVALIDATE |
      /* Treat CP_WAIT_FOR_ME as a "cache" that needs to be invalidated when
       * a command that needs CP_WAIT_FOR_ME is executed. This means we may
       * insert an extra WAIT_FOR_ME before an indirect command requiring it
       * in case there was another command before the current command buffer
       * that it needs to wait for.
       */
      TU_CMD_FLAG_WAIT_FOR_ME,
};

/* Changing the CCU from sysmem mode to gmem mode or vice-versa is pretty
 * heavy, involving a CCU cache flush/invalidate and a WFI in order to change
 * which part of the gmem is used by the CCU. Here we keep track of the
 * current state of the CCU.
 */
enum tu_cmd_ccu_state {
   TU_CMD_CCU_SYSMEM,
   TU_CMD_CCU_GMEM,
   TU_CMD_CCU_UNKNOWN,
};

struct tu_cache_state {
   /* Caches which must be made available (flushed) eventually if there are
    * any users outside that cache domain, and caches which must be
    * invalidated eventually if there are any reads.
    */
   BITMASK_ENUM(tu_cmd_flush_bits) pending_flush_bits;
   /* Pending flushes */
   BITMASK_ENUM(tu_cmd_flush_bits) flush_bits;
};

struct tu_vs_params {
   uint32_t vertex_offset;
   uint32_t first_instance;
   uint32_t draw_id;
};

struct tu_tess_params {
   bool valid;
   enum a6xx_tess_output output_upper_left, output_lower_left;
   enum a6xx_tess_spacing spacing;
};

/* This should be for state that is set inside a renderpass and used at
 * renderpass end time, e.g. to decide whether to use sysmem. This needs
 * special handling for secondary cmdbufs and suspending/resuming render
 * passes where the state may need to be combined afterwards.
 */
struct tu_render_pass_state
{
   bool xfb_used;
   bool has_tess;
   bool has_prim_generated_query_in_rp;
   bool has_zpass_done_sample_count_write_in_rp;
   bool disable_gmem;
   bool sysmem_single_prim_mode;
   bool shared_viewport;

   /* Track whether the conditional predicate for COND_REG_EXEC is changed in
    * draw_cs.
    */
   bool draw_cs_writes_to_cond_pred;

   uint32_t drawcall_count;

   /* A calculated "draw cost" value for the renderpass, which tries to
    * estimate the bandwidth-per-sample of all the draws according to:
    *
    *    foreach_draw (...) {
    *       sum += pipeline->color_bandwidth_per_sample;
    *       if (depth_test_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (depth_write_enabled)
    *          sum += pipeline->depth_cpp_per_sample;
    *       if (stencil_write_enabled)
    *          sum += pipeline->stencil_cpp_per_sample * 2;
    *    }
    *    drawcall_bandwidth_per_sample = sum / drawcall_count;
    *
    * It allows us to estimate the total bandwidth of drawcalls later, by
    * calculating (drawcall_bandwidth_per_sample * zpass_sample_count).
    *
    * This ignores depth buffer traffic for samples which do not pass due to
    * depth-test failure, and some other details, but it is just intended to
    * be a rough estimate that is easy to calculate.
    */
   uint32_t drawcall_bandwidth_per_sample_sum;

   const char *lrz_disable_reason;
};
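
/* A worked example of the estimate above, using hypothetical per-sample
 * costs: with two draws, one writing a 4-cpp color attachment with depth
 * test and depth write enabled and depth_cpp_per_sample = 4 (4 + 4 + 4 = 12),
 * and one writing only color (4), the sum is 16 and
 * drawcall_bandwidth_per_sample = 16 / 2 = 8. With a zpass_sample_count of
 * 1,000,000, the estimated drawcall bandwidth would then be roughly
 * 8,000,000 bytes.
 */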

/* These are the states of the suspend/resume state machine. In addition to
 * tracking whether we're in the middle of a chain of suspending and
 * resuming passes that will be merged, we need to track whether the
 * command buffer begins in the middle of such a chain, for when it gets
 * merged with other command buffers. We call such a chain that begins
 * before the command buffer starts a "pre-chain".
 *
 * Note that when this command buffer is finished, this state is untouched
 * but it gains a different meaning. For example, if we finish in state
 * SR_IN_CHAIN, we finished in the middle of a suspend/resume chain, so
 * there's a suspend/resume chain that extends past the end of the command
 * buffer. In this sense it's the "opposite" of SR_AFTER_PRE_CHAIN, which
 * means that there's a suspend/resume chain that extends before the
 * beginning.
 */
enum tu_suspend_resume_state
{
   /* Either there are no suspend/resume chains, or they are entirely
    * contained in the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *   ...
    *   // we are here
    */
   SR_NONE = 0,

   /* We are in the middle of a suspend/resume chain that starts before the
    * current command buffer. This happens when the command buffer begins
    * with a resuming render pass and all of the passes up to the current
    * one are suspending. In this state, our part of the chain is not saved
    * and is in the current draw_cs/state.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   ...
    *   // we are here
    */
   SR_IN_PRE_CHAIN,

   /* We are currently outside of any suspend/resume chains, but there is a
    * chain starting before the current command buffer. It is saved in
    * pre_chain.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *   // This part is stashed in pre_chain
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   ...
    *   BeginRendering(resuming) ... EndRendering() // end of chain
    *   ...
    *   // we are here
    */
   SR_AFTER_PRE_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is no chain
    * starting before the current command buffer.
    *
    *   BeginCommandBuffer() <- start of current command buffer
    *   ...
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   ...
    *   // we are here
    */
   SR_IN_CHAIN,

   /* We are in the middle of a suspend/resume chain and there is another,
    * separate, chain starting before the current command buffer.
    *
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginCommandBuffer() <- start of current command buffer
    *   // This part is stashed in pre_chain
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   ...
    *   BeginRendering(resuming) ... EndRendering() // end of chain
    *   ...
    *   BeginRendering() ... EndRendering(suspending)
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   BeginRendering(resuming) ... EndRendering(suspending)
    *   ...
    *   // we are here
    */
   SR_IN_CHAIN_AFTER_PRE_CHAIN,
};
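
/* Roughly, the transitions between these states can be read as follows. This
 * is a sketch inferred from the state descriptions above, not a literal copy
 * of the driver's begin/end-rendering logic:
 *
 *    SR_NONE            --resuming pass at cmdbuf start--> SR_IN_PRE_CHAIN
 *    SR_IN_PRE_CHAIN    --EndRendering without suspend---> SR_AFTER_PRE_CHAIN
 *    SR_NONE            --EndRendering(suspending)-------> SR_IN_CHAIN
 *    SR_AFTER_PRE_CHAIN --EndRendering(suspending)-------> SR_IN_CHAIN_AFTER_PRE_CHAIN
 *    SR_IN_CHAIN / SR_IN_CHAIN_AFTER_PRE_CHAIN
 *                       --chain ends---------------------> SR_NONE / SR_AFTER_PRE_CHAIN
 */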

struct tu_cmd_state
{
   uint32_t dirty;

   struct tu_shader *shaders[MESA_SHADER_STAGES];

   struct tu_program_state program;

   struct tu_render_pass_state rp;

   struct vk_render_pass_state vk_rp;
   struct vk_vertex_input_state vi;
   struct vk_sample_locations_state sl;

   struct tu_bandwidth bandwidth;

   /* Vertex buffers.
    * The states for these can be updated partially, so we need to save them
    * to be able to emit a complete draw state.
    */
   struct {
      uint64_t base;
      uint32_t size;
   } vb[MAX_VBS];

   uint32_t max_vbs_bound;

   bool per_view_viewport;

   /* saved states to re-emit in TU_CMD_DIRTY_DRAW_STATE case */
   struct tu_draw_state dynamic_state[TU_DYNAMIC_STATE_COUNT];
   struct tu_draw_state vertex_buffers;
   struct tu_draw_state shader_const;
   struct tu_draw_state desc_sets;
   struct tu_draw_state load_state;
   struct tu_draw_state compute_load_state;
   struct tu_draw_state prim_order_gmem;

   struct tu_draw_state vs_params;
   struct tu_draw_state fs_params;

   /* Index buffer */
   uint64_t index_va;
   uint32_t max_index_count;
   uint8_t index_size;

   /* Because the streamout base has to be 32-byte aligned, there is an extra
    * offset to deal with when it is unaligned.
    */
   uint8_t streamout_offset[IR3_MAX_SO_BUFFERS];

   /* Renderpasses are tricky, because we may need to flush differently if
    * using sysmem vs. gmem and therefore we have to delay any flushing that
    * happens before a renderpass. So we have to have two copies of the flush
    * state, one for intra-renderpass flushes (i.e. renderpass dependencies)
    * and one for outside a renderpass.
    */
   struct tu_cache_state cache;
   struct tu_cache_state renderpass_cache;

   enum tu_cmd_ccu_state ccu_state;

   /* Decides which GMEM layout to use from the tu_pass, based on whether the
    * CCU might get used by tu_store_gmem_attachment().
    */
   enum tu_gmem_layout gmem_layout;

   const struct tu_render_pass *pass;
   const struct tu_subpass *subpass;
   const struct tu_framebuffer *framebuffer;
   const struct tu_tiling_config *tiling;
   VkRect2D render_area;

   const struct tu_image_view **attachments;
   VkClearValue *clear_values;

   /* State that in the dynamic case comes from VkRenderingInfo and needs to
    * be saved/restored when suspending. This holds the state for the last
    * suspended renderpass, which may point to this command buffer's
    * dynamic_* arrays or to another command buffer's if the pass was
    * recorded in a secondary.
    */
   struct {
      const struct tu_render_pass *pass;
      const struct tu_subpass *subpass;
      const struct tu_framebuffer *framebuffer;
      VkRect2D render_area;
      enum tu_gmem_layout gmem_layout;

      const struct tu_image_view **attachments;
      VkClearValue *clear_values;

      struct tu_lrz_state lrz;
   } suspended_pass;

   bool tessfactor_addr_set;
   bool predication_active;
   bool msaa_disable;
   bool blend_reads_dest;
   bool stencil_front_write;
   bool stencil_back_write;
   bool pipeline_sysmem_single_prim_mode;
   bool pipeline_has_tess;
   bool pipeline_disable_gmem;
   bool raster_order_attachment_access;
   bool raster_order_attachment_access_valid;
   VkImageAspectFlags pipeline_feedback_loops;

   bool pipeline_blend_lrz, pipeline_bandwidth;
   uint32_t pipeline_draw_states;

   /* VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT and
    * VK_QUERY_TYPE_PRIMITIVES_GENERATED_EXT are allowed to run
    * simultaneously, but they use the same {START,STOP}_PRIMITIVE_CTRS
    * control.
    */
   uint32_t prim_counters_running;

   bool prim_generated_query_running_before_rp;

   enum tu_suspend_resume_state suspend_resume;

   bool suspending, resuming;

   struct tu_lrz_state lrz;

   struct tu_draw_state lrz_and_depth_plane_state;

   struct tu_vs_params last_vs_params;
   bool last_draw_indexed;

   struct tu_tess_params tess_params;

   uint64_t descriptor_buffer_iova[MAX_SETS];
};

struct tu_cmd_buffer
{
   struct vk_command_buffer vk;

   struct tu_device *device;

   struct u_trace trace;
   struct u_trace_iterator trace_renderpass_start;
   struct u_trace_iterator trace_renderpass_end;

   struct list_head renderpass_autotune_results;
   struct tu_autotune_results_buffer* autotune_buffer;

   void *patchpoints_ctx;
   struct util_dynarray fdm_bin_patchpoints;

   VkCommandBufferUsageFlags usage_flags;

   VkQueryPipelineStatisticFlags inherited_pipeline_statistics;

   struct tu_cmd_state state;
   uint32_t queue_family_index;

   uint32_t push_constants[MAX_PUSH_CONSTANTS_SIZE / 4];
   VkShaderStageFlags push_constant_stages;
   struct tu_descriptor_set meta_push_descriptors;

   struct tu_descriptor_state descriptors[MAX_BIND_POINTS];

   struct tu_render_pass_attachment dynamic_rp_attachments[2 * (MAX_RTS + 1) + 1];
   struct tu_subpass_attachment dynamic_color_attachments[MAX_RTS];
   struct tu_subpass_attachment dynamic_resolve_attachments[MAX_RTS + 1];
   const struct tu_image_view *dynamic_attachments[2 * (MAX_RTS + 1) + 1];
   VkClearValue dynamic_clear_values[2 * (MAX_RTS + 1)];

   struct tu_render_pass dynamic_pass;
   struct tu_subpass dynamic_subpass;
   struct tu_framebuffer dynamic_framebuffer;

   struct tu_cs cs;
   struct tu_cs draw_cs;
   struct tu_cs tile_store_cs;
   struct tu_cs draw_epilogue_cs;
   struct tu_cs sub_cs;

   /* If the first render pass in the command buffer is resuming, then it is
    * part of a suspend/resume chain that starts before the current command
    * buffer and needs to be merged later. In this case, its incomplete state
    * is stored in pre_chain. In the symmetric case where the last render pass
    * is suspending, we just skip ending the render pass and its state is
    * stored in draw_cs/the current state. The first and last render pass
    * might be part of different chains, which is why all the state may need
    * to be saved separately here.
    */
   struct {
      struct tu_cs draw_cs;
      struct tu_cs draw_epilogue_cs;

      struct u_trace_iterator trace_renderpass_start, trace_renderpass_end;

      struct tu_render_pass_state state;

      struct util_dynarray fdm_bin_patchpoints;
      void *patchpoints_ctx;
   } pre_chain;

   uint32_t vsc_draw_strm_pitch;
   uint32_t vsc_prim_strm_pitch;
   bool vsc_initialized;
};
VK_DEFINE_HANDLE_CASTS(tu_cmd_buffer, vk.base, VkCommandBuffer,
                       VK_OBJECT_TYPE_COMMAND_BUFFER)

extern const struct vk_command_buffer_ops tu_cmd_buffer_ops;

static inline uint32_t
tu_attachment_gmem_offset(struct tu_cmd_buffer *cmd,
                          const struct tu_render_pass_attachment *att,
                          uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height *
      att->cpp;
}
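
/* For example (hypothetical numbers): with a 96x96 tile0 and a 4-cpp color
 * attachment, each layer occupies 96 * 96 * 4 = 36864 bytes of GMEM, so
 * layer 2 of that attachment starts at gmem_offset[layout] + 73728.
 */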

static inline uint32_t
tu_attachment_gmem_offset_stencil(struct tu_cmd_buffer *cmd,
                                  const struct tu_render_pass_attachment *att,
                                  uint32_t layer)
{
   assert(cmd->state.gmem_layout < TU_GMEM_LAYOUT_COUNT);
   return att->gmem_offset_stencil[cmd->state.gmem_layout] +
      layer * cmd->state.tiling->tile0.width * cmd->state.tiling->tile0.height;
}

void tu_render_pass_state_merge(struct tu_render_pass_state *dst,
                                const struct tu_render_pass_state *src);

VkResult tu_cmd_buffer_begin(struct tu_cmd_buffer *cmd_buffer,
                             const VkCommandBufferBeginInfo *pBeginInfo);

template <chip CHIP>
void
tu_emit_cache_flush(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_renderpass(struct tu_cmd_buffer *cmd_buffer);

template <chip CHIP>
void tu_emit_cache_flush_ccu(struct tu_cmd_buffer *cmd_buffer,
                             struct tu_cs *cs,
                             enum tu_cmd_ccu_state ccu_state);

void
tu_append_pre_chain(struct tu_cmd_buffer *cmd,
                    struct tu_cmd_buffer *secondary);

void
tu_append_pre_post_chain(struct tu_cmd_buffer *cmd,
                         struct tu_cmd_buffer *secondary);

void
tu_append_post_chain(struct tu_cmd_buffer *cmd,
                     struct tu_cmd_buffer *secondary);

void
tu_restore_suspended_pass(struct tu_cmd_buffer *cmd,
                          struct tu_cmd_buffer *suspended);

template <chip CHIP>
void tu_cmd_render(struct tu_cmd_buffer *cmd);

enum fd_gpu_event : uint32_t;

template <chip CHIP>
void
tu_emit_event_write(struct tu_cmd_buffer *cmd,
                    struct tu_cs *cs,
                    enum fd_gpu_event event);

void
tu_flush_for_access(struct tu_cache_state *cache,
                    enum tu_cmd_access_mask src_mask,
                    enum tu_cmd_access_mask dst_mask);

static inline struct tu_descriptor_state *
tu_get_descriptors_state(struct tu_cmd_buffer *cmd_buffer,
                         VkPipelineBindPoint bind_point)
{
   return &cmd_buffer->descriptors[bind_point];
}

void tu6_emit_msaa(struct tu_cs *cs, VkSampleCountFlagBits samples,
                   bool msaa_disable);

void tu6_emit_window_scissor(struct tu_cs *cs, uint32_t x1, uint32_t y1,
                             uint32_t x2, uint32_t y2);

void tu6_emit_window_offset(struct tu_cs *cs, uint32_t x1, uint32_t y1);

void tu_disable_draw_states(struct tu_cmd_buffer *cmd, struct tu_cs *cs);

void tu6_apply_depth_bounds_workaround(struct tu_device *device,
                                       uint32_t *rb_depth_cntl);

typedef void (*tu_fdm_bin_apply_t)(struct tu_cmd_buffer *cmd,
                                   struct tu_cs *cs,
                                   void *data,
                                   VkRect2D bin,
                                   unsigned views,
                                   VkExtent2D *frag_areas);

struct tu_fdm_bin_patchpoint {
   uint64_t iova;
   uint32_t size;
   void *data;
   tu_fdm_bin_apply_t apply;
};

void
tu_barrier(struct tu_cmd_buffer *cmd,
           uint32_t dep_count,
           const VkDependencyInfo *dep_info);

template <chip CHIP>
void
tu_write_event(struct tu_cmd_buffer *cmd, struct tu_event *event,
               VkPipelineStageFlags2 stageMask, unsigned value);

static inline void
_tu_create_fdm_bin_patchpoint(struct tu_cmd_buffer *cmd,
                              struct tu_cs *cs,
                              unsigned size,
                              tu_fdm_bin_apply_t apply,
                              void *state,
                              unsigned state_size)
{
   void *data = ralloc_size(cmd->patchpoints_ctx, state_size);
   memcpy(data, state, state_size);
   assert(cs->writeable);
   tu_cs_reserve_space(cs, size);
   struct tu_fdm_bin_patchpoint patch = {
      .iova = tu_cs_get_cur_iova(cs),
      .size = size,
      .data = data,
      .apply = apply,
   };

   /* Apply the "default" setup where there is no scaling. This is used if
    * sysmem is required, and uses up the dwords that have been reserved.
    */
   unsigned num_views = MAX2(cmd->state.pass->num_views, 1);
   VkExtent2D unscaled_frag_areas[num_views];
   for (unsigned i = 0; i < num_views; i++) {
      unscaled_frag_areas[i] = (VkExtent2D) { 1, 1 };
   }
   apply(cmd, cs, state, (VkRect2D) {
      { 0, 0 },
      { MAX_VIEWPORT_SIZE, MAX_VIEWPORT_SIZE },
   }, num_views, unscaled_frag_areas);
   assert(tu_cs_get_cur_iova(cs) == patch.iova + patch.size * sizeof(uint32_t));

   util_dynarray_append(&cmd->fdm_bin_patchpoints,
                        struct tu_fdm_bin_patchpoint,
                        patch);
}

#define tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, state) \
   _tu_create_fdm_bin_patchpoint(cmd, cs, size, apply, &state, sizeof(state))
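
/* A minimal usage sketch with a hypothetical callback and state struct (not a
 * real call site in the driver): the apply callback is run with the per-view
 * fragment areas and must emit exactly `size` dwords at the recorded iova.
 *
 *    struct example_fdm_state {
 *       uint32_t dummy;
 *    };
 *
 *    static void
 *    example_fdm_apply(struct tu_cmd_buffer *cmd, struct tu_cs *cs, void *data,
 *                      VkRect2D bin, unsigned views, VkExtent2D *frag_areas)
 *    {
 *       // Emit a fixed-size packet whose contents depend on the bin and the
 *       // per-view fragment areas, consuming exactly the reserved dwords.
 *    }
 *
 *    struct example_fdm_state state = { .dummy = 0 };
 *    tu_create_fdm_bin_patchpoint(cmd, cs, 4, example_fdm_apply, state);
 */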

#endif /* TU_CMD_BUFFER_H */