/*
 * Copyright © 2019 Google LLC
 * SPDX-License-Identifier: MIT
 */

#ifndef TU_CS_H
#define TU_CS_H

#include "tu_common.h"

#include "freedreno_pm4.h"

#include "tu_knl.h"

/* For breadcrumbs we may open a network socket based on an envvar;
 * it's not something that should be enabled by default.
 */
#define TU_BREADCRUMBS_ENABLED 0

enum tu_cs_mode
{
   /*
    * A command stream in TU_CS_MODE_GROW mode grows automatically whenever it
    * is full.  tu_cs_begin must be called before command packet emission and
    * tu_cs_end must be called after (see the usage sketch below this enum).
    *
    * This mode may create multiple entries internally.  The entries must be
    * submitted together.
    */
   TU_CS_MODE_GROW,

   /*
    * A command stream in TU_CS_MODE_EXTERNAL mode wraps an external,
    * fixed-size buffer.  tu_cs_begin and tu_cs_end are optional and have no
    * effect on it.
    *
    * This mode does not create any entry or any BO.
    */
   TU_CS_MODE_EXTERNAL,

   /*
    * A command stream in TU_CS_MODE_SUB_STREAM mode does not support direct
    * command packet emission.  tu_cs_begin_sub_stream must be called to get a
    * sub-stream to emit command packets to.  When done with the sub-stream,
    * tu_cs_end_sub_stream must be called.
    *
    * This mode does not create any entry internally.
    */
   TU_CS_MODE_SUB_STREAM,
};
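
/* A minimal sketch (not part of the API) of the TU_CS_MODE_GROW protocol
 * described above, assuming a valid struct tu_device *device and a queue
 * submission path that consumes cs.entries / cs.entry_count:
 *
 *    struct tu_cs cs;
 *    tu_cs_init(&cs, device, TU_CS_MODE_GROW, 4096, "example cs");
 *
 *    tu_cs_begin(&cs);
 *    tu_cs_emit_wfi(&cs);             // emit packets; the CS grows as needed
 *    tu_cs_end(&cs);
 *
 *    // submit cs.entries[0..cs.entry_count) together, then:
 *    tu_cs_finish(&cs);
 */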

struct tu_cs_entry
{
   /* No ownership */
   const struct tu_bo *bo;

   uint32_t size;
   uint32_t offset;
};

struct tu_cs_memory {
   uint32_t *map;
   uint64_t iova;
   bool writeable;
};

struct tu_draw_state {
   uint64_t iova;
   uint16_t size;
   bool writeable;
};

struct tu_bo_array {
   struct tu_bo **bos;
   uint32_t bo_count;
   uint32_t bo_capacity;
   uint32_t *start;
};

#define TU_COND_EXEC_STACK_SIZE 4

struct tu_cs
{
   uint32_t *start;
   uint32_t *cur;
   uint32_t *reserved_end;
   uint32_t *end;
   const char *name;

   struct tu_device *device;
   enum tu_cs_mode mode;
   bool writeable;
   uint32_t next_bo_size;

   struct tu_cs_entry *entries;
   uint32_t entry_count;
   uint32_t entry_capacity;

   struct tu_bo_array read_only, read_write;

   /* Optional BO that this CS is sub-allocated from for TU_CS_MODE_SUB_STREAM */
   struct tu_bo *refcount_bo;

   /* iova that this CS starts with in TU_CS_MODE_EXTERNAL */
   uint64_t external_iova;

   /* state for cond_exec_start/cond_exec_end */
   uint32_t cond_stack_depth;
   uint32_t cond_flags[TU_COND_EXEC_STACK_SIZE];
   uint32_t *cond_dwords[TU_COND_EXEC_STACK_SIZE];

   uint32_t breadcrumb_emit_after;
};

void
tu_breadcrumbs_init(struct tu_device *device);

void
tu_breadcrumbs_finish(struct tu_device *device);

void
tu_cs_init(struct tu_cs *cs,
           struct tu_device *device,
           enum tu_cs_mode mode,
           uint32_t initial_size, const char *name);

void
tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,
                    uint32_t *start, uint32_t *end, uint64_t iova,
                    bool writeable);

void
tu_cs_init_suballoc(struct tu_cs *cs, struct tu_device *device,
                    struct tu_suballoc_bo *bo);

void
tu_cs_finish(struct tu_cs *cs);

void
tu_cs_begin(struct tu_cs *cs);

void
tu_cs_end(struct tu_cs *cs);

void
tu_cs_set_writeable(struct tu_cs *cs, bool writeable);

VkResult
tu_cs_begin_sub_stream_aligned(struct tu_cs *cs, uint32_t count,
                               uint32_t size, struct tu_cs *sub_cs);

static inline VkResult
tu_cs_begin_sub_stream(struct tu_cs *cs, uint32_t size, struct tu_cs *sub_cs)
{
   return tu_cs_begin_sub_stream_aligned(cs, size, 1, sub_cs);
}


VkResult
tu_cs_alloc(struct tu_cs *cs,
            uint32_t count,
            uint32_t size,
            struct tu_cs_memory *memory);

struct tu_cs_entry
tu_cs_end_sub_stream(struct tu_cs *cs, struct tu_cs *sub_cs);
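
/* A minimal sketch (not part of the API) of the sub-stream flow for a CS in
 * TU_CS_MODE_SUB_STREAM, assuming a valid parent CS and a GROW-mode target CS
 * that will reference the result; the dword count, reg and value are
 * illustrative placeholders:
 *
 *    struct tu_cs sub;
 *    if (tu_cs_begin_sub_stream(&sub_stream_cs, 4, &sub) == VK_SUCCESS) {
 *       tu_cs_emit_write_reg(&sub, reg, value);   // emit into the sub-stream
 *       struct tu_cs_entry entry = tu_cs_end_sub_stream(&sub_stream_cs, &sub);
 *       tu_cs_emit_ib(&target_cs, &entry);        // reference it from another CS
 *    }
 */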

static inline struct tu_draw_state
tu_cs_end_draw_state(struct tu_cs *cs, struct tu_cs *sub_cs)
{
   struct tu_cs_entry entry = tu_cs_end_sub_stream(cs, sub_cs);
   return (struct tu_draw_state) {
      .iova = entry.bo->iova + entry.offset,
      .size = entry.size / sizeof(uint32_t),
      .writeable = sub_cs->writeable,
   };
}

VkResult
tu_cs_reserve_space(struct tu_cs *cs, uint32_t reserved_size);

uint64_t
tu_cs_get_cur_iova(const struct tu_cs *cs);

static inline struct tu_draw_state
tu_cs_draw_state(struct tu_cs *sub_cs, struct tu_cs *cs, uint32_t size)
{
   struct tu_cs_memory memory;

   /* TODO: clean this up */
   tu_cs_alloc(sub_cs, size, 1, &memory);
   tu_cs_init_external(cs, sub_cs->device, memory.map, memory.map + size,
                       memory.iova, memory.writeable);
   tu_cs_begin(cs);
   tu_cs_reserve_space(cs, size);

   return (struct tu_draw_state) {
      .iova = memory.iova,
      .size = size,
      .writeable = sub_cs->writeable,
   };
}

void
tu_cs_reset(struct tu_cs *cs);

VkResult
tu_cs_add_entries(struct tu_cs *cs, struct tu_cs *target);

/**
 * Get the size, in dwords, of the command packets emitted since the last
 * call to tu_cs_add_entry.
 */
static inline uint32_t
tu_cs_get_size(const struct tu_cs *cs)
{
   return cs->cur - cs->start;
}

/**
 * Return true if no command packets have been emitted since the last call to
 * tu_cs_add_entry.
 */
static inline uint32_t
tu_cs_is_empty(const struct tu_cs *cs)
{
   return tu_cs_get_size(cs) == 0;
}

/**
 * Discard all entries.  This allows \a cs to be reused while keeping the
 * existing BOs and command packets intact.
 */
static inline void
tu_cs_discard_entries(struct tu_cs *cs)
{
   assert(cs->mode == TU_CS_MODE_GROW);
   cs->entry_count = 0;
}

/**
 * Get the size needed for tu_cs_emit_call.
 */
static inline uint32_t
tu_cs_get_call_size(const struct tu_cs *cs)
{
   assert(cs->mode == TU_CS_MODE_GROW);
   /* each CP_INDIRECT_BUFFER needs 4 dwords */
   return cs->entry_count * 4;
}

/**
 * Assert that we did not exceed the reserved space.
 */
static inline void
tu_cs_sanity_check(const struct tu_cs *cs)
{
   assert(cs->start <= cs->cur);
   assert(cs->cur <= cs->reserved_end);
   assert(cs->reserved_end <= cs->end);
}

void
tu_cs_emit_sync_breadcrumb(struct tu_cs *cs, uint8_t opcode, uint16_t cnt);

/**
 * Emit a uint32_t value into a command stream, without boundary checking.
 */
static inline void
tu_cs_emit(struct tu_cs *cs, uint32_t value)
{
   assert(cs->cur < cs->reserved_end);
   *cs->cur = value;
   ++cs->cur;

#if TU_BREADCRUMBS_ENABLED
   cs->breadcrumb_emit_after--;
   if (cs->breadcrumb_emit_after == 0)
      tu_cs_emit_sync_breadcrumb(cs, -1, 0);
#endif
}

/**
 * Emit an array of uint32_t into a command stream, without boundary checking.
 */
static inline void
tu_cs_emit_array(struct tu_cs *cs, const uint32_t *values, uint32_t length)
{
   assert(cs->cur + length <= cs->reserved_end);
   memcpy(cs->cur, values, sizeof(uint32_t) * length);
   cs->cur += length;
}

/**
 * Get the size of the remaining space in the current BO.
 */
static inline uint32_t
tu_cs_get_space(const struct tu_cs *cs)
{
   return cs->end - cs->cur;
}

static inline void
tu_cs_reserve(struct tu_cs *cs, uint32_t reserved_size)
{
   if (cs->mode != TU_CS_MODE_GROW) {
      assert(tu_cs_get_space(cs) >= reserved_size);
      assert(cs->reserved_end == cs->end);
      return;
   }

   if (tu_cs_get_space(cs) >= reserved_size &&
       cs->entry_count < cs->entry_capacity) {
      cs->reserved_end = cs->cur + reserved_size;
      return;
   }

   ASSERTED VkResult result = tu_cs_reserve_space(cs, reserved_size);
   /* TODO: set this error in tu_cs and use it */
   assert(result == VK_SUCCESS);
}
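
/* Since tu_cs_emit() and tu_cs_emit_array() do no boundary checking, raw
 * dwords emitted without a packet-header helper need an explicit
 * tu_cs_reserve() first.  A minimal sketch (the payload values are
 * illustrative):
 *
 *    uint32_t payload[2] = { 0x1, 0x2 };
 *    tu_cs_reserve(cs, ARRAY_SIZE(payload));
 *    tu_cs_emit_array(cs, payload, ARRAY_SIZE(payload));
 *
 * The tu_cs_emit_pkt4()/tu_cs_emit_pkt7() helpers below reserve space for
 * the header plus payload themselves.
 */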

/**
 * Emit a type-4 command packet header into a command stream.
 */
static inline void
tu_cs_emit_pkt4(struct tu_cs *cs, uint16_t regindx, uint16_t cnt)
{
   tu_cs_reserve(cs, cnt + 1);
   tu_cs_emit(cs, pm4_pkt4_hdr(regindx, cnt));
}

/**
 * Emit a type-7 command packet header into a command stream.
 */
static inline void
tu_cs_emit_pkt7(struct tu_cs *cs, uint8_t opcode, uint16_t cnt)
{
#if TU_BREADCRUMBS_ENABLED
   tu_cs_emit_sync_breadcrumb(cs, opcode, cnt + 1);
#endif

   tu_cs_reserve(cs, cnt + 1);
   tu_cs_emit(cs, pm4_pkt7_hdr(opcode, cnt));
}

static inline void
tu_cs_emit_wfi(struct tu_cs *cs)
{
   tu_cs_emit_pkt7(cs, CP_WAIT_FOR_IDLE, 0);
}

static inline void
tu_cs_emit_qw(struct tu_cs *cs, uint64_t value)
{
   tu_cs_emit(cs, (uint32_t) value);
   tu_cs_emit(cs, (uint32_t) (value >> 32));
}

static inline void
tu_cs_emit_write_reg(struct tu_cs *cs, uint16_t reg, uint32_t value)
{
   tu_cs_emit_pkt4(cs, reg, 1);
   tu_cs_emit(cs, value);
}

/**
 * Emit a CP_INDIRECT_BUFFER command packet.
 */
static inline void
tu_cs_emit_ib(struct tu_cs *cs, const struct tu_cs_entry *entry)
{
   assert(entry->bo);
   assert(entry->size && entry->offset + entry->size <= entry->bo->size);
   assert(entry->size % sizeof(uint32_t) == 0);
   assert(entry->offset % sizeof(uint32_t) == 0);

   tu_cs_emit_pkt7(cs, CP_INDIRECT_BUFFER, 3);
   tu_cs_emit_qw(cs, entry->bo->iova + entry->offset);
   tu_cs_emit(cs, entry->size / sizeof(uint32_t));
}

/* for compute which isn't using SET_DRAW_STATE */
static inline void
tu_cs_emit_state_ib(struct tu_cs *cs, struct tu_draw_state state)
{
   if (state.size) {
      tu_cs_emit_pkt7(cs, CP_INDIRECT_BUFFER, 3);
      tu_cs_emit_qw(cs, state.iova);
      tu_cs_emit(cs, state.size);
   }
}

/**
 * Emit a CP_INDIRECT_BUFFER command packet for each entry in the target
 * command stream.
 */
static inline void
tu_cs_emit_call(struct tu_cs *cs, const struct tu_cs *target)
{
   assert(target->mode == TU_CS_MODE_GROW);
   for (uint32_t i = 0; i < target->entry_count; i++)
      tu_cs_emit_ib(cs, target->entries + i);
}

/**
 * Emit a CP_NOP with a string tail into the command stream.
 */
void
tu_cs_emit_debug_string(struct tu_cs *cs, const char *string, int len);

void
tu_cs_emit_debug_magic_strv(struct tu_cs *cs,
                            uint32_t magic,
                            const char *fmt,
                            va_list args);

__attribute__((format(printf, 2, 3))) void
tu_cs_emit_debug_msg(struct tu_cs *cs, const char *fmt, ...);

/**
 * Emit a single message into the CS that denotes the calling function and any
 * optional printf-style parameters when utrace markers are enabled.
 */
#define TU_CS_DEBUG_MSG(CS, FORMAT_STRING, ...)                              \
   do {                                                                      \
      if (unlikely(u_trace_markers_enabled(&(CS)->device->trace_context)))   \
         tu_cs_emit_debug_msg(CS, "%s(" FORMAT_STRING ")", __func__,         \
                              ## __VA_ARGS__);                               \
   } while (0)
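
/* An illustrative use of TU_CS_DEBUG_MSG (the parameter name and value are
 * made up); with utrace markers enabled and called from tu_CmdDraw this
 * would emit a marker such as "tu_CmdDraw(vertexCount=3)":
 *
 *    TU_CS_DEBUG_MSG(cs, "vertexCount=%u", vertexCount);
 */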

typedef struct tu_cs *tu_debug_scope;

__attribute__((format(printf, 3, 4))) void
tu_cs_trace_start(struct u_trace_context *utctx,
                  void *cs,
                  const char *fmt,
                  ...);

__attribute__((format(printf, 3, 4))) void
tu_cs_trace_end(struct u_trace_context *utctx, void *cs, const char *fmt, ...);

/* Helpers for bracketing a large sequence of commands of unknown size inside
 * a CP_COND_REG_EXEC packet.
 */
static inline void
tu_cond_exec_start(struct tu_cs *cs, uint32_t cond_flags)
{
   assert(cs->mode == TU_CS_MODE_GROW);
   assert(cs->cond_stack_depth < TU_COND_EXEC_STACK_SIZE);

   ASSERTED enum compare_mode mode =
      (enum compare_mode)((cond_flags & CP_COND_REG_EXEC_0_MODE__MASK) >>
                          CP_COND_REG_EXEC_0_MODE__SHIFT);
   assert(mode == PRED_TEST || mode == RENDER_MODE || mode == THREAD_MODE);

   tu_cs_emit_pkt7(cs, CP_COND_REG_EXEC, 2);
   tu_cs_emit(cs, cond_flags);

   cs->cond_flags[cs->cond_stack_depth] = cond_flags;
   cs->cond_dwords[cs->cond_stack_depth] = cs->cur;

   /* Emit dummy DWORD field here */
   tu_cs_emit(cs, RENDER_MODE_CP_COND_REG_EXEC_1_DWORDS(0));

   cs->cond_stack_depth++;
}
#define CP_COND_EXEC_0_RENDER_MODE_GMEM \
   (CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | CP_COND_REG_EXEC_0_GMEM)
#define CP_COND_EXEC_0_RENDER_MODE_SYSMEM \
   (CP_COND_REG_EXEC_0_MODE(RENDER_MODE) | CP_COND_REG_EXEC_0_SYSMEM)

static inline void
tu_cond_exec_end(struct tu_cs *cs)
{
   assert(cs->cond_stack_depth > 0);
   cs->cond_stack_depth--;

   cs->cond_flags[cs->cond_stack_depth] = 0;
   /* Subtract one here to account for the DWORD field itself. */
   uint32_t cond_len = cs->cur - cs->cond_dwords[cs->cond_stack_depth] - 1;
   if (cond_len) {
      *cs->cond_dwords[cs->cond_stack_depth] = cond_len;
   } else {
      /* rewind the CS to drop the empty cond reg packet. */
      cs->cur = cs->cur - 3;
   }
}
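
/* A minimal sketch (not part of the API) of the bracketing helpers above,
 * using one of the RENDER_MODE flag combinations defined alongside them; the
 * packet emitted in between is illustrative:
 *
 *    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
 *    tu_cs_emit_wfi(cs);        // commands executed only in the GMEM path
 *    tu_cond_exec_end(cs);
 */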

uint64_t
tu_cs_emit_data_nop(struct tu_cs *cs,
                    const uint32_t *data,
                    uint32_t size,
                    uint32_t align);

/* Temporary struct for tracking a register state to be written, used by
 * a6xx-pack.xml.h and tu_cs_emit_regs()
 */
struct tu_reg_value {
   uint32_t reg;
   uint64_t value;
   struct tu_bo *bo;
   bool is_address;
   bool bo_write;
   uint32_t bo_offset;
   uint32_t bo_shift;
   uint32_t bo_low;
};

#define fd_reg_pair tu_reg_value
#define __bo_type struct tu_bo *

#include "a6xx-pack.xml.h"
#include "adreno-pm4-pack.xml.h"

#define __assert_eq(a, b)                                               \
   do {                                                                 \
      if ((a) != (b)) {                                                 \
         fprintf(stderr, "assert failed: " #a " (0x%x) != " #b " (0x%x)\n", a, b); \
         assert((a) == (b));                                            \
      }                                                                 \
   } while (0)

#define __ONE_REG(i, regs)                                      \
   do {                                                         \
      if (i < ARRAY_SIZE(regs) && regs[i].reg > 0) {            \
         __assert_eq(regs[0].reg + i, regs[i].reg);             \
         if (regs[i].bo) {                                      \
            uint64_t v = regs[i].bo->iova + regs[i].bo_offset;  \
            v >>= regs[i].bo_shift;                             \
            v <<= regs[i].bo_low;                               \
            v |= regs[i].value;                                 \
                                                                \
            *p++ = v;                                           \
            *p++ = v >> 32;                                     \
         } else {                                               \
            *p++ = regs[i].value;                               \
            if (regs[i].is_address)                             \
               *p++ = regs[i].value >> 32;                      \
         }                                                      \
      }                                                         \
   } while (0)

/* Emits a sequence of register writes in order using a pkt4.  This will check
 * (at runtime on a !NDEBUG build) that the registers were actually set up in
 * order in the code.
 *
 * Note that references to buffers aren't automatically added to the CS,
 * unlike in freedreno.  We are clever in various places to avoid duplicating
 * the reference add work.
 *
 * Also, 64-bit address registers don't have a way (currently) to set a 64-bit
 * address without having a reference to a BO, since the .dword field in the
 * register's struct is only 32-bit wide.  We should fix this in the pack
 * codegen later.
 */
#define tu_cs_emit_regs(cs, ...) do {                   \
   const struct fd_reg_pair regs[] = { __VA_ARGS__ };   \
   unsigned count = ARRAY_SIZE(regs);                   \
                                                        \
   STATIC_ASSERT(ARRAY_SIZE(regs) > 0);                 \
   STATIC_ASSERT(ARRAY_SIZE(regs) <= 16);               \
                                                        \
   tu_cs_emit_pkt4((cs), regs[0].reg, count);           \
   uint32_t *p = (cs)->cur;                             \
   __ONE_REG( 0, regs);                                 \
   __ONE_REG( 1, regs);                                 \
   __ONE_REG( 2, regs);                                 \
   __ONE_REG( 3, regs);                                 \
   __ONE_REG( 4, regs);                                 \
   __ONE_REG( 5, regs);                                 \
   __ONE_REG( 6, regs);                                 \
   __ONE_REG( 7, regs);                                 \
   __ONE_REG( 8, regs);                                 \
   __ONE_REG( 9, regs);                                 \
   __ONE_REG(10, regs);                                 \
   __ONE_REG(11, regs);                                 \
   __ONE_REG(12, regs);                                 \
   __ONE_REG(13, regs);                                 \
   __ONE_REG(14, regs);                                 \
   __ONE_REG(15, regs);                                 \
   (cs)->cur = p;                                       \
   } while (0)
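
/* An illustrative call (the register names and field are hypothetical; the
 * real per-register helpers are generated into a6xx-pack.xml.h), writing two
 * consecutive registers with a single pkt4:
 *
 *    tu_cs_emit_regs(cs,
 *                    A6XX_EXAMPLE_REG_LO(.dword = value_lo),
 *                    A6XX_EXAMPLE_REG_HI(.dword = value_hi));
 */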

#endif /* TU_CS_H */