/*
 * Copyright © 2021 Collabora Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_pool.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_instance.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"

#include "pan_blitter.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include "pan_samples.h"

#include "vk_descriptor_update_template.h"
#include "vk_format.h"
#include "vk_synchronization.h"

static void
emit_tls(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);
   unsigned core_id_range;
   panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

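   /* The total TLS allocation is derived from the per-thread size, the
    * per-core thread allocation count and the number of cores. */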
   if (cmdbuf->state.tls.info.tls.size) {
      unsigned thread_tls_alloc =
         panfrost_query_thread_tls_alloc(&phys_dev->kmod.props);
      unsigned size = panfrost_get_total_stack_size(
         cmdbuf->state.tls.info.tls.size, thread_tls_alloc, core_id_range);

      cmdbuf->state.tls.info.tls.ptr =
         panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
   }

   assert(!cmdbuf->state.tls.info.wls.size);

   if (cmdbuf->state.tls.desc.cpu) {
      GENX(pan_emit_tls)(&cmdbuf->state.tls.info, cmdbuf->state.tls.desc.cpu);
   }
}

static void
finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);

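   /* Sync points recorded while building the command buffer are relative;
    * fold them into the per-subqueue progress sequence number registers now
    * that the command buffer is complete. */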
   cs_update_progress_seqno(b) {
      for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
         uint32_t rel_sync_point = cmdbuf->state.cs[i].relative_sync_point;

         if (!rel_sync_point)
            continue;

         cs_add64(b, cs_progress_seqno_reg(b, i), cs_progress_seqno_reg(b, i),
                  rel_sync_point);
      }
   }

   /* We need a clean because descriptor/CS memory can be returned to the
    * command pool where it gets recycled. If we don't clean dirty cache
    * lines, those cache lines might get evicted asynchronously and their
    * content pushed back to main memory after the CPU has written new data
    * there. */
   struct cs_index flush_id = cs_scratch_reg32(b, 0);

   cs_move32_to(b, flush_id, 0);
   cs_wait_slots(b, SB_ALL_MASK, false);
   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN,
                   false, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
   cs_wait_slot(b, SB_ID(IMM_FLUSH), false);

   /* If we're in sync/trace mode, we signal the debug object. */
   if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
      struct cs_index debug_sync_addr = cs_scratch_reg64(b, 0);
      struct cs_index one = cs_scratch_reg32(b, 2);
      struct cs_index error = cs_scratch_reg32(b, 3);
      struct cs_index cmp_scratch = cs_scratch_reg32(b, 2);

      cs_move32_to(b, one, 1);
      cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
                   offsetof(struct panvk_cs_subqueue_context, debug_syncobjs));
      cs_wait_slot(b, SB_ID(LS), false);
      cs_add64(b, debug_sync_addr, debug_sync_addr,
               sizeof(struct panvk_cs_sync32) * subqueue);
      cs_load32_to(b, error, debug_sync_addr,
                   offsetof(struct panvk_cs_sync32, error));
      cs_wait_slots(b, SB_ALL_MASK, false);
      cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_SYSTEM, one, debug_sync_addr,
                    cs_now());


      cs_match(b, error, cmp_scratch) {
         cs_case(b, 0) {
            /* Do nothing. */
         }

         cs_default(b) {
            /* Overwrite the sync error with the first error we encountered. */
            cs_store32(b, error, debug_sync_addr,
                       offsetof(struct panvk_cs_sync32, error));
            cs_wait_slot(b, SB_ID(LS), false);
         }
      }
   }

   cs_finish(&cmdbuf->state.cs[subqueue].builder);
}

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);

   emit_tls(cmdbuf);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = &cmdbuf->state.cs[i].builder;

      if (!cs_is_valid(b)) {
         vk_command_buffer_set_error(&cmdbuf->vk,
                                     VK_ERROR_OUT_OF_DEVICE_MEMORY);
      } else {
         finish_cs(cmdbuf, i);
      }
   }

   return vk_command_buffer_end(&cmdbuf->vk);
}

static bool
src_stages_need_draw_flush(VkPipelineStageFlags2 stages)
{
   static const VkPipelineStageFlags2 draw_flush_stage_mask =
      VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
      VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
      VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
      VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
      VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
      VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT;

   return (stages & draw_flush_stage_mask) != 0;
}

static bool
stages_cover_subqueue(enum panvk_subqueue_id subqueue,
                      VkPipelineStageFlags2 stages)
{
   static const VkPipelineStageFlags2 queue_coverage[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] = VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
                                      VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
                                      VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT,
      [PANVK_SUBQUEUE_FRAGMENT] =
         VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
         VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
         VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
         VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT,
      [PANVK_SUBQUEUE_COMPUTE] =
         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_COPY_BIT,
   };

   return (stages & queue_coverage[subqueue]) != 0;
}

static uint32_t
src_stages_to_subqueue_sb_mask(enum panvk_subqueue_id subqueue,
                               VkPipelineStageFlags2 stages)
{
   if (!stages_cover_subqueue(subqueue, stages))
      return 0;

   /* Indirect draw buffers are read from the command stream, and load/store
    * operations are synchronized with the LS scoreboard immediately after the
    * read, so no need to wait in that case.
    */
   if (subqueue == PANVK_SUBQUEUE_VERTEX_TILER &&
       stages == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT)
      return 0;

   /* We need to wait for all previously submitted jobs, and given the
    * iterator scoreboard is a moving target, we just wait for the
    * whole dynamic scoreboard range. */
   return BITFIELD_RANGE(PANVK_SB_ITER_START, PANVK_SB_ITER_COUNT);
}


static void
collect_cache_flush_info(enum panvk_subqueue_id subqueue,
                         struct panvk_cache_flush_info *cache_flush,
                         VkPipelineStageFlags2 src_stages,
                         VkPipelineStageFlags2 dst_stages,
                         VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   static const VkAccessFlags2 dev_writes[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
                                      VK_ACCESS_2_SHADER_WRITE_BIT |
                                      VK_ACCESS_2_TRANSFER_WRITE_BIT,
      [PANVK_SUBQUEUE_FRAGMENT] =
         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | VK_ACCESS_2_SHADER_WRITE_BIT |
         VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
         VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
         VK_ACCESS_2_TRANSFER_WRITE_BIT,
      [PANVK_SUBQUEUE_COMPUTE] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
                                 VK_ACCESS_2_SHADER_WRITE_BIT |
                                 VK_ACCESS_2_TRANSFER_WRITE_BIT,
   };
   static const VkAccessFlags2 dev_reads[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] =
         VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_2_INDEX_READ_BIT |
         VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_2_UNIFORM_READ_BIT |
         VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_TRANSFER_READ_BIT |
         VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
         VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
      [PANVK_SUBQUEUE_FRAGMENT] =
         VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT |
         VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |
         VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
         VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
         VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
      [PANVK_SUBQUEUE_COMPUTE] =
         VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT |
         VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
         VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
   };

   /* Note on the cache organization:
    * - L2 cache is unified, so all changes to this cache are automatically
    *   visible to all GPU sub-components (shader cores, tiler, ...). This
    *   means we only need to flush when the host (AKA CPU) is involved.
    * - LS caches (which are basically just read-write L1 caches) are coherent
    *   with each other and with the L2 cache, so again, we only need to flush
    *   when the host is involved.
    * - Other read-only L1 caches (like the ones in front of the texture unit)
    *   are not coherent with the LS or L2 caches, and thus need to be
    *   invalidated any time a write happens.
    */

#define ACCESS_HITS_RO_L1_CACHE                                                \
   (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |                                      \
    VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |                                    \
    VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT |                            \
    VK_ACCESS_2_TRANSFER_READ_BIT)

   if ((dev_writes[subqueue] & src_access) &&
       (dev_reads[subqueue] & ACCESS_HITS_RO_L1_CACHE & dst_access))
      cache_flush->others |= true;

   /* If the host wrote something, we need to clean/invalidate everything. */
   if ((src_stages & VK_PIPELINE_STAGE_2_HOST_BIT) &&
       (src_access & VK_ACCESS_2_HOST_WRITE_BIT) &&
       ((dev_reads[subqueue] | dev_writes[subqueue]) & dst_access)) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->others |= true;
   }

   /* If the host needs to read something we wrote, we need to clean
    * everything. */
   if ((dst_stages & VK_PIPELINE_STAGE_2_HOST_BIT) &&
       (dst_access & VK_ACCESS_2_HOST_READ_BIT) &&
       (dev_writes[subqueue] & src_access)) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN;
   }
}

static void
collect_cs_deps(struct panvk_cmd_buffer *cmdbuf,
                VkPipelineStageFlags2 src_stages,
                VkPipelineStageFlags2 dst_stages, VkAccessFlags2 src_access,
                VkAccessFlags2 dst_access, struct panvk_cs_deps *deps)
{
   if (src_stages_need_draw_flush(src_stages) && cmdbuf->state.gfx.render.tiler)
      deps->needs_draw_flush = true;

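   /* For every subqueue covered by the source stages, record the scoreboard
    * slots that must be waited on and the cache maintenance required before
    * the dependency is satisfied. */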
   uint32_t wait_subqueue_mask = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      uint32_t sb_mask = src_stages_to_subqueue_sb_mask(i, src_stages);
      assert((sb_mask != 0) == stages_cover_subqueue(i, src_stages));
      if (!sb_mask)
         continue;

      deps->src[i].wait_sb_mask |= sb_mask;
      collect_cache_flush_info(i, &deps->src[i].cache_flush, src_stages,
                               dst_stages, src_access, dst_access);
      wait_subqueue_mask |= BITFIELD_BIT(i);
   }

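   /* Destination subqueues then wait on every source subqueue except
    * themselves: intra-subqueue ordering is already covered by the
    * scoreboard wait collected above. */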
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      if (!stages_cover_subqueue(i, dst_stages))
         continue;

      deps->dst[i].wait_subqueue_mask |= wait_subqueue_mask & ~BITFIELD_BIT(i);
   }
}

void
panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                            const VkDependencyInfo *in,
                            struct panvk_cs_deps *out)
{
   memset(out, 0, sizeof(*out));

   for (uint32_t i = 0; i < in->memoryBarrierCount; i++) {
      const VkMemoryBarrier2 *barrier = &in->pMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages =
         vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
      VkPipelineStageFlags2 dst_stages =
         vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
      VkAccessFlags2 src_access =
         vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
      VkAccessFlags2 dst_access =
         vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

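   /* Buffer and image barriers are handled like global memory barriers: only
    * the stage/access masks matter, the buffer ranges and image subresources
    * are ignored. */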
   for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) {
      const VkBufferMemoryBarrier2 *barrier = &in->pBufferMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages =
         vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
      VkPipelineStageFlags2 dst_stages =
         vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
      VkAccessFlags2 src_access =
         vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
      VkAccessFlags2 dst_access =
         vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) {
      const VkImageMemoryBarrier2 *barrier = &in->pImageMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages =
         vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
      VkPipelineStageFlags2 dst_stages =
         vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
      VkAccessFlags2 src_access =
         vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
      VkAccessFlags2 dst_access =
         vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   /* The draw flush will add a vertex -> fragment dependency, so we can skip
    * the one described in the deps. */
   if (out->needs_draw_flush)
      out->dst[PANVK_SUBQUEUE_FRAGMENT].wait_subqueue_mask &=
         ~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);
}

VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
                                    const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_cs_deps deps;

   panvk_per_arch(get_cs_deps)(cmdbuf, pDependencyInfo, &deps);

   if (deps.needs_draw_flush)
      panvk_per_arch(cmd_flush_draws)(cmdbuf);

   uint32_t wait_subqueue_mask = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
      wait_subqueue_mask |= deps.dst[i].wait_subqueue_mask;

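   /* Source side: wait for the in-flight work on each involved subqueue,
    * perform the required cache maintenance, and bump the subqueue sync
    * object when another subqueue needs to wait on it. */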
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      if (!deps.src[i].wait_sb_mask)
         continue;

      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
      struct panvk_cs_state *cs_state = &cmdbuf->state.cs[i];

      cs_wait_slots(b, deps.src[i].wait_sb_mask, false);

      struct panvk_cache_flush_info cache_flush = deps.src[i].cache_flush;
      if (cache_flush.l2 != MALI_CS_FLUSH_MODE_NONE ||
          cache_flush.lsc != MALI_CS_FLUSH_MODE_NONE || cache_flush.others) {
         struct cs_index flush_id = cs_scratch_reg32(b, 0);

         cs_move32_to(b, flush_id, 0);
         cs_flush_caches(b, cache_flush.l2, cache_flush.lsc, cache_flush.others,
                         flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
         cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
      }

      /* If no one waits on us, there's no point signaling the sync object. */
      if (wait_subqueue_mask & BITFIELD_BIT(i)) {
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index add_val = cs_scratch_reg64(b, 2);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i);
         cs_move64_to(b, add_val, 1);
         cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
                       cs_now());
         ++cs_state->relative_sync_point;
      }
   }

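   /* Destination side: make each destination subqueue wait until the source
    * subqueues have reached the sync point recorded above. */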
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      if (!deps.dst[i].wait_subqueue_mask)
         continue;

      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
      for (uint32_t j = 0; j < PANVK_SUBQUEUE_COUNT; j++) {
         if (!(deps.dst[i].wait_subqueue_mask & BITFIELD_BIT(j)))
            continue;

         struct panvk_cs_state *cs_state = &cmdbuf->state.cs[j];
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index wait_val = cs_scratch_reg64(b, 2);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * j);

         cs_add64(b, wait_val, cs_progress_seqno_reg(b, j),
                  cs_state->relative_sync_point);
         cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, wait_val,
                        sync_addr);
      }
   }
}

void
panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
                                enum panvk_subqueue_id subqueue)
{
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
   struct cs_index iter_sb = cs_scratch_reg32(b, 0);
   struct cs_index cmp_scratch = cs_scratch_reg32(b, 1);

   cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
                offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_wait_slot(b, SB_ID(LS), false);

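   /* The iterator scoreboard slot currently in use is stored in the subqueue
    * context: wait for it to be idle and make it the scoreboard entry used
    * by the asynchronous operations that follow. */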
   cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x)                                                                \
      cs_case(b, x) {                                                          \
         cs_wait_slot(b, SB_ITER(x), false);                                   \
         cs_set_scoreboard_entry(b, SB_ITER(x), SB_ID(LS));                    \
      }

      CASE(0)
      CASE(1)
      CASE(2)
      CASE(3)
      CASE(4)
#undef CASE
   }
}

static struct cs_buffer
alloc_cs_buffer(void *cookie)
{
   struct panvk_cmd_buffer *cmdbuf = cookie;
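   /* The CS builder counts capacity in 64-bit instructions, so the chunk
    * size in bytes is capacity * sizeof(uint64_t). */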
   const unsigned capacity = 64 * 1024 / sizeof(uint64_t);

   struct panfrost_ptr ptr =
      panvk_cmd_alloc_dev_mem(cmdbuf, cs, capacity * 8, 64);

   return (struct cs_buffer){
      .cpu = ptr.cpu,
      .gpu = ptr.gpu,
      .capacity = capacity,
   };
}

static enum cs_reg_perm
cs_reg_perm(struct cs_builder *b, unsigned reg)
{
   struct panvk_cs_state *cs_state =
      container_of(b, struct panvk_cs_state, builder);
   struct panvk_cs_reg_upd_context *upd_ctx;

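   /* A register is writable if any update context currently on the stack
    * grants RW access to it; otherwise fall back to the subqueue's base
    * permission table. */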
   for (upd_ctx = cs_state->reg_access.upd_ctx_stack; upd_ctx;
        upd_ctx = upd_ctx->next) {
      if (upd_ctx->reg_perm(b, reg) == CS_REG_RW)
         return CS_REG_RW;
   }

   return cs_state->reg_access.base_perm(b, reg);
}

static void
init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   const reg_perm_cb_t base_reg_perms[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] = panvk_cs_vt_reg_perm,
      [PANVK_SUBQUEUE_FRAGMENT] = panvk_cs_frag_reg_perm,
      [PANVK_SUBQUEUE_COMPUTE] = panvk_cs_compute_reg_perm,
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      /* Lazy allocation of the root CS. */
      struct cs_buffer root_cs = {0};

      struct cs_builder_conf conf = {
         .nr_registers = 96,
         .nr_kernel_registers = 4,
         .alloc_buffer = alloc_cs_buffer,
         .cookie = cmdbuf,
      };

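      /* In CS debug mode, track load/store operations on the LS scoreboard
       * slot and check register accesses against the per-subqueue permission
       * tables. */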
      if (instance->debug_flags & PANVK_DEBUG_CS) {
         cmdbuf->state.cs[i].ls_tracker = (struct cs_load_store_tracker){
            .sb_slot = SB_ID(LS),
         };

         conf.ls_tracker = &cmdbuf->state.cs[i].ls_tracker;

         cmdbuf->state.cs[i].reg_access.upd_ctx_stack = NULL;
         cmdbuf->state.cs[i].reg_access.base_perm = base_reg_perms[i];
         conf.reg_perm = cs_reg_perm;
      }

      cs_builder_init(&cmdbuf->state.cs[i].builder, &conf, root_cs);
   }
}

static void
panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
                   VkCommandBufferResetFlags flags)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);

   vk_command_buffer_reset(&cmdbuf->vk);

   panvk_pool_reset(&cmdbuf->cs_pool);
   panvk_pool_reset(&cmdbuf->desc_pool);
   panvk_pool_reset(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   list_inithead(&cmdbuf->push_sets);

   memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));
   init_cs_builders(cmdbuf);
}

static void
panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   panvk_pool_cleanup(&cmdbuf->cs_pool);
   panvk_pool_cleanup(&cmdbuf->desc_pool);
   panvk_pool_cleanup(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   vk_command_buffer_finish(&cmdbuf->vk);
   vk_free(&dev->vk.alloc, cmdbuf);
}

static VkResult
panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
                    struct vk_command_buffer **cmdbuf_out)
{
   struct panvk_device *device =
      container_of(vk_pool->base.device, struct panvk_device, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_pool, struct panvk_cmd_pool, vk);
   struct panvk_cmd_buffer *cmdbuf;

   cmdbuf = vk_zalloc(&device->vk.alloc, sizeof(*cmdbuf), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!cmdbuf)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_buffer_init(
      &pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level);
   if (result != VK_SUCCESS) {
      vk_free(&device->vk.alloc, cmdbuf);
      return result;
   }

   list_inithead(&cmdbuf->push_sets);
   cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi;
   cmdbuf->vk.dynamic_graphics_state.ms.sample_locations =
      &cmdbuf->state.gfx.dynamic.sl;

   struct panvk_pool_properties cs_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer CS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->cs_pool, device, &pool->cs_bo_pool, &cs_pool_props);

   struct panvk_pool_properties desc_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer descriptor pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->desc_pool, device, &pool->desc_bo_pool,
                   &desc_pool_props);

   struct panvk_pool_properties tls_pool_props = {
      .create_flags =
         panvk_device_adjust_bo_flags(device, PAN_KMOD_BO_FLAG_NO_MMAP),
      .slab_size = 64 * 1024,
      .label = "TLS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool,
                   &tls_pool_props);

   init_cs_builders(cmdbuf);
   *cmdbuf_out = &cmdbuf->vk;
   return VK_SUCCESS;
}

const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = {
   .create = panvk_create_cmdbuf,
   .reset = panvk_reset_cmdbuf,
   .destroy = panvk_destroy_cmdbuf,
};

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
                                   const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);

   vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);
   cmdbuf->flags = pBeginInfo->flags;

   /* The descriptor ringbuf trips up pandecode because we always point to the
    * next tiler/framebuffer descriptor after CS execution, which means we're
    * decoding an uninitialized or stale descriptor.
    * FIXME: find a way to trace the simultaneous path that doesn't crash. One
    * option would be to disable CS interpretation and dump the RUN_xxx
    * context on the side at execution time.
    */
   if (instance->debug_flags & PANVK_DEBUG_TRACE)
      cmdbuf->flags &= ~VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   return VK_SUCCESS;
}