/*
 * Copyright © 2021 Collabora Ltd.
 *
 * Derived from tu_cmd_buffer.c which is:
 * Copyright © 2016 Red Hat.
 * Copyright © 2016 Bas Nieuwenhuizen
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "genxml/gen_macros.h"

#include "panvk_buffer.h"
#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_cmd_desc_state.h"
#include "panvk_cmd_pool.h"
#include "panvk_cmd_push_constant.h"
#include "panvk_device.h"
#include "panvk_entrypoints.h"
#include "panvk_instance.h"
#include "panvk_physical_device.h"
#include "panvk_priv_bo.h"

#include "pan_blitter.h"
#include "pan_desc.h"
#include "pan_encoder.h"
#include "pan_props.h"
#include "pan_samples.h"

#include "vk_descriptor_update_template.h"
#include "vk_format.h"
#include "vk_synchronization.h"

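/*
 * Allocate the thread-local storage (scratch stack) backing memory for this
 * command buffer, sized from the per-thread TLS allocation and core count
 * reported by the kernel, then fill the TLS descriptor if one was reserved.
 */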
static void
emit_tls(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);
   unsigned core_id_range;
   panfrost_query_core_count(&phys_dev->kmod.props, &core_id_range);

   if (cmdbuf->state.tls.info.tls.size) {
      unsigned thread_tls_alloc =
         panfrost_query_thread_tls_alloc(&phys_dev->kmod.props);
      unsigned size = panfrost_get_total_stack_size(
         cmdbuf->state.tls.info.tls.size, thread_tls_alloc, core_id_range);

      cmdbuf->state.tls.info.tls.ptr =
         panvk_cmd_alloc_dev_mem(cmdbuf, tls, size, 4096).gpu;
   }

   assert(!cmdbuf->state.tls.info.wls.size);

   if (cmdbuf->state.tls.desc.cpu) {
      GENX(pan_emit_tls)(&cmdbuf->state.tls.info, cmdbuf->state.tls.desc.cpu);
   }
}

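/*
 * Emit the epilogue of a subqueue command stream: make the progress sequence
 * number registers reflect the sync points added while recording, clean the
 * caches so memory handed back to the command pool isn't corrupted by late
 * write-backs, signal the debug sync object when sync/trace debugging is
 * enabled, and finalize the CS builder.
 */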
static void
finish_cs(struct panvk_cmd_buffer *cmdbuf, uint32_t subqueue)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);

   cs_update_progress_seqno(b) {
      for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
         uint32_t rel_sync_point = cmdbuf->state.cs[i].relative_sync_point;

         if (!rel_sync_point)
            continue;

         cs_add64(b, cs_progress_seqno_reg(b, i), cs_progress_seqno_reg(b, i),
                  rel_sync_point);
      }
   }

   /* We need a clean because descriptor/CS memory can be returned to the
    * command pool where it gets recycled. If we don't clean dirty cache lines,
    * those cache lines might get evicted asynchronously and their content
    * pushed back to main memory after the CPU has written new stuff there. */
   struct cs_index flush_id = cs_scratch_reg32(b, 0);

   cs_move32_to(b, flush_id, 0);
   cs_wait_slots(b, SB_ALL_MASK, false);
   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN,
                   false, flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
   cs_wait_slot(b, SB_ID(IMM_FLUSH), false);

   /* If we're in sync/trace mode, we signal the debug object. */
   if (instance->debug_flags & (PANVK_DEBUG_SYNC | PANVK_DEBUG_TRACE)) {
      struct cs_index debug_sync_addr = cs_scratch_reg64(b, 0);
      struct cs_index one = cs_scratch_reg32(b, 2);
      struct cs_index error = cs_scratch_reg32(b, 3);
      struct cs_index cmp_scratch = cs_scratch_reg32(b, 2);

      cs_move32_to(b, one, 1);
      cs_load64_to(b, debug_sync_addr, cs_subqueue_ctx_reg(b),
                   offsetof(struct panvk_cs_subqueue_context, debug_syncobjs));
      cs_wait_slot(b, SB_ID(LS), false);
      cs_add64(b, debug_sync_addr, debug_sync_addr,
               sizeof(struct panvk_cs_sync32) * subqueue);
      cs_load32_to(b, error, debug_sync_addr,
                   offsetof(struct panvk_cs_sync32, error));
      cs_wait_slots(b, SB_ALL_MASK, false);
      cs_sync32_add(b, true, MALI_CS_SYNC_SCOPE_SYSTEM, one, debug_sync_addr,
                    cs_now());

      cs_match(b, error, cmp_scratch) {
         cs_case(b, 0) {
            /* Do nothing. */
         }

         cs_default(b) {
            /* Overwrite the sync error with the first error we encountered. */
            cs_store32(b, error, debug_sync_addr,
                       offsetof(struct panvk_cs_sync32, error));
            cs_wait_slots(b, SB_ID(LS), false);
         }
      }
   }

   cs_finish(&cmdbuf->state.cs[subqueue].builder);
}

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(EndCommandBuffer)(VkCommandBuffer commandBuffer)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);

   emit_tls(cmdbuf);

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      struct cs_builder *b = &cmdbuf->state.cs[i].builder;

      if (!cs_is_valid(b)) {
         vk_command_buffer_set_error(&cmdbuf->vk,
                                     VK_ERROR_OUT_OF_DEVICE_MEMORY);
      } else {
         finish_cs(cmdbuf, i);
      }
   }

   return vk_command_buffer_end(&cmdbuf->vk);
}

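/*
 * Return true if the source stages touch fragment or transfer work, in which
 * case the draws recorded so far must be flushed before the barrier can take
 * effect.
 */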
static bool
src_stages_need_draw_flush(VkPipelineStageFlags2 stages)
{
   static const VkPipelineStageFlags2 draw_flush_stage_mask =
      VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
      VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
      VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
      VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
      VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
      VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT;

   return (stages & draw_flush_stage_mask) != 0;
}

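/*
 * Return true if any of the given pipeline stages is executed on the
 * specified subqueue, based on a static stage -> subqueue mapping.
 */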
static bool
stages_cover_subqueue(enum panvk_subqueue_id subqueue,
                      VkPipelineStageFlags2 stages)
{
   static const VkPipelineStageFlags2 queue_coverage[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] = VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT |
                                      VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
                                      VK_PIPELINE_STAGE_2_VERTEX_SHADER_BIT,
      [PANVK_SUBQUEUE_FRAGMENT] =
         VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT |
         VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
         VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT |
         VK_PIPELINE_STAGE_2_COPY_BIT | VK_PIPELINE_STAGE_2_BLIT_BIT |
         VK_PIPELINE_STAGE_2_RESOLVE_BIT | VK_PIPELINE_STAGE_2_CLEAR_BIT,
      [PANVK_SUBQUEUE_COMPUTE] =
         VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_COPY_BIT,
   };

   return (stages & queue_coverage[subqueue]) != 0;
}

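/*
 * Translate source stages into the scoreboard slots a subqueue has to wait on
 * before the barrier is satisfied. Returns 0 if no wait is needed on this
 * subqueue.
 */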
static uint32_t
src_stages_to_subqueue_sb_mask(enum panvk_subqueue_id subqueue,
                               VkPipelineStageFlags2 stages)
{
   if (!stages_cover_subqueue(subqueue, stages))
      return 0;

   /* Indirect draw buffers are read from the command stream, and load/store
    * operations are synchronized with the LS scoreboard immediately after the
    * read, so no need to wait in that case.
    */
   if (subqueue == PANVK_SUBQUEUE_VERTEX_TILER &&
       stages == VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT)
      return 0;

   /* We need to wait for all previously submitted jobs, and given the
    * iterator scoreboard is a moving target, we just wait for the
    * whole dynamic scoreboard range. */
   return BITFIELD_RANGE(PANVK_SB_ITER_START, PANVK_SB_ITER_COUNT);
}

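/*
 * Accumulate the cache maintenance operations a subqueue needs for a given
 * source/destination stage and access combination. See the note on the cache
 * organization inside for the rationale.
 */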
static void
collect_cache_flush_info(enum panvk_subqueue_id subqueue,
                         struct panvk_cache_flush_info *cache_flush,
                         VkPipelineStageFlags2 src_stages,
                         VkPipelineStageFlags2 dst_stages,
                         VkAccessFlags2 src_access, VkAccessFlags2 dst_access)
{
   static const VkAccessFlags2 dev_writes[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
                                      VK_ACCESS_2_SHADER_WRITE_BIT |
                                      VK_ACCESS_2_TRANSFER_WRITE_BIT,
      [PANVK_SUBQUEUE_FRAGMENT] =
         VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT | VK_ACCESS_2_SHADER_WRITE_BIT |
         VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT |
         VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT |
         VK_ACCESS_2_TRANSFER_WRITE_BIT,
      [PANVK_SUBQUEUE_COMPUTE] = VK_ACCESS_2_SHADER_STORAGE_WRITE_BIT |
                                 VK_ACCESS_2_SHADER_WRITE_BIT |
                                 VK_ACCESS_2_TRANSFER_WRITE_BIT,
   };
   static const VkAccessFlags2 dev_reads[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] =
         VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_2_INDEX_READ_BIT |
         VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_2_UNIFORM_READ_BIT |
         VK_ACCESS_2_SHADER_READ_BIT | VK_ACCESS_2_TRANSFER_READ_BIT |
         VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
         VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
      [PANVK_SUBQUEUE_FRAGMENT] =
         VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT |
         VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |
         VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT |
         VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
         VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
      [PANVK_SUBQUEUE_COMPUTE] =
         VK_ACCESS_2_UNIFORM_READ_BIT | VK_ACCESS_2_SHADER_READ_BIT |
         VK_ACCESS_2_TRANSFER_READ_BIT | VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |
         VK_ACCESS_2_SHADER_STORAGE_READ_BIT,
   };

   /* Note on the cache organization:
    * - L2 cache is unified, so all changes to this cache are automatically
    *   visible to all GPU sub-components (shader cores, tiler, ...). This
    *   means we only need to flush when the host (AKA CPU) is involved.
    * - LS caches (which are basically just read-write L1 caches) are coherent
    *   with each other and with the L2 cache, so again, we only need to flush
    *   when the host is involved.
    * - Other read-only L1 caches (like the ones in front of the texture unit)
    *   are not coherent with the LS or L2 caches, and thus need to be
    *   invalidated any time a write happens.
    */

#define ACCESS_HITS_RO_L1_CACHE                                                \
   (VK_ACCESS_2_SHADER_SAMPLED_READ_BIT |                                      \
    VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT |                                    \
    VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT |                            \
    VK_ACCESS_2_TRANSFER_READ_BIT)

   if ((dev_writes[subqueue] & src_access) &&
       (dev_reads[subqueue] & ACCESS_HITS_RO_L1_CACHE & dst_access))
      cache_flush->others |= true;

   /* If the host wrote something, we need to clean/invalidate everything. */
   if ((src_stages & VK_PIPELINE_STAGE_2_HOST_BIT) &&
       (src_access & VK_ACCESS_2_HOST_WRITE_BIT) &&
       ((dev_reads[subqueue] | dev_writes[subqueue]) & dst_access)) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN_AND_INVALIDATE;
      cache_flush->others |= true;
   }

   /* If the host needs to read something we wrote, we need to clean
    * everything. */
   if ((dst_stages & VK_PIPELINE_STAGE_2_HOST_BIT) &&
       (dst_access & VK_ACCESS_2_HOST_READ_BIT) &&
       (dev_writes[subqueue] & src_access)) {
      cache_flush->l2 |= MALI_CS_FLUSH_MODE_CLEAN;
      cache_flush->lsc |= MALI_CS_FLUSH_MODE_CLEAN;
   }
}

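/*
 * Fold one barrier into the per-subqueue dependency description: scoreboard
 * slots to wait on and caches to flush on the source side, and the mask of
 * other subqueues each destination subqueue has to wait for.
 */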
static void
collect_cs_deps(struct panvk_cmd_buffer *cmdbuf,
                VkPipelineStageFlags2 src_stages,
                VkPipelineStageFlags2 dst_stages, VkAccessFlags src_access,
                VkAccessFlags dst_access, struct panvk_cs_deps *deps)
{
   if (src_stages_need_draw_flush(src_stages) && cmdbuf->state.gfx.render.tiler)
      deps->needs_draw_flush = true;

   uint32_t wait_subqueue_mask = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      uint32_t sb_mask = src_stages_to_subqueue_sb_mask(i, src_stages);
      assert((sb_mask != 0) == stages_cover_subqueue(i, src_stages));
      if (!sb_mask)
         continue;

      deps->src[i].wait_sb_mask |= sb_mask;
      collect_cache_flush_info(i, &deps->src[i].cache_flush, src_stages,
                               dst_stages, src_access, dst_access);
      wait_subqueue_mask |= BITFIELD_BIT(i);
   }

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      if (!stages_cover_subqueue(i, dst_stages))
         continue;

      deps->dst[i].wait_subqueue_mask |= wait_subqueue_mask & ~BITFIELD_BIT(i);
   }
}

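/*
 * Walk all memory, buffer and image barriers of a VkDependencyInfo and
 * collect the command-stream dependencies they imply.
 */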
void
panvk_per_arch(get_cs_deps)(struct panvk_cmd_buffer *cmdbuf,
                            const VkDependencyInfo *in,
                            struct panvk_cs_deps *out)
{
   memset(out, 0, sizeof(*out));

   for (uint32_t i = 0; i < in->memoryBarrierCount; i++) {
      const VkMemoryBarrier2 *barrier = &in->pMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages =
         vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
      VkPipelineStageFlags2 dst_stages =
         vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
      VkAccessFlags2 src_access =
         vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
      VkAccessFlags2 dst_access =
         vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   for (uint32_t i = 0; i < in->bufferMemoryBarrierCount; i++) {
      const VkBufferMemoryBarrier2 *barrier = &in->pBufferMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages =
         vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
      VkPipelineStageFlags2 dst_stages =
         vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
      VkAccessFlags2 src_access =
         vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
      VkAccessFlags2 dst_access =
         vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   for (uint32_t i = 0; i < in->imageMemoryBarrierCount; i++) {
      const VkImageMemoryBarrier2 *barrier = &in->pImageMemoryBarriers[i];
      VkPipelineStageFlags2 src_stages =
         vk_expand_pipeline_stage_flags2(barrier->srcStageMask);
      VkPipelineStageFlags2 dst_stages =
         vk_expand_pipeline_stage_flags2(barrier->dstStageMask);
      VkAccessFlags2 src_access =
         vk_filter_src_access_flags2(src_stages, barrier->srcAccessMask);
      VkAccessFlags2 dst_access =
         vk_filter_dst_access_flags2(dst_stages, barrier->dstAccessMask);

      collect_cs_deps(cmdbuf, src_stages, dst_stages, src_access, dst_access,
                      out);
   }

   /* The draw flush will add a vertex -> fragment dependency, so we can skip
    * the one described in the deps. */
   if (out->needs_draw_flush)
      out->dst[PANVK_SUBQUEUE_FRAGMENT].wait_subqueue_mask &=
         ~BITFIELD_BIT(PANVK_SUBQUEUE_VERTEX_TILER);
}

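/*
 * Pipeline barriers are emitted in three steps: flush the pending draws if
 * required, then on each source subqueue wait for the relevant scoreboard
 * slots, issue the cache maintenance and bump its sync object, and finally
 * make each destination subqueue wait until the source sync objects reach
 * the expected sequence numbers.
 */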
VKAPI_ATTR void VKAPI_CALL
panvk_per_arch(CmdPipelineBarrier2)(VkCommandBuffer commandBuffer,
                                    const VkDependencyInfo *pDependencyInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_cs_deps deps;

   panvk_per_arch(get_cs_deps)(cmdbuf, pDependencyInfo, &deps);

   if (deps.needs_draw_flush)
      panvk_per_arch(cmd_flush_draws)(cmdbuf);

   uint32_t wait_subqueue_mask = 0;
   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++)
      wait_subqueue_mask |= deps.dst[i].wait_subqueue_mask;

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      if (!deps.src[i].wait_sb_mask)
         continue;

      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
      struct panvk_cs_state *cs_state = &cmdbuf->state.cs[i];

      cs_wait_slots(b, deps.src[i].wait_sb_mask, false);

      struct panvk_cache_flush_info cache_flush = deps.src[i].cache_flush;
      if (cache_flush.l2 != MALI_CS_FLUSH_MODE_NONE ||
          cache_flush.lsc != MALI_CS_FLUSH_MODE_NONE || cache_flush.others) {
         struct cs_index flush_id = cs_scratch_reg32(b, 0);

         cs_move32_to(b, flush_id, 0);
         cs_flush_caches(b, cache_flush.l2, cache_flush.lsc, cache_flush.others,
                         flush_id, cs_defer(SB_IMM_MASK, SB_ID(IMM_FLUSH)));
         cs_wait_slot(b, SB_ID(IMM_FLUSH), false);
      }

      /* If no one waits on us, there's no point signaling the sync object. */
      if (wait_subqueue_mask & BITFIELD_BIT(i)) {
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index add_val = cs_scratch_reg64(b, 2);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * i);
         cs_move64_to(b, add_val, 1);
         cs_sync64_add(b, false, MALI_CS_SYNC_SCOPE_CSG, add_val, sync_addr,
                       cs_now());
         ++cs_state->relative_sync_point;
      }
   }

   for (uint32_t i = 0; i < PANVK_SUBQUEUE_COUNT; i++) {
      if (!deps.dst[i].wait_subqueue_mask)
         continue;

      struct cs_builder *b = panvk_get_cs_builder(cmdbuf, i);
      for (uint32_t j = 0; j < PANVK_SUBQUEUE_COUNT; j++) {
         if (!(deps.dst[i].wait_subqueue_mask & BITFIELD_BIT(j)))
            continue;

         struct panvk_cs_state *cs_state = &cmdbuf->state.cs[j];
         struct cs_index sync_addr = cs_scratch_reg64(b, 0);
         struct cs_index wait_val = cs_scratch_reg64(b, 2);

         cs_load64_to(b, sync_addr, cs_subqueue_ctx_reg(b),
                      offsetof(struct panvk_cs_subqueue_context, syncobjs));
         cs_wait_slot(b, SB_ID(LS), false);
         cs_add64(b, sync_addr, sync_addr, sizeof(struct panvk_cs_sync64) * j);

         cs_add64(b, wait_val, cs_progress_seqno_reg(b, j),
                  cs_state->relative_sync_point);
         cs_sync64_wait(b, false, MALI_CS_CONDITION_GREATER, wait_val,
                        sync_addr);
      }
   }
}

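/*
 * Look up the iterator scoreboard slot recorded in the subqueue context, wait
 * for it to be idle and make it the active scoreboard entry for subsequently
 * issued jobs.
 */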
void
panvk_per_arch(cs_pick_iter_sb)(struct panvk_cmd_buffer *cmdbuf,
                                enum panvk_subqueue_id subqueue)
{
   struct cs_builder *b = panvk_get_cs_builder(cmdbuf, subqueue);
   struct cs_index iter_sb = cs_scratch_reg32(b, 0);
   struct cs_index cmp_scratch = cs_scratch_reg32(b, 1);

   cs_load32_to(b, iter_sb, cs_subqueue_ctx_reg(b),
                offsetof(struct panvk_cs_subqueue_context, iter_sb));
   cs_wait_slot(b, SB_ID(LS), false);

   cs_match(b, iter_sb, cmp_scratch) {
#define CASE(x)                                                                \
      cs_case(b, x) {                                                          \
         cs_wait_slot(b, SB_ITER(x), false);                                   \
         cs_set_scoreboard_entry(b, SB_ITER(x), SB_ID(LS));                    \
      }

      CASE(0)
      CASE(1)
      CASE(2)
      CASE(3)
      CASE(4)
#undef CASE
   }
}

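/* cs_builder allocation callback: carve a new 64 KB chunk for the command
 * stream out of the command buffer CS pool. */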
static struct cs_buffer
alloc_cs_buffer(void *cookie)
{
   struct panvk_cmd_buffer *cmdbuf = cookie;
   const unsigned capacity = 64 * 1024 / sizeof(uint64_t);

   struct panfrost_ptr ptr =
      panvk_cmd_alloc_dev_mem(cmdbuf, cs, capacity * 8, 64);

   return (struct cs_buffer){
      .cpu = ptr.cpu,
      .gpu = ptr.gpu,
      .capacity = capacity,
   };
}

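/* Register permission callback used when CS debugging is enabled: a register
 * is writable if any update context on the stack grants RW access, otherwise
 * the subqueue base permissions apply. */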
static enum cs_reg_perm
cs_reg_perm(struct cs_builder *b, unsigned reg)
{
   struct panvk_cs_state *cs_state =
      container_of(b, struct panvk_cs_state, builder);
   struct panvk_cs_reg_upd_context *upd_ctx;

   for (upd_ctx = cs_state->reg_access.upd_ctx_stack; upd_ctx;
        upd_ctx = upd_ctx->next) {
      if (upd_ctx->reg_perm(b, reg) == CS_REG_RW)
         return CS_REG_RW;
   }

   return cs_state->reg_access.base_perm(b, reg);
}

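/*
 * Set up one CS builder per subqueue. The root chunk is allocated lazily on
 * first use, and load/store tracking plus register permission checking are
 * only enabled when the PANVK_DEBUG_CS flag is set.
 */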
static void
init_cs_builders(struct panvk_cmd_buffer *cmdbuf)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);
   const reg_perm_cb_t base_reg_perms[PANVK_SUBQUEUE_COUNT] = {
      [PANVK_SUBQUEUE_VERTEX_TILER] = panvk_cs_vt_reg_perm,
      [PANVK_SUBQUEUE_FRAGMENT] = panvk_cs_frag_reg_perm,
      [PANVK_SUBQUEUE_COMPUTE] = panvk_cs_compute_reg_perm,
   };

   for (uint32_t i = 0; i < ARRAY_SIZE(cmdbuf->state.cs); i++) {
      /* Lazy allocation of the root CS. */
      struct cs_buffer root_cs = {0};

      struct cs_builder_conf conf = {
         .nr_registers = 96,
         .nr_kernel_registers = 4,
         .alloc_buffer = alloc_cs_buffer,
         .cookie = cmdbuf,
      };

      if (instance->debug_flags & PANVK_DEBUG_CS) {
         cmdbuf->state.cs[i].ls_tracker = (struct cs_load_store_tracker){
            .sb_slot = SB_ID(LS),
         };

         conf.ls_tracker = &cmdbuf->state.cs[i].ls_tracker;

         cmdbuf->state.cs[i].reg_access.upd_ctx_stack = NULL;
         cmdbuf->state.cs[i].reg_access.base_perm = base_reg_perms[i];
         conf.reg_perm = cs_reg_perm;
      }

      cs_builder_init(&cmdbuf->state.cs[i].builder, &conf, root_cs);
   }
}

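/*
 * Reset: give the pooled CS/descriptor/TLS memory and the push sets back to
 * the command pool, wipe the recorded state and re-create the CS builders.
 */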
static void
panvk_reset_cmdbuf(struct vk_command_buffer *vk_cmdbuf,
                   VkCommandBufferResetFlags flags)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);

   vk_command_buffer_reset(&cmdbuf->vk);

   panvk_pool_reset(&cmdbuf->cs_pool);
   panvk_pool_reset(&cmdbuf->desc_pool);
   panvk_pool_reset(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   list_inithead(&cmdbuf->push_sets);

   memset(&cmdbuf->state, 0, sizeof(cmdbuf->state));
   init_cs_builders(cmdbuf);
}

static void
panvk_destroy_cmdbuf(struct vk_command_buffer *vk_cmdbuf)
{
   struct panvk_cmd_buffer *cmdbuf =
      container_of(vk_cmdbuf, struct panvk_cmd_buffer, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_cmdbuf->pool, struct panvk_cmd_pool, vk);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   panvk_pool_cleanup(&cmdbuf->cs_pool);
   panvk_pool_cleanup(&cmdbuf->desc_pool);
   panvk_pool_cleanup(&cmdbuf->tls_pool);
   list_splicetail(&cmdbuf->push_sets, &pool->push_sets);
   vk_command_buffer_finish(&cmdbuf->vk);
   vk_free(&dev->vk.alloc, cmdbuf);
}

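/*
 * Create a command buffer along with its CS, descriptor and TLS sub-pools,
 * all backed by the BO caches owned by the command pool.
 */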
static VkResult
panvk_create_cmdbuf(struct vk_command_pool *vk_pool, VkCommandBufferLevel level,
                    struct vk_command_buffer **cmdbuf_out)
{
   struct panvk_device *device =
      container_of(vk_pool->base.device, struct panvk_device, vk);
   struct panvk_cmd_pool *pool =
      container_of(vk_pool, struct panvk_cmd_pool, vk);
   struct panvk_cmd_buffer *cmdbuf;

   cmdbuf = vk_zalloc(&device->vk.alloc, sizeof(*cmdbuf), 8,
                      VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (!cmdbuf)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   VkResult result = vk_command_buffer_init(
      &pool->vk, &cmdbuf->vk, &panvk_per_arch(cmd_buffer_ops), level);
   if (result != VK_SUCCESS) {
      vk_free(&device->vk.alloc, cmdbuf);
      return result;
   }

   list_inithead(&cmdbuf->push_sets);
   cmdbuf->vk.dynamic_graphics_state.vi = &cmdbuf->state.gfx.dynamic.vi;
   cmdbuf->vk.dynamic_graphics_state.ms.sample_locations =
      &cmdbuf->state.gfx.dynamic.sl;

   struct panvk_pool_properties cs_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer CS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->cs_pool, device, &pool->cs_bo_pool, &cs_pool_props);

   struct panvk_pool_properties desc_pool_props = {
      .create_flags = 0,
      .slab_size = 64 * 1024,
      .label = "Command buffer descriptor pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->desc_pool, device, &pool->desc_bo_pool,
                   &desc_pool_props);

   struct panvk_pool_properties tls_pool_props = {
      .create_flags =
         panvk_device_adjust_bo_flags(device, PAN_KMOD_BO_FLAG_NO_MMAP),
      .slab_size = 64 * 1024,
      .label = "TLS pool",
      .prealloc = false,
      .owns_bos = true,
      .needs_locking = false,
   };
   panvk_pool_init(&cmdbuf->tls_pool, device, &pool->tls_bo_pool,
                   &tls_pool_props);

   init_cs_builders(cmdbuf);
   *cmdbuf_out = &cmdbuf->vk;
   return VK_SUCCESS;
}

const struct vk_command_buffer_ops panvk_per_arch(cmd_buffer_ops) = {
   .create = panvk_create_cmdbuf,
   .reset = panvk_reset_cmdbuf,
   .destroy = panvk_destroy_cmdbuf,
};

VKAPI_ATTR VkResult VKAPI_CALL
panvk_per_arch(BeginCommandBuffer)(VkCommandBuffer commandBuffer,
                                   const VkCommandBufferBeginInfo *pBeginInfo)
{
   VK_FROM_HANDLE(panvk_cmd_buffer, cmdbuf, commandBuffer);
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);
   struct panvk_instance *instance =
      to_panvk_instance(dev->vk.physical->instance);

   vk_command_buffer_begin(&cmdbuf->vk, pBeginInfo);
   cmdbuf->flags = pBeginInfo->flags;

   /* The descriptor ringbuf trips out pandecode because we always point to the
    * next tiler/framebuffer descriptor after CS execution, which means we're
    * decoding an uninitialized or stale descriptor.
    * FIXME: find a way to trace the simultaneous path that doesn't crash. One
    * option would be to disable CS interpretation and dump the RUN_xxx context
    * on the side at execution time.
    */
   if (instance->debug_flags & PANVK_DEBUG_TRACE)
      cmdbuf->flags &= ~VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   return VK_SUCCESS;
}
704