/*
 * Copyright (C) 2023 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "decode.h"

#include "drm-uapi/panthor_drm.h"

#include "genxml/cs_builder.h"

#include "pan_blitter.h"
#include "pan_cmdstream.h"
#include "pan_context.h"
#include "pan_csf.h"
#include "pan_job.h"

#if PAN_ARCH < 10
#error "CSF helpers are only used for gen >= 10"
#endif

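/* Allocation callback used by the CS builder when the current chunk runs out
 * of space: carve a new chunk out of the batch's CS chunk pool. The capacity
 * is expressed in 64-bit CS instructions, hence the "* 8" when converting to
 * a byte size below.
 */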
static struct cs_buffer
csf_alloc_cs_buffer(void *cookie)
{
   assert(cookie && "Self-contained queues can't be extended.");

   struct panfrost_batch *batch = cookie;
   unsigned capacity = 4096;

   struct panfrost_ptr ptr =
      pan_pool_alloc_aligned(&batch->csf.cs_chunk_pool.base, capacity * 8, 64);

   return (struct cs_buffer){
      .cpu = ptr.cpu,
      .gpu = ptr.gpu,
      .capacity = capacity,
   };
}

void
GENX(csf_cleanup_batch)(struct panfrost_batch *batch)
{
   free(batch->csf.cs.builder);

   panfrost_pool_cleanup(&batch->csf.cs_chunk_pool);
}

void
GENX(csf_init_batch)(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

   /* Initialize the CS chunk pool. */
   panfrost_pool_init(&batch->csf.cs_chunk_pool, NULL, dev, 0, 32768,
                      "CS chunk pool", false, true);

   /* Allocate and bind the command queue */
   struct cs_buffer queue = csf_alloc_cs_buffer(batch);
   const struct cs_builder_conf conf = {
      .nr_registers = 96,
      .nr_kernel_registers = 4,
      .alloc_buffer = csf_alloc_cs_buffer,
      .cookie = batch,
   };

   /* Setup the queue builder */
   batch->csf.cs.builder = malloc(sizeof(struct cs_builder));
   cs_builder_init(batch->csf.cs.builder, &conf, queue);
   cs_req_res(batch->csf.cs.builder,
              CS_COMPUTE_RES | CS_TILER_RES | CS_IDVS_RES | CS_FRAG_RES);

   /* Set up entries */
   struct cs_builder *b = batch->csf.cs.builder;
   cs_set_scoreboard_entry(b, 2, 0);

   batch->framebuffer = pan_pool_alloc_desc_aggregate(
      &batch->pool.base, PAN_DESC(FRAMEBUFFER), PAN_DESC(ZS_CRC_EXTENSION),
      PAN_DESC_ARRAY(MAX2(batch->key.nr_cbufs, 1), RENDER_TARGET));
   batch->tls = pan_pool_alloc_desc(&batch->pool.base, LOCAL_STORAGE);
}

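/* Fill a drm_panthor_queue_submit for one command stream. The current flush
 * ID is snapshotted from the kernel and recorded in the submit so redundant
 * cache flushes can be elided at submission time.
 */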
static void
csf_prepare_qsubmit(struct panfrost_context *ctx,
                    struct drm_panthor_queue_submit *submit, uint8_t queue,
                    uint64_t cs_start, uint32_t cs_size,
                    struct drm_panthor_sync_op *syncs, uint32_t sync_count)
{
   struct panfrost_device *dev = pan_device(ctx->base.screen);

   *submit = (struct drm_panthor_queue_submit){
      .queue_index = queue,
      .stream_addr = cs_start,
      .stream_size = cs_size,
      .latest_flush = panthor_kmod_get_flush_id(dev->kmod.dev),
      .syncs = DRM_PANTHOR_OBJ_ARRAY(sync_count, syncs),
   };
}

static void
csf_prepare_gsubmit(struct panfrost_context *ctx,
                    struct drm_panthor_group_submit *gsubmit,
                    struct drm_panthor_queue_submit *qsubmits,
                    uint32_t qsubmit_count)
{
   *gsubmit = (struct drm_panthor_group_submit){
      .group_handle = ctx->csf.group_handle,
      .queue_submits = DRM_PANTHOR_OBJ_ARRAY(qsubmit_count, qsubmits),
   };
}

static int
csf_submit_gsubmit(struct panfrost_context *ctx,
                   struct drm_panthor_group_submit *gsubmit)
{
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   int ret = 0;

   if (!ctx->is_noop) {
      ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_SUBMIT,
                     gsubmit);
   }

   if (ret)
      return errno;

   return 0;
}

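/* Emit the epilogue of a batch command stream: wait on all scoreboard slots,
 * optionally store the CS error status when PAN_DBG_SYNC is set, then flush
 * caches and finalize the stream.
 */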
static void
csf_emit_batch_end(struct panfrost_batch *batch)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
   struct cs_builder *b = batch->csf.cs.builder;

   /* Barrier to let everything finish */
   cs_wait_slots(b, BITFIELD_MASK(8), false);

   if (dev->debug & PAN_DBG_SYNC) {
      /* Get the CS state */
      batch->csf.cs.state = pan_pool_alloc_aligned(&batch->pool.base, 8, 8);
      memset(batch->csf.cs.state.cpu, ~0, 8);
      cs_move64_to(b, cs_reg64(b, 90), batch->csf.cs.state.gpu);
      cs_store_state(b, cs_reg64(b, 90), 0, MALI_CS_STATE_ERROR_STATUS,
                     cs_now());
   }

   /* Flush caches now that we're done (synchronous) */
   struct cs_index flush_id = cs_reg32(b, 74);
   cs_move32_to(b, flush_id, 0);
   cs_flush_caches(b, MALI_CS_FLUSH_MODE_CLEAN, MALI_CS_FLUSH_MODE_CLEAN, true,
                   flush_id, cs_defer(0, 0));
   cs_wait_slot(b, 0, false);

   /* Finish the command stream */
   assert(cs_is_valid(batch->csf.cs.builder));
   cs_finish(batch->csf.cs.builder);
}

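/* Walk the BOs referenced by the batch and turn their kernel-side sync points
 * into panthor WAIT sync operations. Waits on the VM timeline are folded into
 * a single wait on the highest point seen, and an imported in_sync_fd, if
 * any, is converted into a wait as well.
 */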
static int
csf_submit_collect_wait_ops(struct panfrost_batch *batch,
                            struct util_dynarray *syncops,
                            uint32_t vm_sync_handle)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   uint64_t vm_sync_wait_point = 0, bo_sync_point;
   uint32_t bo_sync_handle;
   int ret;

   /* We don't wait on BOs attached to the various batch pools, because those
    * are private to the batch, and are guaranteed to be idle at allocation
    * time. We need to iterate over other BOs accessed by the batch though,
    * to add the corresponding wait operations.
    */
   util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) {
      unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0);
      pan_bo_access flags = *ptr;

      if (!flags)
         continue;

      /* Update the BO access flags so that panfrost_bo_wait() knows
       * about all pending accesses.
       * We only keep the READ/WRITE info since this is all the BO
       * wait logic cares about.
       * We also preserve existing flags as this batch might not
       * be the first one to access the BO.
       */
      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      ret = panthor_kmod_bo_get_sync_point(bo->kmod_bo, &bo_sync_handle,
                                           &bo_sync_point,
                                           !(flags & PAN_BO_ACCESS_WRITE));
      if (ret)
         return ret;

      if (bo_sync_handle == vm_sync_handle) {
         vm_sync_wait_point = MAX2(vm_sync_wait_point, bo_sync_point);
         continue;
      }

      assert(bo_sync_point == 0 || !bo->kmod_bo->exclusive_vm);

      struct drm_panthor_sync_op waitop = {
         .flags =
            DRM_PANTHOR_SYNC_OP_WAIT |
            (bo_sync_point ? DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ
                           : DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ),
         .handle = bo_sync_handle,
         .timeline_value = bo_sync_point,
      };

      util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
   }

   if (vm_sync_wait_point > 0) {
      struct drm_panthor_sync_op waitop = {
         .flags = DRM_PANTHOR_SYNC_OP_WAIT |
                  DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ,
         .handle = vm_sync_handle,
         .timeline_value = vm_sync_wait_point,
      };

      util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
   }

   if (ctx->in_sync_fd >= 0) {
      ret = drmSyncobjImportSyncFile(panfrost_device_fd(dev), ctx->in_sync_obj,
                                     ctx->in_sync_fd);
      if (ret)
         return ret;

      struct drm_panthor_sync_op waitop = {
         .flags =
            DRM_PANTHOR_SYNC_OP_WAIT | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ,
         .handle = ctx->in_sync_obj,
      };

      util_dynarray_append(syncops, struct drm_panthor_sync_op, waitop);
   }

   return 0;
}

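/* After a successful submission, attach the new VM timeline point to every BO
 * touched by the batch (pool BOs are conservatively marked RW) so that later
 * submissions and CPU waits can synchronize against it, then mirror the point
 * into the context syncobj.
 */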
static int
csf_attach_sync_points(struct panfrost_batch *batch, uint32_t vm_sync_handle,
                       uint64_t vm_sync_signal_point)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   int ret;

   /* There should be no invisible allocation on CSF. */
   assert(batch->invisible_pool.bos.size == 0);

   /* Attach sync points to batch-private BOs first. We assume BOs can
    * be written by the GPU to keep things simple.
    */
   util_dynarray_foreach(&batch->pool.bos, struct panfrost_bo *, bo) {
      (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
      ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
                                              vm_sync_signal_point, true);
      if (ret)
         return ret;
   }

   util_dynarray_foreach(&batch->csf.cs_chunk_pool.bos, struct panfrost_bo *,
                         bo) {
      (*bo)->gpu_access |= PAN_BO_ACCESS_RW;
      ret = panthor_kmod_bo_attach_sync_point((*bo)->kmod_bo, vm_sync_handle,
                                              vm_sync_signal_point, true);
      if (ret)
         return ret;
   }

   /* Attach the VM sync point to all resources accessed by the batch. */
   util_dynarray_foreach(&batch->bos, pan_bo_access, ptr) {
      unsigned i = ptr - util_dynarray_element(&batch->bos, pan_bo_access, 0);
      pan_bo_access flags = *ptr;

      if (!flags)
         continue;

      struct panfrost_bo *bo = pan_lookup_bo(dev, i);

      bo->gpu_access |= flags & (PAN_BO_ACCESS_RW);
      ret = panthor_kmod_bo_attach_sync_point(bo->kmod_bo, vm_sync_handle,
                                              vm_sync_signal_point,
                                              flags & PAN_BO_ACCESS_WRITE);
      if (ret)
         return ret;
   }

   /* And finally transfer the VM sync point to the context syncobj. */
   return drmSyncobjTransfer(panfrost_device_fd(dev), ctx->syncobj, 0,
                             vm_sync_handle, vm_sync_signal_point, 0);
}

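/* Called when a group submit failed: query the scheduling group state and, if
 * the group is dead but the VM is still usable, reinitialize the context so
 * it can keep rendering.
 */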
static void
csf_check_ctx_state_and_reinit(struct panfrost_context *ctx)
{
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   struct drm_panthor_group_get_state state = {
      .group_handle = ctx->csf.group_handle,
   };
   int ret;

   ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_GET_STATE,
                  &state);
   if (ret) {
      mesa_loge("DRM_IOCTL_PANTHOR_GROUP_GET_STATE failed (err=%d)", errno);
      return;
   }

   /* Context is still usable. This was a transient error. */
   if (state.state == 0)
      return;

   /* If the VM is unusable, we can't do much, as this is shared between all
    * contexts, and restoring the VM state is non-trivial.
    */
   if (pan_kmod_vm_query_state(dev->kmod.vm) != PAN_KMOD_VM_USABLE) {
      mesa_loge("VM became unusable, we can't reset the context");
      assert(!"VM became unusable, we can't reset the context");
   }

   panfrost_context_reinit(ctx);
}

static void
csf_submit_wait_and_dump(struct panfrost_batch *batch,
                         const struct drm_panthor_group_submit *gsubmit,
                         uint32_t vm_sync_handle, uint64_t vm_sync_signal_point)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   bool wait = (dev->debug & (PAN_DBG_TRACE | PAN_DBG_SYNC)) && !ctx->is_noop;
   bool dump = (dev->debug & PAN_DBG_TRACE);
   bool crash = false;

   if (!wait && !dump)
      return;

   /* Wait so we can get errors reported back */
   if (wait) {
      int ret =
         drmSyncobjTimelineWait(panfrost_device_fd(dev), &vm_sync_handle,
                                &vm_sync_signal_point, 1, INT64_MAX, 0, NULL);
      assert(ret >= 0);
   }

   /* Jobs won't be complete if blackhole rendering, that's ok */
   if (!ctx->is_noop && (dev->debug & PAN_DBG_SYNC) &&
       *((uint64_t *)batch->csf.cs.state.cpu) != 0) {
      crash = true;
      dump = true;
   }

   if (dump) {
      const struct drm_panthor_queue_submit *qsubmits =
         (void *)(uintptr_t)gsubmit->queue_submits.array;

      for (unsigned i = 0; i < gsubmit->queue_submits.count; i++) {
         uint32_t regs[256] = {0};
         pandecode_cs(dev->decode_ctx, qsubmits[i].stream_addr,
                      qsubmits[i].stream_size, panfrost_device_gpu_id(dev),
                      regs);
      }

      if (dev->debug & PAN_DBG_DUMP)
         pandecode_dump_mappings(dev->decode_ctx);
   }

   if (crash) {
      fprintf(stderr, "Incomplete job or timeout\n");
      fflush(NULL);
      abort();
   }
}

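/* Submit a batch: close the command stream, collect wait operations, take the
 * next point on the VM timeline as the signal operation, submit the group,
 * then on success run the debug wait/dump path and attach sync points, or on
 * failure check whether the context needs to be reinitialized.
 */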
int
GENX(csf_submit_batch)(struct panfrost_batch *batch)
{
   /* Close the batch before submitting. */
   csf_emit_batch_end(batch);

   uint64_t cs_start = cs_root_chunk_gpu_addr(batch->csf.cs.builder);
   uint32_t cs_size = cs_root_chunk_size(batch->csf.cs.builder);
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   uint32_t vm_sync_handle = panthor_kmod_vm_sync_handle(dev->kmod.vm);
   struct util_dynarray syncops;
   int ret;

   util_dynarray_init(&syncops, NULL);

   ret = csf_submit_collect_wait_ops(batch, &syncops, vm_sync_handle);
   if (ret)
      goto out_free_syncops;

   uint64_t vm_sync_cur_point = panthor_kmod_vm_sync_lock(dev->kmod.vm);
   uint64_t vm_sync_signal_point = vm_sync_cur_point + 1;

   struct drm_panthor_sync_op signalop = {
      .flags = DRM_PANTHOR_SYNC_OP_SIGNAL |
               DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_TIMELINE_SYNCOBJ,
      .handle = vm_sync_handle,
      .timeline_value = vm_sync_signal_point,
   };

   util_dynarray_append(&syncops, struct drm_panthor_sync_op, signalop);

   struct drm_panthor_queue_submit qsubmit;
   struct drm_panthor_group_submit gsubmit;

   csf_prepare_qsubmit(
      ctx, &qsubmit, 0, cs_start, cs_size, util_dynarray_begin(&syncops),
      util_dynarray_num_elements(&syncops, struct drm_panthor_sync_op));
   csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1);
   ret = csf_submit_gsubmit(ctx, &gsubmit);
   panthor_kmod_vm_sync_unlock(dev->kmod.vm,
                               ret ? vm_sync_cur_point : vm_sync_signal_point);

   if (!ret) {
      csf_submit_wait_and_dump(batch, &gsubmit, vm_sync_handle,
                               vm_sync_signal_point);
      ret = csf_attach_sync_points(batch, vm_sync_handle, vm_sync_signal_point);
   } else {
      csf_check_ctx_state_and_reinit(batch->ctx);
   }

out_free_syncops:
   util_dynarray_fini(&syncops);
   return ret;
}

void
GENX(csf_preload_fb)(struct panfrost_batch *batch, struct pan_fb_info *fb)
{
   struct panfrost_device *dev = pan_device(batch->ctx->base.screen);

   GENX(pan_preload_fb)
   (&dev->blitter, &batch->pool.base, fb, 0, batch->tls.gpu, NULL);
}

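/* Emit the fragment (tile rendering) part of the batch: end vertex/tiling
 * work if any draws were recorded, point register pair 40 at the framebuffer
 * descriptor, load the bounding box ((y << 16) | x packing) into registers
 * 42/43, run the fragment job, and finally return freed tiler heap chunks.
 */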
void
GENX(csf_emit_fragment_job)(struct panfrost_batch *batch,
                            const struct pan_fb_info *pfb)
{
   struct cs_builder *b = batch->csf.cs.builder;

   if (batch->draw_count > 0) {
      /* Finish tiling and wait for IDVS and tiling */
      cs_finish_tiling(b, false);
      cs_wait_slot(b, 2, false);
      cs_vt_end(b, cs_now());
   }

   /* Set up the fragment job */
   cs_move64_to(b, cs_reg64(b, 40), batch->framebuffer.gpu);
   cs_move32_to(b, cs_reg32(b, 42), (batch->miny << 16) | batch->minx);
   cs_move32_to(b, cs_reg32(b, 43),
                ((batch->maxy - 1) << 16) | (batch->maxx - 1));

   /* Run the fragment job and wait */
   cs_run_fragment(b, false, MALI_TILE_RENDER_ORDER_Z_ORDER, false);
   cs_wait_slot(b, 2, false);

   /* Gather freed heap chunks and add them to the heap context free list
    * so they can be re-used next time the tiler heap runs out of chunks.
    * That's what cs_finish_fragment() is all about. The list of freed
    * chunks is in the tiler context descriptor (completed_top and
    * completed_bottom fields).
    */
   if (batch->draw_count > 0) {
      assert(batch->tiler_ctx.valhall.desc);
      cs_move64_to(b, cs_reg64(b, 90), batch->tiler_ctx.valhall.desc);
      cs_load_to(b, cs_reg_tuple(b, 86, 4), cs_reg64(b, 90), BITFIELD_MASK(4),
                 40);
      cs_wait_slot(b, 0, false);
      cs_finish_fragment(b, true, cs_reg64(b, 86), cs_reg64(b, 88), cs_now());
   }
}

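/* Load the per-stage resource table, FAU (push uniform) block and shader
 * program pointer into the draw registers. Vertex/compute use register pairs
 * 0/8/16, fragment uses 4/12/20. The FAU pointer carries the number of 64-bit
 * FAU entries in its upper byte, hence the << 56 below.
 */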
static void
csf_emit_shader_regs(struct panfrost_batch *batch, enum pipe_shader_type stage,
                     mali_ptr shader)
{
   mali_ptr resources = panfrost_emit_resources(batch, stage);

   assert(stage == PIPE_SHADER_VERTEX || stage == PIPE_SHADER_FRAGMENT ||
          stage == PIPE_SHADER_COMPUTE);

   unsigned offset = (stage == PIPE_SHADER_FRAGMENT) ? 4 : 0;
   unsigned fau_count = DIV_ROUND_UP(batch->nr_push_uniforms[stage], 2);

   struct cs_builder *b = batch->csf.cs.builder;
   cs_move64_to(b, cs_reg64(b, 0 + offset), resources);
   cs_move64_to(b, cs_reg64(b, 8 + offset),
                batch->push_uniforms[stage] | ((uint64_t)fau_count << 56));
   cs_move64_to(b, cs_reg64(b, 16 + offset), shader);
}

void
GENX(csf_launch_grid)(struct panfrost_batch *batch,
                      const struct pipe_grid_info *info)
{
   /* Empty compute programs are invalid and don't make sense */
   if (batch->rsd[PIPE_SHADER_COMPUTE] == 0)
      return;

   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   struct panfrost_compiled_shader *cs = ctx->prog[PIPE_SHADER_COMPUTE];
   struct cs_builder *b = batch->csf.cs.builder;

   csf_emit_shader_regs(batch, PIPE_SHADER_COMPUTE,
                        batch->rsd[PIPE_SHADER_COMPUTE]);

   cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);

   /* Global attribute offset */
   cs_move32_to(b, cs_reg32(b, 32), 0);

   /* Compute workgroup size */
   uint32_t wg_size[4];
   pan_pack(wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
      cfg.workgroup_size_x = info->block[0];
      cfg.workgroup_size_y = info->block[1];
      cfg.workgroup_size_z = info->block[2];

      /* Workgroups may be merged if the shader does not use barriers
       * or shared memory. This condition is checked against the
       * static shared_size at compile-time. We need to check the
       * variable shared size at launch_grid time, because the
       * compiler doesn't know about that.
       */
      cfg.allow_merging_workgroups = cs->info.cs.allow_merging_workgroups &&
                                     (info->variable_shared_mem == 0);
   }

   cs_move32_to(b, cs_reg32(b, 33), wg_size[0]);

   /* Offset */
   for (unsigned i = 0; i < 3; ++i)
      cs_move32_to(b, cs_reg32(b, 34 + i), 0);

   unsigned threads_per_wg = info->block[0] * info->block[1] * info->block[2];
   unsigned max_thread_cnt = panfrost_compute_max_thread_count(
      &dev->kmod.props, cs->info.work_reg_count);

   if (info->indirect) {
      /* Load size in workgroups per dimension from memory */
      struct cs_index address = cs_reg64(b, 64);
      cs_move64_to(
         b, address,
         pan_resource(info->indirect)->image.data.base + info->indirect_offset);

      struct cs_index grid_xyz = cs_reg_tuple(b, 37, 3);
      cs_load_to(b, grid_xyz, address, BITFIELD_MASK(3), 0);

      /* Wait for the load */
      cs_wait_slot(b, 0, false);

      /* Copy to FAU */
      for (unsigned i = 0; i < 3; ++i) {
         if (batch->num_wg_sysval[i]) {
            cs_move64_to(b, address, batch->num_wg_sysval[i]);
            cs_store(b, cs_extract32(b, grid_xyz, i), address, BITFIELD_MASK(1),
                     0);
         }
      }

      /* Wait for the stores */
      cs_wait_slot(b, 0, false);

      cs_run_compute_indirect(b, DIV_ROUND_UP(max_thread_cnt, threads_per_wg),
                              false, cs_shader_res_sel(0, 0, 0, 0));
   } else {
      /* Set size in workgroups per dimension immediately */
      for (unsigned i = 0; i < 3; ++i)
         cs_move32_to(b, cs_reg32(b, 37 + i), info->grid[i]);

      /* Pick the task_axis and task_increment to maximize thread
       * utilization.
       */
      unsigned task_axis = MALI_TASK_AXIS_X;
      unsigned threads_per_task = threads_per_wg;
      unsigned task_increment = 0;

      for (unsigned i = 0; i < 3; i++) {
         if (threads_per_task * info->grid[i] >= max_thread_cnt) {
            /* We reached our thread limit, stop at the current axis and
             * calculate the increment so it doesn't exceed the per-core
             * thread capacity.
             */
            task_increment = max_thread_cnt / threads_per_task;
            break;
         } else if (task_axis == MALI_TASK_AXIS_Z) {
            /* We reached the Z axis, and there's still room to stuff more
             * threads. Pick the current axis grid size as our increment
             * as there's no point using something bigger.
             */
            task_increment = info->grid[i];
            break;
         }

         threads_per_task *= info->grid[i];
         task_axis++;
      }

      assert(task_axis <= MALI_TASK_AXIS_Z);
      assert(task_increment > 0);
      cs_run_compute(b, task_increment, task_axis, false,
                     cs_shader_res_sel(0, 0, 0, 0));
   }
}

void
GENX(csf_launch_xfb)(struct panfrost_batch *batch,
                     const struct pipe_draw_info *info, unsigned count)
{
   struct cs_builder *b = batch->csf.cs.builder;

   cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);

   /* TODO: Indexing. Also, attribute_offset is a legacy feature.. */
   cs_move32_to(b, cs_reg32(b, 32), batch->ctx->offset_start);

   /* Compute workgroup size */
   uint32_t wg_size[4];
   pan_pack(wg_size, COMPUTE_SIZE_WORKGROUP, cfg) {
      cfg.workgroup_size_x = 1;
      cfg.workgroup_size_y = 1;
      cfg.workgroup_size_z = 1;

      /* Transform feedback shaders do not use barriers or
       * shared memory, so we may merge workgroups.
       */
      cfg.allow_merging_workgroups = true;
   }
   cs_move32_to(b, cs_reg32(b, 33), wg_size[0]);

   /* Offset */
   for (unsigned i = 0; i < 3; ++i)
      cs_move32_to(b, cs_reg32(b, 34 + i), 0);

   cs_move32_to(b, cs_reg32(b, 37), count);
   cs_move32_to(b, cs_reg32(b, 38), info->instance_count);
   cs_move32_to(b, cs_reg32(b, 39), 1);

   csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX,
                        batch->rsd[PIPE_SHADER_VERTEX]);
   /* Force a barrier to avoid read/write sync issues with buffers */
   cs_wait_slot(b, 2, false);

   /* XXX: Choose correctly */
   cs_run_compute(b, 1, MALI_TASK_AXIS_Z, false, cs_shader_res_sel(0, 0, 0, 0));
}

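/* Lazily allocate and pack the TILER_CONTEXT descriptor for this batch. The
 * descriptor is created on the first draw and its GPU address is cached in
 * batch->tiler_ctx.valhall.desc for subsequent draws and the fragment job.
 */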
static mali_ptr
csf_get_tiler_desc(struct panfrost_batch *batch)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_device *dev = pan_device(ctx->base.screen);

   if (batch->tiler_ctx.valhall.desc)
      return batch->tiler_ctx.valhall.desc;

   struct panfrost_ptr t =
      pan_pool_alloc_desc(&batch->pool.base, TILER_CONTEXT);
   pan_pack(t.cpu, TILER_CONTEXT, tiler) {
      unsigned max_levels = dev->tiler_features.max_levels;
      assert(max_levels >= 2);

      /* TODO: Select hierarchy mask more effectively */
      tiler.hierarchy_mask = (max_levels >= 8) ? 0xFF : 0x28;

      /* For large framebuffers, disable the smallest bin size to
       * avoid pathological tiler memory usage. Required to avoid OOM
       * on dEQP-GLES31.functional.fbo.no_attachments.maximums.all on
       * Mali-G57.
       */
      if (MAX2(batch->key.width, batch->key.height) >= 4096)
         tiler.hierarchy_mask &= ~1;

      tiler.fb_width = batch->key.width;
      tiler.fb_height = batch->key.height;
      tiler.heap = batch->ctx->csf.heap.desc_bo->ptr.gpu;
      tiler.sample_pattern =
         pan_sample_pattern(util_framebuffer_get_num_samples(&batch->key));
      tiler.first_provoking_vertex =
         pan_tristate_get(batch->first_provoking_vertex);
      tiler.geometry_buffer = ctx->csf.tmp_geom_bo->ptr.gpu;
      tiler.geometry_buffer_size = ctx->csf.tmp_geom_bo->kmod_bo->size;
   }

   batch->tiler_ctx.valhall.desc = t.gpu;
   return batch->tiler_ctx.valhall.desc;
}

static uint32_t
csf_emit_draw_state(struct panfrost_batch *batch,
                    const struct pipe_draw_info *info, unsigned drawid_offset)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *vs = ctx->prog[PIPE_SHADER_VERTEX];
   struct panfrost_compiled_shader *fs = ctx->prog[PIPE_SHADER_FRAGMENT];

   bool idvs = vs->info.vs.idvs;
   bool fs_required = panfrost_fs_required(
      fs, ctx->blend, &ctx->pipe_framebuffer, ctx->depth_stencil);
   bool secondary_shader = vs->info.vs.secondary_enable && fs_required;

   assert(idvs && "IDVS required for CSF");

   struct cs_builder *b = batch->csf.cs.builder;

   if (batch->draw_count == 0)
      cs_vt_start(batch->csf.cs.builder, cs_now());

   csf_emit_shader_regs(batch, PIPE_SHADER_VERTEX,
                        panfrost_get_position_shader(batch, info));

   if (fs_required) {
      csf_emit_shader_regs(batch, PIPE_SHADER_FRAGMENT,
                           batch->rsd[PIPE_SHADER_FRAGMENT]);
   } else {
      cs_move64_to(b, cs_reg64(b, 4), 0);
      cs_move64_to(b, cs_reg64(b, 12), 0);
      cs_move64_to(b, cs_reg64(b, 20), 0);
   }

   if (secondary_shader) {
      cs_move64_to(b, cs_reg64(b, 18), panfrost_get_varying_shader(batch));
   }

   cs_move64_to(b, cs_reg64(b, 24), batch->tls.gpu);
   cs_move64_to(b, cs_reg64(b, 30), batch->tls.gpu);
   cs_move32_to(b, cs_reg32(b, 32), 0);
   cs_move32_to(b, cs_reg32(b, 37), 0);
   cs_move32_to(b, cs_reg32(b, 38), 0);

   cs_move64_to(b, cs_reg64(b, 40), csf_get_tiler_desc(batch));

   STATIC_ASSERT(sizeof(batch->scissor) == pan_size(SCISSOR));
   STATIC_ASSERT(sizeof(uint64_t) == pan_size(SCISSOR));
   uint64_t *sbd = (uint64_t *)&batch->scissor[0];
   cs_move64_to(b, cs_reg64(b, 42), *sbd);

   cs_move32_to(b, cs_reg32(b, 44), fui(batch->minimum_z));
   cs_move32_to(b, cs_reg32(b, 45), fui(batch->maximum_z));

   if (ctx->occlusion_query && ctx->active_queries) {
      struct panfrost_resource *rsrc = pan_resource(ctx->occlusion_query->rsrc);
      cs_move64_to(b, cs_reg64(b, 46), rsrc->image.data.base);
      panfrost_batch_write_rsrc(ctx->batch, rsrc, PIPE_SHADER_FRAGMENT);
   }

   cs_move32_to(b, cs_reg32(b, 48), panfrost_vertex_attribute_stride(vs, fs));
   cs_move64_to(b, cs_reg64(b, 50),
                batch->blend | MAX2(batch->key.nr_cbufs, 1));
   cs_move64_to(b, cs_reg64(b, 52), batch->depth_stencil);

   if (info->index_size)
      cs_move64_to(b, cs_reg64(b, 54), batch->indices);

   struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;

   uint32_t primitive_flags = 0;
   pan_pack(&primitive_flags, PRIMITIVE_FLAGS, cfg) {
      if (panfrost_writes_point_size(ctx))
         cfg.point_size_array_format = MALI_POINT_SIZE_ARRAY_FORMAT_FP16;

      cfg.allow_rotating_primitives = allow_rotating_primitives(fs, info);

      cfg.low_depth_cull = rast->depth_clip_near;
      cfg.high_depth_cull = rast->depth_clip_far;

      /* Non-fixed restart indices should have been lowered */
      assert(!cfg.primitive_restart || panfrost_is_implicit_prim_restart(info));
      cfg.primitive_restart = info->primitive_restart;

      cfg.position_fifo_format = panfrost_writes_point_size(ctx)
                                    ? MALI_FIFO_FORMAT_EXTENDED
                                    : MALI_FIFO_FORMAT_BASIC;
   }

   cs_move32_to(b, cs_reg32(b, 56), primitive_flags);

   uint32_t dcd_flags0 = 0, dcd_flags1 = 0;
   pan_pack(&dcd_flags0, DCD_FLAGS_0, cfg) {
      enum mesa_prim reduced_mode = u_reduced_prim(info->mode);
      bool polygon = reduced_mode == MESA_PRIM_TRIANGLES;
      bool lines = reduced_mode == MESA_PRIM_LINES;

      /*
       * From the Gallium documentation,
       * pipe_rasterizer_state::cull_face "indicates which faces of
       * polygons to cull". Points and lines are not considered
       * polygons and should be drawn even if all faces are culled.
       * The hardware does not take primitive type into account when
       * culling, so we need to do that check ourselves.
       */
      cfg.cull_front_face = polygon && (rast->cull_face & PIPE_FACE_FRONT);
      cfg.cull_back_face = polygon && (rast->cull_face & PIPE_FACE_BACK);
      cfg.front_face_ccw = rast->front_ccw;

      cfg.multisample_enable = rast->multisample;

      /* Use per-sample shading if required by the API. Also use it when a
       * blend shader is used with multisampling, as this is handled
       * by a single ST_TILE in the blend shader with the current
       * sample ID, requiring per-sample shading.
       */
      cfg.evaluate_per_sample =
         (rast->multisample &&
          ((ctx->min_samples > 1) || ctx->valhall_has_blend_shader));

      cfg.single_sampled_lines = !rast->multisample;

      if (lines && rast->line_smooth) {
         cfg.multisample_enable = true;
         cfg.single_sampled_lines = false;
      }

      bool has_oq = ctx->occlusion_query && ctx->active_queries;
      if (has_oq) {
         if (ctx->occlusion_query->type == PIPE_QUERY_OCCLUSION_COUNTER)
            cfg.occlusion_query = MALI_OCCLUSION_MODE_COUNTER;
         else
            cfg.occlusion_query = MALI_OCCLUSION_MODE_PREDICATE;
      }

      if (fs_required) {
         struct pan_earlyzs_state earlyzs = pan_earlyzs_get(
            fs->earlyzs, ctx->depth_stencil->writes_zs || has_oq,
            ctx->blend->base.alpha_to_coverage,
            ctx->depth_stencil->zs_always_passes);

         cfg.pixel_kill_operation = earlyzs.kill;
         cfg.zs_update_operation = earlyzs.update;

         cfg.allow_forward_pixel_to_kill =
            pan_allow_forward_pixel_to_kill(ctx, fs);
         cfg.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

         cfg.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
         cfg.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);

         /* Also use per-sample shading if required by the shader */
         cfg.evaluate_per_sample |= fs->info.fs.sample_shading;

         /* Unlike Bifrost, alpha-to-coverage must be included in
          * this identically-named flag. Confusing, isn't it?
          */
         cfg.shader_modifies_coverage = fs->info.fs.writes_coverage ||
                                        fs->info.fs.can_discard ||
                                        ctx->blend->base.alpha_to_coverage;

         cfg.alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
      } else {
         /* These operations need to be FORCE to benefit from the
          * depth-only pass optimizations.
          */
         cfg.pixel_kill_operation = MALI_PIXEL_KILL_FORCE_EARLY;
         cfg.zs_update_operation = MALI_PIXEL_KILL_FORCE_EARLY;

         /* No shader and no blend => no shader or blend
          * reasons to disable FPK. The only FPK-related state
          * not covered is alpha-to-coverage which we don't set
          * without blend.
          */
         cfg.allow_forward_pixel_to_kill = true;

         /* No shader => no shader side effects */
         cfg.allow_forward_pixel_to_be_killed = true;

         /* Alpha isn't written so these are vacuous */
         cfg.overdraw_alpha0 = true;
         cfg.overdraw_alpha1 = true;
      }
   }

   pan_pack(&dcd_flags1, DCD_FLAGS_1, cfg) {
      cfg.sample_mask = rast->multisample ? ctx->sample_mask : 0xFFFF;

      if (fs_required) {
         /* See JM Valhall equivalent code */
         cfg.render_target_mask =
            (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;
      }
   }

   cs_move32_to(b, cs_reg32(b, 57), dcd_flags0);
   cs_move32_to(b, cs_reg32(b, 58), dcd_flags1);

   uint64_t primsize = 0;
   panfrost_emit_primitive_size(ctx, info->mode == MESA_PRIM_POINTS, 0,
                                &primsize);
   cs_move64_to(b, cs_reg64(b, 60), primsize);

   uint32_t flags_override;
   /* Pack with nodefaults so only explicitly set override fields affect the
    * previously set register values */
   pan_pack_nodefaults(&flags_override, PRIMITIVE_FLAGS, cfg) {
      cfg.draw_mode = pan_draw_mode(info->mode);
      cfg.index_type = panfrost_translate_index_size(info->index_size);
      cfg.secondary_shader = secondary_shader;
   };

   return flags_override;
}

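/* gl_DrawID support: only set up a draw ID register if the vertex shader
 * actually reads SYSTEM_VALUE_DRAW_ID, otherwise return cs_undef() so callers
 * know there is nothing to emit or increment.
 */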
static struct cs_index
csf_emit_draw_id_register(struct panfrost_batch *batch, unsigned offset)
{
   struct cs_builder *b = batch->csf.cs.builder;
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_uncompiled_shader *vs = ctx->uncompiled[PIPE_SHADER_VERTEX];

   if (!BITSET_TEST(vs->nir->info.system_values_read, SYSTEM_VALUE_DRAW_ID))
      return cs_undef();

   struct cs_index drawid = cs_reg32(b, 67);
   cs_move32_to(b, drawid, offset);

   return drawid;
}

void
GENX(csf_launch_draw)(struct panfrost_batch *batch,
                      const struct pipe_draw_info *info, unsigned drawid_offset,
                      const struct pipe_draw_start_count_bias *draw,
                      unsigned vertex_count)
{
   struct cs_builder *b = batch->csf.cs.builder;

   uint32_t flags_override = csf_emit_draw_state(batch, info, drawid_offset);
   struct cs_index drawid = csf_emit_draw_id_register(batch, drawid_offset);

   cs_move32_to(b, cs_reg32(b, 33), draw->count);
   cs_move32_to(b, cs_reg32(b, 34), info->instance_count);
   cs_move32_to(b, cs_reg32(b, 35), 0);

   /* Base vertex offset on Valhall is used for both indexed and
    * non-indexed draws, in a simple way for either. Handle both cases.
    */
   if (info->index_size) {
      cs_move32_to(b, cs_reg32(b, 36), draw->index_bias);
      cs_move32_to(b, cs_reg32(b, 39), info->index_size * draw->count);
   } else {
      cs_move32_to(b, cs_reg32(b, 36), draw->start);
      cs_move32_to(b, cs_reg32(b, 39), 0);
   }

   cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0),
               cs_shader_res_sel(2, 2, 2, 0), drawid);
}

void
GENX(csf_launch_draw_indirect)(struct panfrost_batch *batch,
                               const struct pipe_draw_info *info,
                               unsigned drawid_offset,
                               const struct pipe_draw_indirect_info *indirect)
{
   struct cs_builder *b = batch->csf.cs.builder;

   uint32_t flags_override = csf_emit_draw_state(batch, info, drawid_offset);
   struct cs_index drawid = csf_emit_draw_id_register(batch, drawid_offset);

   struct cs_index address = cs_reg64(b, 64);
   struct cs_index counter = cs_reg32(b, 66);
   cs_move64_to(
      b, address,
      pan_resource(indirect->buffer)->image.data.base + indirect->offset);
   cs_move32_to(b, counter, indirect->draw_count);

   cs_while(b, MALI_CS_CONDITION_GREATER, counter) {
      if (info->index_size) {
         /* loads vertex count, instance count, index offset, vertex offset */
         cs_load_to(b, cs_reg_tuple(b, 33, 4), address, BITFIELD_MASK(4), 0);
         cs_move32_to(b, cs_reg32(b, 39), info->index.resource->width0);
      } else {
         /* vertex count, instance count */
         cs_load_to(b, cs_reg_tuple(b, 33, 2), address, BITFIELD_MASK(2), 0);
         cs_move32_to(b, cs_reg32(b, 35), 0);
         cs_load_to(b, cs_reg_tuple(b, 36, 1), address, BITFIELD_MASK(1),
                    2 * sizeof(uint32_t)); // instance offset
         cs_move32_to(b, cs_reg32(b, 37), 0);
         cs_move32_to(b, cs_reg32(b, 39), 0);
      }

      cs_wait_slot(b, 0, false);
      cs_run_idvs(b, flags_override, false, true, cs_shader_res_sel(0, 0, 1, 0),
                  cs_shader_res_sel(2, 2, 2, 0), drawid);

      cs_add64(b, address, address, indirect->stride);
      cs_add32(b, counter, counter, (unsigned int)-1);
      if (drawid.type != CS_INDEX_UNDEF)
         cs_add32(b, drawid, drawid, 1);
   }
}

#define POSITION_FIFO_SIZE (64 * 1024)

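/* One-time CSF context setup: create the scheduling group and the tiler heap,
 * allocate the heap descriptor and temporary geometry (position FIFO)
 * buffers, then build and submit a small command stream that binds the tiler
 * heap context to the queue before any batch is submitted.
 */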
int
GENX(csf_init_context)(struct panfrost_context *ctx)
{
   struct panfrost_device *dev = pan_device(ctx->base.screen);
   struct drm_panthor_queue_create qc[] = {{
      .priority = 1,
      .ringbuf_size = 64 * 1024,
   }};

   struct drm_panthor_group_create gc = {
      .compute_core_mask = dev->kmod.props.shader_present,
      .fragment_core_mask = dev->kmod.props.shader_present,
      .tiler_core_mask = 1,
      .max_compute_cores = util_bitcount64(dev->kmod.props.shader_present),
      .max_fragment_cores = util_bitcount64(dev->kmod.props.shader_present),
      .max_tiler_cores = 1,
      .priority = PANTHOR_GROUP_PRIORITY_MEDIUM,
      .queues = DRM_PANTHOR_OBJ_ARRAY(ARRAY_SIZE(qc), qc),
      .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
   };

   int ret =
      drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_CREATE, &gc);

   if (ret)
      goto err_group_create;

   ctx->csf.group_handle = gc.group_handle;

   struct drm_panthor_group_destroy gd = {
      .group_handle = ctx->csf.group_handle,
   };

   /* Create the tiler heap */
   struct drm_panthor_tiler_heap_create thc = {
      .vm_id = pan_kmod_vm_handle(dev->kmod.vm),
      .chunk_size = pan_screen(ctx->base.screen)->csf_tiler_heap.chunk_size,
      .initial_chunk_count =
         pan_screen(ctx->base.screen)->csf_tiler_heap.initial_chunks,
      .max_chunks = pan_screen(ctx->base.screen)->csf_tiler_heap.max_chunks,
      .target_in_flight = 65535,
   };
   ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_CREATE,
                  &thc);

   if (ret)
      goto err_tiler_heap;

   ctx->csf.heap.handle = thc.handle;

   struct drm_panthor_tiler_heap_destroy thd = {
      .handle = ctx->csf.heap.handle,
   };

   ctx->csf.heap.desc_bo =
      panfrost_bo_create(dev, pan_size(TILER_HEAP), 0, "Tiler Heap");

   if (ctx->csf.heap.desc_bo == NULL)
      goto err_tiler_heap_desc_bo;

   pan_pack(ctx->csf.heap.desc_bo->ptr.cpu, TILER_HEAP, heap) {
      heap.size = pan_screen(ctx->base.screen)->csf_tiler_heap.chunk_size;
      heap.base = thc.first_heap_chunk_gpu_va;
      heap.bottom = heap.base + 64;
      heap.top = heap.base + heap.size;
   }

   ctx->csf.tmp_geom_bo = panfrost_bo_create(
      dev, POSITION_FIFO_SIZE, PAN_BO_INVISIBLE, "Temporary Geometry buffer");

   if (ctx->csf.tmp_geom_bo == NULL)
      goto err_tiler_heap_tmp_geom_bo;

   /* Build a one-shot CS that binds the tiler heap context to the queue */
   struct panfrost_bo *cs_bo =
      panfrost_bo_create(dev, 4096, 0, "Temporary CS buffer");

   if (cs_bo == NULL)
      goto err_tiler_heap_cs_bo;

   struct cs_buffer init_buffer = {
      .cpu = cs_bo->ptr.cpu,
      .gpu = cs_bo->ptr.gpu,
      .capacity = panfrost_bo_size(cs_bo) / sizeof(uint64_t),
   };
   const struct cs_builder_conf bconf = {
      .nr_registers = 96,
      .nr_kernel_registers = 4,
   };
   struct cs_builder b;
   cs_builder_init(&b, &bconf, init_buffer);
   struct cs_index heap = cs_reg64(&b, 72);
   cs_move64_to(&b, heap, thc.tiler_heap_ctx_gpu_va);
   cs_heap_set(&b, heap);

   struct drm_panthor_queue_submit qsubmit;
   struct drm_panthor_group_submit gsubmit;
   struct drm_panthor_sync_op sync = {
      .flags =
         DRM_PANTHOR_SYNC_OP_SIGNAL | DRM_PANTHOR_SYNC_OP_HANDLE_TYPE_SYNCOBJ,
      .handle = ctx->syncobj,
   };

   assert(cs_is_valid(&b));
   cs_finish(&b);

   uint64_t cs_start = cs_root_chunk_gpu_addr(&b);
   uint32_t cs_size = cs_root_chunk_size(&b);

   csf_prepare_qsubmit(ctx, &qsubmit, 0, cs_start, cs_size, &sync, 1);
   csf_prepare_gsubmit(ctx, &gsubmit, &qsubmit, 1);
   ret = csf_submit_gsubmit(ctx, &gsubmit);

   if (ret)
      goto err_g_submit;

   /* Wait before freeing the buffer. */
   ret = drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0,
                        NULL);
   assert(!ret);

   panfrost_bo_unreference(cs_bo);

   ctx->csf.is_init = true;
   return 0;

err_g_submit:
   panfrost_bo_unreference(cs_bo);
err_tiler_heap_cs_bo:
   panfrost_bo_unreference(ctx->csf.tmp_geom_bo);
err_tiler_heap_tmp_geom_bo:
   panfrost_bo_unreference(ctx->csf.heap.desc_bo);
err_tiler_heap_desc_bo:
   drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY,
            &thd);
err_tiler_heap:
   drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
err_group_create:
   return -1;
}

void
GENX(csf_cleanup_context)(struct panfrost_context *ctx)
{
   if (!ctx->csf.is_init)
      return;

   struct panfrost_device *dev = pan_device(ctx->base.screen);
   struct drm_panthor_tiler_heap_destroy thd = {
      .handle = ctx->csf.heap.handle,
   };
   int ret;

   /* Make sure all jobs are done before destroying the heap. */
   ret = drmSyncobjWait(panfrost_device_fd(dev), &ctx->syncobj, 1, INT64_MAX, 0,
                        NULL);
   assert(!ret);

   ret = drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_TILER_HEAP_DESTROY,
                  &thd);
   assert(!ret);

   struct drm_panthor_group_destroy gd = {
      .group_handle = ctx->csf.group_handle,
   };

   ret =
      drmIoctl(panfrost_device_fd(dev), DRM_IOCTL_PANTHOR_GROUP_DESTROY, &gd);
   assert(!ret);

   panfrost_bo_unreference(ctx->csf.heap.desc_bo);
   ctx->csf.is_init = false;
}

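/* Write the current GPU timestamp into a resource at the given offset, using
 * a STORE_STATE of type TIMESTAMP, and mark the resource as written by the
 * batch.
 */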
void
GENX(csf_emit_write_timestamp)(struct panfrost_batch *batch,
                               struct panfrost_resource *dst, unsigned offset)
{
   struct cs_builder *b = batch->csf.cs.builder;

   struct cs_index address = cs_reg64(b, 40);
   cs_move64_to(b, address,
                dst->image.data.base + dst->image.data.offset + offset);
   cs_store_state(b, address, 0, MALI_CS_STATE_TIMESTAMP, cs_now());

   panfrost_batch_write_rsrc(batch, dst, PIPE_SHADER_VERTEX);
}