/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_lrz.h"

#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_image.h"

#include "common/freedreno_gpu_event.h"
#include "common/freedreno_lrz.h"

/* See lrz.rst for how the HW works. Here are only the implementation notes.
 *
 * There are a number of cases in which LRZ cannot be used:
 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc);
 * - Writing to the stencil buffer;
 * - Writing depth while:
 *   - Changing the direction of the depth test (e.g. from OP_GREATER to OP_LESS);
 *   - Using OP_ALWAYS or OP_NOT_EQUAL;
 * - Clearing depth with vkCmdClearAttachments;
 * - (pre-a650) Not clearing the depth attachment with LOAD_OP_CLEAR;
 * - (pre-a650) Using secondary command buffers;
 * - Sysmem rendering (with a small caveat).
 *
 * A650+ (gen3+)
 * =============
 *
 * While LRZ could be reused between renderpasses, it is disabled when the
 * underlying depth buffer is changed.
 * The following commands could change a depth image:
 * - vkCmdBlitImage*
 * - vkCmdCopyBufferToImage*
 * - vkCmdCopyImage*
 *
 * LRZ Fast-Clear
 * ==============
 *
 * It is always valid to fast-clear. On the other hand, we disable
 * fast-clear if the depth clear value is not 0.0 or 1.0 because it may be
 * worse for perf if some primitives are expected to fail the depth test
 * against the actual depth clear value.
 *
 * LRZ Caches
 * ==========
 *
 * ! The policy here is to flush the LRZ cache right after it is changed,
 * so if LRZ data is needed afterwards - there is no need to flush it
 * before using LRZ.
 */

static inline void
tu_lrz_disable_reason(struct tu_cmd_buffer *cmd, const char *reason) {
   cmd->state.rp.lrz_disable_reason = reason;
   perf_debug(cmd->device, "Disabling LRZ because '%s'", reason);
}

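/* Emit the LRZ buffer base, pitch and fast-clear buffer base for the given
 * depth image, or zero them out when there is no depth image. On A7XX the
 * depth format of the buffer is programmed as well.
 */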
template <chip CHIP>
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
   if (!depth_image) {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());

      return;
   }

   uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
   uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
   if (!depth_image->lrz_fc_offset)
      lrz_fc_iova = 0;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO(
         .depth_format = tu6_pipe2depth(depth_image->vk.format)
      ));
   }
}

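/* Write a single LRZ-related register. On GPUs with the lrz_track_quirk the
 * write goes through CP_REG_WRITE with the LRZ tracker so the CP knows the
 * register belongs to LRZ state; otherwise a plain PKT4 write is used.
 */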
static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                  struct tu_reg_value reg)
{
   if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
      tu_cs_emit(cs, reg.reg);
      tu_cs_emit(cs, reg.value);
   } else {
      tu_cs_emit_pkt4(cs, reg.reg, 1);
      tu_cs_emit(cs, reg.value);
   }
}

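/* Write GRAS_LRZ_CNTL, taking into account that A7XX moved the fc_enable and
 * disable_on_wrong_dir fields into the separate GRAS_LRZ_CNTL2 register.
 */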
template <chip CHIP>
static void
tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                   struct A6XX_GRAS_LRZ_CNTL cntl)
{
   if (CHIP >= A7XX) {
      /* A7XX split LRZ_CNTL into two separate registers. */
      struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
         .disable_on_wrong_dir = cntl.disable_on_wrong_dir,
         .fc_enable = cntl.fc_enable,
      );
      cntl.disable_on_wrong_dir = false;
      cntl.fc_enable = false;

      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
      tu6_write_lrz_reg(cmd, cs, cntl2);
   } else {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
   }
}

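/* Disable LRZ on the GPU side by writing a depth view that cannot match any
 * real view, so that later GRAS_LRZ_CNTL writes with disable_on_wrong_dir
 * fail the depth-view comparison and keep LRZ disabled.
 */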
template <chip CHIP>
static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Disable direction by writing invalid depth view. */
   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, cs, {
      .enable = true,
      .disable_on_wrong_dir = true,
   });

   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_CLEAR);
   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_FLUSH);
}

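/* Set up per-renderpass LRZ state for a primary command buffer: remember the
 * depth image view, decide whether the previous LRZ contents may be reused
 * (depth is loaded rather than cleared), and whether fast-clear is usable.
 */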
static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
                  const struct tu_render_pass_attachment *att,
                  const struct tu_image_view *view)
{
   if (!view->image->lrz_height) {
      assert(!cmd->device->use_lrz || !vk_format_has_depth(att->format));
      return;
   }

   bool clears_depth = att->clear_mask &
      (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking && !clears_depth)
      return;

   /* We need to always have an LRZ view just to disable it if there is a
    * depth attachment, there are any secondaries, and GPU tracking is
    * enabled, in order not to rely on loadOp state which doesn't exist with
    * dynamic rendering in secondaries. Otherwise the secondary will have LRZ
    * enabled and there will be a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;

   if (!clears_depth && !att->load)
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   /* Be optimistic and unconditionally enable fast-clear in
    * secondary cmdbufs and when reusing previous LRZ state.
    */
   cmd->state.lrz.fast_clear = view->image->has_lrz_fc;

   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
   cmd->state.lrz.reuse_previous_state = !clears_depth;
}

/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
 * dynamically disabled).
 */

195 static void
tu_lrz_init_secondary(struct tu_cmd_buffer * cmd,const struct tu_render_pass_attachment * att)196 tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
197 const struct tu_render_pass_attachment *att)
198 {
199 bool has_gpu_tracking =
200 cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;
201
202 if (!has_gpu_tracking)
203 return;
204
205 if (!cmd->device->use_lrz)
206 return;
207
208 if (!vk_format_has_depth(att->format))
209 return;
210
211 cmd->state.lrz.valid = true;
212 cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
213 cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
214
215 /* We may not have the depth attachment when executing in a secondary
216 * inside a render pass. This means we have to be even more optimistic than
217 * the normal case and enable fast clear even if the depth image doesn't
218 * support it.
219 */
220 cmd->state.lrz.fast_clear = true;
221
222 /* These are not used inside secondaries */
223 cmd->state.lrz.image_view = NULL;
224 cmd->state.lrz.reuse_previous_state = false;
225 }
226
227 template <chip CHIP>
228 bool
tu_lrzfc_depth_supported(float depth)229 tu_lrzfc_depth_supported(float depth) {
230 /* A7XX supports fast-clearing to any value, while A6XX only supports 0.0/1.0 */
231 return CHIP >= A7XX || depth == 0.0f || depth == 1.0f;
232 }
233
/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
 * actually emitting anything. The lrz state needs to be consistent between
 * renderpasses, but only the first should actually emit commands to disable
 * lrz etc.
 */
template <chip CHIP>
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd)
{
   /* Track LRZ valid state */
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));

   uint32_t a;
   for (a = 0; a < cmd->state.pass->attachment_count; a++) {
      if (cmd->state.attachments[a]->image->lrz_height)
         break;
   }

   if (a != cmd->state.pass->attachment_count) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
      if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         VkClearValue clear = cmd->state.clear_values[a];
         cmd->state.lrz.depth_clear_value = clear;
         cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
            tu_lrzfc_depth_supported<CHIP>(clear.depthStencil.depth);
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }
}
TU_GENX(tu_lrz_begin_resumed_renderpass);

template <chip CHIP>
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd)
{
   const struct tu_render_pass *pass = cmd->state.pass;

   cmd->state.rp.lrz_disable_reason = "";

   int lrz_img_count = 0;
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      if (cmd->state.attachments[i]->image->lrz_height)
         lrz_img_count++;
   }

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
       cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
      /* Theoretically we could switch between LRZ buffers during the binning
       * and tiling passes, but it is untested and would add complexity for
       * a presumably extremely rare case.
       */
      tu_lrz_disable_reason(cmd, "Several subpasses with different depth attachments");

      for (unsigned i = 0; i < pass->attachment_count; i++) {
         struct tu_image *image = cmd->state.attachments[i]->image;
         tu_disable_lrz<CHIP>(cmd, &cmd->cs, image);
      }

      /* We need a valid LRZ fast-clear base, in case the render pass contents
       * are in secondaries that enable LRZ, so that they can read that LRZ is
       * dynamically disabled. It doesn't matter which we use, so just leave
       * the last one as emitted in tu_disable_lrz().
       */
      memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
      return;
   }

   /* Track LRZ valid state */
   tu_lrz_begin_resumed_renderpass<CHIP>(cmd);

   if (!cmd->state.lrz.valid) {
      tu6_emit_lrz_buffer<CHIP>(&cmd->cs, NULL);
   }
}
TU_GENX(tu_lrz_begin_renderpass);

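/* Initialize LRZ state for a secondary command buffer executing inside a
 * render pass, based on the depth/stencil attachment of the inherited
 * subpass.
 */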
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_secondary(cmd, att);
   }
}

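/* Emitted at the start of tiling (GMEM) rendering: binds the LRZ buffer and
 * then either reuses the previous LRZ state, invalidates LRZ via a
 * mismatching depth view, or (fast-)clears it for this renderpass.
 */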
template <chip CHIP>
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* TODO: If lrz was never valid for the entire renderpass, we could exit
    * early here. Sometimes we know this ahead of time and null out
    * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
    * no secondaries.
    */
   if (!cmd->state.lrz.image_view)
      return;

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);

   if (lrz->reuse_previous_state) {
      /* Reuse previous LRZ state, LRZ cache is assumed to be
       * already invalidated by previous renderpass.
       */
      assert(lrz->gpu_dir_tracking);

      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      return;
   }

   bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
   if (invalidate_lrz) {
      /* Following the blob we elect to disable LRZ for the whole renderpass
       * if it is known that LRZ is disabled somewhere in the renderpass.
       *
       * This is accomplished by making later GRAS_LRZ_CNTL (in binning pass)
       * fail the comparison of depth views.
       */
      tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
      if (lrz->gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = lrz->fast_clear,
         .disable_on_wrong_dir = lrz->gpu_dir_tracking,
      });

      /* LRZ_CLEAR.fc_enable + LRZ_CLEAR - clears fast-clear buffer;
       * LRZ_CLEAR.disable_on_wrong_dir + LRZ_CLEAR - sets direction to
       * CUR_DIR_UNSET.
       */
      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_CLEAR);
   }

   if (!lrz->fast_clear && !invalidate_lrz) {
      tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      /* Even though we disable fast-clear we still have to dirty the
       * fast-clear buffer because both secondary cmdbufs and following
       * renderpasses won't know that fast-clear is disabled.
       *
       * TODO: we could avoid this if we don't store depth and don't
       * expect secondary cmdbufs.
       */
      if (lrz->image_view->image->has_lrz_fc) {
         tu6_dirty_lrz_fc<CHIP>(cmd, cs, lrz->image_view->image);
      }
   }
}
TU_GENX(tu_lrz_tiling_begin);

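/* Emitted at the end of tiling rendering: re-emits the LRZ buffer and CNTL so
 * that the final LRZ_FLUSH writes back the fast-clear and direction buffers.
 */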
template <chip CHIP>
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
      tu6_emit_lrz_buffer<CHIP>(cs, cmd->state.lrz.image_view->image);

      if (cmd->state.lrz.gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, &cmd->cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      /* Enable flushing of LRZ fast-clear and of direction buffer */
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = cmd->state.lrz.fast_clear,
         .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
      });
   } else {
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {.enable = false});
   }

   tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);

   /* If gpu_dir_tracking is enabled and LRZ is not valid, the blob at this
    * point additionally clears the direction buffer:
    *   GRAS_LRZ_DEPTH_VIEW(.dword = 0)
    *   GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
    *   A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
    *   LRZ_CLEAR
    *   LRZ_FLUSH
    * Since this happens after all of the rendering is done there is no known
    * reason to do such a clear.
    */
}
TU_GENX(tu_lrz_tiling_end);

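/* Begin sysmem (bypass) rendering. With LRZ feedback support this is the same
 * as the tiling path; otherwise LRZ is not written in sysmem mode, so the
 * buffer is either disabled via direction tracking or cleared so that the
 * LRZ test still sees consistent data.
 */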
template <chip CHIP>
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_begin<CHIP>(cmd, cs);
      return;
   }

   if (!cmd->state.lrz.image_view)
      return;

   /* Actually, the LRZ buffer could be filled in sysmem, in theory to
    * be used in another renderpass, but the benefit is rather dubious.
    */

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
      tu_disable_lrz<CHIP>(cmd, cs, lrz->image_view->image);
      /* Make sure depth view comparison will fail. */
      tu6_write_lrz_reg(cmd, cs,
                        A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else {
      tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
      /* Even though we disable LRZ writes in sysmem mode - there is still
       * an LRZ test, so LRZ should be cleared.
       */
      if (lrz->fast_clear) {
         tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
            .enable = true,
            .fc_enable = true,
         });

         if (CHIP >= A7XX)
            tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
      } else {
         tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      }
   }
}
TU_GENX(tu_lrz_sysmem_begin);

template <chip CHIP>
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_end<CHIP>(cmd, cs);
      return;
   }

   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
}
TU_GENX(tu_lrz_sysmem_end);

/* Disable LRZ outside of renderpass. */
template <chip CHIP>
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
               struct tu_image *image)
{
   if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_height)
      return;

   tu6_emit_lrz_buffer<CHIP>(cs, image);
   tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
}
TU_GENX(tu_disable_lrz);

/* Clear LRZ, used for out of renderpass depth clears. */
template <chip CHIP>
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
                         struct tu_image *image,
                         const VkClearDepthStencilValue *pDepthStencil,
                         uint32_t rangeCount,
                         const VkImageSubresourceRange *pRanges)
{
   if (!rangeCount || !image->lrz_height ||
       !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   /* We cannot predict which depth subresource would be used later on,
    * so we just pick the first one with depth cleared and clear the LRZ.
    */
   const VkImageSubresourceRange *range = NULL;
   for (unsigned i = 0; i < rangeCount; i++) {
      if (pRanges[i].aspectMask &
          (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         range = &pRanges[i];
         break;
      }
   }

   if (!range)
      return;

   bool fast_clear = image->has_lrz_fc &&
      tu_lrzfc_depth_supported<CHIP>(pDepthStencil->depth);

   tu6_emit_lrz_buffer<CHIP>(&cmd->cs, image);

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = range->baseArrayLayer,
      .layer_count = vk_image_subresource_layer_count(&image->vk, range),
      .base_mip_level = range->baseMipLevel,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
      .enable = true,
      .fc_enable = fast_clear,
      .disable_on_wrong_dir = true,
   });

   if (CHIP >= A7XX)
      tu_cs_emit_regs(&cmd->cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(pDepthStencil->depth));
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);

   if (!fast_clear) {
      tu6_clear_lrz<CHIP>(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
   }
}
TU_GENX(tu_lrz_clear_depth_image);

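/* Invalidate LRZ for the remainder of the current renderpass (for the
 * in-renderpass cases listed at the top of this file); with GPU direction
 * tracking the on-GPU direction is also marked invalid.
 */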
template <chip CHIP>
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
{
   assert(cmd->state.pass);

   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;

   if (cmd->state.lrz.gpu_dir_tracking) {
      tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
         .enable = true,
         .dir = LRZ_DIR_INVALID,
         .disable_on_wrong_dir = true,
      });
   }
}
TU_GENX(tu_lrz_disable_during_renderpass);

/* Update LRZ state based on the stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                       if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because Stencil-Test can have side effects (Stencil-Write) prior
 * to depth test, in this case we potentially need to disable early
 * lrz-test. See:
 *
 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                           VkCompareOp func,
                           bool stencil_write)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* nothing to do for LRZ, but for stencil test when stencil-
       * write is enabled, we need to disable lrz-test, since
       * conceptually stencil test and write happens before depth-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* fragment never passes, disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* whether the fragment passes or not depends on the result
       * of the stencil test, which we cannot know when doing the
       * binning pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
       * effects from stencil test we need to disable lrz-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   }

   return true;
}

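/* Compute the GRAS_LRZ_CNTL value for the current draw from the dynamic
 * depth/stencil state, the fragment shader and the blend state. This may
 * temporarily disable LRZ for the draw, or invalidate it for the rest of the
 * renderpass.
 */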
template <chip CHIP>
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   const struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
   bool z_test_enable = cmd->vk.dynamic_graphics_state.ds.depth.test_enable;
   bool z_write_enable = cmd->vk.dynamic_graphics_state.ds.depth.write_enable;
   bool z_bounds_enable = cmd->vk.dynamic_graphics_state.ds.depth.bounds_test.enable;
   VkCompareOp depth_compare_op =
      cmd->vk.dynamic_graphics_state.ds.depth.compare_op;

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };

   if (!cmd->state.lrz.valid) {
      return gras_lrz_cntl;
   }

   /* If depth test is disabled we shouldn't touch LRZ.
    * Same if there is no depth attachment.
    */
   if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || !cmd->device->use_lrz)
      return gras_lrz_cntl;

   if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
      /* Without on-gpu LRZ direction tracking - there is nothing we
       * can do to enable LRZ in secondary command buffers.
       */
      return gras_lrz_cntl;
   }

   /* See comment in tu_pipeline about disabling LRZ write for blending. */
   bool reads_dest = cmd->state.blend_reads_dest;

   gras_lrz_cntl.enable = true;
   gras_lrz_cntl.lrz_write =
      z_write_enable &&
      !reads_dest &&
      !(fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_WRITE);
   gras_lrz_cntl.z_test_enable = z_write_enable;
   gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
   gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
   gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
   gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;

   if (CHIP >= A7XX)
      gras_lrz_cntl.z_func = tu6_compare_func(depth_compare_op);

   /* LRZ is disabled until it is cleared, which means that one "wrong"
    * depth test or shader could disable LRZ until the depth buffer is
    * cleared.
    */
   bool disable_lrz = false;
   bool temporary_disable_lrz = false;

   /* What happens in FS could affect LRZ, e.g.: writes to gl_FragDepth or
    * early fragment tests. We have to skip LRZ testing and updating, but as
    * long as the depth direction stayed the same we can continue with LRZ
    * testing later.
    */
   if (fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_LRZ) {
      if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN || !cmd->state.lrz.gpu_dir_tracking) {
         perf_debug(cmd->device, "Skipping LRZ due to FS");
         temporary_disable_lrz = true;
      } else {
         tu_lrz_disable_reason(cmd, "FS writes depth or has side-effects (TODO: fix for gpu-direction-tracking case)");
         disable_lrz = true;
      }
   }

   /* If Z is not written - it doesn't affect LRZ buffer state.
    * Which means two things:
    * - Don't lock direction until Z is written for the first time;
    * - If Z isn't written and direction IS locked it's possible to just
    *   temporarily disable LRZ instead of fully bailing out, when the
    *   direction is changed.
    */

   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
   switch (depth_compare_op) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      /* OP_ALWAYS and OP_NOT_EQUAL could have a depth value of any direction,
       * so if there is a depth write - LRZ must be disabled.
       */
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + ALWAYS/NOT_EQUAL");
         disable_lrz = true;
         gras_lrz_cntl.dir = LRZ_DIR_INVALID;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
         temporary_disable_lrz = true;
      }
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      /* The blob disables LRZ for OP_EQUAL, and from our empirical
       * evidence it is the right thing to do.
       *
       * Both OP_EQUAL and OP_NEVER don't change the LRZ buffer so
       * we could just temporarily disable LRZ.
       */
      temporary_disable_lrz = true;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl.greater = true;
      gras_lrz_cntl.dir = LRZ_DIR_GE;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      gras_lrz_cntl.greater = false;
      gras_lrz_cntl.dir = LRZ_DIR_LE;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   };

   /* If the depthfunc direction is changed, bail out on using LRZ. The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + compare-op direction change");
         disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to direction change");
         temporary_disable_lrz = true;
      }
   }

   /* Consider the following sequence of depthfunc changes:
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
    *   LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
    *   during the second VK_COMPARE_OP_GREATER.
    *
    * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
    *   Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
    *   invalid during COMPARE_OP_LESS.
    *
    * This shows that we should keep the last KNOWN direction.
    */
   if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
      cmd->state.lrz.prev_direction = lrz_direction;

   /* Invalidate LRZ and disable write if stencil test is enabled */
   bool stencil_test_enable = cmd->vk.dynamic_graphics_state.ds.stencil.test_enable;
   if (!disable_lrz && stencil_test_enable) {
      VkCompareOp stencil_front_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.front.op.compare;

      VkCompareOp stencil_back_compare_op = (VkCompareOp)
         cmd->vk.dynamic_graphics_state.ds.stencil.back.op.compare;

      bool lrz_allowed = true;
      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_front_compare_op,
                                      cmd->state.stencil_front_write);

      lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
                                      &gras_lrz_cntl, stencil_back_compare_op,
                                      cmd->state.stencil_back_write);

      /* Without depth write it's enough to make sure that the depth test
       * is executed after the stencil test, so temporarily disabling LRZ
       * is enough.
       */
      if (!lrz_allowed) {
         if (z_write_enable) {
            tu_lrz_disable_reason(cmd, "Stencil write");
            disable_lrz = true;
         } else {
            perf_debug(cmd->device, "Skipping LRZ due to stencil write");
            temporary_disable_lrz = true;
         }
      }
   }

   /* Writing depth with blend enabled means we need to invalidate LRZ,
    * because the written depth value could mean that a later draw with
    * depth enabled (where we would otherwise write LRZ) could have
    * fragments which don't pass the depth test due to this draw. For
    * example, consider this sequence of draws, with depth mode GREATER:
    *
    *   draw A:
    *     z=0.1, fragments pass
    *   draw B:
    *     z=0.4, fragments pass
    *     blend enabled (LRZ write disabled)
    *     depth write enabled
    *   draw C:
    *     z=0.2, fragments don't pass
    *     blend disabled
    *     depth write enabled
    *
    * Normally looking at the state in draw C, we'd assume we could
    * enable LRZ write. But this would cause early-z/lrz to discard
    * fragments from draw A which should be visible due to draw B.
    */
   if (reads_dest && z_write_enable && cmd->device->instance->conservative_lrz) {
      tu_lrz_disable_reason(cmd, "Depth write + blending");
      disable_lrz = true;
   }

   if (disable_lrz)
      cmd->state.lrz.valid = false;

   if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
      /* Direction byte on GPU should be set to CUR_DIR_DISABLED,
       * for this it's not enough to emit empty GRAS_LRZ_CNTL.
       */
      gras_lrz_cntl.enable = true;
      gras_lrz_cntl.dir = LRZ_DIR_INVALID;

      return gras_lrz_cntl;
   }

   if (temporary_disable_lrz)
      gras_lrz_cntl.enable = false;

   cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
   if (!cmd->state.lrz.enabled)
      memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));

   return gras_lrz_cntl;
}

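/* Emit the per-draw LRZ state: GRAS_LRZ_CNTL (split into two registers on
 * A7XX) plus RB_LRZ_CNTL mirroring the enable bit.
 */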
template <chip CHIP>
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state<CHIP>(cmd, a);

   tu6_write_lrz_cntl<CHIP>(cmd, cs, gras_lrz_cntl);
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}
TU_GENX(tu6_emit_lrz);