/*
 * Copyright © 2022 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_lrz.h"

#include "tu_clear_blit.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_image.h"

#include "common/freedreno_gpu_event.h"
#include "common/freedreno_lrz.h"

/* See lrz.rst for how the HW works. Here are only the implementation notes.
 *
 * There are a number of cases in which LRZ cannot be used:
 * - Fragment shader side-effects (writing to SSBOs, atomic operations, etc.);
 * - Writing to the stencil buffer;
 * - Writing depth while:
 *   - Changing the direction of the depth test (e.g. from OP_GREATER to OP_LESS);
 *   - Using OP_ALWAYS or OP_NOT_EQUAL;
 * - Clearing depth with vkCmdClearAttachments;
 * - (pre-a650) Not clearing the depth attachment with LOAD_OP_CLEAR;
 * - (pre-a650) Using secondary command buffers;
 * - Sysmem rendering (with a small caveat).
 *
 * A650+ (gen3+)
 * =============
 *
 * While LRZ could be reused between renderpasses, it is disabled when the
 * underlying depth buffer is changed.
 * The following commands could change a depth image:
 * - vkCmdBlitImage*
 * - vkCmdCopyBufferToImage*
 * - vkCmdCopyImage*
 *
 * LRZ Fast-Clear
 * ==============
 *
 * It's always valid to fast-clear. On the other hand, we disable
 * fast-clear if the depth clear value is not 0.0 or 1.0, because it may hurt
 * performance if some primitives are expected to fail the depth test against
 * the actual depth clear value.
 *
 * LRZ Caches
 * ==========
 *
 * ! The policy here is to flush the LRZ cache right after it is changed,
 * so if LRZ data is needed afterwards there is no need to flush it
 * before using LRZ.
 */

static inline void
tu_lrz_disable_reason(struct tu_cmd_buffer *cmd, const char *reason) {
   cmd->state.rp.lrz_disable_reason = reason;
   perf_debug(cmd->device, "Disabling LRZ because '%s'", reason);
}

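/* Program the LRZ buffer base, pitch, and fast-clear buffer base for the
 * given depth image, or zero the registers when there is no depth image.
 * On A7XX this also sets GRAS_LRZ_DEPTH_BUFFER_INFO with the depth format.
 */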
template <chip CHIP>
static void
tu6_emit_lrz_buffer(struct tu_cs *cs, struct tu_image *depth_image)
{
   if (!depth_image) {
      tu_cs_emit_regs(cs,
                      A6XX_GRAS_LRZ_BUFFER_BASE(0),
                      A6XX_GRAS_LRZ_BUFFER_PITCH(0),
                      A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(0));

      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());

      return;
   }

   uint64_t lrz_iova = depth_image->iova + depth_image->lrz_offset;
   uint64_t lrz_fc_iova = depth_image->iova + depth_image->lrz_fc_offset;
   if (!depth_image->lrz_fc_offset)
      lrz_fc_iova = 0;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_LRZ_BUFFER_BASE(.qword = lrz_iova),
                   A6XX_GRAS_LRZ_BUFFER_PITCH(.pitch = depth_image->lrz_pitch),
                   A6XX_GRAS_LRZ_FAST_CLEAR_BUFFER_BASE(.qword = lrz_fc_iova));

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO(
         .depth_format = tu6_pipe2depth(depth_image->vk.format)
      ));
   }
}

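/* Write a single LRZ-related register. On GPUs with the lrz_track_quirk the
 * write goes through CP_REG_WRITE with the TRACK_LRZ tracker; otherwise it
 * is a plain PKT4 register write.
 */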
static void
tu6_write_lrz_reg(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                  struct tu_reg_value reg)
{
   if (cmd->device->physical_device->info->a6xx.lrz_track_quirk) {
      tu_cs_emit_pkt7(cs, CP_REG_WRITE, 3);
      tu_cs_emit(cs, CP_REG_WRITE_0_TRACKER(TRACK_LRZ));
      tu_cs_emit(cs, reg.reg);
      tu_cs_emit(cs, reg.value);
   } else {
      tu_cs_emit_pkt4(cs, reg.reg, 1);
      tu_cs_emit(cs, reg.value);
   }
}

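/* Write GRAS_LRZ_CNTL. On A7XX the fc_enable and disable_on_wrong_dir fields
 * live in the separate GRAS_LRZ_CNTL2 register, so they are split out and
 * written there instead.
 */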
template <chip CHIP>
static void
tu6_write_lrz_cntl(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
                   struct A6XX_GRAS_LRZ_CNTL cntl)
{
   if (CHIP >= A7XX) {
      // A7XX split LRZ_CNTL into two separate registers.
      struct tu_reg_value cntl2 = A7XX_GRAS_LRZ_CNTL2(
         .disable_on_wrong_dir = cntl.disable_on_wrong_dir,
         .fc_enable = cntl.fc_enable,
      );
      cntl.disable_on_wrong_dir = false;
      cntl.fc_enable = false;

      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
      tu6_write_lrz_reg(cmd, cs, cntl2);
   } else {
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_CNTL(cntl));
   }
}

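/* Disable LRZ via GPU direction tracking: write a depth view that cannot
 * match any real view so that later GRAS_LRZ_DEPTH_VIEW comparisons fail,
 * then emit LRZ_CLEAR and LRZ_FLUSH to update and flush the on-GPU LRZ
 * state.
 */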
template <chip CHIP>
static void
tu6_disable_lrz_via_depth_view(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* Disable direction by writing an invalid depth view. */
   tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
      .base_layer = 0b11111111111,
      .layer_count = 0b11111111111,
      .base_mip_level = 0b1111,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, cs, {
      .enable = true,
      .disable_on_wrong_dir = true,
   });

   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_CLEAR);
   tu_emit_event_write<A6XX>(cmd, cs, FD_LRZ_FLUSH);
}

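/* Set up the initial LRZ state for a renderpass from the depth attachment
 * and its clear/load ops. LRZ starts out valid only when depth is cleared
 * or loaded; otherwise only the image view is recorded so that LRZ can
 * still be dynamically disabled later.
 */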
static void
tu_lrz_init_state(struct tu_cmd_buffer *cmd,
                  const struct tu_render_pass_attachment *att,
                  const struct tu_image_view *view)
{
   if (!view->image->lrz_height) {
      assert(!cmd->device->use_lrz || !vk_format_has_depth(att->format));
      return;
   }

   bool clears_depth = att->clear_mask &
      (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT);
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking && !clears_depth)
      return;

   /* We always need an LRZ view just to disable it if there is a depth
    * attachment, there are any secondaries, and GPU tracking is enabled, in
    * order not to rely on loadOp state which doesn't exist with dynamic
    * rendering in secondaries. Otherwise the secondary would have LRZ
    * enabled with a NULL/garbage LRZ buffer.
    */
   cmd->state.lrz.image_view = view;

   if (!clears_depth && !att->load)
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   /* Be optimistic and unconditionally enable fast-clear in
    * secondary cmdbufs and when reusing previous LRZ state.
    */
   cmd->state.lrz.fast_clear = view->image->has_lrz_fc;

   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;
   cmd->state.lrz.reuse_previous_state = !clears_depth;
}

/* Note: if we enable LRZ here, then tu_lrz_init_state() must at least set
 * lrz.image_view, so that an LRZ buffer is present (even if LRZ is
 * dynamically disabled).
 */

static void
tu_lrz_init_secondary(struct tu_cmd_buffer *cmd,
                      const struct tu_render_pass_attachment *att)
{
   bool has_gpu_tracking =
      cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking;

   if (!has_gpu_tracking)
      return;

   if (!cmd->device->use_lrz)
      return;

   if (!vk_format_has_depth(att->format))
      return;

   cmd->state.lrz.valid = true;
   cmd->state.lrz.prev_direction = TU_LRZ_UNKNOWN;
   cmd->state.lrz.gpu_dir_tracking = has_gpu_tracking;

   /* We may not have the depth attachment when executing in a secondary
    * inside a render pass. This means we have to be even more optimistic than
    * the normal case and enable fast clear even if the depth image doesn't
    * support it.
    */
   cmd->state.lrz.fast_clear = true;

   /* These are not used inside secondaries */
   cmd->state.lrz.image_view = NULL;
   cmd->state.lrz.reuse_previous_state = false;
}

template <chip CHIP>
bool
tu_lrzfc_depth_supported(float depth) {
   /* A7XX supports fast-clearing to any value, while A6XX only supports 0.0/1.0 */
   return CHIP >= A7XX || depth == 0.0f || depth == 1.0f;
}

/* This is generally the same as tu_lrz_begin_renderpass(), but we skip
 * actually emitting anything. The lrz state needs to be consistent between
 * renderpasses, but only the first should actually emit commands to disable
 * lrz etc.
 */
template <chip CHIP>
void
tu_lrz_begin_resumed_renderpass(struct tu_cmd_buffer *cmd)
{
   /* Track LRZ valid state */
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));

   uint32_t a;
   for (a = 0; a < cmd->state.pass->attachment_count; a++) {
      if (cmd->state.attachments[a]->image->lrz_height)
         break;
   }

   if (a != cmd->state.pass->attachment_count) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_state(cmd, att, cmd->state.attachments[a]);
      if (att->clear_mask & (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         VkClearValue clear = cmd->state.clear_values[a];
         cmd->state.lrz.depth_clear_value = clear;
         cmd->state.lrz.fast_clear = cmd->state.lrz.fast_clear &&
                                     tu_lrzfc_depth_supported<CHIP>(clear.depthStencil.depth);
      }
      cmd->state.dirty |= TU_CMD_DIRTY_LRZ;
   }
}
TU_GENX(tu_lrz_begin_resumed_renderpass);

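/* Set up LRZ state at the start of a renderpass. If GPU direction tracking
 * is available and multiple subpasses use different depth images, LRZ is
 * disabled for the whole pass instead of switching LRZ buffers mid-pass.
 */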
template <chip CHIP>
void
tu_lrz_begin_renderpass(struct tu_cmd_buffer *cmd)
{
   const struct tu_render_pass *pass = cmd->state.pass;

   cmd->state.rp.lrz_disable_reason = "";

   int lrz_img_count = 0;
   for (unsigned i = 0; i < pass->attachment_count; i++) {
      if (cmd->state.attachments[i]->image->lrz_height)
         lrz_img_count++;
   }

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking &&
       cmd->state.pass->subpass_count > 1 && lrz_img_count > 1) {
      /* Theoretically we could switch between LRZ buffers during the binning
       * and tiling passes, but it is untested and would add complexity for a
       * presumably extremely rare case.
       */
      tu_lrz_disable_reason(cmd, "Several subpasses with different depth attachments");

      for (unsigned i = 0; i < pass->attachment_count; i++) {
         struct tu_image *image = cmd->state.attachments[i]->image;
         tu_disable_lrz<CHIP>(cmd, &cmd->cs, image);
      }

      /* We need a valid LRZ fast-clear base, in case the render pass contents
       * are in secondaries that enable LRZ, so that they can read that LRZ is
       * dynamically disabled. It doesn't matter which we use, so just leave
       * the last one as emitted in tu_disable_lrz().
       */
      memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
      return;
   }

   /* Track LRZ valid state */
   tu_lrz_begin_resumed_renderpass<CHIP>(cmd);

   if (!cmd->state.lrz.valid) {
      tu6_emit_lrz_buffer<CHIP>(&cmd->cs, NULL);
   }
}
TU_GENX(tu_lrz_begin_renderpass);

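/* Set up LRZ state for a secondary command buffer executed inside a
 * renderpass, where only the subpass depth/stencil attachment is known and
 * load/clear ops are not.
 */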
void
tu_lrz_begin_secondary_cmdbuf(struct tu_cmd_buffer *cmd)
{
   memset(&cmd->state.lrz, 0, sizeof(cmd->state.lrz));
   uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   if (a != VK_ATTACHMENT_UNUSED) {
      const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
      tu_lrz_init_secondary(cmd, att);
   }
}

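/* Emit LRZ state at the start of a tiled (gmem) renderpass: bind the LRZ
 * buffer and then either reuse the previous LRZ state, invalidate LRZ via
 * the depth view, or clear the LRZ buffer (fast-clear or full clear).
 */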
template <chip CHIP>
void
tu_lrz_tiling_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   /* TODO: If lrz was never valid for the entire renderpass, we could exit
    * early here. Sometimes we know this ahead of time and null out
    * image_view, but with LOAD_OP_DONT_CARE this only happens if there were
    * no secondaries.
    */
   if (!cmd->state.lrz.image_view)
      return;

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);

   if (lrz->reuse_previous_state) {
      /* Reuse previous LRZ state, the LRZ cache is assumed to have already
       * been invalidated by the previous renderpass.
       */
      assert(lrz->gpu_dir_tracking);

      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      return;
   }

   bool invalidate_lrz = !lrz->valid && lrz->gpu_dir_tracking;
   if (invalidate_lrz) {
      /* Following the blob we elect to disable LRZ for the whole renderpass
       * if it is known that LRZ is disabled somewhere in the renderpass.
       *
       * This is accomplished by making the later GRAS_LRZ_CNTL (in the
       * binning pass) fail the comparison of depth views.
       */
      tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
      tu6_write_lrz_reg(cmd, cs, A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else if (lrz->fast_clear || lrz->gpu_dir_tracking) {
      if (lrz->gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = lrz->image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = lrz->fast_clear,
         .disable_on_wrong_dir = lrz->gpu_dir_tracking,
      });

      /* GRAS_LRZ_CNTL.fc_enable + LRZ_CLEAR - clears fast-clear buffer;
       * GRAS_LRZ_CNTL.disable_on_wrong_dir + LRZ_CLEAR - sets direction to
       *  CUR_DIR_UNSET.
       */
      if (CHIP >= A7XX)
         tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
      tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_CLEAR);
   }

   if (!lrz->fast_clear && !invalidate_lrz) {
      tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      /* Even though we disable fast-clear we still have to dirty the
       * fast-clear buffer, because both secondary cmdbufs and following
       * renderpasses won't know that fast-clear is disabled.
       *
       * TODO: we could avoid this if we don't store depth and don't
       * expect secondary cmdbufs.
       */
      if (lrz->image_view->image->has_lrz_fc) {
         tu6_dirty_lrz_fc<CHIP>(cmd, cs, lrz->image_view->image);
      }
   }
}
TU_GENX(tu_lrz_tiling_begin);

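/* Emit LRZ state at the end of a tiled renderpass and flush the LRZ cache,
 * so that fast-clear and direction data are written out for later
 * renderpasses.
 */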
template <chip CHIP>
void
tu_lrz_tiling_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->state.lrz.fast_clear || cmd->state.lrz.gpu_dir_tracking) {
      tu6_emit_lrz_buffer<CHIP>(cs, cmd->state.lrz.image_view->image);

      if (cmd->state.lrz.gpu_dir_tracking) {
         tu6_write_lrz_reg(cmd, &cmd->cs,
            A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = cmd->state.lrz.image_view->view.GRAS_LRZ_DEPTH_VIEW));
      }

      /* Enable flushing of LRZ fast-clear and of direction buffer */
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {
         .enable = true,
         .fc_enable = cmd->state.lrz.fast_clear,
         .disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking,
      });
   } else {
      tu6_write_lrz_cntl<CHIP>(cmd, cs, {.enable = false});
   }

   tu_emit_event_write<CHIP>(cmd, cs, FD_LRZ_FLUSH);

   /* If gpu_dir_tracking is enabled and lrz is not valid, the blob, at this
    * point, additionally clears the direction buffer:
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0)
    *  GRAS_LRZ_DEPTH_VIEW(.dword = 0xffffffff)
    *  A6XX_GRAS_LRZ_CNTL(.enable = true, .disable_on_wrong_dir = true)
    *  LRZ_CLEAR
    *  LRZ_FLUSH
    * Since this happens after all of the rendering is done, there is no known
    * reason to do such a clear.
    */
}
TU_GENX(tu_lrz_tiling_end);

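/* Emit LRZ state at the start of a sysmem renderpass. Without LRZ feedback
 * the LRZ buffer is not updated by sysmem rendering, so LRZ is either
 * disabled via direction tracking or cleared so that the LRZ test still has
 * valid data.
 */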
template <chip CHIP>
void
tu_lrz_sysmem_begin(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_begin<CHIP>(cmd, cs);
      return;
   }

   if (!cmd->state.lrz.image_view)
      return;

   /* In theory the LRZ buffer could be filled in sysmem mode, to be used in
    * another renderpass, but the benefit is rather dubious.
    */

   struct tu_lrz_state *lrz = &cmd->state.lrz;

   if (cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking) {
      tu_disable_lrz<CHIP>(cmd, cs, lrz->image_view->image);
      /* Make sure depth view comparison will fail. */
      tu6_write_lrz_reg(cmd, cs,
         A6XX_GRAS_LRZ_DEPTH_VIEW(.dword = 0));
   } else {
      tu6_emit_lrz_buffer<CHIP>(cs, lrz->image_view->image);
      /* Even though we disable LRZ writes in sysmem mode, there is still an
       * LRZ test, so LRZ should be cleared.
       */
      if (lrz->fast_clear) {
         tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
            .enable = true,
            .fc_enable = true,
         });

         if (CHIP >= A7XX)
            tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(lrz->depth_clear_value.depthStencil.depth));
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
         tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
      } else {
         tu6_clear_lrz<CHIP>(cmd, cs, lrz->image_view->image, &lrz->depth_clear_value);
      }
   }
}
TU_GENX(tu_lrz_sysmem_begin);

template <chip CHIP>
void
tu_lrz_sysmem_end(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.has_lrz_feedback) {
      tu_lrz_tiling_end<CHIP>(cmd, cs);
      return;
   }

   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);
}
TU_GENX(tu_lrz_sysmem_end);

/* Disable LRZ outside of renderpass. */
template <chip CHIP>
void
tu_disable_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
               struct tu_image *image)
{
   if (!cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   if (!image->lrz_height)
      return;

   tu6_emit_lrz_buffer<CHIP>(cs, image);
   tu6_disable_lrz_via_depth_view<CHIP>(cmd, cs);
}
TU_GENX(tu_disable_lrz);

/* Clear LRZ, used for out of renderpass depth clears. */
template <chip CHIP>
void
tu_lrz_clear_depth_image(struct tu_cmd_buffer *cmd,
                         struct tu_image *image,
                         const VkClearDepthStencilValue *pDepthStencil,
                         uint32_t rangeCount,
                         const VkImageSubresourceRange *pRanges)
{
   if (!rangeCount || !image->lrz_height ||
       !cmd->device->physical_device->info->a6xx.has_lrz_dir_tracking)
      return;

   /* We cannot predict which depth subresource will be used later on,
    * so we just pick the first one with depth cleared and clear LRZ for it.
    */
   const VkImageSubresourceRange *range = NULL;
   for (unsigned i = 0; i < rangeCount; i++) {
      if (pRanges[i].aspectMask &
            (VK_IMAGE_ASPECT_COLOR_BIT | VK_IMAGE_ASPECT_DEPTH_BIT)) {
         range = &pRanges[i];
         break;
      }
   }

   if (!range)
      return;

   bool fast_clear = image->has_lrz_fc &&
                     tu_lrzfc_depth_supported<CHIP>(pDepthStencil->depth);

   tu6_emit_lrz_buffer<CHIP>(&cmd->cs, image);

   tu6_write_lrz_reg(cmd, &cmd->cs, A6XX_GRAS_LRZ_DEPTH_VIEW(
         .base_layer = range->baseArrayLayer,
         .layer_count = vk_image_subresource_layer_count(&image->vk, range),
         .base_mip_level = range->baseMipLevel,
   ));

   tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
      .enable = true,
      .fc_enable = fast_clear,
      .disable_on_wrong_dir = true,
   });

   if (CHIP >= A7XX)
      tu_cs_emit_regs(&cmd->cs, A7XX_GRAS_LRZ_CLEAR_DEPTH_F32(pDepthStencil->depth));
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_CLEAR);
   tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_LRZ_FLUSH);

   if (!fast_clear) {
      tu6_clear_lrz<CHIP>(cmd, &cmd->cs, image, (const VkClearValue*) pDepthStencil);
   }
}
TU_GENX(tu_lrz_clear_depth_image);

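/* Disable LRZ for the remainder of the current renderpass, e.g. when depth
 * is cleared with vkCmdClearAttachments (see the list of limitations at the
 * top of this file). With GPU direction tracking the on-GPU direction is
 * also set to invalid so that the HW keeps LRZ disabled.
 */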
template <chip CHIP>
void
tu_lrz_disable_during_renderpass(struct tu_cmd_buffer *cmd)
{
   assert(cmd->state.pass);

   cmd->state.lrz.valid = false;
   cmd->state.dirty |= TU_CMD_DIRTY_LRZ;

   if (cmd->state.lrz.gpu_dir_tracking) {
      tu6_write_lrz_cntl<CHIP>(cmd, &cmd->cs, {
         .enable = true,
         .dir = LRZ_DIR_INVALID,
         .disable_on_wrong_dir = true,
      });
   }
}
TU_GENX(tu_lrz_disable_during_renderpass);

/* Update lrz state based on the stencil-test func:
 *
 * Conceptually the order of the pipeline is:
 *
 *
 *   FS -> Alpha-Test  ->  Stencil-Test  ->  Depth-Test
 *                              |                |
 *                       if wrmask != 0     if wrmask != 0
 *                              |                |
 *                              v                v
 *                        Stencil-Write      Depth-Write
 *
 * Because Stencil-Test can have side effects (Stencil-Write) prior
 * to the depth test, in this case we potentially need to disable the
 * early lrz-test. See:
 *
 * https://www.khronos.org/opengl/wiki/Per-Sample_Processing
 */
static bool
tu6_stencil_op_lrz_allowed(struct A6XX_GRAS_LRZ_CNTL *gras_lrz_cntl,
                           VkCompareOp func,
                           bool stencil_write)
{
   switch (func) {
   case VK_COMPARE_OP_ALWAYS:
      /* nothing to do for LRZ, but for stencil test when stencil-
       * write is enabled, we need to disable lrz-test, since
       * conceptually stencil test and write happen before depth-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   case VK_COMPARE_OP_NEVER:
      /* fragment never passes, disable lrz_write for this draw. */
      gras_lrz_cntl->lrz_write = false;
      break;
   default:
      /* whether the fragment passes or not depends on the result
       * of the stencil test, which we cannot know when doing the
       * binning pass.
       */
      gras_lrz_cntl->lrz_write = false;
      /* similarly to the VK_COMPARE_OP_ALWAYS case, if there are side-
       * effects from the stencil test we need to disable lrz-test.
       */
      if (stencil_write) {
         return false;
      }
      break;
   }

   return true;
}

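/* Compute the GRAS_LRZ_CNTL value for the current draw from the dynamic
 * depth/stencil state, the fragment shader's LRZ status, and the blend
 * state. This is also where LRZ is either permanently invalidated for the
 * rest of the renderpass or just skipped for this draw when the state is
 * incompatible with LRZ.
 */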
template <chip CHIP>
static struct A6XX_GRAS_LRZ_CNTL
tu6_calculate_lrz_state(struct tu_cmd_buffer *cmd,
                        const uint32_t a)
{
   const struct tu_shader *fs = cmd->state.shaders[MESA_SHADER_FRAGMENT];
   bool z_test_enable = cmd->vk.dynamic_graphics_state.ds.depth.test_enable;
   bool z_write_enable = cmd->vk.dynamic_graphics_state.ds.depth.write_enable;
   bool z_bounds_enable = cmd->vk.dynamic_graphics_state.ds.depth.bounds_test.enable;
   VkCompareOp depth_compare_op =
      cmd->vk.dynamic_graphics_state.ds.depth.compare_op;

   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = { 0 };

   if (!cmd->state.lrz.valid) {
      return gras_lrz_cntl;
   }

   /* If the depth test is disabled we shouldn't touch LRZ.
    * Same if there is no depth attachment.
    */
   if (a == VK_ATTACHMENT_UNUSED || !z_test_enable || !cmd->device->use_lrz)
      return gras_lrz_cntl;

   if (!cmd->state.lrz.gpu_dir_tracking && !cmd->state.attachments) {
      /* Without on-gpu LRZ direction tracking - there is nothing we
       * can do to enable LRZ in secondary command buffers.
       */
      return gras_lrz_cntl;
   }

   /* See comment in tu_pipeline about disabling LRZ write for blending. */
   bool reads_dest = cmd->state.blend_reads_dest;

   gras_lrz_cntl.enable = true;
   gras_lrz_cntl.lrz_write =
      z_write_enable &&
      !reads_dest &&
      !(fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_WRITE);
   gras_lrz_cntl.z_test_enable = z_write_enable;
   gras_lrz_cntl.z_bounds_enable = z_bounds_enable;
   gras_lrz_cntl.fc_enable = cmd->state.lrz.fast_clear;
   gras_lrz_cntl.dir_write = cmd->state.lrz.gpu_dir_tracking;
   gras_lrz_cntl.disable_on_wrong_dir = cmd->state.lrz.gpu_dir_tracking;

   if (CHIP >= A7XX)
      gras_lrz_cntl.z_func = tu6_compare_func(depth_compare_op);

   /* LRZ is disabled until it is cleared, which means that one "wrong"
    * depth test or shader could disable LRZ until the depth buffer is
    * cleared.
    */
   bool disable_lrz = false;
   bool temporary_disable_lrz = false;

   /* What happens in the FS could affect LRZ, e.g.: writes to gl_FragDepth
    * or early fragment tests.  We have to skip LRZ testing and updating, but
    * as long as the depth direction stays the same we can continue with LRZ
    * testing later.
    */
   if (fs->fs.lrz.status & TU_LRZ_FORCE_DISABLE_LRZ) {
      if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN || !cmd->state.lrz.gpu_dir_tracking) {
         perf_debug(cmd->device, "Skipping LRZ due to FS");
         temporary_disable_lrz = true;
      } else {
         tu_lrz_disable_reason(cmd, "FS writes depth or has side-effects (TODO: fix for gpu-direction-tracking case)");
         disable_lrz = true;
      }
   }

   /* If Z is not written, it doesn't affect LRZ buffer state.
    * Which means two things:
    * - Don't lock the direction until Z is written for the first time;
    * - If Z isn't written and the direction IS locked, it's possible to just
    *   temporarily disable LRZ instead of fully bailing out, when the
    *   direction changes.
    */

   enum tu_lrz_direction lrz_direction = TU_LRZ_UNKNOWN;
   switch (depth_compare_op) {
   case VK_COMPARE_OP_ALWAYS:
   case VK_COMPARE_OP_NOT_EQUAL:
      /* OP_ALWAYS and OP_NOT_EQUAL could have depth values going in any
       * direction, so if there is a depth write - LRZ must be disabled.
       */
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + ALWAYS/NOT_EQUAL");
         disable_lrz = true;
         gras_lrz_cntl.dir = LRZ_DIR_INVALID;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to ALWAYS/NOT_EQUAL");
         temporary_disable_lrz = true;
      }
      break;
   case VK_COMPARE_OP_EQUAL:
   case VK_COMPARE_OP_NEVER:
      /* The blob disables LRZ for OP_EQUAL, and from our empirical
       * evidence it is the right thing to do.
       *
       * Both OP_EQUAL and OP_NEVER don't change the LRZ buffer so
       * we could just temporarily disable LRZ.
       */
      temporary_disable_lrz = true;
      break;
   case VK_COMPARE_OP_GREATER:
   case VK_COMPARE_OP_GREATER_OR_EQUAL:
      lrz_direction = TU_LRZ_GREATER;
      gras_lrz_cntl.greater = true;
      gras_lrz_cntl.dir = LRZ_DIR_GE;
      break;
   case VK_COMPARE_OP_LESS:
   case VK_COMPARE_OP_LESS_OR_EQUAL:
      lrz_direction = TU_LRZ_LESS;
      gras_lrz_cntl.greater = false;
      gras_lrz_cntl.dir = LRZ_DIR_LE;
      break;
   default:
      unreachable("bad VK_COMPARE_OP value or uninitialized");
      break;
   };

   /* If the depthfunc direction is changed, bail out on using LRZ. The
    * LRZ buffer encodes a min/max depth value per block, but if
    * we switch from GT/GE <-> LT/LE, those values cannot be
    * interpreted properly.
    */
   if (cmd->state.lrz.prev_direction != TU_LRZ_UNKNOWN &&
       lrz_direction != TU_LRZ_UNKNOWN &&
       cmd->state.lrz.prev_direction != lrz_direction) {
      if (z_write_enable) {
         tu_lrz_disable_reason(cmd, "Depth write + compare-op direction change");
         disable_lrz = true;
      } else {
         perf_debug(cmd->device, "Skipping LRZ due to direction change");
         temporary_disable_lrz = true;
      }
   }
773 
774    /* Consider the following sequence of depthfunc changes:
775     *
776     * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_GREATER
777     * LRZ is disabled during COMPARE_OP_EQUAL but could be enabled
778     * during second VK_COMPARE_OP_GREATER.
779     *
780     * - COMPARE_OP_GREATER -> COMPARE_OP_EQUAL -> COMPARE_OP_LESS
781     * Here, LRZ is disabled during COMPARE_OP_EQUAL and should become
782     * invalid during COMPARE_OP_LESS.
783     *
784     * This shows that we should keep last KNOWN direction.
785     */
786    if (z_write_enable && lrz_direction != TU_LRZ_UNKNOWN)
787       cmd->state.lrz.prev_direction = lrz_direction;
788 
789    /* Invalidate LRZ and disable write if stencil test is enabled */
790    bool stencil_test_enable = cmd->vk.dynamic_graphics_state.ds.stencil.test_enable;
791    if (!disable_lrz && stencil_test_enable) {
792       VkCompareOp stencil_front_compare_op = (VkCompareOp)
793          cmd->vk.dynamic_graphics_state.ds.stencil.front.op.compare;
794 
795       VkCompareOp stencil_back_compare_op = (VkCompareOp)
796          cmd->vk.dynamic_graphics_state.ds.stencil.back.op.compare;
797 
798       bool lrz_allowed = true;
799       lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
800                                       &gras_lrz_cntl, stencil_front_compare_op,
801                                       cmd->state.stencil_front_write);
802 
803       lrz_allowed = lrz_allowed && tu6_stencil_op_lrz_allowed(
804                                       &gras_lrz_cntl, stencil_back_compare_op,
805                                       cmd->state.stencil_back_write);
806 
807       /* Without depth write it's enough to make sure that depth test
808        * is executed after stencil test, so temporary disabling LRZ is enough.
809        */
810       if (!lrz_allowed) {
811          if (z_write_enable) {
812             tu_lrz_disable_reason(cmd, "Stencil write");
813             disable_lrz = true;
814          } else {
815             perf_debug(cmd->device, "Skipping LRZ due to stencil write");
816             temporary_disable_lrz = true;
817          }
818       }
819    }
820 
821    /* Writing depth with blend enabled means we need to invalidate LRZ,
822     * because the written depth value could mean that a later draw with
823     * depth enabled (where we would otherwise write LRZ) could have
824     * fragments which don't pass the depth test due to this draw.  For
825     * example, consider this sequence of draws, with depth mode GREATER:
826     *
827     *   draw A:
828     *     z=0.1, fragments pass
829     *   draw B:
830     *     z=0.4, fragments pass
831     *     blend enabled (LRZ write disabled)
832     *     depth write enabled
833     *   draw C:
834     *     z=0.2, fragments don't pass
835     *     blend disabled
836     *     depth write enabled
837     *
838     * Normally looking at the state in draw C, we'd assume we could
839     * enable LRZ write.  But this would cause early-z/lrz to discard
840     * fragments from draw A which should be visible due to draw B.
841     */
842    if (reads_dest && z_write_enable && cmd->device->instance->conservative_lrz) {
843       tu_lrz_disable_reason(cmd, "Depth write + blending");
844       disable_lrz = true;
845    }
846 
847    if (disable_lrz)
848       cmd->state.lrz.valid = false;
849 
850    if (disable_lrz && cmd->state.lrz.gpu_dir_tracking) {
851       /* Direction byte on GPU should be set to CUR_DIR_DISABLED,
852        * for this it's not enough to emit empty GRAS_LRZ_CNTL.
853        */
854       gras_lrz_cntl.enable = true;
855       gras_lrz_cntl.dir = LRZ_DIR_INVALID;
856 
857       return gras_lrz_cntl;
858    }
859 
860    if (temporary_disable_lrz)
861       gras_lrz_cntl.enable = false;
862 
863    cmd->state.lrz.enabled = cmd->state.lrz.valid && gras_lrz_cntl.enable;
864    if (!cmd->state.lrz.enabled)
865       memset(&gras_lrz_cntl, 0, sizeof(gras_lrz_cntl));
866 
867    return gras_lrz_cntl;
868 }
869 
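/* Emit the per-draw LRZ state: calculate GRAS_LRZ_CNTL for the current draw
 * and write it along with RB_LRZ_CNTL.
 */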
template <chip CHIP>
void
tu6_emit_lrz(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   const uint32_t a = cmd->state.subpass->depth_stencil_attachment.attachment;
   struct A6XX_GRAS_LRZ_CNTL gras_lrz_cntl = tu6_calculate_lrz_state<CHIP>(cmd, a);

   tu6_write_lrz_cntl<CHIP>(cmd, cs, gras_lrz_cntl);
   tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(.enable = gras_lrz_cntl.enable));
}
TU_GENX(tu6_emit_lrz);
881