/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "tu_clear_blit.h"

#include "ir3/ir3_nir.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"

#include "tu_buffer.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_formats.h"
#include "tu_image.h"
#include "tu_tracepoints.h"
#include "tu_lrz.h"

#include "common/freedreno_gpu_event.h"
#include "common/freedreno_lrz.h"

static const VkOffset2D blt_no_coord = { ~0, ~0 };

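/* Round a [0,1] float to an n-bit unorm value; e.g.
 * tu_pack_float32_for_unorm(0.5f, 8) = 128 (0.5 * 255 = 127.5, rounded
 * half-to-even).
 */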
static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}

/* r2d_ = BLIT_OP_SCALE operations */

static enum a6xx_2d_ifmt
format_to_ifmt(enum pipe_format format)
{
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
      return R2D_FLOAT32;
   if (format == PIPE_FORMAT_S8_UINT)
      return R2D_INT8;
   if (format == PIPE_FORMAT_A8_UNORM)
      return R2D_UNORM8;

   /* use the size of the red channel to find the corresponding "ifmt" */
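   /* e.g. R8G8B8A8_UNORM (8-bit red) -> R2D_UNORM8, R8G8B8A8_UINT ->
    * R2D_INT8, R32G32B32A32_SFLOAT (32-bit red) -> R2D_FLOAT32.
    */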
   bool is_int = util_format_is_pure_integer(format);
   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (util_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
   }
}

template <chip CHIP>
static struct tu_native_format
blit_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode, bool gmem)
{
   struct tu_native_format fmt = tu6_format_texture(format, tile_mode);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* Similar to fdl6_view_init, we want to use
       * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 or FMT6_8_8_8_8_UNORM for blit
       * src.  Since this is called when there is no image and thus no ubwc,
       * we can always use FMT6_8_8_8_8_UNORM.
       *
       * Note (A7XX): Since it's erroneous to use FMT6_8_8_8_8_UNORM for a GMEM
       * image (see blit_base_format), we use FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8
       * instead.
       */
      fmt.fmt = CHIP >= A7XX && gmem ? FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 : FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

static struct tu_native_format
blit_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode)
{
   struct tu_native_format fmt = tu6_format_color(format, tile_mode);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* similar to blit_format_texture but for blit dst */
      fmt.fmt = FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

template <chip CHIP>
static enum a6xx_format
blit_base_format(enum pipe_format format, bool ubwc, bool gmem)
{
   if (CHIP >= A7XX && gmem)
      /* A7XX requires D24S8 in GMEM to always be treated as
       * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 regardless of whether the image
       * is UBWC-compatible. Using FMT6_8_8_8_8_UNORM instead will result
       * in misrendering around the edges of the destination image.
       */
      ubwc = true;

   if (ubwc) {
      switch (format) {
      case PIPE_FORMAT_Z24X8_UNORM:
      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
         /* use the ubwc-compatible FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 */
         return FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
      default:
         break;
      }
   }

   /* note: tu6_format_color doesn't care about tiling for .fmt field */
   return blit_format_color(format, TILE6_LINEAR).fmt;
}

static void
r2d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   tu_cs_emit_regs(cs,
      A6XX_GRAS_2D_DST_TL(.x = dst.x,                    .y = dst.y),
      A6XX_GRAS_2D_DST_BR(.x = dst.x + extent.width - 1, .y = dst.y + extent.height - 1));

   if (src.x == blt_no_coord.x)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src.x),
                   A6XX_GRAS_2D_SRC_BR_X(src.x + extent.width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src.y),
                   A6XX_GRAS_2D_SRC_BR_Y(src.y + extent.height - 1));
}

static void
r2d_clear_value(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                enum pipe_format format,
                const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
   case PIPE_FORMAT_Z24X8_UNORM:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
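      /* e.g. depth = 1.0, stencil = 0x42 gives
       * clear_value = { 0xffffff, 0xffff, 0xff, 0x42 }; presumably the
       * hardware takes the low byte of each word as one unorm8 channel.
       */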
      break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_R9G9B9E5_FLOAT:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!util_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = util_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
             format == PIPE_FORMAT_R11G11B10_FLOAT);

      for (unsigned i = 0; i < 4; i++) {
         if (desc->swizzle[i] > PIPE_SWIZZLE_W)
            continue;

         const struct util_format_channel_description *ch =
            &desc->channel[desc->swizzle[i]];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}

static void
fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
                 enum a6xx_format *fmt)
{
   /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
    * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
    * We also have to move stencil, which is normally in the .w channel, into
    * the right channel. Reinterpreting the S8 texture as A8_UNORM solves both
    * problems, and avoids using a swap, which seems to sometimes not work
    * with a D24S8 source, or a texture swizzle which is only supported with
    * the 3d path. Sometimes this blit happens on already-constructed
    * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
    */
   if (*src_format == PIPE_FORMAT_S8_UINT &&
       (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *fmt = FMT6_A8_UNORM;
      *src_format = PIPE_FORMAT_A8_UNORM;
   }
}

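/* The mirror of fixup_src_format: reinterpret an S8 destination as A8_UNORM
 * when the source is D24S8, for the same reasons as above.
 */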
static void
fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
                 enum a6xx_format *fmt)
{
   if (*dst_format == PIPE_FORMAT_S8_UINT &&
       (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *dst_format = PIPE_FORMAT_A8_UNORM;
      *fmt = FMT6_A8_UNORM;
   }
}

template <chip CHIP>
static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   enum a6xx_format fmt = (enum a6xx_format)(
      src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);

   src_info =
      (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
      A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);

   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

template <chip CHIP>
static void
r2d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer,
              VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has a shifted pitch field */
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

template <chip CHIP>
static void
r2d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP,).reg, 5);
   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->stencil_pitch).value);
}

template <chip CHIP>
static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP,
                      .color_format = color_format,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
                   SP_PS_2D_SRC(CHIP, .qword = va),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
}

template <chip CHIP>
static void
r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t dst_info = iview->RB_2D_DST_INFO;
   enum a6xx_format fmt =
      (enum a6xx_format)(dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK);
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);

   dst_info =
         (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, dst_info);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->stencil_pitch).value);
}

static void
r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);
   fmt.fmt = color_fmt;

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = fmt.fmt,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format)),
                   A6XX_RB_2D_DST(.qword = va),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

template <chip CHIP>
static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 enum pipe_format src_format,
                 enum pipe_format dst_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
   fixup_dst_format(src_format, &dst_format, &fmt);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);

   uint32_t unknown_8c01 = 0;

   /* note: the only format with partial clearing is D24S8 */
   if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01);    // TODO: seems to always be 0 on A7XX

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
         .rotate = (enum a6xx_rotation) blit_param,
         .solid_color = clear,
         .color_format = fmt,
         .scissor = scissor,
         .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
         .mask = 0xf,
         .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
      ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (CHIP > A6XX) {
      tu_cs_emit_pkt4(cs, REG_A7XX_SP_PS_UNKNOWN_B2D2, 1);
      tu_cs_emit(cs, 0x20000000);
   }

   if (fmt == FMT6_10_10_10_2_UNORM_DEST)
      fmt = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, SP_2D_DST_FORMAT(CHIP,
         .sint = util_format_is_pure_sint(dst_format),
         .uint = util_format_is_pure_uint(dst_format),
         .color_format = fmt,
         .srgb = util_format_is_srgb(dst_format),
         .mask = 0xf));
}

template <chip CHIP>
static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          enum pipe_format src_format,
          enum pipe_format dst_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   assert(samples == VK_SAMPLE_COUNT_1_BIT);

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
   }

   r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      /* This is a non-context register, so we have to WFI before changing it. */
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
   }

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL);
   }
}

/* r3d_ = shader path operations */

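/* Read "components" consecutive dwords of driver consts starting at
 * component offset "base" (base 0 is c0.x, base 4 is c1.x, and so on).
 */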
static nir_def *
load_const(nir_builder *b, unsigned base, unsigned components)
{
   return nir_load_const_ir3(b, components, 32, nir_imm_int(b, 0),
                             .base = base);
}

static nir_shader *
build_blit_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     nir_imm_float(b, 0.0),
                     nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_coords =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
                          "coords");
   out_coords->data.location = VARYING_SLOT_VAR0;

   nir_def *vert0_coords = load_const(b, 2, 2);
   nir_def *vert1_coords = load_const(b, 6, 2);

   /* Only used with "z scale" blit path which uses a 3d texture */
   nir_def *z_coord = load_const(b, 16, 1);

   nir_def *coords = nir_bcsel(b, nir_i2b(b, vertex), vert1_coords, vert0_coords);
   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
                     z_coord);

   nir_store_var(b, out_coords, coords, 0x7);

   return b->shader;
}

static nir_shader *
build_clear_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   /* c0.z is used to clear depth */
   nir_def *depth = load_const(b, 2, 1);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     depth, nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_layer =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
                          "gl_Layer");
   out_layer->data.location = VARYING_SLOT_LAYER;
   nir_def *layer = load_const(b, 3, 1);
   nir_store_var(b, out_layer, layer, 1);

   return b->shader;
}

static nir_shader *
build_blit_fs_shader(bool zscale)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     zscale ? "zscale blit fs" : "blit fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   unsigned coord_components = zscale ? 3 : 2;
   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(coord_components),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord,
                                     nir_load_var(b, in_coords));
   tex->coord_components = coord_components;

   nir_def_init(&tex->instr, &tex->def, 4, 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

/* We can only read multisample textures via txf_ms, so we need a separate
 * variant for them.
 */
static nir_shader *
build_ms_copy_fs_shader(bool half_float)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "multisample copy fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out,
                          half_float ? glsl_f16vec_type(4) : glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(2),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);

   tex->op = nir_texop_txf_ms;

   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = half_float ? nir_type_float16 : nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);
   BITSET_SET(b->shader->info.textures_used_by_txf, 0);

   nir_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
   tex->coord_components = 2;

   tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index,
                                     nir_load_sample_id(b));

   nir_def_init(&tex->instr, &tex->def, 4, half_float ? 16 : 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

static nir_shader *
build_clear_fs_shader(unsigned mrts)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "mrt%u clear fs", mrts);
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   for (unsigned i = 0; i < mrts; i++) {
      nir_variable *out_color =
         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                             "color");
      out_color->data.location = FRAG_RESULT_DATA0 + i;

      nir_def *color = load_const(b, 4 * i, 4);
      nir_store_var(b, out_color, color, 0xf);
   }

   return b->shader;
}

static void
compile_shader(struct tu_device *dev, struct nir_shader *nir,
               unsigned consts, unsigned *offset, enum global_shader idx)
{
   nir->options = ir3_get_compiler_options(dev->compiler);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);

   ir3_finalize_nir(dev->compiler, nir);

   const struct ir3_shader_options options = {
      .num_reserved_user_consts = align(consts, 8),
      .api_wavesize = IR3_SINGLE_OR_DOUBLE,
      .real_wavesize = IR3_SINGLE_OR_DOUBLE,
   };
   struct ir3_shader *sh =
      ir3_shader_from_nir(dev->compiler, nir, &options, NULL);

   struct ir3_shader_key key = {};
   bool created;
   struct ir3_shader_variant *so =
      ir3_shader_get_variant(sh, &key, false, false, &created);

   struct tu6_global *global = dev->global_bo_map;

   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
   dev->global_shaders[idx] = sh;
   dev->global_shader_variants[idx] = so;
   memcpy(&global->shaders[*offset], so->bin,
          sizeof(uint32_t) * so->info.sizedwords);
   dev->global_shader_va[idx] = dev->global_bo->iova +
      offsetof_arr(struct tu6_global, shaders, *offset);
   *offset += align(so->info.sizedwords, 32);
}

void
tu_init_clear_blit_shaders(struct tu_device *dev)
{
   unsigned offset = 0;
   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
   compile_shader(dev, build_ms_copy_fs_shader(false), 0, &offset, GLOBAL_SH_FS_COPY_MS);
   compile_shader(dev, build_ms_copy_fs_shader(true), 0, &offset, GLOBAL_SH_FS_COPY_MS_HALF);

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
                     (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts));
   }
}

void
tu_destroy_clear_blit_shaders(struct tu_device *dev)
{
   for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
      if (dev->global_shaders[i])
         ir3_shader_destroy(dev->global_shaders[i]);
   }
}

enum r3d_type {
   R3D_CLEAR,
   R3D_BLIT,
   R3D_COPY_HALF,
};

template <chip CHIP>
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
{
   enum global_shader vs_id =
      type == R3D_CLEAR ? GLOBAL_SH_VS_CLEAR : GLOBAL_SH_VS_BLIT;

   struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];

   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;

   if (z_scale) {
      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
   } else if (type == R3D_COPY_HALF) {
      /* Avoid canonicalizing NaNs due to implicit conversions in the shader.
       *
       * TODO: Add a half-float blit shader that uses texture() but with half
       * registers to avoid NaN canonicalization for the single-sampled case.
       */
      fs_id = GLOBAL_SH_FS_COPY_MS_HALF;
   } else if (samples != VK_SAMPLE_COUNT_1_BIT) {
      fs_id = GLOBAL_SH_FS_COPY_MS;
   }

   unsigned num_rts = util_bitcount(rts_mask);
   if (type == R3D_CLEAR)
      fs_id = (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts);

   struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];

   tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .cs_ibo = true,
         .gfx_ibo = true,
         .gfx_shared_const = true,
         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff,));

   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_EVAL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_GEOMETRY, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);

   struct tu_pvtmem_config pvtmem = {};
   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   if (CHIP == A7XX) {
      tu_cs_emit_regs(cs, A7XX_VPC_PRIMITIVE_CNTL_0());
   }

   tu6_emit_vpc<CHIP>(cs, vs, NULL, NULL, NULL, fs);

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));

      tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
   }

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_vs<CHIP>(cs, vs, 0);
   tu6_emit_hs<CHIP>(cs, NULL);
   tu6_emit_ds<CHIP>(cs, NULL);
   tu6_emit_gs<CHIP>(cs, NULL);
   tu6_emit_fs<CHIP>(cs, fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .clip_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .vp_xform_disable = 1,
                      .persp_division_disable = 1,));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP));
   if (CHIP == A6XX) {
      tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
   } else {
      tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2());
   }

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());

   if (rts_mask) {
      unsigned rts_count = util_last_bit(rts_mask);
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
      unsigned rt = 0;
      for (unsigned i = 0; i < rts_count; i++) {
         unsigned regid = 0;
         if (rts_mask & (1u << i))
            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid) |
                        COND(regid & HALF_REG_ID,
                             A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
      }
   }

   tu6_emit_msaa(cs, samples, false);
}

static void
tu6_emit_blit_consts_load(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
                          uint32_t opcode,
                          enum a6xx_state_block block,
                          uint32_t offset,
                          const void *consts,
                          uint32_t size_vec4)
{
   assert(offset % cmd->device->compiler->const_upload_unit == 0);

   struct tu_cs_memory mem = {};
   VkResult result = tu_cs_alloc(&cmd->sub_cs, size_vec4, 4, &mem);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(mem.map, consts, size_vec4 * 4 * sizeof(uint32_t));

   tu_cs_emit_pkt7(cs, opcode, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(block) |
                  CP_LOAD_STATE6_0_NUM_UNIT(size_vec4));
   tu_cs_emit_qw(cs, mem.iova);
}

static void
r3d_coords_raw(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const float *coords)
{
   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 0, coords, 2);
}

/* z coordinate for "z scale" blit path which uses a 3d texture */
static void
r3d_coord_z(struct tu_cmd_buffer *cmd, struct tu_cs *cs, float z)
{
   const uint32_t coord[] = {
      fui(z),
      0,
      0,
      0,
   };

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 4, coord, 1);
}

static void
r3d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   const bool has_src = src.x != blt_no_coord.x;
   int32_t src_x1 = has_src ? src.x : 0;
   int32_t src_y1 = has_src ? src.y : 0;
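   /* The array below matches the blit VS consts (see build_blit_vs_shader
    * and load_const): c0.xy/c1.xy hold the two vertex positions and
    * c0.zw/c1.zw the corresponding texture coordinates.
    */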

   const float coords[] = {
      dst.x,
      dst.y,
      src_x1,
      src_y1,
      dst.x + extent.width,
      dst.y + extent.height,
      src_x1 + extent.width,
      src_y1 + extent.height,
   };
   r3d_coords_raw(cmd, cs, coords);
}

static void
r3d_clear_value(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
{
   uint32_t coords[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      coords[0] = fui((tmp & 0xff) / 255.0f);
      coords[1] = fui((tmp >> 8 & 0xff) / 255.0f);
      coords[2] = fui((tmp >> 16 & 0xff) / 255.0f);
      coords[3] = fui((val->depthStencil.stencil & 0xff) / 255.0f);
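      /* e.g. depth = 1.0 gives tmp = 0xffffff and float channels
       * { 1.0f, 1.0f, 1.0f, stencil / 255.0f }.
       */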
   } break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      coords[0] = fui(val->depthStencil.depth);
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   case PIPE_FORMAT_S8_UINT:
      coords[0] = val->depthStencil.stencil & 0xff;
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   default:
      /* color formats use the clear value as-is */
      assert(!util_format_is_depth_or_stencil(format));
      memcpy(coords, val->color.uint32, 4 * sizeof(uint32_t));
      break;
   }

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER, 0, coords, 1);
}

static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
               CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
               CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
               CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
               CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
      CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
      CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
      CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
      CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}

static void
r3d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];
   memcpy(desc, iview->descriptor, sizeof(desc));

   enum a6xx_format fmt = (enum a6xx_format)(
      (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >> A6XX_TEX_CONST_0_FMT__SHIFT);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);
   desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
      A6XX_TEX_CONST_0_FMT(fmt);

   r3d_src_common(cmd, cs, desc,
                  iview->layer_size * layer,
                  iview->ubwc_layer_size * layer,
                  filter);
}

template <chip CHIP>
static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

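   /* Build a minimal linear 2D texture descriptor by hand: dword 0 holds
    * format/swizzle/swap, dwords 1-2 the size, pitch and type, dwords 4-5
    * the base address; everything else stays zero.
    */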
   struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   desc[0] =
      COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
      A6XX_TEX_CONST_0_FMT(color_format) |
      A6XX_TEX_CONST_0_SWAP(fmt.swap) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = 0;
   desc[4] = va;
   desc[5] = va >> 32;
   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->depth_base_addr;

   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_32_FLOAT) |
              A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
              A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
              A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
              A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->depth_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->depth_layer_size) |
      (iview->view.descriptor[3] & ~A6XX_TEX_CONST_3_ARRAY_PITCH__MASK);
   desc[4] = va;
   desc[5] = va >> 32;

   r3d_src_common(cmd, cs, desc,
                  iview->depth_layer_size * layer,
                  iview->view.ubwc_layer_size * layer,
                  VK_FILTER_NEAREST);
}

static void
r3d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->stencil_base_addr;

   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT) |
              A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
              A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
              A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
              A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->stencil_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->stencil_layer_size);
   desc[4] = va;
   desc[5] = va >> 32;
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, iview->stencil_layer_size * layer, 0,
                  VK_FILTER_NEAREST);
}

static void
r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *iview,
                  uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));

   /* Fix up D24 formats because we always load both depth and stencil. */
   enum pipe_format format = iview->view.format;
   if (format == PIPE_FORMAT_X24S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM ||
       format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
      if (iview->view.ubwc_enabled)
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
      else
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UNORM);
   }

   /* When loading/storing GMEM we always load the full image and don't do
    * any swizzling or swapping; that's done in the draw when reading/writing
    * GMEM, so we need to fix up the swizzle and swap here.
1267     */
1268    desc[0] &= ~(A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1269                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
1270                 A6XX_TEX_CONST_0_SWAP__MASK);
1271    desc[0] |= A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1272               A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1273               A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1274               A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1275 
1276    r3d_src_common(cmd, cs, desc,
1277                   iview->view.layer_size * layer,
1278                   iview->view.ubwc_layer_size * layer,
1279                   VK_FILTER_NEAREST);
1280 }
1281 
1282 template <chip CHIP>
1283 static void
r3d_src_gmem(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * iview,enum pipe_format format,enum pipe_format dst_format,uint32_t gmem_offset,uint32_t cpp)1284 r3d_src_gmem(struct tu_cmd_buffer *cmd,
1285              struct tu_cs *cs,
1286              const struct tu_image_view *iview,
1287              enum pipe_format format,
1288              enum pipe_format dst_format,
1289              uint32_t gmem_offset,
1290              uint32_t cpp)
1291 {
1292    uint32_t desc[A6XX_TEX_CONST_DWORDS];
1293    memcpy(desc, iview->view.descriptor, sizeof(desc));
1294 
1295    enum a6xx_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, true).fmt;
1296    fixup_src_format(&format, dst_format, &fmt);
1297 
1298    /* patch the format so that depth/stencil get the right format and swizzle */
1299    desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1300                 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1301                 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1302    desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
1303                A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1304                A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1305                A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1306                A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1307 
1308    /* patched for gmem */
1309    desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1310    desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1311    desc[2] =
1312       A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1313       A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
1314    desc[3] = 0;
1315    desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
1316    desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
1317    for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1318       desc[i] = 0;
1319 
1320    r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1321 }
1322 
1323 template <chip CHIP>
1324 static void
r3d_dst(struct tu_cs * cs,const struct fdl6_view * iview,uint32_t layer,enum pipe_format src_format)1325 r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1326         enum pipe_format src_format)
1327 {
1328    uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;
1329 
1330    enum a6xx_format fmt = (enum a6xx_format)(
1331       mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1332    enum pipe_format dst_format = iview->format;
1333    fixup_dst_format(src_format, &dst_format, &fmt);
1334    mrt_buf_info =
1335       (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
1336       A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
1337 
1338    tu_cs_emit_regs(cs,
1339       RB_MRT_BUF_INFO(CHIP, 0, .dword = mrt_buf_info),
1340       A6XX_RB_MRT_PITCH(0, iview->pitch),
1341       A6XX_RB_MRT_ARRAY_PITCH(0, iview->layer_size),
1342       A6XX_RB_MRT_BASE(0, .qword = tu_layer_address(iview, layer)),
1343       A6XX_RB_MRT_BASE_GMEM(0),
1344    );
1345 
1346    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1347    tu_cs_image_flag_ref(cs, iview, layer);
1348 
1349    /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
1350     * FMT6_NV12_Y.
1351     */
1352    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));
1353 
1354    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->ubwc_enabled));
1355    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1356 }
1357 
1358 template <chip CHIP>
1359 static void
r3d_dst_depth(struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)1360 r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1361 {
1362    tu_cs_emit_regs(cs,
1363       RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_depth(iview, RB_MRT_BUF_INFO)),
1364       A6XX_RB_MRT_PITCH(0, iview->depth_pitch),
1365       A6XX_RB_MRT_ARRAY_PITCH(0, iview->depth_layer_size),
1366       A6XX_RB_MRT_BASE(0, .qword = iview->depth_base_addr + iview->depth_layer_size * layer),
1367       A6XX_RB_MRT_BASE_GMEM(0),
1368    );
1369 
1370    tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1371    tu_cs_image_flag_ref(cs, &iview->view, layer);
1372 
1373    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->view.ubwc_enabled));
1374    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1375 }
1376 
1377 template <chip CHIP>
1378 static void
1379 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1380 {
1381    tu_cs_emit_regs(cs,
1382       RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_stencil(iview, RB_MRT_BUF_INFO)),
1383       A6XX_RB_MRT_PITCH(0, iview->stencil_pitch),
1384       A6XX_RB_MRT_ARRAY_PITCH(0, iview->stencil_layer_size),
1385       A6XX_RB_MRT_BASE(0, .qword = iview->stencil_base_addr + iview->stencil_layer_size * layer),
1386       A6XX_RB_MRT_BASE_GMEM(0),
1387    );
1388 
1389    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1390    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1391 }
1392 
1393 template <chip CHIP>
1394 static void
1395 r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1396                enum pipe_format src_format)
1397 {
1398    struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
1399 
1400    enum a6xx_format color_fmt = fmt.fmt;
1401    fixup_dst_format(src_format, &format, &color_fmt);
1402 
1403    tu_cs_emit_regs(cs,
1404                    RB_MRT_BUF_INFO(CHIP, 0, .color_format = color_fmt, .color_swap = fmt.swap),
1405                    A6XX_RB_MRT_PITCH(0, pitch),
1406                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1407                    A6XX_RB_MRT_BASE(0, .qword = va),
1408                    A6XX_RB_MRT_BASE_GMEM(0, 0));
1409 
1410    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1411    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1412 }
1413 
1414 template <chip CHIP>
1415 static void
1416 r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1417              const struct tu_image_view *iview,
1418              const struct tu_render_pass_attachment *att,
1419              bool separate_stencil, unsigned layer)
1420 {
1421    unsigned RB_MRT_BUF_INFO;
1422    unsigned gmem_offset;
1423 
1424    if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1425       if (!separate_stencil) {
1426          RB_MRT_BUF_INFO = tu_image_view_depth(iview, RB_MRT_BUF_INFO);
1427          gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1428       } else {
1429          RB_MRT_BUF_INFO = tu_image_view_stencil(iview, RB_MRT_BUF_INFO);
1430          gmem_offset = tu_attachment_gmem_offset_stencil(cmd, att, layer);
1431       }
1432    } else {
1433       RB_MRT_BUF_INFO = iview->view.RB_MRT_BUF_INFO;
1434       gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1435    }
1436 
1437    tu_cs_emit_regs(cs,
1438                    RB_MRT_BUF_INFO(CHIP, 0, .dword = RB_MRT_BUF_INFO),
1439                    A6XX_RB_MRT_PITCH(0, 0),
1440                    A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1441                    A6XX_RB_MRT_BASE(0, 0),
1442                    A6XX_RB_MRT_BASE_GMEM(0, gmem_offset));
1443 
1444    enum a6xx_format color_format =
1445       (enum a6xx_format)(RB_MRT_BUF_INFO & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1446    tu_cs_emit_regs(cs,
1447                    A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = color_format));
1448 
1449    tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1450    tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1451 }
1452 
1453 static uint8_t
1454 aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
1455 {
1456    uint8_t mask = 0xf;
1457    assert(aspect_mask);
1458    /* note: the only format with partial writes is D24S8;
1459     * clear/blit uses the _AS_R8G8B8A8 format to access it
1460     */
1461    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1462       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1463          mask = 0x7;
1464       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1465          mask = 0x8;
1466    }
1467    return mask;
1468 }
1469 
1470 static uint8_t
1471 aspect_write_mask_generic_clear(enum pipe_format format, VkImageAspectFlags aspect_mask)
1472 {
1473    uint8_t mask = 0xf;
1474    assert(aspect_mask);
1475    /* note: the only format with partial writes is D24S8 */
1476    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1477       if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1478          mask = 0x1;
1479       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1480          mask = 0x2;
1481    }
1482    return mask;
1483 }
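
/* Note: the two mask helpers above target different paths. aspect_write_mask()
 * returns an RGBA component writemask for the _AS_R8G8B8A8 view of D24S8
 * (depth occupies RGB, hence 0x7; stencil occupies A, hence 0x8), while
 * aspect_write_mask_generic_clear() feeds the generic clear event path (see
 * tu7_generic_layer_clear()), which appears to take per-aspect bits instead
 * (0x1 depth, 0x2 stencil).
 */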
1484 
1485 enum r3d_blit_param {
1486    R3D_Z_SCALE = 1 << 0,
1487    R3D_DST_GMEM = 1 << 1,
1488    R3D_COPY = 1 << 2,
1489 };
1490 
1491 template <chip CHIP>
1492 static void
1493 r3d_setup(struct tu_cmd_buffer *cmd,
1494           struct tu_cs *cs,
1495           enum pipe_format src_format,
1496           enum pipe_format dst_format,
1497           VkImageAspectFlags aspect_mask,
1498           unsigned blit_param,
1499           bool clear,
1500           bool ubwc,
1501           VkSampleCountFlagBits samples)
1502 {
1503    if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
1504       tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
1505    }
1506 
1507    enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
1508    fixup_dst_format(src_format, &dst_format, &fmt);
1509 
1510    if (!cmd->state.pass) {
1511       tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
1512       tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1513    }
1514 
1515    if (!(blit_param & R3D_DST_GMEM)) {
1516       if (CHIP == A6XX) {
1517          tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.buffers_location = BUFFERS_IN_SYSMEM));
1518       } else {
1519          tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL());
1520       }
1521 
1522       tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP, .buffers_location = BUFFERS_IN_SYSMEM));
1523 
1524       if (CHIP >= A7XX) {
1525          tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8812(0x3ff));
1526          tu_cs_emit_regs(cs,
1527             A7XX_RB_UNKNOWN_8E06(cmd->device->physical_device->info->a6xx.magic.RB_UNKNOWN_8E06));
1528       }
1529    }
1530 
1531    enum r3d_type type;
1532    if (clear) {
1533       type = R3D_CLEAR;
1534    } else if ((blit_param & R3D_COPY) && tu_pipe_format_is_float16(src_format)) {
1535       /* Avoid canonicalizing NaNs in copies by using the special half-float
1536        * path that uses half regs.
1537        */
1538       type = R3D_COPY_HALF;
1539    } else {
1540       type = R3D_BLIT;
1541    }
1542 
1543    r3d_common<CHIP>(cmd, cs, type, 1, blit_param & R3D_Z_SCALE, samples);
1544 
1545    tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = 1));
1546    tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1547    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1548    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1549 
1550    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1551    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1552    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
1553    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1554    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1555    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL());
1556    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1557    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1558    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1559 
1560    tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1561                         .color_format = fmt,
1562                         .color_sint = util_format_is_pure_sint(dst_format),
1563                         .color_uint = util_format_is_pure_uint(dst_format)));
1564 
1565    tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1566       .component_enable = aspect_write_mask(dst_format, aspect_mask)));
1567    tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
1568    tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
1569 
1570    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1571    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1572 
1573    if (CHIP >= A7XX)
1574       tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
1575 
1576    tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1577                         A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1578 
1579    /* Disable sample counting in order to not affect occlusion query. */
1580    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
1581 
1582    tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL());
1583    if (CHIP >= A7XX) {
1584       tu_cs_emit_regs(cs, A7XX_SP_DITHER_CNTL());
1585    }
1586 
1587    if (cmd->state.prim_generated_query_running_before_rp) {
1588       tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
1589    }
1590 
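   /* Internal blit/clear draws must not be skipped by the app's conditional
    * rendering, so predication is locally disabled here; r3d_teardown()
    * re-enables it.
    */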
1591    if (cmd->state.predication_active) {
1592       tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1593       tu_cs_emit(cs, 0);
1594    }
1595 }
1596 
1597 static void
1598 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1599 {
1600    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1601    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1602                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1603                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1604    tu_cs_emit(cs, 1); /* instance count */
1605    tu_cs_emit(cs, 2); /* vertex count */
1606 }
1607 
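/* Identical to r3d_run() except that VIS_CULL is USE_VISIBILITY, so the draw
 * respects the binning visibility stream instead of ignoring it.
 */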
1608 static void
1609 r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1610 {
1611    tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1612    tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1613                   CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1614                   CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1615    tu_cs_emit(cs, 1); /* instance count */
1616    tu_cs_emit(cs, 2); /* vertex count */
1617 }
1618 
1619 template <chip CHIP>
1620 static void
1621 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1622 {
1623    if (cmd->state.predication_active) {
1624       tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1625       tu_cs_emit(cs, 1);
1626    }
1627 
1628    /* Re-enable sample counting. */
1629    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
1630 
1631    if (cmd->state.prim_generated_query_running_before_rp) {
1632       tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
1633    }
1634 }
1635 
1636 /* blit ops - common interface for 2d/shader paths */
1637 
1638 struct blit_ops {
1639    void (*coords)(struct tu_cmd_buffer *cmd,
1640                   struct tu_cs *cs,
1641                   const VkOffset2D dst,
1642                   const VkOffset2D src,
1643                   const VkExtent2D extent);
1644    void (*clear_value)(struct tu_cmd_buffer *cmd,
1645                        struct tu_cs *cs,
1646                        enum pipe_format format,
1647                        const VkClearValue *val);
1648    void (*src)(
1649         struct tu_cmd_buffer *cmd,
1650         struct tu_cs *cs,
1651         const struct fdl6_view *iview,
1652         uint32_t layer,
1653         VkFilter filter,
1654         enum pipe_format dst_format);
1655    void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1656                       enum pipe_format format,
1657                       uint64_t va, uint32_t pitch,
1658                       uint32_t width, uint32_t height,
1659                       enum pipe_format dst_format);
1660    void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1661                enum pipe_format src_format);
1662    void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1663    void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1664    void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1665                       enum pipe_format src_format);
1666    void (*setup)(struct tu_cmd_buffer *cmd,
1667                  struct tu_cs *cs,
1668                  enum pipe_format src_format,
1669                  enum pipe_format dst_format,
1670                  VkImageAspectFlags aspect_mask,
1671                  unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1672                  bool clear,
1673                  bool ubwc,
1674                  VkSampleCountFlagBits samples);
1675    void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1676    void (*teardown)(struct tu_cmd_buffer *cmd,
1677                     struct tu_cs *cs);
1678 };
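
/* A typical use of this interface, as in the copy/blit helpers below (an
 * illustrative sketch, not a fixed recipe; some callers emit coords once,
 * others per layer):
 *
 *    ops->setup(cmd, cs, src_format, dst_format, aspect_mask, blit_param,
 *               clear, ubwc, samples);
 *    for (uint32_t i = 0; i < layers; i++) {
 *       ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
 *       ops->dst(cs, &dst, i, src_format);
 *       ops->coords(cmd, cs, dst_offset, src_offset, extent);
 *       ops->run(cmd, cs);
 *    }
 *    ops->teardown(cmd, cs);
 */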
1679 
1680 template <chip CHIP>
1681 static const struct blit_ops r2d_ops = {
1682    .coords = r2d_coords,
1683    .clear_value = r2d_clear_value,
1684    .src = r2d_src<CHIP>,
1685    .src_buffer = r2d_src_buffer<CHIP>,
1686    .dst = r2d_dst<CHIP>,
1687    .dst_depth = r2d_dst_depth,
1688    .dst_stencil = r2d_dst_stencil,
1689    .dst_buffer = r2d_dst_buffer,
1690    .setup = r2d_setup<CHIP>,
1691    .run = r2d_run,
1692    .teardown = r2d_teardown,
1693 };
1694 
1695 template <chip CHIP>
1696 static const struct blit_ops r3d_ops = {
1697    .coords = r3d_coords,
1698    .clear_value = r3d_clear_value,
1699    .src = r3d_src,
1700    .src_buffer = r3d_src_buffer<CHIP>,
1701    .dst = r3d_dst<CHIP>,
1702    .dst_depth = r3d_dst_depth<CHIP>,
1703    .dst_stencil = r3d_dst_stencil<CHIP>,
1704    .dst_buffer = r3d_dst_buffer<CHIP>,
1705    .setup = r3d_setup<CHIP>,
1706    .run = r3d_run,
1707    .teardown = r3d_teardown<CHIP>,
1708 };
1709 
1710 /* passthrough set coords from 3D extents */
1711 static void
1712 coords(const struct blit_ops *ops,
1713        struct tu_cmd_buffer *cmd,
1714        struct tu_cs *cs,
1715        const VkOffset3D dst,
1716        const VkOffset3D src,
1717        const VkExtent3D extent)
1718 {
1719    ops->coords(cmd, cs, (VkOffset2D) {dst.x, dst.y}, (VkOffset2D) {src.x, src.y},
1720                (VkExtent2D) {extent.width, extent.height});
1721 }
1722 
1723 /* Decides the VK format to treat our data as for a memcpy-style blit. We have
1724  * to be a bit careful because we have to pick a format with matching UBWC
1725  * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT for
1726  * everything.
1727  */
1728 static enum pipe_format
1729 copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1730 {
1731    if (vk_format_is_compressed(vk_format)) {
1732       switch (vk_format_get_blocksize(vk_format)) {
1733       case 1: return PIPE_FORMAT_R8_UINT;
1734       case 2: return PIPE_FORMAT_R16_UINT;
1735       case 4: return PIPE_FORMAT_R32_UINT;
1736       case 8: return PIPE_FORMAT_R32G32_UINT;
1737       case 16:return PIPE_FORMAT_R32G32B32A32_UINT;
1738       default:
1739          unreachable("unhandled format size");
1740       }
1741    }
1742 
1743    enum pipe_format format = vk_format_to_pipe_format(vk_format);
1744 
1745    /* For SNORM formats, copy them as the equivalent UNORM format.  If we treat
1746     * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1747     * (also -1.0), when we're supposed to be memcpying the bits. See
1748     * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1749     */
1750    format = util_format_snorm_to_unorm(format);
1751 
1752    switch (format) {
1753    case PIPE_FORMAT_R9G9B9E5_FLOAT:
1754       return PIPE_FORMAT_R32_UINT;
1755 
1756    case PIPE_FORMAT_G8_B8R8_420_UNORM:
1757       if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
1758          return PIPE_FORMAT_R8G8_UNORM;
1759       else
1760          return PIPE_FORMAT_Y8_UNORM;
1761    case PIPE_FORMAT_G8_B8_R8_420_UNORM:
1762       return PIPE_FORMAT_R8_UNORM;
1763 
1764    case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
1765       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1766          return PIPE_FORMAT_S8_UINT;
1767       assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
1768       return PIPE_FORMAT_Z32_FLOAT;
1769 
1770    default:
1771       return format;
1772    }
1773 }
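
/* For example (illustrative): VK_FORMAT_BC1_RGB_UNORM_BLOCK has 8-byte blocks
 * and is copied as PIPE_FORMAT_R32G32_UINT, one texel per block, and
 * VK_FORMAT_R8G8B8A8_SNORM is copied as its UNORM twin so the raw bits are
 * preserved.
 */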
1774 
1775 static void
1776 pack_blit_event_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
1777 {
1778    switch (format) {
1779    case PIPE_FORMAT_Z24X8_UNORM:
1780    case PIPE_FORMAT_Z24_UNORM_S8_UINT:
1781       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1782                        val->depthStencil.stencil << 24;
1783       return;
1784    case PIPE_FORMAT_Z16_UNORM:
1785       clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1786       return;
1787    case PIPE_FORMAT_Z32_FLOAT:
1788       clear_value[0] = fui(val->depthStencil.depth);
1789       return;
1790    case PIPE_FORMAT_S8_UINT:
1791       clear_value[0] = val->depthStencil.stencil;
1792       return;
1793    default:
1794       break;
1795    }
1796 
1797    float tmp[4];
1798    memcpy(tmp, val->color.float32, 4 * sizeof(float));
1799    if (util_format_is_srgb(format)) {
1800       for (int i = 0; i < 3; i++)
1801          tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
1802    }
1803 
1804 #define PACK_F(type) util_format_##type##_pack_rgba_float \
1805    ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
1806    switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
1807    case 4:
1808       PACK_F(r4g4b4a4_unorm);
1809       break;
1810    case 5:
1811       if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
1812          PACK_F(r5g6b5_unorm);
1813       else
1814          PACK_F(r5g5b5a1_unorm);
1815       break;
1816    case 8:
1817       if (util_format_is_snorm(format))
1818          PACK_F(r8g8b8a8_snorm);
1819       else if (util_format_is_unorm(format))
1820          PACK_F(r8g8b8a8_unorm);
1821       else
1822          pack_int8(clear_value, val->color.uint32);
1823       break;
1824    case 10:
1825       if (util_format_is_pure_integer(format))
1826          pack_int10_2(clear_value, val->color.uint32);
1827       else
1828          PACK_F(r10g10b10a2_unorm);
1829       break;
1830    case 11:
1831       clear_value[0] = float3_to_r11g11b10f(val->color.float32);
1832       break;
1833    case 16:
1834       if (util_format_is_snorm(format))
1835          PACK_F(r16g16b16a16_snorm);
1836       else if (util_format_is_unorm(format))
1837          PACK_F(r16g16b16a16_unorm);
1838       else if (util_format_is_float(format))
1839          PACK_F(r16g16b16a16_float);
1840       else
1841          pack_int16(clear_value, val->color.uint32);
1842       break;
1843    case 32:
1844       memcpy(clear_value, val->color.float32, 4 * sizeof(float));
1845       break;
1846    case 0:
1847       assert(format == PIPE_FORMAT_A8_UNORM);
1848       PACK_F(a8_unorm);
1849       break;
1850    default:
1851       unreachable("unexpected channel size");
1852    }
1853 #undef PACK_F
1854 }
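
/* Worked example (illustrative): clearing Z24S8 with depth = 1.0 and
 * stencil = 0xff packs tu_pack_float32_for_unorm(1.0, 24) = 0xffffff into the
 * low 24 bits and the stencil byte into bits 24..31, so
 * clear_value[0] = 0xffffffff.
 */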
1855 
1856 static void
1857 event_blit_setup(struct tu_cs *cs,
1858                  const struct tu_render_pass_attachment *att,
1859                  enum a6xx_blit_event_type blit_event_type,
1860                  uint32_t clear_mask)
1861 {
1862    tu_cs_emit_regs(
1863       cs, A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
1864 
1865    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
1866    tu_cs_emit(cs, 0);
1867 
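   /* .sample_0 makes the blit take sample 0 instead of a filtered average,
    * presumably because averaging is meaningless for integer and
    * depth/stencil formats.
    */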
1868    tu_cs_emit_regs(
1869       cs,
1870       A6XX_RB_BLIT_INFO(.type = blit_event_type,
1871                         .sample_0 =
1872                            vk_format_is_int(att->format) ||
1873                            vk_format_is_depth_or_stencil(att->format),
1874                         .depth = vk_format_is_depth_or_stencil(att->format),
1875                         .clear_mask = clear_mask, ));
1876 }
1877 
1878 struct event_blit_dst_view {
1879    const struct tu_image *image;
1880    const struct fdl6_view *view;
1881 
1882    uint32_t layer;
1883 
1884    uint64_t depth_addr;
1885    uint32_t depth_pitch;
1886 
1887    uint64_t stencil_addr;
1888    uint32_t stencil_pitch;
1889 };
1890 
1891 static event_blit_dst_view
1892 blt_view_from_tu_view(const struct tu_image_view *iview,
1893                       uint32_t layer)
1894 {
1895    struct event_blit_dst_view blt_view;
1896    blt_view.image = iview->image;
1897    blt_view.view = &iview->view;
1898    blt_view.layer = layer;
1899 
1900    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1901       blt_view.depth_addr =
1902          iview->depth_base_addr + iview->depth_layer_size * layer;
1903       blt_view.depth_pitch = iview->depth_pitch;
1904 
1905       blt_view.stencil_addr =
1906          iview->stencil_base_addr + iview->stencil_layer_size * layer;
1907       blt_view.stencil_pitch = iview->stencil_pitch;
1908    }
1909    return blt_view;
1910 }
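
/* Note: depth_addr/stencil_addr (and their pitches) are only initialized for
 * VK_FORMAT_D32_SFLOAT_S8_UINT; event_blit_run() only reads them in that
 * case.
 */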
1911 
1912 template <chip CHIP>
1913 static void
1914 event_blit_run(struct tu_cmd_buffer *cmd,
1915                struct tu_cs *cs,
1916                const struct tu_render_pass_attachment *att,
1917                const event_blit_dst_view *blt_view,
1918                bool separate_stencil)
1919 {
1920    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
1921    if (blt_view->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1922       if (!separate_stencil) {
1923          tu_cs_emit(cs, tu_fdl_view_depth(blt_view->view, RB_BLIT_DST_INFO));
1924          tu_cs_emit_qw(cs, blt_view->depth_addr);
1925          tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(blt_view->depth_pitch).value);
1926 
1927          tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1928          tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1929       } else {
1930          tu_cs_emit(cs, tu_fdl_view_stencil(blt_view->view, RB_BLIT_DST_INFO) &
1931                            ~A6XX_RB_BLIT_DST_INFO_FLAGS);
1932          tu_cs_emit_qw(cs, blt_view->stencil_addr);
1933          tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(blt_view->stencil_pitch).value);
1934       }
1935    } else {
1936       tu_cs_emit(cs, blt_view->view->RB_BLIT_DST_INFO);
1937       tu_cs_image_ref_2d<CHIP>(cs, blt_view->view, blt_view->layer, false);
1938 
1939       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1940       tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1941    }
1942 
1943    if (att) {
1944       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
1945          tu_cs_emit_regs(
1946             cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(
1947                    cmd, att, blt_view->layer)));
1948       } else {
1949          tu_cs_emit_regs(cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(
1950                                 cmd, att, blt_view->layer)));
1951       }
1952    }
1953 
1954    tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
1955 }
1956 
1957 static void
1958 tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
1959                         struct tu_cs *cs,
1960                         enum pipe_format format,
1961                         uint8_t clear_mask,
1962                         bool separate_stencil,
1963                         uint32_t layer,
1964                         const VkClearValue *value,
1965                         uint32_t a)
1966 {
1967    const struct tu_render_pass_attachment *att =
1968       &cmd->state.pass->attachments[a];
1969    const struct tu_image_view *iview = cmd->state.attachments[a];
1970 
1971    uint32_t clear_vals[4] = {};
1972    pack_blit_event_clear_value(value, format, clear_vals);
1973 
1974    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
1975    tu_cs_emit_array(cs, clear_vals, 4);
1976 
1977    event_blit_dst_view blt_view = blt_view_from_tu_view(iview, layer);
1978 
1979    event_blit_setup(cs, att, BLIT_EVENT_CLEAR, clear_mask);
1980    event_blit_run<A7XX>(cmd, cs, att, &blt_view, separate_stencil);
1981 }
1982 
1983 
1984 
1985 /* Copies/fills/updates for buffers go through CCU but need additional
1986  * synchronization when the write range is not aligned to 64 bytes, because
1987  * dst buffer access uses either R8_UNORM or R32_UINT, and the two are not
1988  * coherent with each other in CCU since the format seems to be part of the
1989  * cache key.
1990  *
1991  * See: https://gitlab.khronos.org/vulkan/vulkan/-/issues/3306
1992  *
1993  * Synchronization with writes from UCHE (e.g. SSBO stores) is solved by the
1994  * fact that UCHE has byte-level dirtiness tracking and that a CCU flush
1995  * always happens before a UCHE flush in such cases (e.g. both renderpass and
1996  * dispatch flush pending CCU writes).
1997  *
1998  * Additionally see:
1999  * https://gitlab.khronos.org/vulkan/vulkan/-/issues/3398#note_400111
2000  */
2001 template <chip CHIP>
2002 static void
2003 handle_buffer_unaligned_store(struct tu_cmd_buffer *cmd,
2004                               uint64_t dst_va,
2005                               uint64_t size,
2006                               bool *unaligned_store)
2007 {
2008    if (*unaligned_store)
2009       return;
2010 
2011    if ((dst_va & 63) || (size & 63)) {
2012       tu_flush_for_access(&cmd->state.cache, TU_ACCESS_NONE,
2013                           TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE);
2014       /* Wait for invalidations to land. */
2015       cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
2016       tu_emit_cache_flush<CHIP>(cmd);
2017       *unaligned_store = true;
2018    }
2019 }
2020 
2021 template <chip CHIP>
2022 static void
2023 after_buffer_unaligned_buffer_store(struct tu_cmd_buffer *cmd,
2024                                     bool unaligned_store)
2025 {
2026    if (unaligned_store) {
2027       tu_flush_for_access(&cmd->state.cache,
2028                           TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
2029                           TU_ACCESS_NONE);
2030    }
2031 }
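
/* These two helpers are used as a pair: callers thread one unaligned_store
 * flag through all per-region stores (see tu_CmdCopyImageToBuffer2() below),
 * so the extra flush/wait is emitted at most once per command.
 */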
2032 
2033 template <chip CHIP>
2034 void
2035 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
2036               struct tu_cs *cs,
2037               struct tu_image *image,
2038               const VkClearValue *value)
2039 {
2040    const struct blit_ops *ops = &r2d_ops<CHIP>;
2041 
2042    /* It is assumed that LRZ cache is invalidated at this point for
2043     * the writes here to become visible to LRZ.
2044     *
2045     * LRZ writes go through the UCHE cache, so flush UCHE before changing
2046     * LRZ via CCU. There is no need to invalidate CCU since we are presumably
2047     * writing whole cache lines, which we assume to be 64 bytes.
2048     */
2049    tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_CLEAN);
2050 
2051    ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
2052               VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
2053               VK_SAMPLE_COUNT_1_BIT);
2054    ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
2055    ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
2056                    image->iova + image->lrz_offset,
2057                    image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
2058    ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
2059                (VkExtent2D) { image->lrz_pitch, image->lrz_height });
2060    ops->run(cmd, cs);
2061    ops->teardown(cmd, cs);
2062 
2063    /* Clearing writes via CCU color in the PS stage, and LRZ is read via
2064     * UCHE in the earlier GRAS stage.
2065     */
2066    cmd->state.cache.flush_bits |=
2067       TU_CMD_FLAG_CCU_CLEAN_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
2068       TU_CMD_FLAG_WAIT_FOR_IDLE;
2069 }
2070 TU_GENX(tu6_clear_lrz);
2071 
2072 template <chip CHIP>
2073 void
2074 tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
2075                  struct tu_cs *cs,
2076                  struct tu_image *image)
2077 {
2078    const struct blit_ops *ops = &r2d_ops<CHIP>;
2079    VkClearValue clear = {};
2080    clear.color.uint32[0] = 0xffffffff;
2081 
2082    using LRZFC = fd_lrzfc_layout<CHIP>;
2083    uint64_t lrz_fc_iova = image->iova + image->lrz_fc_offset;
2084    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2085               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2086               VK_SAMPLE_COUNT_1_BIT);
2087    ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear);
2088    ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2089                    lrz_fc_iova + offsetof(LRZFC, fc1),
2090                    sizeof(LRZFC::fc1),
2091                    PIPE_FORMAT_R32_UINT);
2092    ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2093       sizeof(LRZFC::fc1) / sizeof(uint32_t), 1
2094    });
2095    ops->run(cmd, cs);
2096    if constexpr (LRZFC::HAS_BIDIR) {
2097       ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2098                       lrz_fc_iova + offsetof(LRZFC, fc2),
2099                       sizeof(LRZFC::fc2),
2100                       PIPE_FORMAT_R32_UINT);
2101       ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2102          sizeof(LRZFC::fc2) / sizeof(uint32_t), 1
2103       });
2104       ops->run(cmd, cs);
2105    }
2106    ops->teardown(cmd, cs);
2107 }
2108 TU_GENX(tu6_dirty_lrz_fc);
2109 
2110 template<chip CHIP>
2111 static void
2112 tu_image_view_copy_blit(struct fdl6_view *iview,
2113                         struct tu_image *image,
2114                         enum pipe_format format,
2115                         const VkImageSubresourceLayers *subres,
2116                         uint32_t layer,
2117                         bool z_scale)
2118 {
2119    VkImageAspectFlags aspect_mask = subres->aspectMask;
2120 
2121    /* always use the AS_R8G8B8A8 format for these */
2122    if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
2123        format == PIPE_FORMAT_Z24X8_UNORM) {
2124       aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
2125    }
2126 
2127    const struct fdl_layout *layout =
2128       &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
2129 
2130    const struct fdl_view_args args = {
2131       .chip = CHIP,
2132       .iova = image->iova,
2133       .base_miplevel = subres->mipLevel,
2134       .level_count = 1,
2135       .base_array_layer = subres->baseArrayLayer + layer,
2136       .layer_count = 1,
2137       .swiz = {
2138          PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
2139       },
2140       .format = tu_format_for_aspect(format, aspect_mask),
2141       .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
2142       .ubwc_fc_mutable = image->ubwc_fc_mutable,
2143    };
2144    fdl6_view_init(iview, &layout, &args, false);
2145 }
2146 
2147 template<chip CHIP>
2148 static void
2149 tu_image_view_copy(struct fdl6_view *iview,
2150                    struct tu_image *image,
2151                    enum pipe_format format,
2152                    const VkImageSubresourceLayers *subres,
2153                    uint32_t layer)
2154 {
2155    tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2156 }
2157 
2158 template<chip CHIP>
2159 static void
2160 tu_image_view_blit(struct fdl6_view *iview,
2161                    struct tu_image *image,
2162                    const VkImageSubresourceLayers *subres,
2163                    uint32_t layer)
2164 {
2165    enum pipe_format format =
2166       tu6_plane_format(image->vk.format, tu6_plane_index(image->vk.format,
2167                                                          subres->aspectMask));
2168    tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2169 }
2170 
2171 template <chip CHIP>
2172 static void
2173 tu6_blit_image(struct tu_cmd_buffer *cmd,
2174                struct tu_image *src_image,
2175                struct tu_image *dst_image,
2176                const VkImageBlit2 *info,
2177                VkFilter filter)
2178 {
2179    const struct blit_ops *ops = &r2d_ops<CHIP>;
2180    struct tu_cs *cs = &cmd->cs;
2181    bool z_scale = false;
2182    uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
2183 
2184    /* 2D blit can't do rotation mirroring from just coordinates */
2185    static const enum a6xx_rotation rotate[2][2] = {
2186       {ROTATE_0, ROTATE_HFLIP},
2187       {ROTATE_VFLIP, ROTATE_180},
2188    };
2189 
2190    bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
2191                    (info->dstOffsets[1].x < info->dstOffsets[0].x);
2192    bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
2193                    (info->dstOffsets[1].y < info->dstOffsets[0].y);
2194 
2195    int32_t src0_z = info->srcOffsets[0].z;
2196    int32_t src1_z = info->srcOffsets[1].z;
2197 
2198    if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
2199         info->dstOffsets[1].z - info->dstOffsets[0].z) ||
2200        info->srcOffsets[1].z < info->srcOffsets[0].z) {
2201       z_scale = true;
2202    }
2203 
2204    if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
2205       layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
2206       src0_z = info->srcOffsets[1].z;
2207       src1_z = info->srcOffsets[0].z;
2208    }
2209 
2210    if (vk_image_subresource_layer_count(&dst_image->vk, &info->dstSubresource) > 1) {
2211       assert(layers <= 1);
2212       layers = vk_image_subresource_layer_count(&dst_image->vk,
2213                                                 &info->dstSubresource);
2214    }
2215 
2216    /* BC1_RGB_* formats need to have their last component overridden with 1
2217     * when sampling, which is normally handled with the texture descriptor
2218     * swizzle. The 2d path can't handle that, so use the 3d path.
2219     *
2220     * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
2221     * the 2d path.
2222     */
2223 
2224    unsigned blit_param = rotate[mirror_y][mirror_x];
2225    if (dst_image->layout[0].nr_samples > 1 ||
2226        src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
2227        src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
2228        filter == VK_FILTER_CUBIC_EXT ||
2229        z_scale) {
2230       ops = &r3d_ops<CHIP>;
2231       blit_param = z_scale ? R3D_Z_SCALE : 0;
2232    }
2233 
2234    /* use the right format in setup() for D32_S8
2235     * TODO: this probably should use a helper
2236     */
2237    enum pipe_format src_format =
2238       tu6_plane_format(src_image->vk.format,
2239                        tu6_plane_index(src_image->vk.format,
2240                                        info->srcSubresource.aspectMask));
2241    enum pipe_format dst_format =
2242       tu6_plane_format(dst_image->vk.format,
2243                        tu6_plane_index(src_image->vk.format,
2244                                        info->srcSubresource.aspectMask));
2245    trace_start_blit(&cmd->trace, cs,
2246                   ops == &r3d_ops<CHIP>,
2247                   src_image->vk.format,
2248                   dst_image->vk.format,
2249                   layers);
2250 
2251    ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
2252               blit_param, false, dst_image->layout[0].ubwc,
2253               (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2254 
2255    if (ops == &r3d_ops<CHIP>) {
2256       const float coords[] = { info->dstOffsets[0].x, info->dstOffsets[0].y,
2257                                info->srcOffsets[0].x, info->srcOffsets[0].y,
2258                                info->dstOffsets[1].x, info->dstOffsets[1].y,
2259                                info->srcOffsets[1].x, info->srcOffsets[1].y };
2260       r3d_coords_raw(cmd, cs, coords);
2261    } else {
2262       tu_cs_emit_regs(cs,
2263          A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
2264                              .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
2265          A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
2266                              .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
2267       tu_cs_emit_regs(cs,
2268          A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
2269          A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
2270          A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
2271          A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
2272    }
2273 
2274    struct fdl6_view dst, src;
2275    tu_image_view_blit<CHIP>(
2276       &dst, dst_image, &info->dstSubresource,
2277       MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
2278 
2279    if (z_scale) {
2280       tu_image_view_copy_blit<CHIP>(&src, src_image, src_format,
2281                                     &info->srcSubresource, 0, true);
2282       ops->src(cmd, cs, &src, 0, filter, dst_format);
2283    } else {
2284       tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
2285    }
2286 
2287    for (uint32_t i = 0; i < layers; i++) {
2288       if (z_scale) {
2289          float t = ((float) i + 0.5f) / (float) layers;
2290          r3d_coord_z(cmd, cs, t * (src1_z - src0_z) + src0_z);
2291       } else {
2292          ops->src(cmd, cs, &src, i, filter, dst_format);
2293       }
2294       ops->dst(cs, &dst, i, src_format);
2295       ops->run(cmd, cs);
2296    }
2297 
2298    ops->teardown(cmd, cs);
2299 
2300    trace_end_blit(&cmd->trace, cs);
2301 }
2302 
2303 template <chip CHIP>
2304 VKAPI_ATTR void VKAPI_CALL
2305 tu_CmdBlitImage2(VkCommandBuffer commandBuffer,
2306                  const VkBlitImageInfo2 *pBlitImageInfo)
2307 
2308 {
2309    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2310    VK_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
2311    VK_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
2312 
2313    for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
2314       /* can't blit both depth and stencil at once with D32_S8
2315        * TODO: more advanced 3D blit path to support it instead?
2316        */
2317       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
2318           dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2319          VkImageBlit2 region = pBlitImageInfo->pRegions[i];
2320          u_foreach_bit(b, region.dstSubresource.aspectMask) {
2321             region.srcSubresource.aspectMask = BIT(b);
2322             region.dstSubresource.aspectMask = BIT(b);
2323             tu6_blit_image<CHIP>(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
2324          }
2325          continue;
2326       }
2327       tu6_blit_image<CHIP>(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
2328                      pBlitImageInfo->filter);
2329    }
2330 
2331    if (dst_image->lrz_height) {
2332       tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2333    }
2334 }
2335 TU_GENX(tu_CmdBlitImage2);
2336 
2337 static void
2338 copy_compressed(VkFormat format,
2339                 VkOffset3D *offset,
2340                 VkExtent3D *extent,
2341                 uint32_t *width,
2342                 uint32_t *height)
2343 {
2344    if (!vk_format_is_compressed(format))
2345       return;
2346 
2347    uint32_t block_width = vk_format_get_blockwidth(format);
2348    uint32_t block_height = vk_format_get_blockheight(format);
2349 
2350    offset->x /= block_width;
2351    offset->y /= block_height;
2352 
2353    if (extent) {
2354       extent->width = DIV_ROUND_UP(extent->width, block_width);
2355       extent->height = DIV_ROUND_UP(extent->height, block_height);
2356    }
2357    if (width)
2358       *width = DIV_ROUND_UP(*width, block_width);
2359    if (height)
2360       *height = DIV_ROUND_UP(*height, block_height);
2361 }
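
/* For example (illustrative): with BC1's 4x4 blocks, an imageOffset of (8, 4)
 * becomes (2, 1) in block units and a 13x7-texel extent rounds up to 4x2
 * blocks.
 */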
2362 
2363 template <chip CHIP>
2364 static void
2365 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
2366                         struct tu_buffer *src_buffer,
2367                         struct tu_image *dst_image,
2368                         const VkBufferImageCopy2 *info)
2369 {
2370    struct tu_cs *cs = &cmd->cs;
2371    uint32_t layers = MAX2(info->imageExtent.depth,
2372                           vk_image_subresource_layer_count(&dst_image->vk,
2373                                                            &info->imageSubresource));
2374    enum pipe_format src_format =
2375       copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2376    enum pipe_format dst_format =
2377       copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2378    const struct blit_ops *ops = &r2d_ops<CHIP>;
2379 
2380    /* special case for buffer to stencil */
2381    if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2382        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2383       src_format = PIPE_FORMAT_S8_UINT;
2384    }
2385 
2386    /* note: could use "R8_UNORM" when no UBWC */
2387    unsigned blit_param = 0;
2388    if (src_format == PIPE_FORMAT_Y8_UNORM ||
2389        tu_pipe_format_is_float16(src_format)) {
2390       ops = &r3d_ops<CHIP>;
2391       blit_param = R3D_COPY;
2392    }
2393 
2394    VkOffset3D offset = info->imageOffset;
2395    VkExtent3D extent = info->imageExtent;
2396    uint32_t src_width = info->bufferRowLength ?: extent.width;
2397    uint32_t src_height = info->bufferImageHeight ?: extent.height;
2398 
2399    copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2400 
2401    uint32_t pitch = src_width * util_format_get_blocksize(src_format);
2402    uint32_t layer_size = src_height * pitch;
2403 
2404    ops->setup(cmd, cs, src_format, dst_format,
2405               info->imageSubresource.aspectMask, blit_param, false, dst_image->layout[0].ubwc,
2406               (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2407 
2408    struct fdl6_view dst;
2409    tu_image_view_copy<CHIP>(&dst, dst_image, dst_format,
2410                             &info->imageSubresource, offset.z);
2411 
2412    for (uint32_t i = 0; i < layers; i++) {
2413       ops->dst(cs, &dst, i, src_format);
2414 
2415       uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
2416       if ((src_va & 63) || (pitch & 63)) {
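         /* The source base address and pitch apparently must be 64-byte
          * aligned: align the base down and shift the source x coordinate by
          * the same number of texels, copying one row per blit.
          */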
2417          for (uint32_t y = 0; y < extent.height; y++) {
2418             uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
2419             ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
2420                             x + extent.width, 1, dst_format);
2421             ops->coords(cmd, cs, (VkOffset2D) {offset.x, offset.y + y},  (VkOffset2D) {x},
2422                         (VkExtent2D) {extent.width, 1});
2423             ops->run(cmd, cs);
2424             src_va += pitch;
2425          }
2426       } else {
2427          ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height, dst_format);
2428          coords(ops, cmd, cs, offset, (VkOffset3D) {}, extent);
2429          ops->run(cmd, cs);
2430       }
2431    }
2432 
2433    ops->teardown(cmd, cs);
2434 }
2435 
2436 template <chip CHIP>
2437 VKAPI_ATTR void VKAPI_CALL
2438 tu_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
2439                          const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
2440 {
2441    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2442    VK_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
2443    VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
2444 
2445    for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
2446       tu_copy_buffer_to_image<CHIP>(cmd, src_buffer, dst_image,
2447                               pCopyBufferToImageInfo->pRegions + i);
2448 
2449    if (dst_image->lrz_height) {
2450       tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2451    }
2452 }
2453 TU_GENX(tu_CmdCopyBufferToImage2);
2454 
2455 template <chip CHIP>
2456 static void
2457 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
2458                         struct tu_image *src_image,
2459                         struct tu_buffer *dst_buffer,
2460                         const VkBufferImageCopy2 *info,
2461                         bool *unaligned_store)
2462 {
2463    struct tu_cs *cs = &cmd->cs;
2464    uint32_t layers = MAX2(info->imageExtent.depth,
2465                           vk_image_subresource_layer_count(&src_image->vk,
2466                                                            &info->imageSubresource));
2467    enum pipe_format dst_format =
2468       copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2469    enum pipe_format src_format =
2470       copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2471    const struct blit_ops *ops = &r2d_ops<CHIP>;
2472 
2473    if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2474        info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2475       dst_format = PIPE_FORMAT_S8_UINT;
2476    }
2477 
2478    /* note: could use "R8_UNORM" when no UBWC */
2479    unsigned blit_param = 0;
2480    if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2481        tu_pipe_format_is_float16(src_format)) {
2482       ops = &r3d_ops<CHIP>;
2483       blit_param = R3D_COPY;
2484    }
2485 
2486    VkOffset3D offset = info->imageOffset;
2487    VkExtent3D extent = info->imageExtent;
2488    uint32_t dst_width = info->bufferRowLength ?: extent.width;
2489    uint32_t dst_height = info->bufferImageHeight ?: extent.height;
2490 
2491    copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2492 
2493    uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
2494    uint32_t layer_size = pitch * dst_height;
2495 
2496    handle_buffer_unaligned_store<CHIP>(cmd,
2497                                        dst_buffer->iova + info->bufferOffset,
2498                                        layer_size * layers, unaligned_store);
2499 
2500    ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2501               VK_SAMPLE_COUNT_1_BIT);
2502 
2503    struct fdl6_view src;
2504    tu_image_view_copy<CHIP>(&src, src_image, src_format,
2505                             &info->imageSubresource, offset.z);
2506 
2507    for (uint32_t i = 0; i < layers; i++) {
2508       ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2509 
2510       uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
2511       if ((dst_va & 63) || (pitch & 63)) {
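         /* Mirror of the unaligned path in tu_copy_buffer_to_image(): align
          * the dst base down to 64 bytes, offset the dst x coordinate, and
          * emit one row per blit.
          */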
2512          for (uint32_t y = 0; y < extent.height; y++) {
2513             uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
2514             ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
2515             ops->coords(cmd, cs, (VkOffset2D) {x}, (VkOffset2D) {offset.x, offset.y + y},
2516                         (VkExtent2D) {extent.width, 1});
2517             ops->run(cmd, cs);
2518             dst_va += pitch;
2519          }
2520       } else {
2521          ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
2522          coords(ops, cmd, cs, (VkOffset3D) {0, 0}, offset, extent);
2523          ops->run(cmd, cs);
2524       }
2525    }
2526 
2527    ops->teardown(cmd, cs);
2528 }
2529 
2530 template <chip CHIP>
2531 VKAPI_ATTR void VKAPI_CALL
2532 tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
2533                          const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
2534 {
2535    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2536    VK_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
2537    VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
2538 
2539    bool unaligned_store = false;
2540    for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
2541       tu_copy_image_to_buffer<CHIP>(cmd, src_image, dst_buffer,
2542                               pCopyImageToBufferInfo->pRegions + i,
2543                               &unaligned_store);
2544 
2545    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2546 }
2547 TU_GENX(tu_CmdCopyImageToBuffer2);
2548 
2549 /* Tiled formats don't support swapping, which means that we can't support
2550  * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
2551  * formats like B5G5R5A1 have a separate linear-only format when sampling.
2552  * Currently we fake support for tiled swapped formats and use the unswapped
2553  * format instead, but this means that reinterpreting copies to and from
2554  * swapped formats can't be performed correctly unless we can swizzle the
2555  * components by reinterpreting the other image as the "correct" swapped
2556  * format, i.e. only when the other image is linear.
2557  */
2558 
2559 template <chip CHIP>
2560 static bool
2561 is_swapped_format(enum pipe_format format)
2562 {
2563    struct tu_native_format linear = blit_format_texture<CHIP>(format, TILE6_LINEAR, false);
2564    struct tu_native_format tiled = blit_format_texture<CHIP>(format, TILE6_3, false);
2565    return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
2566 }
2567 
2568 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
2569  * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
2570  * versa). This should mirror the logic in fdl6_layout.
2571  */
2572 static bool
2573 image_is_r8g8(struct tu_image *image)
2574 {
2575    return image->layout[0].cpp == 2 &&
2576       vk_format_get_nr_components(image->vk.format) == 2;
2577 }
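
/* e.g. R8G8_UNORM (cpp=2, two components) is R8G8 here while R16_UINT (cpp=2,
 * one component) is not, so a copy between them takes the staging-blit path
 * below.
 */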
2578 
2579 template <chip CHIP>
2580 static void
2581 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
2582                        struct tu_image *src_image,
2583                        struct tu_image *dst_image,
2584                        const VkImageCopy2 *info)
2585 {
2586    const struct blit_ops *ops = &r2d_ops<CHIP>;
2587    struct tu_cs *cs = &cmd->cs;
2588 
2589    if (dst_image->layout[0].nr_samples > 1)
2590       ops = &r3d_ops<CHIP>;
2591 
2592    enum pipe_format format = PIPE_FORMAT_NONE;
2593    VkOffset3D src_offset = info->srcOffset;
2594    VkOffset3D dst_offset = info->dstOffset;
2595    VkExtent3D extent = info->extent;
2596    uint32_t layers_to_copy = MAX2(info->extent.depth,
2597                                   vk_image_subresource_layer_count(&src_image->vk,
2598                                                                    &info->srcSubresource));
2599 
2600    /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
2601     * Images":
2602     *
2603     *    When copying between compressed and uncompressed formats the extent
2604     *    members represent the texel dimensions of the source image and not
2605     *    the destination. When copying from a compressed image to an
2606     *    uncompressed image the image texel dimensions written to the
2607     *    uncompressed image will be source extent divided by the compressed
2608     *    texel block dimensions. When copying from an uncompressed image to a
2609     *    compressed image the image texel dimensions written to the compressed
2610     *    image will be the source extent multiplied by the compressed texel
2611     *    block dimensions.
2612     *
2613     * This means we only have to adjust the extent if the source image is
2614     * compressed.
2615     */
2616    copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
2617    copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
2618 
2619    enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
2620    enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);
2621 
2622    /* note: could use "R8_UNORM" when no UBWC */
2623    unsigned blit_param = 0;
2624    if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2625        src_format == PIPE_FORMAT_Y8_UNORM ||
2626        tu_pipe_format_is_float16(src_format) ||
2627        tu_pipe_format_is_float16(dst_format)) {
2628       ops = &r3d_ops<CHIP>;
2629       blit_param = R3D_COPY;
2630    }
2631 
2632    bool use_staging_blit = false;
2633 
2634    if (src_format == dst_format) {
2635       /* Images that share a format can always be copied directly because it's
2636        * the same as a blit.
2637        */
2638       format = src_format;
2639    } else if (!src_image->layout[0].tile_mode) {
2640       /* If an image is linear, we can always safely reinterpret it with the
2641        * other image's format and then do a regular blit.
2642        */
2643       format = dst_format;
2644    } else if (!dst_image->layout[0].tile_mode) {
2645       format = src_format;
2646    } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
2647       /* We can't currently copy r8g8 images to/from other cpp=2 images,
2648        * due to the different tile layout.
2649        */
2650       use_staging_blit = true;
2651    } else if (is_swapped_format<CHIP>(src_format) ||
2652               is_swapped_format<CHIP>(dst_format)) {
2653       /* If either format has a non-identity swap, then we can't copy
2654        * to/from it.
2655        */
2656       use_staging_blit = true;
2657    } else if (!src_image->layout[0].ubwc) {
2658       format = dst_format;
2659    } else if (!dst_image->layout[0].ubwc) {
2660       format = src_format;
2661    } else {
2662       /* Both formats use UBWC and so neither can be reinterpreted.
2663        * TODO: We could do an in-place decompression of the dst instead.
2664        */
2665       perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
2666       use_staging_blit = true;
2667    }
2668 
2669    struct fdl6_view dst, src;
2670 
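   /* Staging path: blit src into a linear, non-UBWC scratch buffer using the
    * source format, flush, then blit from the scratch buffer into dst with
    * the data reinterpreted in the destination format.
    */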
2671    if (use_staging_blit) {
2672       tu_image_view_copy<CHIP>(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
2673       tu_image_view_copy<CHIP>(&src, src_image, src_format, &info->srcSubresource, src_offset.z);
2674 
2675       struct fdl_layout staging_layout = { 0 };
2676       VkOffset3D staging_offset = { 0 };
2677 
2678       staging_layout.tile_mode = TILE6_LINEAR;
2679       staging_layout.ubwc = false;
2680 
2681       uint32_t layer_count =
2682          vk_image_subresource_layer_count(&src_image->vk,
2683                                           &info->srcSubresource);
2684       fdl6_layout(&staging_layout,
2685                   src_format,
2686                   src_image->layout[0].nr_samples,
2687                   extent.width,
2688                   extent.height,
2689                   extent.depth,
2690                   1,
2691                   layer_count,
2692                   extent.depth > 1,
2693                   NULL);
2694 
2695       struct tu_bo *staging_bo;
2696       VkResult result = tu_get_scratch_bo(cmd->device,
2697                                           staging_layout.size,
2698                                           &staging_bo);
2699       if (result != VK_SUCCESS) {
2700          vk_command_buffer_set_error(&cmd->vk, result);
2701          return;
2702       }
2703 
2704       struct fdl6_view staging;
2705       const struct fdl_layout *staging_layout_ptr = &staging_layout;
2706       const struct fdl_view_args copy_to_args = {
2707          .chip = CHIP,
2708          .iova = staging_bo->iova,
2709          .base_miplevel = 0,
2710          .level_count = 1,
2711          .base_array_layer = 0,
2712          .layer_count = layer_count,
2713          .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2714          .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
2715          .type = FDL_VIEW_TYPE_2D,
2716          .ubwc_fc_mutable = false,
2717       };
2718       fdl6_view_init(&staging, &staging_layout_ptr, &copy_to_args, false);
2719 
2720       ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2721                  (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2722       coords(ops, cmd, cs, staging_offset, src_offset, extent);
2723 
2724       for (uint32_t i = 0; i < layers_to_copy; i++) {
2725          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
2726          ops->dst(cs, &staging, i, src_format);
2727          ops->run(cmd, cs);
2728       }
2729 
2730       /* If the application performed this staging copy itself, a pipeline
2731        * barrier would be required here; since we do it internally, we must
2732        * emit the equivalent cache flushes ourselves. */
2733       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
2734       tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
2735       tu_cs_emit_wfi(cs);
2736 
2737       const struct fdl_view_args copy_from_args = {
2738          .chip = CHIP,
2739          .iova = staging_bo->iova,
2740          .base_miplevel = 0,
2741          .level_count = 1,
2742          .base_array_layer = 0,
2743          .layer_count = layer_count,
2744          .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2745          .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
2746          .type = FDL_VIEW_TYPE_2D,
2747          .ubwc_fc_mutable = false,
2748       };
2749       fdl6_view_init(&staging, &staging_layout_ptr, &copy_from_args, false);
2750 
2751       ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
2752                  blit_param, false, dst_image->layout[0].ubwc,
2753                  (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2754       coords(ops, cmd, cs, dst_offset, staging_offset, extent);
2755 
2756       for (uint32_t i = 0; i < layers_to_copy; i++) {
2757          ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
2758          ops->dst(cs, &dst, i, dst_format);
2759          ops->run(cmd, cs);
2760       }
2761    } else {
2762       tu_image_view_copy<CHIP>(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
2763       tu_image_view_copy<CHIP>(&src, src_image, format, &info->srcSubresource, src_offset.z);
2764 
2765       ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
2766                  blit_param, false, dst_image->layout[0].ubwc,
2767                  (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2768       coords(ops, cmd, cs, dst_offset, src_offset, extent);
2769 
2770       for (uint32_t i = 0; i < layers_to_copy; i++) {
2771          ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
2772          ops->dst(cs, &dst, i, format);
2773          ops->run(cmd, cs);
2774       }
2775    }
2776 
2777    ops->teardown(cmd, cs);
2778 }
2779 
2780 template <chip CHIP>
2781 VKAPI_ATTR void VKAPI_CALL
2782 tu_CmdCopyImage2(VkCommandBuffer commandBuffer,
2783                  const VkCopyImageInfo2 *pCopyImageInfo)
2784 {
2785    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2786    VK_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
2787    VK_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
2788 
2789    for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
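      /* D32_SFLOAT_S8_UINT is stored as separate depth and stencil planes,
       * so each requested aspect has to be copied separately.
       */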
2790       if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2791          VkImageCopy2 info = pCopyImageInfo->pRegions[i];
2792          u_foreach_bit(b, info.dstSubresource.aspectMask) {
2793             info.srcSubresource.aspectMask = BIT(b);
2794             info.dstSubresource.aspectMask = BIT(b);
2795             tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image, &info);
2796          }
2797          continue;
2798       }
2799 
2800       tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image,
2801                              pCopyImageInfo->pRegions + i);
2802    }
2803 
2804    if (dst_image->lrz_height) {
2805       tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2806    }
2807 }
2808 TU_GENX(tu_CmdCopyImage2);
2809 
2810 template <chip CHIP>
2811 static void
2812 copy_buffer(struct tu_cmd_buffer *cmd,
2813             uint64_t dst_va,
2814             uint64_t src_va,
2815             uint64_t size,
2816             uint32_t block_size,
2817             bool *unaligned_store)
2818 {
2819    const struct blit_ops *ops = &r2d_ops<CHIP>;
2820    struct tu_cs *cs = &cmd->cs;
2821    enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
2822    uint64_t blocks = size / block_size;
2823 
2824    handle_buffer_unaligned_store<CHIP>(cmd, dst_va, size, unaligned_store);
2825 
2826    ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
2827               VK_SAMPLE_COUNT_1_BIT);
2828 
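   /* Blit base addresses are programmed with 64-byte alignment: the low bits
    * of each VA become an x offset measured in blocks instead. A single pass
    * covers at most 0x4000 blocks, the maximum blit width.
    */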
2829    while (blocks) {
2830       uint32_t src_x = (src_va & 63) / block_size;
2831       uint32_t dst_x = (dst_va & 63) / block_size;
2832       uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
2833 
2834       ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
2835       ops->dst_buffer(     cs, format, dst_va & ~63, 0, format);
2836       ops->coords(cmd, cs, (VkOffset2D) {dst_x}, (VkOffset2D) {src_x}, (VkExtent2D) {width, 1});
2837       ops->run(cmd, cs);
2838 
2839       src_va += width * block_size;
2840       dst_va += width * block_size;
2841       blocks -= width;
2842    }
2843 
2844    ops->teardown(cmd, cs);
2845 }
2846 
2847 template <chip CHIP>
2848 VKAPI_ATTR void VKAPI_CALL
2849 tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
2850                   const VkCopyBufferInfo2 *pCopyBufferInfo)
2851 {
2852    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2853    VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
2854    VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
2855 
2856    bool unaligned_store = false;
2857    for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
2858       const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
2859       copy_buffer<CHIP>(cmd,
2860                   dst_buffer->iova + region->dstOffset,
2861                   src_buffer->iova + region->srcOffset,
2862                   region->size, 1, &unaligned_store);
2863    }
2864 
2865    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2866 }
2867 TU_GENX(tu_CmdCopyBuffer2);
2868 
2869 template <chip CHIP>
2870 VKAPI_ATTR void VKAPI_CALL
2871 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
2872                    VkBuffer dstBuffer,
2873                    VkDeviceSize dstOffset,
2874                    VkDeviceSize dataSize,
2875                    const void *pData)
2876 {
2877    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2878    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2879 
2880    struct tu_cs_memory tmp;
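   /* Stage the data in scratch memory, allocated in 64-byte (16-dword)
    * chunks so copy_buffer() sees a 64-byte-aligned source VA.
    */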
2881    VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
2882    if (result != VK_SUCCESS) {
2883       vk_command_buffer_set_error(&cmd->vk, result);
2884       return;
2885    }
2886 
2887    bool unaligned_store = false;
2888    memcpy(tmp.map, pData, dataSize);
2889    copy_buffer<CHIP>(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4, &unaligned_store);
2890 
2891    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2892 }
2893 TU_GENX(tu_CmdUpdateBuffer);
2894 
2895 template <chip CHIP>
2896 VKAPI_ATTR void VKAPI_CALL
2897 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
2898                  VkBuffer dstBuffer,
2899                  VkDeviceSize dstOffset,
2900                  VkDeviceSize fillSize,
2901                  uint32_t data)
2902 {
2903    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2904    VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2905    const struct blit_ops *ops = &r2d_ops<CHIP>;
2906    struct tu_cs *cs = &cmd->cs;
2907 
2908    fillSize = vk_buffer_range(&buffer->vk, dstOffset, fillSize);
2909 
2910    uint64_t dst_va = buffer->iova + dstOffset;
2911    uint32_t blocks = fillSize / 4;
2912 
2913    bool unaligned_store = false;
2914    handle_buffer_unaligned_store<CHIP>(cmd, dst_va, fillSize, &unaligned_store);
2915 
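   /* The fill is emitted as a 2D clear of an R32_UINT "image" aliasing the
    * buffer, using the same 64-byte base-address alignment trick as
    * copy_buffer().
    */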
2916    ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2917               VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2918               VK_SAMPLE_COUNT_1_BIT);
2919 
2920    VkClearValue clear_val = {};
2921    clear_val.color.uint32[0] = data;
2922    ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear_val);
2923 
2924    while (blocks) {
2925       uint32_t dst_x = (dst_va & 63) / 4;
2926       uint32_t width = MIN2(blocks, 0x4000 - dst_x);
2927 
2928       ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT);
2929       ops->coords(cmd, cs, (VkOffset2D) {dst_x}, blt_no_coord, (VkExtent2D) {width, 1});
2930       ops->run(cmd, cs);
2931 
2932       dst_va += width * 4;
2933       blocks -= width;
2934    }
2935 
2936    ops->teardown(cmd, cs);
2937 
2938    after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2939 }
2940 TU_GENX(tu_CmdFillBuffer);
2941 
2942 template <chip CHIP>
2943 VKAPI_ATTR void VKAPI_CALL
2944 tu_CmdResolveImage2(VkCommandBuffer commandBuffer,
2945                     const VkResolveImageInfo2 *pResolveImageInfo)
2946 {
2947    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2948    VK_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
2949    VK_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
2950    const struct blit_ops *ops = &r2d_ops<CHIP>;
2951    struct tu_cs *cs = &cmd->cs;
2952 
2953    enum pipe_format src_format =
2954       vk_format_to_pipe_format(src_image->vk.format);
2955    enum pipe_format dst_format =
2956       vk_format_to_pipe_format(dst_image->vk.format);
2957    ops->setup(cmd, cs, src_format, dst_format,
2958               VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
2959               VK_SAMPLE_COUNT_1_BIT);
2960 
2961    for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
2962       const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
2963       uint32_t layers = MAX2(info->extent.depth,
2964                              vk_image_subresource_layer_count(&dst_image->vk,
2965                                                               &info->dstSubresource));
2966 
2967       /* TODO: do aspect masks need special handling here? */
2968 
2969       coords(ops, cmd, cs, info->dstOffset, info->srcOffset, info->extent);
2970 
2971       struct fdl6_view dst, src;
2972       tu_image_view_blit<CHIP>(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
2973       tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffset.z);
2974 
2975       for (uint32_t layer = 0; layer < layers; layer++) {
2976          ops->src(cmd, cs, &src, layer, VK_FILTER_NEAREST, dst_format);
2977          ops->dst(cs, &dst, layer, src_format);
2978          ops->run(cmd, cs);
2979       }
2980    }
2981 
2982    ops->teardown(cmd, cs);
2983 }
2984 TU_GENX(tu_CmdResolveImage2);
2985 
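/* Iterate `layer` over [0, layers), or, when layer_mask is non-zero (i.e.
 * multiview is in use), over the set bits of layer_mask only.
 */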
2986 #define for_each_layer(layer, layer_mask, layers) \
2987    for (uint32_t layer = 0; \
2988         layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
2989         layer++) \
2990       if (!layer_mask || (layer_mask & BIT(layer)))
2991 
2992 template <chip CHIP>
2993 static void
2994 resolve_sysmem(struct tu_cmd_buffer *cmd,
2995                struct tu_cs *cs,
2996                VkFormat vk_src_format,
2997                VkFormat vk_dst_format,
2998                const struct tu_image_view *src,
2999                const struct tu_image_view *dst,
3000                uint32_t layer_mask,
3001                uint32_t layers,
3002                const VkRect2D *rect,
3003                bool src_separate_ds,
3004                bool dst_separate_ds)
3005 {
3006    const struct blit_ops *ops = &r2d_ops<CHIP>;
3007 
3008    trace_start_sysmem_resolve(&cmd->trace, cs, vk_dst_format);
3009 
3010    enum pipe_format src_format = vk_format_to_pipe_format(vk_src_format);
3011    enum pipe_format dst_format = vk_format_to_pipe_format(vk_dst_format);
3012 
3013    ops->setup(cmd, cs, src_format, dst_format,
3014               VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
3015               VK_SAMPLE_COUNT_1_BIT);
3016    ops->coords(cmd, cs, rect->offset, rect->offset, rect->extent);
3017 
3018    for_each_layer(i, layer_mask, layers) {
3019       if (src_separate_ds) {
3020          if (vk_src_format == VK_FORMAT_D32_SFLOAT || vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3021             r2d_src_depth<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3022          } else {
3023             r2d_src_stencil<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3024          }
3025       } else {
3026          ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
3027       }
3028 
3029       if (dst_separate_ds) {
3030          if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3031             ops->dst_depth(cs, dst, i);
3032          } else {
3033             ops->dst_stencil(cs, dst, i);
3034          }
3035       } else {
3036          ops->dst(cs, &dst->view, i, src_format);
3037       }
3038 
3039       ops->run(cmd, cs);
3040    }
3041 
3042    ops->teardown(cmd, cs);
3043 
3044    trace_end_sysmem_resolve(&cmd->trace, cs);
3045 }
3046 
3047 template <chip CHIP>
3048 void
3049 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
3050                   struct tu_cs *cs,
3051                   const struct tu_image_view *src,
3052                   const struct tu_image_view *dst,
3053                   uint32_t layer_mask,
3054                   uint32_t layers,
3055                   const VkRect2D *rect)
3056 {
3057    assert(src->image->vk.format == dst->image->vk.format ||
3058           (vk_format_is_depth_or_stencil(src->image->vk.format) &&
3059            vk_format_is_depth_or_stencil(dst->image->vk.format)));
3060 
3061    bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3062    bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3063 
3064    if (dst_separate_ds) {
3065       resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
3066                      src, dst, layer_mask, layers, rect,
3067                      src_separate_ds, dst_separate_ds);
3068       resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
3069                      src, dst, layer_mask, layers, rect,
3070                      src_separate_ds, dst_separate_ds);
3071    } else {
3072       resolve_sysmem<CHIP>(cmd, cs, src->image->vk.format, dst->image->vk.format,
3073                      src, dst, layer_mask, layers, rect,
3074                      src_separate_ds, dst_separate_ds);
3075    }
3076 }
3077 TU_GENX(tu_resolve_sysmem);
3078 
3079 template <chip CHIP>
3080 static void
3081 clear_image_cp_blit(struct tu_cmd_buffer *cmd,
3082                     struct tu_image *image,
3083                     const VkClearValue *clear_value,
3084                     const VkImageSubresourceRange *range,
3085                     VkImageAspectFlags aspect_mask)
3086 {
3087    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3088    uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3089    struct tu_cs *cs = &cmd->cs;
3090    enum pipe_format format;
3091    if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
3092       format = PIPE_FORMAT_R32_UINT;
3093    } else {
3094       format = tu6_plane_format(image->vk.format,
3095                                 tu6_plane_index(image->vk.format,
3096                                                 aspect_mask));
3097    }
3098 
3099    if (image->layout[0].depth0 > 1) {
3100       assert(layer_count == 1);
3101       assert(range->baseArrayLayer == 0);
3102    }
3103 
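   /* As with image copies, multisampled images can't go through the 2D
    * blitter and use the draw-based 3D path.
    */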
3104    const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops<CHIP> : &r2d_ops<CHIP>;
3105 
3106    ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
3107               (VkSampleCountFlagBits) image->layout[0].nr_samples);
3108    if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
3109       ops->clear_value(cmd, cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
3110    else
3111       ops->clear_value(cmd, cs, format, clear_value);
3112 
3113    for (unsigned j = 0; j < level_count; j++) {
3114       if (image->layout[0].depth0 > 1)
3115          layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
3116 
3117       ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
3118                      u_minify(image->layout[0].width0, range->baseMipLevel + j),
3119                      u_minify(image->layout[0].height0, range->baseMipLevel + j)
3120                   });
3121 
3122       struct fdl6_view dst;
3123       const VkImageSubresourceLayers subresource = {
3124          .aspectMask = aspect_mask,
3125          .mipLevel = range->baseMipLevel + j,
3126          .baseArrayLayer = range->baseArrayLayer,
3127          .layerCount = 1,
3128       };
3129       tu_image_view_copy_blit<CHIP>(&dst, image, format, &subresource, 0, false);
3130 
3131       for (uint32_t i = 0; i < layer_count; i++) {
3132          ops->dst(cs, &dst, i, format);
3133          ops->run(cmd, cs);
3134       }
3135    }
3136 
3137    ops->teardown(cmd, cs);
3138 }
3139 
3140 static void
3141 clear_image_event_blit(struct tu_cmd_buffer *cmd,
3142                        struct tu_image *image,
3143                        const VkClearValue *clear_value,
3144                        const VkImageSubresourceRange *range,
3145                        VkImageAspectFlags aspect_mask)
3146 {
3147    uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3148    uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3149    VkFormat vk_format = image->vk.format;
3150    if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3151       if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
3152          vk_format = VK_FORMAT_S8_UINT;
3153       else
3154          vk_format = VK_FORMAT_D32_SFLOAT;
3155    }
3156 
3157    enum pipe_format format = vk_format_to_pipe_format(vk_format);
3158 
3159    if (image->layout[0].depth0 > 1) {
3160       assert(layer_count == 1);
3161       assert(range->baseArrayLayer == 0);
3162    }
3163 
3164    struct tu_cs *cs = &cmd->cs;
3165 
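   /* Generic "event blit" clear: program RB_BLIT_* state directly and
    * trigger the clear per layer with a blit event rather than going through
    * the 2D/3D blitter. This path bypasses the CCU, which is why callers
    * invalidate the relevant caches around it.
    */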
3166    tu_cs_emit_regs(cs,
3167                    A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_SYSMEM));
3168 
3169    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
3170    tu_cs_emit(cs, 0);
3171 
3172    tu_cs_emit_regs(
3173       cs, A6XX_RB_BLIT_INFO(
3174                 .type = BLIT_EVENT_CLEAR,
3175                 .sample_0 = vk_format_is_int(vk_format) ||
3176                             vk_format_is_depth_or_stencil(vk_format),
3177                 .depth = vk_format_is_depth_or_stencil(vk_format),
3178                 .clear_mask = aspect_write_mask_generic_clear(format, aspect_mask)));
3179 
3180    uint32_t clear_vals[4] = {};
3181    pack_blit_event_clear_value(clear_value, format, clear_vals);
3182    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
3183    tu_cs_emit_array(cs, clear_vals, 4);
3184 
3185    for (unsigned level = 0; level < level_count; level++) {
3186       if (image->layout[0].depth0 > 1)
3187          layer_count =
3188             u_minify(image->layout[0].depth0, range->baseMipLevel + level);
3189 
3190       uint32_t width =
3191          u_minify(image->layout[0].width0, range->baseMipLevel + level);
3192       uint32_t height =
3193          u_minify(image->layout[0].height0, range->baseMipLevel + level);
3194       tu_cs_emit_regs(
3195          cs, A6XX_RB_BLIT_SCISSOR_TL(.x = 0, .y = 0),
3196          A6XX_RB_BLIT_SCISSOR_BR(.x = width - 1, .y = height - 1));
3197 
3198       struct fdl6_view dst;
3199       const VkImageSubresourceLayers subresource = {
3200          .aspectMask = aspect_mask,
3201          .mipLevel = range->baseMipLevel + level,
3202          .baseArrayLayer = range->baseArrayLayer,
3203          .layerCount = 1,
3204       };
3205       tu_image_view_copy_blit<A7XX>(&dst, image, format, &subresource, 0, false);
3206 
3207       for (uint32_t layer = 0; layer < layer_count; layer++) {
3208 
3209          struct event_blit_dst_view blt_view = {
3210             .image = image,
3211             .view = &dst,
3212             .layer = layer,
3213          };
3214 
3215          if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3216             uint32_t real_level = range->baseMipLevel + level;
3217             uint32_t real_layer = range->baseArrayLayer + layer;
3218             if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
3219                struct fdl_layout *layout = &image->layout[0];
3220                blt_view.depth_addr =
3221                   image->iova +
3222                   fdl_surface_offset(layout, real_level, real_layer);
3223                blt_view.depth_pitch = fdl_pitch(layout, real_level);
3224             } else {
3225                struct fdl_layout *layout = &image->layout[1];
3226                blt_view.stencil_addr =
3227                   image->iova +
3228                   fdl_surface_offset(layout, real_level, real_layer);
3229                blt_view.stencil_pitch = fdl_pitch(layout, real_level);
3230             }
3231          }
3232 
3233          event_blit_run<A7XX>(cmd, cs, NULL, &blt_view,
3234                               aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT);
3235       }
3236    }
3237 }
3238 
3239 static bool
3240 use_generic_clear_for_image_clear(struct tu_cmd_buffer *cmd,
3241                                   struct tu_image *image)
3242 {
3243    return cmd->device->physical_device->info->a7xx.has_generic_clear &&
3244           /* A7XX supports R9G9B9E5_FLOAT as color attachment and supports
3245            * generic clears for it. A7XX TODO: allow R9G9B9E5_FLOAT
3246            * attachments.
3247            */
3248           image->vk.format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32;
3249 }
3250 
3251 template <chip CHIP>
3252 static void
3253 clear_image(struct tu_cmd_buffer *cmd,
3254             struct tu_image *image,
3255             const VkClearValue *clear_value,
3256             const VkImageSubresourceRange *range,
3257             VkImageAspectFlags aspect_mask)
3258 {
3259    if (use_generic_clear_for_image_clear(cmd, image)) {
3260       clear_image_event_blit(cmd, image, clear_value, range, aspect_mask);
3261    } else {
3262       clear_image_cp_blit<CHIP>(cmd, image, clear_value, range, aspect_mask);
3263    }
3264 }
3265 
3266 template <chip CHIP>
3267 VKAPI_ATTR void VKAPI_CALL
3268 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
3269                       VkImage image_h,
3270                       VkImageLayout imageLayout,
3271                       const VkClearColorValue *pColor,
3272                       uint32_t rangeCount,
3273                       const VkImageSubresourceRange *pRanges)
3274 {
3275    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3276    VK_FROM_HANDLE(tu_image, image, image_h);
3277 
3278    if (use_generic_clear_for_image_clear(cmd, image)) {
3279       /* Generic clear doesn't go through CCU (or other caches). */
3280       cmd->state.cache.flush_bits |=
3281          TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_WAIT_FOR_IDLE;
3282       tu_emit_cache_flush<CHIP>(cmd);
3283    }
3284 
3285    for (unsigned i = 0; i < rangeCount; i++) {
3286       clear_image<CHIP>(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
3287    }
3288 }
3289 TU_GENX(tu_CmdClearColorImage);
3290 
3291 template <chip CHIP>
3292 VKAPI_ATTR void VKAPI_CALL
3293 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
3294                              VkImage image_h,
3295                              VkImageLayout imageLayout,
3296                              const VkClearDepthStencilValue *pDepthStencil,
3297                              uint32_t rangeCount,
3298                              const VkImageSubresourceRange *pRanges)
3299 {
3300    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3301    VK_FROM_HANDLE(tu_image, image, image_h);
3302 
3303    if (use_generic_clear_for_image_clear(cmd, image)) {
3304       /* Generic clear doesn't go through CCU (or other caches). */
3305       cmd->state.cache.flush_bits |= TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
3306                                      TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
3307                                      TU_CMD_FLAG_WAIT_FOR_IDLE;
3308       tu_emit_cache_flush<CHIP>(cmd);
3309    }
3310 
3311    for (unsigned i = 0; i < rangeCount; i++) {
3312       const VkImageSubresourceRange *range = &pRanges[i];
3313 
3314       if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3315       /* We can't clear both depth and stencil at once, so split the aspect mask. */
3316          u_foreach_bit(b, range->aspectMask)
3317             clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
3318          continue;
3319       }
3320 
3321       clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
3322    }
3323 
3324    tu_lrz_clear_depth_image<CHIP>(cmd, image, pDepthStencil, rangeCount, pRanges);
3325 }
3326 TU_GENX(tu_CmdClearDepthStencilImage);
3327 
3328 template <chip CHIP>
3329 static void
3330 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
3331                             uint32_t attachment_count,
3332                             const VkClearAttachment *attachments,
3333                             uint32_t rect_count,
3334                             const VkClearRect *rects)
3335 {
3336    /* The shader path here is special: it avoids changing MRT/etc. state. */
3337    const struct tu_subpass *subpass = cmd->state.subpass;
3338    const uint32_t mrt_count = subpass->color_count;
3339    struct tu_cs *cs = &cmd->draw_cs;
3340    uint32_t clear_value[MAX_RTS][4];
3341    float z_clear_val = 0.0f;
3342    uint8_t s_clear_val = 0;
3343    uint32_t clear_rts = 0, clear_components = 0;
3344    bool z_clear = false;
3345    bool s_clear = false;
3346 
3347    trace_start_sysmem_clear_all(&cmd->trace, cs, mrt_count, rect_count);
3348 
3349    for (uint32_t i = 0; i < attachment_count; i++) {
3350       uint32_t a;
3351       if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3352          uint32_t c = attachments[i].colorAttachment;
3353          a = subpass->color_attachments[c].attachment;
3354          if (a == VK_ATTACHMENT_UNUSED)
3355             continue;
3356 
3357          clear_rts |= 1 << c;
3358          clear_components |= 0xf << (c * 4);
3359          memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
3360       } else {
3361          a = subpass->depth_stencil_attachment.attachment;
3362          if (a == VK_ATTACHMENT_UNUSED)
3363             continue;
3364 
3365          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3366             z_clear = true;
3367             z_clear_val = attachments[i].clearValue.depthStencil.depth;
3368          }
3369 
3370          if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3371             s_clear = true;
3372             s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
3373          }
3374       }
3375    }
3376 
3377    /* We may not know the multisample count if there are no attachments, so
3378     * just bail early to avoid corner cases later.
3379     */
3380    if (clear_rts == 0 && !z_clear && !s_clear)
3381       return;
3382 
3383    /* Disable all draw states so they don't interfere.
3384     * TODO: use and re-use draw states.
3385     * We have to disable the draw states individually in order to preserve
3386     * the input attachment states, because a secondary command buffer
3387     * won't be able to restore them.
3388     */
3389    tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3390    for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
3391       if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
3392           i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
3393          continue;
3394       tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
3395                      CP_SET_DRAW_STATE__0_DISABLE);
3396       tu_cs_emit_qw(cs, 0);
3397    }
3398    cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
3399 
3400    tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
3401    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
3402                   A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
3403                   0xfc000000);
3404    tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
3405 
3406    r3d_common<CHIP>(cmd, cs, R3D_CLEAR, clear_rts, false, cmd->state.subpass->samples);
3407 
3408    /* Disable sample counting so as not to affect occlusion queries. */
3409    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
3410 
3411    if (cmd->state.prim_generated_query_running_before_rp) {
3412       tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
3413    }
3414 
3415    tu_cs_emit_regs(cs,
3416                    A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
3417    tu_cs_emit_regs(cs,
3418                    A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
3419 
3420    tu_cs_emit_regs(cs,
3421                    A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
3422 
3423    tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
3424    tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
3425    for (uint32_t i = 0; i < mrt_count; i++) {
3426       tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
3427             .component_enable = COND(clear_rts & (1 << i), 0xf)));
3428    }
3429 
3430    tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
3431    tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
3432 
3433    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
3434    tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3435          .z_test_enable = z_clear,
3436          .z_write_enable = z_clear,
3437          .zfunc = FUNC_ALWAYS));
3438    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(z_clear));
3439    tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
3440    tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
3441          .stencil_enable = s_clear,
3442          .func = FUNC_ALWAYS,
3443          .zpass = STENCIL_REPLACE));
3444    tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(s_clear));
3445    tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
3446    tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
3447    tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
3448 
3449    tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2));
3450 
3451    unsigned num_rts = util_bitcount(clear_rts);
3452    uint32_t packed_clear_value[MAX_RTS][4];
3453 
3454    uint32_t idx = 0;
3455    u_foreach_bit(b, clear_rts) {
3456       memcpy(&packed_clear_value[idx], &clear_value[b], 4 * sizeof(uint32_t));
3457       idx++;
3458    }
3459 
3460    if (num_rts > 0)
3461       tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER,
3462                                 0, packed_clear_value, num_rts);
3463 
3464    for (uint32_t i = 0; i < rect_count; i++) {
3465       /* This should be true because of this valid usage for
3466        * vkCmdClearAttachments:
3467        *
3468        *    "If the render pass instance this is recorded in uses multiview,
3469        *    then baseArrayLayer must be zero and layerCount must be one"
3470        */
3471       assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
3472 
3473       /* a630 doesn't support multiview masks, which means that we can't use
3474        * the normal multiview path without potentially recompiling a shader
3475        * on-demand or using a more complicated variant that takes the mask as
3476        * a const. Just use the layered path instead, since it shouldn't be
3477        * much worse.
3478        */
3479       for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount)
3480       {
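         /* Raw coords are two (x, y, z, w) corners: z carries the depth
          * clear value, and the first corner's w carries the destination
          * layer index, passed through uif() so its integer bit pattern
          * survives as a float vertex constant.
          */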
3481          const float coords[] = {
3482             rects[i].rect.offset.x,
3483             rects[i].rect.offset.y,
3484             z_clear_val,
3485             uif(rects[i].baseArrayLayer + layer),
3486             rects[i].rect.offset.x + rects[i].rect.extent.width,
3487             rects[i].rect.offset.y + rects[i].rect.extent.height,
3488             z_clear_val,
3489             1.0f,
3490          };
3491 
3492          r3d_coords_raw(cmd, cs, coords);
3493          r3d_run_vis(cmd, cs);
3494       }
3495    }
3496 
3497    /* Re-enable sample counting. */
3498    tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
3499 
3500    if (cmd->state.prim_generated_query_running_before_rp) {
3501       tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
3502    }
3503 
3504    trace_end_sysmem_clear_all(&cmd->trace, cs);
3505 }
3506 
3507 template <chip CHIP>
3508 static void
3509 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
3510                       struct tu_cs *cs,
3511                       enum pipe_format format,
3512                       uint8_t clear_mask,
3513                       uint32_t gmem_offset,
3514                       const VkClearValue *value)
3515 {
3516    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
3517    tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(
3518             blit_base_format<CHIP>(format, false, true)));
3519 
3520    tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.type = BLIT_EVENT_CLEAR,
3521                                          .clear_mask = clear_mask));
3522 
3523    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
3524    tu_cs_emit(cs, gmem_offset);
3525 
3526    tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
3527    tu_cs_emit(cs, 0);
3528 
3529    uint32_t clear_vals[4] = {};
3530    pack_blit_event_clear_value(value, format, clear_vals);
3531 
3532    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
3533    tu_cs_emit_array(cs, clear_vals, 4);
3534 
3535    tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
3536 }
3537 
3538 template <chip CHIP>
3539 static void
3540 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
3541                               struct tu_cs *cs,
3542                               uint32_t attachment,
3543                               uint32_t base_layer,
3544                               uint32_t layers,
3545                               uint32_t layer_mask,
3546                               VkImageAspectFlags mask,
3547                               const VkClearValue *value)
3548 {
3549    const struct tu_render_pass_attachment *att =
3550       &cmd->state.pass->attachments[attachment];
3551 
3552    trace_start_gmem_clear(&cmd->trace, cs, att->format, att->samples);
3553 
3554    tu_cs_emit_regs(cs,
3555                    A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
3556 
3557    enum pipe_format format = vk_format_to_pipe_format(att->format);
3558    for_each_layer(i, layer_mask, layers) {
3559       uint32_t layer = i + base_layer;
3560       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3561          if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3562             clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf,
3563                                   tu_attachment_gmem_offset(cmd, att, layer), value);
3564          }
3565          if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3566             clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf,
3567                                   tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
3568          }
3569       } else {
3570          clear_gmem_attachment<CHIP>(cmd, cs, format, aspect_write_mask(format, mask),
3571                                tu_attachment_gmem_offset(cmd, att, layer), value);
3572       }
3573    }
3574 
3575    tu_flush_for_access(&cmd->state.renderpass_cache, TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
3576 
3577    trace_end_gmem_clear(&cmd->trace, cs);
3578 }
3579 
3580 template <chip CHIP>
3581 static void
3582 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
3583                           uint32_t attachment_count,
3584                           const VkClearAttachment *attachments,
3585                           uint32_t rect_count,
3586                           const VkClearRect *rects)
3587 {
3588    const struct tu_subpass *subpass = cmd->state.subpass;
3589    struct tu_cs *cs = &cmd->draw_cs;
3590 
3591    if (rect_count > 1)
3592       perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
3593 
3594    for (unsigned i = 0; i < rect_count; i++) {
3595       unsigned x1 = rects[i].rect.offset.x;
3596       unsigned y1 = rects[i].rect.offset.y;
3597       unsigned x2 = x1 + rects[i].rect.extent.width - 1;
3598       unsigned y2 = y1 + rects[i].rect.extent.height - 1;
3599 
3600       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
3601       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
3602       tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
3603 
3604       for (unsigned j = 0; j < attachment_count; j++) {
3605          uint32_t a;
3606          if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
3607             a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
3608          else
3609             a = subpass->depth_stencil_attachment.attachment;
3610 
3611          if (a == VK_ATTACHMENT_UNUSED)
3612             continue;
3613 
3614          tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, rects[i].baseArrayLayer,
3615                                        rects[i].layerCount,
3616                                        subpass->multiview_mask,
3617                                        attachments[j].aspectMask,
3618                                        &attachments[j].clearValue);
3619       }
3620    }
3621 }
3622 
3623 template <chip CHIP>
3624 static void
3625 tu_clear_attachments(struct tu_cmd_buffer *cmd,
3626                      uint32_t attachmentCount,
3627                      const VkClearAttachment *pAttachments,
3628                      uint32_t rectCount,
3629                      const VkClearRect *pRects)
3630 {
3631    struct tu_cs *cs = &cmd->draw_cs;
3632 
3633    /* The sysmem path behaves like a draw. Note that we have no way to emit
3634     * different flushes for sysmem vs. GMEM, so this needs to be outside of
3635     * the cond_exec. */
3636    tu_emit_cache_flush_renderpass<CHIP>(cmd);
3637 
3638    /* vkCmdClearAttachments is supposed to respect the predicate if active. The
3639     * easiest way to do this is to always use the 3d path, which always works
3640     * even with GMEM because it's just a simple draw using the existing
3641     * attachment state.
3642     *
3643     * Similarly, we also use the 3D path when in a secondary command buffer that
3644     * doesn't know the GMEM layout that will be chosen by the primary.
3645     */
3646    if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
3647       tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3648       return;
3649    }
3650 
3651    /* If tile loads/stores could be skipped based on which draws intersect
3652     * them at binning time, then emit the clear as a 3D draw so that it
3653     * contributes to that visibility result.
3654     */
3655    const struct tu_subpass *subpass = cmd->state.subpass;
3656    for (uint32_t i = 0; i < attachmentCount; i++) {
3657       uint32_t a;
3658       if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3659          uint32_t c = pAttachments[i].colorAttachment;
3660          a = subpass->color_attachments[c].attachment;
3661       } else {
3662          a = subpass->depth_stencil_attachment.attachment;
3663       }
3664       if (a != VK_ATTACHMENT_UNUSED) {
3665          const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
3666          if (att->cond_load_allowed || att->cond_store_allowed) {
3667             tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3668             return;
3669          }
3670       }
3671    }
3672 
3673    /* Otherwise, emit 2D blits for gmem rendering. */
3674    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
3675    tu_clear_gmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3676    tu_cond_exec_end(cs);
3677 
3678    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
3679    tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3680    tu_cond_exec_end(cs);
3681 }
3682 
3683 static void
3684 tu7_clear_attachment_generic_single_rect(
3685    struct tu_cmd_buffer *cmd,
3686    struct tu_cs *cs,
3687    const struct tu_render_pass_attachment *att,
3688    const VkClearAttachment *clear_att,
3689    uint32_t a,
3690    const VkClearRect *rect)
3691 {
3692    const struct tu_subpass *subpass = cmd->state.subpass;
3693    unsigned x1 = rect->rect.offset.x;
3694    unsigned y1 = rect->rect.offset.y;
3695    unsigned x2 = x1 + rect->rect.extent.width - 1;
3696    unsigned y2 = y1 + rect->rect.extent.height - 1;
3697 
3698    tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
3699    tu_cs_emit(cs,
3700               A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
3701    tu_cs_emit(cs,
3702               A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
3703 
3704    auto value = &clear_att->clearValue;
3705 
3706    enum pipe_format format = vk_format_to_pipe_format(att->format);
3707    for_each_layer(i, subpass->multiview_mask, rect->layerCount) {
3708       uint32_t layer = i + rect->baseArrayLayer;
3709       uint32_t mask =
3710          aspect_write_mask_generic_clear(format, clear_att->aspectMask);
3711 
3712       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3713          if (clear_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3714             tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_Z32_FLOAT, mask,
3715                                     false, layer, value, a);
3716          }
3717          if (clear_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3718             tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_S8_UINT, mask, true,
3719                                     layer, value, a);
3720          }
3721       } else {
3722          tu7_generic_layer_clear(cmd, cs, format, mask, false, layer, value, a);
3723       }
3724    }
3725 }
3726 
3727 static void
3728 tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
3729                              uint32_t attachmentCount,
3730                              const VkClearAttachment *pAttachments,
3731                              uint32_t rectCount,
3732                              const VkClearRect *pRects)
3733 {
3734    struct tu_cs *cs = &cmd->draw_cs;
3735 
3736    uint32_t clear_aspects = 0;
3737    for (uint32_t i = 0; i < attachmentCount; i++) {
3738       clear_aspects |= pAttachments[i].aspectMask;
3739    }
3740 
3741    /* Generic clear doesn't go through the CCU (or other caches), so we
3742     * have to flush (clean+invalidate) the corresponding caches.
3743     */
3744    tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
3745    if (clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
3746       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
3747       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_COLOR).value);
3748    }
3749    if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3750       tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
3751       tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_DEPTH).value);
3752    }
3753    tu_cs_emit_wfi(cs);
3754    tu_cond_exec_end(cs);
3755 
3756    const struct tu_subpass *subpass = cmd->state.subpass;
3757    for (uint32_t i = 0; i < attachmentCount; i++) {
3758       uint32_t a;
3759       if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3760          uint32_t c = pAttachments[i].colorAttachment;
3761          a = subpass->color_attachments[c].attachment;
3762       } else {
3763          a = subpass->depth_stencil_attachment.attachment;
3764       }
3765       if (a != VK_ATTACHMENT_UNUSED) {
3766          const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
3767          const struct tu_image_view *iview = cmd->state.attachments[a];
3768          trace_start_generic_clear(&cmd->trace, cs, att->format,
3769                                    iview->view.ubwc_enabled, att->samples);
3770          for (unsigned j = 0; j < rectCount; j++) {
3771             tu7_clear_attachment_generic_single_rect(
3772                cmd, cs, att, &pAttachments[i], a, &pRects[j]);
3773          }
3774          trace_end_generic_clear(&cmd->trace, cs);
3775       }
3776    }
3777 }
3778 
3779 template <chip CHIP>
3780 VKAPI_ATTR void VKAPI_CALL
3781 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
3782                        uint32_t attachmentCount,
3783                        const VkClearAttachment *pAttachments,
3784                        uint32_t rectCount,
3785                        const VkClearRect *pRects)
3786 {
3787    VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3788 
3789    for (uint32_t j = 0; j < attachmentCount; j++) {
3790       if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
3791          continue;
3792 
3793       tu_lrz_disable_during_renderpass<CHIP>(cmd);
3794    }
3795 
3796    if (cmd->device->physical_device->info->a7xx.has_generic_clear &&
3797        /* Both predication and an unknown GMEM layout could be handled by
3798         * CS patching, which is exactly what the proprietary driver does.
3799         * We don't implement it because we don't expect a meaningful benefit.
3800         */
3801        !(cmd->state.predication_active ||
3802          cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT)) {
3803       tu_clear_attachments_generic(cmd, attachmentCount, pAttachments, rectCount, pRects);
3804    } else {
3805       tu_clear_attachments<CHIP>(cmd, attachmentCount, pAttachments,
3806                                  rectCount, pRects);
3807    }
3808 }
3809 TU_GENX(tu_CmdClearAttachments);
3810 
3811 template <chip CHIP>
3812 static void
3813 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
3814                         struct tu_cs *cs,
3815                         VkFormat vk_format,
3816                         VkImageAspectFlags clear_mask,
3817                         uint32_t a,
3818                         bool separate_ds)
3819 {
3820    enum pipe_format format = vk_format_to_pipe_format(vk_format);
3821    const struct tu_framebuffer *fb = cmd->state.framebuffer;
3822    const struct tu_image_view *iview = cmd->state.attachments[a];
3823    const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
3824    const struct blit_ops *ops = &r2d_ops<CHIP>;
3825    const VkClearValue *value = &cmd->state.clear_values[a];
3826    if (cmd->state.pass->attachments[a].samples > 1)
3827       ops = &r3d_ops<CHIP>;
3828 
3829    trace_start_sysmem_clear(&cmd->trace, cs, vk_format, ops == &r3d_ops<CHIP>,
3830                             cmd->state.pass->attachments[a].samples);
3831 
3832    ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
3833               cmd->state.pass->attachments[a].samples);
3834    ops->coords(cmd, cs, cmd->state.render_area.offset, (VkOffset2D) {},
3835                cmd->state.render_area.extent);
3836    ops->clear_value(cmd, cs, format, value);
3837 
3838    for_each_layer(i, clear_views, fb->layers) {
3839       if (separate_ds) {
3840          if (vk_format == VK_FORMAT_D32_SFLOAT) {
3841             ops->dst_depth(cs, iview, i);
3842          } else {
3843             ops->dst_stencil(cs, iview, i);
3844          }
3845       } else {
3846          ops->dst(cs, &iview->view, i, format);
3847       }
3848       ops->run(cmd, cs);
3849    }
3850 
3851    ops->teardown(cmd, cs);
3852 
3853    trace_end_sysmem_clear(&cmd->trace, cs);
3854 }
3855 
3856 template <chip CHIP>
3857 void
3858 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
3859                            struct tu_cs *cs,
3860                            uint32_t a)
3861 {
3862    const struct tu_render_pass_attachment *attachment =
3863       &cmd->state.pass->attachments[a];
3864 
3865    if (!attachment->clear_mask)
3866       return;
3867 
3868    if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3869       if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3870          clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
3871                                  a, true);
3872       }
3873       if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3874          clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
3875                                  a, true);
3876       }
3877    } else {
3878       clear_sysmem_attachment<CHIP>(cmd, cs, attachment->format, attachment->clear_mask,
3879                               a, false);
3880    }
3881 
3882    /* The spec doesn't explicitly say, but presumably the initial renderpass
3883     * clear is considered part of the renderpass, and therefore barriers
3884     * aren't required inside the subpass/renderpass.  This means we need to
3885     * flush CCU color into CCU depth here, just like with
3886     * vkCmdClearAttachments(). Note that because this only happens at the
3887     * beginning of a renderpass, and renderpass writes are considered
3888     * "incoherent", we shouldn't have to worry about syncing depth into color
3889     * beforehand as depth should already be flushed.
3890     */
3891    if (vk_format_is_depth_or_stencil(attachment->format)) {
3892       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
3893       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
3894       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
3895    } else {
3896       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
3897       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
3898    }
3899 
3900    tu_cs_emit_wfi(cs);
3901 }
3902 TU_GENX(tu_clear_sysmem_attachment);
3903 
3904 template <chip CHIP>
3905 void
3906 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
3907                          struct tu_cs *cs,
3908                          uint32_t a)
3909 {
3910    const struct tu_render_pass_attachment *attachment =
3911       &cmd->state.pass->attachments[a];
3912 
3913    if (!attachment->clear_mask)
3914       return;
3915 
3916    tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, 0, cmd->state.framebuffer->layers,
3917                                  attachment->clear_views,
3918                                  attachment->clear_mask,
3919                                  &cmd->state.clear_values[a]);
3920 }
3921 TU_GENX(tu_clear_gmem_attachment);
3922 
3923 void
3924 tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
3925 {
3926    const struct tu_render_pass_attachment *att =
3927       &cmd->state.pass->attachments[a];
3928    const VkClearValue *value = &cmd->state.clear_values[a];
3929    const struct tu_image_view *iview = cmd->state.attachments[a];
3930 
3931    trace_start_generic_clear(&cmd->trace, cs, att->format,
3932                              iview->view.ubwc_enabled, att->samples);
3933 
3934    enum pipe_format format = vk_format_to_pipe_format(att->format);
3935    for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
3936       uint32_t layer = i;
3937       uint32_t mask =
3938          aspect_write_mask_generic_clear(format, att->clear_mask);
3939       if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3940          if (att->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3941             tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_Z32_FLOAT, mask,
3942                                     false, layer, value, a);
3943          }
3944          if (att->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3945             tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_S8_UINT, mask, true,
3946                                     layer, value, a);
3947          }
3948       } else {
3949          tu7_generic_layer_clear(cmd, cs, format, mask, false, layer, value, a);
3950       }
3951    }
3952 
3953    tu_flush_for_access(&cmd->state.renderpass_cache,
3954                        TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
3955 
3956    trace_end_generic_clear(&cmd->trace, cs);
3957 }
3958 
3959 template <chip CHIP>
3960 static void
3961 tu_emit_blit(struct tu_cmd_buffer *cmd,
3962              struct tu_cs *cs,
3963              const struct tu_image_view *iview,
3964              const struct tu_render_pass_attachment *attachment,
3965              const VkClearValue *clear_value,
3966              enum a6xx_blit_event_type blit_event_type,
3967              bool separate_stencil)
3968 {
3969    assert(blit_event_type != BLIT_EVENT_CLEAR);
3970    uint32_t clear_mask = 0;
3971 
3972    /* BLIT_EVENT_STORE_AND_CLEAR would presumably swallow the
3973     * BLIT_EVENT_CLEAR at the start of a renderpass, and be more efficient.
3974     */
3975    if (blit_event_type == BLIT_EVENT_STORE && clear_value &&
3976        attachment->clear_mask &&
3977        use_generic_clear_for_image_clear(cmd, iview->image)) {
3978       blit_event_type = BLIT_EVENT_STORE_AND_CLEAR;
3979 
3980       enum pipe_format format = vk_format_to_pipe_format(attachment->format);
3981       VkImageAspectFlags aspect_mask = attachment->clear_mask;
3982       if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
3983          if (separate_stencil)
3984             aspect_mask = VK_IMAGE_ASPECT_STENCIL_BIT;
3985          else
3986             aspect_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
3987       }
3988       if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
3989          if (separate_stencil)
3990             format = PIPE_FORMAT_S8_UINT;
3991          else
3992             format = PIPE_FORMAT_Z32_FLOAT;
3993       }
3994 
3995       clear_mask = aspect_write_mask_generic_clear(format, aspect_mask);
3996 
3997       uint32_t clear_vals[4] = {};
3998       pack_blit_event_clear_value(clear_value, format, clear_vals);
3999 
4000       tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4001       tu_cs_emit_array(cs, clear_vals, 4);
4002    }
4003 
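   /* Program the blit event state once (format, buffer, clear mask), then run
    * it below for every layer selected by clear_views.
    */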
4004    event_blit_setup(cs, attachment, blit_event_type, clear_mask);
4005 
4006    for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
4007       event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
4008       event_blit_run<CHIP>(cmd, cs, attachment, &blt_view, separate_stencil);
4009    }
4010 
4011    tu_flush_for_access(&cmd->state.cache, TU_ACCESS_BLIT_WRITE_GMEM,
4012                        TU_ACCESS_NONE);
4013 }
4014 
4015 static bool
4016 blit_can_resolve(VkFormat format)
4017 {
4018    const struct util_format_description *desc = vk_format_description(format);
4019 
4020    /* The blit event can only do a resolve for simple cases:
4021     * averaging samples as unsigned integers, or choosing only one sample.
4022     * Note this is allowed for SRGB formats, but the results differ from a 2D draw resolve.
4023     */
4024    if (vk_format_is_snorm(format))
4025       return false;
4026 
4027    /* Can't do formats with larger channel sizes.
4028     * Note: this includes all float formats.
4029     * Note2: single-channel integer formats seem OK.
4030     */
4031    if (desc->channel[0].size > 10 && vk_format_is_color(format))
4032       return false;
4033 
4034    switch (format) {
4035    /* For unknown reasons the blit event can't MSAA-resolve these formats when tiled,
4036     * likely because these formats have a different layout from other cpp=2 formats.
4037     */
4038    case VK_FORMAT_R8G8_UNORM:
4039    case VK_FORMAT_R8G8_UINT:
4040    case VK_FORMAT_R8G8_SINT:
4041    case VK_FORMAT_R8G8_SRGB:
4042       return false;
4043    default:
4044       break;
4045    }
4046 
4047    return true;
4048 }
4049 
4050 struct apply_load_coords_state {
4051    unsigned view;
4052 };
4053 
4054 static void
4055 fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
4056                       struct tu_cs *cs,
4057                       void *data,
4058                       VkRect2D bin,
4059                       unsigned views,
4060                       VkExtent2D *frag_areas)
4061 {
4062    const struct apply_load_coords_state *state =
4063       (const struct apply_load_coords_state *)data;
4064    assert(state->view < views);
4065    VkExtent2D frag_area = frag_areas[state->view];
4066 
4067    assert(bin.extent.width % frag_area.width == 0);
4068    assert(bin.extent.height % frag_area.height == 0);
4069    uint32_t scaled_width = bin.extent.width / frag_area.width;
4070    uint32_t scaled_height = bin.extent.height / frag_area.height;
4071 
4072    const float coords[] = {
4073       bin.offset.x,                    bin.offset.y,
4074       bin.offset.x,                    bin.offset.y,
4075       bin.offset.x + scaled_width,     bin.offset.y + scaled_height,
4076       bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
4077    };
4078    r3d_coords_raw(cmd, cs, coords);
4079 }
4080 
4081 template <chip CHIP>
4082 static void
4083 load_3d_blit(struct tu_cmd_buffer *cmd,
4084              struct tu_cs *cs,
4085              const struct tu_image_view *iview,
4086              const struct tu_render_pass_attachment *att,
4087              bool separate_stencil)
4088 {
4089    const struct tu_framebuffer *fb = cmd->state.framebuffer;
4090    enum pipe_format format = iview->view.format;
4091    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4092       if (separate_stencil)
4093          format = PIPE_FORMAT_S8_UINT;
4094       else
4095          format = PIPE_FORMAT_Z32_FLOAT;
4096    }
4097    r3d_setup<CHIP>(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT,
4098                    R3D_DST_GMEM, false, iview->view.ubwc_enabled,
4099                    iview->image->vk.samples);
4100 
4101    if (!cmd->state.pass->has_fdm) {
4102       r3d_coords(cmd, cs, (VkOffset2D) { 0, 0 }, (VkOffset2D) { 0, 0 },
4103                  (VkExtent2D) { fb->width, fb->height });
4104    }
4105 
4106    /* Normal loads read directly from system memory, so we have to invalidate
4107     * UCHE in case it contains stale data.
4108     */
4109    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4110 
4111    /* Wait for CACHE_INVALIDATE to land */
4112    tu_cs_emit_wfi(cs);
4113 
4114    for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4115       if (cmd->state.pass->has_fdm) {
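         /* With FDM the final coordinates depend on the per-bin fragment area,
          * so emit a patchpoint that fdm_apply_load_coords fills in for each bin.
          */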
4116          struct apply_load_coords_state state = {
4117             .view = att->clear_views ? i : 0,
4118          };
4119          tu_create_fdm_bin_patchpoint(cmd, cs, 4, fdm_apply_load_coords, state);
4120       }
4121 
4122       r3d_dst_gmem<CHIP>(cmd, cs, iview, att, separate_stencil, i);
4123 
4124       if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4125          if (separate_stencil)
4126             r3d_src_stencil(cmd, cs, iview, i);
4127          else
4128             r3d_src_depth(cmd, cs, iview, i);
4129       } else {
4130          r3d_src_gmem_load(cmd, cs, iview, i);
4131       }
4132 
4133       r3d_run(cmd, cs);
4134    }
4135 
4136    r3d_teardown<CHIP>(cmd, cs);
4137 
4138    /* It seems we need to WFI here for depth/stencil because color writes here
4139     * aren't synchronized with depth/stencil writes.
4140     *
4141     * Note: the blob also uses a WFI for color attachments but this hasn't
4142     * been seen to be necessary.
4143     */
4144    if (vk_format_is_depth_or_stencil(att->format))
4145       tu_cs_emit_wfi(cs);
4146 }
4147 
4148 static void
4149 tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4150                               struct tu_cs *cs, bool load)
4151 {
4152    tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
4153 
4154    if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4155       return;
4156 
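   /* Count taken GMEM loads/stores for the gmem-ops debug log. Since this
    * CP_MEM_TO_MEM is emitted inside the predicated region, it only executes
    * when the load/store itself is taken.
    */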
4157    uint64_t result_iova;
4158    if (load)
4159       result_iova = global_iova(cmd, dbg_gmem_taken_loads);
4160    else
4161       result_iova = global_iova(cmd, dbg_gmem_taken_stores);
4162 
4163    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4164    tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4165    tu_cs_emit_qw(cs, result_iova);
4166    tu_cs_emit_qw(cs, result_iova);
4167    tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4168 }
4169 
4170 static void
4171 tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4172                             struct tu_cs *cs, bool load)
4173 {
4174    tu_cond_exec_end(cs);
4175 
4176    if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4177       return;
4178 
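   /* Unlike the "taken" counter above, this one is emitted after the
    * conditional region ends, so it counts every load/store and gives a
    * taken-vs-total ratio for the debug log.
    */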
4179    uint64_t result_iova;
4180    if (load)
4181       result_iova = global_iova(cmd, dbg_gmem_total_loads);
4182    else
4183       result_iova = global_iova(cmd, dbg_gmem_total_stores);
4184 
4185    tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4186    tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4187    tu_cs_emit_qw(cs, result_iova);
4188    tu_cs_emit_qw(cs, result_iova);
4189    tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4190 }
4191 
4192 template <chip CHIP>
4193 void
4194 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
4195                         struct tu_cs *cs,
4196                         uint32_t a,
4197                         bool cond_exec_allowed,
4198                         bool force_load)
4199 {
4200    const struct tu_image_view *iview = cmd->state.attachments[a];
4201    const struct tu_render_pass_attachment *attachment =
4202       &cmd->state.pass->attachments[a];
4203 
4204    bool load_common = attachment->load || force_load;
4205    bool load_stencil =
4206       attachment->load_stencil ||
4207       (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
4208 
4209    if (!load_common && !load_stencil)
4210       return;
4211 
4212    trace_start_gmem_load(&cmd->trace, cs, attachment->format, force_load);
4213 
4214    /* If the attachment will be cleared by vkCmdClearAttachments, it is likely
4215     * to be only partially cleared, and since that clear is done by a 2D blit
4216     * it doesn't produce geometry, so we have to load unconditionally.
4217     *
4218     * To simplify the conditions, treat a partially cleared separate DS as fully
4219     * cleared and don't emit the cond_exec.
4220     */
4221    bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
4222    if (cond_exec)
4223       tu_begin_load_store_cond_exec(cmd, cs, true);
4224 
4225    if (TU_DEBUG(3D_LOAD) ||
4226        cmd->state.pass->has_fdm) {
4227       if (load_common || load_stencil)
4228          tu_disable_draw_states(cmd, cs);
4229 
4230       if (load_common)
4231          load_3d_blit<CHIP>(cmd, cs, iview, attachment, false);
4232 
4233       if (load_stencil)
4234          load_3d_blit<CHIP>(cmd, cs, iview, attachment, true);
4235    } else {
4236       if (load_common)
4237          tu_emit_blit<CHIP>(cmd, cs, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
4238 
4239       if (load_stencil)
4240          tu_emit_blit<CHIP>(cmd, cs, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
4241    }
4242 
4243    if (cond_exec)
4244       tu_end_load_store_cond_exec(cmd, cs, true);
4245 
4246    trace_end_gmem_load(&cmd->trace, cs);
4247 }
4248 TU_GENX(tu_load_gmem_attachment);
4249 
4250 template <chip CHIP>
4251 static void
4252 store_cp_blit(struct tu_cmd_buffer *cmd,
4253               struct tu_cs *cs,
4254               const struct tu_image_view *iview,
4255               uint32_t samples,
4256               bool separate_stencil,
4257               enum pipe_format src_format,
4258               enum pipe_format dst_format,
4259               uint32_t layer,
4260               uint32_t gmem_offset,
4261               uint32_t cpp)
4262 {
4263    r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format,
4264                           VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
4265                           iview->view.ubwc_enabled, true);
4266 
4267    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4268       if (!separate_stencil) {
4269          r2d_dst_depth(cs, iview, layer);
4270       } else {
4271          r2d_dst_stencil(cs, iview, layer);
4272       }
4273    } else {
4274       r2d_dst<CHIP>(cs, &iview->view, layer, src_format);
4275    }
4276 
4277    enum a6xx_format fmt = blit_format_texture<CHIP>(src_format, TILE6_2, true).fmt;
4278    fixup_src_format(&src_format, dst_format, &fmt);
4279 
4280    tu_cs_emit_regs(cs,
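   /* The 2D source is the attachment's GMEM copy: the base address is the
    * GMEM base plus this attachment's offset, and the pitch is one GMEM tile
    * row, i.e. tile0.width * cpp (e.g. a 256-pixel-wide tile at 4 B/px gives
    * a 1024-byte pitch).
    */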
4281                    SP_PS_2D_SRC_INFO(CHIP,
4282                       .color_format = fmt,
4283                       .tile_mode = TILE6_2,
4284                       .color_swap = WZYX,
4285                       .srgb = util_format_is_srgb(src_format),
4286                       .samples = tu_msaa_samples(samples),
4287                       .samples_average = !util_format_is_pure_integer(dst_format) &&
4288                                          !util_format_is_depth_or_stencil(dst_format),
4289                       .unk20 = 1,
4290                       .unk22 = 1),
4291                    SP_PS_2D_SRC_SIZE(CHIP, .width = iview->vk.extent.width, .height = iview->vk.extent.height),
4292                    SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
4293                    SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
4294 
4295    /* sync GMEM writes with CACHE. */
4296    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4297    if (CHIP >= A7XX)
4298       /* On A7XX, we need to wait for any CP_EVENT_WRITE::BLIT operations
4299        * arising from GMEM load/clears to land before we can continue.
4300        */
4301       tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
4302 
4303    /* Wait for cache event to land */
4304    tu_cs_emit_wfi(cs);
4305 
4306    r2d_run(cmd, cs);
4307 
4308    /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4309     * sysmem, and we generally assume that GMEM renderpasses leave their
4310     * results in sysmem, so we need to flush manually here.
4311     */
4312    tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4313 }
4314 
4315 template <chip CHIP>
4316 static void
4317 store_3d_blit(struct tu_cmd_buffer *cmd,
4318               struct tu_cs *cs,
4319               const struct tu_image_view *iview,
4320               VkSampleCountFlagBits dst_samples,
4321               bool separate_stencil,
4322               enum pipe_format src_format,
4323               enum pipe_format dst_format,
4324               const VkRect2D *render_area,
4325               uint32_t layer,
4326               uint32_t gmem_offset,
4327               uint32_t cpp)
4328 {
4329    /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
4330     * aren't set until we know whether we're HW binning or not, and we want to
4331     * avoid a dependence on that here to be able to store attachments before
4332     * the end of the renderpass in the future. Use the scratch space to
4333     * save/restore them dynamically.
4334     */
4335    tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4336    tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4337                   CP_REG_TO_SCRATCH_0_SCRATCH(0) |
4338                   CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4339    if (CHIP >= A7XX) {
4340       tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4341       tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4342                      CP_REG_TO_SCRATCH_0_SCRATCH(1) |
4343                      CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4344    }
4345 
4346    r3d_setup<CHIP>(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT,
4347                    0, false, iview->view.ubwc_enabled, dst_samples);
4348 
4349    r3d_coords(cmd, cs, render_area->offset, render_area->offset, render_area->extent);
4350 
4351    if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4352       if (!separate_stencil) {
4353          r3d_dst_depth<CHIP>(cs, iview, layer);
4354       } else {
4355          r3d_dst_stencil<CHIP>(cs, iview, layer);
4356       }
4357    } else {
4358       r3d_dst<CHIP>(cs, &iview->view, layer, src_format);
4359    }
4360 
4361    r3d_src_gmem<CHIP>(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);
4362 
4363    /* sync GMEM writes with CACHE. */
4364    tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4365 
4366    /* Wait for CACHE_INVALIDATE to land */
4367    tu_cs_emit_wfi(cs);
4368 
4369    r3d_run(cmd, cs);
4370 
4371    r3d_teardown<CHIP>(cmd, cs);
4372 
4373    /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4374     * sysmem, and we generally assume that GMEM renderpasses leave their
4375     * results in sysmem, so we need to flush manually here. The 3d blit path
4376     * writes to depth images as a color RT, so there's no need to flush depth.
4377     */
4378    tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4379 
4380    /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above. */
4381    tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4382    tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4383                   CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4384                   CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4385 
4386    tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4387    tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
4388                   CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4389                   CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4390 
4391    if (CHIP >= A7XX) {
4392       tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4393       tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4394                         CP_SCRATCH_TO_REG_0_SCRATCH(1) |
4395                         CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4396    }
4397 }
4398 
4399 static bool
4400 tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
4401 {
4402    struct tu_physical_device *phys_dev = cmd->device->physical_device;
4403    const struct tu_image_view *iview = cmd->state.attachments[a];
4404    const VkRect2D *render_area = &cmd->state.render_area;
4405 
4406    /* Unaligned store is incredibly rare in CTS; we have to force it to test it. */
4407    if (TU_DEBUG(UNALIGNED_STORE))
4408       return true;
4409 
4410    /* We always use the unaligned store path when scaling rendering. */
4411    if (cmd->state.pass->has_fdm)
4412       return true;
4413 
4414    uint32_t x1 = render_area->offset.x;
4415    uint32_t y1 = render_area->offset.y;
4416    uint32_t x2 = x1 + render_area->extent.width;
4417    uint32_t y2 = y1 + render_area->extent.height;
4418    /* x2/y2 can be unaligned if equal to the size of the image, since it will
4419     * write into padding space. The one exception is linear levels which don't
4420     * have the required y padding in the layout (except for the last level)
4421     */
4422    bool need_y2_align =
4423       y2 != iview->view.height || iview->view.need_y2_align;
4424 
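   /* x1/y1 must always be aligned; x2/y2 only when they don't coincide with
    * the image edge (see need_y2_align above). For example, if gmem_align_w
    * were 16, a render area starting at x = 8 would force the unaligned
    * store path.
    */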
4425    return (x1 % phys_dev->info->gmem_align_w ||
4426            (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
4427            y1 % phys_dev->info->gmem_align_h ||
4428            (y2 % phys_dev->info->gmem_align_h && need_y2_align));
4429 }
4430 
4431 /* Choose the GMEM layout (use the CCU space or not) based on whether the
4432  * current attachments will need it.  This has to happen at vkBeginRenderPass()
4433  * time because tu_attachment_store_unaligned() looks at the image views, which
4434  * are only available at that point.  This should match the logic for the
4435  * !use_fast_path case in tu_store_gmem_attachment().
4436  */
4437 void
4438 tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
4439 {
4440    cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;
4441 
4442    for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
4443       if (!cmd->state.attachments[i])
4444          continue;
4445 
4446       struct tu_render_pass_attachment *att =
4447          &cmd->state.pass->attachments[i];
4448       if ((att->store || att->store_stencil) &&
4449           tu_attachment_store_unaligned(cmd, i))
4450          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
4451       if (att->store && att->format == VK_FORMAT_S8_UINT)
4452          /* We cannot pick out S8 from D24S8/D32S8, so we conservatively disable
4453           * blit events for the S8_UINT format.
4454           */
4455          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
4456       if (att->will_be_resolved && !blit_can_resolve(att->format))
4457          cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
4458    }
4459 
4460    cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
4461 }
4462 
4463 struct apply_store_coords_state {
4464    unsigned view;
4465 };
4466 
4467 static void
4468 fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
4469                        struct tu_cs *cs,
4470                        void *data,
4471                        VkRect2D bin,
4472                        unsigned views,
4473                        VkExtent2D *frag_areas)
4474 {
4475    const struct apply_store_coords_state *state =
4476       (const struct apply_store_coords_state *)data;
4477    assert(state->view < views);
4478    VkExtent2D frag_area = frag_areas[state->view];
4479 
4480    /* The bin width/height must be a multiple of the frag_area to make sure
4481     * that the scaling happens correctly. This means there may be some
4482     * destination pixels that jut out of the framebuffer, but they should be
4483     * clipped by the render area.
4484     */
4485    assert(bin.extent.width % frag_area.width == 0);
4486    assert(bin.extent.height % frag_area.height == 0);
4487    uint32_t scaled_width = bin.extent.width / frag_area.width;
4488    uint32_t scaled_height = bin.extent.height / frag_area.height;
4489 
4490    tu_cs_emit_regs(cs,
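   /* The destination rectangle covers the full bin while the source covers
    * only the scaled extent, so the 2D engine scales the reduced-resolution
    * GMEM contents back up to the full bin size.
    */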
4491       A6XX_GRAS_2D_DST_TL(.x = bin.offset.x,
4492                           .y = bin.offset.y),
4493       A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
4494                           .y = bin.offset.y + bin.extent.height - 1));
4495    tu_cs_emit_regs(cs,
4496                    A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
4497                    A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
4498                    A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
4499                    A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
4500 }
4501 
4502 template <chip CHIP>
4503 void
4504 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
4505                          struct tu_cs *cs,
4506                          uint32_t a,
4507                          uint32_t gmem_a,
4508                          uint32_t layers,
4509                          uint32_t layer_mask,
4510                          bool cond_exec_allowed)
4511 {
4512    const VkRect2D *render_area = &cmd->state.render_area;
4513    struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
4514    const struct tu_image_view *iview = cmd->state.attachments[a];
4515    struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
4516    const VkClearValue *clear_value = &cmd->state.clear_values[gmem_a];
4517    bool resolve = a != gmem_a;
4518    if (resolve)
4519       clear_value = NULL;
4520 
4521    if (!dst->store && !dst->store_stencil)
4522       return;
4523 
4524    bool unaligned = tu_attachment_store_unaligned(cmd, a);
4525 
4526    /* D32_SFLOAT_S8_UINT is quite a special format: it has two planes,
4527     * one for depth and the other for stencil. When resolving an MSAA
4528     * D32_SFLOAT_S8_UINT to S8_UINT, we need to take that into account.
4529     */
4530    bool resolve_d32s8_s8 =
4531       src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
4532       dst->format == VK_FORMAT_S8_UINT;
4533 
4534    /* The fast path doesn't support picking out the last component of a D24S8
4535     * texture reinterpreted as RGBA8_UNORM.
4536     */
4537    bool resolve_d24s8_s8 =
4538       src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
4539       dst->format == VK_FORMAT_S8_UINT;
4540 
4541    bool store_common = dst->store && !resolve_d32s8_s8;
4542    bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;
4543 
4544    bool use_fast_path = !unaligned && !resolve_d24s8_s8 &&
4545                         (a == gmem_a || blit_can_resolve(dst->format));
4546 
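   /* The fast path stores via the CP_EVENT_WRITE::BLIT event (tu_emit_blit);
    * otherwise we fall back to CP_BLIT below, or to 3D draws for MSAA
    * destinations.
    */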
4547    trace_start_gmem_store(&cmd->trace, cs, dst->format, use_fast_path, unaligned);
4548 
4549    /* An unconditional store should happen only if the attachment was cleared,
4550     * which could have happened either by load_op or via vkCmdClearAttachments.
4551     */
4552    bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
4553    if (cond_exec) {
4554       tu_begin_load_store_cond_exec(cmd, cs, false);
4555    }
4556 
4557    /* use fast path when render area is aligned, except for unsupported resolve cases */
4558    if (use_fast_path) {
4559       if (store_common)
4560          tu_emit_blit<CHIP>(cmd, cs, iview, src, clear_value, BLIT_EVENT_STORE, false);
4561       if (store_separate_stencil)
4562          tu_emit_blit<CHIP>(cmd, cs, iview, src, clear_value, BLIT_EVENT_STORE, true);
4563 
4564       if (cond_exec) {
4565          tu_end_load_store_cond_exec(cmd, cs, false);
4566       }
4567 
4568       trace_end_gmem_store(&cmd->trace, cs);
4569       return;
4570    }
4571 
4572    assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);
4573 
4574    enum pipe_format src_format = vk_format_to_pipe_format(src->format);
4575    if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
4576       src_format = PIPE_FORMAT_Z32_FLOAT;
4577 
4578    enum pipe_format dst_format = vk_format_to_pipe_format(dst->format);
4579    if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
4580       dst_format = PIPE_FORMAT_Z32_FLOAT;
4581 
4582    if (dst->samples > 1) {
4583       /* If we hit this path, we have to disable draw states after every tile
4584        * instead of once at the end of the renderpass, so that they aren't
4585        * executed when calling CP_DRAW.
4586        *
4587        * TODO: store a flag somewhere so we don't do this more than once and
4588        * don't do it after the renderpass when this happens.
4589        */
4590       if (store_common || store_separate_stencil)
4591          tu_disable_draw_states(cmd, cs);
4592 
4593       for_each_layer(i, layer_mask, layers) {
4594          if (store_common) {
4595             store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, false, src_format,
4596                           dst_format, render_area, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
4597          }
4598          if (store_separate_stencil) {
4599             store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
4600                           PIPE_FORMAT_S8_UINT, render_area, i,
4601                           tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
4602          }
4603       }
4604    } else {
4605       if (!cmd->state.pass->has_fdm) {
4606          r2d_coords(cmd, cs, render_area->offset, render_area->offset,
4607                     render_area->extent);
4608       } else {
4609          /* Usually GRAS_2D_RESOLVE_CNTL_* clips the destination to the bin
4610           * area and the coordinates span the entire render area, but for
4611           * FDM we need to scale the coordinates, so we take the
4612           * opposite approach, specifying the exact bin size in the destination
4613           * coordinates and using GRAS_2D_RESOLVE_CNTL_* to clip to the render
4614           * area.
4615           */
4616          tu_cs_emit_regs(cs,
4617                          A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = render_area->offset.x,
4618                                                      .y = render_area->offset.y,),
4619                          A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = render_area->offset.x + render_area->extent.width - 1,
4620                                                      .y = render_area->offset.y + render_area->extent.height - 1,));
4621       }
4622 
4623       for_each_layer (i, layer_mask, layers) {
4624          if (cmd->state.pass->has_fdm) {
4625             unsigned view = layer_mask ? i : 0;
4626             struct apply_store_coords_state state = {
4627                .view = view,
4628             };
4629             tu_create_fdm_bin_patchpoint(cmd, cs, 8, fdm_apply_store_coords,
4630                                          state);
4631          }
4632          if (store_common) {
4633             store_cp_blit<CHIP>(cmd, cs, iview, src->samples, false, src_format,
4634                           dst_format, i, tu_attachment_gmem_offset(cmd, src, i), src->cpp);
4635          }
4636          if (store_separate_stencil) {
4637             store_cp_blit<CHIP>(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
4638                           PIPE_FORMAT_S8_UINT, i, tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
4639          }
4640       }
4641    }
4642 
4643    if (cond_exec) {
4644       tu_end_load_store_cond_exec(cmd, cs, false);
4645    }
4646 
4647    trace_end_gmem_store(&cmd->trace, cs);
4648 }
4649 TU_GENX(tu_store_gmem_attachment);
4650