/*
 * Copyright 2019-2020 Valve Corporation
 * SPDX-License-Identifier: MIT
 *
 * Authors:
 *    Jonathan Marek <[email protected]>
 */

#include "tu_clear_blit.h"

#include "ir3/ir3_nir.h"

#include "util/format_r11g11b10f.h"
#include "util/format_rgb9e5.h"
#include "util/format_srgb.h"
#include "util/half_float.h"
#include "compiler/nir/nir_builder.h"

#include "tu_buffer.h"
#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_formats.h"
#include "tu_image.h"
#include "tu_tracepoints.h"
#include "tu_lrz.h"

#include "common/freedreno_gpu_event.h"
#include "common/freedreno_lrz.h"

static const VkOffset2D blt_no_coord = { ~0, ~0 };

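/* Packs a [0, 1] float into an n-bit unorm value with round-to-nearest-even,
 * e.g. tu_pack_float32_for_unorm(1.0f, 8) == 0xff and
 * tu_pack_float32_for_unorm(0.5f, 8) == 0x80.
 */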
static uint32_t
tu_pack_float32_for_unorm(float val, int bits)
{
   return _mesa_lroundevenf(CLAMP(val, 0.0f, 1.0f) * (float) ((1 << bits) - 1));
}

/* r2d_ = BLIT_OP_SCALE operations */

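/* Picks the internal format class the 2D engine should use for a given pipe
 * format; this decides how r2d_clear_value() packs clear colors and how the
 * blitter converts between src and dst formats.
 */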
static enum a6xx_2d_ifmt
format_to_ifmt(enum pipe_format format)
{
   if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM)
      return R2D_UNORM8;

   /* get_component_bits doesn't work with depth/stencil formats: */
   if (format == PIPE_FORMAT_Z16_UNORM || format == PIPE_FORMAT_Z32_FLOAT)
      return R2D_FLOAT32;
   if (format == PIPE_FORMAT_S8_UINT)
      return R2D_INT8;
   if (format == PIPE_FORMAT_A8_UNORM)
      return R2D_UNORM8;

   /* use the size of the red channel to find the corresponding "ifmt" */
   bool is_int = util_format_is_pure_integer(format);
   switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
   case 4: case 5: case 8:
      return is_int ? R2D_INT8 : R2D_UNORM8;
   case 10: case 11:
      return is_int ? R2D_INT16 : R2D_FLOAT16;
   case 16:
      if (util_format_is_float(format))
         return R2D_FLOAT16;
      return is_int ? R2D_INT16 : R2D_FLOAT32;
   case 32:
      return is_int ? R2D_INT32 : R2D_FLOAT32;
   default:
      unreachable("bad format");
   }
}

template <chip CHIP>
static struct tu_native_format
blit_format_texture(enum pipe_format format, enum a6xx_tile_mode tile_mode, bool gmem)
{
   struct tu_native_format fmt = tu6_format_texture(format, tile_mode);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* As in fdl6_view_init, we want to use
       * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 or FMT6_8_8_8_8_UNORM for the
       * blit src. Since this is called when there is no image and thus no
       * UBWC, we can always use FMT6_8_8_8_8_UNORM.
       *
       * Note (A7XX): Since it's erroneous to use FMT6_8_8_8_8_UNORM for a GMEM
       * image (see blit_base_format), we use FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8
       * instead.
       */
      fmt.fmt = CHIP >= A7XX && gmem ? FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 : FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

static struct tu_native_format
blit_format_color(enum pipe_format format, enum a6xx_tile_mode tile_mode)
{
   struct tu_native_format fmt = tu6_format_color(format, tile_mode);

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
      /* similar to blit_format_texture but for blit dst */
      fmt.fmt = FMT6_8_8_8_8_UNORM;
      break;
   default:
      break;
   }

   return fmt;
}

template <chip CHIP>
static enum a6xx_format
blit_base_format(enum pipe_format format, bool ubwc, bool gmem)
{
   if (CHIP >= A7XX && gmem)
      /* A7XX requires D24S8 in GMEM to always be treated as
       * FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 regardless of whether the image
       * is UBWC-compatible. Using FMT6_8_8_8_8_UNORM instead will result
       * in misrendering around the edges of the destination image.
       */
      ubwc = true;

   if (ubwc) {
      switch (format) {
      case PIPE_FORMAT_Z24X8_UNORM:
      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
         /* use the UBWC-compatible FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 */
         return FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8;
      default:
         break;
      }
   }

   /* note: tu6_format_color doesn't care about tiling for .fmt field */
   return blit_format_color(format, TILE6_LINEAR).fmt;
}

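/* Note that the 2D blit coordinates are inclusive: the BR registers take the
 * last covered pixel, hence the "- 1" below.
 */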
static void
r2d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_DST_TL(.x = dst.x, .y = dst.y),
                   A6XX_GRAS_2D_DST_BR(.x = dst.x + extent.width - 1, .y = dst.y + extent.height - 1));

   if (src.x == blt_no_coord.x)
      return;

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_2D_SRC_TL_X(src.x),
                   A6XX_GRAS_2D_SRC_BR_X(src.x + extent.width - 1),
                   A6XX_GRAS_2D_SRC_TL_Y(src.y),
                   A6XX_GRAS_2D_SRC_BR_Y(src.y + extent.height - 1));
}

static void
r2d_clear_value(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                enum pipe_format format,
                const VkClearValue *val)
{
   uint32_t clear_value[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
   case PIPE_FORMAT_Z24X8_UNORM:
      /* cleared as r8g8b8a8_unorm using special format */
      clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      clear_value[1] = clear_value[0] >> 8;
      clear_value[2] = clear_value[0] >> 16;
      clear_value[3] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      /* R2D_FLOAT32 */
      clear_value[0] = fui(val->depthStencil.depth);
      break;
   case PIPE_FORMAT_S8_UINT:
      clear_value[0] = val->depthStencil.stencil;
      break;
   case PIPE_FORMAT_R9G9B9E5_FLOAT:
      /* cleared as UINT32 */
      clear_value[0] = float3_to_rgb9e5(val->color.float32);
      break;
   default:
      assert(!util_format_is_depth_or_stencil(format));
      const struct util_format_description *desc = util_format_description(format);
      enum a6xx_2d_ifmt ifmt = format_to_ifmt(format);

      assert(desc->layout == UTIL_FORMAT_LAYOUT_PLAIN ||
             format == PIPE_FORMAT_R11G11B10_FLOAT);

      for (unsigned i = 0; i < 4; i++) {
         if (desc->swizzle[i] > PIPE_SWIZZLE_W)
            continue;

         const struct util_format_channel_description *ch =
            &desc->channel[desc->swizzle[i]];
         if (ifmt == R2D_UNORM8) {
            float linear = val->color.float32[i];
            if (desc->colorspace == UTIL_FORMAT_COLORSPACE_SRGB && i < 3)
               linear = util_format_linear_to_srgb_float(val->color.float32[i]);

            if (ch->type == UTIL_FORMAT_TYPE_SIGNED)
               clear_value[i] = _mesa_lroundevenf(CLAMP(linear, -1.0f, 1.0f) * 127.0f);
            else
               clear_value[i] = tu_pack_float32_for_unorm(linear, 8);
         } else if (ifmt == R2D_FLOAT16) {
            clear_value[i] = _mesa_float_to_half(val->color.float32[i]);
         } else {
            assert(ifmt == R2D_FLOAT32 || ifmt == R2D_INT32 ||
                   ifmt == R2D_INT16 || ifmt == R2D_INT8);
            clear_value[i] = val->color.uint32[i];
         }
      }
      break;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_SRC_SOLID_C0, 4);
   tu_cs_emit_array(cs, clear_value, 4);
}

static void
fixup_src_format(enum pipe_format *src_format, enum pipe_format dst_format,
                 enum a6xx_format *fmt)
{
   /* When blitting S8 -> D24S8 or vice versa, we have to override S8, which
    * is normally R8_UINT for sampling/blitting purposes, to a unorm format.
    * We also have to move stencil, which is normally in the .w channel, into
    * the right channel. Reinterpreting the S8 texture as A8_UNORM solves both
    * problems, and avoids using a swap, which seems to sometimes not work
    * with a D24S8 source, or a texture swizzle which is only supported with
    * the 3d path. Sometimes this blit happens on already-constructed
    * fdl6_view's, e.g. for sysmem resolves, so this has to happen as a fixup.
    */
   if (*src_format == PIPE_FORMAT_S8_UINT &&
       (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *fmt = FMT6_A8_UNORM;
      *src_format = PIPE_FORMAT_A8_UNORM;
   }
}

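/* Mirror of fixup_src_format for the blit destination: an S8 dst written
 * from a D24S8 source is reinterpreted as A8_UNORM so stencil lands in the
 * right channel.
 */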
static void
fixup_dst_format(enum pipe_format src_format, enum pipe_format *dst_format,
                 enum a6xx_format *fmt)
{
   if (*dst_format == PIPE_FORMAT_S8_UINT &&
       (src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
        src_format == PIPE_FORMAT_Z24_UNORM_S8_UINT_AS_R8G8B8A8)) {
      *dst_format = PIPE_FORMAT_A8_UNORM;
      *fmt = FMT6_A8_UNORM;
   }
}

template <chip CHIP>
static void
r2d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t src_info = iview->SP_PS_2D_SRC_INFO;
   if (filter != VK_FILTER_NEAREST)
      src_info |= A6XX_SP_PS_2D_SRC_INFO_FILTER;

   enum a6xx_format fmt = (enum a6xx_format)(
      src_info & A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);

   src_info =
      (src_info & ~A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT__MASK) |
      A6XX_SP_PS_2D_SRC_INFO_COLOR_FORMAT(fmt);

   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, src_info);
   tu_cs_emit(cs, iview->SP_PS_2D_SRC_SIZE);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, true);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

template <chip CHIP>
static void
r2d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer,
              VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, tu_image_view_depth(iview, SP_PS_2D_SRC_INFO));
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   /* SP_PS_2D_SRC_PITCH has shifted pitch field */
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, __SP_PS_2D_SRC_FLAGS<CHIP>({}).reg, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

template <chip CHIP>
static void
r2d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer,
                VkFilter filter)
{
   tu_cs_emit_pkt4(cs, SP_PS_2D_SRC_INFO(CHIP).reg, 5);
   tu_cs_emit(cs, tu_image_view_stencil(iview, SP_PS_2D_SRC_INFO) & ~A6XX_SP_PS_2D_SRC_INFO_FLAGS);
   tu_cs_emit(cs, iview->view.SP_PS_2D_SRC_SIZE);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, SP_PS_2D_SRC_PITCH(CHIP, .pitch = iview->stencil_pitch).value);
}

template <chip CHIP>
static void
r2d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   tu_cs_emit_regs(cs,
                   SP_PS_2D_SRC_INFO(CHIP,
                      .color_format = color_format,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format),
                      .unk20 = 1,
                      .unk22 = 1),
                   SP_PS_2D_SRC_SIZE(CHIP, .width = width, .height = height),
                   SP_PS_2D_SRC(CHIP, .qword = va),
                   SP_PS_2D_SRC_PITCH(CHIP, .pitch = pitch));
}

template <chip CHIP>
static void
r2d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
        enum pipe_format src_format)
{
   uint32_t dst_info = iview->RB_2D_DST_INFO;
   enum a6xx_format fmt =
      (enum a6xx_format)(dst_info & A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK);
   enum pipe_format dst_format = iview->format;
   fixup_dst_format(src_format, &dst_format, &fmt);

   dst_info =
      (dst_info & ~A6XX_RB_2D_DST_INFO_COLOR_FORMAT__MASK) | fmt;
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, dst_info);
   tu_cs_image_ref_2d<CHIP>(cs, iview, layer, false);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, iview, layer);
}

static void
r2d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_depth(iview, RB_2D_DST_INFO));
   tu_cs_emit_qw(cs, iview->depth_base_addr + iview->depth_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->depth_pitch).value);

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_FLAGS, 3);
   tu_cs_image_flag_ref(cs, &iview->view, layer);
}

static void
r2d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
{
   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_DST_INFO, 4);
   tu_cs_emit(cs, tu_image_view_stencil(iview, RB_2D_DST_INFO) & ~A6XX_RB_2D_DST_INFO_FLAGS);
   tu_cs_emit_qw(cs, iview->stencil_base_addr + iview->stencil_layer_size * layer);
   tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(iview->stencil_pitch).value);
}

static void
r2d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
               enum pipe_format src_format)
{
   struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
   enum a6xx_format color_fmt = fmt.fmt;
   fixup_dst_format(src_format, &format, &color_fmt);
   fmt.fmt = color_fmt;

   tu_cs_emit_regs(cs,
                   A6XX_RB_2D_DST_INFO(
                      .color_format = fmt.fmt,
                      .color_swap = fmt.swap,
                      .srgb = util_format_is_srgb(format)),
                   A6XX_RB_2D_DST(.qword = va),
                   A6XX_RB_2D_DST_PITCH(pitch));
}

template <chip CHIP>
static void
r2d_setup_common(struct tu_cmd_buffer *cmd,
                 struct tu_cs *cs,
                 enum pipe_format src_format,
                 enum pipe_format dst_format,
                 VkImageAspectFlags aspect_mask,
                 unsigned blit_param,
                 bool clear,
                 bool ubwc,
                 bool scissor)
{
   if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
      tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
   }

   enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
   fixup_dst_format(src_format, &dst_format, &fmt);
   enum a6xx_2d_ifmt ifmt = format_to_ifmt(dst_format);

   uint32_t unknown_8c01 = 0;

   /* note: the only format with partial clearing is D24S8 */
   if (dst_format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      /* preserve stencil channel */
      if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
         unknown_8c01 = 0x08000041;
      /* preserve depth channels */
      if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
         unknown_8c01 = 0x00084001;
   }

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_UNKNOWN_8C01, 1);
   tu_cs_emit(cs, unknown_8c01); // TODO: seems to always be 0 on A7XX

   uint32_t blit_cntl = A6XX_RB_2D_BLIT_CNTL(
         .rotate = (enum a6xx_rotation) blit_param,
         .solid_color = clear,
         .color_format = fmt,
         .scissor = scissor,
         .d24s8 = fmt == FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8 && !clear,
         .mask = 0xf,
         .ifmt = util_format_is_srgb(dst_format) ? R2D_UNORM8_SRGB : ifmt,
      ).value;

   tu_cs_emit_pkt4(cs, REG_A6XX_RB_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   tu_cs_emit_pkt4(cs, REG_A6XX_GRAS_2D_BLIT_CNTL, 1);
   tu_cs_emit(cs, blit_cntl);

   if (CHIP > A6XX) {
      tu_cs_emit_pkt4(cs, REG_A7XX_SP_PS_UNKNOWN_B2D2, 1);
      tu_cs_emit(cs, 0x20000000);
   }

   if (fmt == FMT6_10_10_10_2_UNORM_DEST)
      fmt = FMT6_16_16_16_16_FLOAT;

   tu_cs_emit_regs(cs, SP_2D_DST_FORMAT(CHIP,
         .sint = util_format_is_pure_sint(dst_format),
         .uint = util_format_is_pure_uint(dst_format),
         .color_format = fmt,
         .srgb = util_format_is_srgb(dst_format),
         .mask = 0xf));
}

template <chip CHIP>
static void
r2d_setup(struct tu_cmd_buffer *cmd,
          struct tu_cs *cs,
          enum pipe_format src_format,
          enum pipe_format dst_format,
          VkImageAspectFlags aspect_mask,
          unsigned blit_param,
          bool clear,
          bool ubwc,
          VkSampleCountFlagBits samples)
{
   assert(samples == VK_SAMPLE_COUNT_1_BIT);

   if (!cmd->state.pass) {
      tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
   }

   r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format, aspect_mask, blit_param, clear, ubwc, false);
}

static void
r2d_teardown(struct tu_cmd_buffer *cmd,
             struct tu_cs *cs)
{
   /* nothing to do here */
}

static void
r2d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
{
   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      /* This is a non-context register, so we have to WFI before changing it. */
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit);
   }

   tu_cs_emit_pkt7(cs, CP_BLIT, 1);
   tu_cs_emit(cs, CP_BLIT_0_OP(BLIT_OP_SCALE));

   if (cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL_blit !=
       cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL) {
      tu_cs_emit_wfi(cs);
      tu_cs_emit_write_reg(
         cs, REG_A6XX_RB_DBG_ECO_CNTL,
         cmd->device->physical_device->info->a6xx.magic.RB_DBG_ECO_CNTL);
   }
}

/* r3d_ = shader path operations */

static nir_def *
load_const(nir_builder *b, unsigned base, unsigned components)
{
   return nir_load_const_ir3(b, components, 32, nir_imm_int(b, 0),
                             .base = base);
}

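/* Layout of the blit/clear constants, in scalar 32-bit slots (the .base of
 * load_const above indexes these): 0-1 = vert0 position, 2-3 = vert0
 * texcoords, 4-5 = vert1 position, 6-7 = vert1 texcoords, 16 = z coordinate
 * for the "z scale" path. The clear VS instead reads the clear depth from
 * slot 2 and the layer from slot 3, and the clear FS reads one clear color
 * vec4 per MRT starting at slot 0.
 */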
static nir_shader *
build_blit_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "blit vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     nir_imm_float(b, 0.0),
                     nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_coords =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec_type(3),
                          "coords");
   out_coords->data.location = VARYING_SLOT_VAR0;

   nir_def *vert0_coords = load_const(b, 2, 2);
   nir_def *vert1_coords = load_const(b, 6, 2);

   /* Only used with the "z scale" blit path, which uses a 3d texture */
   nir_def *z_coord = load_const(b, 16, 1);

   nir_def *coords = nir_bcsel(b, nir_i2b(b, vertex), vert1_coords, vert0_coords);
   coords = nir_vec3(b, nir_channel(b, coords, 0), nir_channel(b, coords, 1),
                     z_coord);

   nir_store_var(b, out_coords, coords, 0x7);

   return b->shader;
}

static nir_shader *
build_clear_vs_shader(void)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_VERTEX, NULL, "clear vs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_pos =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "gl_Position");
   out_pos->data.location = VARYING_SLOT_POS;

   nir_def *vert0_pos = load_const(b, 0, 2);
   nir_def *vert1_pos = load_const(b, 4, 2);
   /* c0.z is used to clear depth */
   nir_def *depth = load_const(b, 2, 1);
   nir_def *vertex = nir_load_vertex_id(b);

   nir_def *pos = nir_bcsel(b, nir_i2b(b, vertex), vert1_pos, vert0_pos);
   pos = nir_vec4(b, nir_channel(b, pos, 0),
                     nir_channel(b, pos, 1),
                     depth, nir_imm_float(b, 1.0));

   nir_store_var(b, out_pos, pos, 0xf);

   nir_variable *out_layer =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_uint_type(),
                          "gl_Layer");
   out_layer->data.location = VARYING_SLOT_LAYER;
   nir_def *layer = load_const(b, 3, 1);
   nir_store_var(b, out_layer, layer, 1);

   return b->shader;
}

static nir_shader *
build_blit_fs_shader(bool zscale)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     zscale ? "zscale blit fs" : "blit fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   unsigned coord_components = zscale ? 3 : 2;
   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(coord_components),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 1);
   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = zscale ? GLSL_SAMPLER_DIM_3D : GLSL_SAMPLER_DIM_2D;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord,
                                     nir_load_var(b, in_coords));
   tex->coord_components = coord_components;

   nir_def_init(&tex->instr, &tex->def, 4, 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

/* We can only read multisample textures via txf_ms, so we need a separate
 * variant for them.
 */
static nir_shader *
build_ms_copy_fs_shader(bool half_float)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "multisample copy fs");
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   nir_variable *out_color =
      nir_variable_create(b->shader, nir_var_shader_out,
                          half_float ? glsl_f16vec_type(4) : glsl_vec4_type(),
                          "color0");
   out_color->data.location = FRAG_RESULT_DATA0;

   nir_variable *in_coords =
      nir_variable_create(b->shader, nir_var_shader_in,
                          glsl_vec_type(2),
                          "coords");
   in_coords->data.location = VARYING_SLOT_VAR0;

   nir_tex_instr *tex = nir_tex_instr_create(b->shader, 2);

   tex->op = nir_texop_txf_ms;

   /* Note: since we're just copying data, we rely on the HW ignoring the
    * dest_type.
    */
   tex->dest_type = half_float ? nir_type_float16 : nir_type_int32;
   tex->is_array = false;
   tex->is_shadow = false;
   tex->sampler_dim = GLSL_SAMPLER_DIM_MS;

   tex->texture_index = 0;
   tex->sampler_index = 0;

   b->shader->info.num_textures = 1;
   BITSET_SET(b->shader->info.textures_used, 0);
   BITSET_SET(b->shader->info.textures_used_by_txf, 0);

   nir_def *coord = nir_f2i32(b, nir_load_var(b, in_coords));

   tex->src[0] = nir_tex_src_for_ssa(nir_tex_src_coord, coord);
   tex->coord_components = 2;

   tex->src[1] = nir_tex_src_for_ssa(nir_tex_src_ms_index,
                                     nir_load_sample_id(b));

   nir_def_init(&tex->instr, &tex->def, 4, half_float ? 16 : 32);
   nir_builder_instr_insert(b, &tex->instr);

   nir_store_var(b, out_color, &tex->def, 0xf);

   return b->shader;
}

static nir_shader *
build_clear_fs_shader(unsigned mrts)
{
   nir_builder _b =
      nir_builder_init_simple_shader(MESA_SHADER_FRAGMENT, NULL,
                                     "mrt%u clear fs", mrts);
   nir_builder *b = &_b;
   b->shader->info.internal = true;

   for (unsigned i = 0; i < mrts; i++) {
      nir_variable *out_color =
         nir_variable_create(b->shader, nir_var_shader_out, glsl_vec4_type(),
                             "color");
      out_color->data.location = FRAG_RESULT_DATA0 + i;

      nir_def *color = load_const(b, 4 * i, 4);
      nir_store_var(b, out_color, color, 0xf);
   }

   return b->shader;
}

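/* Compiles one of the internal shaders above and stores its binary in the
 * global BO, recording the shader, variant, and GPU address on the device.
 * *offset is advanced in dwords, aligned to 32.
 */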
static void
compile_shader(struct tu_device *dev, struct nir_shader *nir,
               unsigned consts, unsigned *offset, enum global_shader idx)
{
   nir->options = ir3_get_compiler_options(dev->compiler);

   nir_assign_io_var_locations(nir, nir_var_shader_in, &nir->num_inputs, nir->info.stage);
   nir_assign_io_var_locations(nir, nir_var_shader_out, &nir->num_outputs, nir->info.stage);

   ir3_finalize_nir(dev->compiler, nir);

   const struct ir3_shader_options options = {
      .num_reserved_user_consts = align(consts, 8),
      .api_wavesize = IR3_SINGLE_OR_DOUBLE,
      .real_wavesize = IR3_SINGLE_OR_DOUBLE,
   };
   struct ir3_shader *sh =
      ir3_shader_from_nir(dev->compiler, nir, &options, NULL);

   struct ir3_shader_key key = {};
   bool created;
   struct ir3_shader_variant *so =
      ir3_shader_get_variant(sh, &key, false, false, &created);

   struct tu6_global *global = dev->global_bo_map;

   assert(*offset + so->info.sizedwords <= ARRAY_SIZE(global->shaders));
   dev->global_shaders[idx] = sh;
   dev->global_shader_variants[idx] = so;
   memcpy(&global->shaders[*offset], so->bin,
          sizeof(uint32_t) * so->info.sizedwords);
   dev->global_shader_va[idx] = dev->global_bo->iova +
      offsetof_arr(struct tu6_global, shaders, *offset);
   *offset += align(so->info.sizedwords, 32);
}

void
tu_init_clear_blit_shaders(struct tu_device *dev)
{
   unsigned offset = 0;
   compile_shader(dev, build_blit_vs_shader(), 3, &offset, GLOBAL_SH_VS_BLIT);
   compile_shader(dev, build_clear_vs_shader(), 2, &offset, GLOBAL_SH_VS_CLEAR);
   compile_shader(dev, build_blit_fs_shader(false), 0, &offset, GLOBAL_SH_FS_BLIT);
   compile_shader(dev, build_blit_fs_shader(true), 0, &offset, GLOBAL_SH_FS_BLIT_ZSCALE);
   compile_shader(dev, build_ms_copy_fs_shader(false), 0, &offset, GLOBAL_SH_FS_COPY_MS);
   compile_shader(dev, build_ms_copy_fs_shader(true), 0, &offset, GLOBAL_SH_FS_COPY_MS_HALF);

   for (uint32_t num_rts = 0; num_rts <= MAX_RTS; num_rts++) {
      compile_shader(dev, build_clear_fs_shader(num_rts), num_rts, &offset,
                     (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts));
   }
}

void
tu_destroy_clear_blit_shaders(struct tu_device *dev)
{
   for (unsigned i = 0; i < GLOBAL_SH_COUNT; i++) {
      if (dev->global_shaders[i])
         ir3_shader_destroy(dev->global_shaders[i]);
   }
}

enum r3d_type {
   R3D_CLEAR,
   R3D_BLIT,
   R3D_COPY_HALF,
};

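/* Emits the state common to all shader-path operations: binds the global
 * VS/FS pair for the requested r3d_type, disables clipping and the viewport
 * transform, and points SP_FS_OUTPUT_REG at the FS color outputs for a
 * RECTLIST draw.
 */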
template <chip CHIP>
static void
r3d_common(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum r3d_type type,
           uint32_t rts_mask, bool z_scale, VkSampleCountFlagBits samples)
{
   enum global_shader vs_id =
      type == R3D_CLEAR ? GLOBAL_SH_VS_CLEAR : GLOBAL_SH_VS_BLIT;

   struct ir3_shader_variant *vs = cmd->device->global_shader_variants[vs_id];
   uint64_t vs_iova = cmd->device->global_shader_va[vs_id];

   enum global_shader fs_id = GLOBAL_SH_FS_BLIT;

   if (z_scale) {
      fs_id = GLOBAL_SH_FS_BLIT_ZSCALE;
   } else if (type == R3D_COPY_HALF) {
      /* Avoid canonicalizing NaNs due to implicit conversions in the shader.
       *
       * TODO: Add a half-float blit shader that uses texture() but with half
       * registers to avoid NaN canonicalization for the single-sampled case.
       */
      fs_id = GLOBAL_SH_FS_COPY_MS_HALF;
   } else if (samples != VK_SAMPLE_COUNT_1_BIT) {
      fs_id = GLOBAL_SH_FS_COPY_MS;
   }

   unsigned num_rts = util_bitcount(rts_mask);
   if (type == R3D_CLEAR)
      fs_id = (enum global_shader) (GLOBAL_SH_FS_CLEAR0 + num_rts);

   struct ir3_shader_variant *fs = cmd->device->global_shader_variants[fs_id];
   uint64_t fs_iova = cmd->device->global_shader_va[fs_id];

   tu_cs_emit_regs(cs, HLSQ_INVALIDATE_CMD(CHIP,
         .vs_state = true,
         .hs_state = true,
         .ds_state = true,
         .gs_state = true,
         .fs_state = true,
         .cs_state = true,
         .cs_ibo = true,
         .gfx_ibo = true,
         .gfx_shared_const = true,
         .cs_bindless = CHIP == A6XX ? 0x1f : 0xff,
         .gfx_bindless = CHIP == A6XX ? 0x1f : 0xff));

   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_VERTEX, vs);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_CTRL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_TESS_EVAL, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_GEOMETRY, NULL);
   tu6_emit_xs_config<CHIP>(cs, MESA_SHADER_FRAGMENT, fs);

   struct tu_pvtmem_config pvtmem = {};
   tu6_emit_xs(cs, MESA_SHADER_VERTEX, vs, &pvtmem, vs_iova);
   tu6_emit_xs(cs, MESA_SHADER_FRAGMENT, fs, &pvtmem, fs_iova);

   tu_cs_emit_regs(cs, A6XX_PC_PRIMITIVE_CNTL_0());
   if (CHIP == A7XX) {
      tu_cs_emit_regs(cs, A7XX_VPC_PRIMITIVE_CNTL_0());
   }

   tu6_emit_vpc<CHIP>(cs, vs, NULL, NULL, NULL, fs);

   if (CHIP >= A7XX) {
      tu_cs_emit_regs(cs, A6XX_GRAS_UNKNOWN_8110(0x2));

      tu_cs_emit_regs(cs, A7XX_HLSQ_FS_UNKNOWN_A9AA(.consts_load_disable = false));
   }

   /* REPL_MODE for varying with RECTLIST (2 vertices only) */
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_INTERP_MODE(0, 0));
   tu_cs_emit_regs(cs, A6XX_VPC_VARYING_PS_REPL_MODE(0, 2 << 2 | 1 << 0));

   tu6_emit_vs<CHIP>(cs, vs, 0);
   tu6_emit_hs<CHIP>(cs, NULL);
   tu6_emit_ds<CHIP>(cs, NULL);
   tu6_emit_gs<CHIP>(cs, NULL);
   tu6_emit_fs<CHIP>(cs, fs);

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_CL_CNTL(
                      .clip_disable = 1,
                      .vp_clip_code_ignore = 1,
                      .vp_xform_disable = 1,
                      .persp_division_disable = 1));
   tu_cs_emit_regs(cs, A6XX_GRAS_SU_CNTL()); // XXX msaa enable?

   tu_cs_emit_regs(cs, PC_RASTER_CNTL(CHIP));
   if (CHIP == A6XX) {
      tu_cs_emit_regs(cs, A6XX_VPC_UNKNOWN_9107());
   } else {
      tu_cs_emit_regs(cs, A7XX_PC_RASTER_CNTL_V2());
   }

   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_VIEWPORT_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));
   tu_cs_emit_regs(cs,
                   A6XX_GRAS_SC_SCREEN_SCISSOR_TL(0, .x = 0, .y = 0),
                   A6XX_GRAS_SC_SCREEN_SCISSOR_BR(0, .x = 0x7fff, .y = 0x7fff));

   tu_cs_emit_regs(cs,
                   A6XX_VFD_INDEX_OFFSET(),
                   A6XX_VFD_INSTANCE_START_OFFSET());

   if (rts_mask) {
      unsigned rts_count = util_last_bit(rts_mask);
      tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_REG(0), rts_count);
      unsigned rt = 0;
      for (unsigned i = 0; i < rts_count; i++) {
         unsigned regid = 0;
         if (rts_mask & (1u << i))
            regid = ir3_find_output_regid(fs, FRAG_RESULT_DATA0 + rt++);
         tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_REG_REGID(regid) |
                        COND(regid & HALF_REG_ID,
                             A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
      }
   }

   tu6_emit_msaa(cs, samples, false);
}

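/* Uploads blit constants to scratch memory and emits an indirect
 * CP_LOAD_STATE6 pointing at them; offset and size_vec4 are in vec4 units.
 */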
static void
tu6_emit_blit_consts_load(struct tu_cmd_buffer *cmd,
                          struct tu_cs *cs,
                          uint32_t opcode,
                          enum a6xx_state_block block,
                          uint32_t offset,
                          const void *consts,
                          uint32_t size_vec4)
{
   assert(offset % cmd->device->compiler->const_upload_unit == 0);

   struct tu_cs_memory mem = {};
   VkResult result = tu_cs_alloc(&cmd->sub_cs, size_vec4, 4, &mem);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(mem.map, consts, size_vec4 * 4 * sizeof(uint32_t));

   tu_cs_emit_pkt7(cs, opcode, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(block) |
                  CP_LOAD_STATE6_0_NUM_UNIT(size_vec4));
   tu_cs_emit_qw(cs, mem.iova);
}

static void
r3d_coords_raw(struct tu_cmd_buffer *cmd, struct tu_cs *cs, const float *coords)
{
   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 0, coords, 2);
}

/* z coordinate for the "z scale" blit path, which uses a 3d texture */
static void
r3d_coord_z(struct tu_cmd_buffer *cmd, struct tu_cs *cs, float z)
{
   const uint32_t coord[] = {
      fui(z),
      0,
      0,
      0,
   };

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_GEOM, SB6_VS_SHADER, 4, coord, 1);
}

static void
r3d_coords(struct tu_cmd_buffer *cmd,
           struct tu_cs *cs,
           const VkOffset2D dst,
           const VkOffset2D src,
           const VkExtent2D extent)
{
   const bool has_src = src.x != blt_no_coord.x;
   int32_t src_x1 = has_src ? src.x : 0;
   int32_t src_y1 = has_src ? src.y : 0;

   const float coords[] = {
      dst.x,
      dst.y,
      src_x1,
      src_y1,
      dst.x + extent.width,
      dst.y + extent.height,
      src_x1 + extent.width,
      src_y1 + extent.height,
   };
   r3d_coords_raw(cmd, cs, coords);
}

static void
r3d_clear_value(struct tu_cmd_buffer *cmd, struct tu_cs *cs, enum pipe_format format, const VkClearValue *val)
{
   uint32_t coords[4] = {};

   switch (format) {
   case PIPE_FORMAT_Z24X8_UNORM:
   case PIPE_FORMAT_Z24_UNORM_S8_UINT: {
      /* cleared as r8g8b8a8_unorm using special format */
      uint32_t tmp = tu_pack_float32_for_unorm(val->depthStencil.depth, 24);
      coords[0] = fui((tmp & 0xff) / 255.0f);
      coords[1] = fui((tmp >> 8 & 0xff) / 255.0f);
      coords[2] = fui((tmp >> 16 & 0xff) / 255.0f);
      coords[3] = fui((val->depthStencil.stencil & 0xff) / 255.0f);
   } break;
   case PIPE_FORMAT_Z16_UNORM:
   case PIPE_FORMAT_Z32_FLOAT:
      coords[0] = fui(val->depthStencil.depth);
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   case PIPE_FORMAT_S8_UINT:
      coords[0] = val->depthStencil.stencil & 0xff;
      coords[1] = 0;
      coords[2] = 0;
      coords[3] = 0;
      break;
   default:
      /* as color formats use clear value as-is */
      assert(!util_format_is_depth_or_stencil(format));
      memcpy(coords, val->color.uint32, 4 * sizeof(uint32_t));
      break;
   }

   tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER, 0, coords, 1);
}

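/* Emits a source texture for the shader path: allocates a texture descriptor
 * plus a sampler in scratch memory, patches the base and UBWC addresses with
 * the given layer offsets, and binds both via CP_LOAD_STATE6_FRAG.
 */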
static void
r3d_src_common(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               const uint32_t *tex_const,
               uint32_t offset_base,
               uint32_t offset_ubwc,
               VkFilter filter)
{
   struct tu_cs_memory texture = { };
   VkResult result = tu_cs_alloc(&cmd->sub_cs,
                                 2, /* allocate space for a sampler too */
                                 A6XX_TEX_CONST_DWORDS, &texture);
   if (result != VK_SUCCESS) {
      vk_command_buffer_set_error(&cmd->vk, result);
      return;
   }

   memcpy(texture.map, tex_const, A6XX_TEX_CONST_DWORDS * 4);

   /* patch addresses for layer offset */
   *(uint64_t*) (texture.map + 4) += offset_base;
   uint64_t ubwc_addr = (texture.map[7] | (uint64_t) texture.map[8] << 32) + offset_ubwc;
   texture.map[7] = ubwc_addr;
   texture.map[8] = ubwc_addr >> 32;

   texture.map[A6XX_TEX_CONST_DWORDS + 0] =
      A6XX_TEX_SAMP_0_XY_MAG(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_XY_MIN(tu6_tex_filter(filter, false)) |
      A6XX_TEX_SAMP_0_WRAP_S(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_T(A6XX_TEX_CLAMP_TO_EDGE) |
      A6XX_TEX_SAMP_0_WRAP_R(A6XX_TEX_CLAMP_TO_EDGE) |
      0x60000; /* XXX used by blob, doesn't seem necessary */
   texture.map[A6XX_TEX_CONST_DWORDS + 1] =
      A6XX_TEX_SAMP_1_UNNORM_COORDS |
      A6XX_TEX_SAMP_1_MIPFILTER_LINEAR_FAR;
   texture.map[A6XX_TEX_CONST_DWORDS + 2] = 0;
   texture.map[A6XX_TEX_CONST_DWORDS + 3] = 0;

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova + A6XX_TEX_CONST_DWORDS * 4);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_SAMP(.qword = texture.iova + A6XX_TEX_CONST_DWORDS * 4));

   tu_cs_emit_pkt7(cs, CP_LOAD_STATE6_FRAG, 3);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(0) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(SB6_FS_TEX) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit_qw(cs, texture.iova);

   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_CONST(.qword = texture.iova));
   tu_cs_emit_regs(cs, A6XX_SP_FS_TEX_COUNT(1));
}

static void
r3d_src(struct tu_cmd_buffer *cmd,
        struct tu_cs *cs,
        const struct fdl6_view *iview,
        uint32_t layer,
        VkFilter filter,
        enum pipe_format dst_format)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];
   memcpy(desc, iview->descriptor, sizeof(desc));

   enum a6xx_format fmt = (enum a6xx_format)(
      (desc[0] & A6XX_TEX_CONST_0_FMT__MASK) >> A6XX_TEX_CONST_0_FMT__SHIFT);
   enum pipe_format src_format = iview->format;
   fixup_src_format(&src_format, dst_format, &fmt);
   desc[0] = (desc[0] & ~A6XX_TEX_CONST_0_FMT__MASK) |
      A6XX_TEX_CONST_0_FMT(fmt);

   r3d_src_common(cmd, cs, desc,
                  iview->layer_size * layer,
                  iview->ubwc_layer_size * layer,
                  filter);
}

template <chip CHIP>
static void
r3d_src_buffer(struct tu_cmd_buffer *cmd,
               struct tu_cs *cs,
               enum pipe_format format,
               uint64_t va, uint32_t pitch,
               uint32_t width, uint32_t height,
               enum pipe_format dst_format)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   struct tu_native_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, false);
   enum a6xx_format color_format = fmt.fmt;
   fixup_src_format(&format, dst_format, &color_format);

   desc[0] =
      COND(util_format_is_srgb(format), A6XX_TEX_CONST_0_SRGB) |
      A6XX_TEX_CONST_0_FMT(color_format) |
      A6XX_TEX_CONST_0_SWAP(fmt.swap) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[1] = A6XX_TEX_CONST_1_WIDTH(width) | A6XX_TEX_CONST_1_HEIGHT(height);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = 0;
   desc[4] = va;
   desc[5] = va >> 32;
   for (uint32_t i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
}

static void
r3d_src_depth(struct tu_cmd_buffer *cmd,
              struct tu_cs *cs,
              const struct tu_image_view *iview,
              uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->depth_base_addr;

   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_32_FLOAT) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->depth_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->depth_layer_size) |
      (iview->view.descriptor[3] & ~A6XX_TEX_CONST_3_ARRAY_PITCH__MASK);
   desc[4] = va;
   desc[5] = va >> 32;

   r3d_src_common(cmd, cs, desc,
                  iview->depth_layer_size * layer,
                  iview->view.ubwc_layer_size * layer,
                  VK_FILTER_NEAREST);
}

static void
r3d_src_stencil(struct tu_cmd_buffer *cmd,
                struct tu_cs *cs,
                const struct tu_image_view *iview,
                uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));
   uint64_t va = iview->stencil_base_addr;

   desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
                A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_UINT) |
      A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
   desc[2] =
      A6XX_TEX_CONST_2_PITCH(iview->stencil_pitch) |
      A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D);
   desc[3] = A6XX_TEX_CONST_3_ARRAY_PITCH(iview->stencil_layer_size);
   desc[4] = va;
   desc[5] = va >> 32;
   for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
      desc[i] = 0;

   r3d_src_common(cmd, cs, desc, iview->stencil_layer_size * layer, 0,
                  VK_FILTER_NEAREST);
}

static void
r3d_src_gmem_load(struct tu_cmd_buffer *cmd,
                  struct tu_cs *cs,
                  const struct tu_image_view *iview,
                  uint32_t layer)
{
   uint32_t desc[A6XX_TEX_CONST_DWORDS];

   memcpy(desc, iview->view.descriptor, sizeof(desc));

   /* Fixup D24 formats because we always load both depth and stencil. */
   enum pipe_format format = iview->view.format;
   if (format == PIPE_FORMAT_X24S8_UINT ||
       format == PIPE_FORMAT_Z24X8_UNORM ||
       format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
      desc[0] &= ~A6XX_TEX_CONST_0_FMT__MASK;
      if (iview->view.ubwc_enabled)
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_Z24_UNORM_S8_UINT_AS_R8G8B8A8);
      else
         desc[0] |= A6XX_TEX_CONST_0_FMT(FMT6_8_8_8_8_UNORM);
   }

   /* When loading/storing GMEM we always load the full image and don't do
    * any swizzling or swapping; that's done in the draw when reading/writing
    * GMEM, so we need to fix up the swizzle and swap here.
    */
   desc[0] &= ~(A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
                A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK |
                A6XX_TEX_CONST_0_SWAP__MASK);
   desc[0] |= A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
      A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
      A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
      A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);

   r3d_src_common(cmd, cs, desc,
                  iview->view.layer_size * layer,
                  iview->view.ubwc_layer_size * layer,
                  VK_FILTER_NEAREST);
}

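/* Emits a source descriptor for sampling directly out of GMEM: the
 * descriptor is re-pointed at the GMEM aperture (gmem_base + gmem_offset)
 * with TILE6_2 tiling and a pitch of one tile row (tile0.width * cpp).
 */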
1282 template <chip CHIP>
1283 static void
r3d_src_gmem(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * iview,enum pipe_format format,enum pipe_format dst_format,uint32_t gmem_offset,uint32_t cpp)1284 r3d_src_gmem(struct tu_cmd_buffer *cmd,
1285 struct tu_cs *cs,
1286 const struct tu_image_view *iview,
1287 enum pipe_format format,
1288 enum pipe_format dst_format,
1289 uint32_t gmem_offset,
1290 uint32_t cpp)
1291 {
1292 uint32_t desc[A6XX_TEX_CONST_DWORDS];
1293 memcpy(desc, iview->view.descriptor, sizeof(desc));
1294
1295 enum a6xx_format fmt = blit_format_texture<CHIP>(format, TILE6_LINEAR, true).fmt;
1296 fixup_src_format(&format, dst_format, &fmt);
1297
1298 /* patch the format so that depth/stencil get the right format and swizzle */
1299 desc[0] &= ~(A6XX_TEX_CONST_0_FMT__MASK |
1300 A6XX_TEX_CONST_0_SWIZ_X__MASK | A6XX_TEX_CONST_0_SWIZ_Y__MASK |
1301 A6XX_TEX_CONST_0_SWIZ_Z__MASK | A6XX_TEX_CONST_0_SWIZ_W__MASK);
1302 desc[0] |= A6XX_TEX_CONST_0_FMT(fmt) |
1303 A6XX_TEX_CONST_0_SWIZ_X(A6XX_TEX_X) |
1304 A6XX_TEX_CONST_0_SWIZ_Y(A6XX_TEX_Y) |
1305 A6XX_TEX_CONST_0_SWIZ_Z(A6XX_TEX_Z) |
1306 A6XX_TEX_CONST_0_SWIZ_W(A6XX_TEX_W);
1307
1308 /* patched for gmem */
1309 desc[0] &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
1310 desc[0] |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);
1311 desc[2] =
1312 A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
1313 A6XX_TEX_CONST_2_PITCH(cmd->state.tiling->tile0.width * cpp);
1314 desc[3] = 0;
1315 desc[4] = cmd->device->physical_device->gmem_base + gmem_offset;
1316 desc[5] = A6XX_TEX_CONST_5_DEPTH(1);
1317 for (unsigned i = 6; i < A6XX_TEX_CONST_DWORDS; i++)
1318 desc[i] = 0;
1319
1320 r3d_src_common(cmd, cs, desc, 0, 0, VK_FILTER_NEAREST);
1321 }
1322
1323 template <chip CHIP>
1324 static void
r3d_dst(struct tu_cs * cs,const struct fdl6_view * iview,uint32_t layer,enum pipe_format src_format)1325 r3d_dst(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1326 enum pipe_format src_format)
1327 {
1328 uint32_t mrt_buf_info = iview->RB_MRT_BUF_INFO;
1329
1330 enum a6xx_format fmt = (enum a6xx_format)(
1331 mrt_buf_info & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1332 enum pipe_format dst_format = iview->format;
1333 fixup_dst_format(src_format, &dst_format, &fmt);
1334 mrt_buf_info =
1335 (mrt_buf_info & ~A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK) |
1336 A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT(fmt);
1337
1338 tu_cs_emit_regs(cs,
1339 RB_MRT_BUF_INFO(CHIP, 0, .dword = mrt_buf_info),
1340 A6XX_RB_MRT_PITCH(0, iview->pitch),
1341 A6XX_RB_MRT_ARRAY_PITCH(0, iview->layer_size),
1342 A6XX_RB_MRT_BASE(0, .qword = tu_layer_address(iview, layer)),
1343 A6XX_RB_MRT_BASE_GMEM(0),
1344 );
1345
1346 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1347 tu_cs_image_flag_ref(cs, iview, layer);
1348
1349 /* Use color format from RB_MRT_BUF_INFO. This register is relevant for
1350 * FMT6_NV12_Y.
1351 */
1352 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = fmt));
1353
1354 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->ubwc_enabled));
1355 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1356 }
1357
1358 template <chip CHIP>
1359 static void
r3d_dst_depth(struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)1360 r3d_dst_depth(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1361 {
1362 tu_cs_emit_regs(cs,
1363 RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_depth(iview, RB_MRT_BUF_INFO)),
1364 A6XX_RB_MRT_PITCH(0, iview->depth_pitch),
1365 A6XX_RB_MRT_ARRAY_PITCH(0, iview->depth_layer_size),
1366 A6XX_RB_MRT_BASE(0, .qword = iview->depth_base_addr + iview->depth_layer_size * layer),
1367 A6XX_RB_MRT_BASE_GMEM(0),
1368 );
1369
1370 tu_cs_emit_pkt4(cs, REG_A6XX_RB_MRT_FLAG_BUFFER(0), 3);
1371 tu_cs_image_flag_ref(cs, &iview->view, layer);
1372
1373 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP, .flag_mrts = iview->view.ubwc_enabled));
1374 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1375 }
1376
1377 template <chip CHIP>
1378 static void
r3d_dst_stencil(struct tu_cs * cs,const struct tu_image_view * iview,uint32_t layer)1379 r3d_dst_stencil(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer)
1380 {
1381 tu_cs_emit_regs(cs,
1382 RB_MRT_BUF_INFO(CHIP, 0, .dword = tu_image_view_stencil(iview, RB_MRT_BUF_INFO)),
1383 A6XX_RB_MRT_PITCH(0, iview->stencil_pitch),
1384 A6XX_RB_MRT_ARRAY_PITCH(0, iview->stencil_layer_size),
1385 A6XX_RB_MRT_BASE(0, .qword = iview->stencil_base_addr + iview->stencil_layer_size * layer),
1386 A6XX_RB_MRT_BASE_GMEM(0),
1387 );
1388
1389 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1390 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1391 }
1392
1393 template <chip CHIP>
1394 static void
r3d_dst_buffer(struct tu_cs * cs,enum pipe_format format,uint64_t va,uint32_t pitch,enum pipe_format src_format)1395 r3d_dst_buffer(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1396 enum pipe_format src_format)
1397 {
1398 struct tu_native_format fmt = blit_format_color(format, TILE6_LINEAR);
1399
1400 enum a6xx_format color_fmt = fmt.fmt;
1401 fixup_dst_format(src_format, &format, &color_fmt);
1402
1403 tu_cs_emit_regs(cs,
1404 RB_MRT_BUF_INFO(CHIP, 0, .color_format = color_fmt, .color_swap = fmt.swap),
1405 A6XX_RB_MRT_PITCH(0, pitch),
1406 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1407 A6XX_RB_MRT_BASE(0, .qword = va),
1408 A6XX_RB_MRT_BASE_GMEM(0, 0));
1409
1410 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1411 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1412 }
1413
1414 template <chip CHIP>
1415 static void
r3d_dst_gmem(struct tu_cmd_buffer * cmd,struct tu_cs * cs,const struct tu_image_view * iview,const struct tu_render_pass_attachment * att,bool separate_stencil,unsigned layer)1416 r3d_dst_gmem(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1417 const struct tu_image_view *iview,
1418 const struct tu_render_pass_attachment *att,
1419 bool separate_stencil, unsigned layer)
1420 {
1421 unsigned RB_MRT_BUF_INFO;
1422 unsigned gmem_offset;
1423
1424 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1425 if (!separate_stencil) {
1426 RB_MRT_BUF_INFO = tu_image_view_depth(iview, RB_MRT_BUF_INFO);
1427 gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1428 } else {
1429 RB_MRT_BUF_INFO = tu_image_view_stencil(iview, RB_MRT_BUF_INFO);
1430 gmem_offset = tu_attachment_gmem_offset_stencil(cmd, att, layer);
1431 }
1432 } else {
1433 RB_MRT_BUF_INFO = iview->view.RB_MRT_BUF_INFO;
1434 gmem_offset = tu_attachment_gmem_offset(cmd, att, layer);
1435 }
1436
1437 tu_cs_emit_regs(cs,
1438 RB_MRT_BUF_INFO(CHIP, 0, .dword = RB_MRT_BUF_INFO),
1439 A6XX_RB_MRT_PITCH(0, 0),
1440 A6XX_RB_MRT_ARRAY_PITCH(0, 0),
1441 A6XX_RB_MRT_BASE(0, 0),
1442 A6XX_RB_MRT_BASE_GMEM(0, gmem_offset));
1443
1444 enum a6xx_format color_format =
1445 (enum a6xx_format)(RB_MRT_BUF_INFO & A6XX_RB_MRT_BUF_INFO_COLOR_FORMAT__MASK);
1446 tu_cs_emit_regs(cs,
1447 A6XX_GRAS_LRZ_MRT_BUF_INFO_0(.color_format = color_format));
1448
1449 tu_cs_emit_regs(cs, RB_RENDER_CNTL(CHIP));
1450 tu_cs_emit_regs(cs, A7XX_GRAS_SU_RENDER_CNTL());
1451 }
1452
1453 static uint8_t
aspect_write_mask(enum pipe_format format,VkImageAspectFlags aspect_mask)1454 aspect_write_mask(enum pipe_format format, VkImageAspectFlags aspect_mask)
1455 {
1456 uint8_t mask = 0xf;
1457 assert(aspect_mask);
1458 /* note: the only format with partial writing is D24S8,
1459 * clear/blit uses the _AS_R8G8B8A8 format to access it
1460 */
1461 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1462 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1463 mask = 0x7;
1464 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1465 mask = 0x8;
1466 }
1467 return mask;
1468 }
1469
1470 static uint8_t
aspect_write_mask_generic_clear(enum pipe_format format,VkImageAspectFlags aspect_mask)1471 aspect_write_mask_generic_clear(enum pipe_format format, VkImageAspectFlags aspect_mask)
1472 {
1473 uint8_t mask = 0xf;
1474 assert(aspect_mask);
1475 /* note: the only format with partial writing is D24S8 */
1476 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
1477 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT)
1478 mask = 0x1;
1479 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1480 mask = 0x2;
1481 }
1482 return mask;
1483 }
1484
1485 enum r3d_blit_param {
1486 R3D_Z_SCALE = 1 << 0,
1487 R3D_DST_GMEM = 1 << 1,
1488 R3D_COPY = 1 << 2,
1489 };
1490
1491 template <chip CHIP>
1492 static void
r3d_setup(struct tu_cmd_buffer * cmd,struct tu_cs * cs,enum pipe_format src_format,enum pipe_format dst_format,VkImageAspectFlags aspect_mask,unsigned blit_param,bool clear,bool ubwc,VkSampleCountFlagBits samples)1493 r3d_setup(struct tu_cmd_buffer *cmd,
1494 struct tu_cs *cs,
1495 enum pipe_format src_format,
1496 enum pipe_format dst_format,
1497 VkImageAspectFlags aspect_mask,
1498 unsigned blit_param,
1499 bool clear,
1500 bool ubwc,
1501 VkSampleCountFlagBits samples)
1502 {
1503 if (!cmd->state.pass && cmd->device->dbg_renderpass_stomp_cs) {
1504 tu_cs_emit_call(cs, cmd->device->dbg_renderpass_stomp_cs);
1505 }
1506
1507 enum a6xx_format fmt = blit_base_format<CHIP>(dst_format, ubwc, false);
1508 fixup_dst_format(src_format, &dst_format, &fmt);
1509
1510 if (!cmd->state.pass) {
1511 tu_emit_cache_flush_ccu<CHIP>(cmd, cs, TU_CMD_CCU_SYSMEM);
1512 tu6_emit_window_scissor(cs, 0, 0, 0x3fff, 0x3fff);
1513 }
1514
1515 if (!(blit_param & R3D_DST_GMEM)) {
1516 if (CHIP == A6XX) {
1517 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL(.buffers_location = BUFFERS_IN_SYSMEM));
1518 } else {
1519 tu_cs_emit_regs(cs, A6XX_GRAS_BIN_CONTROL());
1520 }
1521
1522 tu_cs_emit_regs(cs, RB_BIN_CONTROL(CHIP, .buffers_location = BUFFERS_IN_SYSMEM));
1523
1524 if (CHIP >= A7XX) {
1525 tu_cs_emit_regs(cs, A7XX_RB_UNKNOWN_8812(0x3ff));
1526 tu_cs_emit_regs(cs,
1527 A7XX_RB_UNKNOWN_8E06(cmd->device->physical_device->info->a6xx.magic.RB_UNKNOWN_8E06));
1528 }
1529 }
1530
1531 enum r3d_type type;
1532 if (clear) {
1533 type = R3D_CLEAR;
1534 } else if ((blit_param & R3D_COPY) && tu_pipe_format_is_float16(src_format)) {
1535 /* Avoid canonicalizing NaNs in copies by using the special half-float
1536 * path that uses half regs.
1537 */
1538 type = R3D_COPY_HALF;
1539 } else {
1540 type = R3D_BLIT;
1541 }
1542
1543 r3d_common<CHIP>(cmd, cs, type, 1, blit_param & R3D_Z_SCALE, samples);
1544
1545 tu_cs_emit_regs(cs, A6XX_SP_FS_OUTPUT_CNTL1(.mrt = 1));
1546 tu_cs_emit_regs(cs, A6XX_RB_FS_OUTPUT_CNTL1(.mrt = 1));
1547 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
1548 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.sample_mask = 0xffff));
1549
1550 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
1551 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL());
1552 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL());
1553 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
1554 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL());
1555 tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL());
1556 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK());
1557 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK());
1558 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF());
1559
1560 tu_cs_emit_regs(cs, A6XX_SP_FS_MRT_REG(0,
1561 .color_format = fmt,
1562 .color_sint = util_format_is_pure_sint(dst_format),
1563 .color_uint = util_format_is_pure_uint(dst_format)));
1564
1565 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(0,
1566 .component_enable = aspect_write_mask(dst_format, aspect_mask)));
1567 tu_cs_emit_regs(cs, A6XX_RB_SRGB_CNTL(util_format_is_srgb(dst_format)));
1568 tu_cs_emit_regs(cs, A6XX_SP_SRGB_CNTL(util_format_is_srgb(dst_format)));
1569
1570 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
1571 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
1572
1573 if (CHIP >= A7XX)
1574 tu_cs_emit_regs(cs, A7XX_GRAS_LRZ_DEPTH_BUFFER_INFO());
1575
1576 tu_cs_emit_write_reg(cs, REG_A6XX_GRAS_SC_CNTL,
1577 A6XX_GRAS_SC_CNTL_CCUSINGLECACHELINESIZE(2));
1578
1579 /* Disable sample counting so as not to affect occlusion queries. */
1580 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
1581
1582 tu_cs_emit_regs(cs, A6XX_RB_DITHER_CNTL());
1583 if (CHIP >= A7XX) {
1584 tu_cs_emit_regs(cs, A7XX_SP_DITHER_CNTL());
1585 }
1586
1587 if (cmd->state.prim_generated_query_running_before_rp) {
1588 tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
1589 }
1590
1591 if (cmd->state.predication_active) {
1592 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1593 tu_cs_emit(cs, 0);
1594 }
1595 }
1596
1597 static void
1598 r3d_run(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1599 {
1600 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1601 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1602 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1603 CP_DRAW_INDX_OFFSET_0_VIS_CULL(IGNORE_VISIBILITY));
1604 tu_cs_emit(cs, 1); /* instance count */
1605 tu_cs_emit(cs, 2); /* vertex count */
1606 }
1607
1608 static void
1609 r3d_run_vis(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1610 {
1611 tu_cs_emit_pkt7(cs, CP_DRAW_INDX_OFFSET, 3);
1612 tu_cs_emit(cs, CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(DI_PT_RECTLIST) |
1613 CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(DI_SRC_SEL_AUTO_INDEX) |
1614 CP_DRAW_INDX_OFFSET_0_VIS_CULL(USE_VISIBILITY));
1615 tu_cs_emit(cs, 1); /* instance count */
1616 tu_cs_emit(cs, 2); /* vertex count */
1617 }
1618
1619 template <chip CHIP>
1620 static void
1621 r3d_teardown(struct tu_cmd_buffer *cmd, struct tu_cs *cs)
1622 {
1623 if (cmd->state.predication_active) {
1624 tu_cs_emit_pkt7(cs, CP_DRAW_PRED_ENABLE_LOCAL, 1);
1625 tu_cs_emit(cs, 1);
1626 }
1627
1628 /* Re-enable sample counting. */
1629 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
1630
1631 if (cmd->state.prim_generated_query_running_before_rp) {
1632 tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
1633 }
1634 }
1635
1636 /* blit ops - common interface for 2d/shader paths */
1637
1638 struct blit_ops {
1639 void (*coords)(struct tu_cmd_buffer *cmd,
1640 struct tu_cs *cs,
1641 const VkOffset2D dst,
1642 const VkOffset2D src,
1643 const VkExtent2D extent);
1644 void (*clear_value)(struct tu_cmd_buffer *cmd,
1645 struct tu_cs *cs,
1646 enum pipe_format format,
1647 const VkClearValue *val);
1648 void (*src)(
1649 struct tu_cmd_buffer *cmd,
1650 struct tu_cs *cs,
1651 const struct fdl6_view *iview,
1652 uint32_t layer,
1653 VkFilter filter,
1654 enum pipe_format dst_format);
1655 void (*src_buffer)(struct tu_cmd_buffer *cmd, struct tu_cs *cs,
1656 enum pipe_format format,
1657 uint64_t va, uint32_t pitch,
1658 uint32_t width, uint32_t height,
1659 enum pipe_format dst_format);
1660 void (*dst)(struct tu_cs *cs, const struct fdl6_view *iview, uint32_t layer,
1661 enum pipe_format src_format);
1662 void (*dst_depth)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1663 void (*dst_stencil)(struct tu_cs *cs, const struct tu_image_view *iview, uint32_t layer);
1664 void (*dst_buffer)(struct tu_cs *cs, enum pipe_format format, uint64_t va, uint32_t pitch,
1665 enum pipe_format src_format);
1666 void (*setup)(struct tu_cmd_buffer *cmd,
1667 struct tu_cs *cs,
1668 enum pipe_format src_format,
1669 enum pipe_format dst_format,
1670 VkImageAspectFlags aspect_mask,
1671 unsigned blit_param, /* CmdBlitImage: rotation in 2D path and z scaling in 3D path */
1672 bool clear,
1673 bool ubwc,
1674 VkSampleCountFlagBits samples);
1675 void (*run)(struct tu_cmd_buffer *cmd, struct tu_cs *cs);
1676 void (*teardown)(struct tu_cmd_buffer *cmd,
1677 struct tu_cs *cs);
1678 };
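/* Typical usage of these ops, mirroring the copy/blit paths below (a sketch;
 * format fixups and unaligned-access handling omitted):
 *
 *    ops->setup(cmd, cs, src_format, dst_format, aspect_mask, blit_param,
 *               false, ubwc, samples);
 *    coords(ops, cmd, cs, dst_offset, src_offset, extent);
 *    for (uint32_t i = 0; i < layers; i++) {
 *       ops->src(cmd, cs, &src_view, i, VK_FILTER_NEAREST, dst_format);
 *       ops->dst(cs, &dst_view, i, src_format);
 *       ops->run(cmd, cs);
 *    }
 *    ops->teardown(cmd, cs);
 */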
1679
1680 template <chip CHIP>
1681 static const struct blit_ops r2d_ops = {
1682 .coords = r2d_coords,
1683 .clear_value = r2d_clear_value,
1684 .src = r2d_src<CHIP>,
1685 .src_buffer = r2d_src_buffer<CHIP>,
1686 .dst = r2d_dst<CHIP>,
1687 .dst_depth = r2d_dst_depth,
1688 .dst_stencil = r2d_dst_stencil,
1689 .dst_buffer = r2d_dst_buffer,
1690 .setup = r2d_setup<CHIP>,
1691 .run = r2d_run,
1692 .teardown = r2d_teardown,
1693 };
1694
1695 template <chip CHIP>
1696 static const struct blit_ops r3d_ops = {
1697 .coords = r3d_coords,
1698 .clear_value = r3d_clear_value,
1699 .src = r3d_src,
1700 .src_buffer = r3d_src_buffer<CHIP>,
1701 .dst = r3d_dst<CHIP>,
1702 .dst_depth = r3d_dst_depth<CHIP>,
1703 .dst_stencil = r3d_dst_stencil<CHIP>,
1704 .dst_buffer = r3d_dst_buffer<CHIP>,
1705 .setup = r3d_setup<CHIP>,
1706 .run = r3d_run,
1707 .teardown = r3d_teardown<CHIP>,
1708 };
1709
1710 /* passthrough set coords from 3D extents */
1711 static void
1712 coords(const struct blit_ops *ops,
1713 struct tu_cmd_buffer *cmd,
1714 struct tu_cs *cs,
1715 const VkOffset3D dst,
1716 const VkOffset3D src,
1717 const VkExtent3D extent)
1718 {
1719 ops->coords(cmd, cs, (VkOffset2D) {dst.x, dst.y}, (VkOffset2D) {src.x, src.y},
1720 (VkExtent2D) {extent.width, extent.height});
1721 }
1722
1723 /* Decides the VK format to treat our data as for a memcpy-style blit. We have
1724 * to be a bit careful because we have to pick a format with matching UBWC
1725 * compression behavior, so we can't just return R8_UINT/R16_UINT/R32_UINT for
1726 * everything.
1727 */
1728 static enum pipe_format
1729 copy_format(VkFormat vk_format, VkImageAspectFlags aspect_mask)
1730 {
1731 if (vk_format_is_compressed(vk_format)) {
1732 switch (vk_format_get_blocksize(vk_format)) {
1733 case 1: return PIPE_FORMAT_R8_UINT;
1734 case 2: return PIPE_FORMAT_R16_UINT;
1735 case 4: return PIPE_FORMAT_R32_UINT;
1736 case 8: return PIPE_FORMAT_R32G32_UINT;
1737 case 16:return PIPE_FORMAT_R32G32B32A32_UINT;
1738 default:
1739 unreachable("unhandled format size");
1740 }
1741 }
1742
1743 enum pipe_format format = vk_format_to_pipe_format(vk_format);
1744
1745 /* For SNORM formats, copy them as the equivalent UNORM format. If we treat
1746 * them as snorm then the 0x80 (-1.0 snorm8) value will get clamped to 0x81
1747 * (also -1.0), when we're supposed to be memcpying the bits. See
1748 * https://gitlab.khronos.org/Tracker/vk-gl-cts/-/issues/2917 for discussion.
1749 */
1750 format = util_format_snorm_to_unorm(format);
1751
1752 switch (format) {
1753 case PIPE_FORMAT_R9G9B9E5_FLOAT:
1754 return PIPE_FORMAT_R32_UINT;
1755
1756 case PIPE_FORMAT_G8_B8R8_420_UNORM:
1757 if (aspect_mask == VK_IMAGE_ASPECT_PLANE_1_BIT)
1758 return PIPE_FORMAT_R8G8_UNORM;
1759 else
1760 return PIPE_FORMAT_Y8_UNORM;
1761 case PIPE_FORMAT_G8_B8_R8_420_UNORM:
1762 return PIPE_FORMAT_R8_UNORM;
1763
1764 case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
1765 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
1766 return PIPE_FORMAT_S8_UINT;
1767 assert(aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT);
1768 return PIPE_FORMAT_Z32_FLOAT;
1769
1770 default:
1771 return format;
1772 }
1773 }
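/* For example: BC1 blocks are 8 bytes, so BC1 images are copied as
 * R32G32_UINT; RGB9E5 is copied as R32_UINT; and the stencil aspect of
 * Z32_FLOAT_S8X24 is copied as S8_UINT (all per the cases above).
 */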
1774
1775 static void
1776 pack_blit_event_clear_value(const VkClearValue *val, enum pipe_format format, uint32_t clear_value[4])
1777 {
1778 switch (format) {
1779 case PIPE_FORMAT_Z24X8_UNORM:
1780 case PIPE_FORMAT_Z24_UNORM_S8_UINT:
1781 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 24) |
1782 val->depthStencil.stencil << 24;
1783 return;
1784 case PIPE_FORMAT_Z16_UNORM:
1785 clear_value[0] = tu_pack_float32_for_unorm(val->depthStencil.depth, 16);
1786 return;
1787 case PIPE_FORMAT_Z32_FLOAT:
1788 clear_value[0] = fui(val->depthStencil.depth);
1789 return;
1790 case PIPE_FORMAT_S8_UINT:
1791 clear_value[0] = val->depthStencil.stencil;
1792 return;
1793 default:
1794 break;
1795 }
1796
1797 float tmp[4];
1798 memcpy(tmp, val->color.float32, 4 * sizeof(float));
1799 if (util_format_is_srgb(format)) {
1800 for (int i = 0; i < 3; i++)
1801 tmp[i] = util_format_linear_to_srgb_float(tmp[i]);
1802 }
1803
1804 #define PACK_F(type) util_format_##type##_pack_rgba_float \
1805 ( (uint8_t*) &clear_value[0], 0, tmp, 0, 1, 1)
1806 switch (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_X)) {
1807 case 4:
1808 PACK_F(r4g4b4a4_unorm);
1809 break;
1810 case 5:
1811 if (util_format_get_component_bits(format, UTIL_FORMAT_COLORSPACE_RGB, PIPE_SWIZZLE_Y) == 6)
1812 PACK_F(r5g6b5_unorm);
1813 else
1814 PACK_F(r5g5b5a1_unorm);
1815 break;
1816 case 8:
1817 if (util_format_is_snorm(format))
1818 PACK_F(r8g8b8a8_snorm);
1819 else if (util_format_is_unorm(format))
1820 PACK_F(r8g8b8a8_unorm);
1821 else
1822 pack_int8(clear_value, val->color.uint32);
1823 break;
1824 case 10:
1825 if (util_format_is_pure_integer(format))
1826 pack_int10_2(clear_value, val->color.uint32);
1827 else
1828 PACK_F(r10g10b10a2_unorm);
1829 break;
1830 case 11:
1831 clear_value[0] = float3_to_r11g11b10f(val->color.float32);
1832 break;
1833 case 16:
1834 if (util_format_is_snorm(format))
1835 PACK_F(r16g16b16a16_snorm);
1836 else if (util_format_is_unorm(format))
1837 PACK_F(r16g16b16a16_unorm);
1838 else if (util_format_is_float(format))
1839 PACK_F(r16g16b16a16_float);
1840 else
1841 pack_int16(clear_value, val->color.uint32);
1842 break;
1843 case 32:
1844 memcpy(clear_value, val->color.float32, 4 * sizeof(float));
1845 break;
1846 case 0:
1847 assert(format == PIPE_FORMAT_A8_UNORM);
1848 PACK_F(a8_unorm);
1849 break;
1850 default:
1851 unreachable("unexpected channel size");
1852 }
1853 #undef PACK_F
1854 }
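/* Worked example (illustrative): clearing D24S8 with depth = 1.0 and
 * stencil = 0xff packs tu_pack_float32_for_unorm(1.0, 24) = 0xffffff into
 * the low 24 bits and the stencil byte into bits 24-31, so clear_value[0]
 * ends up as 0xffffffff.
 */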
1855
1856 static void
1857 event_blit_setup(struct tu_cs *cs,
1858 const struct tu_render_pass_attachment *att,
1859 enum a6xx_blit_event_type blit_event_type,
1860 uint32_t clear_mask)
1861 {
1862 tu_cs_emit_regs(
1863 cs, A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
1864
1865 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
1866 tu_cs_emit(cs, 0);
1867
1868 tu_cs_emit_regs(
1869 cs,
1870 A6XX_RB_BLIT_INFO(.type = blit_event_type,
1871 .sample_0 =
1872 vk_format_is_int(att->format) ||
1873 vk_format_is_depth_or_stencil(att->format),
1874 .depth = vk_format_is_depth_or_stencil(att->format),
1875 .clear_mask = clear_mask, ));
1876 }
1877
1878 struct event_blit_dst_view {
1879 const struct tu_image *image;
1880 const struct fdl6_view *view;
1881
1882 uint32_t layer;
1883
1884 uint64_t depth_addr;
1885 uint32_t depth_pitch;
1886
1887 uint64_t stencil_addr;
1888 uint32_t stencil_pitch;
1889 };
1890
1891 static event_blit_dst_view
1892 blt_view_from_tu_view(const struct tu_image_view *iview,
1893 uint32_t layer)
1894 {
1895 struct event_blit_dst_view blt_view;
1896 blt_view.image = iview->image;
1897 blt_view.view = &iview->view;
1898 blt_view.layer = layer;
1899
1900 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1901 blt_view.depth_addr =
1902 iview->depth_base_addr + iview->depth_layer_size * layer;
1903 blt_view.depth_pitch = iview->depth_pitch;
1904
1905 blt_view.stencil_addr =
1906 iview->stencil_base_addr + iview->stencil_layer_size * layer;
1907 blt_view.stencil_pitch = iview->stencil_pitch;
1908 }
1909 return blt_view;
1910 }
1911
1912 template <chip CHIP>
1913 static void
1914 event_blit_run(struct tu_cmd_buffer *cmd,
1915 struct tu_cs *cs,
1916 const struct tu_render_pass_attachment *att,
1917 const event_blit_dst_view *blt_view,
1918 bool separate_stencil)
1919 {
1920 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 4);
1921 if (blt_view->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
1922 if (!separate_stencil) {
1923 tu_cs_emit(cs, tu_fdl_view_depth(blt_view->view, RB_BLIT_DST_INFO));
1924 tu_cs_emit_qw(cs, blt_view->depth_addr);
1925 tu_cs_emit(cs, A6XX_RB_2D_DST_PITCH(blt_view->depth_pitch).value);
1926
1927 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1928 tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1929 } else {
1930 tu_cs_emit(cs, tu_fdl_view_stencil(blt_view->view, RB_BLIT_DST_INFO) &
1931 ~A6XX_RB_BLIT_DST_INFO_FLAGS);
1932 tu_cs_emit_qw(cs, blt_view->stencil_addr);
1933 tu_cs_emit(cs, A6XX_RB_BLIT_DST_PITCH(blt_view->stencil_pitch).value);
1934 }
1935 } else {
1936 tu_cs_emit(cs, blt_view->view->RB_BLIT_DST_INFO);
1937 tu_cs_image_ref_2d<CHIP>(cs, blt_view->view, blt_view->layer, false);
1938
1939 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_FLAG_DST, 3);
1940 tu_cs_image_flag_ref(cs, blt_view->view, blt_view->layer);
1941 }
1942
1943 if (att) {
1944 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT && separate_stencil) {
1945 tu_cs_emit_regs(
1946 cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset_stencil(
1947 cmd, att, blt_view->layer)));
1948 } else {
1949 tu_cs_emit_regs(cs, A6XX_RB_BLIT_BASE_GMEM(tu_attachment_gmem_offset(
1950 cmd, att, blt_view->layer)));
1951 }
1952 }
1953
1954 tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
1955 }
1956
1957 static void
1958 tu7_generic_layer_clear(struct tu_cmd_buffer *cmd,
1959 struct tu_cs *cs,
1960 enum pipe_format format,
1961 uint8_t clear_mask,
1962 bool separate_stencil,
1963 uint32_t layer,
1964 const VkClearValue *value,
1965 uint32_t a)
1966 {
1967 const struct tu_render_pass_attachment *att =
1968 &cmd->state.pass->attachments[a];
1969 const struct tu_image_view *iview = cmd->state.attachments[a];
1970
1971 uint32_t clear_vals[4] = {};
1972 pack_blit_event_clear_value(value, format, clear_vals);
1973
1974 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
1975 tu_cs_emit_array(cs, clear_vals, 4);
1976
1977 event_blit_dst_view blt_view = blt_view_from_tu_view(iview, layer);
1978
1979 event_blit_setup(cs, att, BLIT_EVENT_CLEAR, clear_mask);
1980 event_blit_run<A7XX>(cmd, cs, att, &blt_view, separate_stencil);
1981 }
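/* For reference, the sequence above is: pack the clear value for the
 * attachment's format, load it into RB_BLIT_CLEAR_COLOR_DW0..3, configure
 * the blit unit with BLIT_EVENT_CLEAR, then trigger the actual clear with
 * the FD_BLIT event in event_blit_run().
 */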
1982
1983
1984
1985 /* Buffer copies/fills/updates go through the CCU but need additional
1986 * synchronization when the write range is not aligned to 64 bytes, because
1987 * dst buffer accesses use either R8_UNORM or R32_UINT, which are not
1988 * coherent with each other in the CCU since the format seems to be part of
1989 * the cache key.
1990 *
1991 * See: https://gitlab.khronos.org/vulkan/vulkan/-/issues/3306
1992 *
1993 * Synchronization with writes from UCHE (e.g. SSBO stores) is solved by the
1994 * fact that UCHE has byte-level dirtiness tracking and that a CCU flush
1995 * always happens before a UCHE flush in such cases (e.g. both renderpass
1996 * and dispatch flush pending CCU writes).
1997 *
1998 * Additionally see:
1999 * https://gitlab.khronos.org/vulkan/vulkan/-/issues/3398#note_400111
2000 */
2001 template <chip CHIP>
2002 static void
2003 handle_buffer_unaligned_store(struct tu_cmd_buffer *cmd,
2004 uint64_t dst_va,
2005 uint64_t size,
2006 bool *unaligned_store)
2007 {
2008 if (*unaligned_store)
2009 return;
2010
2011 if ((dst_va & 63) || (size & 63)) {
2012 tu_flush_for_access(&cmd->state.cache, TU_ACCESS_NONE,
2013 TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE);
2014 /* Wait for invalidations to land. */
2015 cmd->state.cache.flush_bits |= TU_CMD_FLAG_WAIT_FOR_IDLE;
2016 tu_emit_cache_flush<CHIP>(cmd);
2017 *unaligned_store = true;
2018 }
2019 }
2020
2021 template <chip CHIP>
2022 static void
2023 after_buffer_unaligned_buffer_store(struct tu_cmd_buffer *cmd,
2024 bool unaligned_store)
2025 {
2026 if (unaligned_store) {
2027 tu_flush_for_access(&cmd->state.cache,
2028 TU_ACCESS_CCU_COLOR_INCOHERENT_WRITE,
2029 TU_ACCESS_NONE);
2030 }
2031 }
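/* Usage pattern (see the buffer copy/fill/update paths below): call
 * handle_buffer_unaligned_store() before emitting blits to a destination
 * range and after_buffer_unaligned_buffer_store() once all regions are
 * emitted, so the extra flush/wait is only paid when an unaligned store
 * actually occurred.
 */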
2032
2033 template <chip CHIP>
2034 void
2035 tu6_clear_lrz(struct tu_cmd_buffer *cmd,
2036 struct tu_cs *cs,
2037 struct tu_image *image,
2038 const VkClearValue *value)
2039 {
2040 const struct blit_ops *ops = &r2d_ops<CHIP>;
2041
2042 /* It is assumed that the LRZ cache has been invalidated at this point,
2043 * so that the writes here become visible to LRZ.
2044 *
2045 * LRZ writes go through the UCHE cache, so flush UCHE before changing LRZ
2046 * via CCU. There is no need to invalidate CCU since we presumably write
2047 * whole cache lines, which we assume to be 64 bytes.
2048 */
2049 tu_emit_event_write<CHIP>(cmd, &cmd->cs, FD_CACHE_CLEAN);
2050
2051 ops->setup(cmd, cs, PIPE_FORMAT_Z16_UNORM, PIPE_FORMAT_Z16_UNORM,
2052 VK_IMAGE_ASPECT_DEPTH_BIT, 0, true, false,
2053 VK_SAMPLE_COUNT_1_BIT);
2054 ops->clear_value(cmd, cs, PIPE_FORMAT_Z16_UNORM, value);
2055 ops->dst_buffer(cs, PIPE_FORMAT_Z16_UNORM,
2056 image->iova + image->lrz_offset,
2057 image->lrz_pitch * 2, PIPE_FORMAT_Z16_UNORM);
2058 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord,
2059 (VkExtent2D) { image->lrz_pitch, image->lrz_height });
2060 ops->run(cmd, cs);
2061 ops->teardown(cmd, cs);
2062
2063 /* Clearing writes via CCU color in the PS stage, and LRZ is read via
2064 * UCHE in the earlier GRAS stage.
2065 */
2066 cmd->state.cache.flush_bits |=
2067 TU_CMD_FLAG_CCU_CLEAN_COLOR | TU_CMD_FLAG_CACHE_INVALIDATE |
2068 TU_CMD_FLAG_WAIT_FOR_IDLE;
2069 }
2070 TU_GENX(tu6_clear_lrz);
2071
2072 template <chip CHIP>
2073 void
2074 tu6_dirty_lrz_fc(struct tu_cmd_buffer *cmd,
2075 struct tu_cs *cs,
2076 struct tu_image *image)
2077 {
2078 const struct blit_ops *ops = &r2d_ops<CHIP>;
2079 VkClearValue clear = {};
2080 clear.color.uint32[0] = 0xffffffff;
2081
2082 using LRZFC = fd_lrzfc_layout<CHIP>;
2083 uint64_t lrz_fc_iova = image->iova + image->lrz_fc_offset;
2084 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2085 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2086 VK_SAMPLE_COUNT_1_BIT);
2087 ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear);
2088 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2089 lrz_fc_iova + offsetof(LRZFC, fc1),
2090 sizeof(LRZFC::fc1),
2091 PIPE_FORMAT_R32_UINT);
2092 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2093 sizeof(LRZFC::fc1) / sizeof(uint32_t), 1
2094 });
2095 ops->run(cmd, cs);
2096 if constexpr (LRZFC::HAS_BIDIR) {
2097 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT,
2098 lrz_fc_iova + offsetof(LRZFC, fc2),
2099 sizeof(LRZFC::fc2),
2100 PIPE_FORMAT_R32_UINT);
2101 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
2102 sizeof(LRZFC::fc2) / sizeof(uint32_t), 1
2103 });
2104 ops->run(cmd, cs);
2105 }
2106 ops->teardown(cmd, cs);
2107 }
2108 TU_GENX(tu6_dirty_lrz_fc);
2109
2110 template<chip CHIP>
2111 static void
2112 tu_image_view_copy_blit(struct fdl6_view *iview,
2113 struct tu_image *image,
2114 enum pipe_format format,
2115 const VkImageSubresourceLayers *subres,
2116 uint32_t layer,
2117 bool z_scale)
2118 {
2119 VkImageAspectFlags aspect_mask = subres->aspectMask;
2120
2121 /* always use the AS_R8G8B8A8 format for these */
2122 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT ||
2123 format == PIPE_FORMAT_Z24X8_UNORM) {
2124 aspect_mask = VK_IMAGE_ASPECT_COLOR_BIT;
2125 }
2126
2127 const struct fdl_layout *layout =
2128 &image->layout[tu6_plane_index(image->vk.format, aspect_mask)];
2129
2130 const struct fdl_view_args args = {
2131 .chip = CHIP,
2132 .iova = image->iova,
2133 .base_miplevel = subres->mipLevel,
2134 .level_count = 1,
2135 .base_array_layer = subres->baseArrayLayer + layer,
2136 .layer_count = 1,
2137 .swiz = {
2138 PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W
2139 },
2140 .format = tu_format_for_aspect(format, aspect_mask),
2141 .type = z_scale ? FDL_VIEW_TYPE_3D : FDL_VIEW_TYPE_2D,
2142 .ubwc_fc_mutable = image->ubwc_fc_mutable,
2143 };
2144 fdl6_view_init(iview, &layout, &args, false);
2145 }
2146
2147 template<chip CHIP>
2148 static void
2149 tu_image_view_copy(struct fdl6_view *iview,
2150 struct tu_image *image,
2151 enum pipe_format format,
2152 const VkImageSubresourceLayers *subres,
2153 uint32_t layer)
2154 {
2155 tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2156 }
2157
2158 template<chip CHIP>
2159 static void
2160 tu_image_view_blit(struct fdl6_view *iview,
2161 struct tu_image *image,
2162 const VkImageSubresourceLayers *subres,
2163 uint32_t layer)
2164 {
2165 enum pipe_format format =
2166 tu6_plane_format(image->vk.format, tu6_plane_index(image->vk.format,
2167 subres->aspectMask));
2168 tu_image_view_copy_blit<CHIP>(iview, image, format, subres, layer, false);
2169 }
2170
2171 template <chip CHIP>
2172 static void
2173 tu6_blit_image(struct tu_cmd_buffer *cmd,
2174 struct tu_image *src_image,
2175 struct tu_image *dst_image,
2176 const VkImageBlit2 *info,
2177 VkFilter filter)
2178 {
2179 const struct blit_ops *ops = &r2d_ops<CHIP>;
2180 struct tu_cs *cs = &cmd->cs;
2181 bool z_scale = false;
2182 uint32_t layers = info->dstOffsets[1].z - info->dstOffsets[0].z;
2183
2184 /* The 2D blitter can't mirror from coordinates alone, so mirroring is expressed via the rotation parameter. */
2185 static const enum a6xx_rotation rotate[2][2] = {
2186 {ROTATE_0, ROTATE_HFLIP},
2187 {ROTATE_VFLIP, ROTATE_180},
2188 };
2189
2190 bool mirror_x = (info->srcOffsets[1].x < info->srcOffsets[0].x) !=
2191 (info->dstOffsets[1].x < info->dstOffsets[0].x);
2192 bool mirror_y = (info->srcOffsets[1].y < info->srcOffsets[0].y) !=
2193 (info->dstOffsets[1].y < info->dstOffsets[0].y);
2194
2195 int32_t src0_z = info->srcOffsets[0].z;
2196 int32_t src1_z = info->srcOffsets[1].z;
2197
2198 if ((info->srcOffsets[1].z - info->srcOffsets[0].z !=
2199 info->dstOffsets[1].z - info->dstOffsets[0].z) ||
2200 info->srcOffsets[1].z < info->srcOffsets[0].z) {
2201 z_scale = true;
2202 }
2203
2204 if (info->dstOffsets[1].z < info->dstOffsets[0].z) {
2205 layers = info->dstOffsets[0].z - info->dstOffsets[1].z;
2206 src0_z = info->srcOffsets[1].z;
2207 src1_z = info->srcOffsets[0].z;
2208 }
2209
2210 if (vk_image_subresource_layer_count(&dst_image->vk, &info->dstSubresource) > 1) {
2211 assert(layers <= 1);
2212 layers = vk_image_subresource_layer_count(&dst_image->vk,
2213 &info->dstSubresource);
2214 }
2215
2216 /* BC1_RGB_* formats need to have their last component overridden to 1
2217 * when sampling, which is normally handled with the texture descriptor
2218 * swizzle. The 2d path can't handle that, so use the 3d path.
2219 *
2220 * TODO: we could use RB_2D_BLIT_CNTL::MASK to make these formats work with
2221 * the 2d path.
2222 */
2223
2224 unsigned blit_param = rotate[mirror_y][mirror_x];
2225 if (dst_image->layout[0].nr_samples > 1 ||
2226 src_image->vk.format == VK_FORMAT_BC1_RGB_UNORM_BLOCK ||
2227 src_image->vk.format == VK_FORMAT_BC1_RGB_SRGB_BLOCK ||
2228 filter == VK_FILTER_CUBIC_EXT ||
2229 z_scale) {
2230 ops = &r3d_ops<CHIP>;
2231 blit_param = z_scale ? R3D_Z_SCALE : 0;
2232 }
2233
2234 /* use the right format in setup() for D32_S8
2235 * TODO: this probably should use a helper
2236 */
2237 enum pipe_format src_format =
2238 tu6_plane_format(src_image->vk.format,
2239 tu6_plane_index(src_image->vk.format,
2240 info->srcSubresource.aspectMask));
2241 enum pipe_format dst_format =
2242 tu6_plane_format(dst_image->vk.format,
2243 tu6_plane_index(dst_image->vk.format,
2244 info->dstSubresource.aspectMask));
2245 trace_start_blit(&cmd->trace, cs,
2246 ops == &r3d_ops<CHIP>,
2247 src_image->vk.format,
2248 dst_image->vk.format,
2249 layers);
2250
2251 ops->setup(cmd, cs, src_format, dst_format, info->dstSubresource.aspectMask,
2252 blit_param, false, dst_image->layout[0].ubwc,
2253 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2254
2255 if (ops == &r3d_ops<CHIP>) {
2256 const float coords[] = { info->dstOffsets[0].x, info->dstOffsets[0].y,
2257 info->srcOffsets[0].x, info->srcOffsets[0].y,
2258 info->dstOffsets[1].x, info->dstOffsets[1].y,
2259 info->srcOffsets[1].x, info->srcOffsets[1].y };
2260 r3d_coords_raw(cmd, cs, coords);
2261 } else {
2262 tu_cs_emit_regs(cs,
2263 A6XX_GRAS_2D_DST_TL(.x = MIN2(info->dstOffsets[0].x, info->dstOffsets[1].x),
2264 .y = MIN2(info->dstOffsets[0].y, info->dstOffsets[1].y)),
2265 A6XX_GRAS_2D_DST_BR(.x = MAX2(info->dstOffsets[0].x, info->dstOffsets[1].x) - 1,
2266 .y = MAX2(info->dstOffsets[0].y, info->dstOffsets[1].y) - 1));
2267 tu_cs_emit_regs(cs,
2268 A6XX_GRAS_2D_SRC_TL_X(MIN2(info->srcOffsets[0].x, info->srcOffsets[1].x)),
2269 A6XX_GRAS_2D_SRC_BR_X(MAX2(info->srcOffsets[0].x, info->srcOffsets[1].x) - 1),
2270 A6XX_GRAS_2D_SRC_TL_Y(MIN2(info->srcOffsets[0].y, info->srcOffsets[1].y)),
2271 A6XX_GRAS_2D_SRC_BR_Y(MAX2(info->srcOffsets[0].y, info->srcOffsets[1].y) - 1));
2272 }
2273
2274 struct fdl6_view dst, src;
2275 tu_image_view_blit<CHIP>(
2276 &dst, dst_image, &info->dstSubresource,
2277 MIN2(info->dstOffsets[0].z, info->dstOffsets[1].z));
2278
2279 if (z_scale) {
2280 tu_image_view_copy_blit<CHIP>(&src, src_image, src_format,
2281 &info->srcSubresource, 0, true);
2282 ops->src(cmd, cs, &src, 0, filter, dst_format);
2283 } else {
2284 tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffsets[0].z);
2285 }
2286
2287 for (uint32_t i = 0; i < layers; i++) {
2288 if (z_scale) {
2289 float t = ((float) i + 0.5f) / (float) layers;
2290 r3d_coord_z(cmd, cs, t * (src1_z - src0_z) + src0_z);
2291 } else {
2292 ops->src(cmd, cs, &src, i, filter, dst_format);
2293 }
2294 ops->dst(cs, &dst, i, src_format);
2295 ops->run(cmd, cs);
2296 }
2297
2298 ops->teardown(cmd, cs);
2299
2300 trace_end_blit(&cmd->trace, cs);
2301 }
2302
2303 template <chip CHIP>
2304 VKAPI_ATTR void VKAPI_CALL
2305 tu_CmdBlitImage2(VkCommandBuffer commandBuffer,
2306 const VkBlitImageInfo2 *pBlitImageInfo)
2308 {
2309 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2310 VK_FROM_HANDLE(tu_image, src_image, pBlitImageInfo->srcImage);
2311 VK_FROM_HANDLE(tu_image, dst_image, pBlitImageInfo->dstImage);
2312
2313 for (uint32_t i = 0; i < pBlitImageInfo->regionCount; ++i) {
2314 /* can't blit both depth and stencil at once with D32_S8
2315 * TODO: more advanced 3D blit path to support it instead?
2316 */
2317 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT ||
2318 dst_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2319 VkImageBlit2 region = pBlitImageInfo->pRegions[i];
2320 u_foreach_bit(b, region.dstSubresource.aspectMask) {
2321 region.srcSubresource.aspectMask = BIT(b);
2322 region.dstSubresource.aspectMask = BIT(b);
2323 tu6_blit_image<CHIP>(cmd, src_image, dst_image, &region, pBlitImageInfo->filter);
2324 }
2325 continue;
2326 }
2327 tu6_blit_image<CHIP>(cmd, src_image, dst_image, pBlitImageInfo->pRegions + i,
2328 pBlitImageInfo->filter);
2329 }
2330
2331 if (dst_image->lrz_height) {
2332 tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2333 }
2334 }
2335 TU_GENX(tu_CmdBlitImage2);
2336
2337 static void
2338 copy_compressed(VkFormat format,
2339 VkOffset3D *offset,
2340 VkExtent3D *extent,
2341 uint32_t *width,
2342 uint32_t *height)
2343 {
2344 if (!vk_format_is_compressed(format))
2345 return;
2346
2347 uint32_t block_width = vk_format_get_blockwidth(format);
2348 uint32_t block_height = vk_format_get_blockheight(format);
2349
2350 offset->x /= block_width;
2351 offset->y /= block_height;
2352
2353 if (extent) {
2354 extent->width = DIV_ROUND_UP(extent->width, block_width);
2355 extent->height = DIV_ROUND_UP(extent->height, block_height);
2356 }
2357 if (width)
2358 *width = DIV_ROUND_UP(*width, block_width);
2359 if (height)
2360 *height = DIV_ROUND_UP(*height, block_height);
2361 }
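/* Example (illustrative): for BC1, with 4x4 texel blocks, an imageOffset of
 * (8, 4) becomes block coordinate (2, 1), and a 10x10 texel extent becomes
 * DIV_ROUND_UP(10, 4) = 3 blocks in each dimension.
 */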
2362
2363 template <chip CHIP>
2364 static void
2365 tu_copy_buffer_to_image(struct tu_cmd_buffer *cmd,
2366 struct tu_buffer *src_buffer,
2367 struct tu_image *dst_image,
2368 const VkBufferImageCopy2 *info)
2369 {
2370 struct tu_cs *cs = &cmd->cs;
2371 uint32_t layers = MAX2(info->imageExtent.depth,
2372 vk_image_subresource_layer_count(&dst_image->vk,
2373 &info->imageSubresource));
2374 enum pipe_format src_format =
2375 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2376 enum pipe_format dst_format =
2377 copy_format(dst_image->vk.format, info->imageSubresource.aspectMask);
2378 const struct blit_ops *ops = &r2d_ops<CHIP>;
2379
2380 /* special case for buffer to stencil */
2381 if (dst_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2382 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2383 src_format = PIPE_FORMAT_S8_UINT;
2384 }
2385
2386 /* note: could use "R8_UNORM" when no UBWC */
2387 unsigned blit_param = 0;
2388 if (src_format == PIPE_FORMAT_Y8_UNORM ||
2389 tu_pipe_format_is_float16(src_format)) {
2390 ops = &r3d_ops<CHIP>;
2391 blit_param = R3D_COPY;
2392 }
2393
2394 VkOffset3D offset = info->imageOffset;
2395 VkExtent3D extent = info->imageExtent;
2396 uint32_t src_width = info->bufferRowLength ?: extent.width;
2397 uint32_t src_height = info->bufferImageHeight ?: extent.height;
2398
2399 copy_compressed(dst_image->vk.format, &offset, &extent, &src_width, &src_height);
2400
2401 uint32_t pitch = src_width * util_format_get_blocksize(src_format);
2402 uint32_t layer_size = src_height * pitch;
2403
2404 ops->setup(cmd, cs, src_format, dst_format,
2405 info->imageSubresource.aspectMask, blit_param, false, dst_image->layout[0].ubwc,
2406 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2407
2408 struct fdl6_view dst;
2409 tu_image_view_copy<CHIP>(&dst, dst_image, dst_format,
2410 &info->imageSubresource, offset.z);
2411
2412 for (uint32_t i = 0; i < layers; i++) {
2413 ops->dst(cs, &dst, i, src_format);
2414
2415 uint64_t src_va = src_buffer->iova + info->bufferOffset + layer_size * i;
2416 if ((src_va & 63) || (pitch & 63)) {
2417 for (uint32_t y = 0; y < extent.height; y++) {
2418 uint32_t x = (src_va & 63) / util_format_get_blocksize(src_format);
2419 ops->src_buffer(cmd, cs, src_format, src_va & ~63, pitch,
2420 x + extent.width, 1, dst_format);
2421 ops->coords(cmd, cs, (VkOffset2D) {offset.x, offset.y + y}, (VkOffset2D) {x},
2422 (VkExtent2D) {extent.width, 1});
2423 ops->run(cmd, cs);
2424 src_va += pitch;
2425 }
2426 } else {
2427 ops->src_buffer(cmd, cs, src_format, src_va, pitch, extent.width, extent.height, dst_format);
2428 coords(ops, cmd, cs, offset, (VkOffset3D) {}, extent);
2429 ops->run(cmd, cs);
2430 }
2431 }
2432
2433 ops->teardown(cmd, cs);
2434 }
2435
2436 template <chip CHIP>
2437 VKAPI_ATTR void VKAPI_CALL
2438 tu_CmdCopyBufferToImage2(VkCommandBuffer commandBuffer,
2439 const VkCopyBufferToImageInfo2 *pCopyBufferToImageInfo)
2440 {
2441 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2442 VK_FROM_HANDLE(tu_image, dst_image, pCopyBufferToImageInfo->dstImage);
2443 VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferToImageInfo->srcBuffer);
2444
2445 for (unsigned i = 0; i < pCopyBufferToImageInfo->regionCount; ++i)
2446 tu_copy_buffer_to_image<CHIP>(cmd, src_buffer, dst_image,
2447 pCopyBufferToImageInfo->pRegions + i);
2448
2449 if (dst_image->lrz_height) {
2450 tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2451 }
2452 }
2453 TU_GENX(tu_CmdCopyBufferToImage2);
2454
2455 template <chip CHIP>
2456 static void
2457 tu_copy_image_to_buffer(struct tu_cmd_buffer *cmd,
2458 struct tu_image *src_image,
2459 struct tu_buffer *dst_buffer,
2460 const VkBufferImageCopy2 *info,
2461 bool *unaligned_store)
2462 {
2463 struct tu_cs *cs = &cmd->cs;
2464 uint32_t layers = MAX2(info->imageExtent.depth,
2465 vk_image_subresource_layer_count(&src_image->vk,
2466 &info->imageSubresource));
2467 enum pipe_format dst_format =
2468 copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2469 enum pipe_format src_format =
2470 copy_format(src_image->vk.format, info->imageSubresource.aspectMask);
2471 const struct blit_ops *ops = &r2d_ops<CHIP>;
2472
2473 if (src_image->vk.format == VK_FORMAT_D24_UNORM_S8_UINT &&
2474 info->imageSubresource.aspectMask == VK_IMAGE_ASPECT_STENCIL_BIT) {
2475 dst_format = PIPE_FORMAT_S8_UINT;
2476 }
2477
2478 /* note: could use "R8_UNORM" when no UBWC */
2479 unsigned blit_param = 0;
2480 if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2481 tu_pipe_format_is_float16(src_format)) {
2482 ops = &r3d_ops<CHIP>;
2483 blit_param = R3D_COPY;
2484 }
2485
2486 VkOffset3D offset = info->imageOffset;
2487 VkExtent3D extent = info->imageExtent;
2488 uint32_t dst_width = info->bufferRowLength ?: extent.width;
2489 uint32_t dst_height = info->bufferImageHeight ?: extent.height;
2490
2491 copy_compressed(src_image->vk.format, &offset, &extent, &dst_width, &dst_height);
2492
2493 uint32_t pitch = dst_width * util_format_get_blocksize(dst_format);
2494 uint32_t layer_size = pitch * dst_height;
2495
2496 handle_buffer_unaligned_store<CHIP>(cmd,
2497 dst_buffer->iova + info->bufferOffset,
2498 layer_size * layers, unaligned_store);
2499
2500 ops->setup(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2501 VK_SAMPLE_COUNT_1_BIT);
2502
2503 struct fdl6_view src;
2504 tu_image_view_copy<CHIP>(&src, src_image, src_format,
2505 &info->imageSubresource, offset.z);
2506
2507 for (uint32_t i = 0; i < layers; i++) {
2508 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2509
2510 uint64_t dst_va = dst_buffer->iova + info->bufferOffset + layer_size * i;
2511 if ((dst_va & 63) || (pitch & 63)) {
2512 for (uint32_t y = 0; y < extent.height; y++) {
2513 uint32_t x = (dst_va & 63) / util_format_get_blocksize(dst_format);
2514 ops->dst_buffer(cs, dst_format, dst_va & ~63, 0, src_format);
2515 ops->coords(cmd, cs, (VkOffset2D) {x}, (VkOffset2D) {offset.x, offset.y + y},
2516 (VkExtent2D) {extent.width, 1});
2517 ops->run(cmd, cs);
2518 dst_va += pitch;
2519 }
2520 } else {
2521 ops->dst_buffer(cs, dst_format, dst_va, pitch, src_format);
2522 coords(ops, cmd, cs, (VkOffset3D) {0, 0}, offset, extent);
2523 ops->run(cmd, cs);
2524 }
2525 }
2526
2527 ops->teardown(cmd, cs);
2528 }
2529
2530 template <chip CHIP>
2531 VKAPI_ATTR void VKAPI_CALL
2532 tu_CmdCopyImageToBuffer2(VkCommandBuffer commandBuffer,
2533 const VkCopyImageToBufferInfo2 *pCopyImageToBufferInfo)
2534 {
2535 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2536 VK_FROM_HANDLE(tu_image, src_image, pCopyImageToBufferInfo->srcImage);
2537 VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyImageToBufferInfo->dstBuffer);
2538
2539 bool unaligned_store = false;
2540 for (unsigned i = 0; i < pCopyImageToBufferInfo->regionCount; ++i)
2541 tu_copy_image_to_buffer<CHIP>(cmd, src_image, dst_buffer,
2542 pCopyImageToBufferInfo->pRegions + i,
2543 &unaligned_store);
2544
2545 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2546 }
2547 TU_GENX(tu_CmdCopyImageToBuffer2);
2548
2549 /* Tiled formats don't support swapping, which means that we can't support
2550 * formats that require a non-WZYX swap like B8G8R8A8 natively. Also, some
2551 * formats like B5G5R5A1 have a separate linear-only format when sampling.
2552 * Currently we fake support for tiled swapped formats and use the unswapped
2553 * format instead, but this means that reinterpreting copies to and from
2554 * swapped formats can't be performed correctly unless we can swizzle the
2555 * components by reinterpreting the other image as the "correct" swapped
2556 * format, i.e. only when the other image is linear.
2557 */
2558
2559 template <chip CHIP>
2560 static bool
2561 is_swapped_format(enum pipe_format format)
2562 {
2563 struct tu_native_format linear = blit_format_texture<CHIP>(format, TILE6_LINEAR, false);
2564 struct tu_native_format tiled = blit_format_texture<CHIP>(format, TILE6_3, false);
2565 return linear.fmt != tiled.fmt || linear.swap != tiled.swap;
2566 }
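/* E.g. B8G8R8A8 needs a non-WZYX component swap when linear but is faked
 * with the unswapped format when tiled (see the comment above), so its
 * linear and tiled descriptors differ and it is reported as swapped here.
 */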
2567
2568 /* R8G8_* formats have a different tiling layout than other cpp=2 formats, and
2569 * therefore R8G8 images can't be reinterpreted as non-R8G8 images (and vice
2570 * versa). This should mirror the logic in fdl6_layout.
2571 */
2572 static bool
2573 image_is_r8g8(struct tu_image *image)
2574 {
2575 return image->layout[0].cpp == 2 &&
2576 vk_format_get_nr_components(image->vk.format) == 2;
2577 }
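/* E.g. R8G8_UNORM is "r8g8" here while R16_UNORM, which also has cpp=2 but
 * only one component, is not, so the two can't be reinterpreted into each
 * other for copies and a staging blit is used instead.
 */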
2578
2579 template <chip CHIP>
2580 static void
2581 tu_copy_image_to_image(struct tu_cmd_buffer *cmd,
2582 struct tu_image *src_image,
2583 struct tu_image *dst_image,
2584 const VkImageCopy2 *info)
2585 {
2586 const struct blit_ops *ops = &r2d_ops<CHIP>;
2587 struct tu_cs *cs = &cmd->cs;
2588
2589 if (dst_image->layout[0].nr_samples > 1)
2590 ops = &r3d_ops<CHIP>;
2591
2592 enum pipe_format format = PIPE_FORMAT_NONE;
2593 VkOffset3D src_offset = info->srcOffset;
2594 VkOffset3D dst_offset = info->dstOffset;
2595 VkExtent3D extent = info->extent;
2596 uint32_t layers_to_copy = MAX2(info->extent.depth,
2597 vk_image_subresource_layer_count(&src_image->vk,
2598 &info->srcSubresource));
2599
2600 /* From the Vulkan 1.2.140 spec, section 19.3 "Copying Data Between
2601 * Images":
2602 *
2603 * When copying between compressed and uncompressed formats the extent
2604 * members represent the texel dimensions of the source image and not
2605 * the destination. When copying from a compressed image to an
2606 * uncompressed image the image texel dimensions written to the
2607 * uncompressed image will be source extent divided by the compressed
2608 * texel block dimensions. When copying from an uncompressed image to a
2609 * compressed image the image texel dimensions written to the compressed
2610 * image will be the source extent multiplied by the compressed texel
2611 * block dimensions.
2612 *
2613 * This means we only have to adjust the extent if the source image is
2614 * compressed.
2615 */
2616 copy_compressed(src_image->vk.format, &src_offset, &extent, NULL, NULL);
2617 copy_compressed(dst_image->vk.format, &dst_offset, NULL, NULL, NULL);
2618
2619 enum pipe_format dst_format = copy_format(dst_image->vk.format, info->dstSubresource.aspectMask);
2620 enum pipe_format src_format = copy_format(src_image->vk.format, info->srcSubresource.aspectMask);
2621
2622 /* note: could use "R8_UNORM" when no UBWC */
2623 unsigned blit_param = 0;
2624 if (dst_format == PIPE_FORMAT_Y8_UNORM ||
2625 src_format == PIPE_FORMAT_Y8_UNORM ||
2626 tu_pipe_format_is_float16(src_format) ||
2627 tu_pipe_format_is_float16(dst_format)) {
2628 ops = &r3d_ops<CHIP>;
2629 blit_param = R3D_COPY;
2630 }
2631
2632 bool use_staging_blit = false;
2633
2634 if (src_format == dst_format) {
2635 /* Images that share a format can always be copied directly because it's
2636 * the same as a blit.
2637 */
2638 format = src_format;
2639 } else if (!src_image->layout[0].tile_mode) {
2640 /* If an image is linear, we can always safely reinterpret it with the
2641 * other image's format and then do a regular blit.
2642 */
2643 format = dst_format;
2644 } else if (!dst_image->layout[0].tile_mode) {
2645 format = src_format;
2646 } else if (image_is_r8g8(src_image) != image_is_r8g8(dst_image)) {
2647 /* We can't currently copy r8g8 images to/from other cpp=2 images,
2648 * due to the different tile layout.
2649 */
2650 use_staging_blit = true;
2651 } else if (is_swapped_format<CHIP>(src_format) ||
2652 is_swapped_format<CHIP>(dst_format)) {
2653 /* If either format has a non-identity swap, then we can't copy
2654 * to/from it.
2655 */
2656 use_staging_blit = true;
2657 } else if (!src_image->layout[0].ubwc) {
2658 format = dst_format;
2659 } else if (!dst_image->layout[0].ubwc) {
2660 format = src_format;
2661 } else {
2662 /* Both formats use UBWC and so neither can be reinterpreted.
2663 * TODO: We could do an in-place decompression of the dst instead.
2664 */
2665 perf_debug(cmd->device, "TODO: Do in-place UBWC decompression for UBWC->UBWC blits");
2666 use_staging_blit = true;
2667 }
2668
2669 struct fdl6_view dst, src;
2670
2671 if (use_staging_blit) {
2672 tu_image_view_copy<CHIP>(&dst, dst_image, dst_format, &info->dstSubresource, dst_offset.z);
2673 tu_image_view_copy<CHIP>(&src, src_image, src_format, &info->srcSubresource, src_offset.z);
2674
2675 struct fdl_layout staging_layout = { 0 };
2676 VkOffset3D staging_offset = { 0 };
2677
2678 staging_layout.tile_mode = TILE6_LINEAR;
2679 staging_layout.ubwc = false;
2680
2681 uint32_t layer_count =
2682 vk_image_subresource_layer_count(&src_image->vk,
2683 &info->srcSubresource);
2684 fdl6_layout(&staging_layout,
2685 src_format,
2686 src_image->layout[0].nr_samples,
2687 extent.width,
2688 extent.height,
2689 extent.depth,
2690 1,
2691 layer_count,
2692 extent.depth > 1,
2693 NULL);
2694
2695 struct tu_bo *staging_bo;
2696 VkResult result = tu_get_scratch_bo(cmd->device,
2697 staging_layout.size,
2698 &staging_bo);
2699 if (result != VK_SUCCESS) {
2700 vk_command_buffer_set_error(&cmd->vk, result);
2701 return;
2702 }
2703
2704 struct fdl6_view staging;
2705 const struct fdl_layout *staging_layout_ptr = &staging_layout;
2706 const struct fdl_view_args copy_to_args = {
2707 .chip = CHIP,
2708 .iova = staging_bo->iova,
2709 .base_miplevel = 0,
2710 .level_count = 1,
2711 .base_array_layer = 0,
2712 .layer_count = layer_count,
2713 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2714 .format = tu_format_for_aspect(src_format, VK_IMAGE_ASPECT_COLOR_BIT),
2715 .type = FDL_VIEW_TYPE_2D,
2716 .ubwc_fc_mutable = false,
2717 };
2718 fdl6_view_init(&staging, &staging_layout_ptr, &copy_to_args, false);
2719
2720 ops->setup(cmd, cs, src_format, src_format, VK_IMAGE_ASPECT_COLOR_BIT, blit_param, false, false,
2721 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2722 coords(ops, cmd, cs, staging_offset, src_offset, extent);
2723
2724 for (uint32_t i = 0; i < layers_to_copy; i++) {
2725 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, src_format);
2726 ops->dst(cs, &staging, i, src_format);
2727 ops->run(cmd, cs);
2728 }
2729
2730 /* If the user performed this copy, a pipeline barrier would be required
2731 * here; since we're doing it manually, we have to flush ourselves.
2732 */
2733 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
2734 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
2735 tu_cs_emit_wfi(cs);
2736
2737 const struct fdl_view_args copy_from_args = {
2738 .chip = CHIP,
2739 .iova = staging_bo->iova,
2740 .base_miplevel = 0,
2741 .level_count = 1,
2742 .base_array_layer = 0,
2743 .layer_count = layer_count,
2744 .swiz = { PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y, PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W },
2745 .format = tu_format_for_aspect(dst_format, VK_IMAGE_ASPECT_COLOR_BIT),
2746 .type = FDL_VIEW_TYPE_2D,
2747 .ubwc_fc_mutable = false,
2748 };
2749 fdl6_view_init(&staging, &staging_layout_ptr, &copy_from_args, false);
2750
2751 ops->setup(cmd, cs, dst_format, dst_format, info->dstSubresource.aspectMask,
2752 blit_param, false, dst_image->layout[0].ubwc,
2753 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2754 coords(ops, cmd, cs, dst_offset, staging_offset, extent);
2755
2756 for (uint32_t i = 0; i < layers_to_copy; i++) {
2757 ops->src(cmd, cs, &staging, i, VK_FILTER_NEAREST, dst_format);
2758 ops->dst(cs, &dst, i, dst_format);
2759 ops->run(cmd, cs);
2760 }
2761 } else {
2762 tu_image_view_copy<CHIP>(&dst, dst_image, format, &info->dstSubresource, dst_offset.z);
2763 tu_image_view_copy<CHIP>(&src, src_image, format, &info->srcSubresource, src_offset.z);
2764
2765 ops->setup(cmd, cs, format, format, info->dstSubresource.aspectMask,
2766 blit_param, false, dst_image->layout[0].ubwc,
2767 (VkSampleCountFlagBits) dst_image->layout[0].nr_samples);
2768 coords(ops, cmd, cs, dst_offset, src_offset, extent);
2769
2770 for (uint32_t i = 0; i < layers_to_copy; i++) {
2771 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, format);
2772 ops->dst(cs, &dst, i, format);
2773 ops->run(cmd, cs);
2774 }
2775 }
2776
2777 ops->teardown(cmd, cs);
2778 }
2779
2780 template <chip CHIP>
2781 VKAPI_ATTR void VKAPI_CALL
2782 tu_CmdCopyImage2(VkCommandBuffer commandBuffer,
2783 const VkCopyImageInfo2 *pCopyImageInfo)
2784 {
2785 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2786 VK_FROM_HANDLE(tu_image, src_image, pCopyImageInfo->srcImage);
2787 VK_FROM_HANDLE(tu_image, dst_image, pCopyImageInfo->dstImage);
2788
2789 for (uint32_t i = 0; i < pCopyImageInfo->regionCount; ++i) {
2790 if (src_image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
2791 VkImageCopy2 info = pCopyImageInfo->pRegions[i];
2792 u_foreach_bit(b, info.dstSubresource.aspectMask) {
2793 info.srcSubresource.aspectMask = BIT(b);
2794 info.dstSubresource.aspectMask = BIT(b);
2795 tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image, &info);
2796 }
2797 continue;
2798 }
2799
2800 tu_copy_image_to_image<CHIP>(cmd, src_image, dst_image,
2801 pCopyImageInfo->pRegions + i);
2802 }
2803
2804 if (dst_image->lrz_height) {
2805 tu_disable_lrz<CHIP>(cmd, &cmd->cs, dst_image);
2806 }
2807 }
2808 TU_GENX(tu_CmdCopyImage2);
2809
2810 template <chip CHIP>
2811 static void
2812 copy_buffer(struct tu_cmd_buffer *cmd,
2813 uint64_t dst_va,
2814 uint64_t src_va,
2815 uint64_t size,
2816 uint32_t block_size,
2817 bool *unaligned_store)
2818 {
2819 const struct blit_ops *ops = &r2d_ops<CHIP>;
2820 struct tu_cs *cs = &cmd->cs;
2821 enum pipe_format format = block_size == 4 ? PIPE_FORMAT_R32_UINT : PIPE_FORMAT_R8_UNORM;
2822 uint64_t blocks = size / block_size;
2823
2824 handle_buffer_unaligned_store<CHIP>(cmd, dst_va, size, unaligned_store);
2825
2826 ops->setup(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT, 0, false, false,
2827 VK_SAMPLE_COUNT_1_BIT);
2828
2829 while (blocks) {
2830 uint32_t src_x = (src_va & 63) / block_size;
2831 uint32_t dst_x = (dst_va & 63) / block_size;
2832 uint32_t width = MIN2(MIN2(blocks, 0x4000 - src_x), 0x4000 - dst_x);
2833
2834 ops->src_buffer(cmd, cs, format, src_va & ~63, 0, src_x + width, 1, format);
2835 ops->dst_buffer( cs, format, dst_va & ~63, 0, format);
2836 ops->coords(cmd, cs, (VkOffset2D) {dst_x}, (VkOffset2D) {src_x}, (VkExtent2D) {width, 1});
2837 ops->run(cmd, cs);
2838
2839 src_va += width * block_size;
2840 dst_va += width * block_size;
2841 blocks -= width;
2842 }
2843
2844 ops->teardown(cmd, cs);
2845 }
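/* Note on the loop above: 0x4000 (16384) appears to be the maximum 2D blit
 * width (the window scissor in r3d_setup() is likewise capped at 0x3fff),
 * so each iteration copies a single 1-texel-high row of up to 16384 blocks,
 * with src_x/dst_x absorbing the sub-64-byte misalignment of each address.
 */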
2846
2847 template <chip CHIP>
2848 VKAPI_ATTR void VKAPI_CALL
2849 tu_CmdCopyBuffer2(VkCommandBuffer commandBuffer,
2850 const VkCopyBufferInfo2 *pCopyBufferInfo)
2851 {
2852 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2853 VK_FROM_HANDLE(tu_buffer, src_buffer, pCopyBufferInfo->srcBuffer);
2854 VK_FROM_HANDLE(tu_buffer, dst_buffer, pCopyBufferInfo->dstBuffer);
2855
2856 bool unaligned_store = false;
2857 for (unsigned i = 0; i < pCopyBufferInfo->regionCount; ++i) {
2858 const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[i];
2859 copy_buffer<CHIP>(cmd,
2860 dst_buffer->iova + region->dstOffset,
2861 src_buffer->iova + region->srcOffset,
2862 region->size, 1, &unaligned_store);
2863 }
2864
2865 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2866 }
2867 TU_GENX(tu_CmdCopyBuffer2);
2868
2869 template <chip CHIP>
2870 VKAPI_ATTR void VKAPI_CALL
2871 tu_CmdUpdateBuffer(VkCommandBuffer commandBuffer,
2872 VkBuffer dstBuffer,
2873 VkDeviceSize dstOffset,
2874 VkDeviceSize dataSize,
2875 const void *pData)
2876 {
2877 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2878 VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2879
2880 struct tu_cs_memory tmp;
2881 VkResult result = tu_cs_alloc(&cmd->sub_cs, DIV_ROUND_UP(dataSize, 64), 64 / 4, &tmp);
2882 if (result != VK_SUCCESS) {
2883 vk_command_buffer_set_error(&cmd->vk, result);
2884 return;
2885 }
2886
2887 bool unaligned_store = false;
2888 memcpy(tmp.map, pData, dataSize);
2889 copy_buffer<CHIP>(cmd, buffer->iova + dstOffset, tmp.iova, dataSize, 4, &unaligned_store);
2890
2891 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2892 }
2893 TU_GENX(tu_CmdUpdateBuffer);
2894
2895 template <chip CHIP>
2896 VKAPI_ATTR void VKAPI_CALL
2897 tu_CmdFillBuffer(VkCommandBuffer commandBuffer,
2898 VkBuffer dstBuffer,
2899 VkDeviceSize dstOffset,
2900 VkDeviceSize fillSize,
2901 uint32_t data)
2902 {
2903 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2904 VK_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
2905 const struct blit_ops *ops = &r2d_ops<CHIP>;
2906 struct tu_cs *cs = &cmd->cs;
2907
2908 fillSize = vk_buffer_range(&buffer->vk, dstOffset, fillSize);
2909
2910 uint64_t dst_va = buffer->iova + dstOffset;
2911 uint32_t blocks = fillSize / 4;
2912
2913 bool unaligned_store = false;
2914 handle_buffer_unaligned_store<CHIP>(cmd, dst_va, fillSize, &unaligned_store);
2915
2916 ops->setup(cmd, cs, PIPE_FORMAT_R32_UINT, PIPE_FORMAT_R32_UINT,
2917 VK_IMAGE_ASPECT_COLOR_BIT, 0, true, false,
2918 VK_SAMPLE_COUNT_1_BIT);
2919
2920 VkClearValue clear_val = {};
2921 clear_val.color.uint32[0] = data;
2922 ops->clear_value(cmd, cs, PIPE_FORMAT_R32_UINT, &clear_val);
2923
2924 while (blocks) {
2925 uint32_t dst_x = (dst_va & 63) / 4;
2926 uint32_t width = MIN2(blocks, 0x4000 - dst_x);
2927
2928 ops->dst_buffer(cs, PIPE_FORMAT_R32_UINT, dst_va & ~63, 0, PIPE_FORMAT_R32_UINT);
2929 ops->coords(cmd, cs, (VkOffset2D) {dst_x}, blt_no_coord, (VkExtent2D) {width, 1});
2930 ops->run(cmd, cs);
2931
2932 dst_va += width * 4;
2933 blocks -= width;
2934 }
2935
2936 ops->teardown(cmd, cs);
2937
2938 after_buffer_unaligned_buffer_store<CHIP>(cmd, unaligned_store);
2939 }
2940 TU_GENX(tu_CmdFillBuffer);
2941
2942 template <chip CHIP>
2943 VKAPI_ATTR void VKAPI_CALL
2944 tu_CmdResolveImage2(VkCommandBuffer commandBuffer,
2945 const VkResolveImageInfo2 *pResolveImageInfo)
2946 {
2947 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
2948 VK_FROM_HANDLE(tu_image, src_image, pResolveImageInfo->srcImage);
2949 VK_FROM_HANDLE(tu_image, dst_image, pResolveImageInfo->dstImage);
2950 const struct blit_ops *ops = &r2d_ops<CHIP>;
2951 struct tu_cs *cs = &cmd->cs;
2952
2953 enum pipe_format src_format =
2954 vk_format_to_pipe_format(src_image->vk.format);
2955 enum pipe_format dst_format =
2956 vk_format_to_pipe_format(dst_image->vk.format);
2957 ops->setup(cmd, cs, src_format, dst_format,
2958 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst_image->layout[0].ubwc,
2959 VK_SAMPLE_COUNT_1_BIT);
2960
2961 for (uint32_t i = 0; i < pResolveImageInfo->regionCount; ++i) {
2962 const VkImageResolve2 *info = &pResolveImageInfo->pRegions[i];
2963 uint32_t layers = MAX2(info->extent.depth,
2964 vk_image_subresource_layer_count(&dst_image->vk,
2965 &info->dstSubresource));
2966
2967 /* TODO: aspect masks possible ? */
2968
2969 coords(ops, cmd, cs, info->dstOffset, info->srcOffset, info->extent);
2970
2971 struct fdl6_view dst, src;
2972 tu_image_view_blit<CHIP>(&dst, dst_image, &info->dstSubresource, info->dstOffset.z);
2973 tu_image_view_blit<CHIP>(&src, src_image, &info->srcSubresource, info->srcOffset.z);
2974
2975 for (uint32_t i = 0; i < layers; i++) {
2976 ops->src(cmd, cs, &src, i, VK_FILTER_NEAREST, dst_format);
2977 ops->dst(cs, &dst, i, src_format);
2978 ops->run(cmd, cs);
2979 }
2980 }
2981
2982 ops->teardown(cmd, cs);
2983 }
2984 TU_GENX(tu_CmdResolveImage2);
2985
2986 #define for_each_layer(layer, layer_mask, layers) \
2987 for (uint32_t layer = 0; \
2988 layer < ((layer_mask) ? (util_logbase2(layer_mask) + 1) : layers); \
2989 layer++) \
2990 if (!layer_mask || (layer_mask & BIT(layer)))
2991
2992 template <chip CHIP>
2993 static void
2994 resolve_sysmem(struct tu_cmd_buffer *cmd,
2995 struct tu_cs *cs,
2996 VkFormat vk_src_format,
2997 VkFormat vk_dst_format,
2998 const struct tu_image_view *src,
2999 const struct tu_image_view *dst,
3000 uint32_t layer_mask,
3001 uint32_t layers,
3002 const VkRect2D *rect,
3003 bool src_separate_ds,
3004 bool dst_separate_ds)
3005 {
3006 const struct blit_ops *ops = &r2d_ops<CHIP>;
3007
3008 trace_start_sysmem_resolve(&cmd->trace, cs, vk_dst_format);
3009
3010 enum pipe_format src_format = vk_format_to_pipe_format(vk_src_format);
3011 enum pipe_format dst_format = vk_format_to_pipe_format(vk_dst_format);
3012
3013 ops->setup(cmd, cs, src_format, dst_format,
3014 VK_IMAGE_ASPECT_COLOR_BIT, 0, false, dst->view.ubwc_enabled,
3015 VK_SAMPLE_COUNT_1_BIT);
3016 ops->coords(cmd, cs, rect->offset, rect->offset, rect->extent);
3017
3018 for_each_layer(i, layer_mask, layers) {
3019 if (src_separate_ds) {
3020 if (vk_src_format == VK_FORMAT_D32_SFLOAT || vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3021 r2d_src_depth<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3022 } else {
3023 r2d_src_stencil<CHIP>(cmd, cs, src, i, VK_FILTER_NEAREST);
3024 }
3025 } else {
3026 ops->src(cmd, cs, &src->view, i, VK_FILTER_NEAREST, dst_format);
3027 }
3028
3029 if (dst_separate_ds) {
3030 if (vk_dst_format == VK_FORMAT_D32_SFLOAT) {
3031 ops->dst_depth(cs, dst, i);
3032 } else {
3033 ops->dst_stencil(cs, dst, i);
3034 }
3035 } else {
3036 ops->dst(cs, &dst->view, i, src_format);
3037 }
3038
3039 ops->run(cmd, cs);
3040 }
3041
3042 ops->teardown(cmd, cs);
3043
3044 trace_end_sysmem_resolve(&cmd->trace, cs);
3045 }
3046
3047 template <chip CHIP>
3048 void
3049 tu_resolve_sysmem(struct tu_cmd_buffer *cmd,
3050 struct tu_cs *cs,
3051 const struct tu_image_view *src,
3052 const struct tu_image_view *dst,
3053 uint32_t layer_mask,
3054 uint32_t layers,
3055 const VkRect2D *rect)
3056 {
3057 assert(src->image->vk.format == dst->image->vk.format ||
3058 (vk_format_is_depth_or_stencil(src->image->vk.format) &&
3059 vk_format_is_depth_or_stencil(dst->image->vk.format)));
3060
3061 bool src_separate_ds = src->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3062 bool dst_separate_ds = dst->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT;
3063
3064 if (dst_separate_ds) {
3065 resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_FORMAT_D32_SFLOAT,
3066 src, dst, layer_mask, layers, rect,
3067 src_separate_ds, dst_separate_ds);
3068 resolve_sysmem<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_FORMAT_S8_UINT,
3069 src, dst, layer_mask, layers, rect,
3070 src_separate_ds, dst_separate_ds);
3071 } else {
3072 resolve_sysmem<CHIP>(cmd, cs, src->image->vk.format, dst->image->vk.format,
3073 src, dst, layer_mask, layers, rect,
3074 src_separate_ds, dst_separate_ds);
3075 }
3076 }
3077 TU_GENX(tu_resolve_sysmem);
3078
3079 template <chip CHIP>
3080 static void
3081 clear_image_cp_blit(struct tu_cmd_buffer *cmd,
3082 struct tu_image *image,
3083 const VkClearValue *clear_value,
3084 const VkImageSubresourceRange *range,
3085 VkImageAspectFlags aspect_mask)
3086 {
3087 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3088 uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3089 struct tu_cs *cs = &cmd->cs;
3090 enum pipe_format format;
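/* E5B9G9R9 is cleared through a bit-identical R32_UINT view; the clear
 * value itself is packed as rgb9e5 via the clear_value() call below.
 */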
3091 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
3092 format = PIPE_FORMAT_R32_UINT;
3093 } else {
3094 format = tu6_plane_format(image->vk.format,
3095 tu6_plane_index(image->vk.format,
3096 aspect_mask));
3097 }
3098
3099 if (image->layout[0].depth0 > 1) {
3100 assert(layer_count == 1);
3101 assert(range->baseArrayLayer == 0);
3102 }
3103
3104 const struct blit_ops *ops = image->layout[0].nr_samples > 1 ? &r3d_ops<CHIP> : &r2d_ops<CHIP>;
3105
3106 ops->setup(cmd, cs, format, format, aspect_mask, 0, true, image->layout[0].ubwc,
3107 (VkSampleCountFlagBits) image->layout[0].nr_samples);
3108 if (image->vk.format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32)
3109 ops->clear_value(cmd, cs, PIPE_FORMAT_R9G9B9E5_FLOAT, clear_value);
3110 else
3111 ops->clear_value(cmd, cs, format, clear_value);
3112
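/* For 3D images each mip level has its own depth, so the per-level "layer"
 * count below is the minified depth rather than an array layer count.
 */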
3113 for (unsigned j = 0; j < level_count; j++) {
3114 if (image->layout[0].depth0 > 1)
3115 layer_count = u_minify(image->layout[0].depth0, range->baseMipLevel + j);
3116
3117 ops->coords(cmd, cs, (VkOffset2D) {}, blt_no_coord, (VkExtent2D) {
3118 u_minify(image->layout[0].width0, range->baseMipLevel + j),
3119 u_minify(image->layout[0].height0, range->baseMipLevel + j)
3120 });
3121
3122 struct fdl6_view dst;
3123 const VkImageSubresourceLayers subresource = {
3124 .aspectMask = aspect_mask,
3125 .mipLevel = range->baseMipLevel + j,
3126 .baseArrayLayer = range->baseArrayLayer,
3127 .layerCount = 1,
3128 };
3129 tu_image_view_copy_blit<CHIP>(&dst, image, format, &subresource, 0, false);
3130
3131 for (uint32_t i = 0; i < layer_count; i++) {
3132 ops->dst(cs, &dst, i, format);
3133 ops->run(cmd, cs);
3134 }
3135 }
3136
3137 ops->teardown(cmd, cs);
3138 }
3139
3140 static void
3141 clear_image_event_blit(struct tu_cmd_buffer *cmd,
3142 struct tu_image *image,
3143 const VkClearValue *clear_value,
3144 const VkImageSubresourceRange *range,
3145 VkImageAspectFlags aspect_mask)
3146 {
3147 uint32_t level_count = vk_image_subresource_level_count(&image->vk, range);
3148 uint32_t layer_count = vk_image_subresource_layer_count(&image->vk, range);
3149 VkFormat vk_format = image->vk.format;
3150 if (vk_format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3151 if (aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT)
3152 vk_format = VK_FORMAT_S8_UINT;
3153 else
3154 vk_format = VK_FORMAT_D32_SFLOAT;
3155 }
3156
3157 enum pipe_format format = vk_format_to_pipe_format(vk_format);
3158
3159 if (image->layout[0].depth0 > 1) {
3160 assert(layer_count == 1);
3161 assert(range->baseArrayLayer == 0);
3162 }
3163
3164 struct tu_cs *cs = &cmd->cs;
3165
3166 tu_cs_emit_regs(cs,
3167 A7XX_RB_BLIT_CLEAR_MODE(.clear_mode = CLEAR_MODE_SYSMEM));
3168
3169 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
3170 tu_cs_emit(cs, 0);
3171
3172 tu_cs_emit_regs(
3173 cs, A6XX_RB_BLIT_INFO(
3174 .type = BLIT_EVENT_CLEAR,
3175 .sample_0 = vk_format_is_int(vk_format) ||
3176 vk_format_is_depth_or_stencil(vk_format),
3177 .depth = vk_format_is_depth_or_stencil(vk_format),
3178 .clear_mask = aspect_write_mask_generic_clear(format, aspect_mask)));
3179
3180 uint32_t clear_vals[4] = {};
3181 pack_blit_event_clear_value(clear_value, format, clear_vals);
3182 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
3183 tu_cs_emit_array(cs, clear_vals, 4);
3184
3185 for (unsigned level = 0; level < level_count; level++) {
3186 if (image->layout[0].depth0 > 1)
3187 layer_count =
3188 u_minify(image->layout[0].depth0, range->baseMipLevel + level);
3189
3190 uint32_t width =
3191 u_minify(image->layout[0].width0, range->baseMipLevel + level);
3192 uint32_t height =
3193 u_minify(image->layout[0].height0, range->baseMipLevel + level);
3194 tu_cs_emit_regs(
3195 cs, A6XX_RB_BLIT_SCISSOR_TL(.x = 0, .y = 0),
3196 A6XX_RB_BLIT_SCISSOR_BR(.x = width - 1, .y = height - 1));
3197
3198 struct fdl6_view dst;
3199 const VkImageSubresourceLayers subresource = {
3200 .aspectMask = aspect_mask,
3201 .mipLevel = range->baseMipLevel + level,
3202 .baseArrayLayer = range->baseArrayLayer,
3203 .layerCount = 1,
3204 };
3205 tu_image_view_copy_blit<A7XX>(&dst, image, format, &subresource, 0, false);
3206
3207 for (uint32_t layer = 0; layer < layer_count; layer++) {
3209 struct event_blit_dst_view blt_view = {
3210 .image = image,
3211 .view = &dst,
3212 .layer = layer,
3213 };
3214
3215 if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3216 uint32_t real_level = range->baseMipLevel + level;
3217 uint32_t real_layer = range->baseArrayLayer + layer;
3218 if (aspect_mask == VK_IMAGE_ASPECT_DEPTH_BIT) {
3219 struct fdl_layout *layout = &image->layout[0];
3220 blt_view.depth_addr =
3221 image->iova +
3222 fdl_surface_offset(layout, real_level, real_layer);
3223 blt_view.depth_pitch = fdl_pitch(layout, real_level);
3224 } else {
3225 struct fdl_layout *layout = &image->layout[1];
3226 blt_view.stencil_addr =
3227 image->iova +
3228 fdl_surface_offset(layout, real_level, real_layer);
3229 blt_view.stencil_pitch = fdl_pitch(layout, real_level);
3230 }
3231 }
3232
3233 event_blit_run<A7XX>(cmd, cs, NULL, &blt_view,
3234 aspect_mask == VK_IMAGE_ASPECT_STENCIL_BIT);
3235 }
3236 }
3237 }
3238
3239 static bool
3240 use_generic_clear_for_image_clear(struct tu_cmd_buffer *cmd,
3241 struct tu_image *image)
3242 {
3243 return cmd->device->physical_device->info->a7xx.has_generic_clear &&
3244 /* A7XX hardware supports R9G9B9E5_FLOAT as a color attachment and
3245 * supports generic clears for it, but the driver does not yet allow
3246 * such attachments. A7XX TODO: allow R9G9B9E5_FLOAT attachments.
3247 */
3248 image->vk.format != VK_FORMAT_E5B9G9R9_UFLOAT_PACK32;
3249 }
3250
3251 template <chip CHIP>
3252 static void
3253 clear_image(struct tu_cmd_buffer *cmd,
3254 struct tu_image *image,
3255 const VkClearValue *clear_value,
3256 const VkImageSubresourceRange *range,
3257 VkImageAspectFlags aspect_mask)
3258 {
3259 if (use_generic_clear_for_image_clear(cmd, image)) {
3260 clear_image_event_blit(cmd, image, clear_value, range, aspect_mask);
3261 } else {
3262 clear_image_cp_blit<CHIP>(cmd, image, clear_value, range, aspect_mask);
3263 }
3264 }
3265
3266 template <chip CHIP>
3267 VKAPI_ATTR void VKAPI_CALL
3268 tu_CmdClearColorImage(VkCommandBuffer commandBuffer,
3269 VkImage image_h,
3270 VkImageLayout imageLayout,
3271 const VkClearColorValue *pColor,
3272 uint32_t rangeCount,
3273 const VkImageSubresourceRange *pRanges)
3274 {
3275 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3276 VK_FROM_HANDLE(tu_image, image, image_h);
3277
3278 if (use_generic_clear_for_image_clear(cmd, image)) {
3279 /* Generic clear doesn't go through CCU (or other caches). */
3280 cmd->state.cache.flush_bits |=
3281 TU_CMD_FLAG_CCU_INVALIDATE_COLOR | TU_CMD_FLAG_WAIT_FOR_IDLE;
3282 tu_emit_cache_flush<CHIP>(cmd);
3283 }
3284
3285 for (unsigned i = 0; i < rangeCount; i++) {
3286 clear_image<CHIP>(cmd, image, (const VkClearValue*) pColor, pRanges + i, VK_IMAGE_ASPECT_COLOR_BIT);
3287 }
3288 }
3289 TU_GENX(tu_CmdClearColorImage);
3290
3291 template <chip CHIP>
3292 VKAPI_ATTR void VKAPI_CALL
3293 tu_CmdClearDepthStencilImage(VkCommandBuffer commandBuffer,
3294 VkImage image_h,
3295 VkImageLayout imageLayout,
3296 const VkClearDepthStencilValue *pDepthStencil,
3297 uint32_t rangeCount,
3298 const VkImageSubresourceRange *pRanges)
3299 {
3300 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3301 VK_FROM_HANDLE(tu_image, image, image_h);
3302
3303 if (use_generic_clear_for_image_clear(cmd, image)) {
3304 /* Generic clear doesn't go through CCU (or other caches). */
3305 cmd->state.cache.flush_bits |= TU_CMD_FLAG_CCU_INVALIDATE_COLOR |
3306 TU_CMD_FLAG_CCU_INVALIDATE_DEPTH |
3307 TU_CMD_FLAG_WAIT_FOR_IDLE;
3308 tu_emit_cache_flush<CHIP>(cmd);
3309 }
3310
3311 for (unsigned i = 0; i < rangeCount; i++) {
3312 const VkImageSubresourceRange *range = &pRanges[i];
3313
3314 if (image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3315 /* can't clear both depth and stencil at once, split up the aspect mask */
3316 u_foreach_bit(b, range->aspectMask)
3317 clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, BIT(b));
3318 continue;
3319 }
3320
3321 clear_image<CHIP>(cmd, image, (const VkClearValue*) pDepthStencil, range, range->aspectMask);
3322 }
3323
3324 tu_lrz_clear_depth_image<CHIP>(cmd, image, pDepthStencil, rangeCount, pRanges);
3325 }
3326 TU_GENX(tu_CmdClearDepthStencilImage);
3327
3328 template <chip CHIP>
3329 static void
3330 tu_clear_sysmem_attachments(struct tu_cmd_buffer *cmd,
3331 uint32_t attachment_count,
3332 const VkClearAttachment *attachments,
3333 uint32_t rect_count,
3334 const VkClearRect *rects)
3335 {
3336 /* The shader path here is special: it avoids changing MRT/etc. state */
3337 const struct tu_subpass *subpass = cmd->state.subpass;
3338 const uint32_t mrt_count = subpass->color_count;
3339 struct tu_cs *cs = &cmd->draw_cs;
3340 uint32_t clear_value[MAX_RTS][4];
3341 float z_clear_val = 0.0f;
3342 uint8_t s_clear_val = 0;
3343 uint32_t clear_rts = 0, clear_components = 0;
3344 bool z_clear = false;
3345 bool s_clear = false;
3346
3347 trace_start_sysmem_clear_all(&cmd->trace, cs, mrt_count, rect_count);
3348
3349 for (uint32_t i = 0; i < attachment_count; i++) {
3350 uint32_t a;
3351 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3352 uint32_t c = attachments[i].colorAttachment;
3353 a = subpass->color_attachments[c].attachment;
3354 if (a == VK_ATTACHMENT_UNUSED)
3355 continue;
3356
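/* Track which MRTs to clear; each RT also gets a 4-bit component-write
 * nibble that feeds SP_FS/RB_RENDER_COMPONENTS below.
 */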
3357 clear_rts |= 1 << c;
3358 clear_components |= 0xf << (c * 4);
3359 memcpy(clear_value[c], &attachments[i].clearValue, 4 * sizeof(uint32_t));
3360 } else {
3361 a = subpass->depth_stencil_attachment.attachment;
3362 if (a == VK_ATTACHMENT_UNUSED)
3363 continue;
3364
3365 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3366 z_clear = true;
3367 z_clear_val = attachments[i].clearValue.depthStencil.depth;
3368 }
3369
3370 if (attachments[i].aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3371 s_clear = true;
3372 s_clear_val = attachments[i].clearValue.depthStencil.stencil & 0xff;
3373 }
3374 }
3375 }
3376
3377 /* We may not know the multisample count if there are no attachments, so
3378 * just bail early to avoid corner cases later.
3379 */
3380 if (clear_rts == 0 && !z_clear && !s_clear)
3381 return;
3382
3383 /* Disable all draw states so they don't interfere.
3384 * TODO: use and re-use draw states.
3385 * We have to disable draw states individually to preserve
3386 * input attachment states, because a secondary command buffer
3387 * won't be able to restore them.
3388 */
3389 tu_cs_emit_pkt7(cs, CP_SET_DRAW_STATE, 3 * (TU_DRAW_STATE_COUNT - 2));
3390 for (uint32_t i = 0; i < TU_DRAW_STATE_COUNT; i++) {
3391 if (i == TU_DRAW_STATE_INPUT_ATTACHMENTS_GMEM ||
3392 i == TU_DRAW_STATE_INPUT_ATTACHMENTS_SYSMEM)
3393 continue;
3394 tu_cs_emit(cs, CP_SET_DRAW_STATE__0_GROUP_ID(i) |
3395 CP_SET_DRAW_STATE__0_DISABLE);
3396 tu_cs_emit_qw(cs, 0);
3397 }
3398 cmd->state.dirty |= TU_CMD_DIRTY_DRAW_STATE;
3399
3400 tu_cs_emit_pkt4(cs, REG_A6XX_SP_FS_OUTPUT_CNTL0, 2);
3401 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(0xfc) |
3402 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(0xfc) |
3403 0xfc000000);
3404 tu_cs_emit(cs, A6XX_SP_FS_OUTPUT_CNTL1_MRT(mrt_count));
3405
3406 r3d_common<CHIP>(cmd, cs, R3D_CLEAR, clear_rts, false, cmd->state.subpass->samples);
3407
3408 /* Disable sample counting in order to not affect occlusion query. */
3409 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = true));
3410
3411 if (cmd->state.prim_generated_query_running_before_rp) {
3412 tu_emit_event_write<CHIP>(cmd, cs, FD_STOP_PRIMITIVE_CTRS);
3413 }
3414
3415 tu_cs_emit_regs(cs,
3416 A6XX_SP_FS_RENDER_COMPONENTS(.dword = clear_components));
3417 tu_cs_emit_regs(cs,
3418 A6XX_RB_RENDER_COMPONENTS(.dword = clear_components));
3419
3420 tu_cs_emit_regs(cs,
3421 A6XX_RB_FS_OUTPUT_CNTL1(.mrt = mrt_count));
3422
3423 tu_cs_emit_regs(cs, A6XX_SP_BLEND_CNTL());
3424 tu_cs_emit_regs(cs, A6XX_RB_BLEND_CNTL(.independent_blend = 1, .sample_mask = 0xffff));
3425 for (uint32_t i = 0; i < mrt_count; i++) {
3426 tu_cs_emit_regs(cs, A6XX_RB_MRT_CONTROL(i,
3427 .component_enable = COND(clear_rts & (1 << i), 0xf)));
3428 }
3429
3430 tu_cs_emit_regs(cs, A6XX_GRAS_LRZ_CNTL(0));
3431 tu_cs_emit_regs(cs, A6XX_RB_LRZ_CNTL(0));
3432
3433 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_PLANE_CNTL());
3434 tu_cs_emit_regs(cs, A6XX_RB_DEPTH_CNTL(
3435 .z_test_enable = z_clear,
3436 .z_write_enable = z_clear,
3437 .zfunc = FUNC_ALWAYS));
3438 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_CNTL(z_clear));
3439 tu_cs_emit_regs(cs, A6XX_GRAS_SU_DEPTH_PLANE_CNTL());
3440 tu_cs_emit_regs(cs, A6XX_RB_STENCIL_CONTROL(
3441 .stencil_enable = s_clear,
3442 .func = FUNC_ALWAYS,
3443 .zpass = STENCIL_REPLACE));
3444 tu_cs_emit_regs(cs, A6XX_GRAS_SU_STENCIL_CNTL(s_clear));
3445 tu_cs_emit_regs(cs, A6XX_RB_STENCILMASK(.mask = 0xff));
3446 tu_cs_emit_regs(cs, A6XX_RB_STENCILWRMASK(.wrmask = 0xff));
3447 tu_cs_emit_regs(cs, A6XX_RB_STENCILREF(.ref = s_clear_val));
3448
3449 tu_cs_emit_regs(cs, A6XX_GRAS_SC_CNTL(.ccusinglecachelinesize = 2));
3450
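/* The FS clear-color consts are consumed tightly packed in ascending RT
 * order, so compact the sparse clear_value[] array before uploading it via
 * tu6_emit_blit_consts_load(), e.g. clear_rts = 0b101 packs clear_value[0]
 * and clear_value[2] into packed_clear_value[0..1].
 */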
3451 unsigned num_rts = util_bitcount(clear_rts);
3452 uint32_t packed_clear_value[MAX_RTS][4];
3453
3454 uint32_t idx = 0;
3455 u_foreach_bit(b, clear_rts) {
3456 memcpy(&packed_clear_value[idx], &clear_value[b], 4 * sizeof(uint32_t));
3457 idx++;
3458 }
3459
3460 if (num_rts > 0)
3461 tu6_emit_blit_consts_load(cmd, cs, CP_LOAD_STATE6_FRAG, SB6_FS_SHADER,
3462 0, packed_clear_value, num_rts);
3463
3464 for (uint32_t i = 0; i < rect_count; i++) {
3465 /* This should be true because of this valid usage for
3466 * vkCmdClearAttachments:
3467 *
3468 * "If the render pass instance this is recorded in uses multiview,
3469 * then baseArrayLayer must be zero and layerCount must be one"
3470 */
3471 assert(!subpass->multiview_mask || rects[i].baseArrayLayer == 0);
3472
3473 /* a630 doesn't support multiview masks, which means that we can't use
3474 * the normal multiview path without potentially recompiling a shader
3475 * on-demand or using a more complicated variant that takes the mask as
3476 * a const. Just use the layered path instead, since it shouldn't be
3477 * much worse.
3478 */
3479 for_each_layer(layer, subpass->multiview_mask, rects[i].layerCount)
3480 {
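/* Two raw vec4 vertex inputs: top-left xy with the clear depth and the
 * destination layer (reinterpreted as a float via uif()), then
 * bottom-right xy with the clear depth and 1.0.
 */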
3481 const float coords[] = {
3482 rects[i].rect.offset.x,
3483 rects[i].rect.offset.y,
3484 z_clear_val,
3485 uif(rects[i].baseArrayLayer + layer),
3486 rects[i].rect.offset.x + rects[i].rect.extent.width,
3487 rects[i].rect.offset.y + rects[i].rect.extent.height,
3488 z_clear_val,
3489 1.0f,
3490 };
3491
3492 r3d_coords_raw(cmd, cs, coords);
3493 r3d_run_vis(cmd, cs);
3494 }
3495 }
3496
3497 /* Re-enable sample counting. */
3498 tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.disable = false));
3499
3500 if (cmd->state.prim_generated_query_running_before_rp) {
3501 tu_emit_event_write<CHIP>(cmd, cs, FD_START_PRIMITIVE_CTRS);
3502 }
3503
3504 trace_end_sysmem_clear_all(&cmd->trace, cs);
3505 }
3506
3507 template <chip CHIP>
3508 static void
3509 clear_gmem_attachment(struct tu_cmd_buffer *cmd,
3510 struct tu_cs *cs,
3511 enum pipe_format format,
3512 uint8_t clear_mask,
3513 uint32_t gmem_offset,
3514 const VkClearValue *value)
3515 {
3516 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_DST_INFO, 1);
3517 tu_cs_emit(cs, A6XX_RB_BLIT_DST_INFO_COLOR_FORMAT(
3518 blit_base_format<CHIP>(format, false, true)));
3519
3520 tu_cs_emit_regs(cs, A6XX_RB_BLIT_INFO(.type = BLIT_EVENT_CLEAR,
3521 .clear_mask = clear_mask));
3522
3523 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_BASE_GMEM, 1);
3524 tu_cs_emit(cs, gmem_offset);
3525
3526 tu_cs_emit_pkt4(cs, REG_A6XX_RB_UNKNOWN_88D0, 1);
3527 tu_cs_emit(cs, 0);
3528
3529 uint32_t clear_vals[4] = {};
3530 pack_blit_event_clear_value(value, format, clear_vals);
3531
3532 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
3533 tu_cs_emit_array(cs, clear_vals, 4);
3534
3535 tu_emit_event_write<CHIP>(cmd, cs, FD_BLIT);
3536 }
3537
3538 template <chip CHIP>
3539 static void
3540 tu_emit_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
3541 struct tu_cs *cs,
3542 uint32_t attachment,
3543 uint32_t base_layer,
3544 uint32_t layers,
3545 uint32_t layer_mask,
3546 VkImageAspectFlags mask,
3547 const VkClearValue *value)
3548 {
3549 const struct tu_render_pass_attachment *att =
3550 &cmd->state.pass->attachments[attachment];
3551
3552 trace_start_gmem_clear(&cmd->trace, cs, att->format, att->samples);
3553
3554 tu_cs_emit_regs(cs,
3555 A6XX_RB_BLIT_GMEM_MSAA_CNTL(tu_msaa_samples(att->samples)));
3556
3557 enum pipe_format format = vk_format_to_pipe_format(att->format);
3558 for_each_layer(i, layer_mask, layers) {
3559 uint32_t layer = i + base_layer;
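/* D32S8 lives in GMEM as two separate planes, so depth and stencil are
 * cleared individually at their own GMEM offsets.
 */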
3560 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3561 if (mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3562 clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_Z32_FLOAT, 0xf,
3563 tu_attachment_gmem_offset(cmd, att, layer), value);
3564 }
3565 if (mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3566 clear_gmem_attachment<CHIP>(cmd, cs, PIPE_FORMAT_S8_UINT, 0xf,
3567 tu_attachment_gmem_offset_stencil(cmd, att, layer), value);
3568 }
3569 } else {
3570 clear_gmem_attachment<CHIP>(cmd, cs, format, aspect_write_mask(format, mask),
3571 tu_attachment_gmem_offset(cmd, att, layer), value);
3572 }
3573 }
3574
3575 tu_flush_for_access(&cmd->state.renderpass_cache, TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
3576
3577 trace_end_gmem_clear(&cmd->trace, cs);
3578 }
3579
3580 template <chip CHIP>
3581 static void
3582 tu_clear_gmem_attachments(struct tu_cmd_buffer *cmd,
3583 uint32_t attachment_count,
3584 const VkClearAttachment *attachments,
3585 uint32_t rect_count,
3586 const VkClearRect *rects)
3587 {
3588 const struct tu_subpass *subpass = cmd->state.subpass;
3589 struct tu_cs *cs = &cmd->draw_cs;
3590
3591 if (rect_count > 1)
3592 perf_debug(cmd->device, "TODO: Swap tu_clear_gmem_attachments() loop for smaller command stream");
3593
3594 for (unsigned i = 0; i < rect_count; i++) {
3595 unsigned x1 = rects[i].rect.offset.x;
3596 unsigned y1 = rects[i].rect.offset.y;
3597 unsigned x2 = x1 + rects[i].rect.extent.width - 1;
3598 unsigned y2 = y1 + rects[i].rect.extent.height - 1;
3599
3600 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
3601 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
3602 tu_cs_emit(cs, A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
3603
3604 for (unsigned j = 0; j < attachment_count; j++) {
3605 uint32_t a;
3606 if (attachments[j].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT)
3607 a = subpass->color_attachments[attachments[j].colorAttachment].attachment;
3608 else
3609 a = subpass->depth_stencil_attachment.attachment;
3610
3611 if (a == VK_ATTACHMENT_UNUSED)
3612 continue;
3613
3614 tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, rects[i].baseArrayLayer,
3615 rects[i].layerCount,
3616 subpass->multiview_mask,
3617 attachments[j].aspectMask,
3618 &attachments[j].clearValue);
3619 }
3620 }
3621 }
3622
3623 template <chip CHIP>
3624 static void
3625 tu_clear_attachments(struct tu_cmd_buffer *cmd,
3626 uint32_t attachmentCount,
3627 const VkClearAttachment *pAttachments,
3628 uint32_t rectCount,
3629 const VkClearRect *pRects)
3630 {
3631 struct tu_cs *cs = &cmd->draw_cs;
3632
3633 /* The sysmem path behaves like a draw. Note that we don't have a way of using
3634 * different flushes for sysmem/gmem, so this needs to be outside of the cond_exec.
3635 */
3636 tu_emit_cache_flush_renderpass<CHIP>(cmd);
3637
3638 /* vkCmdClearAttachments is supposed to respect the predicate if active. The
3639 * easiest way to do this is to always use the 3d path, which always works
3640 * even with GMEM because it's just a simple draw using the existing
3641 * attachment state.
3642 *
3643 * Similarly, we also use the 3D path when in a secondary command buffer that
3644 * doesn't know the GMEM layout that will be chosen by the primary.
3645 */
3646 if (cmd->state.predication_active || cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT) {
3647 tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3648 return;
3649 }
3650
3651 /* If tile load/stores could be skipped based on draws intersecting them at
3652 * binning time, emit the clear as a 3D draw so that it contributes to
3653 * that visibility.
3654 */
3655 const struct tu_subpass *subpass = cmd->state.subpass;
3656 for (uint32_t i = 0; i < attachmentCount; i++) {
3657 uint32_t a;
3658 if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3659 uint32_t c = pAttachments[i].colorAttachment;
3660 a = subpass->color_attachments[c].attachment;
3661 } else {
3662 a = subpass->depth_stencil_attachment.attachment;
3663 }
3664 if (a != VK_ATTACHMENT_UNUSED) {
3665 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
3666 if (att->cond_load_allowed || att->cond_store_allowed) {
3667 tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3668 return;
3669 }
3670 }
3671 }
3672
3673 /* Otherwise, emit 2D blits for gmem rendering. */
3674 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_GMEM);
3675 tu_clear_gmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3676 tu_cond_exec_end(cs);
3677
3678 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
3679 tu_clear_sysmem_attachments<CHIP>(cmd, attachmentCount, pAttachments, rectCount, pRects);
3680 tu_cond_exec_end(cs);
3681 }
3682
3683 static void
3684 tu7_clear_attachment_generic_single_rect(
3685 struct tu_cmd_buffer *cmd,
3686 struct tu_cs *cs,
3687 const struct tu_render_pass_attachment *att,
3688 const VkClearAttachment *clear_att,
3689 uint32_t a,
3690 const VkClearRect *rect)
3691 {
3692 const struct tu_subpass *subpass = cmd->state.subpass;
3693 unsigned x1 = rect->rect.offset.x;
3694 unsigned y1 = rect->rect.offset.y;
3695 unsigned x2 = x1 + rect->rect.extent.width - 1;
3696 unsigned y2 = y1 + rect->rect.extent.height - 1;
3697
3698 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_SCISSOR_TL, 2);
3699 tu_cs_emit(cs,
3700 A6XX_RB_BLIT_SCISSOR_TL_X(x1) | A6XX_RB_BLIT_SCISSOR_TL_Y(y1));
3701 tu_cs_emit(cs,
3702 A6XX_RB_BLIT_SCISSOR_BR_X(x2) | A6XX_RB_BLIT_SCISSOR_BR_Y(y2));
3703
3704 auto value = &clear_att->clearValue;
3705
3706 enum pipe_format format = vk_format_to_pipe_format(att->format);
3707 for_each_layer(i, subpass->multiview_mask, rect->layerCount) {
3708 uint32_t layer = i + rect->baseArrayLayer;
3709 uint32_t mask =
3710 aspect_write_mask_generic_clear(format, clear_att->aspectMask);
3711
3712 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3713 if (clear_att->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3714 tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_Z32_FLOAT, mask,
3715 false, layer, value, a);
3716 }
3717 if (clear_att->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3718 tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_S8_UINT, mask, true,
3719 layer, value, a);
3720 }
3721 } else {
3722 tu7_generic_layer_clear(cmd, cs, format, mask, false, layer, value, a);
3723 }
3724 }
3725 }
3726
3727 static void
3728 tu_clear_attachments_generic(struct tu_cmd_buffer *cmd,
3729 uint32_t attachmentCount,
3730 const VkClearAttachment *pAttachments,
3731 uint32_t rectCount,
3732 const VkClearRect *pRects)
3733 {
3734 struct tu_cs *cs = &cmd->draw_cs;
3735
3736 uint32_t clear_aspects = 0;
3737 for (uint32_t i = 0; i < attachmentCount; i++) {
3738 clear_aspects |= pAttachments[i].aspectMask;
3739 }
3740
3741 /* Generic clear doesn't go through CCU (or other caches),
3742 * so we have to flush (clean+invalidate) corresponding caches.
3743 */
3744 tu_cond_exec_start(cs, CP_COND_EXEC_0_RENDER_MODE_SYSMEM);
3745 if (clear_aspects & VK_IMAGE_ASPECT_COLOR_BIT) {
3746 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
3747 tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_COLOR).value);
3748 }
3749 if (clear_aspects & (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT)) {
3750 tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 1);
3751 tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = CCU_FLUSH_DEPTH).value);
3752 }
3753 tu_cs_emit_wfi(cs);
3754 tu_cond_exec_end(cs);
3755
3756 const struct tu_subpass *subpass = cmd->state.subpass;
3757 for (uint32_t i = 0; i < attachmentCount; i++) {
3758 uint32_t a;
3759 if (pAttachments[i].aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
3760 uint32_t c = pAttachments[i].colorAttachment;
3761 a = subpass->color_attachments[c].attachment;
3762 } else {
3763 a = subpass->depth_stencil_attachment.attachment;
3764 }
3765 if (a != VK_ATTACHMENT_UNUSED) {
3766 const struct tu_render_pass_attachment *att = &cmd->state.pass->attachments[a];
3767 const struct tu_image_view *iview = cmd->state.attachments[a];
3768 trace_start_generic_clear(&cmd->trace, cs, att->format,
3769 iview->view.ubwc_enabled, att->samples);
3770 for (unsigned j = 0; j < rectCount; j++) {
3771 tu7_clear_attachment_generic_single_rect(
3772 cmd, cs, att, &pAttachments[i], a, &pRects[j]);
3773 }
3774 trace_end_generic_clear(&cmd->trace, cs);
3775 }
3776 }
3777 }
3778
3779 template <chip CHIP>
3780 VKAPI_ATTR void VKAPI_CALL
3781 tu_CmdClearAttachments(VkCommandBuffer commandBuffer,
3782 uint32_t attachmentCount,
3783 const VkClearAttachment *pAttachments,
3784 uint32_t rectCount,
3785 const VkClearRect *pRects)
3786 {
3787 VK_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
3788
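/* A depth clear via vkCmdClearAttachments bypasses the normal draw path's
 * LRZ updates, so LRZ is disabled for the remainder of the renderpass.
 */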
3789 for (uint32_t j = 0; j < attachmentCount; j++) {
3790 if ((pAttachments[j].aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) == 0)
3791 continue;
3792
3793 tu_lrz_disable_during_renderpass<CHIP>(cmd);
3794 }
3795
3796 if (cmd->device->physical_device->info->a7xx.has_generic_clear &&
3797 /* Both having predication active and not knowing the GMEM layout could be
3798 * solved by CS patching, which is exactly what the proprietary driver does.
3799 * We don't implement it because we don't expect a meaningful benefit.
3800 */
3801 !(cmd->state.predication_active ||
3802 cmd->state.gmem_layout == TU_GMEM_LAYOUT_COUNT)) {
3803 tu_clear_attachments_generic(cmd, attachmentCount, pAttachments, rectCount, pRects);
3804 } else {
3805 tu_clear_attachments<CHIP>(cmd, attachmentCount, pAttachments,
3806 rectCount, pRects);
3807 }
3808 }
3809 TU_GENX(tu_CmdClearAttachments);
3810
3811 template <chip CHIP>
3812 static void
3813 clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
3814 struct tu_cs *cs,
3815 VkFormat vk_format,
3816 VkImageAspectFlags clear_mask,
3817 uint32_t a,
3818 bool separate_ds)
3819 {
3820 enum pipe_format format = vk_format_to_pipe_format(vk_format);
3821 const struct tu_framebuffer *fb = cmd->state.framebuffer;
3822 const struct tu_image_view *iview = cmd->state.attachments[a];
3823 const uint32_t clear_views = cmd->state.pass->attachments[a].clear_views;
3824 const struct blit_ops *ops = &r2d_ops<CHIP>;
3825 const VkClearValue *value = &cmd->state.clear_values[a];
3826 if (cmd->state.pass->attachments[a].samples > 1)
3827 ops = &r3d_ops<CHIP>;
3828
3829 trace_start_sysmem_clear(&cmd->trace, cs, vk_format, ops == &r3d_ops<CHIP>,
3830 cmd->state.pass->attachments[a].samples);
3831
3832 ops->setup(cmd, cs, format, format, clear_mask, 0, true, iview->view.ubwc_enabled,
3833 cmd->state.pass->attachments[a].samples);
3834 ops->coords(cmd, cs, cmd->state.render_area.offset, (VkOffset2D) {},
3835 cmd->state.render_area.extent);
3836 ops->clear_value(cmd, cs, format, value);
3837
3838 for_each_layer(i, clear_views, fb->layers) {
3839 if (separate_ds) {
3840 if (vk_format == VK_FORMAT_D32_SFLOAT) {
3841 ops->dst_depth(cs, iview, i);
3842 } else {
3843 ops->dst_stencil(cs, iview, i);
3844 }
3845 } else {
3846 ops->dst(cs, &iview->view, i, format);
3847 }
3848 ops->run(cmd, cs);
3849 }
3850
3851 ops->teardown(cmd, cs);
3852
3853 trace_end_sysmem_clear(&cmd->trace, cs);
3854 }
3855
3856 template <chip CHIP>
3857 void
3858 tu_clear_sysmem_attachment(struct tu_cmd_buffer *cmd,
3859 struct tu_cs *cs,
3860 uint32_t a)
3861 {
3862 const struct tu_render_pass_attachment *attachment =
3863 &cmd->state.pass->attachments[a];
3864
3865 if (!attachment->clear_mask)
3866 return;
3867
3868 if (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3869 if (attachment->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3870 clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_D32_SFLOAT, VK_IMAGE_ASPECT_COLOR_BIT,
3871 a, true);
3872 }
3873 if (attachment->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3874 clear_sysmem_attachment<CHIP>(cmd, cs, VK_FORMAT_S8_UINT, VK_IMAGE_ASPECT_COLOR_BIT,
3875 a, true);
3876 }
3877 } else {
3878 clear_sysmem_attachment<CHIP>(cmd, cs, attachment->format, attachment->clear_mask,
3879 a, false);
3880 }
3881
3882 /* The spec doesn't explicitly say, but presumably the initial renderpass
3883 * clear is considered part of the renderpass, and therefore barriers
3884 * aren't required inside the subpass/renderpass. Therefore we need to
3885 * flush CCU color into CCU depth here, just like with
3886 * vkCmdClearAttachments(). Note that because this only happens at the
3887 * beginning of a renderpass, and renderpass writes are considered
3888 * "incoherent", we shouldn't have to worry about syncing depth into color
3889 * beforehand as depth should already be flushed.
3890 */
3891 if (vk_format_is_depth_or_stencil(attachment->format)) {
3892 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
3893 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_DEPTH);
3894 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_DEPTH);
3895 } else {
3896 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
3897 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_INVALIDATE_COLOR);
3898 }
3899
3900 tu_cs_emit_wfi(cs);
3901 }
3902 TU_GENX(tu_clear_sysmem_attachment);
3903
3904 template <chip CHIP>
3905 void
3906 tu_clear_gmem_attachment(struct tu_cmd_buffer *cmd,
3907 struct tu_cs *cs,
3908 uint32_t a)
3909 {
3910 const struct tu_render_pass_attachment *attachment =
3911 &cmd->state.pass->attachments[a];
3912
3913 if (!attachment->clear_mask)
3914 return;
3915
3916 tu_emit_clear_gmem_attachment<CHIP>(cmd, cs, a, 0, cmd->state.framebuffer->layers,
3917 attachment->clear_views,
3918 attachment->clear_mask,
3919 &cmd->state.clear_values[a]);
3920 }
3921 TU_GENX(tu_clear_gmem_attachment);
3922
3923 void
3924 tu7_generic_clear_attachment(struct tu_cmd_buffer *cmd, struct tu_cs *cs, uint32_t a)
3925 {
3926 const struct tu_render_pass_attachment *att =
3927 &cmd->state.pass->attachments[a];
3928 const VkClearValue *value = &cmd->state.clear_values[a];
3929 const struct tu_image_view *iview = cmd->state.attachments[a];
3930
3931 trace_start_generic_clear(&cmd->trace, cs, att->format,
3932 iview->view.ubwc_enabled, att->samples);
3933
3934 enum pipe_format format = vk_format_to_pipe_format(att->format);
3935 for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
3936 uint32_t layer = i;
3937 uint32_t mask =
3938 aspect_write_mask_generic_clear(format, att->clear_mask);
3939 if (att->format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
3940 if (att->clear_mask & VK_IMAGE_ASPECT_DEPTH_BIT) {
3941 tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_Z32_FLOAT, mask,
3942 false, layer, value, a);
3943 }
3944 if (att->clear_mask & VK_IMAGE_ASPECT_STENCIL_BIT) {
3945 tu7_generic_layer_clear(cmd, cs, PIPE_FORMAT_S8_UINT, mask, true,
3946 layer, value, a);
3947 }
3948 } else {
3949 tu7_generic_layer_clear(cmd, cs, format, mask, false, layer, value, a);
3950 }
3951 }
3952
3953 tu_flush_for_access(&cmd->state.renderpass_cache,
3954 TU_ACCESS_BLIT_WRITE_GMEM, TU_ACCESS_NONE);
3955
3956 trace_end_generic_clear(&cmd->trace, cs);
3957 }
3958
3959 template <chip CHIP>
3960 static void
3961 tu_emit_blit(struct tu_cmd_buffer *cmd,
3962 struct tu_cs *cs,
3963 const struct tu_image_view *iview,
3964 const struct tu_render_pass_attachment *attachment,
3965 const VkClearValue *clear_value,
3966 enum a6xx_blit_event_type blit_event_type,
3967 bool separate_stencil)
3968 {
3969 assert(blit_event_type != BLIT_EVENT_CLEAR);
3970 uint32_t clear_mask = 0;
3971
3972 /* BLIT_EVENT_STORE_AND_CLEAR would presumably swallow the
3973 * BLIT_EVENT_CLEAR at the start of a renderpass, and be more efficient.
3974 */
3975 if (blit_event_type == BLIT_EVENT_STORE && clear_value &&
3976 attachment->clear_mask &&
3977 use_generic_clear_for_image_clear(cmd, iview->image)) {
3978 blit_event_type = BLIT_EVENT_STORE_AND_CLEAR;
3979
3980 enum pipe_format format = vk_format_to_pipe_format(attachment->format);
3981 VkImageAspectFlags aspect_mask = attachment->clear_mask;
3982 if (format == PIPE_FORMAT_Z24_UNORM_S8_UINT) {
3983 if (separate_stencil)
3984 aspect_mask = VK_IMAGE_ASPECT_STENCIL_BIT;
3985 else
3986 aspect_mask = VK_IMAGE_ASPECT_DEPTH_BIT;
3987 }
3988 if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT) {
3989 if (separate_stencil)
3990 format = PIPE_FORMAT_S8_UINT;
3991 else
3992 format = PIPE_FORMAT_Z32_FLOAT;
3993 }
3994
3995 clear_mask = aspect_write_mask_generic_clear(format, aspect_mask);
3996
3997 uint32_t clear_vals[4] = {};
3998 pack_blit_event_clear_value(clear_value, format, clear_vals);
3999
4000 tu_cs_emit_pkt4(cs, REG_A6XX_RB_BLIT_CLEAR_COLOR_DW0, 4);
4001 tu_cs_emit_array(cs, clear_vals, 4);
4002 }
4003
4004 event_blit_setup(cs, attachment, blit_event_type, clear_mask);
4005
4006 for_each_layer(i, attachment->clear_views, cmd->state.framebuffer->layers) {
4007 event_blit_dst_view blt_view = blt_view_from_tu_view(iview, i);
4008 event_blit_run<CHIP>(cmd, cs, attachment, &blt_view, separate_stencil);
4009 }
4010
4011 tu_flush_for_access(&cmd->state.cache, TU_ACCESS_BLIT_WRITE_GMEM,
4012 TU_ACCESS_NONE);
4013 }
4014
4015 static bool
4016 blit_can_resolve(VkFormat format)
4017 {
4018 const struct util_format_description *desc = vk_format_description(format);
4019
4020 /* The blit event can only do resolve for simple cases:
4021 * averaging samples as unsigned integers or choosing only one sample.
4022 * Note this is allowed for sRGB formats, but the results differ from the 2D draw resolve.
4023 */
4024 if (vk_format_is_snorm(format))
4025 return false;
4026
4027 /* Can't do formats with channels larger than 10 bits.
4028 * Note: this includes all float formats.
4029 * Note2: single-channel integer formats seem OK.
4030 */
4031 if (desc->channel[0].size > 10 && vk_format_is_color(format))
4032 return false;
4033
4034 switch (format) {
4035 /* For unknown reasons the blit event can't MSAA-resolve these formats when tiled,
4036 * likely because these formats have a different layout from other cpp=2 formats.
4037 */
4038 case VK_FORMAT_R8G8_UNORM:
4039 case VK_FORMAT_R8G8_UINT:
4040 case VK_FORMAT_R8G8_SINT:
4041 case VK_FORMAT_R8G8_SRGB:
4042 return false;
4043 default:
4044 break;
4045 }
4046
4047 return true;
4048 }
4049
4050 struct apply_load_coords_state {
4051 unsigned view;
4052 };
4053
4054 static void
4055 fdm_apply_load_coords(struct tu_cmd_buffer *cmd,
4056 struct tu_cs *cs,
4057 void *data,
4058 VkRect2D bin,
4059 unsigned views,
4060 VkExtent2D *frag_areas)
4061 {
4062 const struct apply_load_coords_state *state =
4063 (const struct apply_load_coords_state *)data;
4064 assert(state->view < views);
4065 VkExtent2D frag_area = frag_areas[state->view];
4066
4067 assert(bin.extent.width % frag_area.width == 0);
4068 assert(bin.extent.height % frag_area.height == 0);
4069 uint32_t scaled_width = bin.extent.width / frag_area.width;
4070 uint32_t scaled_height = bin.extent.height / frag_area.height;
4071
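/* Interleaved (dst, src) coordinate pairs: the GMEM destination rect is
 * the bin scaled down by the fragment area, while the sysmem source rect
 * covers the full unscaled bin.
 */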
4072 const float coords[] = {
4073 bin.offset.x, bin.offset.y,
4074 bin.offset.x, bin.offset.y,
4075 bin.offset.x + scaled_width, bin.offset.y + scaled_height,
4076 bin.offset.x + bin.extent.width, bin.offset.y + bin.extent.height,
4077 };
4078 r3d_coords_raw(cmd, cs, coords);
4079 }
4080
4081 template <chip CHIP>
4082 static void
4083 load_3d_blit(struct tu_cmd_buffer *cmd,
4084 struct tu_cs *cs,
4085 const struct tu_image_view *iview,
4086 const struct tu_render_pass_attachment *att,
4087 bool separate_stencil)
4088 {
4089 const struct tu_framebuffer *fb = cmd->state.framebuffer;
4090 enum pipe_format format = iview->view.format;
4091 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4092 if (separate_stencil)
4093 format = PIPE_FORMAT_S8_UINT;
4094 else
4095 format = PIPE_FORMAT_Z32_FLOAT;
4096 }
4097 r3d_setup<CHIP>(cmd, cs, format, format, VK_IMAGE_ASPECT_COLOR_BIT,
4098 R3D_DST_GMEM, false, iview->view.ubwc_enabled,
4099 iview->image->vk.samples);
4100
4101 if (!cmd->state.pass->has_fdm) {
4102 r3d_coords(cmd, cs, (VkOffset2D) { 0, 0 }, (VkOffset2D) { 0, 0 },
4103 (VkExtent2D) { fb->width, fb->height });
4104 }
4105
4106 /* Normal loads read directly from system memory, so we have to invalidate
4107 * UCHE in case it contains stale data.
4108 */
4109 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4110
4111 /* Wait for CACHE_INVALIDATE to land */
4112 tu_cs_emit_wfi(cs);
4113
4114 for_each_layer(i, att->clear_views, cmd->state.framebuffer->layers) {
4115 if (cmd->state.pass->has_fdm) {
4116 struct apply_load_coords_state state = {
4117 .view = att->clear_views ? i : 0,
4118 };
4119 tu_create_fdm_bin_patchpoint(cmd, cs, 4, fdm_apply_load_coords, state);
4120 }
4121
4122 r3d_dst_gmem<CHIP>(cmd, cs, iview, att, separate_stencil, i);
4123
4124 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4125 if (separate_stencil)
4126 r3d_src_stencil(cmd, cs, iview, i);
4127 else
4128 r3d_src_depth(cmd, cs, iview, i);
4129 } else {
4130 r3d_src_gmem_load(cmd, cs, iview, i);
4131 }
4132
4133 r3d_run(cmd, cs);
4134 }
4135
4136 r3d_teardown<CHIP>(cmd, cs);
4137
4138 /* It seems we need to WFI here for depth/stencil because color writes here
4139 * aren't synchronized with depth/stencil writes.
4140 *
4141 * Note: the blob also uses a WFI for color attachments but this hasn't
4142 * been seen to be necessary.
4143 */
4144 if (vk_format_is_depth_or_stencil(att->format))
4145 tu_cs_emit_wfi(cs);
4146 }
4147
4148 static void
4149 tu_begin_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4150 struct tu_cs *cs, bool load)
4151 {
4152 tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
4153
4154 if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4155 return;
4156
4157 uint64_t result_iova;
4158 if (load)
4159 result_iova = global_iova(cmd, dbg_gmem_taken_loads);
4160 else
4161 result_iova = global_iova(cmd, dbg_gmem_taken_stores);
4162
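/* Update the "taken" debug counter inside the predicated region: this
 * packet only executes when the preceding PRED_TEST cond_exec passes, so
 * comparing it against the "total" counter updated in
 * tu_end_load_store_cond_exec() shows how many GMEM ops were skipped.
 */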
4163 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4164 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4165 tu_cs_emit_qw(cs, result_iova);
4166 tu_cs_emit_qw(cs, result_iova);
4167 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4168 }
4169
4170 static void
4171 tu_end_load_store_cond_exec(struct tu_cmd_buffer *cmd,
4172 struct tu_cs *cs, bool load)
4173 {
4174 tu_cond_exec_end(cs);
4175
4176 if (!TU_DEBUG(LOG_SKIP_GMEM_OPS))
4177 return;
4178
4179 uint64_t result_iova;
4180 if (load)
4181 result_iova = global_iova(cmd, dbg_gmem_total_loads);
4182 else
4183 result_iova = global_iova(cmd, dbg_gmem_total_stores);
4184
4185 tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 7);
4186 tu_cs_emit(cs, CP_MEM_TO_MEM_0_NEG_B);
4187 tu_cs_emit_qw(cs, result_iova);
4188 tu_cs_emit_qw(cs, result_iova);
4189 tu_cs_emit_qw(cs, global_iova(cmd, dbg_one));
4190 }
4191
4192 template <chip CHIP>
4193 void
4194 tu_load_gmem_attachment(struct tu_cmd_buffer *cmd,
4195 struct tu_cs *cs,
4196 uint32_t a,
4197 bool cond_exec_allowed,
4198 bool force_load)
4199 {
4200 const struct tu_image_view *iview = cmd->state.attachments[a];
4201 const struct tu_render_pass_attachment *attachment =
4202 &cmd->state.pass->attachments[a];
4203
4204 bool load_common = attachment->load || force_load;
4205 bool load_stencil =
4206 attachment->load_stencil ||
4207 (attachment->format == VK_FORMAT_D32_SFLOAT_S8_UINT && force_load);
4208
4209 if (!load_common && !load_stencil)
4210 return;
4211
4212 trace_start_gmem_load(&cmd->trace, cs, attachment->format, force_load);
4213
4214 /* If the attachment will be cleared by vkCmdClearAttachments, it is likely
4215 * to be only partially cleared, and since that clear is done by a 2D blit
4216 * it doesn't produce geometry, so we have to load unconditionally.
4217 *
4218 * To simplify the conditions, treat a partially cleared separate DS as
4219 * fully cleared and don't emit the cond_exec.
4220 */
4221 bool cond_exec = cond_exec_allowed && attachment->cond_load_allowed;
4222 if (cond_exec)
4223 tu_begin_load_store_cond_exec(cmd, cs, true);
4224
4225 if (TU_DEBUG(3D_LOAD) ||
4226 cmd->state.pass->has_fdm) {
4227 if (load_common || load_stencil)
4228 tu_disable_draw_states(cmd, cs);
4229
4230 if (load_common)
4231 load_3d_blit<CHIP>(cmd, cs, iview, attachment, false);
4232
4233 if (load_stencil)
4234 load_3d_blit<CHIP>(cmd, cs, iview, attachment, true);
4235 } else {
4236 if (load_common)
4237 tu_emit_blit<CHIP>(cmd, cs, iview, attachment, NULL, BLIT_EVENT_LOAD, false);
4238
4239 if (load_stencil)
4240 tu_emit_blit<CHIP>(cmd, cs, iview, attachment, NULL, BLIT_EVENT_LOAD, true);
4241 }
4242
4243 if (cond_exec)
4244 tu_end_load_store_cond_exec(cmd, cs, true);
4245
4246 trace_end_gmem_load(&cmd->trace, cs);
4247 }
4248 TU_GENX(tu_load_gmem_attachment);
4249
4250 template <chip CHIP>
4251 static void
4252 store_cp_blit(struct tu_cmd_buffer *cmd,
4253 struct tu_cs *cs,
4254 const struct tu_image_view *iview,
4255 uint32_t samples,
4256 bool separate_stencil,
4257 enum pipe_format src_format,
4258 enum pipe_format dst_format,
4259 uint32_t layer,
4260 uint32_t gmem_offset,
4261 uint32_t cpp)
4262 {
4263 r2d_setup_common<CHIP>(cmd, cs, src_format, dst_format,
4264 VK_IMAGE_ASPECT_COLOR_BIT, 0, false,
4265 iview->view.ubwc_enabled, true);
4266
4267 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4268 if (!separate_stencil) {
4269 r2d_dst_depth(cs, iview, layer);
4270 } else {
4271 r2d_dst_stencil(cs, iview, layer);
4272 }
4273 } else {
4274 r2d_dst<CHIP>(cs, &iview->view, layer, src_format);
4275 }
4276
4277 enum a6xx_format fmt = blit_format_texture<CHIP>(src_format, TILE6_2, true).fmt;
4278 fixup_src_format(&src_format, dst_format, &fmt);
4279
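/* The 2D blit source is the GMEM tile itself: a raw TILE6_2 buffer at
 * gmem_base + gmem_offset with a pitch of one tile width times the
 * attachment cpp.
 */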
4280 tu_cs_emit_regs(cs,
4281 SP_PS_2D_SRC_INFO(CHIP,
4282 .color_format = fmt,
4283 .tile_mode = TILE6_2,
4284 .color_swap = WZYX,
4285 .srgb = util_format_is_srgb(src_format),
4286 .samples = tu_msaa_samples(samples),
4287 .samples_average = !util_format_is_pure_integer(dst_format) &&
4288 !util_format_is_depth_or_stencil(dst_format),
4289 .unk20 = 1,
4290 .unk22 = 1),
4291 SP_PS_2D_SRC_SIZE(CHIP, .width = iview->vk.extent.width, .height = iview->vk.extent.height),
4292 SP_PS_2D_SRC(CHIP, .qword = cmd->device->physical_device->gmem_base + gmem_offset),
4293 SP_PS_2D_SRC_PITCH(CHIP, .pitch = cmd->state.tiling->tile0.width * cpp));
4294
4295 /* sync GMEM writes with CACHE. */
4296 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4297 if (CHIP >= A7XX)
4298 /* On A7XX, we need to wait for any CP_EVENT_WRITE::BLIT operations
4299 * arising from GMEM load/clears to land before we can continue.
4300 */
4301 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_BLIT_CACHE);
4302
4303 /* Wait for cache event to land */
4304 tu_cs_emit_wfi(cs);
4305
4306 r2d_run(cmd, cs);
4307
4308 /* CP_BLIT writes to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4309 * sysmem, and we generally assume that GMEM renderpasses leave their
4310 * results in sysmem, so we need to flush manually here.
4311 */
4312 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4313 }
4314
4315 template <chip CHIP>
4316 static void
4317 store_3d_blit(struct tu_cmd_buffer *cmd,
4318 struct tu_cs *cs,
4319 const struct tu_image_view *iview,
4320 VkSampleCountFlagBits dst_samples,
4321 bool separate_stencil,
4322 enum pipe_format src_format,
4323 enum pipe_format dst_format,
4324 const VkRect2D *render_area,
4325 uint32_t layer,
4326 uint32_t gmem_offset,
4327 uint32_t cpp)
4328 {
4329 /* RB_BIN_CONTROL/GRAS_BIN_CONTROL are normally only set once and they
4330 * aren't set until we know whether we're HW binning or not, and we want to
4331 * avoid a dependence on that here to be able to store attachments before
4332 * the end of the renderpass in the future. Use the scratch space to
4333 * save/restore them dynamically.
4334 */
4335 tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4336 tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4337 CP_REG_TO_SCRATCH_0_SCRATCH(0) |
4338 CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4339 if (CHIP >= A7XX) {
4340 tu_cs_emit_pkt7(cs, CP_REG_TO_SCRATCH, 1);
4341 tu_cs_emit(cs, CP_REG_TO_SCRATCH_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4342 CP_REG_TO_SCRATCH_0_SCRATCH(1) |
4343 CP_REG_TO_SCRATCH_0_CNT(1 - 1));
4344 }
4345
4346 r3d_setup<CHIP>(cmd, cs, src_format, dst_format, VK_IMAGE_ASPECT_COLOR_BIT,
4347 0, false, iview->view.ubwc_enabled, dst_samples);
4348
4349 r3d_coords(cmd, cs, render_area->offset, render_area->offset, render_area->extent);
4350
4351 if (iview->image->vk.format == VK_FORMAT_D32_SFLOAT_S8_UINT) {
4352 if (!separate_stencil) {
4353 r3d_dst_depth<CHIP>(cs, iview, layer);
4354 } else {
4355 r3d_dst_stencil<CHIP>(cs, iview, layer);
4356 }
4357 } else {
4358 r3d_dst<CHIP>(cs, &iview->view, layer, src_format);
4359 }
4360
4361 r3d_src_gmem<CHIP>(cmd, cs, iview, src_format, dst_format, gmem_offset, cpp);
4362
4363 /* sync GMEM writes with CACHE. */
4364 tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
4365
4366 /* Wait for CACHE_INVALIDATE to land */
4367 tu_cs_emit_wfi(cs);
4368
4369 r3d_run(cmd, cs);
4370
4371 r3d_teardown<CHIP>(cmd, cs);
4372
4373 /* Draws write to the CCU, unlike CP_EVENT_WRITE::BLIT which writes to
4374 * sysmem, and we generally assume that GMEM renderpasses leave their
4375 * results in sysmem, so we need to flush manually here. The 3d blit path
4376 * writes to depth images as a color RT, so there's no need to flush depth.
4377 */
4378 tu_emit_event_write<CHIP>(cmd, cs, FD_CCU_CLEAN_COLOR);
4379
4380 /* Restore RB_BIN_CONTROL/GRAS_BIN_CONTROL saved above; both come from the single saved scratch value since the two registers are programmed identically. */
4381 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4382 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_RB_BIN_CONTROL) |
4383 CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4384 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4385
4386 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4387 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A6XX_GRAS_BIN_CONTROL) |
4388 CP_SCRATCH_TO_REG_0_SCRATCH(0) |
4389 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4390
4391 if (CHIP >= A7XX) {
4392 tu_cs_emit_pkt7(cs, CP_SCRATCH_TO_REG, 1);
4393 tu_cs_emit(cs, CP_SCRATCH_TO_REG_0_REG(REG_A7XX_RB_UNKNOWN_8812) |
4394 CP_SCRATCH_TO_REG_0_SCRATCH(1) |
4395 CP_SCRATCH_TO_REG_0_CNT(1 - 1));
4396 }
4397 }
4398
4399 static bool
4400 tu_attachment_store_unaligned(struct tu_cmd_buffer *cmd, uint32_t a)
4401 {
4402 struct tu_physical_device *phys_dev = cmd->device->physical_device;
4403 const struct tu_image_view *iview = cmd->state.attachments[a];
4404 const VkRect2D *render_area = &cmd->state.render_area;
4405
4406 /* Unaligned store is incredibly rare in CTS, so we have to force it to get test coverage. */
4407 if (TU_DEBUG(UNALIGNED_STORE))
4408 return true;
4409
4410 /* We always use the unaligned store path when scaling rendering. */
4411 if (cmd->state.pass->has_fdm)
4412 return true;
4413
4414 uint32_t x1 = render_area->offset.x;
4415 uint32_t y1 = render_area->offset.y;
4416 uint32_t x2 = x1 + render_area->extent.width;
4417 uint32_t y2 = y1 + render_area->extent.height;
4418 /* x2/y2 can be unaligned if equal to the size of the image, since it will
4419 * write into padding space. The one exception is linear levels, which don't
4420 * have the required y padding in the layout (except for the last level).
4421 */
4422 bool need_y2_align =
4423 y2 != iview->view.height || iview->view.need_y2_align;
4424
4425 return (x1 % phys_dev->info->gmem_align_w ||
4426 (x2 % phys_dev->info->gmem_align_w && x2 != iview->view.width) ||
4427 y1 % phys_dev->info->gmem_align_h ||
4428 (y2 % phys_dev->info->gmem_align_h && need_y2_align));
4429 }
4430
4431 /* Choose the GMEM layout (use the CCU space or not) based on what the
4432 * current attachments will need. This has to happen at vkBeginRenderPass()
4433 * time because tu_attachment_store_unaligned() looks at the image views, which
4434 * are only available at that point. This should match the logic for the
4435 * !use_fast_path case in tu_store_gmem_attachment().
4436 */
4437 void
4438 tu_choose_gmem_layout(struct tu_cmd_buffer *cmd)
4439 {
4440 cmd->state.gmem_layout = TU_GMEM_LAYOUT_FULL;
4441
4442 for (unsigned i = 0; i < cmd->state.pass->attachment_count; i++) {
4443 if (!cmd->state.attachments[i])
4444 continue;
4445
4446 struct tu_render_pass_attachment *att =
4447 &cmd->state.pass->attachments[i];
4448 if ((att->store || att->store_stencil) &&
4449 tu_attachment_store_unaligned(cmd, i))
4450 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
4451 if (att->store && att->format == VK_FORMAT_S8_UINT)
4452 /* We cannot pick out S8 from D24S8/D32S8, so we conservatively disable
4453 * blit events for the S8_UINT format.
4454 */
4455 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
4456 if (att->will_be_resolved && !blit_can_resolve(att->format))
4457 cmd->state.gmem_layout = TU_GMEM_LAYOUT_AVOID_CCU;
4458 }
4459
4460 cmd->state.tiling = &cmd->state.framebuffer->tiling[cmd->state.gmem_layout];
4461 }
4462
4463 struct apply_store_coords_state {
4464 unsigned view;
4465 };
4466
4467 static void
4468 fdm_apply_store_coords(struct tu_cmd_buffer *cmd,
4469 struct tu_cs *cs,
4470 void *data,
4471 VkRect2D bin,
4472 unsigned views,
4473 VkExtent2D *frag_areas)
4474 {
4475 const struct apply_store_coords_state *state =
4476 (const struct apply_store_coords_state *)data;
4477 assert(state->view < views);
4478 VkExtent2D frag_area = frag_areas[state->view];
4479
4480 /* The bin width/height must be a multiple of the frag_area to make sure
4481 * that the scaling happens correctly. This means there may be some
4482 * destination pixels that jut out of the framebuffer, but they should be
4483 * clipped by the render area.
4484 */
4485 assert(bin.extent.width % frag_area.width == 0);
4486 assert(bin.extent.height % frag_area.height == 0);
4487 uint32_t scaled_width = bin.extent.width / frag_area.width;
4488 uint32_t scaled_height = bin.extent.height / frag_area.height;
4489
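/* The inverse of the load case: the sysmem destination rect covers the
 * full bin while the GMEM source rect is the scaled-down region.
 */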
4490 tu_cs_emit_regs(cs,
4491 A6XX_GRAS_2D_DST_TL(.x = bin.offset.x,
4492 .y = bin.offset.y),
4493 A6XX_GRAS_2D_DST_BR(.x = bin.offset.x + bin.extent.width - 1,
4494 .y = bin.offset.y + bin.extent.height - 1));
4495 tu_cs_emit_regs(cs,
4496 A6XX_GRAS_2D_SRC_TL_X(bin.offset.x),
4497 A6XX_GRAS_2D_SRC_BR_X(bin.offset.x + scaled_width - 1),
4498 A6XX_GRAS_2D_SRC_TL_Y(bin.offset.y),
4499 A6XX_GRAS_2D_SRC_BR_Y(bin.offset.y + scaled_height - 1));
4500 }
4501
4502 template <chip CHIP>
4503 void
4504 tu_store_gmem_attachment(struct tu_cmd_buffer *cmd,
4505 struct tu_cs *cs,
4506 uint32_t a,
4507 uint32_t gmem_a,
4508 uint32_t layers,
4509 uint32_t layer_mask,
4510 bool cond_exec_allowed)
4511 {
   const VkRect2D *render_area = &cmd->state.render_area;
   struct tu_render_pass_attachment *dst = &cmd->state.pass->attachments[a];
   const struct tu_image_view *iview = cmd->state.attachments[a];
   struct tu_render_pass_attachment *src = &cmd->state.pass->attachments[gmem_a];
   const VkClearValue *clear_value = &cmd->state.clear_values[gmem_a];
   bool resolve = a != gmem_a;
   if (resolve)
      clear_value = NULL;

   if (!dst->store && !dst->store_stencil)
      return;

   bool unaligned = tu_attachment_store_unaligned(cmd, a);

   /* D32_SFLOAT_S8_UINT is quite a special format: it has two planes, one
    * for depth and the other for stencil. When resolving a MSAA
    * D32_SFLOAT_S8_UINT attachment to S8_UINT, we need to take that into
    * account.
    */
   bool resolve_d32s8_s8 =
      src->format == VK_FORMAT_D32_SFLOAT_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   /* The fast path doesn't support picking out the last component of a D24S8
    * texture reinterpreted as RGBA8_UNORM.
    */
   bool resolve_d24s8_s8 =
      src->format == VK_FORMAT_D24_UNORM_S8_UINT &&
      dst->format == VK_FORMAT_S8_UINT;

   bool store_common = dst->store && !resolve_d32s8_s8;
   bool store_separate_stencil = dst->store_stencil || resolve_d32s8_s8;

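   /* The fast path stores with a blit event; everything else goes through
    * the slower 2D (or, for MSAA destinations, 3D) blit path below, which
    * requires the TU_GMEM_LAYOUT_AVOID_CCU layout.
    */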
   bool use_fast_path = !unaligned && !resolve_d24s8_s8 &&
      (a == gmem_a || blit_can_resolve(dst->format));

   trace_start_gmem_store(&cmd->trace, cs, dst->format, use_fast_path, unaligned);

   /* An unconditional store should happen only if the attachment was
    * cleared, which could have happened either by load_op or via
    * vkCmdClearAttachments.
    */
   bool cond_exec = cond_exec_allowed && src->cond_store_allowed;
   if (cond_exec) {
      tu_begin_load_store_cond_exec(cmd, cs, false);
   }

   /* use fast path when render area is aligned, except for unsupported resolve cases */
   if (use_fast_path) {
      if (store_common)
         tu_emit_blit<CHIP>(cmd, cs, iview, src, clear_value, BLIT_EVENT_STORE, false);
      if (store_separate_stencil)
         tu_emit_blit<CHIP>(cmd, cs, iview, src, clear_value, BLIT_EVENT_STORE, true);

      if (cond_exec) {
         tu_end_load_store_cond_exec(cmd, cs, false);
      }

      trace_end_gmem_store(&cmd->trace, cs);
      return;
   }

   assert(cmd->state.gmem_layout == TU_GMEM_LAYOUT_AVOID_CCU);

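   /* Z32_S8 has separate depth and stencil planes in GMEM: store the depth
    * plane as plain Z32 here and let the separate-stencil path handle S8.
    */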
   enum pipe_format src_format = vk_format_to_pipe_format(src->format);
   if (src_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      src_format = PIPE_FORMAT_Z32_FLOAT;

   enum pipe_format dst_format = vk_format_to_pipe_format(dst->format);
   if (dst_format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT)
      dst_format = PIPE_FORMAT_Z32_FLOAT;

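   /* Multisampled attachments take the per-layer 3D blit path
    * (store_3d_blit), presumably because the 2D engine can't write
    * per-sample data; single-sampled ones use the 2D path (store_cp_blit).
    */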
   if (dst->samples > 1) {
      /* If we hit this path, we have to disable draw states after every tile
       * instead of once at the end of the renderpass, so that they aren't
       * executed when calling CP_DRAW.
       *
       * TODO: store a flag somewhere so we don't do this more than once and
       * don't do it after the renderpass when this happens.
       */
      if (store_common || store_separate_stencil)
         tu_disable_draw_states(cmd, cs);

      for_each_layer(i, layer_mask, layers) {
         if (store_common) {
            store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, false, src_format,
                                dst_format, render_area, i,
                                tu_attachment_gmem_offset(cmd, src, i), src->cpp);
         }
         if (store_separate_stencil) {
            store_3d_blit<CHIP>(cmd, cs, iview, dst->samples, true, PIPE_FORMAT_S8_UINT,
                                PIPE_FORMAT_S8_UINT, render_area, i,
                                tu_attachment_gmem_offset_stencil(cmd, src, i), src->samples);
         }
      }
   } else {
      if (!cmd->state.pass->has_fdm) {
         r2d_coords(cmd, cs, render_area->offset, render_area->offset,
                    render_area->extent);
      } else {
         /* Usually GRAS_2D_RESOLVE_CNTL_* clips the destination to the bin
          * area and the coordinates span the entire render area, but for FDM
          * we need to scale the coordinates, so we take the opposite
          * approach: we specify the exact bin size in the destination
          * coordinates and use GRAS_2D_RESOLVE_CNTL_* to clip to the render
          * area.
          */
         tu_cs_emit_regs(cs,
            A6XX_GRAS_2D_RESOLVE_CNTL_1(.x = render_area->offset.x,
                                        .y = render_area->offset.y,),
            A6XX_GRAS_2D_RESOLVE_CNTL_2(.x = render_area->offset.x + render_area->extent.width - 1,
                                        .y = render_area->offset.y + render_area->extent.height - 1,));
      }

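      /* One 2D blit per layer. With FDM, the source/destination coordinates
       * depend on the bin, so a patchpoint is emitted and the coordinates
       * are filled in later by fdm_apply_store_coords().
       */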
      for_each_layer (i, layer_mask, layers) {
         if (cmd->state.pass->has_fdm) {
            unsigned view = layer_mask ? i : 0;
            struct apply_store_coords_state state = {
               .view = view,
            };
            tu_create_fdm_bin_patchpoint(cmd, cs, 8, fdm_apply_store_coords,
                                         state);
         }
         if (store_common) {
            store_cp_blit<CHIP>(cmd, cs, iview, src->samples, false, src_format,
                                dst_format, i, tu_attachment_gmem_offset(cmd, src, i),
                                src->cpp);
         }
         if (store_separate_stencil) {
            store_cp_blit<CHIP>(cmd, cs, iview, src->samples, true, PIPE_FORMAT_S8_UINT,
                                PIPE_FORMAT_S8_UINT, i,
                                tu_attachment_gmem_offset_stencil(cmd, src, i),
                                src->samples);
         }
      }
   }

   if (cond_exec) {
      tu_end_load_store_cond_exec(cmd, cs, false);
   }

   trace_end_gmem_store(&cmd->trace, cs);
}
TU_GENX(tu_store_gmem_attachment);
