1 /*
2 * Copyright 2017 Advanced Micro Devices, Inc.
3 *
4 * SPDX-License-Identifier: MIT
5 */
6
7 #include "si_pipe.h"
8 #include "sid.h"
9 #include "util/format/u_format.h"
10 #include "util/u_pack_color.h"
11 #include "util/u_surface.h"
12 #include "ac_formats.h"
13
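/* Blitter state-save flag sets passed to si_blitter_begin() by the clear paths below. */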
14 enum {
15 SI_CLEAR = SI_SAVE_FRAGMENT_STATE | SI_SAVE_FRAGMENT_CONSTANT,
16 SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
17 };
18
19 void si_init_buffer_clear(struct si_clear_info *info,
20 struct pipe_resource *resource, uint64_t offset,
21 uint32_t size, uint32_t clear_value)
22 {
23 info->resource = resource;
24 info->offset = offset;
25 info->size = size;
26 info->clear_value = clear_value;
27 info->writemask = 0xffffffff;
28 info->is_dcc_msaa = false;
29 info->format = PIPE_FORMAT_NONE;
30 }
31
32 static void si_init_buffer_clear_rmw(struct si_clear_info *info,
33 struct pipe_resource *resource, uint64_t offset,
34 uint32_t size, uint32_t clear_value, uint32_t writemask)
35 {
36 si_init_buffer_clear(info, resource, offset, size, clear_value);
37 info->writemask = writemask;
38 info->format = PIPE_FORMAT_NONE;
39 }
40
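/* Set up a clear entry that writes the clear color directly into a DCC-compressed image
 * (the clear-to-single path), executed later via si_compute_clear_image_dcc_single().
 */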
41 static void si_init_clear_image_dcc_single(struct si_clear_info *info, struct si_texture *tex,
42 unsigned level, enum pipe_format format,
43 const union pipe_color_union *color)
44 {
45 info->resource = &tex->buffer.b.b;
46 info->level = level;
47 info->format = format;
48 memcpy(&info->color, color, sizeof(info->color));
49 }
50
51 void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
52 unsigned num_clears, unsigned types, bool render_condition_enable)
53 {
54 if (!num_clears)
55 return;
56
57 /* Flush caches and wait for idle. */
58 if (types & (SI_CLEAR_TYPE_CMASK | SI_CLEAR_TYPE_DCC)) {
59 si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
60 sctx->framebuffer.CB_has_shader_readable_metadata,
61 sctx->framebuffer.all_DCC_pipe_aligned);
62 }
63
64 if (types & SI_CLEAR_TYPE_HTILE) {
65 si_make_DB_shader_coherent(sctx, sctx->framebuffer.nr_samples, sctx->framebuffer.has_stencil,
66 sctx->framebuffer.DB_has_shader_readable_metadata);
67 }
68
69 /* Invalidate the VMEM cache because we always use compute. */
70 sctx->barrier_flags |= SI_BARRIER_INV_VMEM;
71
72 /* GFX6-8: CB and DB don't use L2. */
73 if (sctx->gfx_level <= GFX8)
74 sctx->barrier_flags |= SI_BARRIER_INV_L2;
75
76 si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
77
78 /* Execute clears. */
79 for (unsigned i = 0; i < num_clears; i++) {
80 if (info[i].format) {
81 si_compute_clear_image_dcc_single(sctx, (struct si_texture*)info[i].resource,
82 info[i].level, info[i].format, &info[i].color,
83 render_condition_enable);
84 continue;
85 }
86
87 if (info[i].is_dcc_msaa) {
88 gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value, render_condition_enable);
89 continue;
90 }
91
92 assert(info[i].size > 0);
93
94 if (info[i].writemask != 0xffffffff) {
95 si_compute_clear_buffer_rmw(sctx, info[i].resource, info[i].offset, info[i].size,
96 info[i].clear_value, info[i].writemask,
97 render_condition_enable);
98 } else {
99 /* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. */
100 si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size,
101 &info[i].clear_value, 4, SI_COMPUTE_CLEAR_METHOD,
102 render_condition_enable);
103 }
104 }
105
106 /* Wait for idle. */
107 sctx->barrier_flags |= SI_BARRIER_SYNC_CS;
108
109 /* GFX6-8: CB and DB don't use L2. */
110 if (sctx->gfx_level <= GFX8)
111 sctx->barrier_flags |= SI_BARRIER_WB_L2;
112
113 si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
114 }
115
116 static bool si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex)
117 {
118 assert(sscreen->info.gfx_level < GFX11);
119
120 /* CMASK for MSAA is allocated in advance or is always disabled
121 * by the "nofmask" debug option.
122 */
123 if (tex->cmask_buffer)
124 return true;
125
126 if (!tex->surface.cmask_size)
127 return false;
128
129 tex->cmask_buffer =
130 si_aligned_buffer_create(&sscreen->b, PIPE_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
131 tex->surface.cmask_size, 1 << tex->surface.cmask_alignment_log2);
132 if (tex->cmask_buffer == NULL)
133 return false;
134
135 /* These 2 fields are part of the framebuffer state but dirtying the atom
136 * will be done by the caller.
137 */
138 tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8;
139 tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
140
141 p_atomic_inc(&sscreen->compressed_colortex_counter);
142 return true;
143 }
144
145 static bool si_set_clear_color(struct si_texture *tex, enum pipe_format surface_format,
146 const union pipe_color_union *color)
147 {
148 union util_color uc;
149
150 memset(&uc, 0, sizeof(uc));
151
152 if (tex->surface.bpe == 16) {
153 /* DCC fast clear only:
154 * CLEAR_WORD0 = R = G = B
155 * CLEAR_WORD1 = A
156 */
157 assert(color->ui[0] == color->ui[1] && color->ui[0] == color->ui[2]);
158 uc.ui[0] = color->ui[0];
159 uc.ui[1] = color->ui[3];
160 } else {
161 if (tex->swap_rgb_to_bgr)
162 surface_format = util_format_rgb_to_bgr(surface_format);
163
164 util_pack_color_union(surface_format, &uc, color);
165 }
166
167 if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0)
168 return false;
169
170 memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t));
171 return true;
172 }
173
174 static bool gfx8_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_format base_format,
175 enum pipe_format surface_format,
176 const union pipe_color_union *color, uint32_t *clear_value,
177 bool *eliminate_needed)
178 {
179 /* If we want to clear without needing a fast clear eliminate step, we
180 * can set color and alpha independently to 0 or 1 (or 0/max for integer
181 * formats).
182 */
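/* For example: clearing to (0,0,0,0) or (1,1,1,1) (or 0/max for integer formats) maps to the
 * GFX8_DCC_CLEAR_0000/1111 codes chosen below without an eliminate pass, while an arbitrary
 * color such as (0.5, 0.5, 0.5, 1.0) falls back to GFX8_DCC_CLEAR_REG + ELIMINATE_FAST_CLEAR.
 */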
183 bool values[4] = {}; /* whether to clear to 0 or 1 */
184 bool color_value = false; /* clear color to 0 or 1 */
185 bool alpha_value = false; /* clear alpha to 0 or 1 */
186 int alpha_channel; /* index of the alpha component */
187 bool has_color = false;
188 bool has_alpha = false;
189
190 const struct util_format_description *desc =
191 util_format_description(ac_simplify_cb_format(surface_format));
192
193 /* 128-bit fast clear with different R,G,B values is unsupported. */
194 if (desc->block.bits == 128 && (color->ui[0] != color->ui[1] || color->ui[0] != color->ui[2]))
195 return false;
196
197 *eliminate_needed = true;
198 *clear_value = GFX8_DCC_CLEAR_REG;
199
200 if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
201 return true; /* need ELIMINATE_FAST_CLEAR */
202
203 bool base_alpha_is_on_msb = ac_alpha_is_on_msb(&sscreen->info, base_format);
204 bool surf_alpha_is_on_msb = ac_alpha_is_on_msb(&sscreen->info, surface_format);
205
206 /* Formats with 3 channels can't have alpha. */
207 if (desc->nr_channels == 3)
208 alpha_channel = -1;
209 else if (surf_alpha_is_on_msb)
210 alpha_channel = desc->nr_channels - 1;
211 else
212 alpha_channel = 0;
213
214 for (int i = 0; i < 4; ++i) {
215 if (desc->swizzle[i] >= PIPE_SWIZZLE_0)
216 continue;
217
218 if (desc->channel[i].pure_integer && desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
219 /* Use the maximum value for clamping the clear color. */
220 int max = u_bit_consecutive(0, desc->channel[i].size - 1);
221
222 values[i] = color->i[i] != 0;
223 if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
224 return true; /* need ELIMINATE_FAST_CLEAR */
225 } else if (desc->channel[i].pure_integer &&
226 desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
227 /* Use the maximum value for clamping the clear color. */
228 unsigned max = u_bit_consecutive(0, desc->channel[i].size);
229
230 values[i] = color->ui[i] != 0U;
231 if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
232 return true; /* need ELIMINATE_FAST_CLEAR */
233 } else {
234 values[i] = color->f[i] != 0.0F;
235 if (color->f[i] != 0.0F && color->f[i] != 1.0F)
236 return true; /* need ELIMINATE_FAST_CLEAR */
237 }
238
239 if (desc->swizzle[i] == alpha_channel) {
240 alpha_value = values[i];
241 has_alpha = true;
242 } else {
243 color_value = values[i];
244 has_color = true;
245 }
246 }
247
248 /* If alpha isn't present, make it the same as color, and vice versa. */
249 if (!has_alpha)
250 alpha_value = color_value;
251 else if (!has_color)
252 color_value = alpha_value;
253
254 if (color_value != alpha_value && base_alpha_is_on_msb != surf_alpha_is_on_msb)
255 return true; /* require ELIMINATE_FAST_CLEAR */
256
257 /* Check if all color values are equal if they are present. */
258 for (int i = 0; i < 4; ++i) {
259 if (desc->swizzle[i] <= PIPE_SWIZZLE_W && desc->swizzle[i] != alpha_channel &&
260 values[i] != color_value)
261 return true; /* require ELIMINATE_FAST_CLEAR */
262 }
263
264 /* This doesn't need ELIMINATE_FAST_CLEAR.
265 * On chips predating Raven2, the DCC clear codes and the CB clear
266 * color registers must match.
267 */
268 *eliminate_needed = false;
269
270 if (color_value) {
271 if (alpha_value)
272 *clear_value = GFX8_DCC_CLEAR_1111;
273 else
274 *clear_value = GFX8_DCC_CLEAR_1110;
275 } else {
276 if (alpha_value)
277 *clear_value = GFX8_DCC_CLEAR_0001;
278 else
279 *clear_value = GFX8_DCC_CLEAR_0000;
280 }
281 return true;
282 }
283
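/* Determine the GFX11 DCC clear code for the given clear color. This returns false only when
 * the color would need the more expensive clear-to-single path and fail_if_slow says to skip it.
 */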
284 static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, struct si_texture *tex,
285 unsigned level, enum pipe_format surface_format,
286 const union pipe_color_union *color, uint32_t *clear_value,
287 bool fail_if_slow)
288 {
289 const struct util_format_description *desc =
290 util_format_description(ac_simplify_cb_format(surface_format));
291 unsigned start_bit = UINT_MAX;
292 unsigned end_bit = 0;
293
294 /* Find the used bit range. */
295 for (unsigned i = 0; i < 4; i++) {
296 unsigned swizzle = desc->swizzle[i];
297
298 if (swizzle >= PIPE_SWIZZLE_0)
299 continue;
300
301 start_bit = MIN2(start_bit, desc->channel[swizzle].shift);
302 end_bit = MAX2(end_bit, desc->channel[swizzle].shift + desc->channel[swizzle].size);
303 }
304
305 union {
306 uint8_t ub[16];
307 uint16_t us[8];
308 uint32_t ui[4];
309 } value = {};
310 util_pack_color_union(surface_format, (union util_color*)&value, color);
311
312 /* Check the cases where all components or bits are either all 0 or all 1. */
313 bool all_bits_are_0 = true;
314 bool all_bits_are_1 = true;
315 bool all_words_are_fp16_1 = false;
316 bool all_words_are_fp32_1 = false;
317
318 for (unsigned i = start_bit; i < end_bit; i++) {
319 bool bit = value.ub[i / 8] & BITFIELD_BIT(i % 8);
320
321 all_bits_are_0 &= !bit;
322 all_bits_are_1 &= bit;
323 }
324
325 if (start_bit % 16 == 0 && end_bit % 16 == 0) {
326 all_words_are_fp16_1 = true;
327 for (unsigned i = start_bit / 16; i < end_bit / 16; i++)
328 all_words_are_fp16_1 &= value.us[i] == 0x3c00; /* 1.0 in fp16 */
329 }
330
331 if (start_bit % 32 == 0 && end_bit % 32 == 0) {
332 all_words_are_fp32_1 = true;
333 for (unsigned i = start_bit / 32; i < end_bit / 32; i++)
334 all_words_are_fp32_1 &= value.ui[i] == 0x3f800000; /* 1.0 in fp32 */
335 }
336
337 #if 0 /* debug code */
338 int i = util_format_get_first_non_void_channel(surface_format);
339 if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED && desc->channel[i].pure_integer) {
340 printf("%i %i %i %i\n", color->i[0], color->i[1], color->i[2], color->i[3]);
341 } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED && desc->channel[i].pure_integer) {
342 printf("%u %u %u %u\n", color->ui[0], color->ui[1], color->ui[2], color->ui[3]);
343 } else {
344 printf("%f %f %f %f\n", color->f[0], color->f[1], color->f[2], color->f[3]);
345 }
346 for (unsigned i = 0; i < end_bit / 8; i++)
347 printf("%02x", value.ub[i]);
348 printf("\n");
349 printf("bits=[%u..%u)%s%s%s%s\n", start_bit, end_bit,
350 all_bits_are_0 ? ", all 0" : "",
351 all_bits_are_1 ? ", all 1" : "",
352 all_words_are_fp16_1 ? ", all fp16 1" : "",
353 all_words_are_fp32_1 ? ", all fp32 1" : "");
354 #endif
355
356 *clear_value = 0;
357
358 if (all_bits_are_0 || all_bits_are_1 || all_words_are_fp16_1 || all_words_are_fp32_1) {
359 if (all_bits_are_0)
360 *clear_value = GFX11_DCC_CLEAR_0000;
361 else if (all_bits_are_1)
362 *clear_value = GFX11_DCC_CLEAR_1111_UNORM;
363 else if (all_words_are_fp16_1)
364 *clear_value = GFX11_DCC_CLEAR_1111_FP16;
365 else if (all_words_are_fp32_1)
366 *clear_value = GFX11_DCC_CLEAR_1111_FP32;
367
368 return true;
369 }
370
371 /* Check 0001 and 1110 cases. */
372 if (desc->nr_channels == 2 && desc->channel[0].size == 8) {
373 if (value.ub[0] == 0x00 && value.ub[1] == 0xff) {
374 *clear_value = GFX11_DCC_CLEAR_0001_UNORM;
375 return true;
376 } else if (value.ub[0] == 0xff && value.ub[1] == 0x00) {
377 *clear_value = GFX11_DCC_CLEAR_1110_UNORM;
378 return true;
379 }
380 } else if (desc->nr_channels == 4 && desc->channel[0].size == 8) {
381 if (value.ub[0] == 0x00 && value.ub[1] == 0x00 &&
382 value.ub[2] == 0x00 && value.ub[3] == 0xff) {
383 *clear_value = GFX11_DCC_CLEAR_0001_UNORM;
384 return true;
385 } else if (value.ub[0] == 0xff && value.ub[1] == 0xff &&
386 value.ub[2] == 0xff && value.ub[3] == 0x00) {
387 *clear_value = GFX11_DCC_CLEAR_1110_UNORM;
388 return true;
389 }
390 } else if (desc->nr_channels == 4 && desc->channel[0].size == 16) {
391 if (value.us[0] == 0x0000 && value.us[1] == 0x0000 &&
392 value.us[2] == 0x0000 && value.us[3] == 0xffff) {
393 *clear_value = GFX11_DCC_CLEAR_0001_UNORM;
394 return true;
395 } else if (value.us[0] == 0xffff && value.us[1] == 0xffff &&
396 value.us[2] == 0xffff && value.us[3] == 0x0000) {
397 *clear_value = GFX11_DCC_CLEAR_1110_UNORM;
398 return true;
399 }
400 }
401
402 /* Estimate whether DCC clear-to-single is better than a slow clear. */
403 unsigned width = u_minify(tex->buffer.b.b.width0, level);
404 unsigned height = u_minify(tex->buffer.b.b.height0, level);
405 unsigned depth = util_num_layers(&tex->buffer.b.b, level);
406 unsigned num_samples = MAX2(tex->buffer.b.b.nr_samples, 1);
407 uint64_t size = (uint64_t)width * height * depth * num_samples * tex->surface.bpe;
408
409 /* These cases perform exceptionally well with DCC clear-to-single, so make them more likely. */
410 if ((num_samples <= 2 && tex->surface.bpe <= 2) ||
411 (num_samples == 1 && tex->surface.bpe == 4))
412 size *= 2;
413
414 /* These cases perform terribly with DCC clear-to-single. */
415 if (tex->buffer.b.b.nr_samples >= 4 && tex->surface.bpe >= 4)
416 size = 0;
417
418 /* This is mostly optimal for Navi31. The scaling effect of num_rb on other chips is guessed. */
419 if (!fail_if_slow || size >= sscreen->info.num_rb * 512 * 1024) {
420 *clear_value = GFX11_DCC_CLEAR_SINGLE;
421 return true;
422 }
423
424 return false;
425 }
426
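/* Compute which DCC metadata range to clear (or flag the GFX9 MSAA compute path) for the given
 * level. Returns false if this case can't be fast-cleared.
 */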
427 bool vi_dcc_get_clear_info(struct si_context *sctx, struct si_texture *tex, unsigned level,
428 unsigned clear_value, struct si_clear_info *out)
429 {
430 struct pipe_resource *dcc_buffer = &tex->buffer.b.b;
431 uint64_t dcc_offset = tex->surface.meta_offset;
432 uint32_t clear_size;
433
434 assert(vi_dcc_enabled(tex, level));
435
436 if (sctx->gfx_level >= GFX10) {
437 /* 4x and 8x MSAA need a sophisticated compute shader for
438 * the clear. GFX11 doesn't need that.
439 */
440 if (sctx->gfx_level < GFX11 && tex->buffer.b.b.nr_storage_samples >= 4)
441 return false;
442
443 unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
444
445 if (num_layers == 1) {
446 /* Clear a specific level. */
447 dcc_offset += tex->surface.u.gfx9.meta_levels[level].offset;
448 clear_size = tex->surface.u.gfx9.meta_levels[level].size;
449 } else if (tex->buffer.b.b.last_level == 0) {
450 /* Clear all layers having only 1 level. */
451 clear_size = tex->surface.meta_size;
452 } else {
453 /* Clearing DCC with both multiple levels and multiple layers is not
454 * implemented.
455 */
456 return false;
457 }
458 } else if (sctx->gfx_level == GFX9) {
459 /* TODO: Implement DCC fast clear for level 0 of mipmapped textures. Mipmapped
460 * DCC has to clear a rectangular area of DCC for level 0 (because the whole miptree
461 * is organized in a 2D plane).
462 */
463 if (tex->buffer.b.b.last_level > 0)
464 return false;
465
466 /* 4x and 8x MSAA need to clear only samples 0 and 1 in a compute shader and leave the other
467 * samples untouched (only the first 2 samples are compressed). */
468 if (tex->buffer.b.b.nr_storage_samples >= 4) {
469 si_init_buffer_clear(out, dcc_buffer, 0, 0, clear_value);
470 out->is_dcc_msaa = true;
471 return true;
472 }
473
474 clear_size = tex->surface.meta_size;
475 } else {
476 unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
477
478 /* If this is 0, fast clear isn't possible. (can occur with MSAA) */
479 if (!tex->surface.u.legacy.color.dcc_level[level].dcc_fast_clear_size)
480 return false;
481
482 /* Layered 4x and 8x MSAA DCC fast clears need to clear
483 * dcc_fast_clear_size bytes for each layer. A compute shader
484 * would be more efficient than separate per-layer clear operations.
485 */
486 if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1)
487 return false;
488
489 dcc_offset += tex->surface.u.legacy.color.dcc_level[level].dcc_offset;
490 clear_size = tex->surface.u.legacy.color.dcc_level[level].dcc_fast_clear_size;
491 }
492
493 si_init_buffer_clear(out, dcc_buffer, dcc_offset, clear_size, clear_value);
494 return true;
495 }
496
497 /* Set the same micro tile mode as the destination of the last MSAA resolve.
498 * This allows hitting the MSAA resolve fast path, which requires that both
499 * src and dst micro tile modes match.
500 */
501 static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen, struct si_texture *tex)
502 {
503 if (sscreen->info.gfx_level >= GFX10 || tex->buffer.b.is_shared ||
504 tex->buffer.b.b.nr_samples <= 1 ||
505 tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode)
506 return;
507
508 assert(sscreen->info.gfx_level >= GFX9 ||
509 tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
510 assert(tex->buffer.b.b.last_level == 0);
511
512 if (sscreen->info.gfx_level >= GFX9) {
513 /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
514 assert(tex->surface.u.gfx9.swizzle_mode >= 4);
515
516 /* If you do swizzle_mode % 4, you'll get:
517 * 0 = Depth
518 * 1 = Standard,
519 * 2 = Displayable
520 * 3 = Rotated
521 *
522 * Depth-sample order isn't allowed:
523 */
524 assert(tex->surface.u.gfx9.swizzle_mode % 4 != 0);
525
526 switch (tex->last_msaa_resolve_target_micro_mode) {
527 case RADEON_MICRO_MODE_DISPLAY:
528 tex->surface.u.gfx9.swizzle_mode &= ~0x3;
529 tex->surface.u.gfx9.swizzle_mode += 2; /* D */
530 break;
531 case RADEON_MICRO_MODE_STANDARD:
532 tex->surface.u.gfx9.swizzle_mode &= ~0x3;
533 tex->surface.u.gfx9.swizzle_mode += 1; /* S */
534 break;
535 case RADEON_MICRO_MODE_RENDER:
536 tex->surface.u.gfx9.swizzle_mode &= ~0x3;
537 tex->surface.u.gfx9.swizzle_mode += 3; /* R */
538 break;
539 default: /* depth */
540 assert(!"unexpected micro mode");
541 return;
542 }
543 } else if (sscreen->info.gfx_level >= GFX7) {
544 /* These magic numbers were copied from addrlib. It doesn't use
545 * any definitions for them either. They are all 2D_TILED_THIN1
546 * modes with different bpp and micro tile mode.
547 */
548 switch (tex->last_msaa_resolve_target_micro_mode) {
549 case RADEON_MICRO_MODE_DISPLAY:
550 tex->surface.u.legacy.tiling_index[0] = 10;
551 break;
552 case RADEON_MICRO_MODE_STANDARD:
553 tex->surface.u.legacy.tiling_index[0] = 14;
554 break;
555 case RADEON_MICRO_MODE_RENDER:
556 tex->surface.u.legacy.tiling_index[0] = 28;
557 break;
558 default: /* depth, thick */
559 assert(!"unexpected micro mode");
560 return;
561 }
562 } else { /* GFX6 */
563 switch (tex->last_msaa_resolve_target_micro_mode) {
564 case RADEON_MICRO_MODE_DISPLAY:
565 switch (tex->surface.bpe) {
566 case 1:
567 tex->surface.u.legacy.tiling_index[0] = 10;
568 break;
569 case 2:
570 tex->surface.u.legacy.tiling_index[0] = 11;
571 break;
572 default: /* 4, 8 */
573 tex->surface.u.legacy.tiling_index[0] = 12;
574 break;
575 }
576 break;
577 case RADEON_MICRO_MODE_STANDARD:
578 switch (tex->surface.bpe) {
579 case 1:
580 tex->surface.u.legacy.tiling_index[0] = 14;
581 break;
582 case 2:
583 tex->surface.u.legacy.tiling_index[0] = 15;
584 break;
585 case 4:
586 tex->surface.u.legacy.tiling_index[0] = 16;
587 break;
588 default: /* 8, 16 */
589 tex->surface.u.legacy.tiling_index[0] = 17;
590 break;
591 }
592 break;
593 default: /* depth, thick */
594 assert(!"unexpected micro mode");
595 return;
596 }
597 }
598
599 tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode;
600
601 p_atomic_inc(&sscreen->dirty_tex_counter);
602 }
603
604 static uint32_t si_get_htile_clear_value(struct si_texture *tex, float depth)
605 {
606 /* Maximum 14-bit UINT value. */
607 const uint32_t max_z_value = 0x3FFF;
608
609 /* For clears, Zmask and Smem will always be set to zero. */
610 const uint32_t zmask = 0;
611 const uint32_t smem = 0;
612
613 /* Convert depthValue to 14-bit zmin/zmax uint values. */
614 const uint32_t zmin = lroundf(depth * max_z_value);
615 const uint32_t zmax = zmin;
616
617 if (tex->htile_stencil_disabled) {
618 /* Z-only HTILE is laid out as follows:
619 * |31     18|17      4|3     0|
620 * +---------+---------+-------+
621 * |  Max Z  |  Min Z  | ZMask |
622 */
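/* Example: depth == 1.0 gives zmin == zmax == 0x3FFF, so the packed Z-only value is 0xFFFF3FF0. */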
623 return ((zmax & 0x3FFF) << 18) |
624 ((zmin & 0x3FFF) << 4) |
625 ((zmask & 0xF) << 0);
626 } else {
627 /* Z+S HTILE is laid out as follows:
628 * |31       12|11 10|9    8|7   6|5   4|3     0|
629 * +-----------+-----+------+-----+-----+-------+
630 * |  Z Range  |     | SMem | SR1 | SR0 | ZMask |
631 *
632 * The base value for zRange is either zMax or zMin, depending on ZRANGE_PRECISION.
633 * For a fast clear, zMin == zMax == clearValue. This means that the base will
634 * always be the clear value (converted to 14-bit UINT).
635 *
636 * When abs(zMax-zMin) < 16, the delta is equal to the difference. In the case of
637 * fast clears, where zMax == zMin, the delta is always zero.
638 */
639 const uint32_t delta = 0;
640 const uint32_t zrange = (zmax << 6) | delta;
641
642 /* SResults 0 & 1 are set based on the stencil compare state.
643 * For fast-clear, the default values of sr0 and sr1 are both 0x3.
644 */
645 const uint32_t sresults = 0xf;
646
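/* Example: depth == 1.0 packs to 0xFFFC00F0 (zrange = 0x3FFF << 6, smem = 0, sresults = 0xf). */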
647 return ((zrange & 0xFFFFF) << 12) |
648 ((smem & 0x3) << 8) |
649 ((sresults & 0xF) << 4) |
650 ((zmask & 0xF) << 0);
651 }
652 }
653
654 static bool si_can_fast_clear_depth(struct si_texture *zstex, unsigned level, float depth,
655 unsigned buffers)
656 {
657 /* TC-compatible HTILE only supports depth clears to 0 or 1. */
658 return buffers & PIPE_CLEAR_DEPTH &&
659 si_htile_enabled(zstex, level, PIPE_MASK_Z) &&
660 (!zstex->tc_compatible_htile || depth == 0 || depth == 1);
661 }
662
663 static bool si_can_fast_clear_stencil(struct si_texture *zstex, unsigned level, uint8_t stencil,
664 unsigned buffers)
665 {
666 /* TC-compatible HTILE only supports stencil clears to 0. */
667 return buffers & PIPE_CLEAR_STENCIL &&
668 si_htile_enabled(zstex, level, PIPE_MASK_S) &&
669 (!zstex->tc_compatible_htile || stencil == 0);
670 }
671
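/* Try to fast-clear the bound color and depth/stencil buffers by clearing their DCC/CMASK/HTILE
 * metadata directly. Buffers that were fully handled are removed from *buffers.
 */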
672 static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
673 const union pipe_color_union *color, float depth, uint8_t stencil)
674 {
675 struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
676 struct si_clear_info info[8 * 3 + 1]; /* MRTs * (CMASK + DCC + clear_dcc_single) + ZS */
677 unsigned num_clears = 0;
678 unsigned clear_types = 0;
679 unsigned num_pixels = fb->width * fb->height;
680
681 assert(sctx->gfx_level < GFX12);
682
683 /* This function is broken on big-endian hosts, so just disable this path for now. */
684 #if UTIL_ARCH_BIG_ENDIAN
685 return;
686 #endif
687
688 /* Gather information about what to clear. */
689 unsigned color_buffer_mask = (*buffers & PIPE_CLEAR_COLOR) >> util_logbase2(PIPE_CLEAR_COLOR0);
690 while (color_buffer_mask) {
691 unsigned i = u_bit_scan(&color_buffer_mask);
692
693 struct si_texture *tex = (struct si_texture *)fb->cbufs[i]->texture;
694 unsigned level = fb->cbufs[i]->u.tex.level;
695 unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
696
697 /* the clear is allowed if all layers are bound */
698 if (fb->cbufs[i]->u.tex.first_layer != 0 ||
699 fb->cbufs[i]->u.tex.last_layer != num_layers - 1) {
700 continue;
701 }
702
703 /* We can change the micro tile mode before a full clear. */
704 /* This is only used for MSAA textures when clearing all layers. */
705 si_set_optimal_micro_tile_mode(sctx->screen, tex);
706
707 if (tex->swap_rgb_to_bgr_on_next_clear) {
708 assert(!tex->swap_rgb_to_bgr);
709 assert(tex->buffer.b.b.nr_samples >= 2);
710 tex->swap_rgb_to_bgr = true;
711 tex->swap_rgb_to_bgr_on_next_clear = false;
712
713 /* Update all sampler views and images. */
714 p_atomic_inc(&sctx->screen->dirty_tex_counter);
715 }
716
717 /* only supported on tiled surfaces */
718 if (tex->surface.is_linear) {
719 continue;
720 }
721
722 /* Use a slow clear for small surfaces where the cost of
723 * the eliminate pass can be higher than the benefit of fast
724 * clear. The closed driver does this, but the numbers may differ.
725 *
726 * This helps on both dGPUs and APUs, even small APUs like Mullins.
727 */
728 bool fb_too_small = (uint64_t)num_pixels * num_layers <= 512 * 512;
729 bool too_small = tex->buffer.b.b.nr_samples <= 1 && fb_too_small;
730 bool eliminate_needed = false;
731 bool fmask_decompress_needed = false;
732 bool need_dirtying_fb = false;
733
734 /* Try to clear DCC first, otherwise try CMASK. */
735 if (vi_dcc_enabled(tex, level)) {
736 uint32_t reset_value;
737
738 if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR))
739 continue;
740
741 if (sctx->gfx_level >= GFX11) {
742 if (!gfx11_get_dcc_clear_parameters(sctx->screen, tex, level, fb->cbufs[i]->format,
743 color, &reset_value, true))
744 continue;
745 } else {
746 if (!gfx8_get_dcc_clear_parameters(sctx->screen, tex->buffer.b.b.format,
747 fb->cbufs[i]->format, color, &reset_value,
748 &eliminate_needed))
749 continue;
750 }
751
752 /* Shared textures can't use fast clear without an explicit flush
753 * because the clear color is not exported.
754 *
755 * Chips without DCC constant encoding must set the clear color registers
756 * correctly even if the fast clear eliminate pass is not needed.
757 */
758 if ((eliminate_needed || !sctx->screen->info.has_dcc_constant_encode) &&
759 tex->buffer.b.is_shared &&
760 !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
761 continue;
762
763 if (eliminate_needed && too_small)
764 continue;
765
766 /* We can clear any level, but we only set up the clear value registers for the first
767 * level. Therefore, all other levels can be cleared only if the clear value registers
768 * are not used, which is only the case with DCC constant encoding and 0/1 clear values.
769 */
770 if (level > 0 && (eliminate_needed || !sctx->screen->info.has_dcc_constant_encode))
771 continue;
772
773 if (tex->buffer.b.b.nr_samples >= 2 && eliminate_needed &&
774 !sctx->screen->allow_dcc_msaa_clear_to_reg_for_bpp[util_logbase2(tex->surface.bpe)])
775 continue;
776
777 assert(num_clears < ARRAY_SIZE(info));
778
779 if (!vi_dcc_get_clear_info(sctx, tex, level, reset_value, &info[num_clears]))
780 continue;
781
782 num_clears++;
783 clear_types |= SI_CLEAR_TYPE_DCC;
784
785 si_mark_display_dcc_dirty(sctx, tex);
786
787 if (sctx->gfx_level >= GFX11 && reset_value == GFX11_DCC_CLEAR_SINGLE) {
788 /* Put this clear first by moving other clears after it because this clear has
789 * the most GPU overhead.
790 */
791 if (num_clears)
792 memmove(&info[1], &info[0], sizeof(info[0]) * num_clears);
793
794 si_init_clear_image_dcc_single(&info[0], tex, level, fb->cbufs[i]->format,
795 color);
796 num_clears++;
797 }
798
799 /* DCC fast clear with MSAA should clear CMASK to 0xC. */
800 if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
801 assert(sctx->gfx_level < GFX11); /* no FMASK/CMASK on GFX11 */
802 assert(num_clears < ARRAY_SIZE(info));
803 si_init_buffer_clear(&info[num_clears++], &tex->cmask_buffer->b.b,
804 tex->surface.cmask_offset, tex->surface.cmask_size, 0xCCCCCCCC);
805 clear_types |= SI_CLEAR_TYPE_CMASK;
806 fmask_decompress_needed = true;
807 }
808 } else {
809 /* No CMASK on GFX11. */
810 if (sctx->gfx_level >= GFX11)
811 continue;
812
813 if (level > 0)
814 continue;
815
816 /* Shared textures can't use fast clear without an explicit flush
817 * because the clear color is not exported.
818 */
819 if (tex->buffer.b.is_shared &&
820 !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
821 continue;
822
823 if (too_small)
824 continue;
825
826 /* 128-bit formats are unsupported */
827 if (tex->surface.bpe > 8) {
828 continue;
829 }
830
831 /* RB+ doesn't work with CMASK fast clear on Stoney. */
832 if (sctx->family == CHIP_STONEY)
833 continue;
834
835 /* Disable fast clear if tex is encrypted */
836 if (tex->buffer.flags & RADEON_FLAG_ENCRYPTED)
837 continue;
838
839 uint64_t cmask_offset = 0;
840 unsigned clear_size = 0;
841 bool had_cmask_buffer = tex->cmask_buffer != NULL;
842
843 if (sctx->gfx_level >= GFX10) {
844 assert(level == 0);
845
846 /* Clearing CMASK with both multiple levels and multiple layers is not
847 * implemented.
848 */
849 if (num_layers > 1 && tex->buffer.b.b.last_level > 0)
850 continue;
851
852 if (!si_alloc_separate_cmask(sctx->screen, tex))
853 continue;
854
855 if (num_layers == 1) {
856 /* Clear level 0. */
857 cmask_offset = tex->surface.cmask_offset + tex->surface.u.gfx9.color.cmask_level0.offset;
858 clear_size = tex->surface.u.gfx9.color.cmask_level0.size;
859 } else if (tex->buffer.b.b.last_level == 0) {
860 /* Clear all layers having only 1 level. */
861 cmask_offset = tex->surface.cmask_offset;
862 clear_size = tex->surface.cmask_size;
863 } else {
864 assert(0); /* this is prevented above */
865 }
866 } else if (sctx->gfx_level == GFX9) {
867 /* TODO: Implement CMASK fast clear for level 0 of mipmapped textures. Mipmapped
868 * CMASK has to clear a rectangular area of CMASK for level 0 (because the whole
869 * miptree is organized in a 2D plane).
870 */
871 if (tex->buffer.b.b.last_level > 0)
872 continue;
873
874 if (!si_alloc_separate_cmask(sctx->screen, tex))
875 continue;
876
877 cmask_offset = tex->surface.cmask_offset;
878 clear_size = tex->surface.cmask_size;
879 } else {
880 if (!si_alloc_separate_cmask(sctx->screen, tex))
881 continue;
882
883 /* GFX6-8: This only covers mipmap level 0. */
884 cmask_offset = tex->surface.cmask_offset;
885 clear_size = tex->surface.cmask_size;
886 }
887
888 /* Do the fast clear. */
889 assert(num_clears < ARRAY_SIZE(info));
890 si_init_buffer_clear(&info[num_clears++], &tex->cmask_buffer->b.b,
891 cmask_offset, clear_size, 0);
892 clear_types |= SI_CLEAR_TYPE_CMASK;
893 eliminate_needed = true;
894 /* If we allocated a cmask buffer for this tex we need to re-emit
895 * the fb state.
896 */
897 need_dirtying_fb = !had_cmask_buffer;
898 }
899
900 if ((eliminate_needed || fmask_decompress_needed) &&
901 !(tex->dirty_level_mask & (1 << level))) {
902 assert(sctx->gfx_level < GFX11); /* no decompression needed on GFX11 */
903 tex->dirty_level_mask |= 1 << level;
904 p_atomic_inc(&sctx->screen->compressed_colortex_counter);
905 }
906
907 *buffers &= ~(PIPE_CLEAR_COLOR0 << i);
908
909 /* Chips with DCC constant encoding don't need to set the clear
910 * color registers for DCC clear values 0 and 1.
911 */
912 if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed)
913 continue;
914
915 /* There are no clear color registers on GFX11. */
916 assert(sctx->gfx_level < GFX11);
917
918 if (si_set_clear_color(tex, fb->cbufs[i]->format, color) || need_dirtying_fb) {
919 sctx->framebuffer.dirty_cbufs |= 1 << i;
920 si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
921 }
922 }
923
924 /* Depth/stencil clears. */
925 struct pipe_surface *zsbuf = fb->zsbuf;
926 struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL;
927 unsigned zs_num_layers = zstex ? util_num_layers(&zstex->buffer.b.b, zsbuf->u.tex.level) : 0;
928
929 if (zstex && zsbuf->u.tex.first_layer == 0 &&
930 zsbuf->u.tex.last_layer == zs_num_layers - 1 &&
931 si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_ZS)) {
932 unsigned level = zsbuf->u.tex.level;
933 bool update_db_depth_clear = false;
934 bool update_db_stencil_clear = false;
935 bool fb_too_small = num_pixels * zs_num_layers <= 512 * 512;
936
937 /* Transition from TC-incompatible to TC-compatible HTILE if requested. */
938 if (zstex->enable_tc_compatible_htile_next_clear) {
939 /* If both depth and stencil are present, they must be cleared together. */
940 if ((*buffers & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL ||
941 (*buffers & PIPE_CLEAR_DEPTH && (!zstex->surface.has_stencil ||
942 zstex->htile_stencil_disabled))) {
943 /* The conversion from TC-incompatible to TC-compatible can only be done in one clear. */
944 assert(zstex->buffer.b.b.last_level == 0);
945 assert(!zstex->tc_compatible_htile);
946
947 /* Enable TC-compatible HTILE. */
948 zstex->enable_tc_compatible_htile_next_clear = false;
949 zstex->tc_compatible_htile = true;
950
951 /* Update the framebuffer state to reflect the change. */
952 sctx->framebuffer.DB_has_shader_readable_metadata = true;
953 sctx->framebuffer.dirty_zsbuf = true;
954 si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
955
956 /* Update all sampler views and shader images in all contexts. */
957 p_atomic_inc(&sctx->screen->dirty_tex_counter);
958
959 /* Perform the clear here if possible, else clear to uncompressed. */
960 uint32_t clear_value;
961
962 if (zstex->htile_stencil_disabled || !zstex->surface.has_stencil) {
963 if (si_can_fast_clear_depth(zstex, level, depth, *buffers)) {
964 /* Z-only clear. */
965 clear_value = si_get_htile_clear_value(zstex, depth);
966 *buffers &= ~PIPE_CLEAR_DEPTH;
967 zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
968 zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
969 update_db_depth_clear = true;
970 }
971 } else if ((*buffers & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL) {
972 if (si_can_fast_clear_depth(zstex, level, depth, *buffers) &&
973 si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) {
974 /* Combined Z+S clear. */
975 clear_value = si_get_htile_clear_value(zstex, depth);
976 *buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
977 zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
978 zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
979 zstex->stencil_cleared_level_mask_once |= BITFIELD_BIT(level);
980 update_db_depth_clear = true;
981 update_db_stencil_clear = true;
982 }
983 }
984
985 if (!update_db_depth_clear) {
986 /* Clear to uncompressed, so that it doesn't contain values incompatible
987 * with the new TC-compatible HTILE setting.
988 *
989 * 0xfffff30f = uncompressed Z + S
990 * 0xfffc000f = uncompressed Z only
991 */
992 clear_value = !zstex->htile_stencil_disabled ? 0xfffff30f : 0xfffc000f;
993 }
994
995 zstex->need_flush_after_depth_decompression = sctx->gfx_level == GFX10_3;
996
997 assert(num_clears < ARRAY_SIZE(info));
998 si_init_buffer_clear(&info[num_clears++], &zstex->buffer.b.b,
999 zstex->surface.meta_offset, zstex->surface.meta_size, clear_value);
1000 clear_types |= SI_CLEAR_TYPE_HTILE;
1001 }
1002 } else if (num_clears || !fb_too_small) {
1003 /* This is where the HTILE buffer clear is done.
1004 *
1005 * If there is no clear scheduled and the framebuffer size is too small, we should use
1006 * the draw-based clear, which doesn't require waiting for idle. If some other clear is
1007 * already scheduled, we will have to wait anyway, so add the HTILE buffer clear to the batch here.
1008 * If the framebuffer size is large enough, use this codepath too.
1009 */
1010 uint64_t htile_offset = zstex->surface.meta_offset;
1011 unsigned htile_size = 0;
1012
1013 /* Determine the HTILE subset to clear. */
1014 if (sctx->gfx_level >= GFX10) {
1015 /* This can only clear a layered texture with 1 level or a mipmap texture
1016 * with 1 layer. Other cases are unimplemented.
1017 */
1018 if (zs_num_layers == 1) {
1019 /* Clear a specific level. */
1020 htile_offset += zstex->surface.u.gfx9.meta_levels[level].offset;
1021 htile_size = zstex->surface.u.gfx9.meta_levels[level].size;
1022 } else if (zstex->buffer.b.b.last_level == 0) {
1023 /* Clear all layers having only 1 level. */
1024 htile_size = zstex->surface.meta_size;
1025 }
1026 } else {
1027 /* This can only clear a layered texture with 1 level. Other cases are
1028 * unimplemented.
1029 */
1030 if (zstex->buffer.b.b.last_level == 0)
1031 htile_size = zstex->surface.meta_size;
1032 }
1033
1034 /* Perform the clear if it's possible. */
1035 if (zstex->htile_stencil_disabled || !zstex->surface.has_stencil) {
1036 if (htile_size &&
1037 si_can_fast_clear_depth(zstex, level, depth, *buffers)) {
1038 /* Z-only clear. */
1039 assert(num_clears < ARRAY_SIZE(info));
1040 si_init_buffer_clear(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
1041 htile_size, si_get_htile_clear_value(zstex, depth));
1042 clear_types |= SI_CLEAR_TYPE_HTILE;
1043 *buffers &= ~PIPE_CLEAR_DEPTH;
1044 zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
1045 zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
1046 update_db_depth_clear = true;
1047 }
1048 } else if ((*buffers & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL) {
1049 if (htile_size &&
1050 si_can_fast_clear_depth(zstex, level, depth, *buffers) &&
1051 si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) {
1052 /* Combined Z+S clear. */
1053 assert(num_clears < ARRAY_SIZE(info));
1054 si_init_buffer_clear(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
1055 htile_size, si_get_htile_clear_value(zstex, depth));
1056 clear_types |= SI_CLEAR_TYPE_HTILE;
1057 *buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
1058 zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
1059 zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
1060 zstex->stencil_cleared_level_mask_once |= BITFIELD_BIT(level);
1061 update_db_depth_clear = true;
1062 update_db_stencil_clear = true;
1063 }
1064 } else {
1065 /* Z-only or S-only clear when both Z/S are present using a read-modify-write
1066 * compute shader.
1067 *
1068 * If we get both clears but only one of them can be fast-cleared, we use
1069 * the draw-based fast clear to do both at the same time.
1070 */
1071 const uint32_t htile_depth_writemask = 0xfffffc0f;
1072 const uint32_t htile_stencil_writemask = 0x000003f0;
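/* Per the HTILE layout above: the depth writemask (0xfffffc0f) covers ZMask and bits 31:10
 * (Z Range), the stencil writemask (0x000003f0) covers SR0, SR1 and SMem.
 */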
1073
1074 if (htile_size &&
1075 !(*buffers & PIPE_CLEAR_STENCIL) &&
1076 si_can_fast_clear_depth(zstex, level, depth, *buffers)) {
1077 /* Z-only clear with stencil left intact. */
1078 assert(num_clears < ARRAY_SIZE(info));
1079 si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
1080 htile_size, si_get_htile_clear_value(zstex, depth),
1081 htile_depth_writemask);
1082 clear_types |= SI_CLEAR_TYPE_HTILE;
1083 *buffers &= ~PIPE_CLEAR_DEPTH;
1084 zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
1085 zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
1086 update_db_depth_clear = true;
1087 } else if (htile_size &&
1088 !(*buffers & PIPE_CLEAR_DEPTH) &&
1089 si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) {
1090 /* Stencil-only clear with depth left intact. */
1091 assert(num_clears < ARRAY_SIZE(info));
1092 si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
1093 htile_size, si_get_htile_clear_value(zstex, depth),
1094 htile_stencil_writemask);
1095 clear_types |= SI_CLEAR_TYPE_HTILE;
1096 *buffers &= ~PIPE_CLEAR_STENCIL;
1097 zstex->stencil_cleared_level_mask_once |= BITFIELD_BIT(level);
1098 update_db_stencil_clear = true;
1099 }
1100 }
1101
1102 zstex->need_flush_after_depth_decompression = update_db_depth_clear && sctx->gfx_level == GFX10_3;
1103 }
1104
1105 /* Update DB_DEPTH_CLEAR. */
1106 if (update_db_depth_clear &&
1107 zstex->depth_clear_value[level] != (float)depth) {
1108 zstex->depth_clear_value[level] = depth;
1109 sctx->framebuffer.dirty_zsbuf = true;
1110 si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
1111 }
1112
1113 /* Update DB_STENCIL_CLEAR. */
1114 if (update_db_stencil_clear &&
1115 zstex->stencil_clear_value[level] != stencil) {
1116 zstex->stencil_clear_value[level] = stencil;
1117 sctx->framebuffer.dirty_zsbuf = true;
1118 si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
1119 }
1120 }
1121
1122 si_execute_clears(sctx, info, num_clears, clear_types, sctx->render_cond_enabled);
1123 }
1124
1125 static void si_fb_clear_via_compute(struct si_context *sctx, unsigned *buffers,
1126 const union pipe_color_union *color)
1127 {
1128 struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
1129
1130 unsigned color_buffer_mask = (*buffers & PIPE_CLEAR_COLOR) >> util_logbase2(PIPE_CLEAR_COLOR0);
1131 while (color_buffer_mask) {
1132 unsigned i = u_bit_scan(&color_buffer_mask);
1133
1134 struct pipe_surface *surf = fb->cbufs[i];
1135 unsigned depth = surf->u.tex.last_layer - surf->u.tex.first_layer + 1;
1136 struct si_texture *tex = (struct si_texture *)surf->texture;
1137
1138 /* If DCC is enabled (which can happen with thick tiling on gfx8), don't use compute, so
1139 * that we can get compressed clears.
1140 */
1141 if (vi_dcc_enabled(tex, surf->u.tex.level))
1142 continue;
1143
1144 /* Clears of thick and linear layouts are fastest with compute. */
1145 if (tex->surface.thick_tiling ||
1146 (tex->surface.is_linear && (surf->height > 1 || depth > 1 || surf->width >= 8192))) {
1147 struct pipe_box box;
1148
1149 u_box_3d(0, 0, surf->u.tex.first_layer, surf->width, surf->height, depth, &box);
1150
1151 if (si_compute_clear_image(sctx, &tex->buffer.b.b, surf->format, surf->u.tex.level, &box,
1152 color, sctx->render_cond_enabled, true))
1153 *buffers &= ~(PIPE_CLEAR_COLOR0 << i); /* success */
1154 }
1155 }
1156 }
1157
1158 static void gfx6_clear(struct pipe_context *ctx, unsigned buffers,
1159 const struct pipe_scissor_state *scissor_state,
1160 const union pipe_color_union *color, double depth, unsigned stencil)
1161 {
1162 struct si_context *sctx = (struct si_context *)ctx;
1163 struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
1164 struct pipe_surface *zsbuf = fb->zsbuf;
1165 struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL;
1166
1167 /* Unset clear flags for non-existent buffers. */
1168 for (unsigned i = 0; i < 8; i++) {
1169 if (i >= fb->nr_cbufs || !fb->cbufs[i])
1170 buffers &= ~(PIPE_CLEAR_COLOR0 << i);
1171 }
1172 if (!zsbuf)
1173 buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
1174 else if (!util_format_has_stencil(util_format_description(zsbuf->format)))
1175 buffers &= ~PIPE_CLEAR_STENCIL;
1176
1177 si_fast_clear(sctx, &buffers, color, depth, stencil);
1178 if (!buffers)
1179 return; /* all buffers have been cleared */
1180
1181 si_fb_clear_via_compute(sctx, &buffers, color);
1182 if (!buffers)
1183 return; /* all buffers have been cleared */
1184
1185 if (buffers & PIPE_CLEAR_COLOR) {
1186 /* These buffers cannot use fast clear; make sure to disable expansion. */
1187 unsigned color_buffer_mask = (buffers & PIPE_CLEAR_COLOR) >> util_logbase2(PIPE_CLEAR_COLOR0);
1188 while (color_buffer_mask) {
1189 unsigned i = u_bit_scan(&color_buffer_mask);
1190 struct si_texture *tex = (struct si_texture *)fb->cbufs[i]->texture;
1191 if (tex->surface.fmask_size == 0)
1192 tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
1193 }
1194 }
1195
1196 if (zstex && zsbuf->u.tex.first_layer == 0 &&
1197 zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) {
1198 unsigned level = zsbuf->u.tex.level;
1199
1200 if (si_can_fast_clear_depth(zstex, level, depth, buffers)) {
1201 /* Need to disable EXPCLEAR temporarily if clearing
1202 * to a new value. */
1203 if (!(zstex->depth_cleared_level_mask_once & BITFIELD_BIT(level)) ||
1204 zstex->depth_clear_value[level] != depth) {
1205 sctx->db_depth_disable_expclear = true;
1206 }
1207
1208 if (zstex->depth_clear_value[level] != (float)depth) {
1209 if ((zstex->depth_clear_value[level] != 0) != (depth != 0)) {
1210 /* ZRANGE_PRECISION register of a bound surface will change so we
1211 * must flush the DB caches. */
1212 sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_DB;
1213 si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
1214 }
1215 /* Update DB_DEPTH_CLEAR. */
1216 zstex->depth_clear_value[level] = depth;
1217 sctx->framebuffer.dirty_zsbuf = true;
1218 si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
1219 }
1220 sctx->db_depth_clear = true;
1221 si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1222 }
1223
1224 if (si_can_fast_clear_stencil(zstex, level, stencil, buffers)) {
1225 stencil &= 0xff;
1226
1227 /* Need to disable EXPCLEAR temporarily if clearing
1228 * to a new value. */
1229 if (!(zstex->stencil_cleared_level_mask_once & BITFIELD_BIT(level)) ||
1230 zstex->stencil_clear_value[level] != stencil) {
1231 sctx->db_stencil_disable_expclear = true;
1232 }
1233
1234 if (zstex->stencil_clear_value[level] != (uint8_t)stencil) {
1235 /* Update DB_STENCIL_CLEAR. */
1236 zstex->stencil_clear_value[level] = stencil;
1237 sctx->framebuffer.dirty_zsbuf = true;
1238 si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
1239 }
1240 sctx->db_stencil_clear = true;
1241 si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1242 }
1243
1244 /* TODO: This hack fixes dEQP-GLES[23].functional.fragment_ops.random.* on Navi31.
1245 * The root cause is unknown.
1246 */
1247 if (sctx->gfx_level == GFX11 || sctx->gfx_level == GFX11_5) {
1248 sctx->barrier_flags |= SI_BARRIER_SYNC_VS;
1249 si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
1250 }
1251 }
1252
1253 if (unlikely(sctx->sqtt_enabled)) {
1254 if (buffers & PIPE_CLEAR_COLOR)
1255 sctx->sqtt_next_event = EventCmdClearColorImage;
1256 else if (buffers & PIPE_CLEAR_DEPTHSTENCIL)
1257 sctx->sqtt_next_event = EventCmdClearDepthStencilImage;
1258 }
1259
1260 si_blitter_begin(sctx, SI_CLEAR);
1261 util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb),
1262 buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1);
1263 si_blitter_end(sctx);
1264
1265 if (sctx->db_depth_clear) {
1266 sctx->db_depth_clear = false;
1267 sctx->db_depth_disable_expclear = false;
1268 zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(zsbuf->u.tex.level);
1269 zstex->depth_cleared_level_mask |= BITFIELD_BIT(zsbuf->u.tex.level);
1270 si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1271 }
1272
1273 if (sctx->db_stencil_clear) {
1274 sctx->db_stencil_clear = false;
1275 sctx->db_stencil_disable_expclear = false;
1276 zstex->stencil_cleared_level_mask_once |= BITFIELD_BIT(zsbuf->u.tex.level);
1277 si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1278 }
1279 }
1280
1281 static void gfx12_clear(struct pipe_context *ctx, unsigned buffers,
1282 const struct pipe_scissor_state *scissor_state,
1283 const union pipe_color_union *color, double depth, unsigned stencil)
1284 {
1285 struct si_context *sctx = (struct si_context *)ctx;
1286 struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
1287 struct pipe_surface *zsbuf = fb->zsbuf;
1288 struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL;
1289
1290 /* Unset clear flags for non-existent buffers. */
1291 for (unsigned i = 0; i < 8; i++) {
1292 if (i >= fb->nr_cbufs || !fb->cbufs[i])
1293 buffers &= ~(PIPE_CLEAR_COLOR0 << i);
1294 }
1295 if (!zsbuf)
1296 buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
1297 else if (!util_format_has_stencil(util_format_description(zsbuf->format)))
1298 buffers &= ~PIPE_CLEAR_STENCIL;
1299
1300 if (unlikely(sctx->sqtt_enabled)) {
1301 if (buffers & PIPE_CLEAR_COLOR)
1302 sctx->sqtt_next_event = EventCmdClearColorImage;
1303 else if (buffers & PIPE_CLEAR_DEPTHSTENCIL)
1304 sctx->sqtt_next_event = EventCmdClearDepthStencilImage;
1305 }
1306
1307 si_blitter_begin(sctx, SI_CLEAR);
1308 util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb),
1309 buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1);
1310 si_blitter_end(sctx);
1311
1312 /* This is only used by the driver, not the hw. */
1313 if (buffers & PIPE_CLEAR_DEPTH) {
1314 zstex->depth_cleared_level_mask |= BITFIELD_BIT(zsbuf->u.tex.level);
1315 zstex->depth_clear_value[zsbuf->u.tex.level] = depth;
1316 }
1317 }
1318
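/* Try to perform the clear through the regular pipe->clear path by temporarily binding dst as
 * the only framebuffer attachment. Returns true if the clear was submitted.
 */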
1319 static bool si_try_normal_clear(struct si_context *sctx, struct pipe_surface *dst,
1320 unsigned dstx, unsigned dsty, unsigned width, unsigned height,
1321 bool render_condition_enabled, unsigned buffers,
1322 const union pipe_color_union *color,
1323 float depth, unsigned stencil)
1324 {
1325 /* This is worth it only if it's a whole image clear. */
1326 if (dstx == 0 && dsty == 0 &&
1327 width == dst->width &&
1328 height == dst->height &&
1329 dst->u.tex.first_layer == 0 &&
1330 dst->u.tex.last_layer == util_max_layer(dst->texture, dst->u.tex.level) &&
1331 /* pipe->clear honors render_condition, so only use it if it's unset or if it's set and enabled. */
1332 (!sctx->render_cond || render_condition_enabled) &&
1333 sctx->has_graphics) {
1334 struct pipe_context *ctx = &sctx->b;
1335 struct pipe_framebuffer_state saved_fb = {}, fb = {};
1336
1337 util_copy_framebuffer_state(&saved_fb, &sctx->framebuffer.state);
1338
1339 if (buffers & PIPE_CLEAR_COLOR) {
1340 fb.cbufs[0] = dst;
1341 fb.nr_cbufs = 1;
1342 } else {
1343 fb.zsbuf = dst;
1344 }
1345
1346 fb.width = dst->width;
1347 fb.height = dst->height;
1348
1349 ctx->set_framebuffer_state(ctx, &fb);
1350 ctx->clear(ctx, buffers, NULL, color, depth, stencil);
1351 ctx->set_framebuffer_state(ctx, &saved_fb);
1352
1353 util_copy_framebuffer_state(&saved_fb, NULL);
1354
1355 return true;
1356 }
1357
1358 return false;
1359 }
1360
1361 bool si_compute_fast_clear_image(struct si_context *sctx, struct pipe_resource *dst,
1362 enum pipe_format format, unsigned level, const struct pipe_box *box,
1363 const union pipe_color_union *color, bool render_condition_enable,
1364 bool fail_if_slow)
1365 {
1366 struct si_texture *sdst = (struct si_texture*)dst;
1367
1368 if (!vi_dcc_enabled(sdst, level))
1369 return false;
1370
1371 /* Only the whole image can be cleared. */
1372 if (box->x != 0 || box->y != 0 || box->width != u_minify(dst->width0, level) ||
1373 box->height != u_minify(dst->height0, level) || box->depth != util_num_layers(dst, level))
1374 return false;
1375
1376 uint32_t dcc_value;
1377 bool eliminate_needed;
1378
1379 /* Get the DCC clear value. */
1380 if (sctx->gfx_level >= GFX11) {
1381 if (!gfx11_get_dcc_clear_parameters(sctx->screen, sdst, level, format,
1382 color, &dcc_value, fail_if_slow))
1383 return false;
1384 } else {
1385 if (!gfx8_get_dcc_clear_parameters(sctx->screen, dst->format, format, color, &dcc_value,
1386 &eliminate_needed) ||
1387 eliminate_needed)
1388 return false;
1389 }
1390
1391 /* Get DCC clear info. */
1392 struct si_clear_info info[3]; /* DCC + CMASK + clear_image_dcc_single */
1393 unsigned num_clears = 0, clear_types = 0;
1394
1395 if (!vi_dcc_get_clear_info(sctx, sdst, level, dcc_value, &info[num_clears]))
1396 return false;
1397
1398 num_clears++;
1399 clear_types |= SI_CLEAR_TYPE_DCC;
1400 si_mark_display_dcc_dirty(sctx, sdst);
1401
1402 if (sctx->gfx_level >= GFX11 && dcc_value == GFX11_DCC_CLEAR_SINGLE) {
1403 /* Put this clear first by moving other clears after it because this clear has
1404 * the most GPU overhead.
1405 */
1406 memmove(&info[1], &info[0], sizeof(info[0]) * num_clears);
1407 si_init_clear_image_dcc_single(&info[0], sdst, level, format, color);
1408 num_clears++;
1409 }
1410
1411 /* DCC fast clear with MSAA should clear CMASK to 0xC. */
1412 if (dst->nr_samples >= 2 && sdst->cmask_buffer) {
1413 assert(sctx->gfx_level < GFX11); /* no FMASK/CMASK on GFX11 */
1414 assert(num_clears < ARRAY_SIZE(info));
1415 si_init_buffer_clear(&info[num_clears++], &sdst->cmask_buffer->b.b,
1416 sdst->surface.cmask_offset, sdst->surface.cmask_size, 0xCCCCCCCC);
1417 clear_types |= SI_CLEAR_TYPE_CMASK;
1418
1419 if (!(sdst->dirty_level_mask & BITFIELD_BIT(level))) {
1420 sdst->dirty_level_mask |= BITFIELD_BIT(level);
1421 p_atomic_inc(&sctx->screen->compressed_colortex_counter);
1422 }
1423 }
1424
1425 assert(num_clears <= ARRAY_SIZE(info));
1426 si_execute_clears(sctx, info, num_clears, clear_types, render_condition_enable);
1427 return true;
1428 }
1429
1430 static void si_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dst,
1431 const union pipe_color_union *color, unsigned dstx,
1432 unsigned dsty, unsigned width, unsigned height,
1433 bool render_condition_enabled)
1434 {
1435 struct si_context *sctx = (struct si_context *)ctx;
1436 struct si_texture *sdst = (struct si_texture *)dst->texture;
1437
1438 /* For older chips that can do fast clear with any clear color (using GFX8_DCC_CLEAR_REG
1439 * or CMASK).
1440 */
1441 if (sctx->gfx_level <= GFX10_3 &&
1442 (vi_dcc_enabled(sdst, dst->u.tex.level) ||
1443 /* GFX6-9 allow CMASK without MSAA and allocate it on demand, but only 8-64bpp. */
1444 (sctx->gfx_level <= GFX9 && sdst->surface.bpe <= 8)) &&
1445 si_try_normal_clear(sctx, dst, dstx, dsty, width, height, render_condition_enabled,
1446 PIPE_CLEAR_COLOR0, color, 0, 0))
1447 return;
1448
1449 struct pipe_box box;
1450 u_box_3d(dstx, dsty, dst->u.tex.first_layer, width, height,
1451 dst->u.tex.last_layer - dst->u.tex.first_layer + 1, &box);
1452
1453 if (si_compute_fast_clear_image(sctx, dst->texture, dst->format, dst->u.tex.level, &box, color,
1454 render_condition_enabled, true))
1455 return;
1456
1457 if (si_compute_clear_image(sctx, dst->texture, dst->format, dst->u.tex.level, &box, color,
1458 render_condition_enabled, true))
1459 return;
1460
1461 si_gfx_clear_render_target(ctx, dst, color, dstx, dsty, width, height,
1462 render_condition_enabled);
1463 }
1464
1465 void si_gfx_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dst,
1466 const union pipe_color_union *color, unsigned dstx,
1467 unsigned dsty, unsigned width, unsigned height,
1468 bool render_condition_enabled)
1469 {
1470 struct si_context *sctx = (struct si_context *)ctx;
1471
1472 si_blitter_begin(sctx,
1473 SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
1474 util_blitter_clear_render_target(sctx->blitter, dst, color, dstx, dsty, width, height);
1475 si_blitter_end(sctx);
1476 }
1477
1478 static void si_clear_depth_stencil(struct pipe_context *ctx, struct pipe_surface *dst,
1479 unsigned clear_flags, double depth, unsigned stencil,
1480 unsigned dstx, unsigned dsty, unsigned width, unsigned height,
1481 bool render_condition_enabled)
1482 {
1483 struct si_context *sctx = (struct si_context *)ctx;
1484 union pipe_color_union unused = {};
1485
1486 /* Fast path that just clears HTILE. */
1487 if (si_try_normal_clear(sctx, dst, dstx, dsty, width, height, render_condition_enabled,
1488 clear_flags, &unused, depth, stencil))
1489 return;
1490
1491 si_blitter_begin(sctx,
1492 SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
1493 util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil, dstx, dsty,
1494 width, height);
1495 si_blitter_end(sctx);
1496 }
1497
1498 void si_init_clear_functions(struct si_context *sctx)
1499 {
1500 sctx->b.clear_render_target = si_clear_render_target;
1501 sctx->b.clear_texture = u_default_clear_texture;
1502
1503 if (sctx->has_graphics) {
1504 if (sctx->gfx_level >= GFX12)
1505 sctx->b.clear = gfx12_clear;
1506 else
1507 sctx->b.clear = gfx6_clear;
1508
1509 sctx->b.clear_depth_stencil = si_clear_depth_stencil;
1510 }
1511 }
1512