xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/si_clear.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2017 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include "si_pipe.h"
8 #include "sid.h"
9 #include "util/format/u_format.h"
10 #include "util/u_pack_color.h"
11 #include "util/u_surface.h"
12 #include "ac_formats.h"
13 
14 enum {
15    SI_CLEAR = SI_SAVE_FRAGMENT_STATE | SI_SAVE_FRAGMENT_CONSTANT,
16    SI_CLEAR_SURFACE = SI_SAVE_FRAMEBUFFER | SI_SAVE_FRAGMENT_STATE,
17 };
18 
si_init_buffer_clear(struct si_clear_info * info,struct pipe_resource * resource,uint64_t offset,uint32_t size,uint32_t clear_value)19 void si_init_buffer_clear(struct si_clear_info *info,
20                           struct pipe_resource *resource, uint64_t offset,
21                           uint32_t size, uint32_t clear_value)
22 {
23    info->resource = resource;
24    info->offset = offset;
25    info->size = size;
26    info->clear_value = clear_value;
27    info->writemask = 0xffffffff;
28    info->is_dcc_msaa = false;
29    info->format = PIPE_FORMAT_NONE;
30 }
31 
si_init_buffer_clear_rmw(struct si_clear_info * info,struct pipe_resource * resource,uint64_t offset,uint32_t size,uint32_t clear_value,uint32_t writemask)32 static void si_init_buffer_clear_rmw(struct si_clear_info *info,
33                                      struct pipe_resource *resource, uint64_t offset,
34                                      uint32_t size, uint32_t clear_value, uint32_t writemask)
35 {
36    si_init_buffer_clear(info, resource, offset, size, clear_value);
37    info->writemask = writemask;
38    info->format = PIPE_FORMAT_NONE;
39 }
40 
si_init_clear_image_dcc_single(struct si_clear_info * info,struct si_texture * tex,unsigned level,enum pipe_format format,const union pipe_color_union * color)41 static void si_init_clear_image_dcc_single(struct si_clear_info *info, struct si_texture *tex,
42                                            unsigned level, enum pipe_format format,
43                                            const union pipe_color_union *color)
44 {
45    info->resource = &tex->buffer.b.b;
46    info->level = level;
47    info->format = format;
48    memcpy(&info->color, color, sizeof(info->color));
49 }
50 
si_execute_clears(struct si_context * sctx,struct si_clear_info * info,unsigned num_clears,unsigned types,bool render_condition_enable)51 void si_execute_clears(struct si_context *sctx, struct si_clear_info *info,
52                        unsigned num_clears, unsigned types, bool render_condition_enable)
53 {
54    if (!num_clears)
55       return;
56 
57    /* Flush caches and wait for idle. */
58    if (types & (SI_CLEAR_TYPE_CMASK | SI_CLEAR_TYPE_DCC)) {
59       si_make_CB_shader_coherent(sctx, sctx->framebuffer.nr_samples,
60                                  sctx->framebuffer.CB_has_shader_readable_metadata,
61                                  sctx->framebuffer.all_DCC_pipe_aligned);
62    }
63 
64    if (types & SI_CLEAR_TYPE_HTILE) {
65       si_make_DB_shader_coherent(sctx, sctx->framebuffer.nr_samples, sctx->framebuffer.has_stencil,
66                                  sctx->framebuffer.DB_has_shader_readable_metadata);
67    }
68 
69    /* Invalidate the VMEM cache because we always use compute. */
70    sctx->barrier_flags |= SI_BARRIER_INV_VMEM;
71 
72    /* GFX6-8: CB and DB don't use L2. */
73    if (sctx->gfx_level <= GFX8)
74       sctx->barrier_flags |= SI_BARRIER_INV_L2;
75 
76    si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
77 
78    /* Execute clears. */
79    for (unsigned i = 0; i < num_clears; i++) {
80       if (info[i].format) {
81          si_compute_clear_image_dcc_single(sctx, (struct si_texture*)info[i].resource,
82                                            info[i].level, info[i].format, &info[i].color,
83                                            render_condition_enable);
84          continue;
85       }
86 
87       if (info[i].is_dcc_msaa) {
88          gfx9_clear_dcc_msaa(sctx, info[i].resource, info[i].clear_value, render_condition_enable);
89          continue;
90       }
91 
92       assert(info[i].size > 0);
93 
94       if (info[i].writemask != 0xffffffff) {
95          si_compute_clear_buffer_rmw(sctx, info[i].resource, info[i].offset, info[i].size,
96                                      info[i].clear_value, info[i].writemask,
97                                      render_condition_enable);
98       } else {
99          /* Compute shaders are much faster on both dGPUs and APUs. Don't use CP DMA. */
100          si_clear_buffer(sctx, info[i].resource, info[i].offset, info[i].size,
101                          &info[i].clear_value, 4, SI_COMPUTE_CLEAR_METHOD,
102                          render_condition_enable);
103       }
104    }
105 
106    /* Wait for idle. */
107    sctx->barrier_flags |= SI_BARRIER_SYNC_CS;
108 
109    /* GFX6-8: CB and DB don't use L2. */
110    if (sctx->gfx_level <= GFX8)
111       sctx->barrier_flags |= SI_BARRIER_WB_L2;
112 
113    si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
114 }
115 
si_alloc_separate_cmask(struct si_screen * sscreen,struct si_texture * tex)116 static bool si_alloc_separate_cmask(struct si_screen *sscreen, struct si_texture *tex)
117 {
118    assert(sscreen->info.gfx_level < GFX11);
119 
120    /* CMASK for MSAA is allocated in advance or always disabled
121     * by "nofmask" option.
122     */
123    if (tex->cmask_buffer)
124       return true;
125 
126    if (!tex->surface.cmask_size)
127       return false;
128 
129    tex->cmask_buffer =
130       si_aligned_buffer_create(&sscreen->b, PIPE_RESOURCE_FLAG_UNMAPPABLE, PIPE_USAGE_DEFAULT,
131                                tex->surface.cmask_size, 1 << tex->surface.cmask_alignment_log2);
132    if (tex->cmask_buffer == NULL)
133       return false;
134 
135    /* These 2 fields are part of the framebuffer state but dirtying the atom
136     * will be done by the caller.
137     */
138    tex->cmask_base_address_reg = tex->cmask_buffer->gpu_address >> 8;
139    tex->cb_color_info |= S_028C70_FAST_CLEAR(1);
140 
141    p_atomic_inc(&sscreen->compressed_colortex_counter);
142    return true;
143 }
144 
si_set_clear_color(struct si_texture * tex,enum pipe_format surface_format,const union pipe_color_union * color)145 static bool si_set_clear_color(struct si_texture *tex, enum pipe_format surface_format,
146                                const union pipe_color_union *color)
147 {
148    union util_color uc;
149 
150    memset(&uc, 0, sizeof(uc));
151 
152    if (tex->surface.bpe == 16) {
153       /* DCC fast clear only:
154        *   CLEAR_WORD0 = R = G = B
155        *   CLEAR_WORD1 = A
156        */
157       assert(color->ui[0] == color->ui[1] && color->ui[0] == color->ui[2]);
158       uc.ui[0] = color->ui[0];
159       uc.ui[1] = color->ui[3];
160    } else {
161       if (tex->swap_rgb_to_bgr)
162          surface_format = util_format_rgb_to_bgr(surface_format);
163 
164       util_pack_color_union(surface_format, &uc, color);
165    }
166 
167    if (memcmp(tex->color_clear_value, &uc, 2 * sizeof(uint32_t)) == 0)
168       return false;
169 
170    memcpy(tex->color_clear_value, &uc, 2 * sizeof(uint32_t));
171    return true;
172 }
173 
gfx8_get_dcc_clear_parameters(struct si_screen * sscreen,enum pipe_format base_format,enum pipe_format surface_format,const union pipe_color_union * color,uint32_t * clear_value,bool * eliminate_needed)174 static bool gfx8_get_dcc_clear_parameters(struct si_screen *sscreen, enum pipe_format base_format,
175                                           enum pipe_format surface_format,
176                                           const union pipe_color_union *color, uint32_t *clear_value,
177                                           bool *eliminate_needed)
178 {
179    /* If we want to clear without needing a fast clear eliminate step, we
180     * can set color and alpha independently to 0 or 1 (or 0/max for integer
181     * formats).
182     */
183    bool values[4] = {};      /* whether to clear to 0 or 1 */
184    bool color_value = false; /* clear color to 0 or 1 */
185    bool alpha_value = false; /* clear alpha to 0 or 1 */
186    int alpha_channel;        /* index of the alpha component */
187    bool has_color = false;
188    bool has_alpha = false;
189 
190    const struct util_format_description *desc =
191       util_format_description(ac_simplify_cb_format(surface_format));
192 
193    /* 128-bit fast clear with different R,G,B values is unsupported. */
194    if (desc->block.bits == 128 && (color->ui[0] != color->ui[1] || color->ui[0] != color->ui[2]))
195       return false;
196 
197    *eliminate_needed = true;
198    *clear_value = GFX8_DCC_CLEAR_REG;
199 
200    if (desc->layout != UTIL_FORMAT_LAYOUT_PLAIN)
201       return true; /* need ELIMINATE_FAST_CLEAR */
202 
203    bool base_alpha_is_on_msb = ac_alpha_is_on_msb(&sscreen->info, base_format);
204    bool surf_alpha_is_on_msb = ac_alpha_is_on_msb(&sscreen->info, surface_format);
205 
206    /* Formats with 3 channels can't have alpha. */
207    if (desc->nr_channels == 3)
208       alpha_channel = -1;
209    else if (surf_alpha_is_on_msb)
210       alpha_channel = desc->nr_channels - 1;
211    else
212       alpha_channel = 0;
213 
214    for (int i = 0; i < 4; ++i) {
215       if (desc->swizzle[i] >= PIPE_SWIZZLE_0)
216          continue;
217 
218       if (desc->channel[i].pure_integer && desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED) {
219          /* Use the maximum value for clamping the clear color. */
220          int max = u_bit_consecutive(0, desc->channel[i].size - 1);
221 
222          values[i] = color->i[i] != 0;
223          if (color->i[i] != 0 && MIN2(color->i[i], max) != max)
224             return true; /* need ELIMINATE_FAST_CLEAR */
225       } else if (desc->channel[i].pure_integer &&
226                  desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED) {
227          /* Use the maximum value for clamping the clear color. */
228          unsigned max = u_bit_consecutive(0, desc->channel[i].size);
229 
230          values[i] = color->ui[i] != 0U;
231          if (color->ui[i] != 0U && MIN2(color->ui[i], max) != max)
232             return true; /* need ELIMINATE_FAST_CLEAR */
233       } else {
234          values[i] = color->f[i] != 0.0F;
235          if (color->f[i] != 0.0F && color->f[i] != 1.0F)
236             return true; /* need ELIMINATE_FAST_CLEAR */
237       }
238 
239       if (desc->swizzle[i] == alpha_channel) {
240          alpha_value = values[i];
241          has_alpha = true;
242       } else {
243          color_value = values[i];
244          has_color = true;
245       }
246    }
247 
248    /* If alpha isn't present, make it the same as color, and vice versa. */
249    if (!has_alpha)
250       alpha_value = color_value;
251    else if (!has_color)
252       color_value = alpha_value;
253 
254    if (color_value != alpha_value && base_alpha_is_on_msb != surf_alpha_is_on_msb)
255       return true; /* require ELIMINATE_FAST_CLEAR */
256 
257    /* Check if all color values are equal if they are present. */
258    for (int i = 0; i < 4; ++i) {
259       if (desc->swizzle[i] <= PIPE_SWIZZLE_W && desc->swizzle[i] != alpha_channel &&
260           values[i] != color_value)
261          return true; /* require ELIMINATE_FAST_CLEAR */
262    }
263 
264    /* This doesn't need ELIMINATE_FAST_CLEAR.
265     * On chips predating Raven2, the DCC clear codes and the CB clear
266     * color registers must match.
267     */
268    *eliminate_needed = false;
269 
270    if (color_value) {
271       if (alpha_value)
272          *clear_value = GFX8_DCC_CLEAR_1111;
273       else
274          *clear_value = GFX8_DCC_CLEAR_1110;
275    } else {
276       if (alpha_value)
277          *clear_value = GFX8_DCC_CLEAR_0001;
278       else
279          *clear_value = GFX8_DCC_CLEAR_0000;
280    }
281    return true;
282 }
283 
gfx11_get_dcc_clear_parameters(struct si_screen * sscreen,struct si_texture * tex,unsigned level,enum pipe_format surface_format,const union pipe_color_union * color,uint32_t * clear_value,bool fail_if_slow)284 static bool gfx11_get_dcc_clear_parameters(struct si_screen *sscreen, struct si_texture *tex,
285                                            unsigned level, enum pipe_format surface_format,
286                                            const union pipe_color_union *color, uint32_t *clear_value,
287                                            bool fail_if_slow)
288 {
289    const struct util_format_description *desc =
290       util_format_description(ac_simplify_cb_format(surface_format));
291    unsigned start_bit = UINT_MAX;
292    unsigned end_bit = 0;
293 
294    /* Find the used bit range. */
295    for (unsigned i = 0; i < 4; i++) {
296       unsigned swizzle = desc->swizzle[i];
297 
298       if (swizzle >= PIPE_SWIZZLE_0)
299          continue;
300 
301       start_bit = MIN2(start_bit, desc->channel[swizzle].shift);
302       end_bit = MAX2(end_bit, desc->channel[swizzle].shift + desc->channel[swizzle].size);
303    }
304 
305    union {
306       uint8_t ub[16];
307       uint16_t us[8];
308       uint32_t ui[4];
309    } value = {};
310    util_pack_color_union(surface_format, (union util_color*)&value, color);
311 
312    /* Check the cases where all components or bits are either all 0 or all 1. */
313    bool all_bits_are_0 = true;
314    bool all_bits_are_1 = true;
315    bool all_words_are_fp16_1 = false;
316    bool all_words_are_fp32_1 = false;
317 
318    for (unsigned i = start_bit; i < end_bit; i++) {
319       bool bit = value.ub[i / 8] & BITFIELD_BIT(i % 8);
320 
321       all_bits_are_0 &= !bit;
322       all_bits_are_1 &= bit;
323    }
324 
325    if (start_bit % 16 == 0 && end_bit % 16 == 0) {
326       all_words_are_fp16_1 = true;
327       for (unsigned i = start_bit / 16; i < end_bit / 16; i++)
328          all_words_are_fp16_1 &= value.us[i] == 0x3c00;
329    }
330 
331    if (start_bit % 32 == 0 && end_bit % 32 == 0) {
332       all_words_are_fp32_1 = true;
333       for (unsigned i = start_bit / 32; i < end_bit / 32; i++)
334          all_words_are_fp32_1 &= value.ui[i] == 0x3f800000;
335    }
336 
337 #if 0 /* debug code */
338    int i = util_format_get_first_non_void_channel(surface_format);
339    if (desc->channel[i].type == UTIL_FORMAT_TYPE_SIGNED && desc->channel[i].pure_integer) {
340       printf("%i %i %i %i\n", color->i[0], color->i[1], color->i[2], color->i[3]);
341    } else if (desc->channel[i].type == UTIL_FORMAT_TYPE_UNSIGNED && desc->channel[i].pure_integer) {
342       printf("%u %u %u %u\n", color->ui[0], color->ui[1], color->ui[2], color->ui[3]);
343    } else {
344       printf("%f %f %f %f\n", color->f[0], color->f[1], color->f[2], color->f[3]);
345    }
346    for (unsigned i = 0; i < end_bit / 8; i++)
347       printf("%02x", value.ub[i]);
348    printf("\n");
349    printf("bits=[%u..%u)%s%s%s%s\n", start_bit, end_bit,
350           all_bits_are_0 ? ", all 0" : "",
351           all_bits_are_1 ? ", all 1" : "",
352           all_words_are_fp16_1 ? ", all fp16 1" : "",
353           all_words_are_fp32_1 ? ", all fp32 1" : "");
354 #endif
355 
356    *clear_value = 0;
357 
358    if (all_bits_are_0 || all_bits_are_1 || all_words_are_fp16_1 || all_words_are_fp32_1) {
359       if (all_bits_are_0)
360          *clear_value = GFX11_DCC_CLEAR_0000;
361       else if (all_bits_are_1)
362          *clear_value = GFX11_DCC_CLEAR_1111_UNORM;
363       else if (all_words_are_fp16_1)
364          *clear_value = GFX11_DCC_CLEAR_1111_FP16;
365       else if (all_words_are_fp32_1)
366          *clear_value = GFX11_DCC_CLEAR_1111_FP32;
367 
368       return true;
369    }
370 
371    /* Check 0001 and 1110 cases. */
372    if (desc->nr_channels == 2 && desc->channel[0].size == 8) {
373       if (value.ub[0] == 0x00 && value.ub[1] == 0xff) {
374          *clear_value = GFX11_DCC_CLEAR_0001_UNORM;
375          return true;
376       } else if (value.ub[0] == 0xff && value.ub[1] == 0x00) {
377          *clear_value = GFX11_DCC_CLEAR_1110_UNORM;
378          return true;
379       }
380    } else if (desc->nr_channels == 4 && desc->channel[0].size == 8) {
381       if (value.ub[0] == 0x00 && value.ub[1] == 0x00 &&
382           value.ub[2] == 0x00 && value.ub[3] == 0xff) {
383          *clear_value = GFX11_DCC_CLEAR_0001_UNORM;
384          return true;
385       } else if (value.ub[0] == 0xff && value.ub[1] == 0xff &&
386                  value.ub[2] == 0xff && value.ub[3] == 0x00) {
387          *clear_value = GFX11_DCC_CLEAR_1110_UNORM;
388          return true;
389       }
390    } else if (desc->nr_channels == 4 && desc->channel[0].size == 16) {
391       if (value.us[0] == 0x0000 && value.us[1] == 0x0000 &&
392           value.us[2] == 0x0000 && value.us[3] == 0xffff) {
393          *clear_value = GFX11_DCC_CLEAR_0001_UNORM;
394          return true;
395       } else if (value.us[0] == 0xffff && value.us[1] == 0xffff &&
396                  value.us[2] == 0xffff && value.us[3] == 0x0000) {
397          *clear_value = GFX11_DCC_CLEAR_1110_UNORM;
398          return true;
399       }
400    }
401 
402    /* Estimate whether DCC clear-to-single is better than a slow clear. */
403    unsigned width = u_minify(tex->buffer.b.b.width0, level);
404    unsigned height = u_minify(tex->buffer.b.b.height0, level);
405    unsigned depth = util_num_layers(&tex->buffer.b.b, level);
406    unsigned num_samples = MAX2(tex->buffer.b.b.nr_samples, 1);
407    uint64_t size = (uint64_t)width * height * depth * num_samples * tex->surface.bpe;
408 
409    /* These cases perform exceptionally well with DCC clear-to-single, so make them more likely. */
410    if ((num_samples <= 2 && tex->surface.bpe <= 2) ||
411        (num_samples == 1 && tex->surface.bpe == 4))
412       size *= 2;
413 
414    /* These cases perform terribly with DCC clear-to-single. */
415    if (tex->buffer.b.b.nr_samples >= 4 && tex->surface.bpe >= 4)
416       size = 0;
417 
418    /* This is mostly optimal for Navi31. The scaling effect of num_rb on other chips is guessed. */
419    if (!fail_if_slow || size >= sscreen->info.num_rb * 512 * 1024) {
420       *clear_value = GFX11_DCC_CLEAR_SINGLE;
421       return true;
422    }
423 
424    return false;
425 }
426 
vi_dcc_get_clear_info(struct si_context * sctx,struct si_texture * tex,unsigned level,unsigned clear_value,struct si_clear_info * out)427 bool vi_dcc_get_clear_info(struct si_context *sctx, struct si_texture *tex, unsigned level,
428                            unsigned clear_value, struct si_clear_info *out)
429 {
430    struct pipe_resource *dcc_buffer = &tex->buffer.b.b;
431    uint64_t dcc_offset = tex->surface.meta_offset;
432    uint32_t clear_size;
433 
434    assert(vi_dcc_enabled(tex, level));
435 
436    if (sctx->gfx_level >= GFX10) {
437       /* 4x and 8x MSAA needs a sophisticated compute shader for
438        * the clear. GFX11 doesn't need that.
439        */
440       if (sctx->gfx_level < GFX11 && tex->buffer.b.b.nr_storage_samples >= 4)
441          return false;
442 
443       unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
444 
445       if (num_layers == 1) {
446          /* Clear a specific level. */
447          dcc_offset += tex->surface.u.gfx9.meta_levels[level].offset;
448          clear_size = tex->surface.u.gfx9.meta_levels[level].size;
449       } else if (tex->buffer.b.b.last_level == 0) {
450          /* Clear all layers having only 1 level. */
451          clear_size = tex->surface.meta_size;
452       } else {
453          /* Clearing DCC with both multiple levels and multiple layers is not
454           * implemented.
455           */
456          return false;
457       }
458    } else if (sctx->gfx_level == GFX9) {
459       /* TODO: Implement DCC fast clear for level 0 of mipmapped textures. Mipmapped
460        * DCC has to clear a rectangular area of DCC for level 0 (because the whole miptree
461        * is organized in a 2D plane).
462        */
463       if (tex->buffer.b.b.last_level > 0)
464          return false;
465 
466       /* 4x and 8x MSAA need to clear only sample 0 and 1 in a compute shader and leave other
467        * samples untouched. (only the first 2 samples are compressed) */
468       if (tex->buffer.b.b.nr_storage_samples >= 4) {
469          si_init_buffer_clear(out, dcc_buffer, 0, 0, clear_value);
470          out->is_dcc_msaa = true;
471          return true;
472       }
473 
474       clear_size = tex->surface.meta_size;
475    } else {
476       unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
477 
478       /* If this is 0, fast clear isn't possible. (can occur with MSAA) */
479       if (!tex->surface.u.legacy.color.dcc_level[level].dcc_fast_clear_size)
480          return false;
481 
482       /* Layered 4x and 8x MSAA DCC fast clears need to clear
483        * dcc_fast_clear_size bytes for each layer. A compute shader
484        * would be more efficient than separate per-layer clear operations.
485        */
486       if (tex->buffer.b.b.nr_storage_samples >= 4 && num_layers > 1)
487          return false;
488 
489       dcc_offset += tex->surface.u.legacy.color.dcc_level[level].dcc_offset;
490       clear_size = tex->surface.u.legacy.color.dcc_level[level].dcc_fast_clear_size;
491    }
492 
493    si_init_buffer_clear(out, dcc_buffer, dcc_offset, clear_size, clear_value);
494    return true;
495 }
496 
497 /* Set the same micro tile mode as the destination of the last MSAA resolve.
498  * This allows hitting the MSAA resolve fast path, which requires that both
499  * src and dst micro tile modes match.
500  */
si_set_optimal_micro_tile_mode(struct si_screen * sscreen,struct si_texture * tex)501 static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen, struct si_texture *tex)
502 {
503    if (sscreen->info.gfx_level >= GFX10 || tex->buffer.b.is_shared ||
504        tex->buffer.b.b.nr_samples <= 1 ||
505        tex->surface.micro_tile_mode == tex->last_msaa_resolve_target_micro_mode)
506       return;
507 
508    assert(sscreen->info.gfx_level >= GFX9 ||
509           tex->surface.u.legacy.level[0].mode == RADEON_SURF_MODE_2D);
510    assert(tex->buffer.b.b.last_level == 0);
511 
512    if (sscreen->info.gfx_level >= GFX9) {
513       /* 4K or larger tiles only. 0 is linear. 1-3 are 256B tiles. */
514       assert(tex->surface.u.gfx9.swizzle_mode >= 4);
515 
516       /* If you do swizzle_mode % 4, you'll get:
517        *   0 = Depth
518        *   1 = Standard,
519        *   2 = Displayable
520        *   3 = Rotated
521        *
522        * Depth-sample order isn't allowed:
523        */
524       assert(tex->surface.u.gfx9.swizzle_mode % 4 != 0);
525 
526       switch (tex->last_msaa_resolve_target_micro_mode) {
527       case RADEON_MICRO_MODE_DISPLAY:
528          tex->surface.u.gfx9.swizzle_mode &= ~0x3;
529          tex->surface.u.gfx9.swizzle_mode += 2; /* D */
530          break;
531       case RADEON_MICRO_MODE_STANDARD:
532          tex->surface.u.gfx9.swizzle_mode &= ~0x3;
533          tex->surface.u.gfx9.swizzle_mode += 1; /* S */
534          break;
535       case RADEON_MICRO_MODE_RENDER:
536          tex->surface.u.gfx9.swizzle_mode &= ~0x3;
537          tex->surface.u.gfx9.swizzle_mode += 3; /* R */
538          break;
539       default: /* depth */
540          assert(!"unexpected micro mode");
541          return;
542       }
543    } else if (sscreen->info.gfx_level >= GFX7) {
544       /* These magic numbers were copied from addrlib. It doesn't use
545        * any definitions for them either. They are all 2D_TILED_THIN1
546        * modes with different bpp and micro tile mode.
547        */
548       switch (tex->last_msaa_resolve_target_micro_mode) {
549       case RADEON_MICRO_MODE_DISPLAY:
550          tex->surface.u.legacy.tiling_index[0] = 10;
551          break;
552       case RADEON_MICRO_MODE_STANDARD:
553          tex->surface.u.legacy.tiling_index[0] = 14;
554          break;
555       case RADEON_MICRO_MODE_RENDER:
556          tex->surface.u.legacy.tiling_index[0] = 28;
557          break;
558       default: /* depth, thick */
559          assert(!"unexpected micro mode");
560          return;
561       }
562    } else { /* GFX6 */
563       switch (tex->last_msaa_resolve_target_micro_mode) {
564       case RADEON_MICRO_MODE_DISPLAY:
565          switch (tex->surface.bpe) {
566          case 1:
567             tex->surface.u.legacy.tiling_index[0] = 10;
568             break;
569          case 2:
570             tex->surface.u.legacy.tiling_index[0] = 11;
571             break;
572          default: /* 4, 8 */
573             tex->surface.u.legacy.tiling_index[0] = 12;
574             break;
575          }
576          break;
577       case RADEON_MICRO_MODE_STANDARD:
578          switch (tex->surface.bpe) {
579          case 1:
580             tex->surface.u.legacy.tiling_index[0] = 14;
581             break;
582          case 2:
583             tex->surface.u.legacy.tiling_index[0] = 15;
584             break;
585          case 4:
586             tex->surface.u.legacy.tiling_index[0] = 16;
587             break;
588          default: /* 8, 16 */
589             tex->surface.u.legacy.tiling_index[0] = 17;
590             break;
591          }
592          break;
593       default: /* depth, thick */
594          assert(!"unexpected micro mode");
595          return;
596       }
597    }
598 
599    tex->surface.micro_tile_mode = tex->last_msaa_resolve_target_micro_mode;
600 
601    p_atomic_inc(&sscreen->dirty_tex_counter);
602 }
603 
si_get_htile_clear_value(struct si_texture * tex,float depth)604 static uint32_t si_get_htile_clear_value(struct si_texture *tex, float depth)
605 {
606    /* Maximum 14-bit UINT value. */
607    const uint32_t max_z_value = 0x3FFF;
608 
609    /* For clears, Zmask and Smem will always be set to zero. */
610    const uint32_t zmask = 0;
611    const uint32_t smem  = 0;
612 
613    /* Convert depthValue to 14-bit zmin/zmax uint values. */
614    const uint32_t zmin = lroundf(depth * max_z_value);
615    const uint32_t zmax = zmin;
616 
617    if (tex->htile_stencil_disabled) {
618       /* Z-only HTILE is laid out as follows:
619        * |31     18|17      4|3     0|
620        * +---------+---------+-------+
621        * |  Max Z  |  Min Z  | ZMask |
622        */
623       return ((zmax & 0x3FFF) << 18) |
624              ((zmin & 0x3FFF) << 4) |
625              ((zmask & 0xF) << 0);
626    } else {
627       /* Z+S HTILE is laid out as-follows:
628        * |31       12|11 10|9    8|7   6|5   4|3     0|
629        * +-----------+-----+------+-----+-----+-------+
630        * |  Z Range  |     | SMem | SR1 | SR0 | ZMask |
631        *
632        * The base value for zRange is either zMax or zMin, depending on ZRANGE_PRECISION.
633        * For a fast clear, zMin == zMax == clearValue. This means that the base will
634        * always be the clear value (converted to 14-bit UINT).
635        *
636        * When abs(zMax-zMin) < 16, the delta is equal to the difference. In the case of
637        * fast clears, where zMax == zMin, the delta is always zero.
638        */
639       const uint32_t delta = 0;
640       const uint32_t zrange = (zmax << 6) | delta;
641 
642       /* SResults 0 & 1 are set based on the stencil compare state.
643        * For fast-clear, the default value of sr0 and sr1 are both 0x3.
644        */
645       const uint32_t sresults = 0xf;
646 
647       return ((zrange & 0xFFFFF) << 12) |
648              ((smem & 0x3) <<  8) |
649              ((sresults & 0xF) <<  4) |
650              ((zmask & 0xF) <<  0);
651    }
652 }
653 
si_can_fast_clear_depth(struct si_texture * zstex,unsigned level,float depth,unsigned buffers)654 static bool si_can_fast_clear_depth(struct si_texture *zstex, unsigned level, float depth,
655                                     unsigned buffers)
656 {
657    /* TC-compatible HTILE only supports depth clears to 0 or 1. */
658    return buffers & PIPE_CLEAR_DEPTH &&
659           si_htile_enabled(zstex, level, PIPE_MASK_Z) &&
660           (!zstex->tc_compatible_htile || depth == 0 || depth == 1);
661 }
662 
si_can_fast_clear_stencil(struct si_texture * zstex,unsigned level,uint8_t stencil,unsigned buffers)663 static bool si_can_fast_clear_stencil(struct si_texture *zstex, unsigned level, uint8_t stencil,
664                                       unsigned buffers)
665 {
666    /* TC-compatible HTILE only supports stencil clears to 0. */
667    return buffers & PIPE_CLEAR_STENCIL &&
668           si_htile_enabled(zstex, level, PIPE_MASK_S) &&
669           (!zstex->tc_compatible_htile || stencil == 0);
670 }
671 
si_fast_clear(struct si_context * sctx,unsigned * buffers,const union pipe_color_union * color,float depth,uint8_t stencil)672 static void si_fast_clear(struct si_context *sctx, unsigned *buffers,
673                           const union pipe_color_union *color, float depth, uint8_t stencil)
674 {
675    struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
676    struct si_clear_info info[8 * 3 + 1]; /* MRTs * (CMASK + DCC + clear_dcc_single) + ZS */
677    unsigned num_clears = 0;
678    unsigned clear_types = 0;
679    unsigned num_pixels = fb->width * fb->height;
680 
681    assert(sctx->gfx_level < GFX12);
682 
683    /* This function is broken in BE, so just disable this path for now */
684 #if UTIL_ARCH_BIG_ENDIAN
685    return;
686 #endif
687 
688    /* Gather information about what to clear. */
689    unsigned color_buffer_mask = (*buffers & PIPE_CLEAR_COLOR) >> util_logbase2(PIPE_CLEAR_COLOR0);
690    while (color_buffer_mask) {
691       unsigned i = u_bit_scan(&color_buffer_mask);
692 
693       struct si_texture *tex = (struct si_texture *)fb->cbufs[i]->texture;
694       unsigned level = fb->cbufs[i]->u.tex.level;
695       unsigned num_layers = util_num_layers(&tex->buffer.b.b, level);
696 
697       /* the clear is allowed if all layers are bound */
698       if (fb->cbufs[i]->u.tex.first_layer != 0 ||
699           fb->cbufs[i]->u.tex.last_layer != num_layers - 1) {
700          continue;
701       }
702 
703       /* We can change the micro tile mode before a full clear. */
704       /* This is only used for MSAA textures when clearing all layers. */
705       si_set_optimal_micro_tile_mode(sctx->screen, tex);
706 
707       if (tex->swap_rgb_to_bgr_on_next_clear) {
708          assert(!tex->swap_rgb_to_bgr);
709          assert(tex->buffer.b.b.nr_samples >= 2);
710          tex->swap_rgb_to_bgr = true;
711          tex->swap_rgb_to_bgr_on_next_clear = false;
712 
713          /* Update all sampler views and images. */
714          p_atomic_inc(&sctx->screen->dirty_tex_counter);
715       }
716 
717       /* only supported on tiled surfaces */
718       if (tex->surface.is_linear) {
719          continue;
720       }
721 
722       /* Use a slow clear for small surfaces where the cost of
723        * the eliminate pass can be higher than the benefit of fast
724        * clear. The closed driver does this, but the numbers may differ.
725        *
726        * This helps on both dGPUs and APUs, even small APUs like Mullins.
727        */
728       bool fb_too_small = (uint64_t)num_pixels * num_layers <= 512 * 512;
729       bool too_small = tex->buffer.b.b.nr_samples <= 1 && fb_too_small;
730       bool eliminate_needed = false;
731       bool fmask_decompress_needed = false;
732       bool need_dirtying_fb = false;
733 
734       /* Try to clear DCC first, otherwise try CMASK. */
735       if (vi_dcc_enabled(tex, level)) {
736          uint32_t reset_value;
737 
738          if (sctx->screen->debug_flags & DBG(NO_DCC_CLEAR))
739             continue;
740 
741          if (sctx->gfx_level >= GFX11) {
742             if (!gfx11_get_dcc_clear_parameters(sctx->screen, tex, level, fb->cbufs[i]->format,
743                                                 color, &reset_value, true))
744                continue;
745          } else {
746             if (!gfx8_get_dcc_clear_parameters(sctx->screen, tex->buffer.b.b.format,
747                                                fb->cbufs[i]->format, color, &reset_value,
748                                                &eliminate_needed))
749                continue;
750          }
751 
752          /* Shared textures can't use fast clear without an explicit flush
753           * because the clear color is not exported.
754           *
755           * Chips without DCC constant encoding must set the clear color registers
756           * correctly even if the fast clear eliminate pass is not needed.
757           */
758          if ((eliminate_needed || !sctx->screen->info.has_dcc_constant_encode) &&
759              tex->buffer.b.is_shared &&
760              !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
761             continue;
762 
763          if (eliminate_needed && too_small)
764             continue;
765 
766          /* We can clear any level, but we only set up the clear value registers for the first
767           * level. Therefore, all other levels can be cleared only if the clear value registers
768           * are not used, which is only the case with DCC constant encoding and 0/1 clear values.
769           */
770          if (level > 0 && (eliminate_needed || !sctx->screen->info.has_dcc_constant_encode))
771             continue;
772 
773          if (tex->buffer.b.b.nr_samples >= 2 && eliminate_needed &&
774              !sctx->screen->allow_dcc_msaa_clear_to_reg_for_bpp[util_logbase2(tex->surface.bpe)])
775             continue;
776 
777          assert(num_clears < ARRAY_SIZE(info));
778 
779          if (!vi_dcc_get_clear_info(sctx, tex, level, reset_value, &info[num_clears]))
780             continue;
781 
782          num_clears++;
783          clear_types |= SI_CLEAR_TYPE_DCC;
784 
785          si_mark_display_dcc_dirty(sctx, tex);
786 
787          if (sctx->gfx_level >= GFX11 && reset_value == GFX11_DCC_CLEAR_SINGLE) {
788             /* Put this clear first by moving other clears after it because this clear has
789              * the most GPU overhead.
790              */
791             if (num_clears)
792                memmove(&info[1], &info[0], sizeof(info[0]) * num_clears);
793 
794             si_init_clear_image_dcc_single(&info[0], tex, level, fb->cbufs[i]->format,
795                                            color);
796             num_clears++;
797          }
798 
799          /* DCC fast clear with MSAA should clear CMASK to 0xC. */
800          if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
801             assert(sctx->gfx_level < GFX11); /* no FMASK/CMASK on GFX11 */
802             assert(num_clears < ARRAY_SIZE(info));
803             si_init_buffer_clear(&info[num_clears++], &tex->cmask_buffer->b.b,
804                                  tex->surface.cmask_offset, tex->surface.cmask_size, 0xCCCCCCCC);
805             clear_types |= SI_CLEAR_TYPE_CMASK;
806             fmask_decompress_needed = true;
807          }
808       } else {
809          /* No CMASK on GFX11. */
810          if (sctx->gfx_level >= GFX11)
811             continue;
812 
813          if (level > 0)
814             continue;
815 
816          /* Shared textures can't use fast clear without an explicit flush
817           * because the clear color is not exported.
818           */
819          if (tex->buffer.b.is_shared &&
820              !(tex->buffer.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH))
821             continue;
822 
823          if (too_small)
824             continue;
825 
826          /* 128-bit formats are unsupported */
827          if (tex->surface.bpe > 8) {
828             continue;
829          }
830 
831          /* RB+ doesn't work with CMASK fast clear on Stoney. */
832          if (sctx->family == CHIP_STONEY)
833             continue;
834 
835          /* Disable fast clear if tex is encrypted */
836          if (tex->buffer.flags & RADEON_FLAG_ENCRYPTED)
837             continue;
838 
839          uint64_t cmask_offset = 0;
840          unsigned clear_size = 0;
841          bool had_cmask_buffer = tex->cmask_buffer != NULL;
842 
843          if (sctx->gfx_level >= GFX10) {
844             assert(level == 0);
845 
846             /* Clearing CMASK with both multiple levels and multiple layers is not
847              * implemented.
848              */
849             if (num_layers > 1 && tex->buffer.b.b.last_level > 0)
850                continue;
851 
852             if (!si_alloc_separate_cmask(sctx->screen, tex))
853                continue;
854 
855             if (num_layers == 1) {
856                /* Clear level 0. */
857                cmask_offset = tex->surface.cmask_offset + tex->surface.u.gfx9.color.cmask_level0.offset;
858                clear_size = tex->surface.u.gfx9.color.cmask_level0.size;
859             } else if (tex->buffer.b.b.last_level == 0) {
860                /* Clear all layers having only 1 level. */
861                cmask_offset = tex->surface.cmask_offset;
862                clear_size = tex->surface.cmask_size;
863             } else {
864                assert(0); /* this is prevented above */
865             }
866          } else if (sctx->gfx_level == GFX9) {
867             /* TODO: Implement CMASK fast clear for level 0 of mipmapped textures. Mipmapped
868              * CMASK has to clear a rectangular area of CMASK for level 0 (because the whole
869              * miptree is organized in a 2D plane).
870              */
871             if (tex->buffer.b.b.last_level > 0)
872                continue;
873 
874             if (!si_alloc_separate_cmask(sctx->screen, tex))
875                continue;
876 
877             cmask_offset = tex->surface.cmask_offset;
878             clear_size = tex->surface.cmask_size;
879          } else {
880             if (!si_alloc_separate_cmask(sctx->screen, tex))
881                continue;
882 
883             /* GFX6-8: This only covers mipmap level 0. */
884             cmask_offset = tex->surface.cmask_offset;
885             clear_size = tex->surface.cmask_size;
886          }
887 
888          /* Do the fast clear. */
889          assert(num_clears < ARRAY_SIZE(info));
890          si_init_buffer_clear(&info[num_clears++], &tex->cmask_buffer->b.b,
891                               cmask_offset, clear_size, 0);
892          clear_types |= SI_CLEAR_TYPE_CMASK;
893          eliminate_needed = true;
894          /* If we allocated a cmask buffer for this tex we need to re-emit
895           * the fb state.
896           */
897          need_dirtying_fb = !had_cmask_buffer;
898       }
899 
900       if ((eliminate_needed || fmask_decompress_needed) &&
901           !(tex->dirty_level_mask & (1 << level))) {
902          assert(sctx->gfx_level < GFX11); /* no decompression needed on GFX11 */
903          tex->dirty_level_mask |= 1 << level;
904          p_atomic_inc(&sctx->screen->compressed_colortex_counter);
905       }
906 
907       *buffers &= ~(PIPE_CLEAR_COLOR0 << i);
908 
909       /* Chips with DCC constant encoding don't need to set the clear
910        * color registers for DCC clear values 0 and 1.
911        */
912       if (sctx->screen->info.has_dcc_constant_encode && !eliminate_needed)
913          continue;
914 
915       /* There are no clear color registers on GFX11. */
916       assert(sctx->gfx_level < GFX11);
917 
918       if (si_set_clear_color(tex, fb->cbufs[i]->format, color) || need_dirtying_fb) {
919          sctx->framebuffer.dirty_cbufs |= 1 << i;
920          si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
921       }
922    }
923 
924    /* Depth/stencil clears. */
925    struct pipe_surface *zsbuf = fb->zsbuf;
926    struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL;
927    unsigned zs_num_layers = zstex ? util_num_layers(&zstex->buffer.b.b, zsbuf->u.tex.level) : 0;
928 
929    if (zstex && zsbuf->u.tex.first_layer == 0 &&
930        zsbuf->u.tex.last_layer == zs_num_layers - 1 &&
931        si_htile_enabled(zstex, zsbuf->u.tex.level, PIPE_MASK_ZS)) {
932       unsigned level = zsbuf->u.tex.level;
933       bool update_db_depth_clear = false;
934       bool update_db_stencil_clear = false;
935       bool fb_too_small = num_pixels * zs_num_layers <= 512 * 512;
936 
937       /* Transition from TC-incompatible to TC-compatible HTILE if requested. */
938       if (zstex->enable_tc_compatible_htile_next_clear) {
939           /* If both depth and stencil are present, they must be cleared together. */
940          if ((*buffers & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL ||
941              (*buffers & PIPE_CLEAR_DEPTH && (!zstex->surface.has_stencil ||
942                                               zstex->htile_stencil_disabled))) {
943             /* The conversion from TC-incompatible to TC-compatible can only be done in one clear. */
944             assert(zstex->buffer.b.b.last_level == 0);
945             assert(!zstex->tc_compatible_htile);
946 
947             /* Enable TC-compatible HTILE. */
948             zstex->enable_tc_compatible_htile_next_clear = false;
949             zstex->tc_compatible_htile = true;
950 
951             /* Update the framebuffer state to reflect the change. */
952             sctx->framebuffer.DB_has_shader_readable_metadata = true;
953             sctx->framebuffer.dirty_zsbuf = true;
954             si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
955 
956             /* Update all sampler views and shader images in all contexts. */
957             p_atomic_inc(&sctx->screen->dirty_tex_counter);
958 
959             /* Perform the clear here if possible, else clear to uncompressed. */
960             uint32_t clear_value;
961 
962             if (zstex->htile_stencil_disabled || !zstex->surface.has_stencil) {
963                if (si_can_fast_clear_depth(zstex, level, depth, *buffers)) {
964                   /* Z-only clear. */
965                   clear_value = si_get_htile_clear_value(zstex, depth);
966                   *buffers &= ~PIPE_CLEAR_DEPTH;
967                   zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
968                   zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
969                   update_db_depth_clear = true;
970                }
971             } else if ((*buffers & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL) {
972                if (si_can_fast_clear_depth(zstex, level, depth, *buffers) &&
973                    si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) {
974                   /* Combined Z+S clear. */
975                   clear_value = si_get_htile_clear_value(zstex, depth);
976                   *buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
977                   zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
978                   zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
979                   zstex->stencil_cleared_level_mask_once |= BITFIELD_BIT(level);
980                   update_db_depth_clear = true;
981                   update_db_stencil_clear = true;
982                }
983             }
984 
985             if (!update_db_depth_clear) {
986                /* Clear to uncompressed, so that it doesn't contain values incompatible
987                 * with the new TC-compatible HTILE setting.
988                 *
989                 * 0xfffff30f = uncompressed Z + S
990                 * 0xfffc000f = uncompressed Z only
991                 */
992                clear_value = !zstex->htile_stencil_disabled ? 0xfffff30f : 0xfffc000f;
993             }
994 
995             zstex->need_flush_after_depth_decompression = sctx->gfx_level == GFX10_3;
996 
997             assert(num_clears < ARRAY_SIZE(info));
998             si_init_buffer_clear(&info[num_clears++], &zstex->buffer.b.b,
999                                  zstex->surface.meta_offset, zstex->surface.meta_size, clear_value);
1000             clear_types |= SI_CLEAR_TYPE_HTILE;
1001          }
1002       } else if (num_clears || !fb_too_small) {
1003          /* This is where the HTILE buffer clear is done.
1004           *
1005           * If there is no clear scheduled and the framebuffer size is too small, we should use
1006           * the draw-based clear that is without waits. If there is some other clear scheduled,
1007           * we will have to wait anyway, so add the HTILE buffer clear to the batch here.
1008           * If the framebuffer size is large enough, use this codepath too.
1009           */
1010          uint64_t htile_offset = zstex->surface.meta_offset;
1011          unsigned htile_size = 0;
1012 
1013          /* Determine the HTILE subset to clear. */
1014          if (sctx->gfx_level >= GFX10) {
1015             /* This can only clear a layered texture with 1 level or a mipmap texture
1016              * with 1 layer. Other cases are unimplemented.
1017              */
1018             if (zs_num_layers == 1) {
1019                /* Clear a specific level. */
1020                htile_offset += zstex->surface.u.gfx9.meta_levels[level].offset;
1021                htile_size = zstex->surface.u.gfx9.meta_levels[level].size;
1022             } else if (zstex->buffer.b.b.last_level == 0) {
1023                /* Clear all layers having only 1 level. */
1024                htile_size = zstex->surface.meta_size;
1025             }
1026          } else {
1027             /* This can only clear a layered texture with 1 level. Other cases are
1028              * unimplemented.
1029              */
1030             if (zstex->buffer.b.b.last_level == 0)
1031                htile_size = zstex->surface.meta_size;
1032          }
1033 
1034          /* Perform the clear if it's possible. */
1035          if (zstex->htile_stencil_disabled || !zstex->surface.has_stencil) {
1036             if (htile_size &&
1037                 si_can_fast_clear_depth(zstex, level, depth, *buffers)) {
1038                /* Z-only clear. */
1039                assert(num_clears < ARRAY_SIZE(info));
1040                si_init_buffer_clear(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
1041                                     htile_size, si_get_htile_clear_value(zstex, depth));
1042                clear_types |= SI_CLEAR_TYPE_HTILE;
1043                *buffers &= ~PIPE_CLEAR_DEPTH;
1044                zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
1045                zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
1046                update_db_depth_clear = true;
1047             }
1048          } else if ((*buffers & PIPE_CLEAR_DEPTHSTENCIL) == PIPE_CLEAR_DEPTHSTENCIL) {
1049             if (htile_size &&
1050                 si_can_fast_clear_depth(zstex, level, depth, *buffers) &&
1051                 si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) {
1052                /* Combined Z+S clear. */
1053                assert(num_clears < ARRAY_SIZE(info));
1054                si_init_buffer_clear(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
1055                                     htile_size, si_get_htile_clear_value(zstex, depth));
1056                clear_types |= SI_CLEAR_TYPE_HTILE;
1057                *buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
1058                zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
1059                zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
1060                zstex->stencil_cleared_level_mask_once |= BITFIELD_BIT(level);
1061                update_db_depth_clear = true;
1062                update_db_stencil_clear = true;
1063             }
1064          } else {
1065             /* Z-only or S-only clear when both Z/S are present using a read-modify-write
1066              * compute shader.
1067              *
1068              * If we get both clears but only one of them can be fast-cleared, we use
1069              * the draw-based fast clear to do both at the same time.
1070              */
1071             const uint32_t htile_depth_writemask = 0xfffffc0f;
1072             const uint32_t htile_stencil_writemask = 0x000003f0;
1073 
1074             if (htile_size &&
1075                 !(*buffers & PIPE_CLEAR_STENCIL) &&
1076                 si_can_fast_clear_depth(zstex, level, depth, *buffers)) {
1077                /* Z-only clear with stencil left intact. */
1078                assert(num_clears < ARRAY_SIZE(info));
1079                si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
1080                                         htile_size, si_get_htile_clear_value(zstex, depth),
1081                                         htile_depth_writemask);
1082                clear_types |= SI_CLEAR_TYPE_HTILE;
1083                *buffers &= ~PIPE_CLEAR_DEPTH;
1084                zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(level);
1085                zstex->depth_cleared_level_mask |= BITFIELD_BIT(level);
1086                update_db_depth_clear = true;
1087             } else if (htile_size &&
1088                        !(*buffers & PIPE_CLEAR_DEPTH) &&
1089                        si_can_fast_clear_stencil(zstex, level, stencil, *buffers)) {
1090                /* Stencil-only clear with depth left intact. */
1091                assert(num_clears < ARRAY_SIZE(info));
1092                si_init_buffer_clear_rmw(&info[num_clears++], &zstex->buffer.b.b, htile_offset,
1093                                         htile_size, si_get_htile_clear_value(zstex, depth),
1094                                         htile_stencil_writemask);
1095                clear_types |= SI_CLEAR_TYPE_HTILE;
1096                *buffers &= ~PIPE_CLEAR_STENCIL;
1097                zstex->stencil_cleared_level_mask_once |= BITFIELD_BIT(level);
1098                update_db_stencil_clear = true;
1099             }
1100          }
1101 
1102          zstex->need_flush_after_depth_decompression = update_db_depth_clear && sctx->gfx_level == GFX10_3;
1103       }
1104 
1105       /* Update DB_DEPTH_CLEAR. */
1106       if (update_db_depth_clear &&
1107           zstex->depth_clear_value[level] != (float)depth) {
1108          zstex->depth_clear_value[level] = depth;
1109          sctx->framebuffer.dirty_zsbuf = true;
1110          si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
1111       }
1112 
1113       /* Update DB_STENCIL_CLEAR. */
1114       if (update_db_stencil_clear &&
1115           zstex->stencil_clear_value[level] != stencil) {
1116          zstex->stencil_clear_value[level] = stencil;
1117          sctx->framebuffer.dirty_zsbuf = true;
1118          si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
1119       }
1120    }
1121 
1122    si_execute_clears(sctx, info, num_clears, clear_types, sctx->render_cond_enabled);
1123 }
1124 
si_fb_clear_via_compute(struct si_context * sctx,unsigned * buffers,const union pipe_color_union * color)1125 static void si_fb_clear_via_compute(struct si_context *sctx, unsigned *buffers,
1126                                     const union pipe_color_union *color)
1127 {
1128    struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
1129 
1130    unsigned color_buffer_mask = (*buffers & PIPE_CLEAR_COLOR) >> util_logbase2(PIPE_CLEAR_COLOR0);
1131    while (color_buffer_mask) {
1132       unsigned i = u_bit_scan(&color_buffer_mask);
1133 
1134       struct pipe_surface *surf = fb->cbufs[i];
1135       unsigned depth = surf->u.tex.last_layer - surf->u.tex.first_layer + 1;
1136       struct si_texture *tex = (struct si_texture *)surf->texture;
1137 
1138       /* If DCC is enable (which can happen with thick tiling on gfx8, don't use compute to get
1139        * compressed clears.
1140        */
1141       if (vi_dcc_enabled(tex, surf->u.tex.level))
1142          continue;
1143 
1144       /* Clears of thick and linear layouts are fastest with compute. */
1145       if (tex->surface.thick_tiling ||
1146           (tex->surface.is_linear && (surf->height > 1 || depth > 1 || surf->width >= 8192))) {
1147          struct pipe_box box;
1148 
1149          u_box_3d(0, 0, surf->u.tex.first_layer, surf->width, surf->height, depth, &box);
1150 
1151          if (si_compute_clear_image(sctx, &tex->buffer.b.b, surf->format, surf->u.tex.level, &box,
1152                                     color, sctx->render_cond_enabled, true))
1153             *buffers &= ~(PIPE_CLEAR_COLOR0 << i); /* success */
1154       }
1155    }
1156 }
1157 
gfx6_clear(struct pipe_context * ctx,unsigned buffers,const struct pipe_scissor_state * scissor_state,const union pipe_color_union * color,double depth,unsigned stencil)1158 static void gfx6_clear(struct pipe_context *ctx, unsigned buffers,
1159                        const struct pipe_scissor_state *scissor_state,
1160                        const union pipe_color_union *color, double depth, unsigned stencil)
1161 {
1162    struct si_context *sctx = (struct si_context *)ctx;
1163    struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
1164    struct pipe_surface *zsbuf = fb->zsbuf;
1165    struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL;
1166 
1167    /* Unset clear flags for non-existent buffers. */
1168    for (unsigned i = 0; i < 8; i++) {
1169       if (i >= fb->nr_cbufs || !fb->cbufs[i])
1170          buffers &= ~(PIPE_CLEAR_COLOR0 << i);
1171    }
1172    if (!zsbuf)
1173       buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
1174    else if (!util_format_has_stencil(util_format_description(zsbuf->format)))
1175       buffers &= ~PIPE_CLEAR_STENCIL;
1176 
1177    si_fast_clear(sctx, &buffers, color, depth, stencil);
1178    if (!buffers)
1179       return; /* all buffers have been cleared */
1180 
1181    si_fb_clear_via_compute(sctx, &buffers, color);
1182    if (!buffers)
1183       return; /* all buffers have been cleared */
1184 
1185    if (buffers & PIPE_CLEAR_COLOR) {
1186       /* These buffers cannot use fast clear, make sure to disable expansion. */
1187       unsigned color_buffer_mask = (buffers & PIPE_CLEAR_COLOR) >> util_logbase2(PIPE_CLEAR_COLOR0);
1188       while (color_buffer_mask) {
1189          unsigned i = u_bit_scan(&color_buffer_mask);
1190          struct si_texture *tex = (struct si_texture *)fb->cbufs[i]->texture;
1191          if (tex->surface.fmask_size == 0)
1192             tex->dirty_level_mask &= ~(1 << fb->cbufs[i]->u.tex.level);
1193       }
1194    }
1195 
1196    if (zstex && zsbuf->u.tex.first_layer == 0 &&
1197        zsbuf->u.tex.last_layer == util_max_layer(&zstex->buffer.b.b, 0)) {
1198       unsigned level = zsbuf->u.tex.level;
1199 
1200       if (si_can_fast_clear_depth(zstex, level, depth, buffers)) {
1201          /* Need to disable EXPCLEAR temporarily if clearing
1202           * to a new value. */
1203          if (!(zstex->depth_cleared_level_mask_once & BITFIELD_BIT(level)) ||
1204              zstex->depth_clear_value[level] != depth) {
1205             sctx->db_depth_disable_expclear = true;
1206          }
1207 
1208          if (zstex->depth_clear_value[level] != (float)depth) {
1209             if ((zstex->depth_clear_value[level] != 0) != (depth != 0)) {
1210                /* ZRANGE_PRECISION register of a bound surface will change so we
1211                 * must flush the DB caches. */
1212                sctx->barrier_flags |= SI_BARRIER_SYNC_AND_INV_DB;
1213                si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
1214             }
1215             /* Update DB_DEPTH_CLEAR. */
1216             zstex->depth_clear_value[level] = depth;
1217             sctx->framebuffer.dirty_zsbuf = true;
1218             si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
1219          }
1220          sctx->db_depth_clear = true;
1221          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1222       }
1223 
1224       if (si_can_fast_clear_stencil(zstex, level, stencil, buffers)) {
1225          stencil &= 0xff;
1226 
1227          /* Need to disable EXPCLEAR temporarily if clearing
1228           * to a new value. */
1229          if (!(zstex->stencil_cleared_level_mask_once & BITFIELD_BIT(level)) ||
1230              zstex->stencil_clear_value[level] != stencil) {
1231             sctx->db_stencil_disable_expclear = true;
1232          }
1233 
1234          if (zstex->stencil_clear_value[level] != (uint8_t)stencil) {
1235             /* Update DB_STENCIL_CLEAR. */
1236             zstex->stencil_clear_value[level] = stencil;
1237             sctx->framebuffer.dirty_zsbuf = true;
1238             si_mark_atom_dirty(sctx, &sctx->atoms.s.framebuffer);
1239          }
1240          sctx->db_stencil_clear = true;
1241          si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1242       }
1243 
1244       /* TODO: This hack fixes dEQP-GLES[23].functional.fragment_ops.random.* on Navi31.
1245        * The root cause is unknown.
1246        */
1247       if (sctx->gfx_level == GFX11 || sctx->gfx_level == GFX11_5) {
1248          sctx->barrier_flags |= SI_BARRIER_SYNC_VS;
1249          si_mark_atom_dirty(sctx, &sctx->atoms.s.barrier);
1250       }
1251    }
1252 
1253    if (unlikely(sctx->sqtt_enabled)) {
1254       if (buffers & PIPE_CLEAR_COLOR)
1255          sctx->sqtt_next_event = EventCmdClearColorImage;
1256       else if (buffers & PIPE_CLEAR_DEPTHSTENCIL)
1257          sctx->sqtt_next_event = EventCmdClearDepthStencilImage;
1258    }
1259 
1260    si_blitter_begin(sctx, SI_CLEAR);
1261    util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb),
1262                       buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1);
1263    si_blitter_end(sctx);
1264 
1265    if (sctx->db_depth_clear) {
1266       sctx->db_depth_clear = false;
1267       sctx->db_depth_disable_expclear = false;
1268       zstex->depth_cleared_level_mask_once |= BITFIELD_BIT(zsbuf->u.tex.level);
1269       zstex->depth_cleared_level_mask |= BITFIELD_BIT(zsbuf->u.tex.level);
1270       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1271    }
1272 
1273    if (sctx->db_stencil_clear) {
1274       sctx->db_stencil_clear = false;
1275       sctx->db_stencil_disable_expclear = false;
1276       zstex->stencil_cleared_level_mask_once |= BITFIELD_BIT(zsbuf->u.tex.level);
1277       si_mark_atom_dirty(sctx, &sctx->atoms.s.db_render_state);
1278    }
1279 }
1280 
gfx12_clear(struct pipe_context * ctx,unsigned buffers,const struct pipe_scissor_state * scissor_state,const union pipe_color_union * color,double depth,unsigned stencil)1281 static void gfx12_clear(struct pipe_context *ctx, unsigned buffers,
1282                         const struct pipe_scissor_state *scissor_state,
1283                         const union pipe_color_union *color, double depth, unsigned stencil)
1284 {
1285    struct si_context *sctx = (struct si_context *)ctx;
1286    struct pipe_framebuffer_state *fb = &sctx->framebuffer.state;
1287    struct pipe_surface *zsbuf = fb->zsbuf;
1288    struct si_texture *zstex = zsbuf ? (struct si_texture *)zsbuf->texture : NULL;
1289 
1290    /* Unset clear flags for non-existent buffers. */
1291    for (unsigned i = 0; i < 8; i++) {
1292       if (i >= fb->nr_cbufs || !fb->cbufs[i])
1293          buffers &= ~(PIPE_CLEAR_COLOR0 << i);
1294    }
1295    if (!zsbuf)
1296       buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
1297    else if (!util_format_has_stencil(util_format_description(zsbuf->format)))
1298       buffers &= ~PIPE_CLEAR_STENCIL;
1299 
1300    if (unlikely(sctx->sqtt_enabled)) {
1301       if (buffers & PIPE_CLEAR_COLOR)
1302          sctx->sqtt_next_event = EventCmdClearColorImage;
1303       else if (buffers & PIPE_CLEAR_DEPTHSTENCIL)
1304          sctx->sqtt_next_event = EventCmdClearDepthStencilImage;
1305    }
1306 
1307    si_blitter_begin(sctx, SI_CLEAR);
1308    util_blitter_clear(sctx->blitter, fb->width, fb->height, util_framebuffer_get_num_layers(fb),
1309                       buffers, color, depth, stencil, sctx->framebuffer.nr_samples > 1);
1310    si_blitter_end(sctx);
1311 
1312    /* This is only used by the driver, not the hw. */
1313    if (buffers & PIPE_CLEAR_DEPTH) {
1314       zstex->depth_cleared_level_mask |= BITFIELD_BIT(zsbuf->u.tex.level);
1315       zstex->depth_clear_value[zsbuf->u.tex.level] = depth;
1316    }
1317 }
1318 
si_try_normal_clear(struct si_context * sctx,struct pipe_surface * dst,unsigned dstx,unsigned dsty,unsigned width,unsigned height,bool render_condition_enabled,unsigned buffers,const union pipe_color_union * color,float depth,unsigned stencil)1319 static bool si_try_normal_clear(struct si_context *sctx, struct pipe_surface *dst,
1320                                 unsigned dstx, unsigned dsty, unsigned width, unsigned height,
1321                                 bool render_condition_enabled, unsigned buffers,
1322                                 const union pipe_color_union *color,
1323                                 float depth, unsigned stencil)
1324 {
1325    /* This is worth it only if it's a whole image clear. */
1326    if (dstx == 0 && dsty == 0 &&
1327        width == dst->width &&
1328        height == dst->height &&
1329        dst->u.tex.first_layer == 0 &&
1330        dst->u.tex.last_layer == util_max_layer(dst->texture, dst->u.tex.level) &&
1331        /* pipe->clear honors render_condition, so only use it if it's unset or if it's set and enabled. */
1332        (!sctx->render_cond || render_condition_enabled) &&
1333        sctx->has_graphics) {
1334       struct pipe_context *ctx = &sctx->b;
1335       struct pipe_framebuffer_state saved_fb = {}, fb = {};
1336 
1337       util_copy_framebuffer_state(&saved_fb, &sctx->framebuffer.state);
1338 
1339       if (buffers & PIPE_CLEAR_COLOR) {
1340          fb.cbufs[0] = dst;
1341          fb.nr_cbufs = 1;
1342       } else {
1343          fb.zsbuf = dst;
1344       }
1345 
1346       fb.width = dst->width;
1347       fb.height = dst->height;
1348 
1349       ctx->set_framebuffer_state(ctx, &fb);
1350       ctx->clear(ctx, buffers, NULL, color, depth, stencil);
1351       ctx->set_framebuffer_state(ctx, &saved_fb);
1352 
1353       util_copy_framebuffer_state(&saved_fb, NULL);
1354 
1355       return true;
1356    }
1357 
1358    return false;
1359 }
1360 
si_compute_fast_clear_image(struct si_context * sctx,struct pipe_resource * dst,enum pipe_format format,unsigned level,const struct pipe_box * box,const union pipe_color_union * color,bool render_condition_enable,bool fail_if_slow)1361 bool si_compute_fast_clear_image(struct si_context *sctx, struct pipe_resource *dst,
1362                                  enum pipe_format format, unsigned level, const struct pipe_box *box,
1363                                  const union pipe_color_union *color, bool render_condition_enable,
1364                                  bool fail_if_slow)
1365 {
1366    struct si_texture *sdst = (struct si_texture*)dst;
1367 
1368    if (!vi_dcc_enabled(sdst, level))
1369       return false;
1370 
1371    /* Only the whole image can be cleared. */
1372    if (box->x != 0 || box->y != 0 || box->width != u_minify(dst->width0, level) ||
1373        box->height != u_minify(dst->height0, level) || box->depth != util_num_layers(dst, level))
1374       return false;
1375 
1376    uint32_t dcc_value;
1377    bool eliminate_needed;
1378 
1379    /* Get the DCC clear value. */
1380    if (sctx->gfx_level >= GFX11) {
1381       if (!gfx11_get_dcc_clear_parameters(sctx->screen, sdst, level, format,
1382                                           color, &dcc_value, fail_if_slow))
1383          return false;
1384    } else {
1385       if (!gfx8_get_dcc_clear_parameters(sctx->screen, dst->format, format, color, &dcc_value,
1386                                          &eliminate_needed) ||
1387           eliminate_needed)
1388          return false;
1389    }
1390 
1391    /* Get DCC clear info. */
1392    struct si_clear_info info[3]; /* DCC + CMASK + clear_image_dcc_single */
1393    unsigned num_clears = 0, clear_types = 0;
1394 
1395    if (!vi_dcc_get_clear_info(sctx, sdst, level, dcc_value, &info[num_clears]))
1396       return false;
1397 
1398    num_clears++;
1399    clear_types |= SI_CLEAR_TYPE_DCC;
1400    si_mark_display_dcc_dirty(sctx, sdst);
1401 
1402    if (sctx->gfx_level >= GFX11 && dcc_value == GFX11_DCC_CLEAR_SINGLE) {
1403       /* Put this clear first by moving other clears after it because this clear has
1404        * the most GPU overhead.
1405        */
1406       memmove(&info[1], &info[0], sizeof(info[0]) * num_clears);
1407       si_init_clear_image_dcc_single(&info[0], sdst, level, format, color);
1408       num_clears++;
1409    }
1410 
1411    /* DCC fast clear with MSAA should clear CMASK to 0xC. */
1412    if (dst->nr_samples >= 2 && sdst->cmask_buffer) {
1413       assert(sctx->gfx_level < GFX11); /* no FMASK/CMASK on GFX11 */
1414       assert(num_clears < ARRAY_SIZE(info));
1415       si_init_buffer_clear(&info[num_clears++], &sdst->cmask_buffer->b.b,
1416                            sdst->surface.cmask_offset, sdst->surface.cmask_size, 0xCCCCCCCC);
1417       clear_types |= SI_CLEAR_TYPE_CMASK;
1418 
1419       if (!(sdst->dirty_level_mask & BITFIELD_BIT(level))) {
1420          sdst->dirty_level_mask |= BITFIELD_BIT(level);
1421          p_atomic_inc(&sctx->screen->compressed_colortex_counter);
1422       }
1423    }
1424 
1425    assert(num_clears <= ARRAY_SIZE(info));
1426    si_execute_clears(sctx, info, num_clears, clear_types, render_condition_enable);
1427    return true;
1428 }
1429 
si_clear_render_target(struct pipe_context * ctx,struct pipe_surface * dst,const union pipe_color_union * color,unsigned dstx,unsigned dsty,unsigned width,unsigned height,bool render_condition_enabled)1430 static void si_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dst,
1431                                    const union pipe_color_union *color, unsigned dstx,
1432                                    unsigned dsty, unsigned width, unsigned height,
1433                                    bool render_condition_enabled)
1434 {
1435    struct si_context *sctx = (struct si_context *)ctx;
1436    struct si_texture *sdst = (struct si_texture *)dst->texture;
1437 
1438    /* For older chips that can do fast clear with any clear color (using GFX8_DCC_CLEAR_REG
1439     * or CMASK).
1440     */
1441    if (sctx->gfx_level <= GFX10_3 &&
1442        (vi_dcc_enabled(sdst, dst->u.tex.level) ||
1443         /* GFX6-9 allow CMASK without MSAA and allocate it on demand, but only 8-64bpp. */
1444         (sctx->gfx_level <= GFX9 && sdst->surface.bpe <= 8)) &&
1445        si_try_normal_clear(sctx, dst, dstx, dsty, width, height, render_condition_enabled,
1446                            PIPE_CLEAR_COLOR0, color, 0, 0))
1447       return;
1448 
1449    struct pipe_box box;
1450    u_box_3d(dstx, dsty, dst->u.tex.first_layer, width, height,
1451             dst->u.tex.last_layer - dst->u.tex.first_layer + 1, &box);
1452 
1453    if (si_compute_fast_clear_image(sctx, dst->texture, dst->format, dst->u.tex.level, &box, color,
1454                                    render_condition_enabled, true))
1455       return;
1456 
1457    if (si_compute_clear_image(sctx, dst->texture, dst->format, dst->u.tex.level, &box, color,
1458                               render_condition_enabled, true))
1459       return;
1460 
1461    si_gfx_clear_render_target(ctx, dst, color, dstx, dsty, width, height,
1462                               render_condition_enabled);
1463 }
1464 
si_gfx_clear_render_target(struct pipe_context * ctx,struct pipe_surface * dst,const union pipe_color_union * color,unsigned dstx,unsigned dsty,unsigned width,unsigned height,bool render_condition_enabled)1465 void si_gfx_clear_render_target(struct pipe_context *ctx, struct pipe_surface *dst,
1466                                 const union pipe_color_union *color, unsigned dstx,
1467                                 unsigned dsty, unsigned width, unsigned height,
1468                                 bool render_condition_enabled)
1469 {
1470    struct si_context *sctx = (struct si_context *)ctx;
1471 
1472    si_blitter_begin(sctx,
1473                     SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
1474    util_blitter_clear_render_target(sctx->blitter, dst, color, dstx, dsty, width, height);
1475    si_blitter_end(sctx);
1476 }
1477 
si_clear_depth_stencil(struct pipe_context * ctx,struct pipe_surface * dst,unsigned clear_flags,double depth,unsigned stencil,unsigned dstx,unsigned dsty,unsigned width,unsigned height,bool render_condition_enabled)1478 static void si_clear_depth_stencil(struct pipe_context *ctx, struct pipe_surface *dst,
1479                                    unsigned clear_flags, double depth, unsigned stencil,
1480                                    unsigned dstx, unsigned dsty, unsigned width, unsigned height,
1481                                    bool render_condition_enabled)
1482 {
1483    struct si_context *sctx = (struct si_context *)ctx;
1484    union pipe_color_union unused = {};
1485 
1486    /* Fast path that just clears HTILE. */
1487    if (si_try_normal_clear(sctx, dst, dstx, dsty, width, height, render_condition_enabled,
1488                            clear_flags, &unused, depth, stencil))
1489       return;
1490 
1491    si_blitter_begin(sctx,
1492                     SI_CLEAR_SURFACE | (render_condition_enabled ? 0 : SI_DISABLE_RENDER_COND));
1493    util_blitter_clear_depth_stencil(sctx->blitter, dst, clear_flags, depth, stencil, dstx, dsty,
1494                                     width, height);
1495    si_blitter_end(sctx);
1496 }
1497 
si_init_clear_functions(struct si_context * sctx)1498 void si_init_clear_functions(struct si_context *sctx)
1499 {
1500    sctx->b.clear_render_target = si_clear_render_target;
1501    sctx->b.clear_texture = u_default_clear_texture;
1502 
1503    if (sctx->has_graphics) {
1504       if (sctx->gfx_level >= GFX12)
1505          sctx->b.clear = gfx12_clear;
1506       else
1507          sctx->b.clear = gfx6_clear;
1508 
1509       sctx->b.clear_depth_stencil = si_clear_depth_stencil;
1510    }
1511 }
1512