xref: /aosp_15_r20/external/mesa3d/src/gallium/drivers/radeonsi/si_state_binning.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2017 Advanced Micro Devices, Inc.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 /* This file handles register programming of primitive binning. */
8 
9 #include "si_build_pm4.h"
10 #include "sid.h"
11 
/* A pair of unsigned values; used in this file to carry a bin size in pixels
 * (x = width, y = height).
 */
struct uvec2 {
   unsigned x;
   unsigned y;
};
15 
/* One row of a bin-size lookup table.
 *
 * A row applies to sums in the range [start, next_row.start). A row whose
 * bin_size_x is 0 terminates the table. The field order matters because the
 * tables below use positional initializers.
 */
struct si_bin_size_map {
   unsigned start;      /* lowest sum this row applies to */
   unsigned bin_size_x; /* bin width; 0 marks the end of the table */
   unsigned bin_size_y; /* bin height */
};

/* A per-chip-class group of tables: the first index is selected by
 * si_find_bin_size() from the log2 of the shader engine count, the second
 * index walks the rows (terminator included, 10 rows at most).
 * Arrays of this type are indexed by the log2 of the number of RBs per SE.
 */
typedef struct si_bin_size_map si_bin_size_subtable[3][10];
23 
24 /* Find the bin size where sum is >= table[i].start and < table[i + 1].start. */
si_find_bin_size(struct si_screen * sscreen,const si_bin_size_subtable table[],unsigned sum)25 static struct uvec2 si_find_bin_size(struct si_screen *sscreen, const si_bin_size_subtable table[],
26                                      unsigned sum)
27 {
28    unsigned log_num_rb_per_se =
29       util_logbase2_ceil(sscreen->info.max_render_backends / sscreen->info.max_se);
30    unsigned log_num_se = util_logbase2_ceil(sscreen->info.max_se);
31    unsigned i;
32 
33    /* Get the chip-specific subtable. */
34    const struct si_bin_size_map *subtable = &table[log_num_rb_per_se][log_num_se][0];
35 
36    for (i = 0; subtable[i].bin_size_x != 0; i++) {
37       if (sum >= subtable[i].start && sum < subtable[i + 1].start)
38          break;
39    }
40 
41    struct uvec2 size = {subtable[i].bin_size_x, subtable[i].bin_size_y};
42    return size;
43 }
44 
gfx9_get_color_bin_size(struct si_context * sctx,unsigned cb_target_enabled_4bit)45 static struct uvec2 gfx9_get_color_bin_size(struct si_context *sctx, unsigned cb_target_enabled_4bit)
46 {
47    unsigned num_fragments = sctx->framebuffer.nr_color_samples;
48    unsigned sum = 0;
49 
50    /* Compute the sum of all Bpp. */
51    for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
52       if (!(cb_target_enabled_4bit & (0xf << (i * 4))))
53          continue;
54 
55       struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
56       sum += tex->surface.bpe;
57    }
58 
59    /* Multiply the sum by some function of the number of samples. */
60    if (num_fragments >= 2) {
61       if (si_get_ps_iter_samples(sctx) >= 2)
62          sum *= num_fragments;
63       else
64          sum *= 2;
65    }
66 
67    static const si_bin_size_subtable table[] = {
68       {
69          /* One RB / SE */
70          {
71             /* One shader engine */
72             {0, 128, 128},
73             {1, 64, 128},
74             {2, 32, 128},
75             {3, 16, 128},
76             {17, 0, 0},
77          },
78          {
79             /* Two shader engines */
80             {0, 128, 128},
81             {2, 64, 128},
82             {3, 32, 128},
83             {5, 16, 128},
84             {17, 0, 0},
85          },
86          {
87             /* Four shader engines */
88             {0, 128, 128},
89             {3, 64, 128},
90             {5, 16, 128},
91             {17, 0, 0},
92          },
93       },
94       {
95          /* Two RB / SE */
96          {
97             /* One shader engine */
98             {0, 128, 128},
99             {2, 64, 128},
100             {3, 32, 128},
101             {9, 16, 128},
102             {33, 0, 0},
103          },
104          {
105             /* Two shader engines */
106             {0, 128, 128},
107             {3, 64, 128},
108             {5, 32, 128},
109             {9, 16, 128},
110             {33, 0, 0},
111          },
112          {
113             /* Four shader engines */
114             {0, 256, 256},
115             {2, 128, 256},
116             {3, 128, 128},
117             {5, 64, 128},
118             {9, 16, 128},
119             {33, 0, 0},
120          },
121       },
122       {
123          /* Four RB / SE */
124          {
125             /* One shader engine */
126             {0, 128, 256},
127             {2, 128, 128},
128             {3, 64, 128},
129             {5, 32, 128},
130             {9, 16, 128},
131             {17, 0, 0},
132          },
133          {
134             /* Two shader engines */
135             {0, 256, 256},
136             {2, 128, 256},
137             {3, 128, 128},
138             {5, 64, 128},
139             {9, 32, 128},
140             {17, 16, 128},
141             {33, 0, 0},
142          },
143          {
144             /* Four shader engines */
145             {0, 256, 512},
146             {2, 128, 512},
147             {3, 64, 512},
148             {5, 32, 512},
149             {9, 32, 256},
150             {17, 32, 128},
151             {33, 0, 0},
152          },
153       },
154    };
155 
156    return si_find_bin_size(sctx->screen, table, sum);
157 }
158 
gfx9_get_depth_bin_size(struct si_context * sctx)159 static struct uvec2 gfx9_get_depth_bin_size(struct si_context *sctx)
160 {
161    struct si_state_dsa *dsa = sctx->queued.named.dsa;
162 
163    if (!sctx->framebuffer.state.zsbuf || (!dsa->depth_enabled && !dsa->stencil_enabled)) {
164       /* Return the max size. */
165       struct uvec2 size = {512, 512};
166       return size;
167    }
168 
169    struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
170    unsigned depth_coeff = dsa->depth_enabled ? 5 : 0;
171    unsigned stencil_coeff = tex->surface.has_stencil && dsa->stencil_enabled ? 1 : 0;
172    unsigned sum = 4 * (depth_coeff + stencil_coeff) * MAX2(tex->buffer.b.b.nr_samples, 1);
173 
174    static const si_bin_size_subtable table[] = {
175       {
176          // One RB / SE
177          {
178             // One shader engine
179             {0, 64, 512},
180             {2, 64, 256},
181             {4, 64, 128},
182             {7, 32, 128},
183             {13, 16, 128},
184             {49, 0, 0},
185          },
186          {
187             // Two shader engines
188             {0, 128, 512},
189             {2, 64, 512},
190             {4, 64, 256},
191             {7, 64, 128},
192             {13, 32, 128},
193             {25, 16, 128},
194             {49, 0, 0},
195          },
196          {
197             // Four shader engines
198             {0, 256, 512},
199             {2, 128, 512},
200             {4, 64, 512},
201             {7, 64, 256},
202             {13, 64, 128},
203             {25, 16, 128},
204             {49, 0, 0},
205          },
206       },
207       {
208          // Two RB / SE
209          {
210             // One shader engine
211             {0, 128, 512},
212             {2, 64, 512},
213             {4, 64, 256},
214             {7, 64, 128},
215             {13, 32, 128},
216             {25, 16, 128},
217             {97, 0, 0},
218          },
219          {
220             // Two shader engines
221             {0, 256, 512},
222             {2, 128, 512},
223             {4, 64, 512},
224             {7, 64, 256},
225             {13, 64, 128},
226             {25, 32, 128},
227             {49, 16, 128},
228             {97, 0, 0},
229          },
230          {
231             // Four shader engines
232             {0, 512, 512},
233             {2, 256, 512},
234             {4, 128, 512},
235             {7, 64, 512},
236             {13, 64, 256},
237             {25, 64, 128},
238             {49, 16, 128},
239             {97, 0, 0},
240          },
241       },
242       {
243          // Four RB / SE
244          {
245             // One shader engine
246             {0, 256, 512},
247             {2, 128, 512},
248             {4, 64, 512},
249             {7, 64, 256},
250             {13, 64, 128},
251             {25, 32, 128},
252             {49, 16, 128},
253             {193, 0, 0},
254          },
255          {
256             // Two shader engines
257             {0, 512, 512},
258             {2, 256, 512},
259             {4, 128, 512},
260             {7, 64, 512},
261             {13, 64, 256},
262             {25, 64, 128},
263             {49, 32, 128},
264             {97, 16, 128},
265             {193, 0, 0},
266          },
267          {
268             // Four shader engines
269             {0, 512, 512},
270             {4, 256, 512},
271             {7, 128, 512},
272             {13, 64, 512},
273             {25, 32, 512},
274             {49, 32, 256},
275             {97, 16, 128},
276             {193, 0, 0},
277          },
278       },
279    };
280 
281    return si_find_bin_size(sctx->screen, table, sum);
282 }
283 
gfx10_get_bin_sizes(struct si_context * sctx,unsigned cb_target_enabled_4bit,struct uvec2 * color_bin_size,struct uvec2 * depth_bin_size)284 static void gfx10_get_bin_sizes(struct si_context *sctx, unsigned cb_target_enabled_4bit,
285                                 struct uvec2 *color_bin_size, struct uvec2 *depth_bin_size)
286 {
287    const unsigned ZsTagSize = 64;
288    const unsigned ZsNumTags = 312;
289    const unsigned CcTagSize = 1024;
290    const unsigned CcReadTags = 31;
291    const unsigned FcTagSize = 256;
292    const unsigned FcReadTags = 44;
293 
294    const unsigned num_rbs = sctx->screen->info.max_render_backends;
295    const unsigned num_pipes = MAX2(num_rbs, sctx->screen->info.num_tcc_blocks);
296 
297    const unsigned depthBinSizeTagPart =
298       ((ZsNumTags * num_rbs / num_pipes) * (ZsTagSize * num_pipes));
299    const unsigned colorBinSizeTagPart =
300       ((CcReadTags * num_rbs / num_pipes) * (CcTagSize * num_pipes));
301    const unsigned fmaskBinSizeTagPart =
302       ((FcReadTags * num_rbs / num_pipes) * (FcTagSize * num_pipes));
303 
304    const unsigned minBinSizeX = 128;
305    const unsigned minBinSizeY = sctx->gfx_level >= GFX12 ? 128 : 64;
306 
307    const unsigned num_fragments = sctx->framebuffer.nr_color_samples;
308    const unsigned num_samples = sctx->framebuffer.nr_samples;
309    const bool ps_iter_sample = si_get_ps_iter_samples(sctx) >= 2;
310 
311    /* Calculate cColor and cFmask(if applicable) */
312    unsigned cColor = 0;
313    unsigned cFmask = 0;
314    bool has_fmask = false;
315 
316    for (unsigned i = 0; i < sctx->framebuffer.state.nr_cbufs; i++) {
317       if (!sctx->framebuffer.state.cbufs[i])
318          continue;
319 
320       struct si_texture *tex = (struct si_texture *)sctx->framebuffer.state.cbufs[i]->texture;
321       const unsigned mmrt = num_fragments == 1 ? 1 : (ps_iter_sample ? num_fragments : 2);
322 
323       cColor += tex->surface.bpe * mmrt;
324       if (num_samples >= 2 && tex->surface.fmask_offset) {
325          const unsigned fragmentsLog2 = util_logbase2(num_fragments);
326          const unsigned samplesLog2 = util_logbase2(num_samples);
327 
328          static const unsigned cFmaskMrt[4 /* fragments */][5 /* samples */] = {
329             {0, 1, 1, 1, 2}, /* fragments = 1 */
330             {0, 1, 1, 2, 4}, /* fragments = 2 */
331             {0, 1, 1, 4, 8}, /* fragments = 4 */
332             {0, 1, 2, 4, 8}  /* fragments = 8 */
333          };
334          cFmask += cFmaskMrt[fragmentsLog2][samplesLog2];
335          has_fmask = true;
336       }
337    }
338    cColor = MAX2(cColor, 1u);
339 
340    const unsigned colorLog2Pixels = util_logbase2(colorBinSizeTagPart / cColor);
341    const unsigned colorBinSizeX = 1 << ((colorLog2Pixels + 1) / 2); /* round up width */
342    const unsigned colorBinSizeY = 1 << (colorLog2Pixels / 2);       /* round down height */
343 
344    unsigned binSizeX = colorBinSizeX;
345    unsigned binSizeY = colorBinSizeY;
346 
347    if (has_fmask) {
348       cFmask = MAX2(cFmask, 1u);
349 
350       const unsigned fmaskLog2Pixels = util_logbase2(fmaskBinSizeTagPart / cFmask);
351       const unsigned fmaskBinSizeX = 1 << ((fmaskLog2Pixels + 1) / 2); /* round up width */
352       const unsigned fmaskBinSizeY = 1 << (fmaskLog2Pixels / 2);       /* round down height */
353 
354       /* use the smaller of the Color vs. Fmask bin sizes */
355       if (fmaskLog2Pixels < colorLog2Pixels) {
356          binSizeX = fmaskBinSizeX;
357          binSizeY = fmaskBinSizeY;
358       }
359    }
360 
361    /* Return size adjusted for minimum bin size */
362    color_bin_size->x = MAX2(binSizeX, minBinSizeX);
363    color_bin_size->y = MAX2(binSizeY, minBinSizeY);
364 
365    if (!sctx->framebuffer.state.zsbuf) {
366       /* Set to max sizes when no depth buffer is bound. */
367       depth_bin_size->x = 512;
368       depth_bin_size->y = 512;
369    } else {
370       struct si_texture *zstex = (struct si_texture *)sctx->framebuffer.state.zsbuf->texture;
371       struct si_state_dsa *dsa = sctx->queued.named.dsa;
372 
373       const unsigned cPerDepthSample = dsa->depth_enabled ? 5 : 0;
374       const unsigned cPerStencilSample = dsa->stencil_enabled ? 1 : 0;
375       const unsigned cDepth =
376          (cPerDepthSample + cPerStencilSample) * MAX2(zstex->buffer.b.b.nr_samples, 1);
377 
378       const unsigned depthLog2Pixels = util_logbase2(depthBinSizeTagPart / MAX2(cDepth, 1u));
379       unsigned depthBinSizeX = 1 << ((depthLog2Pixels + 1) / 2);
380       unsigned depthBinSizeY = 1 << (depthLog2Pixels / 2);
381 
382       depth_bin_size->x = MAX2(depthBinSizeX, minBinSizeX);
383       depth_bin_size->y = MAX2(depthBinSizeY, minBinSizeY);
384    }
385 }
386 
si_emit_dpbb_disable(struct si_context * sctx)387 static void si_emit_dpbb_disable(struct si_context *sctx)
388 {
389    unsigned optimal_bin_selection = !sctx->queued.named.rasterizer->bottom_edge_rule;
390 
391    radeon_begin(&sctx->gfx_cs);
392 
393    if (sctx->gfx_level >= GFX12) {
394       struct uvec2 bin_size = {128, 128};
395 
396       radeon_opt_set_context_reg(R_028C44_PA_SC_BINNER_CNTL_0,
397                                  SI_TRACKED_PA_SC_BINNER_CNTL_0,
398                                  S_028C44_BINNING_MODE(V_028C44_BINNING_DISABLED) |
399                                  S_028C44_BIN_SIZE_X_EXTEND(util_logbase2(bin_size.x) - 5) |
400                                  S_028C44_BIN_SIZE_Y_EXTEND(util_logbase2(bin_size.y) - 5) |
401                                  S_028C44_DISABLE_START_OF_PRIM(1) |
402                                  S_028C44_FPOVS_PER_BATCH(63) |
403                                  S_028C44_OPTIMAL_BIN_SELECTION(1) |
404                                  S_028C44_FLUSH_ON_BINNING_TRANSITION(1));
405    } else if (sctx->gfx_level >= GFX10) {
406       struct uvec2 bin_size = {};
407       struct uvec2 bin_size_extend = {};
408       unsigned binning_disabled =
409          sctx->gfx_level >= GFX11_5 ? V_028C44_BINNING_DISABLED
410                                     : V_028C44_DISABLE_BINNING_USE_NEW_SC;
411 
412       bin_size.x = 128;
413       bin_size.y = sctx->framebuffer.min_bytes_per_pixel <= 4 ? 128 : 64;
414 
415       if (bin_size.x >= 32)
416          bin_size_extend.x = util_logbase2(bin_size.x) - 5;
417       if (bin_size.y >= 32)
418          bin_size_extend.y = util_logbase2(bin_size.y) - 5;
419 
420       radeon_opt_set_context_reg(R_028C44_PA_SC_BINNER_CNTL_0,
421                                  SI_TRACKED_PA_SC_BINNER_CNTL_0,
422                                  S_028C44_BINNING_MODE(binning_disabled) |
423                                  S_028C44_BIN_SIZE_X(bin_size.x == 16) |
424                                  S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
425                                  S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
426                                  S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
427                                  S_028C44_DISABLE_START_OF_PRIM(1) |
428                                  S_028C44_FPOVS_PER_BATCH(63) |
429                                  S_028C44_OPTIMAL_BIN_SELECTION(optimal_bin_selection) |
430                                  S_028C44_FLUSH_ON_BINNING_TRANSITION(1));
431    } else {
432       radeon_opt_set_context_reg(R_028C44_PA_SC_BINNER_CNTL_0,
433                                  SI_TRACKED_PA_SC_BINNER_CNTL_0,
434                                  S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
435                                  S_028C44_DISABLE_START_OF_PRIM(1) |
436                                  S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->family == CHIP_VEGA12 ||
437                                                                       sctx->family == CHIP_VEGA20 ||
438                                                                       sctx->family >= CHIP_RAVEN2));
439    }
440    radeon_end_update_context_roll();
441 }
442 
si_emit_dpbb_state(struct si_context * sctx,unsigned index)443 void si_emit_dpbb_state(struct si_context *sctx, unsigned index)
444 {
445    struct si_screen *sscreen = sctx->screen;
446    struct si_state_blend *blend = sctx->queued.named.blend;
447    struct si_state_dsa *dsa = sctx->queued.named.dsa;
448    unsigned db_shader_control = sctx->ps_db_shader_control;
449    unsigned optimal_bin_selection = !sctx->queued.named.rasterizer->bottom_edge_rule;
450    unsigned pa_sc_hisz_control = sctx->ps_pa_sc_hisz_control;
451 
452    assert(sctx->gfx_level >= GFX9);
453 
454    if (!sscreen->dpbb_allowed || sctx->dpbb_force_off ||
455        sctx->dpbb_force_off_profile_vs || sctx->dpbb_force_off_profile_ps) {
456       si_emit_dpbb_disable(sctx);
457       return;
458    }
459 
460    bool ps_can_kill =
461       G_02880C_KILL_ENABLE(db_shader_control) || G_02880C_MASK_EXPORT_ENABLE(db_shader_control) ||
462       G_02880C_COVERAGE_TO_MASK_ENABLE(db_shader_control) || blend->alpha_to_coverage;
463 
464    bool db_can_reject_z_trivially = !G_02880C_Z_EXPORT_ENABLE(db_shader_control) ||
465                                     G_02880C_DEPTH_BEFORE_SHADER(db_shader_control) ||
466                                     (sctx->gfx_level >= GFX12 ?
467                                         G_028BBC_CONSERVATIVE_Z_EXPORT(pa_sc_hisz_control) :
468                                         G_02880C_CONSERVATIVE_Z_EXPORT(db_shader_control));
469 
470    /* Disable DPBB when it's believed to be inefficient. */
471    if (sscreen->info.max_render_backends > 4 && ps_can_kill && db_can_reject_z_trivially &&
472        sctx->framebuffer.state.zsbuf && dsa->db_can_write) {
473       si_emit_dpbb_disable(sctx);
474       return;
475    }
476 
477    /* Compute the bin size. */
478    /* TODO: We could also look at enabled pixel shader outputs. */
479    unsigned cb_target_enabled_4bit =
480       sctx->framebuffer.colorbuf_enabled_4bit & blend->cb_target_enabled_4bit;
481    struct uvec2 color_bin_size, depth_bin_size;
482 
483    if (sctx->gfx_level >= GFX10) {
484       gfx10_get_bin_sizes(sctx, cb_target_enabled_4bit, &color_bin_size, &depth_bin_size);
485    } else {
486       color_bin_size = gfx9_get_color_bin_size(sctx, cb_target_enabled_4bit);
487       depth_bin_size = gfx9_get_depth_bin_size(sctx);
488    }
489 
490    unsigned color_area = color_bin_size.x * color_bin_size.y;
491    unsigned depth_area = depth_bin_size.x * depth_bin_size.y;
492 
493    struct uvec2 bin_size = color_area < depth_area ? color_bin_size : depth_bin_size;
494 
495    if (!bin_size.x || !bin_size.y) {
496       si_emit_dpbb_disable(sctx);
497       return;
498    }
499 
500    /* Tunable parameters. */
501    /* Allowed range:
502     *    gfx9-10: [0, 255] (0 = unlimited)
503     *    gfx11: [1, 255] (255 = unlimited)
504     */
505    unsigned fpovs_per_batch = 63;
506 
507    /* Emit registers. */
508    struct uvec2 bin_size_extend = {};
509    if (bin_size.x >= 32)
510       bin_size_extend.x = util_logbase2(bin_size.x) - 5;
511    if (bin_size.y >= 32)
512       bin_size_extend.y = util_logbase2(bin_size.y) - 5;
513 
514    radeon_begin(&sctx->gfx_cs);
515    radeon_opt_set_context_reg(R_028C44_PA_SC_BINNER_CNTL_0, SI_TRACKED_PA_SC_BINNER_CNTL_0,
516                               S_028C44_BINNING_MODE(V_028C44_BINNING_ALLOWED) |
517                               S_028C44_BIN_SIZE_X(bin_size.x == 16) |
518                               S_028C44_BIN_SIZE_Y(bin_size.y == 16) |
519                               S_028C44_BIN_SIZE_X_EXTEND(bin_size_extend.x) |
520                               S_028C44_BIN_SIZE_Y_EXTEND(bin_size_extend.y) |
521                               S_028C44_CONTEXT_STATES_PER_BIN(sscreen->pbb_context_states_per_bin - 1) |
522                               S_028C44_PERSISTENT_STATES_PER_BIN(sscreen->pbb_persistent_states_per_bin - 1) |
523                               S_028C44_DISABLE_START_OF_PRIM(1) |
524                               S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
525                               S_028C44_OPTIMAL_BIN_SELECTION(optimal_bin_selection) |
526                               S_028C44_FLUSH_ON_BINNING_TRANSITION(sctx->family == CHIP_VEGA12 ||
527                                                                    sctx->family == CHIP_VEGA20 ||
528                                                                    sctx->family >= CHIP_RAVEN2));
529    radeon_end_update_context_roll();
530 }
531