/*
 * Copyright © 2015 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "tu_util.h"

#include <errno.h>
#include <stdarg.h>

#include "common/freedreno_rd_output.h"
#include "util/u_math.h"
#include "util/timespec.h"
#include "vk_enum_to_str.h"

#include "tu_device.h"
#include "tu_pass.h"

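/* Comma-separated options recognized in the TU_DEBUG environment variable,
 * mapped to their TU_DEBUG_* flag bits.
 */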
static const struct debug_control tu_debug_options[] = {
   { "startup", TU_DEBUG_STARTUP },
   { "nir", TU_DEBUG_NIR },
   { "nobin", TU_DEBUG_NOBIN },
   { "sysmem", TU_DEBUG_SYSMEM },
   { "gmem", TU_DEBUG_GMEM },
   { "forcebin", TU_DEBUG_FORCEBIN },
   { "layout", TU_DEBUG_LAYOUT },
   { "noubwc", TU_DEBUG_NOUBWC },
   { "nomultipos", TU_DEBUG_NOMULTIPOS },
   { "nolrz", TU_DEBUG_NOLRZ },
   { "nolrzfc", TU_DEBUG_NOLRZFC },
   { "perf", TU_DEBUG_PERF },
   { "perfc", TU_DEBUG_PERFC },
   { "flushall", TU_DEBUG_FLUSHALL },
   { "syncdraw", TU_DEBUG_SYNCDRAW },
   { "push_consts_per_stage", TU_DEBUG_PUSH_CONSTS_PER_STAGE },
   { "rast_order", TU_DEBUG_RAST_ORDER },
   { "unaligned_store", TU_DEBUG_UNALIGNED_STORE },
   { "log_skip_gmem_ops", TU_DEBUG_LOG_SKIP_GMEM_OPS },
   { "dynamic", TU_DEBUG_DYNAMIC },
   { "bos", TU_DEBUG_BOS },
   { "3d_load", TU_DEBUG_3D_LOAD },
   { "fdm", TU_DEBUG_FDM },
   { "noconform", TU_DEBUG_NOCONFORM },
   { "rd", TU_DEBUG_RD },
   { NULL, 0 }
};

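/* Global driver environment state, initialized once by tu_env_init(). */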
struct tu_env tu_env;

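/* One-time initializer run via call_once() from tu_env_init(): parses
 * TU_DEBUG into tu_env.debug and forwards the "rd" option to the shared
 * freedreno RD dump machinery.
 */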
static void
tu_env_init_once(void)
{
   tu_env.debug = parse_debug_string(os_get_option("TU_DEBUG"),
                                     tu_debug_options);

   if (TU_DEBUG(STARTUP))
      mesa_logi("TU_DEBUG=0x%x", tu_env.debug);

   /* TU_DEBUG=rd functionality was moved to fd_rd_output. This debug option
    * should translate to the basic-level FD_RD_DUMP_ENABLE option.
    */
   if (TU_DEBUG(RD))
      fd_rd_dump_env.flags |= FD_RD_DUMP_ENABLE;
}

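/* Public entry point; may be called any number of times, but the TU_DEBUG
 * parsing in tu_env_init_once() runs at most once.
 */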
void
tu_env_init(void)
{
   fd_rd_dump_env_init();

   static once_flag once = ONCE_FLAG_INIT;
   call_once(&once, tu_env_init_once);
}

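/* Log a FINISHME message for unimplemented functionality, tagged with the
 * file and line of the call site. The formatted message is truncated to
 * 256 bytes.
 */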
void PRINTFLIKE(3, 4)
__tu_finishme(const char *file, int line, const char *format, ...)
{
   va_list ap;
   char buffer[256];

   va_start(ap, format);
   vsnprintf(buffer, sizeof(buffer), format, ap);
   va_end(ap);

   mesa_loge("%s:%d: FINISHME: %s\n", file, line, buffer);
}

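/* Log a startup-time error together with the stringified VkResult, then
 * return the error unchanged so callers can use it in a return statement.
 */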
VkResult
__vk_startup_errorf(struct tu_instance *instance,
                    VkResult error,
                    const char *file,
                    int line,
                    const char *format,
                    ...)
{
   va_list ap;
   char buffer[256];

   const char *error_str = vk_Result_to_str(error);

   if (format) {
      va_start(ap, format);
      vsnprintf(buffer, sizeof(buffer), format, ap);
      va_end(ap);

      mesa_loge("%s:%d: %s (%s)\n", file, line, buffer, error_str);
   } else {
      mesa_loge("%s:%d: %s\n", file, line, error_str);
   }

   return error;
}

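/* Compute the GMEM tile size and tile count for the given GMEM layout:
 * walk the possible tile widths and pick the configuration that covers the
 * framebuffer with the fewest bins, preferring squarer tiles on ties.
 */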
static void
tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
                                    const struct tu_device *dev,
                                    const struct tu_render_pass *pass,
                                    enum tu_gmem_layout gmem_layout)
{
   const uint32_t tile_align_w = pass->tile_align_w;
   uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
   struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];

   /* From the Vulkan 1.3.232 spec, under VkFramebufferCreateInfo:
    *
    *   If the render pass uses multiview, then layers must be one and each
    *   attachment requires a number of layers that is greater than the
    *   maximum bit index set in the view mask in the subpasses in which it is
    *   used.
    */

   uint32_t layers = MAX2(fb->layers, pass->num_views);

   /* If there is more than one layer, we need to make sure that the layer
    * stride is expressible as an offset in RB_BLIT_BASE_GMEM, which ignores
    * the low 12 bits. The layer stride seems to be implicitly calculated from
    * the tile width and height, so we need to adjust one of them.
    */
   const uint32_t gmem_align_log2 = 12;
   const uint32_t gmem_align = 1 << gmem_align_log2;
   uint32_t min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
   if (layers > 1 && align(min_layer_stride, gmem_align) != min_layer_stride) {
      /* Make sure that min_layer_stride is a multiple of gmem_align. Because
       * gmem_align is a power of two and min_layer_stride isn't already a
       * multiple of gmem_align, this is equivalent to shifting tile_align_h
       * until the number of 0 bits at the bottom of min_layer_stride is at
       * least gmem_align_log2.
       */
      tile_align_h <<= gmem_align_log2 - (ffs(min_layer_stride) - 1);

      /* Check that we did the math right. */
      min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
      assert(align(min_layer_stride, gmem_align) == min_layer_stride);
   }

   /* This will force sysmem rendering, so don't bother trying to compute a
    * valid tile config.
    * TODO: just skip all GMEM stuff when sysmem is forced?
    */
   if (!pass->gmem_pixels[gmem_layout]) {
      tiling->possible = false;
      /* Put in dummy values that will trigger assertion failures in the
       * register setup that uses them, since you shouldn't be doing gmem
       * work if gmem is not possible.
       */
      tiling->tile_count = (VkExtent2D) { 1, 1 };
      tiling->tile0 = (VkExtent2D) { ~0u, ~0u };
      return;
   }

   tiling->possible = false;

   uint32_t best_tile_count = ~0u;
   VkExtent2D tile_count;
   VkExtent2D tile_size;
   /* There aren't that many different tile widths possible, so just walk all
    * of them, finding which produces the lowest number of bins.
    */
   const uint32_t max_tile_width = MIN2(
      dev->physical_device->info->tile_max_w, util_align_npot(fb->width, tile_align_w));
   const uint32_t max_tile_height =
      MIN2(dev->physical_device->info->tile_max_h,
           align(fb->height, tile_align_h));
   for (tile_size.width = tile_align_w; tile_size.width <= max_tile_width;
        tile_size.width += tile_align_w) {
      tile_size.height = pass->gmem_pixels[gmem_layout] / (tile_size.width * layers);
      tile_size.height = MIN2(tile_size.height, max_tile_height);
      tile_size.height = ROUND_DOWN_TO(tile_size.height, tile_align_h);
      if (!tile_size.height)
         continue;

      tile_count.width = DIV_ROUND_UP(fb->width, tile_size.width);
      tile_count.height = DIV_ROUND_UP(fb->height, tile_size.height);

      /* Drop the height of the tile down to split tiles more evenly across
       * the screen for a given tile count.
       */
      tile_size.height =
         align(DIV_ROUND_UP(fb->height, tile_count.height), tile_align_h);

      /* Pick the layout with the minimum number of bins (lowest CP overhead
       * and amount of cache flushing), but the most square tiles in the case
       * of a tie (likely highest cache locality).
       */
      if (tile_count.width * tile_count.height < best_tile_count ||
          (tile_count.width * tile_count.height == best_tile_count &&
           abs((int)(tile_size.width - tile_size.height)) <
              abs((int)(tiling->tile0.width - tiling->tile0.height)))) {
         tiling->possible = true;
         tiling->tile0 = tile_size;
         tiling->tile_count = tile_count;
         best_tile_count = tile_count.width * tile_count.height;
      }
   }

   /* If forcing binning, try to get at least 2 tiles in each direction. */
   if (TU_DEBUG(FORCEBIN) && tiling->possible) {
      if (tiling->tile_count.width == 1 && tiling->tile0.width != tile_align_w) {
         tiling->tile0.width = util_align_npot(DIV_ROUND_UP(tiling->tile0.width, 2), tile_align_w);
         tiling->tile_count.width = 2;
      }
      if (tiling->tile_count.height == 1 && tiling->tile0.height != tile_align_h) {
         tiling->tile0.height = align(DIV_ROUND_UP(tiling->tile0.height, 2), tile_align_h);
         tiling->tile_count.height = 2;
      }
   }
}

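/* Distribute the tiles across the hardware's VSC pipes: start with one tile
 * per pipe and grow the per-pipe footprint along its smaller dimension until
 * the pipe count fits within num_vsc_pipes.
 */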
static void
tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
                                    const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;

   /* start from 1 tile per pipe */
   tiling->pipe0 = (VkExtent2D) {
      .width = 1,
      .height = 1,
   };
   tiling->pipe_count = tiling->tile_count;

   while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) {
      if (tiling->pipe0.width < tiling->pipe0.height) {
         tiling->pipe0.width += 1;
         tiling->pipe_count.width =
            DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
      } else {
         tiling->pipe0.height += 1;
         tiling->pipe_count.height =
            DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
      }
   }
}

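/* Fill in the per-pipe VSC_PIPE_CONFIG register values and bin-data sizes
 * from the pipe layout computed above. Pipes in the last row/column may
 * cover fewer tiles; unused pipe slots are zeroed.
 */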
static void
tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
                              const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;
   const uint32_t used_pipe_count =
      tiling->pipe_count.width * tiling->pipe_count.height;
   const VkExtent2D last_pipe = {
      .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
      .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
   };

   assert(used_pipe_count <= max_pipe_count);
   assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));

   for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
      for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
         const uint32_t pipe_x = tiling->pipe0.width * x;
         const uint32_t pipe_y = tiling->pipe0.height * y;
         const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
                                    ? last_pipe.width
                                    : tiling->pipe0.width;
         const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
                                    ? last_pipe.height
                                    : tiling->pipe0.height;
         const uint32_t n = tiling->pipe_count.width * y + x;

         tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
                                  A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
                                  A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
                                  A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
         tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
      }
   }

   memset(tiling->pipe_config + used_pipe_count, 0,
          sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
}

static bool
is_hw_binning_possible(const struct tu_tiling_config *tiling)
{
   /* Similar to older gens, # of tiles per pipe cannot be more than 32.
    * But there are no hangs with 16 or more tiles per pipe in either
    * X or Y direction, so that limit does not seem to apply.
    */
   uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height;
   return tiles_per_pipe <= 32;
}

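/* Decide whether to use HW binning: it must be possible for this tiling
 * config and worth the overhead (more than two tiles), with the forcebin
 * and nobin debug options overriding the heuristic.
 */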
static void
tu_tiling_config_update_binning(struct tu_tiling_config *tiling, const struct tu_device *device)
{
   tiling->binning_possible = is_hw_binning_possible(tiling);

   if (tiling->binning_possible) {
      tiling->binning = (tiling->tile_count.width * tiling->tile_count.height) > 2;

      if (TU_DEBUG(FORCEBIN))
         tiling->binning = true;
      if (TU_DEBUG(NOBIN))
         tiling->binning = false;
   } else {
      tiling->binning = false;
   }
}

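/* Compute the complete tiling configuration (tile layout, pipe layout,
 * per-pipe registers, and binning decision) for each GMEM layout of the
 * framebuffer.
 */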
void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass)
{
   for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
      struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
      tu_tiling_config_update_tile_layout(fb, device, pass,
                                          (enum tu_gmem_layout) gmem_layout);
      tu_tiling_config_update_pipe_layout(tiling, device);
      tu_tiling_config_update_pipes(tiling, device);
      tu_tiling_config_update_binning(tiling, device);
   }
}

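/* With TU_DEBUG=log_skip_gmem_ops, report at most once per second what
 * fraction of GMEM loads and stores were skipped since the previous report,
 * based on the debug counters tracked in the device's global BO.
 */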
void
tu_dbg_log_gmem_load_store_skips(struct tu_device *device)
{
   static uint32_t last_skipped_loads = 0;
   static uint32_t last_skipped_stores = 0;
   static uint32_t last_total_loads = 0;
   static uint32_t last_total_stores = 0;
   static struct timespec last_time = {};

   pthread_mutex_lock(&device->submit_mutex);

   struct timespec current_time;
   clock_gettime(CLOCK_MONOTONIC, &current_time);

   if (timespec_sub_to_nsec(&current_time, &last_time) > 1000 * 1000 * 1000) {
      last_time = current_time;
   } else {
      pthread_mutex_unlock(&device->submit_mutex);
      return;
   }

   struct tu6_global *global = device->global_bo_map;

   uint32_t current_taken_loads = global->dbg_gmem_taken_loads;
   uint32_t current_taken_stores = global->dbg_gmem_taken_stores;
   uint32_t current_total_loads = global->dbg_gmem_total_loads;
   uint32_t current_total_stores = global->dbg_gmem_total_stores;

   uint32_t skipped_loads = current_total_loads - current_taken_loads;
   uint32_t skipped_stores = current_total_stores - current_taken_stores;

   uint32_t current_time_frame_skipped_loads = skipped_loads - last_skipped_loads;
   uint32_t current_time_frame_skipped_stores = skipped_stores - last_skipped_stores;

   uint32_t current_time_frame_total_loads = current_total_loads - last_total_loads;
   uint32_t current_time_frame_total_stores = current_total_stores - last_total_stores;

   mesa_logi("[GMEM] loads total: %u skipped: %.1f%%\n",
         current_time_frame_total_loads,
         current_time_frame_skipped_loads / (float) current_time_frame_total_loads * 100.f);
   mesa_logi("[GMEM] stores total: %u skipped: %.1f%%\n",
         current_time_frame_total_stores,
         current_time_frame_skipped_stores / (float) current_time_frame_total_stores * 100.f);

   last_skipped_loads = skipped_loads;
   last_skipped_stores = skipped_stores;
   last_total_loads = current_total_loads;
   last_total_stores = current_total_stores;

   pthread_mutex_unlock(&device->submit_mutex);
}
388