/*
 * Copyright © 2015 Intel Corporation
 * SPDX-License-Identifier: MIT
 */

#include "tu_util.h"

#include <errno.h>
#include <stdarg.h>

#include "common/freedreno_rd_output.h"
#include "util/u_math.h"
#include "util/timespec.h"
#include "vk_enum_to_str.h"

#include "tu_device.h"
#include "tu_pass.h"

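/* Option names recognized in the comma-separated TU_DEBUG environment
 * variable, e.g. TU_DEBUG=startup,forcebin,perf.
 */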
static const struct debug_control tu_debug_options[] = {
   { "startup", TU_DEBUG_STARTUP },
   { "nir", TU_DEBUG_NIR },
   { "nobin", TU_DEBUG_NOBIN },
   { "sysmem", TU_DEBUG_SYSMEM },
   { "gmem", TU_DEBUG_GMEM },
   { "forcebin", TU_DEBUG_FORCEBIN },
   { "layout", TU_DEBUG_LAYOUT },
   { "noubwc", TU_DEBUG_NOUBWC },
   { "nomultipos", TU_DEBUG_NOMULTIPOS },
   { "nolrz", TU_DEBUG_NOLRZ },
   { "nolrzfc", TU_DEBUG_NOLRZFC },
   { "perf", TU_DEBUG_PERF },
   { "perfc", TU_DEBUG_PERFC },
   { "flushall", TU_DEBUG_FLUSHALL },
   { "syncdraw", TU_DEBUG_SYNCDRAW },
   { "push_consts_per_stage", TU_DEBUG_PUSH_CONSTS_PER_STAGE },
   { "rast_order", TU_DEBUG_RAST_ORDER },
   { "unaligned_store", TU_DEBUG_UNALIGNED_STORE },
   { "log_skip_gmem_ops", TU_DEBUG_LOG_SKIP_GMEM_OPS },
   { "dynamic", TU_DEBUG_DYNAMIC },
   { "bos", TU_DEBUG_BOS },
   { "3d_load", TU_DEBUG_3D_LOAD },
   { "fdm", TU_DEBUG_FDM },
   { "noconform", TU_DEBUG_NOCONFORM },
   { "rd", TU_DEBUG_RD },
   { NULL, 0 }
};

struct tu_env tu_env;

static void
tu_env_init_once(void)
{
   tu_env.debug = parse_debug_string(os_get_option("TU_DEBUG"),
                                     tu_debug_options);

   if (TU_DEBUG(STARTUP))
      mesa_logi("TU_DEBUG=0x%x", tu_env.debug);

   /* TU_DEBUG=rd functionality was moved to fd_rd_output. This debug option
    * should translate to the basic-level FD_RD_DUMP_ENABLE option.
    */
   if (TU_DEBUG(RD))
      fd_rd_dump_env.flags |= FD_RD_DUMP_ENABLE;
}

void
tu_env_init(void)
{
   fd_rd_dump_env_init();

   static once_flag once = ONCE_FLAG_INIT;
   call_once(&once, tu_env_init_once);
}

void PRINTFLIKE(3, 4)
__tu_finishme(const char *file, int line, const char *format, ...)
{
   va_list ap;
   char buffer[256];

   va_start(ap, format);
   vsnprintf(buffer, sizeof(buffer), format, ap);
   va_end(ap);

   mesa_loge("%s:%d: FINISHME: %s\n", file, line, buffer);
}

VkResult
__vk_startup_errorf(struct tu_instance *instance,
                    VkResult error,
                    const char *file,
                    int line,
                    const char *format,
                    ...)
{
   va_list ap;
   char buffer[256];

   const char *error_str = vk_Result_to_str(error);

   if (format) {
      va_start(ap, format);
      vsnprintf(buffer, sizeof(buffer), format, ap);
      va_end(ap);

      mesa_loge("%s:%d: %s (%s)\n", file, line, buffer, error_str);
   } else {
      mesa_loge("%s:%d: %s\n", file, line, error_str);
   }

   return error;
}

static void
tu_tiling_config_update_tile_layout(struct tu_framebuffer *fb,
                                    const struct tu_device *dev,
                                    const struct tu_render_pass *pass,
                                    enum tu_gmem_layout gmem_layout)
{
   const uint32_t tile_align_w = pass->tile_align_w;
   uint32_t tile_align_h = dev->physical_device->info->tile_align_h;
   struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];

   /* From the Vulkan 1.3.232 spec, under VkFramebufferCreateInfo:
    *
    *    If the render pass uses multiview, then layers must be one and each
    *    attachment requires a number of layers that is greater than the
    *    maximum bit index set in the view mask in the subpasses in which it
    *    is used.
    */

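   /* For example (hypothetical numbers): a subpass view mask of 0b101 renders
    * views 0 and 2, so num_views is 3 and a framebuffer created with
    * layers == 1 still has to be tiled as 3 gmem layers.
    */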
   uint32_t layers = MAX2(fb->layers, pass->num_views);

   /* If there is more than one layer, we need to make sure that the layer
    * stride is expressible as an offset in RB_BLIT_BASE_GMEM which ignores
    * the low 12 bits. The layer stride seems to be implicitly calculated from
    * the tile width and height so we need to adjust one of them.
    */
   const uint32_t gmem_align_log2 = 12;
   const uint32_t gmem_align = 1 << gmem_align_log2;
   uint32_t min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
   if (layers > 1 && align(min_layer_stride, gmem_align) != min_layer_stride) {
      /* Make sure that min_layer_stride is a multiple of gmem_align. Because
       * gmem_align is a power of two and min_layer_stride isn't already a
       * multiple of gmem_align, this is equivalent to shifting tile_align_h
       * until the number of 0 bits at the bottom of min_layer_stride is at
       * least gmem_align_log2.
       */
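      /* Worked example with hypothetical values: tile_align_w = 32,
       * tile_align_h = 16, min_cpp = 1 gives min_layer_stride = 512 = 1 << 9,
       * so ffs(min_layer_stride) - 1 = 9 and tile_align_h is shifted left by
       * 12 - 9 = 3 to become 128. The new min_layer_stride is then
       * 32 * 128 * 1 = 4096, which is gmem_align-aligned.
       */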
      tile_align_h <<= gmem_align_log2 - (ffs(min_layer_stride) - 1);

      /* Check that we did the math right. */
      min_layer_stride = tile_align_h * tile_align_w * pass->min_cpp;
      assert(align(min_layer_stride, gmem_align) == min_layer_stride);
   }

   /* Rendering will be forced to sysmem; don't bother trying to have a valid
    * tile config.
    * TODO: just skip all GMEM stuff when sysmem is forced?
    */
   if (!pass->gmem_pixels[gmem_layout]) {
      tiling->possible = false;
      /* Put in dummy values that will trip assertions in the register setup
       * that uses them, since you shouldn't be doing gmem work if gmem is
       * not possible.
       */
      tiling->tile_count = (VkExtent2D) { 1, 1 };
      tiling->tile0 = (VkExtent2D) { ~0, ~0 };
      return;
   }

   tiling->possible = false;

   uint32_t best_tile_count = ~0;
   VkExtent2D tile_count;
   VkExtent2D tile_size;
   /* There aren't that many different tile widths possible, so just walk all
    * of them finding which produces the lowest number of bins.
    */
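   /* For example (hypothetical numbers): with a 1920x1080 framebuffer, one
    * layer, gmem_pixels = 196608, and 32x16 tile alignment, the iteration at
    * tile_size.width = 512 gives tile_size.height = 196608 / 512 = 384, i.e.
    * DIV_ROUND_UP(1920, 512) = 4 by DIV_ROUND_UP(1080, 384) = 3 = 12 bins,
    * and the height is then re-spread to align(DIV_ROUND_UP(1080, 3), 16) =
    * 368.
    */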
   const uint32_t max_tile_width = MIN2(
      dev->physical_device->info->tile_max_w, util_align_npot(fb->width, tile_align_w));
   const uint32_t max_tile_height =
      MIN2(dev->physical_device->info->tile_max_h,
           align(fb->height, tile_align_h));
   for (tile_size.width = tile_align_w; tile_size.width <= max_tile_width;
        tile_size.width += tile_align_w) {
      tile_size.height = pass->gmem_pixels[gmem_layout] / (tile_size.width * layers);
      tile_size.height = MIN2(tile_size.height, max_tile_height);
      tile_size.height = ROUND_DOWN_TO(tile_size.height, tile_align_h);
      if (!tile_size.height)
         continue;

      tile_count.width = DIV_ROUND_UP(fb->width, tile_size.width);
      tile_count.height = DIV_ROUND_UP(fb->height, tile_size.height);

      /* Drop the height of the tile down to split tiles more evenly across
       * the screen for a given tile count.
       */
      tile_size.height =
         align(DIV_ROUND_UP(fb->height, tile_count.height), tile_align_h);

      /* Pick the layout with the minimum number of bins (lowest CP overhead
       * and amount of cache flushing), but the most square tiles in the case
       * of a tie (likely highest cache locality).
       */
      if (tile_count.width * tile_count.height < best_tile_count ||
          (tile_count.width * tile_count.height == best_tile_count &&
           abs((int)(tile_size.width - tile_size.height)) <
              abs((int)(tiling->tile0.width - tiling->tile0.height)))) {
         tiling->possible = true;
         tiling->tile0 = tile_size;
         tiling->tile_count = tile_count;
         best_tile_count = tile_count.width * tile_count.height;
      }
   }

   /* If forcing binning, try to get at least 2 tiles in each direction. */
   if (TU_DEBUG(FORCEBIN) && tiling->possible) {
      if (tiling->tile_count.width == 1 && tiling->tile0.width != tile_align_w) {
         tiling->tile0.width = util_align_npot(DIV_ROUND_UP(tiling->tile0.width, 2), tile_align_w);
         tiling->tile_count.width = 2;
      }
      if (tiling->tile_count.height == 1 && tiling->tile0.height != tile_align_h) {
         tiling->tile0.height = align(DIV_ROUND_UP(tiling->tile0.height, 2), tile_align_h);
         tiling->tile_count.height = 2;
      }
   }
}

static void
tu_tiling_config_update_pipe_layout(struct tu_tiling_config *tiling,
                                    const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;

   /* start from 1 tile per pipe */
   tiling->pipe0 = (VkExtent2D) {
      .width = 1,
      .height = 1,
   };
   tiling->pipe_count = tiling->tile_count;

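   /* Grow the pipe footprint in its narrower dimension until the pipe grid
    * fits in the available VSC pipes. For example (hypothetical numbers): a
    * 10x6 tile grid with 32 pipes starts at 10x6 = 60 pipes, then one
    * iteration grows pipe0 to 1x2, giving a 10x3 = 30 pipe grid.
    */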
   while (tiling->pipe_count.width * tiling->pipe_count.height > max_pipe_count) {
      if (tiling->pipe0.width < tiling->pipe0.height) {
         tiling->pipe0.width += 1;
         tiling->pipe_count.width =
            DIV_ROUND_UP(tiling->tile_count.width, tiling->pipe0.width);
      } else {
         tiling->pipe0.height += 1;
         tiling->pipe_count.height =
            DIV_ROUND_UP(tiling->tile_count.height, tiling->pipe0.height);
      }
   }
}

static void
tu_tiling_config_update_pipes(struct tu_tiling_config *tiling,
                              const struct tu_device *dev)
{
   const uint32_t max_pipe_count =
      dev->physical_device->info->num_vsc_pipes;
   const uint32_t used_pipe_count =
      tiling->pipe_count.width * tiling->pipe_count.height;
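   /* The pipes in the last row/column may cover fewer tiles than pipe0.
    * E.g. (hypothetically) 10 tiles across with pipe0.width = 4 gives pipes
    * covering 4, 4, and (10 - 1) % 4 + 1 = 2 tiles.
    */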
   const VkExtent2D last_pipe = {
      .width = (tiling->tile_count.width - 1) % tiling->pipe0.width + 1,
      .height = (tiling->tile_count.height - 1) % tiling->pipe0.height + 1,
   };

   assert(used_pipe_count <= max_pipe_count);
   assert(max_pipe_count <= ARRAY_SIZE(tiling->pipe_config));

   for (uint32_t y = 0; y < tiling->pipe_count.height; y++) {
      for (uint32_t x = 0; x < tiling->pipe_count.width; x++) {
         const uint32_t pipe_x = tiling->pipe0.width * x;
         const uint32_t pipe_y = tiling->pipe0.height * y;
         const uint32_t pipe_w = (x == tiling->pipe_count.width - 1)
                                    ? last_pipe.width
                                    : tiling->pipe0.width;
         const uint32_t pipe_h = (y == tiling->pipe_count.height - 1)
                                    ? last_pipe.height
                                    : tiling->pipe0.height;
         const uint32_t n = tiling->pipe_count.width * y + x;

         tiling->pipe_config[n] = A6XX_VSC_PIPE_CONFIG_REG_X(pipe_x) |
                                  A6XX_VSC_PIPE_CONFIG_REG_Y(pipe_y) |
                                  A6XX_VSC_PIPE_CONFIG_REG_W(pipe_w) |
                                  A6XX_VSC_PIPE_CONFIG_REG_H(pipe_h);
         tiling->pipe_sizes[n] = CP_SET_BIN_DATA5_0_VSC_SIZE(pipe_w * pipe_h);
      }
   }

   memset(tiling->pipe_config + used_pipe_count, 0,
          sizeof(uint32_t) * (max_pipe_count - used_pipe_count));
}

static bool
is_hw_binning_possible(const struct tu_tiling_config *tiling)
{
   /* Similar to older gens, # of tiles per pipe cannot be more than 32.
    * But there are no hangs with 16 or more tiles per pipe in either
    * X or Y direction, so that limit does not seem to apply.
    */
   uint32_t tiles_per_pipe = tiling->pipe0.width * tiling->pipe0.height;
   return tiles_per_pipe <= 32;
}

static void
tu_tiling_config_update_binning(struct tu_tiling_config *tiling, const struct tu_device *device)
{
   tiling->binning_possible = is_hw_binning_possible(tiling);

   if (tiling->binning_possible) {
      tiling->binning = (tiling->tile_count.width * tiling->tile_count.height) > 2;

      if (TU_DEBUG(FORCEBIN))
         tiling->binning = true;
      if (TU_DEBUG(NOBIN))
         tiling->binning = false;
   } else {
      tiling->binning = false;
   }
}

void
tu_framebuffer_tiling_config(struct tu_framebuffer *fb,
                             const struct tu_device *device,
                             const struct tu_render_pass *pass)
{
   for (int gmem_layout = 0; gmem_layout < TU_GMEM_LAYOUT_COUNT; gmem_layout++) {
      struct tu_tiling_config *tiling = &fb->tiling[gmem_layout];
      tu_tiling_config_update_tile_layout(fb, device, pass,
                                          (enum tu_gmem_layout) gmem_layout);
      tu_tiling_config_update_pipe_layout(tiling, device);
      tu_tiling_config_update_pipes(tiling, device);
      tu_tiling_config_update_binning(tiling, device);
   }
}

void
tu_dbg_log_gmem_load_store_skips(struct tu_device *device)
{
   static uint32_t last_skipped_loads = 0;
   static uint32_t last_skipped_stores = 0;
   static uint32_t last_total_loads = 0;
   static uint32_t last_total_stores = 0;
   static struct timespec last_time = {};

   pthread_mutex_lock(&device->submit_mutex);

   struct timespec current_time;
   clock_gettime(CLOCK_MONOTONIC, &current_time);

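   /* Rate-limit the log output to at most once per second (1e9 ns). */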
   if (timespec_sub_to_nsec(&current_time, &last_time) > 1000 * 1000 * 1000) {
      last_time = current_time;
   } else {
      pthread_mutex_unlock(&device->submit_mutex);
      return;
   }

   struct tu6_global *global = device->global_bo_map;

   uint32_t current_taken_loads = global->dbg_gmem_taken_loads;
   uint32_t current_taken_stores = global->dbg_gmem_taken_stores;
   uint32_t current_total_loads = global->dbg_gmem_total_loads;
   uint32_t current_total_stores = global->dbg_gmem_total_stores;

   uint32_t skipped_loads = current_total_loads - current_taken_loads;
   uint32_t skipped_stores = current_total_stores - current_taken_stores;

   uint32_t current_time_frame_skipped_loads = skipped_loads - last_skipped_loads;
   uint32_t current_time_frame_skipped_stores = skipped_stores - last_skipped_stores;

   uint32_t current_time_frame_total_loads = current_total_loads - last_total_loads;
   uint32_t current_time_frame_total_stores = current_total_stores - last_total_stores;

   mesa_logi("[GMEM] loads total: %u skipped: %.1f%%\n",
             current_time_frame_total_loads,
             current_time_frame_skipped_loads / (float) current_time_frame_total_loads * 100.f);
   mesa_logi("[GMEM] stores total: %u skipped: %.1f%%\n",
             current_time_frame_total_stores,
             current_time_frame_skipped_stores / (float) current_time_frame_total_stores * 100.f);

   last_skipped_loads = skipped_loads;
   last_skipped_stores = skipped_stores;
   last_total_loads = current_total_loads;
   last_total_stores = current_total_stores;

   pthread_mutex_unlock(&device->submit_mutex);
}