/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_autotune.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_image.h"
#include "tu_pass.h"

/* How does it work?
 *
 * - For each renderpass we calculate the number of samples passed by
 *   storing the sample counter before and after the pass in GPU memory.
 * - To store the values, each command buffer holds GPU memory which
 *   grows as more renderpasses are recorded.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 * - Later on, each tu_renderpass_result is added to the
 *   tu_renderpass_history entry which aggregates the results for a
 *   given renderpass.
 * - On submission:
 *   - Process the results whose fence has been signalled.
 *   - Free the per-submission data which is no longer needed.
 *
 *   - Create a command stream which writes a fence value, so we know
 *     when it is safe to read the results.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources since the buffer could be destroyed before we process
 *     the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move (if ONE_TIME_SUBMIT) or copy all tu_renderpass_result entries
 *       to the queue.
 *
 * Since command buffers can be recorded on different threads, we have to
 * maintain some amount of locking around the history table. However, the
 * table is only modified from a single thread at submission time, so in
 * most cases there is no contention.
 */

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);

#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries when the autotuner is finished;
 * can be used to gather data from traces.
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* How many of the most recent renderpass stats are taken into account. */
#define MAX_HISTORY_RESULTS 5
/* For how many submissions we keep renderpass stats around. */
#define MAX_HISTORY_LIFETIME 128


/**
 * Tracks results for a given renderpass key
 */
struct tu_renderpass_history {
   uint64_t key;

   /* Fence of the last submission that used this entry; used to
    * delete old history entries.
    */
   uint32_t last_fence;

   /**
    * List of recent tu_renderpass_result's
    */
   struct list_head results;
   uint32_t num_results;

   uint32_t avg_samples;
};

/* Holds the per-submission cs which writes the fence. */
struct tu_submission_data {
   struct list_head node;
   uint32_t fence;

   struct tu_cs fence_cs;
};

static bool
fence_before(uint32_t a, uint32_t b)
{
   /* essentially a < b, but handle wrapped values */
   return (int32_t)(a - b) < 0;
}

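/* Read the fence value most recently written by the GPU into the global BO;
 * results recorded with a fence value not after it are safe to read.
 */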
static uint32_t
get_autotune_fence(struct tu_autotune *at)
{
   return at->device->global_bo_map->autotune_fence;
}

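/* Emit a CACHE_FLUSH_TS event that writes the submission's fence value into
 * tu6_global::autotune_fence once the GPU has finished the preceding work.
 */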
template <chip CHIP>
static void
create_submission_fence(struct tu_device *dev,
                        struct tu_cs *cs,
                        uint32_t fence)
{
   uint64_t dst_iova = dev->global_bo->iova + gb_offset(autotune_fence);
   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
      tu_cs_emit(cs,
                 CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS,
                                   .write_src = EV_WRITE_USER_32B,
                                   .write_dst = EV_DST_RAM,
                                   .write_enabled = true).value);
   } else {
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
      tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   }

   tu_cs_emit_qw(cs, dst_iova);
   tu_cs_emit(cs, fence);
}

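/* Take a tu_submission_data from the pool (or allocate a new one) and record
 * the command stream that writes this submission's fence value.
 */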
static struct tu_submission_data *
create_submission_data(struct tu_device *dev, struct tu_autotune *at,
                       uint32_t fence)
{
   struct tu_submission_data *submission_data = NULL;
   if (!list_is_empty(&at->submission_data_pool)) {
      submission_data = list_first_entry(&at->submission_data_pool,
                                         struct tu_submission_data, node);
      list_del(&submission_data->node);
   } else {
      submission_data = (struct tu_submission_data *) calloc(
         1, sizeof(struct tu_submission_data));
      tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5, "autotune fence cs");
   }
   submission_data->fence = fence;

   struct tu_cs *fence_cs = &submission_data->fence_cs;
   tu_cs_begin(fence_cs);
   TU_CALLX(dev, create_submission_fence)(dev, fence_cs, fence);
   tu_cs_end(fence_cs);

   list_addtail(&submission_data->node, &at->pending_submission_data);

   return submission_data;
}

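/* Return a submission's data to the pool once its fence has been signalled. */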
static void
finish_submission_data(struct tu_autotune *at,
                       struct tu_submission_data *data)
{
   list_del(&data->node);
   list_addtail(&data->node, &at->submission_data_pool);
   tu_cs_reset(&data->fence_cs);
}

static void
free_submission_data(struct tu_submission_data *data)
{
   list_del(&data->node);
   tu_cs_finish(&data->fence_cs);

   free(data);
}

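/* Hash the renderpass "instance": the framebuffer dimensions plus a few
 * per-attachment parameters, seeded with the renderpass' precomputed
 * autotune hash. This identifies recurring renderpasses across frames even
 * when the application recreates its framebuffer objects.
 */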
static uint64_t
hash_renderpass_instance(const struct tu_render_pass *pass,
                         const struct tu_framebuffer *framebuffer,
                         const struct tu_cmd_buffer *cmd)
{
   uint32_t data[3 + pass->attachment_count * 5];
   uint32_t *ptr = data;

   *ptr++ = framebuffer->width;
   *ptr++ = framebuffer->height;
   *ptr++ = framebuffer->layers;

   for (unsigned i = 0; i < pass->attachment_count; i++) {
      *ptr++ = cmd->state.attachments[i]->view.width;
      *ptr++ = cmd->state.attachments[i]->view.height;
      *ptr++ = cmd->state.attachments[i]->image->vk.format;
      *ptr++ = cmd->state.attachments[i]->image->vk.array_layers;
      *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels;
   }

   return XXH64(data, sizeof(data), pass->autotune_hash);
}

static void
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
   tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
   list_del(&result->node);
   free(result);
}

static void
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
   tu_autotune_free_results_locked(dev, &history->results);
   free(history);
}

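/* Look up the history entry for a renderpass key and, if it already has
 * results, return the cached average sample count.
 */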
static bool
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
{
   bool has_history = false;

   /* If lock contention is ever observed in the wild,
    * we could use try_lock here.
    */
   u_rwlock_rdlock(&at->ht_lock);
   struct hash_entry *entry =
      _mesa_hash_table_search(at->ht, &rp_key);
   if (entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      if (history->num_results > 0) {
         *avg_samples = p_atomic_read(&history->avg_samples);
         has_history = true;
      }
   }
   u_rwlock_rdunlock(&at->ht_lock);

   return has_history;
}

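/* Allocate a fresh result for this renderpass instance; it is linked to its
 * history entry at submission time.
 */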
static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
   struct tu_renderpass_result *result =
      (struct tu_renderpass_result *) calloc(1, sizeof(*result));
   result->rp_key = rp_key;

   return result;
}

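/* Append a processed result to its history, dropping the oldest result once
 * MAX_HISTORY_RESULTS is exceeded, and recompute the rolling average of
 * samples passed.
 */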
static void
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
                   struct tu_renderpass_result *result)
{
   list_delinit(&result->node);
   list_add(&result->node, &history->results);

   if (history->num_results < MAX_HISTORY_RESULTS) {
      history->num_results++;
   } else {
      /* Once above the limit, start popping old results off the
       * tail of the list:
       */
      struct tu_renderpass_result *old_result =
         list_last_entry(&history->results, struct tu_renderpass_result, node);
      mtx_lock(&dev->autotune_mutex);
      free_result(dev, old_result);
      mtx_unlock(&dev->autotune_mutex);
   }

   /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
   uint32_t total_samples = 0;
   list_for_each_entry(struct tu_renderpass_result, result,
                       &history->results, node) {
      total_samples += result->samples_passed;
   }

   float avg_samples = (float)total_samples / (float)history->num_results;
   p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
}

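/* Walk the pending results and per-submission data in submission order:
 * anything whose fence value is not after the fence the GPU has already
 * written can be consumed; the rest has to wait for a later call.
 */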
static void
process_results(struct tu_autotune *at, uint32_t current_fence)
{
   struct tu_device *dev = at->device;

   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            &at->pending_results, node) {
      if (fence_before(current_fence, result->fence))
         break;

      struct tu_renderpass_history *history = result->history;
      result->samples_passed =
         result->samples->samples_end - result->samples->samples_start;

      history_add_result(dev, history, result);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      if (fence_before(current_fence, submission_data->fence))
         break;

      finish_submission_data(at, submission_data);
   }
}

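/* Move a command buffer's results onto the queue-owned pending list. For
 * ONE_TIME_SUBMIT buffers the list is simply stolen; otherwise each result
 * is copied and its BO referenced so that it outlives the command buffer.
 */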
static void
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
{
   bool one_time_submit = cmdbuf->usage_flags &
         VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

   if (one_time_submit) {
      /* We can just steal the list since it won't be resubmitted again */
      list_splicetail(&cmdbuf->renderpass_autotune_results,
                      &at->pending_results);
      list_inithead(&cmdbuf->renderpass_autotune_results);
   } else {
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         /* TODO: copying each result isn't nice */
         struct tu_renderpass_result *copy =
            (struct tu_renderpass_result *) malloc(sizeof(*result));
         *copy = *result;
         tu_bo_get_ref(copy->bo.bo);
         list_addtail(&copy->node, &at->pending_results);
      }
   }
}

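/* Called at submission time (single-threaded with respect to the autotuner):
 * process previously signalled results, attach each new result to its
 * history entry, queue the results of the submitted command buffers, and
 * return the command stream that writes this submission's fence.
 */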
struct tu_cs *
tu_autotune_on_submit(struct tu_device *dev,
                      struct tu_autotune *at,
                      struct tu_cmd_buffer **cmd_buffers,
                      uint32_t cmd_buffer_count)
{
   /* We are single-threaded here */

   const uint32_t gpu_fence = get_autotune_fence(at);
   const uint32_t new_fence = at->fence_counter++;

   process_results(at, gpu_fence);

   /* Create history entries here to minimize work and locking being
    * done on renderpass end.
    */
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                               &cmdbuf->renderpass_autotune_results, node) {
         struct tu_renderpass_history *history;
         struct hash_entry *entry =
            _mesa_hash_table_search(at->ht, &result->rp_key);
         if (!entry) {
            history =
               (struct tu_renderpass_history *) calloc(1, sizeof(*history));
            history->key = result->rp_key;
            list_inithead(&history->results);

            u_rwlock_wrlock(&at->ht_lock);
            _mesa_hash_table_insert(at->ht, &history->key, history);
            u_rwlock_wrunlock(&at->ht_lock);
         } else {
            history = (struct tu_renderpass_history *) entry->data;
         }

         history->last_fence = new_fence;

         result->fence = new_fence;
         result->history = history;
      }
   }

   struct tu_submission_data *submission_data =
      create_submission_data(dev, at, new_fence);

   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (list_is_empty(&cmdbuf->renderpass_autotune_results))
         continue;

      queue_pending_results(at, cmdbuf);
   }

   if (TU_AUTOTUNE_DEBUG_LOG)
      mesa_logi("Total history entries: %u", at->ht->entries);

   /* Clean up old entries from the history table. The assumption here is
    * that the application doesn't hold on to many old, unsubmitted command
    * buffers; otherwise this table may grow big.
    */
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      if (fence_before(gpu_fence, history->last_fence + MAX_HISTORY_LIFETIME))
         continue;

      if (TU_AUTOTUNE_DEBUG_LOG)
         mesa_logi("Removed old history entry %016" PRIx64 "", history->key);

      u_rwlock_wrlock(&at->ht_lock);
      _mesa_hash_table_remove_key(at->ht, &history->key);
      u_rwlock_wrunlock(&at->ht_lock);

      mtx_lock(&dev->autotune_mutex);
      free_history(dev, history);
      mtx_unlock(&dev->autotune_mutex);
   }

   return &submission_data->fence_cs;
}

static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   return *(uint64_t *)_a == *(uint64_t *)_b;
}

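/* The 64-bit key is already a hash; truncating it is good enough for the
 * 32-bit table hash.
 */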
static uint32_t
renderpass_key_hash(const void *_a)
{
   return *((uint64_t *) _a) & 0xffffffff;
}

VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
   at->enabled = true;
   at->device = dev;
   at->ht = _mesa_hash_table_create(NULL,
                                    renderpass_key_hash,
                                    renderpass_key_equals);
   u_rwlock_init(&at->ht_lock);

   list_inithead(&at->pending_results);
   list_inithead(&at->pending_submission_data);
   list_inithead(&at->submission_data_pool);

   /* start from 1 because tu6_global::autotune_fence is initialized to 0 */
   at->fence_counter = 1;

   return VK_SUCCESS;
}

void
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
{
   if (TU_AUTOTUNE_LOG_AT_FINISH) {
      while (!list_is_empty(&at->pending_results)) {
         const uint32_t gpu_fence = get_autotune_fence(at);
         process_results(at, gpu_fence);
      }

      hash_table_foreach(at->ht, entry) {
         struct tu_renderpass_history *history =
            (struct tu_renderpass_history *) entry->data;

         mesa_logi("%016" PRIx64 " \tavg_passed=%u results=%u",
                   history->key, history->avg_samples, history->num_results);
      }
   }

   tu_autotune_free_results(dev, &at->pending_results);

   mtx_lock(&dev->autotune_mutex);
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      free_history(dev, history);
   }
   mtx_unlock(&dev->autotune_mutex);

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      free_submission_data(submission_data);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->submission_data_pool, node) {
      free_submission_data(submission_data);
   }

   _mesa_hash_table_destroy(at->ht, NULL);
   u_rwlock_destroy(&at->ht_lock);
}

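/* A submission only needs the autotune fence CS when at least one of its
 * command buffers recorded renderpass results.
 */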
bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                  uint32_t cmd_buffer_count)
{
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
         return true;
   }

   return false;
}

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            results, node) {
      free_result(dev, result);
   }
}

void
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
{
   mtx_lock(&dev->autotune_mutex);
   tu_autotune_free_results_locked(dev, results);
   mtx_unlock(&dev->autotune_mutex);
}

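/* Heuristic used when there is no history (or autotuning is disabled):
 * prefer sysmem (bypass) only for single-sampled renderpasses with at most
 * a handful of drawcalls.
 */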
static bool
fallback_use_bypass(const struct tu_render_pass *pass,
                    const struct tu_framebuffer *framebuffer,
                    const struct tu_cmd_buffer *cmd_buffer)
{
   if (cmd_buffer->state.rp.drawcall_count > 5)
      return false;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
         return false;
   }

   return true;
}

static uint32_t
get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
{
   const VkExtent2D *extent = &cmd->state.render_area.extent;
   return extent->width * extent->height;
}

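/* Estimate the memory traffic of all drawcalls in the renderpass: the
 * average number of samples passed times the average bandwidth a single
 * sample consumes across the recorded drawcalls.
 */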
static uint64_t
estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
                            uint32_t avg_renderpass_sample_count)
{
   const struct tu_cmd_state *state = &cmd->state;

   if (!state->rp.drawcall_count)
      return 0;

   /* sample count times drawcall_bandwidth_per_sample */
   return (uint64_t)avg_renderpass_sample_count *
      state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}

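/* Decide between sysmem (bypass) and gmem rendering for the current
 * renderpass by comparing the estimated memory bandwidth of both paths,
 * using the average number of samples passed by previous instances of the
 * same renderpass.
 */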
bool
tu_autotune_use_bypass(struct tu_autotune *at,
                       struct tu_cmd_buffer *cmd_buffer,
                       struct tu_renderpass_result **autotune_result)
{
   const struct tu_render_pass *pass = cmd_buffer->state.pass;
   const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;

   /* If a feedback loop in the subpass caused one of the pipelines used to set
    * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even
    * SINGLE_PRIM_MODE(FLUSH), then that should cause significantly increased
    * sysmem bandwidth (though we haven't quantified it).
    */
   if (cmd_buffer->state.rp.sysmem_single_prim_mode)
      return false;

   /* If the user is using a fragment density map, then this will cause fewer
    * FS invocations with GMEM, which has a hard-to-measure impact on
    * performance because it depends on how heavy the FS is in addition to how
    * many invocations there were and the density. Let's assume the user knows
    * what they're doing when they added the map, because if sysmem were
    * actually faster then they could've just not used the fragment density
    * map.
    */
   if (pass->has_fdm)
      return false;

   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
    * we would have to allocate GPU memory at submit time and copy the
    * results into it.
    * Native games usually don't use it, Zink and DXVK don't use it,
    * and D3D12 doesn't have such a concept.
    */
   bool simultaneous_use =
      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   if (!at->enabled || simultaneous_use)
      return fallback_use_bypass(pass, framebuffer, cmd_buffer);

   /* We use a 64-bit hash as the key since we don't fear a rare hash
    * collision: the worst that could happen is sysmem being selected when
    * it should not have been, and with 64 bits that would be extremely rare.
    *
    * Q: Why not make the key from framebuffer + renderpass pointers?
    * A: At least DXVK creates new framebuffers each frame while keeping
    *    the renderpasses the same. Also we want to support replaying a
    *    single frame in a loop for testing.
    */
   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);

   *autotune_result = create_history_result(at, renderpass_key);

   uint32_t avg_samples = 0;
   if (get_history(at, renderpass_key, &avg_samples)) {
      const uint32_t pass_pixel_count =
         get_render_pass_pixel_count(cmd_buffer);
      uint64_t sysmem_bandwidth =
         (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
      uint64_t gmem_bandwidth =
         (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;

      const uint64_t total_draw_call_bandwidth =
         estimate_drawcall_bandwidth(cmd_buffer, avg_samples);

      /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
      sysmem_bandwidth += total_draw_call_bandwidth;

      /* drawcalls access gmem in gmem rendering, but we do not want to
       * ignore them completely, and the state changes between tiles also
       * have an overhead. The magic numbers of 11 and 10 are chosen
       * arbitrarily.
       */
      gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;

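      /* Hypothetical example (made-up numbers): a 1920x1080 render area is
       * ~2.07M pixels; with sysmem_bandwidth_per_pixel = 8 and
       * gmem_bandwidth_per_pixel = 12 the attachment traffic is ~16.6 MB vs
       * ~24.9 MB. If the drawcalls are estimated at 4 MB, sysmem totals
       * ~20.6 MB while gmem scores ~27.8 MB (24.9 * 1.1 + 4 * 0.1), so
       * sysmem would be selected.
       */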
      const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
      if (TU_AUTOTUNE_DEBUG_LOG) {
         const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
         const float drawcall_bandwidth_per_sample =
            (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
            cmd_buffer->state.rp.drawcall_count;

         mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
                   renderpass_key,
                   cmd_buffer->state.rp.drawcall_count,
                   select_sysmem ? "sysmem" : "gmem");
         mesa_logi(" avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
                   avg_samples,
                   drawcall_bandwidth_per_sample,
                   total_draw_call_bandwidth);
         mesa_logi(" render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
                   extent->width, extent->height,
                   pass->sysmem_bandwidth_per_pixel,
                   pass->gmem_bandwidth_per_pixel);
         mesa_logi(" sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
                   sysmem_bandwidth, gmem_bandwidth);
      }

      return select_sysmem;
   }

   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}

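/* Emit commands at the start of the renderpass that record the initial
 * samples-passed counter (ZPASS_DONE) into the result's suballocated BO.
 */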
template <chip CHIP>
void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   struct tu_device *dev = cmd->device;

   static const uint32_t size = sizeof(struct tu_renderpass_samples);

   mtx_lock(&dev->autotune_mutex);
   VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
   mtx_unlock(&dev->autotune_mutex);
   if (ret != VK_SUCCESS) {
      autotune_result->bo.iova = 0;
      return;
   }

   uint64_t result_iova = autotune_result->bo.iova;

   autotune_result->samples =
      (struct tu_renderpass_samples *) tu_suballoc_bo_map(
         &autotune_result->bo);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
   if (cmd->device->physical_device->info->a7xx.has_event_write_sample_count) {
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
      tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
                                       .write_sample_count = true).value);
      tu_cs_emit_qw(cs, result_iova);

      /* If the renderpass contains an occlusion query with its own ZPASS_DONE,
       * we have to provide a fake ZPASS_DONE event here to logically close the
       * previous one, preventing firmware from misbehaving due to nested events.
       * This writes into the samples_end field, which will be overwritten in
       * tu_autotune_end_renderpass.
       */
      if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) {
         tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
         tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
                                          .write_sample_count = true,
                                          .sample_count_end_offset = true,
                                          .write_accum_sample_count_diff = true).value);
         tu_cs_emit_qw(cs, result_iova);
      }
   } else {
      tu_cs_emit_regs(cs,
                      A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
      tu_cs_emit(cs, ZPASS_DONE);
   }
}
TU_GENX(tu_autotune_begin_renderpass);

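/* Emit the matching ZPASS_DONE at the end of the renderpass; the difference
 * between samples_end and samples_start is accumulated into the renderpass
 * history once the submission's fence has signalled.
 */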
template <chip CHIP>
void
tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                           struct tu_cs *cs,
                           struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   if (!autotune_result->bo.iova)
      return;

   uint64_t result_iova = autotune_result->bo.iova;

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   if (cmd->device->physical_device->info->a7xx.has_event_write_sample_count) {
      /* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE
       * event here, composing a pair of these events that firmware handles
       * without issue. This first event writes into the samples_end field and
       * the second event overwrites it. The second event also enables the
       * accumulation flag even when we don't use that result because the blob
       * always sets it.
       */
      if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) {
         tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
         tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
                                          .write_sample_count = true).value);
         tu_cs_emit_qw(cs, result_iova + offsetof(struct tu_renderpass_samples, samples_end));
      }

      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
      tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
                                       .write_sample_count = true,
                                       .sample_count_end_offset = true,
                                       .write_accum_sample_count_diff = true).value);
      tu_cs_emit_qw(cs, result_iova);
   } else {
      result_iova += offsetof(struct tu_renderpass_samples, samples_end);

      tu_cs_emit_regs(cs,
                      A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
      tu_cs_emit(cs, ZPASS_DONE);
   }
}
TU_GENX(tu_autotune_end_renderpass);