/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#include "tu_autotune.h"

#include "tu_cmd_buffer.h"
#include "tu_cs.h"
#include "tu_device.h"
#include "tu_image.h"
#include "tu_pass.h"

/* How does it work?
 *
 * - For each renderpass we calculate the number of samples passed
 *   by storing the sample counter in GPU memory before and after the
 *   renderpass.
 * - To store the values, each command buffer holds GPU memory which
 *   grows as more renderpasses are recorded.
 * - For each renderpass we create a tu_renderpass_result entry which
 *   points to the results in GPU memory.
 *   - Later on, the tu_renderpass_result is added to the
 *     tu_renderpass_history entry which aggregates results for a
 *     given renderpass.
 * - On submission:
 *   - Process results whose fence was signalled.
 *   - Free per-submission data which is no longer needed.
 *
 *   - Create a command stream to write a fence value. This way we know
 *     when we can safely read the results.
 *   - We cannot rely on the command buffer's lifetime when referencing
 *     its resources, since the buffer could be destroyed before we
 *     process the results.
 *   - For each command buffer:
 *     - Reference its GPU memory.
 *     - Move (if ONE_TIME_SUBMIT) or copy all tu_renderpass_result
 *       entries to the queue.
 *
 * Since command buffers can be recorded on different threads, we have to
 * maintain some amount of locking around the history table; however, the
 * table is only modified from a single thread at submission time, so in
 * most cases there will be no contention.
 */
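
/* A rough sketch of the submission-time flow (see tu_autotune_on_submit()
 * below):
 *
 *    const uint32_t gpu_fence = get_autotune_fence(at);
 *    const uint32_t new_fence = at->fence_counter++;
 *    process_results(at, gpu_fence);
 *    ... attach this submission's results to their history entries ...
 *    create_submission_data(dev, at, new_fence);  // emits the fence write
 */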

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results);

#define TU_AUTOTUNE_DEBUG_LOG 0
/* Dump history entries when the autotuner finishes; this could be used
 * to gather data from traces.
 */
#define TU_AUTOTUNE_LOG_AT_FINISH 0

/* How many of the most recent renderpass stats are taken into account. */
#define MAX_HISTORY_RESULTS 5
/* For how many submissions we keep renderpass stats. */
#define MAX_HISTORY_LIFETIME 128


/**
 * Tracks results for a given renderpass key
 */
struct tu_renderpass_history {
   uint64_t key;

   /* Fence of the last submission that used this entry; lets us delete
    * old history entries. */
   uint32_t last_fence;

   /**
    * List of recent tu_renderpass_result entries
    */
   struct list_head results;
   uint32_t num_results;

   uint32_t avg_samples;
};

/* Holds per-submission cs which writes the fence. */
struct tu_submission_data {
   struct list_head node;
   uint32_t fence;

   struct tu_cs fence_cs;
};

static bool
fence_before(uint32_t a, uint32_t b)
{
   /* essentially a < b, but handle wrapped values */
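   /* e.g. a = 0xfffffff0, b = 0x10: (int32_t)(a - b) == -32 < 0, so `a`
    * is still ordered before `b` even though it is numerically larger. */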
   return (int32_t)(a - b) < 0;
}

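/* The last fence value the GPU has written back; results whose fence is
 * at or before this value are safe to read. */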
static uint32_t
get_autotune_fence(struct tu_autotune *at)
{
   return at->device->global_bo_map->autotune_fence;
}

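/* Emit an event that writes `fence` to the global autotune_fence slot
 * once the preceding work has flushed, so the CPU can tell which results
 * are ready. */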
template <chip CHIP>
static void
create_submission_fence(struct tu_device *dev,
                        struct tu_cs *cs,
                        uint32_t fence)
{
   uint64_t dst_iova = dev->global_bo->iova + gb_offset(autotune_fence);
   if (CHIP >= A7XX) {
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 4);
      tu_cs_emit(cs,
         CP_EVENT_WRITE7_0(.event = CACHE_FLUSH_TS,
                           .write_src = EV_WRITE_USER_32B,
                           .write_dst = EV_DST_RAM,
                           .write_enabled = true).value);
   } else {
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 4);
      tu_cs_emit(cs, CP_EVENT_WRITE_0_EVENT(CACHE_FLUSH_TS));
   }

   tu_cs_emit_qw(cs, dst_iova);
   tu_cs_emit(cs, fence);
}

static struct tu_submission_data *
create_submission_data(struct tu_device *dev, struct tu_autotune *at,
                       uint32_t fence)
{
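   /* Reuse a pooled entry when possible to avoid reallocating the small
    * fence command stream on every submission. */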
   struct tu_submission_data *submission_data = NULL;
   if (!list_is_empty(&at->submission_data_pool)) {
      submission_data = list_first_entry(&at->submission_data_pool,
                                         struct tu_submission_data, node);
      list_del(&submission_data->node);
   } else {
      submission_data = (struct tu_submission_data *) calloc(
         1, sizeof(struct tu_submission_data));
      tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5, "autotune fence cs");
   }
   submission_data->fence = fence;

   struct tu_cs *fence_cs = &submission_data->fence_cs;
   tu_cs_begin(fence_cs);
   TU_CALLX(dev, create_submission_fence)(dev, fence_cs, fence);
   tu_cs_end(fence_cs);

   list_addtail(&submission_data->node, &at->pending_submission_data);

   return submission_data;
}

static void
finish_submission_data(struct tu_autotune *at,
                       struct tu_submission_data *data)
{
   list_del(&data->node);
   list_addtail(&data->node, &at->submission_data_pool);
   tu_cs_reset(&data->fence_cs);
}

static void
free_submission_data(struct tu_submission_data *data)
{
   list_del(&data->node);
   tu_cs_finish(&data->fence_cs);

   free(data);
}

static uint64_t
hash_renderpass_instance(const struct tu_render_pass *pass,
                         const struct tu_framebuffer *framebuffer,
                         const struct tu_cmd_buffer *cmd) {
   uint32_t data[3 + pass->attachment_count * 5];
   uint32_t *ptr = data;

   *ptr++ = framebuffer->width;
   *ptr++ = framebuffer->height;
   *ptr++ = framebuffer->layers;

   for (unsigned i = 0; i < pass->attachment_count; i++) {
      *ptr++ = cmd->state.attachments[i]->view.width;
      *ptr++ = cmd->state.attachments[i]->view.height;
      *ptr++ = cmd->state.attachments[i]->image->vk.format;
      *ptr++ = cmd->state.attachments[i]->image->vk.array_layers;
      *ptr++ = cmd->state.attachments[i]->image->vk.mip_levels;
   }

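   /* pass->autotune_hash is used as the XXH64 seed, so renderpasses that
    * differ only in pass state still map to distinct keys. */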
   return XXH64(data, sizeof(data), pass->autotune_hash);
}

static void
free_result(struct tu_device *dev, struct tu_renderpass_result *result)
{
   tu_suballoc_bo_free(&dev->autotune_suballoc, &result->bo);
   list_del(&result->node);
   free(result);
}

static void
free_history(struct tu_device *dev, struct tu_renderpass_history *history)
{
   tu_autotune_free_results_locked(dev, &history->results);
   free(history);
}

static bool
get_history(struct tu_autotune *at, uint64_t rp_key, uint32_t *avg_samples)
{
   bool has_history = false;

   /* If lock contention is ever observed in the wild,
    * we could use try_lock here.
    */
   u_rwlock_rdlock(&at->ht_lock);
   struct hash_entry *entry =
      _mesa_hash_table_search(at->ht, &rp_key);
   if (entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      if (history->num_results > 0) {
         *avg_samples = p_atomic_read(&history->avg_samples);
         has_history = true;
      }
   }
   u_rwlock_rdunlock(&at->ht_lock);

   return has_history;
}

static struct tu_renderpass_result *
create_history_result(struct tu_autotune *at, uint64_t rp_key)
{
   struct tu_renderpass_result *result =
      (struct tu_renderpass_result *) calloc(1, sizeof(*result));
   result->rp_key = rp_key;

   return result;
}

static void
history_add_result(struct tu_device *dev, struct tu_renderpass_history *history,
                   struct tu_renderpass_result *result)
{
   list_delinit(&result->node);
   list_add(&result->node, &history->results);

   if (history->num_results < MAX_HISTORY_RESULTS) {
      history->num_results++;
   } else {
      /* Once above the limit, start popping old results off the
       * tail of the list:
       */
      struct tu_renderpass_result *old_result =
         list_last_entry(&history->results, struct tu_renderpass_result, node);
      mtx_lock(&dev->autotune_mutex);
      free_result(dev, old_result);
      mtx_unlock(&dev->autotune_mutex);
   }

   /* Do calculations here to avoid locking history in tu_autotune_use_bypass */
   uint32_t total_samples = 0;
   list_for_each_entry(struct tu_renderpass_result, result,
                       &history->results, node) {
      total_samples += result->samples_passed;
   }

   float avg_samples = (float)total_samples / (float)history->num_results;
   p_atomic_set(&history->avg_samples, (uint32_t)avg_samples);
}

static void
process_results(struct tu_autotune *at, uint32_t current_fence)
{
   struct tu_device *dev = at->device;

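   /* Both lists below are ordered by submission fence, so we can stop at
    * the first entry the GPU has not yet signalled. */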
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            &at->pending_results, node) {
      if (fence_before(current_fence, result->fence))
         break;

      struct tu_renderpass_history *history = result->history;
      result->samples_passed =
         result->samples->samples_end - result->samples->samples_start;

      history_add_result(dev, history, result);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      if (fence_before(current_fence, submission_data->fence))
         break;

      finish_submission_data(at, submission_data);
   }
}

static void
queue_pending_results(struct tu_autotune *at, struct tu_cmd_buffer *cmdbuf)
{
   bool one_time_submit = cmdbuf->usage_flags &
         VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;

   if (one_time_submit) {
      /* We can just steal the list since it won't be resubmitted again */
      list_splicetail(&cmdbuf->renderpass_autotune_results,
                        &at->pending_results);
      list_inithead(&cmdbuf->renderpass_autotune_results);
   } else {
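      /* The buffer may be submitted again, so the queue needs its own
       * copies; taking a BO reference keeps the results valid even if the
       * command buffer is destroyed first. */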
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                              &cmdbuf->renderpass_autotune_results, node) {
         /* TODO: copying each result isn't nice */
         struct tu_renderpass_result *copy =
            (struct tu_renderpass_result *) malloc(sizeof(*result));
         *copy = *result;
         tu_bo_get_ref(copy->bo.bo);
         list_addtail(&copy->node, &at->pending_results);
      }
   }
}

struct tu_cs *
tu_autotune_on_submit(struct tu_device *dev,
                      struct tu_autotune *at,
                      struct tu_cmd_buffer **cmd_buffers,
                      uint32_t cmd_buffer_count)
{
   /* We are single-threaded here */

   const uint32_t gpu_fence = get_autotune_fence(at);
   const uint32_t new_fence = at->fence_counter++;

   process_results(at, gpu_fence);

   /* Create history entries here to minimize the work and locking done
    * at renderpass end.
    */
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      list_for_each_entry_safe(struct tu_renderpass_result, result,
                          &cmdbuf->renderpass_autotune_results, node) {
         struct tu_renderpass_history *history;
         struct hash_entry *entry =
            _mesa_hash_table_search(at->ht, &result->rp_key);
         if (!entry) {
            history =
               (struct tu_renderpass_history *) calloc(1, sizeof(*history));
            history->key = result->rp_key;
            list_inithead(&history->results);

            u_rwlock_wrlock(&at->ht_lock);
            _mesa_hash_table_insert(at->ht, &history->key, history);
            u_rwlock_wrunlock(&at->ht_lock);
         } else {
            history = (struct tu_renderpass_history *) entry->data;
         }

         history->last_fence = new_fence;

         result->fence = new_fence;
         result->history = history;
      }
   }

   struct tu_submission_data *submission_data =
      create_submission_data(dev, at, new_fence);

   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (list_is_empty(&cmdbuf->renderpass_autotune_results))
         continue;

      queue_pending_results(at, cmdbuf);
   }

   if (TU_AUTOTUNE_DEBUG_LOG)
      mesa_logi("Total history entries: %u", at->ht->entries);

   /* Clean up old entries from the history table. The assumption here is
    * that the application doesn't hold many old unsubmitted command
    * buffers; otherwise this table may grow large.
    */
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      if (fence_before(gpu_fence, history->last_fence + MAX_HISTORY_LIFETIME))
         continue;

      if (TU_AUTOTUNE_DEBUG_LOG)
         mesa_logi("Removed old history entry %016" PRIx64 "", history->key);

      u_rwlock_wrlock(&at->ht_lock);
      _mesa_hash_table_remove_key(at->ht, &history->key);
      u_rwlock_wrunlock(&at->ht_lock);

      mtx_lock(&dev->autotune_mutex);
      free_history(dev, history);
      mtx_unlock(&dev->autotune_mutex);
   }

   return &submission_data->fence_cs;
}

static bool
renderpass_key_equals(const void *_a, const void *_b)
{
   return *(uint64_t *)_a == *(uint64_t *)_b;
}

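/* The 64-bit key is truncated for bucketing; full equality is still
 * checked by renderpass_key_equals, so collisions only cost a compare. */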
static uint32_t
renderpass_key_hash(const void *_a)
{
   return *((uint64_t *) _a) & 0xffffffff;
}

VkResult
tu_autotune_init(struct tu_autotune *at, struct tu_device *dev)
{
   at->enabled = true;
   at->device = dev;
   at->ht = _mesa_hash_table_create(NULL,
                                    renderpass_key_hash,
                                    renderpass_key_equals);
   u_rwlock_init(&at->ht_lock);

   list_inithead(&at->pending_results);
   list_inithead(&at->pending_submission_data);
   list_inithead(&at->submission_data_pool);

   /* start from 1 because tu6_global::autotune_fence is initialized to 0 */
   at->fence_counter = 1;

   return VK_SUCCESS;
}

void
tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev)
{
   if (TU_AUTOTUNE_LOG_AT_FINISH) {
      while (!list_is_empty(&at->pending_results)) {
         const uint32_t gpu_fence = get_autotune_fence(at);
         process_results(at, gpu_fence);
      }

      hash_table_foreach(at->ht, entry) {
         struct tu_renderpass_history *history =
            (struct tu_renderpass_history *) entry->data;

         mesa_logi("%016" PRIx64 " \tavg_passed=%u results=%u",
                   history->key, history->avg_samples, history->num_results);
      }
   }

   tu_autotune_free_results(dev, &at->pending_results);

   mtx_lock(&dev->autotune_mutex);
   hash_table_foreach(at->ht, entry) {
      struct tu_renderpass_history *history =
         (struct tu_renderpass_history *) entry->data;
      free_history(dev, history);
   }
   mtx_unlock(&dev->autotune_mutex);

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->pending_submission_data, node) {
      free_submission_data(submission_data);
   }

   list_for_each_entry_safe(struct tu_submission_data, submission_data,
                            &at->submission_data_pool, node) {
      free_submission_data(submission_data);
   }

   _mesa_hash_table_destroy(at->ht, NULL);
   u_rwlock_destroy(&at->ht_lock);
}

bool
tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                  uint32_t cmd_buffer_count)
{
   for (uint32_t i = 0; i < cmd_buffer_count; i++) {
      struct tu_cmd_buffer *cmdbuf = cmd_buffers[i];
      if (!list_is_empty(&cmdbuf->renderpass_autotune_results))
         return true;
   }

   return false;
}

void
tu_autotune_free_results_locked(struct tu_device *dev, struct list_head *results)
{
   list_for_each_entry_safe(struct tu_renderpass_result, result,
                            results, node) {
      free_result(dev, result);
   }
}

void
tu_autotune_free_results(struct tu_device *dev, struct list_head *results)
{
   mtx_lock(&dev->autotune_mutex);
   tu_autotune_free_results_locked(dev, results);
   mtx_unlock(&dev->autotune_mutex);
}

static bool
fallback_use_bypass(const struct tu_render_pass *pass,
                    const struct tu_framebuffer *framebuffer,
                    const struct tu_cmd_buffer *cmd_buffer)
{
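   /* Without history we fall back to a simple guess: small renderpasses
    * (few draws, single-sampled) are likely cheaper in sysmem (bypass)
    * mode. */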
   if (cmd_buffer->state.rp.drawcall_count > 5)
      return false;

   for (unsigned i = 0; i < pass->subpass_count; i++) {
      if (pass->subpasses[i].samples != VK_SAMPLE_COUNT_1_BIT)
         return false;
   }

   return true;
}

static uint32_t
get_render_pass_pixel_count(const struct tu_cmd_buffer *cmd)
{
   const VkExtent2D *extent = &cmd->state.render_area.extent;
   return extent->width * extent->height;
}

static uint64_t
estimate_drawcall_bandwidth(const struct tu_cmd_buffer *cmd,
                            uint32_t avg_renderpass_sample_count)
{
   const struct tu_cmd_state *state = &cmd->state;

   if (!state->rp.drawcall_count)
      return 0;

   /* Scale the average per-sample drawcall bandwidth (the per-draw sum
    * divided by the drawcall count) by the renderpass' average sample
    * count. */
   return (uint64_t)avg_renderpass_sample_count *
      state->rp.drawcall_bandwidth_per_sample_sum / state->rp.drawcall_count;
}

bool
tu_autotune_use_bypass(struct tu_autotune *at,
                       struct tu_cmd_buffer *cmd_buffer,
                       struct tu_renderpass_result **autotune_result)
{
   const struct tu_render_pass *pass = cmd_buffer->state.pass;
   const struct tu_framebuffer *framebuffer = cmd_buffer->state.framebuffer;

   /* If a feedback loop in the subpass caused one of the pipelines used to set
    * SINGLE_PRIM_MODE(FLUSH_PER_OVERLAP_AND_OVERWRITE) or even
    * SINGLE_PRIM_MODE(FLUSH), then that should cause significantly increased
    * sysmem bandwidth (though we haven't quantified it).
    */
   if (cmd_buffer->state.rp.sysmem_single_prim_mode)
      return false;

   /* If the user is using a fragment density map, then this will cause fewer
    * FS invocations with GMEM, which has a hard-to-measure impact on
    * performance because it depends on how heavy the FS is in addition to how
    * many invocations there were and the density. Let's assume the user knows
    * what they're doing when they added the map, because if sysmem were
    * actually faster then they could've just not used the fragment density
    * map.
    */
   if (pass->has_fdm)
      return false;

   /* For VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT buffers
    * we would have to allocate GPU memory at submit time and copy
    * the results into it.
    * Native games usually don't use it, Zink and DXVK don't use it,
    * and D3D12 doesn't have such a concept.
    */
   bool simultaneous_use =
      cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT;

   if (!at->enabled || simultaneous_use)
      return fallback_use_bypass(pass, framebuffer, cmd_buffer);

   /* We use a 64-bit hash as the key since we don't fear a rare hash
    * collision: the worst that could happen is sysmem being selected when
    * it should not have been, and with 64 bits that would be extremely
    * rare.
    *
    * Q: Why not make the key from framebuffer + renderpass pointers?
    * A: At least DXVK creates new framebuffers each frame while keeping
    *    renderpasses the same. Also we want to support replaying a single
    *    frame in a loop for testing.
    */
   uint64_t renderpass_key = hash_renderpass_instance(pass, framebuffer, cmd_buffer);

   *autotune_result = create_history_result(at, renderpass_key);

   uint32_t avg_samples = 0;
   if (get_history(at, renderpass_key, &avg_samples)) {
      const uint32_t pass_pixel_count =
         get_render_pass_pixel_count(cmd_buffer);
      uint64_t sysmem_bandwidth =
         (uint64_t)pass->sysmem_bandwidth_per_pixel * pass_pixel_count;
      uint64_t gmem_bandwidth =
         (uint64_t)pass->gmem_bandwidth_per_pixel * pass_pixel_count;

      const uint64_t total_draw_call_bandwidth =
         estimate_drawcall_bandwidth(cmd_buffer, avg_samples);

      /* drawcalls access the memory in sysmem rendering (ignoring CCU) */
      sysmem_bandwidth += total_draw_call_bandwidth;

      /* drawcalls access gmem in gmem rendering, but we do not want to ignore
       * them completely.  The state changes between tiles also have an
       * overhead.  The magic numbers of 11 and 10 are randomly chosen.
       */
      gmem_bandwidth = (gmem_bandwidth * 11 + total_draw_call_bandwidth) / 10;

      const bool select_sysmem = sysmem_bandwidth <= gmem_bandwidth;
      if (TU_AUTOTUNE_DEBUG_LOG) {
         const VkExtent2D *extent = &cmd_buffer->state.render_area.extent;
         const float drawcall_bandwidth_per_sample =
            (float)cmd_buffer->state.rp.drawcall_bandwidth_per_sample_sum /
            cmd_buffer->state.rp.drawcall_count;

         mesa_logi("autotune %016" PRIx64 ":%u selecting %s",
               renderpass_key,
               cmd_buffer->state.rp.drawcall_count,
               select_sysmem ? "sysmem" : "gmem");
         mesa_logi("   avg_samples=%u, draw_bandwidth_per_sample=%.2f, total_draw_call_bandwidth=%" PRIu64,
               avg_samples,
               drawcall_bandwidth_per_sample,
               total_draw_call_bandwidth);
         mesa_logi("   render_area=%ux%u, sysmem_bandwidth_per_pixel=%u, gmem_bandwidth_per_pixel=%u",
               extent->width, extent->height,
               pass->sysmem_bandwidth_per_pixel,
               pass->gmem_bandwidth_per_pixel);
         mesa_logi("   sysmem_bandwidth=%" PRIu64 ", gmem_bandwidth=%" PRIu64,
               sysmem_bandwidth, gmem_bandwidth);
      }

      return select_sysmem;
   }

   return fallback_use_bypass(pass, framebuffer, cmd_buffer);
}

template <chip CHIP>
void
tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                             struct tu_cs *cs,
                             struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   struct tu_device *dev = cmd->device;

   static const uint32_t size = sizeof(struct tu_renderpass_samples);

   mtx_lock(&dev->autotune_mutex);
   VkResult ret = tu_suballoc_bo_alloc(&autotune_result->bo, &dev->autotune_suballoc, size, size);
   mtx_unlock(&dev->autotune_mutex);
   if (ret != VK_SUCCESS) {
      autotune_result->bo.iova = 0;
      return;
   }

   uint64_t result_iova = autotune_result->bo.iova;

   autotune_result->samples =
      (struct tu_renderpass_samples *) tu_suballoc_bo_map(
         &autotune_result->bo);

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));
   if (cmd->device->physical_device->info->a7xx.has_event_write_sample_count) {
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
      tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
                                       .write_sample_count = true).value);
      tu_cs_emit_qw(cs, result_iova);

      /* If the renderpass contains an occlusion query with its own ZPASS_DONE,
       * we have to provide a fake ZPASS_DONE event here to logically close the
       * previous one, preventing firmware from misbehaving due to nested events.
       * This writes into the samples_end field, which will be overwritten in
       * tu_autotune_end_renderpass.
       */
      if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) {
         tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
         tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
                                          .write_sample_count = true,
                                          .sample_count_end_offset = true,
                                          .write_accum_sample_count_diff = true).value);
         tu_cs_emit_qw(cs, result_iova);
      }
   } else {
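      /* Legacy path: ZPASS_DONE writes the current sample counter to
       * RB_SAMPLE_COUNT_ADDR, which here points at samples_start. */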
      tu_cs_emit_regs(cs,
                        A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
      tu_cs_emit(cs, ZPASS_DONE);
   }
}
TU_GENX(tu_autotune_begin_renderpass);

template <chip CHIP>
void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result)
{
   if (!autotune_result)
      return;

   if (!autotune_result->bo.iova)
      return;

   uint64_t result_iova = autotune_result->bo.iova;

   tu_cs_emit_regs(cs, A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   if (cmd->device->physical_device->info->a7xx.has_event_write_sample_count) {
      /* If the renderpass contains ZPASS_DONE events we emit a fake ZPASS_DONE
       * event here, composing a pair of these events that firmware handles without
       * issue. This first event writes into the samples_end field and the second
       * event overwrites it. The second event also enables the accumulation flag
       * even when we don't use that result because the blob always sets it.
       */
      if (cmd->state.rp.has_zpass_done_sample_count_write_in_rp) {
         tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
         tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
                                          .write_sample_count = true).value);
         tu_cs_emit_qw(cs, result_iova + offsetof(struct tu_renderpass_samples, samples_end));
      }

      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE7, 3);
      tu_cs_emit(cs, CP_EVENT_WRITE7_0(.event = ZPASS_DONE,
                                       .write_sample_count = true,
                                       .sample_count_end_offset = true,
                                       .write_accum_sample_count_diff = true).value);
      tu_cs_emit_qw(cs, result_iova);
   } else {
      result_iova += offsetof(struct tu_renderpass_samples, samples_end);

      tu_cs_emit_regs(cs,
                        A6XX_RB_SAMPLE_COUNT_ADDR(.qword = result_iova));
      tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
      tu_cs_emit(cs, ZPASS_DONE);
   }
}
TU_GENX(tu_autotune_end_renderpass);
744