/*
 * Copyright © 2021 Igalia S.L.
 * SPDX-License-Identifier: MIT
 */

#ifndef TU_AUTOTUNE_H
#define TU_AUTOTUNE_H

#include "tu_common.h"

#include "util/hash_table.h"
#include "util/rwlock.h"

#include "tu_suballoc.h"

struct tu_renderpass_history;

/**
 * "autotune" our decisions about bypass vs GMEM rendering, based on
 * historical data about a given render target.
 *
 * In deciding which path to take there are tradeoffs, including some that
 * are not reasonably estimable without some additional information:
 *
 *  (1) If you know you are touching every pixel (i.e. there is a clear),
 *      then the GMEM path will at least not cost more memory bandwidth
 *      than sysmem[1].
 *
 *  (2) If there is no clear, GMEM could potentially cost *more* bandwidth
 *      due to a sysmem->GMEM restore pass.
 *
 *  (3) A high draw count is an indication that enough pixels will be
 *      accessed multiple times to benefit from the reduced memory
 *      bandwidth that GMEM brings.
 *
 *  (4) But a high draw count without much overdraw can actually be faster
 *      in bypass mode if it pushes a lot of state change, due to not
 *      having to go through the state changes per-tile[1].
 *
 * The approach taken is to measure the samples-passed for the batch to
 * estimate the amount of overdraw, and detect cases where the number of
 * pixels touched is low.  See the illustrative sketch below.
 *
 * [1] Ignoring early-tile-exit optimizations, but any draw that touches
 *     all/most of the tiles late in the tile-pass can defeat that.
 */
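
/*
 * An illustrative sketch of the resulting heuristic (the real decision
 * logic lives in the autotune implementation; the names and the threshold
 * below are made up for exposition, not actual fields):
 *
 *    uint64_t avg_samples = history->avg_samples;   // from past results
 *    uint64_t pixels = (uint64_t)rp_width * rp_height;
 *
 *    // Few samples passed relative to the render area means little
 *    // overdraw, so tiling saves little bandwidth -> prefer bypass:
 *    bool use_bypass = avg_samples < pixels / 2;
 */
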
struct tu_autotune {

   /* We may have to disable the autotuner if there are too many
    * renderpasses in-flight.
    */
   bool enabled;

   struct tu_device *device;

   /**
    * Cache mapping a renderpass key to historical information about
    * rendering to that particular render target.
    */
   struct hash_table *ht;
   struct u_rwlock ht_lock;

   /**
    * List of per-renderpass results that we are waiting for the GPU
    * to finish with before reading back the results.
    */
   struct list_head pending_results;

   /**
    * List of per-submission data that we may want to free after we
    * have processed the submission results.  This can happen after the
    * command buffers which were in the submission are destroyed.
    */
   struct list_head pending_submission_data;

   /**
    * List of per-submission data that has been finished and can be
    * reused.
    */
   struct list_head submission_data_pool;

   uint32_t fence_counter;
   uint32_t idx_counter;
};

/**
 * From the cmdstream, the captured samples-passed values are recorded
 * at the start and end of the batch.
 *
 * Note that we do the math on the CPU to avoid a WFI.  But pre-emption
 * may force us to revisit that.
 */
struct PACKED tu_renderpass_samples {
   uint64_t samples_start;
   /* HW requires the sample start/stop locations to be 128-bit aligned. */
   uint64_t __pad0;
   uint64_t samples_end;
   uint64_t __pad1;
};

/* Necessary when writing sample counts using CP_EVENT_WRITE7::ZPASS_DONE. */
static_assert(offsetof(struct tu_renderpass_samples, samples_end) == 16);

/**
 * Tracks the results from an individual renderpass.  Initially created
 * per renderpass, and appended to the tail of at->pending_results.  At a
 * later time, when the GPU has finished writing the results, we fill in
 * samples_passed.
 */
struct tu_renderpass_result {
   /* Points into GPU memory */
   struct tu_renderpass_samples *samples;

   struct tu_suballoc_bo bo;

   /*
    * Below here, only used internally within autotune
    */
   uint64_t rp_key;
   struct tu_renderpass_history *history;
   struct list_head node;
   uint32_t fence;
   uint64_t samples_passed;
};

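/*
 * A minimal sketch of the CPU-side math mentioned above (the actual
 * readback lives in the autotune implementation): once the autotune
 * fence shows the GPU is done with a result, the subtraction that lets
 * us avoid a WFI is just:
 *
 *    result->samples_passed =
 *       result->samples->samples_end - result->samples->samples_start;
 */
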
VkResult tu_autotune_init(struct tu_autotune *at, struct tu_device *dev);
void tu_autotune_fini(struct tu_autotune *at, struct tu_device *dev);

/**
 * A magic 8-ball that tells the gmem code whether we should do bypass
 * mode for moar fps.
 */
bool tu_autotune_use_bypass(struct tu_autotune *at,
                            struct tu_cmd_buffer *cmd_buffer,
                            struct tu_renderpass_result **autotune_result);
void tu_autotune_free_results(struct tu_device *dev, struct list_head *results);

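/*
 * Example (an illustrative sketch; `dev`, `cmd` and the surrounding flow
 * are assumptions, not part of this header): at renderpass begin, ask the
 * autotuner for a decision and keep the result object for later GPU
 * readback:
 *
 *    struct tu_renderpass_result *autotune_result = NULL;
 *    bool bypass =
 *       tu_autotune_use_bypass(&dev->autotune, cmd, &autotune_result);
 *    if (bypass)
 *       ... render directly to sysmem ...
 *    else
 *       ... take the tiled GMEM path ...
 */
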
bool tu_autotune_submit_requires_fence(struct tu_cmd_buffer **cmd_buffers,
                                       uint32_t cmd_buffer_count);

/**
 * Called at queue submission: processes pending results and returns a
 * command stream, to be executed at the end of the submission, that
 * advances the autotune fence so we know when the GPU has finished
 * writing the per-renderpass results.
 */
struct tu_cs *tu_autotune_on_submit(struct tu_device *dev,
                                    struct tu_autotune *at,
                                    struct tu_cmd_buffer **cmd_buffers,
                                    uint32_t cmd_buffer_count);

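/*
 * Example (an illustrative sketch; the queue-submit surroundings are an
 * assumption): only append the extra fence cs when some command buffer in
 * the submission actually carries autotune results:
 *
 *    if (tu_autotune_submit_requires_fence(cmd_buffers, cmd_buffer_count)) {
 *       struct tu_cs *autotune_cs =
 *          tu_autotune_on_submit(dev, &dev->autotune,
 *                                cmd_buffers, cmd_buffer_count);
 *       ... execute autotune_cs as the last entry of the submission ...
 *    }
 */
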
struct tu_autotune_results_buffer;

template <chip CHIP>
void tu_autotune_begin_renderpass(struct tu_cmd_buffer *cmd,
                                  struct tu_cs *cs,
                                  struct tu_renderpass_result *autotune_result);

template <chip CHIP>
void tu_autotune_end_renderpass(struct tu_cmd_buffer *cmd,
                                struct tu_cs *cs,
                                struct tu_renderpass_result *autotune_result);

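/*
 * Example (an illustrative sketch; the calling code is an assumption):
 * the per-chip begin/end hooks bracket the renderpass so that the
 * ZPASS_DONE sample counters land in autotune_result->samples:
 *
 *    tu_autotune_begin_renderpass<CHIP>(cmd, cs, autotune_result);
 *    ... emit the renderpass draws ...
 *    tu_autotune_end_renderpass<CHIP>(cmd, cs, autotune_result);
 */
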
#endif /* TU_AUTOTUNE_H */