/*
 * Copyright © 2021 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#include "freedreno_autotune.h"
#include "freedreno_batch.h"
#include "freedreno_util.h"

/**
 * Tracks, for a given batch key (which maps to a FBO/framebuffer state),
 * a history of recent sample-passed results, used to decide between GMEM
 * and bypass rendering the next time a matching batch is seen.
 *
 * ralloc parent is fd_autotune::ht
 */
struct fd_batch_history {
   struct fd_batch_key *key;

   /* Entry in fd_autotune::lru: */
   struct list_head node;

   unsigned num_results;

   /**
    * List of recent fd_batch_result's
    */
   struct list_head results;
#define MAX_RESULTS 5
};
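
/* The fd_autotune state itself is declared in freedreno_autotune.h.  As a
 * rough sketch inferred from how it is used in this file (field types and
 * ordering are assumptions, not the authoritative definition):
 *
 *    struct fd_autotune {
 *       struct hash_table *ht;            // fd_batch_key -> fd_batch_history
 *       struct list_head lru;             // LRU of histories, head = newest
 *       struct fd_bo *results_mem;        // GPU buffer results are written to
 *       struct fd_autotune_results *results;   // CPU mapping of results_mem
 *       struct list_head pending_results; // results not yet read back
 *       uint32_t fence_counter;           // source of per-result fence values
 *       uint32_t idx_counter;             // next slot in results->result[]
 *    };
 */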

static struct fd_batch_history *
get_history(struct fd_autotune *at, struct fd_batch *batch)
{
   struct fd_batch_history *history;

   /* Draw batches should still have their key at this point. */
   assert(batch->key || batch->nondraw);
   if (!batch->key)
      return NULL;

   struct hash_entry *entry =
      _mesa_hash_table_search_pre_hashed(at->ht, batch->hash, batch->key);

   if (entry) {
      history = entry->data;
      goto found;
   }

   history = rzalloc_size(at->ht, sizeof(*history));

   history->key = fd_batch_key_clone(history, batch->key);
   list_inithead(&history->node);
   list_inithead(&history->results);

   /* Note: we cap the # of cached GMEM states at 20, so assuming double-
    * buffering, 40 should be a good place to cap cached autotune state.
    */
   if (at->ht->entries >= 40) {
      struct fd_batch_history *last =
         list_last_entry(&at->lru, struct fd_batch_history, node);
      _mesa_hash_table_remove_key(at->ht, last->key);
      list_del(&last->node);
      ralloc_free(last);
   }

   _mesa_hash_table_insert_pre_hashed(at->ht, batch->hash, history->key,
                                      history);

found:
   /* Move to the head of the LRU: */
   list_delinit(&history->node);
   list_add(&history->node, &at->lru);

   return history;
}

static void
result_destructor(void *r)
{
   struct fd_batch_result *result = r;

   /* Just in case we manage to somehow still be on the pending_results list: */
   list_del(&result->node);
}

static struct fd_batch_result *
get_result(struct fd_autotune *at, struct fd_batch_history *history)
{
   struct fd_batch_result *result = rzalloc_size(history, sizeof(*result));

   result->fence =
      ++at->fence_counter; /* pre-increment so zero isn't a valid fence */
   result->idx = at->idx_counter++;

   if (at->idx_counter >= ARRAY_SIZE(at->results->result))
      at->idx_counter = 0;

   result->history = history;
   list_addtail(&result->node, &at->pending_results);

   ralloc_set_destructor(result, result_destructor);

   return result;
}

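/* Layout of the shared results buffer, again a sketch inferred from the
 * accesses below (the real definition is in freedreno_autotune.h): a fence
 * the GPU advances as results land, plus a fixed-size ring of per-batch
 * sample counters indexed by fd_batch_result::idx:
 *
 *    struct fd_autotune_results {
 *       uint32_t fence;
 *       struct {
 *          uint64_t samples_start;
 *          uint64_t samples_end;
 *       } result[...];          // size is whatever ARRAY_SIZE() sees
 *    };
 */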
static void
process_results(struct fd_autotune *at)
{
   uint32_t current_fence = at->results->fence;

   list_for_each_entry_safe (struct fd_batch_result, result,
                             &at->pending_results, node) {
      if (result->fence > current_fence)
         break;

      struct fd_batch_history *history = result->history;

      result->samples_passed = at->results->result[result->idx].samples_end -
                               at->results->result[result->idx].samples_start;

      list_delinit(&result->node);
      list_add(&result->node, &history->results);

      if (history->num_results < MAX_RESULTS) {
         history->num_results++;
      } else {
         /* Once above a limit, start popping old results off the
          * tail of the list:
          */
         struct fd_batch_result *old_result =
            list_last_entry(&history->results, struct fd_batch_result, node);
         list_delinit(&old_result->node);
         ralloc_free(old_result);
      }
   }
}

static bool
fallback_use_bypass(struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   /* Fallback logic if we have no historical data about the rendertarget: */
   if (batch->cleared || batch->gmem_reason ||
       (batch->num_draws > 5) || (pfb->samples > 1)) {
      return false;
   }

   return true;
}

/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 */
bool
fd_autotune_use_bypass(struct fd_autotune *at, struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   process_results(at);

   /* Only enable on gens that opt in (and actually have sample-passed
    * collection wired up):
    */
   if (!batch->ctx->screen->gmem_reason_mask)
      return fallback_use_bypass(batch);

   if (batch->gmem_reason & ~batch->ctx->screen->gmem_reason_mask)
      return fallback_use_bypass(batch);

   for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
      /* If ms-rtt is involved, force GMEM, as we don't currently
       * implement a temporary render target that we can MSAA resolve
       * from.
       */
      if (pfb->cbufs[i] && pfb->cbufs[i]->nr_samples)
         return fallback_use_bypass(batch);
   }

   struct fd_batch_history *history = get_history(at, batch);
   if (!history)
      return fallback_use_bypass(batch);

   batch->autotune_result = get_result(at, history);
   batch->autotune_result->cost = batch->cost;

   bool use_bypass = fallback_use_bypass(batch);

   if (use_bypass)
      return true;

   if (history->num_results > 0) {
      uint32_t total_samples = 0;

      // TODO we should account for clears somehow
      // TODO should we try to notice if there is a drastic change from
      // frame to frame?
      list_for_each_entry (struct fd_batch_result, result, &history->results,
                           node) {
         total_samples += result->samples_passed;
      }

      float avg_samples = (float)total_samples / (float)history->num_results;

      /* A low sample count could mean there was only a clear, or a clear
       * plus draws that touch no or few samples.
       */
      if (avg_samples < 500.0f)
         return true;

      /* Cost-per-sample is an estimate for the average number of reads+
       * writes for a given passed sample.
       */
      float sample_cost = batch->cost;
      sample_cost /= batch->num_draws;

      float total_draw_cost = (avg_samples * sample_cost) / batch->num_draws;
      DBG("%08x:%u\ttotal_samples=%u, avg_samples=%f, sample_cost=%f, "
          "total_draw_cost=%f\n",
          batch->hash, batch->num_draws, total_samples, avg_samples,
          sample_cost, total_draw_cost);
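
      /* Worked example with made-up numbers: avg_samples=100000, cost=20,
       * num_draws=10 gives sample_cost = 20/10 = 2.0 and total_draw_cost =
       * (100000 * 2.0) / 10 = 20000, well above the threshold, so we stay
       * on the GMEM path; a light batch with avg_samples=1000, cost=5,
       * num_draws=5 gives total_draw_cost = (1000 * 1.0) / 5 = 200 and
       * takes the bypass path below.
       */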

      if (total_draw_cost < 3000.0f)
         return true;
   }

   return use_bypass;
}

void
fd_autotune_init(struct fd_autotune *at, struct fd_device *dev)
{
   at->ht =
      _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals);
   list_inithead(&at->lru);

   at->results_mem = fd_bo_new(dev, sizeof(struct fd_autotune_results),
                               0, "autotune");
   at->results = fd_bo_map(at->results_mem);

   list_inithead(&at->pending_results);
}

void
fd_autotune_fini(struct fd_autotune *at)
{
   _mesa_hash_table_destroy(at->ht, NULL);
   fd_bo_del(at->results_mem);
}
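
/* Typical lifecycle, sketched with assumed call sites (the real callers
 * live elsewhere in the freedreno driver):
 *
 *    struct fd_autotune at;
 *    fd_autotune_init(&at, dev);                        // once, at init
 *    ...
 *    bool bypass = fd_autotune_use_bypass(&at, batch);  // per batch flush
 *    ...
 *    fd_autotune_fini(&at);                             // at teardown
 */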