/*
 * Copyright © 2021 Google, Inc.
 * SPDX-License-Identifier: MIT
 */

#include "freedreno_autotune.h"
#include "freedreno_batch.h"
#include "freedreno_util.h"

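/*
 * Autotune: uses per-framebuffer samples-passed results from previous
 * renders to decide whether a batch should be rendered in GMEM mode or
 * in bypass (sysmem) mode.
 */
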
/**
 * Tracks, for a given batch key (which maps to a FBO/framebuffer state),
 * a short history of recent fd_batch_result's for that framebuffer.
 *
 * ralloc parent is fd_autotune::ht
 */
struct fd_batch_history {
   struct fd_batch_key *key;

   /* Entry in fd_autotune::lru: */
   struct list_head node;

   unsigned num_results;

   /**
    * List of recent fd_batch_result's
    */
   struct list_head results;
#define MAX_RESULTS 5
};

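/*
 * Look up (or create) the fd_batch_history for the batch's key.  New
 * entries evict the least-recently-used history once the table grows
 * past the cap, and any hit is moved to the head of the LRU list.
 */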
static struct fd_batch_history *
get_history(struct fd_autotune *at, struct fd_batch *batch)
{
   struct fd_batch_history *history;

   /* draw batches should still have their key at this point. */
   assert(batch->key || batch->nondraw);
   if (!batch->key)
      return NULL;

   struct hash_entry *entry =
      _mesa_hash_table_search_pre_hashed(at->ht, batch->hash, batch->key);

   if (entry) {
      history = entry->data;
      goto found;
   }

   history = rzalloc_size(at->ht, sizeof(*history));

   history->key = fd_batch_key_clone(history, batch->key);
   list_inithead(&history->node);
   list_inithead(&history->results);

   /* Note: we cap the number of cached GMEM states at 20, so assuming
    * double-buffering, 40 should be a good place to cap cached autotune
    * state:
    */
   if (at->ht->entries >= 40) {
      struct fd_batch_history *last =
         list_last_entry(&at->lru, struct fd_batch_history, node);
      _mesa_hash_table_remove_key(at->ht, last->key);
      list_del(&last->node);
      ralloc_free(last);
   }

   _mesa_hash_table_insert_pre_hashed(at->ht, batch->hash, history->key,
                                      history);

found:
   /* Move to the head of the LRU: */
   list_delinit(&history->node);
   list_add(&history->node, &at->lru);

   return history;
}

static void
result_destructor(void *r)
{
   struct fd_batch_result *result = r;

   /* Just in case we manage to somehow still be on the pending_results list: */
   list_del(&result->node);
}

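/*
 * Allocate a result slot for the batch.  The fence counter is
 * pre-incremented so that zero is never a valid fence, and idx wraps
 * around the fixed number of slots in the results buffer.
 */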
static struct fd_batch_result *
get_result(struct fd_autotune *at, struct fd_batch_history *history)
{
   struct fd_batch_result *result = rzalloc_size(history, sizeof(*result));

   result->fence =
      ++at->fence_counter; /* pre-increment so zero isn't valid fence */
   result->idx = at->idx_counter++;

   if (at->idx_counter >= ARRAY_SIZE(at->results->result))
      at->idx_counter = 0;

   result->history = history;
   list_addtail(&result->node, &at->pending_results);

   ralloc_set_destructor(result, result_destructor);

   return result;
}

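/*
 * Drain pending results whose fence has passed, compute samples_passed
 * from the GPU-written counters, and file each result under its history
 * (capped at MAX_RESULTS by popping the oldest off the tail).
 */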
static void
process_results(struct fd_autotune *at)
{
   uint32_t current_fence = at->results->fence;

   list_for_each_entry_safe (struct fd_batch_result, result,
                             &at->pending_results, node) {
      if (result->fence > current_fence)
         break;

      struct fd_batch_history *history = result->history;

      result->samples_passed = at->results->result[result->idx].samples_end -
                               at->results->result[result->idx].samples_start;

      list_delinit(&result->node);
      list_add(&result->node, &history->results);

      if (history->num_results < MAX_RESULTS) {
         history->num_results++;
      } else {
         /* Once above a limit, start popping old results off the
          * tail of the list:
          */
         struct fd_batch_result *old_result =
            list_last_entry(&history->results, struct fd_batch_result, node);
         list_delinit(&old_result->node);
         ralloc_free(old_result);
      }
   }
}

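/*
 * Heuristic used when we have no (usable) historical data about the
 * render target: prefer bypass only for small, un-cleared, single-sampled
 * batches.
 */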
static bool
fallback_use_bypass(struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   /* Fallback logic if we have no historical data about the rendertarget: */
   if (batch->cleared || batch->gmem_reason ||
       (batch->num_draws > 5) || (pfb->samples > 1)) {
      return false;
   }

   return true;
}

/**
 * A magic 8-ball that tells the gmem code whether we should do bypass mode
 * for moar fps.
 */
bool
fd_autotune_use_bypass(struct fd_autotune *at, struct fd_batch *batch)
{
   struct pipe_framebuffer_state *pfb = &batch->framebuffer;

   process_results(at);

   /* Only enable on gens that opt in (and actually have sample-passed
    * collection wired up):
    */
   if (!batch->ctx->screen->gmem_reason_mask)
      return fallback_use_bypass(batch);

   if (batch->gmem_reason & ~batch->ctx->screen->gmem_reason_mask)
      return fallback_use_bypass(batch);

   for (unsigned i = 0; i < pfb->nr_cbufs; i++) {
      /* If ms-rtt is involved, force GMEM, as we don't currently
       * implement a temporary render target that we can MSAA resolve
       * from
       */
      if (pfb->cbufs[i] && pfb->cbufs[i]->nr_samples)
         return fallback_use_bypass(batch);
   }

   struct fd_batch_history *history = get_history(at, batch);
   if (!history)
      return fallback_use_bypass(batch);

   batch->autotune_result = get_result(at, history);
   batch->autotune_result->cost = batch->cost;

   bool use_bypass = fallback_use_bypass(batch);

   if (use_bypass)
      return true;

   if (history->num_results > 0) {
      uint32_t total_samples = 0;

      /* TODO we should account for clears somehow.
       * TODO should we try to notice if there is a drastic change from
       * frame to frame?
       */
      list_for_each_entry (struct fd_batch_result, result, &history->results,
                           node) {
         total_samples += result->samples_passed;
      }

      float avg_samples = (float)total_samples / (float)history->num_results;

      /* Low sample count could mean there was only a clear.. or there was
       * a clear plus draws that touch no or few samples
       */
      if (avg_samples < 500.0f)
         return true;

      /* Cost-per-sample is an estimate for the average number of reads+
       * writes for a given passed sample.
       */
      float sample_cost = batch->cost;
      sample_cost /= batch->num_draws;

      float total_draw_cost = (avg_samples * sample_cost) / batch->num_draws;
      DBG("%08x:%u\ttotal_samples=%u, avg_samples=%f, sample_cost=%f, "
          "total_draw_cost=%f\n",
          batch->hash, batch->num_draws, total_samples, avg_samples,
          sample_cost, total_draw_cost);

      if (total_draw_cost < 3000.0f)
         return true;
   }

   return use_bypass;
}

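/*
 * Expected usage (a sketch of the call sequence, not a definitive list of
 * call sites): create the state with fd_autotune_init() at device setup,
 * consult fd_autotune_use_bypass() when deciding how to render each draw
 * batch, and tear down with fd_autotune_fini().  The GPU writes
 * samples-passed counters into at->results_mem, which process_results()
 * reads back on the next fd_autotune_use_bypass() call.
 */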
void
fd_autotune_init(struct fd_autotune *at, struct fd_device *dev)
{
   at->ht =
      _mesa_hash_table_create(NULL, fd_batch_key_hash, fd_batch_key_equals);
   list_inithead(&at->lru);

   at->results_mem = fd_bo_new(dev, sizeof(struct fd_autotune_results),
                               0, "autotune");
   at->results = fd_bo_map(at->results_mem);

   list_inithead(&at->pending_results);
}

void
fd_autotune_fini(struct fd_autotune *at)
{
   _mesa_hash_table_destroy(at->ht, NULL);
   fd_bo_del(at->results_mem);
}