xref: /aosp_15_r20/external/mesa3d/src/asahi/lib/agx_scratch.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2023 Asahi Lina
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "agx_scratch.h"
7 #include "asahi/compiler/agx_compile.h"
8 #include "shaders/helper.h"
9 #include "util/u_hexdump.h"
10 #include "agx_bo.h"
11 #include "libagx_shaders.h"
12 #include "nir.h"
13 #include "nir_builder_opcodes.h"
14 
15 #define AGX_ADDR_SHIFT        8
16 #define AGX_THREADS_PER_GROUP 32
17 #define AGX_SPILL_UNIT_DWORDS 8
18 
19 // FIXME: What is the actual value here? Seems to be 96 + 8 or so?
20 #define AGX_MAX_SUBGROUPS_PER_CORE 128
21 
22 // Unknown if this goes higher.
23 #define AGX_MAX_SCRATCH_BLOCK_LOG4 6
24 #define AGX_MAX_SCRATCH_DWORDS                                                 \
25    ((AGX_SPILL_UNIT_DWORDS << (2 * AGX_MAX_SCRATCH_BLOCK_LOG4)) * 4)
26 
/* A scratch requirement expressed in the hardware's block encoding:
 * `count` blocks, each of AGX_SPILL_UNIT_DWORDS << (2 * log4_bsize) dwords
 * (see agx_scratch_get_spill_size()). */
struct spill_size {
   uint32_t log4_bsize; /* log base 4 of the block size, in spill units */
   uint32_t count;      /* number of blocks (0 when no scratch is needed) */
};
31 
32 struct agx_bo *
agx_build_helper(struct agx_device * dev)33 agx_build_helper(struct agx_device *dev)
34 {
35    struct agx_bo *bo = agx_bo_create(
36       dev, sizeof(libagx_g13_helper), 0,
37       AGX_BO_READONLY | AGX_BO_EXEC | AGX_BO_LOW_VA, "Helper shader");
38    assert(bo);
39    memcpy(bo->map, libagx_g13_helper, sizeof(libagx_g13_helper));
40 
41    if (dev->debug & AGX_DBG_SCRATCH)
42       fprintf(stderr, "Helper: 0x%" PRIx64 "\n", bo->va->addr);
43 
44    return bo;
45 }
46 
47 static struct spill_size
agx_scratch_get_spill_size(unsigned dwords)48 agx_scratch_get_spill_size(unsigned dwords)
49 {
50    if (!dwords) {
51       return (struct spill_size){0, 0};
52    }
53    assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
54 
55    unsigned log4 =
56       util_logbase2(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)) / 2;
57    unsigned blocks = DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS << (2 * log4));
58    if (log4 > AGX_MAX_SCRATCH_BLOCK_LOG4) {
59       // Max size case (4 blocks)
60       assert(log4 == (AGX_MAX_SCRATCH_BLOCK_LOG4 + 1));
61       log4--;
62       blocks = 4;
63    } else if (blocks == 4) {
64       // Non max size 4 block case, shift to next log4 unit for consistency.
65       log4++;
66       blocks = 1;
67    }
68 
69    return (struct spill_size){log4, blocks};
70 }
71 
72 unsigned
agx_scratch_get_bucket(uint32_t dwords)73 agx_scratch_get_bucket(uint32_t dwords)
74 {
75    /* For debugging/analysis purposes, scratch allocation sizes are
76     * divided into buckets. Since we only allocate a single global
77     * worst-case scratch buffer, these buckets do not have any meaning
78     * for the actual allocation mechanism. They are only used to log
79     * allocation sizes. We just use a simple log2 of the size here.
80     */
81 
82    if (!dwords)
83       return 0;
84    assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
85 
86    return MIN2(
87       AGX_SPILL_SIZE_BUCKETS - 1,
88       1 + util_logbase2_ceil(DIV_ROUND_UP(dwords, AGX_SPILL_UNIT_DWORDS)));
89 }
90 
/*
 * (Re)allocate the global scratch buffer to match the current worst case
 * recorded in scratch->size_dwords / scratch->subgroups across
 * scratch->num_cores cores.
 *
 * BO layout: [header][per-core blocklists][padding][per-core data blocks].
 * The header and blocklists are filled in here for consumption by the
 * helper shader, which hands blocks out to spilling subgroups.
 */
static void
agx_scratch_realloc(struct agx_scratch *scratch)
{
   /* Drop the previous buffer; it is replaced wholesale below. */
   if (scratch->buf)
      agx_bo_unreference(scratch->dev, scratch->buf);

   struct spill_size size = agx_scratch_get_spill_size(scratch->size_dwords);

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Scratch realloc: %d (%d:%d) x %d\n",
              scratch->size_dwords, size.log4_bsize, size.count,
              scratch->subgroups);

   /* Per-thread dwords in one block, and the block's byte footprint for a
    * whole 32-thread group. Round the recorded size up to what the block
    * encoding actually provides. */
   unsigned block_dwords = AGX_SPILL_UNIT_DWORDS << (2 * size.log4_bsize);
   size_t block_size_bytes = (AGX_THREADS_PER_GROUP * 4) * block_dwords;
   scratch->size_dwords = block_dwords * size.count;

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Block size: 0x%zx bytes (%d)\n", block_size_bytes,
              size.log4_bsize);

   unsigned block_count = size.count;

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "Block count: %d\n", block_count);

   /* Data bytes needed by one core: every subgroup gets its own run of
    * block_count blocks. */
   size_t core_alloc = block_size_bytes * block_count * scratch->subgroups;

   size_t header_size = sizeof(struct agx_helper_header);

   /* Blocklists follow the header: one agx_helper_block per subgroup, per
    * present core. */
   size_t blocklist_off = header_size;
   size_t blocklist_core_size =
      scratch->subgroups * sizeof(struct agx_helper_block);
   size_t blocklist_size = blocklist_core_size * scratch->num_cores;

   /* Data blocks start block-aligned after the metadata. */
   size_t blocks_off = align(header_size + blocklist_size, block_size_bytes);
   size_t total_alloc = blocks_off + core_alloc * scratch->num_cores;

   unsigned flags = 0;
#ifdef SCRATCH_DEBUG
   /* Writeback caching so the CPU-side hexdump in debug_post is coherent. */
   flags = AGX_BO_WRITEBACK;
#endif
   /* NOTE(review): the agx_bo_create() result is not NULL-checked before
    * the memset below — confirm allocation failure aborts elsewhere. */
   scratch->buf = agx_bo_create(scratch->dev, total_alloc, block_size_bytes,
                                flags, "Scratch");
   /* Zero only the metadata region (header + blocklists + padding). */
   memset(scratch->buf->map, 0, blocks_off);

   struct agx_helper_header *hdr = scratch->buf->map;
   scratch->header = hdr;

   uint64_t blocklist_gpu = scratch->buf->va->addr + blocklist_off;
   struct agx_helper_block *blocklist_cpu = scratch->buf->map + blocklist_off;

#ifdef SCRATCH_DEBUG
   scratch->blocklist = blocklist_cpu;
   scratch->data = scratch->buf->map + blocks_off;
   scratch->core_size = block_size_bytes * block_count * scratch->subgroups;
#endif

   uint64_t blocks_gpu = scratch->buf->va->addr + blocks_off;

   hdr->subgroups = scratch->subgroups;

   unsigned num_cores = 0;
   unsigned core_id;
   for (core_id = 0; core_id < AGX_MAX_CORE_ID; core_id++) {
#ifndef SCRATCH_DEBUG_CORES
      /* Core IDs are sparse: the cluster index sits above a power-of-two
       * per-cluster stride, the core index below it. Skip IDs for cores
       * that are absent or fused off in the core mask. */
      unsigned cores_per_cluster =
         util_next_power_of_two(scratch->dev->params.num_cores_per_cluster);
      unsigned cluster = core_id / cores_per_cluster;
      unsigned core = core_id % cores_per_cluster;
      if (cluster >= scratch->dev->params.num_clusters_total)
         break;
      if (core >= scratch->dev->params.num_cores_per_cluster ||
          !(scratch->dev->params.core_masks[cluster] & BITFIELD_BIT(core)))
         continue;
#endif
      num_cores++;
#ifdef SCRATCH_DEBUG
      scratch->core_present[core_id] = true;
#endif

      hdr->cores[core_id].blocklist = blocklist_gpu;

      for (unsigned sg = 0; sg < scratch->subgroups; sg++) {
         /* Each block descriptor packs the address (shifted by
          * AGX_ADDR_SHIFT) with size/valid bits in the low bits:
          * presumably BITFIELD_MASK(log4 + 1) encodes the block size on
          * the first block and 1 marks continuation blocks valid — TODO
          * confirm against the helper shader. */
         uint32_t mask = BITFIELD_MASK(size.log4_bsize + 1);
         assert(!(blocks_gpu & (block_size_bytes - 1)));

         uint32_t base = blocks_gpu >> AGX_ADDR_SHIFT;
         uint32_t stride = block_size_bytes >> AGX_ADDR_SHIFT;
         blocklist_cpu[sg].blocks[0] = mask | base;
         for (int block = 1; block <= 3; block++) {
            if (block_count >= (block + 1))
               blocklist_cpu[sg].blocks[block] = 1 | (base + block * stride);
            else
               blocklist_cpu[sg].blocks[block] = 0;
         }

         blocks_gpu += block_size_bytes * block_count;
      }

      blocklist_gpu += sizeof(struct agx_helper_block) * scratch->subgroups;
      blocklist_cpu += scratch->subgroups;
   }
   /* One past the last populated core ID; debug_pre/post iterate up to it. */
   scratch->max_core_id = core_id;
   assert(num_cores == scratch->num_cores);

   if (scratch->dev->debug & AGX_DBG_SCRATCH)
      fprintf(stderr, "New Scratch @ 0x%" PRIx64 " (size: 0x%zx)\n",
              scratch->buf->va->addr, scratch->buf->size);
}
201 
202 void
agx_scratch_alloc(struct agx_scratch * scratch,unsigned dwords,size_t subgroups)203 agx_scratch_alloc(struct agx_scratch *scratch, unsigned dwords,
204                   size_t subgroups)
205 {
206    bool realloc = false;
207 
208    if (!dwords)
209       return;
210 
211    assert(dwords <= AGX_MAX_SCRATCH_DWORDS && "Scratch size too large");
212 
213    if (!subgroups)
214       subgroups = AGX_MAX_SUBGROUPS_PER_CORE;
215 
216    subgroups = MIN2(AGX_MAX_SUBGROUPS_PER_CORE, subgroups);
217 
218    if (dwords > scratch->size_dwords) {
219       scratch->size_dwords = dwords;
220       realloc = true;
221    }
222 
223    if (subgroups > scratch->subgroups) {
224       scratch->subgroups = subgroups;
225       realloc = true;
226    }
227 
228    if (realloc) {
229       agx_scratch_realloc(scratch);
230    }
231 }
232 
233 void
agx_scratch_debug_pre(struct agx_scratch * scratch)234 agx_scratch_debug_pre(struct agx_scratch *scratch)
235 {
236    if (!scratch->buf)
237       return;
238 
239    for (int core = 0; core < scratch->max_core_id; core++) {
240       assert(!scratch->header->cores[core].alloc_cur);
241       scratch->header->cores[core].alloc_max = 0;
242       scratch->header->cores[core].alloc_failed = 0;
243       memset(scratch->header->cores[core].alloc_count, 0,
244              sizeof(scratch->header->cores[core].alloc_count));
245    }
246 }
247 
248 void
agx_scratch_debug_post(struct agx_scratch * scratch)249 agx_scratch_debug_post(struct agx_scratch *scratch)
250 {
251    if (!scratch->buf)
252       return;
253 
254    fprintf(stderr, "Scratch @ 0x%" PRIx64 "\n", scratch->buf->va->addr);
255 
256    for (int core = 0; core < scratch->max_core_id; core++) {
257       fprintf(stderr, "Core %3d: max %d, failed %d, counts:", core,
258               scratch->header->cores[core].alloc_max,
259               scratch->header->cores[core].alloc_failed);
260 
261       for (unsigned bucket = 0; bucket < AGX_SPILL_SIZE_BUCKETS; bucket++) {
262          fprintf(stderr, " %d:%-3d",
263                  bucket ? (AGX_SPILL_UNIT_DWORDS << (bucket - 1)) : 0,
264                  scratch->header->cores[core].alloc_count[bucket]);
265       }
266       fprintf(stderr, "\n");
267       assert(!scratch->header->cores[core].alloc_cur);
268       assert(!scratch->header->cores[core].alloc_failed);
269    }
270 
271 #ifdef SCRATCH_DEBUG
272    unsigned core_index = 0;
273    for (int core = 0; core < scratch->max_core_id; core++) {
274       if (!scratch->core_present[core])
275          continue;
276       void *p = scratch->data + scratch->core_size * core_index++;
277       fprintf(stderr, "\nCORE %d (0x%lx)\n", core, scratch->core_size);
278       u_hexdump(stderr, p, scratch->core_size, true);
279    }
280 #endif
281 }
282 
283 void
agx_scratch_init(struct agx_device * dev,struct agx_scratch * scratch)284 agx_scratch_init(struct agx_device *dev, struct agx_scratch *scratch)
285 {
286    memset(scratch, 0, sizeof(*scratch));
287 
288    scratch->dev = dev;
289 #ifdef SCRATCH_DEBUG_CORES
290    scratch->num_cores = SCRATCH_DEBUG_CORES;
291 #else
292    scratch->num_cores = 0;
293    for (unsigned cl = 0; cl < dev->params.num_clusters_total; cl++) {
294       scratch->num_cores += util_bitcount(dev->params.core_masks[cl]);
295    }
296 #endif
297 }
298 
299 void
agx_scratch_fini(struct agx_scratch * scratch)300 agx_scratch_fini(struct agx_scratch *scratch)
301 {
302    if (scratch->buf)
303       agx_bo_unreference(scratch->dev, scratch->buf);
304    scratch->buf = NULL;
305 }
306