xref: /aosp_15_r20/external/mesa3d/src/intel/common/intel_compute_slm.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright 2024 Intel Corporation
3  * SPDX-License-Identifier: MIT
4  */
5 
6 #include "intel_compute_slm.h"
7 
8 #include <assert.h>
9 
10 #include "util/macros.h"
11 #include "util/u_math.h"
12 
/* Pairs a shared local memory size, in kB, with the value that encodes it. */
struct slm_encode {
  uint32_t encode;
  uint32_t size_in_kb;
};

/**
 * Find the smallest entry of a size table that can hold a request.
 *
 * table must be sorted by ascending size_in_kb and table_len must be
 * non-zero. bytes is rounded up to whole kB before the search.
 *
 * Returns the first entry whose size_in_kb is at least the rounded request.
 * A request larger than the last entry trips the assert; when asserts are
 * compiled out the last entry is returned instead.
 */
static inline struct slm_encode *
slm_encode_lookup(struct slm_encode *table, unsigned int table_len, uint32_t bytes)
{
   /* Ceiling division written so that it cannot wrap: "bytes + 1023" would
    * overflow for requests close to UINT32_MAX and turn them into 0 kB,
    * which would then match the zero-size entry instead of being rejected.
    */
   const uint32_t kbytes = bytes / 1024 + (bytes % 1024 != 0);
   unsigned int i;

   assert(table_len > 0);
   assert(kbytes <= table[table_len - 1].size_in_kb);
   for (i = 0; i < table_len; i++) {
      if (table[i].size_in_kb >= kbytes)
         return &table[i];
   }

   /* Only reachable for an out-of-range request when asserts are disabled. */
   return &table[table_len - 1];
}
32 
33 static struct slm_encode xe2_slm_allocation_size_table[] = {
34   { .encode = 0x0, .size_in_kb = 0, },
35   { .encode = 0x1, .size_in_kb = 1, },
36   { .encode = 0x2, .size_in_kb = 2, },
37   { .encode = 0x3, .size_in_kb = 4, },
38   { .encode = 0x4, .size_in_kb = 8, },
39   { .encode = 0x5, .size_in_kb = 16, },
40   { .encode = 0x8, .size_in_kb = 24, },
41   { .encode = 0x6, .size_in_kb = 32, },
42   { .encode = 0x9, .size_in_kb = 48, },
43   { .encode = 0x7, .size_in_kb = 64, },
44   { .encode = 0xA, .size_in_kb = 96, },
45   { .encode = 0xB, .size_in_kb = 128, },
46   { .encode = 0xC, .size_in_kb = 192, },
47   { .encode = 0xD, .size_in_kb = 256, },
48   { .encode = 0xE, .size_in_kb = 384, },
49 };
50 
51 /* Shared Local Memory Size is specified as powers of two,
52  * and also have a Gen-dependent minimum value if not zero.
53  */
54 uint32_t
intel_compute_slm_calculate_size(unsigned gen,uint32_t bytes)55 intel_compute_slm_calculate_size(unsigned gen, uint32_t bytes)
56 {
57    if (gen >= 20) {
58       struct slm_encode *slm_encode;
59 
60       slm_encode = slm_encode_lookup(xe2_slm_allocation_size_table,
61                                      ARRAY_SIZE(xe2_slm_allocation_size_table),
62                                      bytes);
63       return slm_encode->size_in_kb * 1024;
64    }
65 
66    assert(bytes <= 64 * 1024);
67    if (bytes > 0)
68       return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096);
69    else
70       return 0;
71 }
72 
73 uint32_t
intel_compute_slm_encode_size(unsigned gen,uint32_t bytes)74 intel_compute_slm_encode_size(unsigned gen, uint32_t bytes)
75 {
76    uint32_t slm_size;
77 
78    if (bytes == 0)
79       return 0;
80 
81    if (gen >= 20) {
82       struct slm_encode *slm_encode;
83 
84       slm_encode = slm_encode_lookup(xe2_slm_allocation_size_table,
85                                      ARRAY_SIZE(xe2_slm_allocation_size_table),
86                                      bytes);
87       return slm_encode->encode;
88    }
89 
90    /* Shared Local Memory is specified as powers of two, and encoded in
91     * INTERFACE_DESCRIPTOR_DATA with the following representations:
92     *
93     * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
94     * -------------------------------------------------------------------
95     * Gfx7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
96     * -------------------------------------------------------------------
97     * Gfx9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
98     */
99 
100    slm_size = intel_compute_slm_calculate_size(gen, bytes);
101    assert(util_is_power_of_two_nonzero(slm_size));
102 
103    if (gen >= 9) {
104       /* Turn an exponent of 10 (1024 kB) into 1. */
105       assert(slm_size >= 1024);
106       slm_size = ffs(slm_size) - 10;
107    } else {
108       assert(slm_size >= 4096);
109       /* Convert to the pre-Gfx9 representation. */
110       slm_size = slm_size / 4096;
111    }
112 
113    return slm_size;
114 }
115 
116 /* encode = 0 sets to largest SLM size supported in subslice */
117 static struct slm_encode preferred_slm_allocation_size_table[] = {
118    { .encode = 0x8, .size_in_kb = 0, },
119    { .encode = 0x9, .size_in_kb = 16, },
120    { .encode = 0xa, .size_in_kb = 32, },
121    { .encode = 0xb, .size_in_kb = 64, },
122    { .encode = 0xc, .size_in_kb = 96, },
123    { .encode = 0xd, .size_in_kb = 128, },
124 };
125 
126 static struct slm_encode xe2_preferred_slm_allocation_size_table[] = {
127   { .encode = 0x0, .size_in_kb = 0, },
128   { .encode = 0x1, .size_in_kb = 16, },
129   { .encode = 0x2, .size_in_kb = 32, },
130   { .encode = 0x3, .size_in_kb = 64, },
131   { .encode = 0x4, .size_in_kb = 96, },
132   { .encode = 0x5, .size_in_kb = 128, },
133   { .encode = 0x6, .size_in_kb = 160, },
134   { .encode = 0x7, .size_in_kb = 192, },
135   { .encode = 0x8, .size_in_kb = 224, },
136   { .encode = 0x9, .size_in_kb = 256, },
137   { .encode = 0xA, .size_in_kb = 384, },
138 };
139 
140 static uint32_t
intel_compute_preferred_slm_encode_size(unsigned gen,uint32_t bytes)141 intel_compute_preferred_slm_encode_size(unsigned gen, uint32_t bytes)
142 {
143    struct slm_encode *table;
144    unsigned int table_len;
145 
146    if (gen >= 20) {
147       table = xe2_preferred_slm_allocation_size_table;
148       table_len = ARRAY_SIZE(xe2_preferred_slm_allocation_size_table);
149    } else {
150       table = preferred_slm_allocation_size_table;
151       table_len = ARRAY_SIZE(preferred_slm_allocation_size_table);
152    }
153 
154    return slm_encode_lookup(table, table_len, bytes)->encode;
155 }
156 
157 /**
158  * Compute a shared local memory size to be allocated for each sub-slice.
159  * It estimate how many workgroups will run concurrently per sub-slice and
160  * multiply that per each workgroup SLM size.
161  */
162 uint32_t
intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info * devinfo,const uint32_t slm_size_per_workgroup,const uint32_t invocations_per_workgroup,const uint8_t cs_simd)163 intel_compute_preferred_slm_calc_encode_size(const struct intel_device_info *devinfo,
164                                              const uint32_t slm_size_per_workgroup,
165                                              const uint32_t invocations_per_workgroup,
166                                              const uint8_t cs_simd)
167 {
168    const uint32_t max_preferred_slm_size = intel_device_info_get_max_preferred_slm_size(devinfo);
169    const uint32_t invocations_per_ss = intel_device_info_get_eu_count_first_subslice(devinfo) *
170                                        devinfo->num_thread_per_eu * cs_simd;
171    uint32_t preferred_slm_size;
172 
173    if (slm_size_per_workgroup) {
174       uint32_t workgroups_per_ss = invocations_per_ss / invocations_per_workgroup;
175 
176       preferred_slm_size = workgroups_per_ss * slm_size_per_workgroup;
177       preferred_slm_size = MIN2(preferred_slm_size, max_preferred_slm_size);
178    } else {
179       preferred_slm_size = 0;
180    }
181 
182    assert(preferred_slm_size >= slm_size_per_workgroup);
183    return intel_compute_preferred_slm_encode_size(devinfo->ver, preferred_slm_size);
184 }
185