/*
 * Copyright © 2024 Collabora Ltd.
 *
 * SPDX-License-Identifier: MIT
 */

#include <stddef.h>
#include <stdint.h>

#include "genxml/gen_macros.h"

#include "nir.h"
#include "nir_builder.h"

#include "pan_encoder.h"
#include "pan_shader.h"

#include "panvk_cmd_alloc.h"
#include "panvk_cmd_buffer.h"
#include "panvk_device.h"
#include "panvk_shader.h"

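/* Push-constant payload consumed by the descriptor-copy shader: GPU addresses
 * of the bound descriptor sets and of the per-type destination tables, plus
 * a copy table mapping each copy index to a source descriptor. */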
struct pan_nir_desc_copy_info {
   mali_ptr sets[MAX_SETS];
   mali_ptr tables[PANVK_BIFROST_DESC_TABLE_COUNT];
   mali_ptr img_attrib_table;
   struct {
      mali_ptr table;
      uint32_t limits[PANVK_BIFROST_DESC_TABLE_COUNT];
      uint32_t attrib_buf_idx_offset;
   } desc_copy;
   uint32_t set_desc_counts[MAX_SETS];
};

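/* Helpers loading a pan_nir_desc_copy_info field (or array slot) from the
 * push-constant buffer, based on the field's offset and size in the struct. */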
#define get_input_field(b, name)                                               \
   nir_load_push_constant(                                                     \
      b, 1, sizeof(((struct pan_nir_desc_copy_info *)0)->name) * 8,            \
      nir_imm_int(b, 0),                                                       \
      .base = offsetof(struct pan_nir_desc_copy_info, name),                   \
      .range = sizeof(((struct pan_nir_desc_copy_info *)0)->name))

#define get_input_array_slot(b, name, index)                                   \
   nir_load_push_constant(                                                     \
      b, 1, sizeof(((struct pan_nir_desc_copy_info *)0)->name[0]) * 8,         \
      nir_imul_imm(b, index,                                                   \
                   sizeof(((struct pan_nir_desc_copy_info *)0)->name[0])),     \
      .base = offsetof(struct pan_nir_desc_copy_info, name),                   \
      .range = sizeof(((struct pan_nir_desc_copy_info *)0)->name))

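/* Descriptor copy handles pack the source set index in their top 4 bits and
 * the index of the descriptor within that set in the low 28 bits. */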
static void
extract_desc_info_from_handle(nir_builder *b, nir_def *handle, nir_def **table,
                              nir_def **desc_idx)
{
   *table = nir_ushr_imm(b, handle, 28);
   *desc_idx = nir_iand_imm(b, handle, 0xfffffff);
}

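/* Copy a single descriptor from a set into one of the per-type descriptor
 * tables, zero-filling the destination slot when the source index is out of
 * bounds. */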
static void
set_to_table_copy(nir_builder *b, nir_def *set_ptr, nir_def *set_desc_count,
                  nir_def *src_desc_idx, nir_def *table_ptr,
                  nir_def *dst_desc_idx, unsigned element_size)
{
   /* The last binding can have
    * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT set, so we need to
    * make sure we don't do an out-of-bounds access on the source set. */
   nir_def *dst_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, element_size));

   nir_push_if(b, nir_ult(b, src_desc_idx, set_desc_count));
   {
      nir_def *src_offset =
         nir_u2u64(b, nir_imul_imm(b, src_desc_idx, PANVK_DESCRIPTOR_SIZE));
      nir_def *desc = nir_load_global(b, nir_iadd(b, set_ptr, src_offset),
                                      element_size, element_size / 4, 32);
      nir_store_global(b, nir_iadd(b, table_ptr, dst_offset), element_size,
                       desc, ~0);
   }
   nir_push_else(b, NULL);
   {
      nir_const_value v[] = {
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
      };

      nir_def *desc = nir_build_imm(b, element_size / 4, 32, v);
      nir_store_global(b, nir_iadd(b, table_ptr, dst_offset), element_size,
                       desc, ~0);
   }
   nir_pop_if(b, NULL);
}

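/* Image descriptor copy. The source descriptor is expanded into an attribute
 * descriptor plus a pair of attribute buffer descriptors, zero-filled when
 * the source index is out of bounds. */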
static void
set_to_table_img_copy(nir_builder *b, nir_def *set_ptr, nir_def *set_desc_count,
                      nir_def *src_desc_idx, nir_def *attrib_table_ptr,
                      nir_def *attrib_buf_table_ptr, nir_def *dst_desc_idx)
{
   /* The last binding can have
    * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT set, so we need to
    * make sure we don't do an out-of-bounds access on the source set. */
   const unsigned element_size = pan_size(ATTRIBUTE_BUFFER) * 2;
   const unsigned attrib_buf_comps = element_size / 4;
   const unsigned attrib_comps = pan_size(ATTRIBUTE) / 4;
   nir_def *attrib_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, pan_size(ATTRIBUTE)));
   nir_def *attrib_buf_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, element_size));

   nir_push_if(b, nir_ult(b, src_desc_idx, set_desc_count));
   {
      nir_def *attr_buf_idx_offset =
         get_input_field(b, desc_copy.attrib_buf_idx_offset);
      nir_def *src_offset =
         nir_u2u64(b, nir_imul_imm(b, src_desc_idx, PANVK_DESCRIPTOR_SIZE));
      nir_def *src_desc = nir_load_global(b, nir_iadd(b, set_ptr, src_offset),
                                          element_size, element_size / 4, 32);
      nir_def *fmt = nir_iand_imm(b, nir_channel(b, src_desc, 2), 0xfffffc00);

      /* Each image descriptor takes two attribute buffer slots, and we need
       * to add the attribute buffer offset to have images working with the
       * vertex shader. */
      nir_def *buf_idx =
         nir_iadd(b, nir_imul_imm(b, dst_desc_idx, 2), attr_buf_idx_offset);

      nir_def *attrib_w1 = nir_ior(b, buf_idx, fmt);

      nir_def *attrib_desc = nir_vec2(b, attrib_w1, nir_imm_int(b, 0));

      nir_store_global(b, nir_iadd(b, attrib_table_ptr, attrib_offset),
                       pan_size(ATTRIBUTE), attrib_desc,
                       nir_component_mask(attrib_comps));

      nir_def *attrib_buf_desc = nir_vec8(
         b, nir_channel(b, src_desc, 0), nir_channel(b, src_desc, 1),
         nir_iand_imm(b, nir_channel(b, src_desc, 2), BITFIELD_MASK(10)),
         nir_channel(b, src_desc, 3), nir_channel(b, src_desc, 4),
         nir_channel(b, src_desc, 5), nir_channel(b, src_desc, 6),
         nir_channel(b, src_desc, 7));
      nir_store_global(b, nir_iadd(b, attrib_buf_table_ptr, attrib_buf_offset),
                       element_size, attrib_buf_desc,
                       nir_component_mask(attrib_buf_comps));
   }
   nir_push_else(b, NULL);
   {
      nir_const_value v[] = {
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
      };

      nir_def *desc =
         nir_build_imm(b, MAX2(attrib_buf_comps, attrib_comps), 32, v);

      nir_store_global(b, nir_iadd(b, attrib_buf_table_ptr, attrib_buf_offset),
                       element_size, desc,
                       nir_component_mask(attrib_buf_comps));
      nir_store_global(b, nir_iadd(b, attrib_table_ptr, attrib_offset),
                       pan_size(ATTRIBUTE), desc,
                       nir_component_mask(attrib_comps));
   }
   nir_pop_if(b, NULL);
}

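/* Process one copy: load the copy handle, then route it to the right table
 * based on which limit range the copy index falls in (UBO, then IMG, then
 * TEXTURE, then SAMPLER). */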
static void
single_desc_copy(nir_builder *b, nir_def *desc_copy_idx)
{
   nir_def *desc_copy_offset = nir_imul_imm(b, desc_copy_idx, sizeof(uint32_t));
   nir_def *desc_copy_ptr = nir_iadd(b, get_input_field(b, desc_copy.table),
                                     nir_u2u64(b, desc_copy_offset));
   nir_def *src_copy_handle = nir_load_global(b, desc_copy_ptr, 4, 1, 32);

   nir_def *set_idx, *src_desc_idx;
   extract_desc_info_from_handle(b, src_copy_handle, &set_idx, &src_desc_idx);

   nir_def *set_ptr = get_input_array_slot(b, sets, set_idx);
   nir_def *set_desc_count = get_input_array_slot(b, set_desc_counts, set_idx);
   nir_def *ubo_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_UBO]);
   nir_def *img_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_IMG]);
   nir_def *tex_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_TEXTURE]);
   nir_def *sampler_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_SAMPLER]);

   nir_push_if(b, nir_ult(b, desc_copy_idx, ubo_end));
   {
      nir_def *table_ptr =
         get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_UBO]);

      set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx, table_ptr,
                        desc_copy_idx,
                        sizeof(struct mali_uniform_buffer_packed));
   }
   nir_push_else(b, NULL);
   {
      nir_push_if(b, nir_ult(b, desc_copy_idx, img_end));
      {
         nir_def *table_ptr =
            get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_IMG]);
         nir_def *attrib_table_ptr = get_input_field(b, img_attrib_table);
         nir_def *attrib_buf_table_ptr = table_ptr;

         set_to_table_img_copy(b, set_ptr, set_desc_count, src_desc_idx,
                               attrib_table_ptr, attrib_buf_table_ptr,
                               nir_isub(b, desc_copy_idx, ubo_end));
      }
      nir_push_else(b, NULL);
      {
         nir_push_if(b, nir_ult(b, desc_copy_idx, tex_end));
         {
            nir_def *table_ptr =
               get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_TEXTURE]);

            set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx,
                              table_ptr, nir_isub(b, desc_copy_idx, img_end),
                              sizeof(struct mali_texture_packed));
         }
         nir_push_else(b, NULL);
         {
            nir_push_if(b, nir_ult(b, desc_copy_idx, sampler_end));
            {
               nir_def *table_ptr =
                  get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_SAMPLER]);

               set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx,
                                 table_ptr, nir_isub(b, desc_copy_idx, tex_end),
                                 sizeof(struct mali_sampler_packed));
            }
            nir_pop_if(b, NULL);
         }
         nir_pop_if(b, NULL);
      }
      nir_pop_if(b, NULL);
   }
   nir_pop_if(b, NULL);
}

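/* Build, compile and upload the internal compute shader that performs the
 * descriptor copies. */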
static struct panvk_priv_mem
panvk_meta_desc_copy_shader(struct panvk_device *dev,
                            struct pan_shader_info *shader_info)
{
   struct panvk_physical_device *phys_dev =
      to_panvk_physical_device(dev->vk.physical);

   nir_builder b = nir_builder_init_simple_shader(
      MESA_SHADER_COMPUTE, GENX(pan_shader_get_compiler_options)(), "%s",
      "desc_copy");

   /* The workgroup size is overridden at execution time to issue the exact
    * number of jobs. */
   b.shader->info.workgroup_size[0] = 1;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;

   nir_def *desc_copy_id =
      nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0);
   single_desc_copy(&b, desc_copy_id);

   struct panfrost_compile_inputs inputs = {
      .gpu_id = phys_dev->kmod.props.gpu_prod_id,
      .no_ubo_to_push = true,
   };
   struct util_dynarray binary;

   util_dynarray_init(&binary, NULL);
   pan_shader_preprocess(b.shader, inputs.gpu_id);
   GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
   ralloc_free(b.shader);

   shader_info->push.count =
      DIV_ROUND_UP(sizeof(struct pan_nir_desc_copy_info), 4);

   struct panvk_priv_mem shader = panvk_pool_upload_aligned(
      &dev->mempools.exec, binary.data, binary.size, 128);

   util_dynarray_fini(&binary);
   return shader;
}

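/* Compile the descriptor-copy shader at device init and wrap it in a
 * renderer state descriptor that compute jobs can point to directly. */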
void
panvk_per_arch(meta_desc_copy_init)(struct panvk_device *dev)
{
   struct pan_shader_info shader_info;

   dev->desc_copy.shader = panvk_meta_desc_copy_shader(dev, &shader_info);

   mali_ptr shader = panvk_priv_mem_dev_addr(dev->desc_copy.shader);
   struct panvk_priv_mem rsd =
      panvk_pool_alloc_desc(&dev->mempools.rw, RENDERER_STATE);

   pan_pack(panvk_priv_mem_host_addr(rsd), RENDERER_STATE, cfg) {
      pan_shader_prepare_rsd(&shader_info, shader, &cfg);
   }

   dev->desc_copy.rsd = rsd;
}

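/* Free the renderer state and shader binary allocated at init time. */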
void
panvk_per_arch(meta_desc_copy_cleanup)(struct panvk_device *dev)
{
   panvk_pool_free_mem(&dev->mempools.rw, dev->desc_copy.rsd);
   panvk_pool_free_mem(&dev->mempools.exec, dev->desc_copy.shader);
}

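/* Prepare a compute job that runs the descriptor-copy shader for the given
 * shader/descriptor-state pair. Leaves *job_desc zeroed when there is
 * nothing to copy. */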
VkResult
panvk_per_arch(meta_get_copy_desc_job)(
   struct panvk_cmd_buffer *cmdbuf, const struct panvk_shader *shader,
   const struct panvk_descriptor_state *desc_state,
   const struct panvk_shader_desc_state *shader_desc_state,
   uint32_t attrib_buf_idx_offset, struct panfrost_ptr *job_desc)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   *job_desc = (struct panfrost_ptr){0};

   if (!shader)
      return VK_SUCCESS;

   mali_ptr copy_table = panvk_priv_mem_dev_addr(shader->desc_info.others.map);
   if (!copy_table)
      return VK_SUCCESS;

   struct pan_nir_desc_copy_info copy_info = {
      .img_attrib_table = shader_desc_state->img_attrib_table,
      .desc_copy =
         {
            .table = copy_table,
            .attrib_buf_idx_offset = attrib_buf_idx_offset,
         },
   };

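   /* Turn the per-table descriptor counts into a prefix sum so every copy
    * index falls into exactly one table range in the shader. */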
   for (uint32_t i = 0; i < ARRAY_SIZE(copy_info.desc_copy.limits); i++)
      copy_info.desc_copy.limits[i] =
         shader->desc_info.others.count[i] +
         (i > 0 ? copy_info.desc_copy.limits[i - 1] : 0);

   for (uint32_t i = 0; i < ARRAY_SIZE(desc_state->sets); i++) {
      const struct panvk_descriptor_set *set = desc_state->sets[i];

      if (!set)
         continue;

      copy_info.sets[i] = set->descs.dev;
      copy_info.set_desc_counts[i] = set->desc_count;
   }

   for (uint32_t i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
      uint32_t desc_count = shader->desc_info.others.count[i];

      if (!desc_count)
         continue;

      copy_info.tables[i] = shader_desc_state->tables[i];
   }

   struct panfrost_ptr push_uniforms =
      panvk_cmd_alloc_dev_mem(cmdbuf, desc, sizeof(copy_info), 16);

   if (!push_uniforms.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   memcpy(push_uniforms.cpu, &copy_info, sizeof(copy_info));

   *job_desc = panvk_cmd_alloc_desc(cmdbuf, COMPUTE_JOB);
   if (!job_desc->gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* Given the per-stage max descriptors limit, we should never
    * reach the workgroup dimension limit. */
   uint32_t copy_count =
      copy_info.desc_copy.limits[PANVK_BIFROST_DESC_TABLE_COUNT - 1];

   assert(copy_count - 1 < BITFIELD_MASK(10));

   panfrost_pack_work_groups_compute(
      pan_section_ptr(job_desc->cpu, COMPUTE_JOB, INVOCATION), 1, 1, 1,
      copy_count, 1, 1, false, false);

   pan_section_pack(job_desc->cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(copy_count + 1) +
                           util_logbase2_ceil(1 + 1) +
                           util_logbase2_ceil(1 + 1);
   }

   struct pan_tls_info tlsinfo = {0};
   struct panfrost_ptr tls = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
   if (!tls.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   GENX(pan_emit_tls)(&tlsinfo, tls.cpu);

   pan_section_pack(job_desc->cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = panvk_priv_mem_dev_addr(dev->desc_copy.rsd);
      cfg.push_uniforms = push_uniforms.gpu;
      cfg.thread_storage = tls.gpu;
   }

   return VK_SUCCESS;
}