xref: /aosp_15_r20/external/mesa3d/src/panfrost/vulkan/bifrost/panvk_vX_meta_desc_copy.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1 /*
2  * Copyright © 2024 Collabora Ltd.
3  *
4  * SPDX-License-Identifier: MIT
5  */
6 
7 #include <stddef.h>
8 #include <stdint.h>
9 
10 #include "genxml/gen_macros.h"
11 
12 #include "nir.h"
13 #include "nir_builder.h"
14 
15 #include "pan_encoder.h"
16 #include "pan_shader.h"
17 
18 #include "panvk_cmd_alloc.h"
19 #include "panvk_cmd_buffer.h"
20 #include "panvk_device.h"
21 #include "panvk_shader.h"
22 
/* Inputs to the descriptor-copy compute shader, passed as push constants.
 * The get_input_field()/get_input_array_slot() macros below address fields
 * of this struct by offsetof()/sizeof(), so the layout here is ABI between
 * the CPU side (meta_get_copy_desc_job) and the generated shader. */
struct pan_nir_desc_copy_info {
   /* GPU address of each bound descriptor set (0 when the set is unbound). */
   mali_ptr sets[MAX_SETS];
   /* GPU address of each destination descriptor table (0 when unused). */
   mali_ptr tables[PANVK_BIFROST_DESC_TABLE_COUNT];
   /* GPU address of the image attribute descriptor table. */
   mali_ptr img_attrib_table;
   struct {
      /* GPU address of the copy-handle table (one uint32_t per copy). */
      mali_ptr table;
      /* Cumulative copy counts: limits[i] is the exclusive end of table i's
       * range in the copy stream (prefix sums of the per-table counts). */
      uint32_t limits[PANVK_BIFROST_DESC_TABLE_COUNT];
      /* Offset added to image attribute buffer indices. */
      uint32_t attrib_buf_idx_offset;
   } desc_copy;
   /* Number of valid descriptors in each set, for bound checking. */
   uint32_t set_desc_counts[MAX_SETS];
};
34 
/* Load a scalar field of pan_nir_desc_copy_info from the push constants.
 * The load's bit size is derived from the field's C type. */
#define get_input_field(b, name)                                               \
   nir_load_push_constant(                                                     \
      b, 1, sizeof(((struct pan_nir_desc_copy_info *)0)->name) * 8,            \
      nir_imm_int(b, 0),                                                       \
      .base = offsetof(struct pan_nir_desc_copy_info, name),                   \
      .range = sizeof(((struct pan_nir_desc_copy_info *)0)->name))

/* Load element `index` (a nir_def, dynamically indexed) of an array field
 * of pan_nir_desc_copy_info from the push constants. */
#define get_input_array_slot(b, name, index)                                   \
   nir_load_push_constant(                                                     \
      b, 1, sizeof(((struct pan_nir_desc_copy_info *)0)->name[0]) * 8,         \
      nir_imul_imm(b, index,                                                   \
                   sizeof(((struct pan_nir_desc_copy_info *)0)->name[0])),     \
      .base = offsetof(struct pan_nir_desc_copy_info, name),                   \
      .range = sizeof(((struct pan_nir_desc_copy_info *)0)->name))
49 
50 static void
extract_desc_info_from_handle(nir_builder * b,nir_def * handle,nir_def ** table,nir_def ** desc_idx)51 extract_desc_info_from_handle(nir_builder *b, nir_def *handle, nir_def **table,
52                               nir_def **desc_idx)
53 {
54    *table = nir_ushr_imm(b, handle, 28);
55    *desc_idx = nir_iand_imm(b, handle, 0xfffffff);
56 }
57 
/* Emit NIR that copies one descriptor (element_size bytes, at most 32) from
 * a descriptor set into one of the per-type descriptor tables, writing
 * zeros instead when the source index is out of bounds.
 *
 * set_ptr:        GPU address of the source descriptor set
 * set_desc_count: number of valid descriptors in the source set
 * src_desc_idx:   descriptor index in the source set
 * table_ptr:      GPU address of the destination table
 * dst_desc_idx:   destination slot index in the table
 * element_size:   size in bytes of one destination descriptor
 */
static void
set_to_table_copy(nir_builder *b, nir_def *set_ptr, nir_def *set_desc_count,
                  nir_def *src_desc_idx, nir_def *table_ptr,
                  nir_def *dst_desc_idx, unsigned element_size)
{
   /* The last binding can have
    * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT set, we need to make
    * sure we don't do an out-of-bound access on the source set. */
   nir_def *dst_offset =
      nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, element_size));

   nir_push_if(b, nir_ult(b, src_desc_idx, set_desc_count));
   {
      nir_def *src_offset =
         nir_u2u64(b, nir_imul_imm(b, src_desc_idx, PANVK_DESCRIPTOR_SIZE));
      nir_def *desc = nir_load_global(b, nir_iadd(b, set_ptr, src_offset),
                                      element_size, element_size / 4, 32);
      nir_store_global(b, nir_iadd(b, table_ptr, dst_offset), element_size,
                       desc, ~0);
   }
   nir_push_else(b, NULL);
   {
      /* Out-of-bound source descriptor: zero-fill the destination slot.
       * Eight zero words cover the largest element_size (32 bytes). */
      nir_const_value v[] = {
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
         nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
      };

      nir_def *desc = nir_build_imm(b, element_size / 4, 32, v);
      nir_store_global(b, nir_iadd(b, table_ptr, dst_offset), element_size,
                       desc, ~0);
   }
   nir_pop_if(b, NULL);
}
93 
94 static void
set_to_table_img_copy(nir_builder * b,nir_def * set_ptr,nir_def * set_desc_count,nir_def * src_desc_idx,nir_def * attrib_table_ptr,nir_def * attrib_buf_table_ptr,nir_def * dst_desc_idx)95 set_to_table_img_copy(nir_builder *b, nir_def *set_ptr, nir_def *set_desc_count,
96                       nir_def *src_desc_idx, nir_def *attrib_table_ptr,
97                       nir_def *attrib_buf_table_ptr, nir_def *dst_desc_idx)
98 {
99    /* The last binding can have
100     * VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT set, we need to make
101     * sure we don't do an out-of-bound access on the source set. */
102    const unsigned element_size = pan_size(ATTRIBUTE_BUFFER) * 2;
103    const unsigned attrib_buf_comps = element_size / 4;
104    const unsigned attrib_comps = pan_size(ATTRIBUTE) / 4;
105    nir_def *attrib_offset =
106       nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, pan_size(ATTRIBUTE)));
107    nir_def *attrib_buf_offset =
108       nir_u2u64(b, nir_imul_imm(b, dst_desc_idx, element_size));
109 
110    nir_push_if(b, nir_ult(b, src_desc_idx, set_desc_count));
111    {
112       nir_def *attr_buf_idx_offset =
113          get_input_field(b, desc_copy.attrib_buf_idx_offset);
114       nir_def *src_offset =
115          nir_u2u64(b, nir_imul_imm(b, src_desc_idx, PANVK_DESCRIPTOR_SIZE));
116       nir_def *src_desc = nir_load_global(b, nir_iadd(b, set_ptr, src_offset),
117                                           element_size, element_size / 4, 32);
118       nir_def *fmt = nir_iand_imm(b, nir_channel(b, src_desc, 2), 0xfffffc00);
119 
120       /* Each image descriptor takes two attribute buffer slots, and we need
121        * to add the attribute buffer offset to have images working with vertex
122        * shader. */
123       nir_def *buf_idx =
124          nir_iadd(b, nir_imul_imm(b, dst_desc_idx, 2), attr_buf_idx_offset);
125 
126       nir_def *attrib_w1 = nir_ior(b, buf_idx, fmt);
127 
128       nir_def *attrib_desc = nir_vec2(b, attrib_w1, nir_imm_int(b, 0));
129 
130       nir_store_global(b, nir_iadd(b, attrib_table_ptr, attrib_offset),
131                        pan_size(ATTRIBUTE), attrib_desc,
132                        nir_component_mask(attrib_comps));
133 
134       nir_def *attrib_buf_desc = nir_vec8(
135          b, nir_channel(b, src_desc, 0), nir_channel(b, src_desc, 1),
136          nir_iand_imm(b, nir_channel(b, src_desc, 2), BITFIELD_MASK(10)),
137          nir_channel(b, src_desc, 3), nir_channel(b, src_desc, 4),
138          nir_channel(b, src_desc, 5), nir_channel(b, src_desc, 6),
139          nir_channel(b, src_desc, 7));
140       nir_store_global(b, nir_iadd(b, attrib_buf_table_ptr, attrib_buf_offset),
141                        element_size, attrib_buf_desc,
142                        nir_component_mask(attrib_buf_comps));
143    }
144    nir_push_else(b, NULL);
145    {
146       nir_const_value v[] = {
147          nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
148          nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
149          nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
150          nir_const_value_for_uint(0, 32), nir_const_value_for_uint(0, 32),
151       };
152 
153       nir_def *desc =
154          nir_build_imm(b, MAX2(attrib_buf_comps, attrib_comps), 32, v);
155 
156       nir_store_global(b, nir_iadd(b, attrib_buf_table_ptr, attrib_buf_offset),
157                        pan_size(ATTRIBUTE), desc,
158                        nir_component_mask(attrib_buf_comps));
159       nir_store_global(b, nir_iadd(b, attrib_table_ptr, attrib_offset),
160                        element_size, desc, nir_component_mask(attrib_comps));
161    }
162    nir_pop_if(b, NULL);
163 }
164 
/* Emit the body of the copy shader for one invocation: fetch the copy
 * handle at index desc_copy_idx, decode it into (set, descriptor), then
 * dispatch to the destination table whose [prev_limit, limit) range the
 * index falls into. The limits are cumulative (prefix sums), so each
 * table's range starts where the previous one ends, and the in-table
 * destination index is desc_copy_idx minus the previous limit. */
static void
single_desc_copy(nir_builder *b, nir_def *desc_copy_idx)
{
   nir_def *desc_copy_offset = nir_imul_imm(b, desc_copy_idx, sizeof(uint32_t));
   nir_def *desc_copy_ptr = nir_iadd(b, get_input_field(b, desc_copy.table),
                                     nir_u2u64(b, desc_copy_offset));
   nir_def *src_copy_handle = nir_load_global(b, desc_copy_ptr, 4, 1, 32);

   /* Handle = set index (top 4 bits) | descriptor index (low 28 bits). */
   nir_def *set_idx, *src_desc_idx;
   extract_desc_info_from_handle(b, src_copy_handle, &set_idx, &src_desc_idx);

   nir_def *set_ptr = get_input_array_slot(b, sets, set_idx);
   nir_def *set_desc_count = get_input_array_slot(b, set_desc_counts, set_idx);
   nir_def *ubo_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_UBO]);
   nir_def *img_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_IMG]);
   nir_def *tex_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_TEXTURE]);
   nir_def *sampler_end =
      get_input_field(b, desc_copy.limits[PANVK_BIFROST_DESC_TABLE_SAMPLER]);

   nir_push_if(b, nir_ult(b, desc_copy_idx, ubo_end));
   {
      /* [0, ubo_end): UBO table.
       * NOTE(review): mali_attribute_packed is used as the UBO element
       * size here — presumably it matches the uniform buffer descriptor
       * size on this arch; confirm against the genxml. */
      nir_def *table_ptr =
         get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_UBO]);

      set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx, table_ptr,
                        desc_copy_idx, sizeof(struct mali_attribute_packed));
   }
   nir_push_else(b, NULL);
   {
      nir_push_if(b, nir_ult(b, desc_copy_idx, img_end));
      {
         /* [ubo_end, img_end): image descriptors, which expand into both
          * the IMG (attribute buffer) table and the attribute table. */
         nir_def *table_ptr =
            get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_IMG]);
         nir_def *attrib_table_ptr = get_input_field(b, img_attrib_table);
         nir_def *attrib_buf_table_ptr = table_ptr;

         set_to_table_img_copy(b, set_ptr, set_desc_count, src_desc_idx,
                               attrib_table_ptr, attrib_buf_table_ptr,
                               nir_isub(b, desc_copy_idx, ubo_end));
      }
      nir_push_else(b, NULL);
      {
         nir_push_if(b, nir_ult(b, desc_copy_idx, tex_end));
         {
            /* [img_end, tex_end): texture table. */
            nir_def *table_ptr =
               get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_TEXTURE]);

            set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx,
                              table_ptr, nir_isub(b, desc_copy_idx, img_end),
                              sizeof(struct mali_texture_packed));
         }
         nir_push_else(b, NULL);
         {
            nir_push_if(b, nir_ult(b, desc_copy_idx, sampler_end));
            {
               /* [tex_end, sampler_end): sampler table. */
               nir_def *table_ptr =
                  get_input_field(b, tables[PANVK_BIFROST_DESC_TABLE_SAMPLER]);

               set_to_table_copy(b, set_ptr, set_desc_count, src_desc_idx,
                                 table_ptr, nir_isub(b, desc_copy_idx, tex_end),
                                 sizeof(struct mali_sampler_packed));
            }
            nir_pop_if(b, NULL);
         }
         nir_pop_if(b, NULL);
      }
      nir_pop_if(b, NULL);
   }
   nir_pop_if(b, NULL);
}
238 
239 static struct panvk_priv_mem
panvk_meta_desc_copy_shader(struct panvk_device * dev,struct pan_shader_info * shader_info)240 panvk_meta_desc_copy_shader(struct panvk_device *dev,
241                             struct pan_shader_info *shader_info)
242 {
243    struct panvk_physical_device *phys_dev =
244       to_panvk_physical_device(dev->vk.physical);
245 
246    nir_builder b = nir_builder_init_simple_shader(
247       MESA_SHADER_COMPUTE, GENX(pan_shader_get_compiler_options)(), "%s",
248       "desc_copy");
249 
250    /* We actually customize that at execution time to issue the
251     * exact number of jobs. */
252    b.shader->info.workgroup_size[0] = 1;
253    b.shader->info.workgroup_size[1] = 1;
254    b.shader->info.workgroup_size[2] = 1;
255 
256    nir_def *desc_copy_id =
257       nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0);
258    single_desc_copy(&b, desc_copy_id);
259 
260    struct panfrost_compile_inputs inputs = {
261       .gpu_id = phys_dev->kmod.props.gpu_prod_id,
262       .no_ubo_to_push = true,
263    };
264    struct util_dynarray binary;
265 
266    util_dynarray_init(&binary, NULL);
267    pan_shader_preprocess(b.shader, inputs.gpu_id);
268    GENX(pan_shader_compile)(b.shader, &inputs, &binary, shader_info);
269    ralloc_free(b.shader);
270 
271    shader_info->push.count =
272       DIV_ROUND_UP(sizeof(struct pan_nir_desc_copy_info), 4);
273 
274    struct panvk_priv_mem shader = panvk_pool_upload_aligned(
275       &dev->mempools.exec, binary.data, binary.size, 128);
276 
277    util_dynarray_fini(&binary);
278    return shader;
279 }
280 
281 void
panvk_per_arch(meta_desc_copy_init)282 panvk_per_arch(meta_desc_copy_init)(struct panvk_device *dev)
283 {
284    struct pan_shader_info shader_info;
285 
286    dev->desc_copy.shader = panvk_meta_desc_copy_shader(dev, &shader_info);
287 
288    mali_ptr shader = panvk_priv_mem_dev_addr(dev->desc_copy.shader);
289    struct panvk_priv_mem rsd =
290       panvk_pool_alloc_desc(&dev->mempools.rw, RENDERER_STATE);
291 
292    pan_pack(panvk_priv_mem_host_addr(rsd), RENDERER_STATE, cfg) {
293       pan_shader_prepare_rsd(&shader_info, shader, &cfg);
294    }
295 
296    dev->desc_copy.rsd = rsd;
297 }
298 
/* Release the RSD and shader binary allocated by meta_desc_copy_init(). */
void
panvk_per_arch(meta_desc_copy_cleanup)(struct panvk_device *dev)
{
   panvk_pool_free_mem(&dev->mempools.rw, dev->desc_copy.rsd);
   panvk_pool_free_mem(&dev->mempools.exec, dev->desc_copy.shader);
}
305 
/* Build a COMPUTE_JOB that runs the desc-copy shader for the given
 * shader / descriptor-state pair.
 *
 * On success *job_desc points to a fully packed compute job; it is left
 * zeroed (still VK_SUCCESS) when there is nothing to copy (NULL shader or
 * no copy table). Returns VK_ERROR_OUT_OF_DEVICE_MEMORY when any
 * command-buffer allocation fails. */
VkResult
panvk_per_arch(meta_get_copy_desc_job)(
   struct panvk_cmd_buffer *cmdbuf, const struct panvk_shader *shader,
   const struct panvk_descriptor_state *desc_state,
   const struct panvk_shader_desc_state *shader_desc_state,
   uint32_t attrib_buf_idx_offset, struct panfrost_ptr *job_desc)
{
   struct panvk_device *dev = to_panvk_device(cmdbuf->vk.base.device);

   *job_desc = (struct panfrost_ptr){0};

   if (!shader)
      return VK_SUCCESS;

   mali_ptr copy_table = panvk_priv_mem_dev_addr(shader->desc_info.others.map);
   if (!copy_table)
      return VK_SUCCESS;

   struct pan_nir_desc_copy_info copy_info = {
      .img_attrib_table = shader_desc_state->img_attrib_table,
      .desc_copy =
         {
            .table = copy_table,
            .attrib_buf_idx_offset = attrib_buf_idx_offset,
         },
   };

   /* Turn the per-table copy counts into cumulative limits; the shader
    * uses them to dispatch a copy index to the right table. */
   for (uint32_t i = 0; i < ARRAY_SIZE(copy_info.desc_copy.limits); i++)
      copy_info.desc_copy.limits[i] =
         shader->desc_info.others.count[i] +
         (i > 0 ? copy_info.desc_copy.limits[i - 1] : 0);

   /* Unbound sets keep a zero address and a zero descriptor count, which
    * makes the shader's bound check fail and zero-fill the destination. */
   for (uint32_t i = 0; i < ARRAY_SIZE(desc_state->sets); i++) {
      const struct panvk_descriptor_set *set = desc_state->sets[i];

      if (!set)
         continue;

      copy_info.sets[i] = set->descs.dev;
      copy_info.set_desc_counts[i] = set->desc_count;
   }

   /* Only expose destination tables that actually receive copies. */
   for (uint32_t i = 0; i < ARRAY_SIZE(shader->desc_info.others.count); i++) {
      uint32_t desc_count = shader->desc_info.others.count[i];

      if (!desc_count)
         continue;

      copy_info.tables[i] = shader_desc_state->tables[i];
   }

   /* Snapshot copy_info into GPU-visible memory used as push constants. */
   struct panfrost_ptr push_uniforms =
      panvk_cmd_alloc_dev_mem(cmdbuf, desc, sizeof(copy_info), 16);

   if (!push_uniforms.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   memcpy(push_uniforms.cpu, &copy_info, sizeof(copy_info));

   *job_desc = panvk_cmd_alloc_desc(cmdbuf, COMPUTE_JOB);
   if (!job_desc->gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   /* Given the per-stage max descriptors limit, we should never
    * reach the workgroup dimension limit. */
   uint32_t copy_count =
      copy_info.desc_copy.limits[PANVK_BIFROST_DESC_TABLE_COUNT - 1];

   /* NOTE(review): if copy_count could be 0, copy_count - 1 wraps and
    * trips this assert; presumably a non-NULL copy_table implies at least
    * one copy — confirm. */
   assert(copy_count - 1 < BITFIELD_MASK(10));

   /* One invocation per descriptor copy, all in the x dimension. */
   panfrost_pack_work_groups_compute(
      pan_section_ptr(job_desc->cpu, COMPUTE_JOB, INVOCATION), 1, 1, 1,
      copy_count, 1, 1, false, false);

   pan_section_pack(job_desc->cpu, COMPUTE_JOB, PARAMETERS, cfg) {
      cfg.job_task_split = util_logbase2_ceil(copy_count + 1) +
                           util_logbase2_ceil(1 + 1) +
                           util_logbase2_ceil(1 + 1);
   }

   /* No scratch/TLS is needed, but the job still references a
    * LOCAL_STORAGE descriptor, so emit an empty one. */
   struct pan_tls_info tlsinfo = {0};
   struct panfrost_ptr tls = panvk_cmd_alloc_desc(cmdbuf, LOCAL_STORAGE);
   if (!tls.gpu)
      return VK_ERROR_OUT_OF_DEVICE_MEMORY;

   GENX(pan_emit_tls)(&tlsinfo, tls.cpu);

   pan_section_pack(job_desc->cpu, COMPUTE_JOB, DRAW, cfg) {
      cfg.state = panvk_priv_mem_dev_addr(dev->desc_copy.rsd);
      cfg.push_uniforms = push_uniforms.gpu;
      cfg.thread_storage = tls.gpu;
   }

   return VK_SUCCESS;
}
401