1 /*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * based in part on v3dv driver which is:
5 * Copyright © 2019 Raspberry Pi
6 *
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 * copies of the Software, and to permit persons to whom the Software is
12 * furnished to do so, subject to the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the next
15 * paragraph) shall be included in all copies or substantial portions of the
16 * Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 * SOFTWARE.
25 */
26
27 #include <assert.h>
28 #include <stdbool.h>
29 #include <stdint.h>
30 #include <string.h>
31 #include <vulkan/vulkan.h>
32
33 #include "compiler/shader_enums.h"
34 #include "hwdef/rogue_hw_utils.h"
35 #include "nir/nir.h"
36 #include "pvr_bo.h"
37 #include "pvr_csb.h"
38 #include "pvr_csb_enum_helpers.h"
39 #include "pvr_hardcode.h"
40 #include "pvr_pds.h"
41 #include "pvr_private.h"
42 #include "pvr_robustness.h"
43 #include "pvr_shader.h"
44 #include "pvr_types.h"
45 #include "rogue/rogue.h"
46 #include "util/log.h"
47 #include "util/macros.h"
48 #include "util/ralloc.h"
49 #include "util/u_dynarray.h"
50 #include "util/u_math.h"
51 #include "vk_alloc.h"
52 #include "vk_format.h"
53 #include "vk_graphics_state.h"
54 #include "vk_log.h"
55 #include "vk_object.h"
56 #include "vk_pipeline_cache.h"
57 #include "vk_render_pass.h"
58 #include "vk_util.h"
59
60 /*****************************************************************************
61 PDS functions
62 *****************************************************************************/
63
64 /* If allocator == NULL, the internal one will be used. */
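/* Builds a PDS coefficient loading program from the given FPU iterator words
 * and destinations, uploads it, and reports how many PDS temps it needs. If
 * the generated program turns out to be empty, nothing is uploaded and
 * pds_upload_out is zeroed out instead.
 */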
65 static VkResult pvr_pds_coeff_program_create_and_upload(
66 struct pvr_device *device,
67 const VkAllocationCallbacks *allocator,
68 const uint32_t *fpu_iterators,
69 uint32_t fpu_iterators_count,
70 const uint32_t *destinations,
71 struct pvr_pds_upload *const pds_upload_out,
72 uint32_t *const pds_temps_count_out)
73 {
74 struct pvr_pds_coeff_loading_program program = {
75 .num_fpu_iterators = fpu_iterators_count,
76 };
77 uint32_t staging_buffer_size;
78 uint32_t *staging_buffer;
79 VkResult result;
80
81 assert(fpu_iterators_count < PVR_MAXIMUM_ITERATIONS);
82
83 /* Get the size of the program and then allocate that much memory. */
84 pvr_pds_coefficient_loading(&program, NULL, PDS_GENERATE_SIZES);
85
86 if (!program.code_size) {
87 pds_upload_out->pvr_bo = NULL;
88 pds_upload_out->code_size = 0;
89 pds_upload_out->data_size = 0;
90 *pds_temps_count_out = 0;
91
92 return VK_SUCCESS;
93 }
94
95 staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
96
97 staging_buffer = vk_alloc2(&device->vk.alloc,
98 allocator,
99 staging_buffer_size,
100 8,
101 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
102 if (!staging_buffer)
103 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
104
105 /* FIXME: Should we save pointers when we redesign the PDS gen API? */
106 typed_memcpy(program.FPU_iterators,
107 fpu_iterators,
108 program.num_fpu_iterators);
109
110 typed_memcpy(program.destination, destinations, program.num_fpu_iterators);
111
112 /* Generate the program into the staging_buffer. */
113 pvr_pds_coefficient_loading(&program,
114 staging_buffer,
115 PDS_GENERATE_CODEDATA_SEGMENTS);
116
117 /* FIXME: Figure out the define for alignment of 16. */
118 result = pvr_gpu_upload_pds(device,
119 &staging_buffer[0],
120 program.data_size,
121 16,
122 &staging_buffer[program.data_size],
123 program.code_size,
124 16,
125 16,
126 pds_upload_out);
127 if (result != VK_SUCCESS) {
128 vk_free2(&device->vk.alloc, allocator, staging_buffer);
129 return result;
130 }
131
132 vk_free2(&device->vk.alloc, allocator, staging_buffer);
133
134 *pds_temps_count_out = program.temps_used;
135
136 return VK_SUCCESS;
137 }
138
139 /* FIXME: move this elsewhere since it's also called in pvr_pass.c? */
140 /* If allocator == NULL, the internal one will be used. */
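/* Builds and uploads the PDS program that kicks the USC fragment shader: the
 * DOUTU task control word is set up from the shader's device address, temp
 * count and sample rate, then the kick-USC program is generated and uploaded.
 */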
141 VkResult pvr_pds_fragment_program_create_and_upload(
142 struct pvr_device *device,
143 const VkAllocationCallbacks *allocator,
144 const struct pvr_suballoc_bo *fragment_shader_bo,
145 uint32_t fragment_temp_count,
146 enum rogue_msaa_mode msaa_mode,
147 bool has_phase_rate_change,
148 struct pvr_pds_upload *const pds_upload_out)
149 {
150 const enum PVRX(PDSINST_DOUTU_SAMPLE_RATE)
151 sample_rate = pvr_pdsinst_doutu_sample_rate_from_rogue(msaa_mode);
152 struct pvr_pds_kickusc_program program = { 0 };
153 uint32_t staging_buffer_size;
154 uint32_t *staging_buffer;
155 VkResult result;
156
157 /* FIXME: Should it be passing in the USC offset rather than address here?
158 */
159 /* Note this is not strictly required to be done before calculating the
160 * staging_buffer_size in this particular case. It can also be done after
161 * allocating the buffer. The size from pvr_pds_kick_usc() is constant.
162 */
163 pvr_pds_setup_doutu(&program.usc_task_control,
164 fragment_shader_bo->dev_addr.addr,
165 fragment_temp_count,
166 sample_rate,
167 has_phase_rate_change);
168
169 pvr_pds_kick_usc(&program, NULL, 0, false, PDS_GENERATE_SIZES);
170
171 staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
172
173 staging_buffer = vk_alloc2(&device->vk.alloc,
174 allocator,
175 staging_buffer_size,
176 8,
177 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
178 if (!staging_buffer)
179 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
180
181 pvr_pds_kick_usc(&program,
182 staging_buffer,
183 0,
184 false,
185 PDS_GENERATE_CODEDATA_SEGMENTS);
186
187 /* FIXME: Figure out the define for alignment of 16. */
188 result = pvr_gpu_upload_pds(device,
189 &staging_buffer[0],
190 program.data_size,
191 16,
192 &staging_buffer[program.data_size],
193 program.code_size,
194 16,
195 16,
196 pds_upload_out);
197 if (result != VK_SUCCESS) {
198 vk_free2(&device->vk.alloc, allocator, staging_buffer);
199 return result;
200 }
201
202 vk_free2(&device->vk.alloc, allocator, staging_buffer);
203
204 return VK_SUCCESS;
205 }
206
207 static inline size_t pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
208 const struct pvr_device_info *dev_info,
209 bool robust_buffer_access)
210 {
211 /* FIXME: Use more local variables to improve formatting. */
212
213 /* Maximum memory allocation needed for const map entries in
214 * pvr_pds_generate_vertex_primary_program().
215 * When robustBufferAccess is disabled, it must be >= 410 bytes.
216 * When robustBufferAccess is enabled, it must be >= 570 bytes.
217 *
218 * 1. Size of entry for base instance
219 * (pvr_const_map_entry_base_instance)
220 *
221 * 2. Max. number of vertex inputs (PVR_MAX_VERTEX_INPUT_BINDINGS) * (
222 * if (!robustBufferAccess)
223 * size of vertex attribute entry
224 * (pvr_const_map_entry_vertex_attribute_address) +
225 * else
226 * size of robust vertex attribute entry
227 * (pvr_const_map_entry_robust_vertex_attribute_address) +
228 * size of entry for max attribute index
229 * (pvr_const_map_entry_vertex_attribute_max_index) +
230 * fi
231 * size of Unified Store burst entry
232 * (pvr_const_map_entry_literal32) +
233 * size of entry for vertex stride
234 * (pvr_const_map_entry_literal32) +
235 * size of entries for DDMAD control word
236 * (num_ddmad_literals * pvr_const_map_entry_literal32))
237 *
238 * 3. Size of entry for DOUTW vertex/instance control word
239 * (pvr_const_map_entry_literal32)
240 *
241 * 4. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
242 */
243
244 const size_t attribute_size =
245 (!robust_buffer_access)
246 ? sizeof(struct pvr_const_map_entry_vertex_attribute_address)
247 : sizeof(struct pvr_const_map_entry_robust_vertex_attribute_address) +
248 sizeof(struct pvr_const_map_entry_vertex_attribute_max_index);
249
250 /* If has_pds_ddmadt the DDMAD control word is now a DDMADT control word
251 * and is increased by one DWORD to contain the data for the DDMADT's
252 * out-of-bounds check.
253 */
254 const size_t pvr_pds_const_map_vertex_entry_num_ddmad_literals =
255 1U + (size_t)PVR_HAS_FEATURE(dev_info, pds_ddmadt);
256
257 return (sizeof(struct pvr_const_map_entry_base_instance) +
258 PVR_MAX_VERTEX_INPUT_BINDINGS *
259 (attribute_size +
260 (2 + pvr_pds_const_map_vertex_entry_num_ddmad_literals) *
261 sizeof(struct pvr_const_map_entry_literal32)) +
262 sizeof(struct pvr_const_map_entry_literal32) +
263 sizeof(struct pvr_const_map_entry_doutu_address));
264 }
265
266 /* This is a const pointer to an array of pvr_pds_vertex_dma structs.
267 * The array being pointed to is of PVR_MAX_VERTEX_ATTRIB_DMAS size.
268 */
269 typedef struct pvr_pds_vertex_dma (
270 *const
271 pvr_pds_attrib_dma_descriptions_array_ptr)[PVR_MAX_VERTEX_ATTRIB_DMAS];
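/* For illustration, a caller is expected to do something like:
 *
 *    struct pvr_pds_vertex_dma dmas[PVR_MAX_VERTEX_ATTRIB_DMAS];
 *    uint32_t dma_count;
 *
 *    pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state,
 *                                                vs_data,
 *                                                &dmas,
 *                                                &dma_count);
 *
 * so that the array bound stays part of the parameter's type rather than
 * being passed as a bare pointer.
 */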
272
273 /* dma_descriptions_out_ptr is a pointer to the array used as output.
274 * The whole array might not be filled so dma_count_out indicates how many
275 * elements were used.
276 */
277 static void pvr_pds_vertex_attrib_init_dma_descriptions(
278 const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
279 const struct rogue_vs_build_data *vs_data,
280 pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
281 uint32_t *const dma_count_out)
282 {
283 struct pvr_pds_vertex_dma *const dma_descriptions =
284 *dma_descriptions_out_ptr;
285 uint32_t dma_count = 0;
286
287 if (!vertex_input_state) {
288 *dma_count_out = 0;
289 return;
290 }
291
292 for (uint32_t i = 0; i < vertex_input_state->vertexAttributeDescriptionCount;
293 i++) {
294 const VkVertexInputAttributeDescription *const attrib_desc =
295 &vertex_input_state->pVertexAttributeDescriptions[i];
296 const VkVertexInputBindingDescription *binding_desc = NULL;
297 struct pvr_pds_vertex_dma *const dma_desc = &dma_descriptions[dma_count];
298 size_t location = attrib_desc->location;
299
300 assert(location < vs_data->inputs.num_input_vars);
301
302 /* Find the matching binding description. */
303 for (uint32_t j = 0;
304 j < vertex_input_state->vertexBindingDescriptionCount;
305 j++) {
306 const VkVertexInputBindingDescription *const current_binding_desc =
307 &vertex_input_state->pVertexBindingDescriptions[j];
308
309 if (current_binding_desc->binding == attrib_desc->binding) {
310 binding_desc = current_binding_desc;
311 break;
312 }
313 }
314
315 /* From the Vulkan 1.2.195 spec for
316 * VkPipelineVertexInputStateCreateInfo:
317 *
318 * "For every binding specified by each element of
319 * pVertexAttributeDescriptions, a
320 * VkVertexInputBindingDescription must exist in
321 * pVertexBindingDescriptions with the same value of binding"
322 */
323 assert(binding_desc);
324
325 dma_desc->offset = attrib_desc->offset;
326 dma_desc->stride = binding_desc->stride;
327
328 dma_desc->flags = 0;
329
330 if (binding_desc->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
331 dma_desc->flags |= PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
332
333 dma_desc->size_in_dwords = vs_data->inputs.components[location];
334 /* TODO: This will be different when other types are supported.
335 * Store in vs_data with base and components?
336 */
337 /* TODO: Use attrib_desc->format. */
338 dma_desc->component_size_in_bytes = ROGUE_REG_SIZE_BYTES;
339 dma_desc->destination = vs_data->inputs.base[location];
340 dma_desc->binding_index = attrib_desc->binding;
341 dma_desc->divisor = 1;
342
343 dma_desc->robustness_buffer_offset =
344 pvr_get_robustness_buffer_format_offset(attrib_desc->format);
345
346 ++dma_count;
347 }
348
349 *dma_count_out = dma_count;
350 }
351
352 static VkResult pvr_pds_vertex_attrib_program_create_and_upload(
353 struct pvr_device *const device,
354 const VkAllocationCallbacks *const allocator,
355 struct pvr_pds_vertex_primary_program_input *const input,
356 struct pvr_pds_attrib_program *const program_out)
357 {
358 const size_t const_entries_size_in_bytes =
359 pvr_pds_get_max_vertex_program_const_map_size_in_bytes(
360 &device->pdevice->dev_info,
361 device->vk.enabled_features.robustBufferAccess);
362 struct pvr_pds_upload *const program = &program_out->program;
363 struct pvr_pds_info *const info = &program_out->info;
364 struct pvr_const_map_entry *new_entries;
365 ASSERTED uint32_t code_size_in_dwords;
366 size_t staging_buffer_size;
367 uint32_t *staging_buffer;
368 VkResult result;
369
370 memset(info, 0, sizeof(*info));
371
372 info->entries = vk_alloc2(&device->vk.alloc,
373 allocator,
374 const_entries_size_in_bytes,
375 8,
376 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
377 if (!info->entries) {
378 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
379 goto err_out;
380 }
381
382 info->entries_size_in_bytes = const_entries_size_in_bytes;
383
384 pvr_pds_generate_vertex_primary_program(
385 input,
386 NULL,
387 info,
388 device->vk.enabled_features.robustBufferAccess,
389 &device->pdevice->dev_info);
390
391 code_size_in_dwords = info->code_size_in_dwords;
392 staging_buffer_size = PVR_DW_TO_BYTES(info->code_size_in_dwords);
393
394 staging_buffer = vk_alloc2(&device->vk.alloc,
395 allocator,
396 staging_buffer_size,
397 8,
398 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
399 if (!staging_buffer) {
400 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
401 goto err_free_entries;
402 }
403
404 /* This also fills in info->entries. */
405 pvr_pds_generate_vertex_primary_program(
406 input,
407 staging_buffer,
408 info,
409 device->vk.enabled_features.robustBufferAccess,
410 &device->pdevice->dev_info);
411
412 assert(info->code_size_in_dwords <= code_size_in_dwords);
413
414 /* FIXME: Add a vk_realloc2() ? */
415 new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
416 info->entries,
417 info->entries_written_size_in_bytes,
418 8,
419 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
420 if (!new_entries) {
421 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
422 goto err_free_staging_buffer;
423 }
424
425 info->entries = new_entries;
426 info->entries_size_in_bytes = info->entries_written_size_in_bytes;
427
428 /* FIXME: Figure out the define for alignment of 16. */
429 result = pvr_gpu_upload_pds(device,
430 NULL,
431 0,
432 0,
433 staging_buffer,
434 info->code_size_in_dwords,
435 16,
436 16,
437 program);
438 if (result != VK_SUCCESS)
439 goto err_free_staging_buffer;
440
441 vk_free2(&device->vk.alloc, allocator, staging_buffer);
442
443 return VK_SUCCESS;
444
445 err_free_staging_buffer:
446 vk_free2(&device->vk.alloc, allocator, staging_buffer);
447
448 err_free_entries:
449 vk_free2(&device->vk.alloc, allocator, info->entries);
450
451 err_out:
452 return result;
453 }
454
455 static inline void pvr_pds_vertex_attrib_program_destroy(
456 struct pvr_device *const device,
457 const struct VkAllocationCallbacks *const allocator,
458 struct pvr_pds_attrib_program *const program)
459 {
460 pvr_bo_suballoc_free(program->program.pvr_bo);
461 vk_free2(&device->vk.alloc, allocator, program->info.entries);
462 }
463
464 /* This is a const pointer to an array of pvr_pds_attrib_program structs.
465 * The array being pointed to is of PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT size.
466 */
467 typedef struct pvr_pds_attrib_program (*const pvr_pds_attrib_programs_array_ptr)
468 [PVR_PDS_VERTEX_ATTRIB_PROGRAM_COUNT];
469
470 /* Indicates that the special variable is unused and has not been allocated a
471 * register.
472 */
473 #define PVR_VERTEX_SPECIAL_VAR_UNUSED (-1)
474
475 /* Each special variable gets allocated its own vtxin reg if used. */
476 struct pvr_vertex_special_vars {
477 /* VertexIndex built-in. */
478 int16_t vertex_id_offset;
479 /* InstanceIndex built-in. */
480 int16_t instance_id_offset;
481 };
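/* For example, a vertex shader that reads only InstanceIndex would have
 * vertex_id_offset == PVR_VERTEX_SPECIAL_VAR_UNUSED and instance_id_offset
 * set to the vtxin register allocated for it.
 */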
482
483 /* Generates and uploads a PDS program for DMAing vertex attribs into USC vertex
484 * inputs. This will bake the code segment and create a template of the data
485 * segment for the command buffer to fill in.
486 */
487 /* If allocator == NULL, the internal one will be used.
488 *
489 * programs_out_ptr is a pointer to the array where the outputs will be placed.
490 */
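/* One program is generated per PVR_PDS_VERTEX_ATTRIB_PROGRAM_* variant
 * (basic, base-instance and draw-indirect), differing only in the extra
 * PVR_PDS_VERTEX_FLAGS_* variant flag set while generating it.
 */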
491 static VkResult pvr_pds_vertex_attrib_programs_create_and_upload(
492 struct pvr_device *device,
493 const VkAllocationCallbacks *const allocator,
494 const VkPipelineVertexInputStateCreateInfo *const vertex_input_state,
495 uint32_t usc_temp_count,
496 const struct rogue_vs_build_data *vs_data,
497
498 /* Needed for the new path. */
499 /* TODO: Remove some of the above once the compiler is hooked up. */
500 const struct pvr_pds_vertex_dma
501 dma_descriptions[static const PVR_MAX_VERTEX_ATTRIB_DMAS],
502 uint32_t dma_count,
503 const struct pvr_vertex_special_vars *special_vars_layout,
504
505 pvr_pds_attrib_programs_array_ptr programs_out_ptr)
506 {
507 struct pvr_pds_vertex_dma dma_descriptions_old[PVR_MAX_VERTEX_ATTRIB_DMAS];
508
509 struct pvr_pds_attrib_program *const programs_out = *programs_out_ptr;
510 struct pvr_pds_vertex_primary_program_input input = { 0 };
511 VkResult result;
512
513 const bool old_path = pvr_has_hard_coded_shaders(&device->pdevice->dev_info);
514
515 if (old_path) {
516 pvr_pds_vertex_attrib_init_dma_descriptions(vertex_input_state,
517 vs_data,
518 &dma_descriptions_old,
519 &input.dma_count);
520
521 input.dma_list = dma_descriptions_old;
522 } else {
523 input.dma_list = dma_descriptions;
524 input.dma_count = dma_count;
525
526 if (special_vars_layout->vertex_id_offset !=
527 PVR_VERTEX_SPECIAL_VAR_UNUSED) {
528 /* Gets filled by the HW and copied into the appropriate reg. */
529 input.flags |= PVR_PDS_VERTEX_FLAGS_VERTEX_ID_REQUIRED;
530 input.vertex_id_register = special_vars_layout->vertex_id_offset;
531 }
532
533 if (special_vars_layout->instance_id_offset !=
534 PVR_VERTEX_SPECIAL_VAR_UNUSED) {
535 /* Gets filled by the HW and copied into the appropriate reg. */
536 input.flags |= PVR_PDS_VERTEX_FLAGS_INSTANCE_ID_REQUIRED;
537 input.instance_id_register = special_vars_layout->instance_id_offset;
538 }
539 }
540
541 pvr_pds_setup_doutu(&input.usc_task_control,
542 0,
543 usc_temp_count,
544 PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
545 false);
546
547 /* Note: programs_out_ptr is a pointer to an array so this is fine. See the
548 * typedef.
549 */
550 for (uint32_t i = 0; i < ARRAY_SIZE(*programs_out_ptr); i++) {
551 uint32_t extra_flags;
552
553 switch (i) {
554 case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASIC:
555 extra_flags = 0;
556 break;
557
558 case PVR_PDS_VERTEX_ATTRIB_PROGRAM_BASE_INSTANCE:
559 extra_flags = PVR_PDS_VERTEX_FLAGS_BASE_INSTANCE_VARIANT;
560 break;
561
562 case PVR_PDS_VERTEX_ATTRIB_PROGRAM_DRAW_INDIRECT:
563 extra_flags = PVR_PDS_VERTEX_FLAGS_DRAW_INDIRECT_VARIANT;
564 break;
565
566 default:
567 unreachable("Invalid vertex attrib program type.");
568 }
569
570 input.flags |= extra_flags;
571
572 result =
573 pvr_pds_vertex_attrib_program_create_and_upload(device,
574 allocator,
575 &input,
576 &programs_out[i]);
577 if (result != VK_SUCCESS) {
578 for (uint32_t j = 0; j < i; j++) {
579 pvr_pds_vertex_attrib_program_destroy(device,
580 allocator,
581 &programs_out[j]);
582 }
583
584 return result;
585 }
586
587 input.flags &= ~extra_flags;
588 }
589
590 return VK_SUCCESS;
591 }
592
593 size_t pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes(void)
594 {
595 /* Maximum memory allocation needed for const map entries in
596 * pvr_pds_generate_descriptor_upload_program().
597 * It must be >= 688 bytes. This size is calculated as the sum of:
598 *
599 * 1. Max. number of descriptor sets (8) * (
600 * size of descriptor entry
601 * (pvr_const_map_entry_descriptor_set) +
602 * size of Common Store burst entry
603 * (pvr_const_map_entry_literal32))
604 *
605 * 2. Max. number of PDS program buffers (24) * (
606 * size of the largest buffer structure
607 * (pvr_const_map_entry_constant_buffer) +
608 * size of Common Store burst entry
609 * (pvr_const_map_entry_literal32)
610 *
611 * 3. Size of DOUTU entry (pvr_const_map_entry_doutu_address)
612 *
613 * 4. Max. number of PDS address literals (8) * (
614 * size of entry
615 * (pvr_const_map_entry_descriptor_set_addrs_table)
616 *
617 * 5. Max. number of address literals with single buffer entry to DOUTD
618 *      size of entry
619 *      (pvr_pds_const_map_entry_addr_literal_buffer) +
620 *      8 * size of entry (pvr_pds_const_map_entry_addr_literal)
621 */
622
623 /* FIXME: PVR_MAX_DESCRIPTOR_SETS is 4 and not 8. The comment above seems to
624 * say that it should be 8.
625 * Figure out a define for this, or is the comment wrong?
626 */
627 return (8 * (sizeof(struct pvr_const_map_entry_descriptor_set) +
628 sizeof(struct pvr_const_map_entry_literal32)) +
629 PVR_PDS_MAX_BUFFERS *
630 (sizeof(struct pvr_const_map_entry_constant_buffer) +
631 sizeof(struct pvr_const_map_entry_literal32)) +
632 sizeof(struct pvr_const_map_entry_doutu_address) +
633 sizeof(struct pvr_pds_const_map_entry_addr_literal_buffer) +
634 8 * sizeof(struct pvr_pds_const_map_entry_addr_literal));
635 }
636
637 /* This is a const pointer to an array of PVR_PDS_MAX_BUFFERS pvr_pds_buffer
638 * structs.
639 */
640 typedef struct pvr_pds_buffer (
641 *const pvr_pds_descriptor_program_buffer_array_ptr)[PVR_PDS_MAX_BUFFERS];
642
643 /**
644 * \brief Setup buffers for the PDS descriptor program.
645 *
646 * Sets up buffers required by the PDS gen api based on compiler info.
647 *
648 * For compile-time static constants that need DMAing, it uploads them and
649 * returns the upload in \p static_consts_pvr_bo_out.
650 */
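/* Buffer IDs are handed out in order: one PVR_BUFFER_TYPE_UBO entry per UBO
 * first, then, if the shader has static consts, a single
 * PVR_BUFFER_TYPE_COMPILE_TIME entry covering the uploaded constants.
 */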
651 static VkResult pvr_pds_descriptor_program_setup_buffers(
652 struct pvr_device *device,
653 bool robust_buffer_access,
654 const struct rogue_compile_time_consts_data *compile_time_consts_data,
655 const struct rogue_ubo_data *ubo_data,
656 pvr_pds_descriptor_program_buffer_array_ptr buffers_out_ptr,
657 uint32_t *const buffer_count_out,
658 struct pvr_suballoc_bo **const static_consts_pvr_bo_out)
659 {
660 struct pvr_pds_buffer *const buffers = *buffers_out_ptr;
661 uint32_t buffer_count = 0;
662
663 for (size_t i = 0; i < ubo_data->num_ubo_entries; i++) {
664 struct pvr_pds_buffer *current_buffer = &buffers[buffer_count];
665
666 /* This is fine since buffers_out_ptr is a pointer to an array. */
667 assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));
668
669 current_buffer->type = PVR_BUFFER_TYPE_UBO;
670 current_buffer->size_in_dwords = ubo_data->size[i];
671 current_buffer->destination = ubo_data->dest[i];
672
673 current_buffer->buffer_id = buffer_count;
674 current_buffer->desc_set = ubo_data->desc_set[i];
675 current_buffer->binding = ubo_data->binding[i];
676 /* TODO: Is this always the case?
677 * E.g. can multiple UBOs have the same base buffer?
678 */
679 current_buffer->source_offset = 0;
680
681 buffer_count++;
682 }
683
684 if (compile_time_consts_data->static_consts.num > 0) {
685 VkResult result;
686
687 assert(compile_time_consts_data->static_consts.num <=
688 ARRAY_SIZE(compile_time_consts_data->static_consts.value));
689
690 /* This is fine since buffers_out_ptr is a pointer to an array. */
691 assert(buffer_count < ARRAY_SIZE(*buffers_out_ptr));
692
693 /* TODO: Is it possible to have multiple static consts buffers where the
694 * destination is not adjoining? If so we need to handle that.
695 * Currently we're only setting up a single buffer.
696 */
697 buffers[buffer_count++] = (struct pvr_pds_buffer){
698 .type = PVR_BUFFER_TYPE_COMPILE_TIME,
699 .size_in_dwords = compile_time_consts_data->static_consts.num,
700 .destination = compile_time_consts_data->static_consts.dest,
701 };
702
703 result = pvr_gpu_upload(device,
704 device->heaps.general_heap,
705 compile_time_consts_data->static_consts.value,
706 compile_time_consts_data->static_consts.num *
707 ROGUE_REG_SIZE_BYTES,
708 ROGUE_REG_SIZE_BYTES,
709 static_consts_pvr_bo_out);
710 if (result != VK_SUCCESS)
711 return result;
712 } else {
713 *static_consts_pvr_bo_out = NULL;
714 }
715
716 *buffer_count_out = buffer_count;
717
718 return VK_SUCCESS;
719 }
720
721 static VkResult pvr_pds_descriptor_program_create_and_upload(
722 struct pvr_device *const device,
723 const VkAllocationCallbacks *const allocator,
724 const struct rogue_compile_time_consts_data *const compile_time_consts_data,
725 const struct rogue_ubo_data *const ubo_data,
726 const struct pvr_explicit_constant_usage *const explicit_const_usage,
727 const struct pvr_pipeline_layout *const layout,
728 enum pvr_stage_allocation stage,
729 const struct pvr_sh_reg_layout *sh_reg_layout,
730 struct pvr_stage_allocation_descriptor_state *const descriptor_state)
731 {
732 const size_t const_entries_size_in_bytes =
733 pvr_pds_get_max_descriptor_upload_const_map_size_in_bytes();
734 struct pvr_pds_info *const pds_info = &descriptor_state->pds_info;
735 struct pvr_pds_descriptor_program_input program = { 0 };
736 struct pvr_const_map_entry *new_entries;
737 ASSERTED uint32_t code_size_in_dwords;
738 uint32_t staging_buffer_size;
739 uint32_t *staging_buffer;
740 VkResult result;
741
742 const bool old_path = pvr_has_hard_coded_shaders(&device->pdevice->dev_info);
743
744 assert(stage != PVR_STAGE_ALLOCATION_COUNT);
745
746 *pds_info = (struct pvr_pds_info){ 0 };
747
748 if (old_path) {
749 result = pvr_pds_descriptor_program_setup_buffers(
750 device,
751 device->vk.enabled_features.robustBufferAccess,
752 compile_time_consts_data,
753 ubo_data,
754 &program.buffers,
755 &program.buffer_count,
756 &descriptor_state->static_consts);
757 if (result != VK_SUCCESS)
758 return result;
759
760 if (layout->per_stage_reg_info[stage].primary_dynamic_size_in_dwords)
761 assert(!"Unimplemented");
762
763 for (uint32_t set_num = 0; set_num < layout->set_count; set_num++) {
764 const struct pvr_descriptor_set_layout_mem_layout *const reg_layout =
765 &layout->register_layout_in_dwords_per_stage[stage][set_num];
766 const uint32_t start_offset = explicit_const_usage->start_offset;
767
768 /* TODO: Use compiler usage info to optimize this? */
769
770 /* Only dma primaries if they are actually required. */
771 if (reg_layout->primary_size) {
772 program.descriptor_sets[program.descriptor_set_count++] =
773 (struct pvr_pds_descriptor_set){
774 .descriptor_set = set_num,
775 .size_in_dwords = reg_layout->primary_size,
776 .destination = reg_layout->primary_offset + start_offset,
777 .primary = true,
778 };
779 }
780
781 /* Only dma secondaries if they are actually required. */
782 if (!reg_layout->secondary_size)
783 continue;
784
785 program.descriptor_sets[program.descriptor_set_count++] =
786 (struct pvr_pds_descriptor_set){
787 .descriptor_set = set_num,
788 .size_in_dwords = reg_layout->secondary_size,
789 .destination = reg_layout->secondary_offset + start_offset,
790 };
791 }
792 } else {
793 uint32_t addr_literals = 0;
794
795 if (sh_reg_layout->descriptor_set_addrs_table.present) {
796 program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
797 .type = PVR_PDS_ADDR_LITERAL_DESC_SET_ADDRS_TABLE,
798 .destination = sh_reg_layout->descriptor_set_addrs_table.offset,
799 };
800 addr_literals++;
801 }
802
803 if (sh_reg_layout->push_consts.present) {
804 program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
805 .type = PVR_PDS_ADDR_LITERAL_PUSH_CONSTS,
806 .destination = sh_reg_layout->push_consts.offset,
807 };
808 addr_literals++;
809 }
810
811 if (sh_reg_layout->blend_consts.present) {
812 program.addr_literals[addr_literals] = (struct pvr_pds_addr_literal){
813 .type = PVR_PDS_ADDR_LITERAL_BLEND_CONSTANTS,
814 .destination = sh_reg_layout->blend_consts.offset,
815 };
816 addr_literals++;
817 }
818
819 program.addr_literal_count = addr_literals;
820 }
821
822 pds_info->entries = vk_alloc2(&device->vk.alloc,
823 allocator,
824 const_entries_size_in_bytes,
825 8,
826 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
827 if (!pds_info->entries) {
828 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
829 goto err_free_static_consts;
830 }
831
832 pds_info->entries_size_in_bytes = const_entries_size_in_bytes;
833
834 pvr_pds_generate_descriptor_upload_program(&program, NULL, pds_info);
835
836 code_size_in_dwords = pds_info->code_size_in_dwords;
837 staging_buffer_size = PVR_DW_TO_BYTES(pds_info->code_size_in_dwords);
838
839 if (!staging_buffer_size) {
840 vk_free2(&device->vk.alloc, allocator, pds_info->entries);
841
842 *descriptor_state = (struct pvr_stage_allocation_descriptor_state){ 0 };
843
844 return VK_SUCCESS;
845 }
846
847 staging_buffer = vk_alloc2(&device->vk.alloc,
848 allocator,
849 staging_buffer_size,
850 8,
851 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
852 if (!staging_buffer) {
853 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
854 goto err_free_entries;
855 }
856
857 pvr_pds_generate_descriptor_upload_program(&program,
858 staging_buffer,
859 pds_info);
860
861 assert(pds_info->code_size_in_dwords <= code_size_in_dwords);
862
863 /* FIXME: use vk_realloc2() ? */
864 new_entries = vk_realloc((!allocator) ? &device->vk.alloc : allocator,
865 pds_info->entries,
866 pds_info->entries_written_size_in_bytes,
867 8,
868 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
869 if (!new_entries) {
870 result = vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
871 goto err_free_staging_buffer;
872 }
873
874 pds_info->entries = new_entries;
875 pds_info->entries_size_in_bytes = pds_info->entries_written_size_in_bytes;
876
877 /* FIXME: Figure out the define for alignment of 16. */
878 result = pvr_gpu_upload_pds(device,
879 NULL,
880 0,
881 0,
882 staging_buffer,
883 pds_info->code_size_in_dwords,
884 16,
885 16,
886 &descriptor_state->pds_code);
887 if (result != VK_SUCCESS)
888 goto err_free_staging_buffer;
889
890 vk_free2(&device->vk.alloc, allocator, staging_buffer);
891
892 return VK_SUCCESS;
893
894 err_free_staging_buffer:
895 vk_free2(&device->vk.alloc, allocator, staging_buffer);
896
897 err_free_entries:
898 vk_free2(&device->vk.alloc, allocator, pds_info->entries);
899
900 err_free_static_consts:
901 pvr_bo_suballoc_free(descriptor_state->static_consts);
902
903 return result;
904 }
905
906 static void pvr_pds_descriptor_program_destroy(
907 struct pvr_device *const device,
908 const struct VkAllocationCallbacks *const allocator,
909 struct pvr_stage_allocation_descriptor_state *const descriptor_state)
910 {
911 if (!descriptor_state)
912 return;
913
914 pvr_bo_suballoc_free(descriptor_state->pds_code.pvr_bo);
915 vk_free2(&device->vk.alloc, allocator, descriptor_state->pds_info.entries);
916 pvr_bo_suballoc_free(descriptor_state->static_consts);
917 }
918
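/* Common setup shared by the compute PDS program paths below: fills in the
 * pvr_pds_compute_shader_program from the given register assignments, sets up
 * the DOUTU word for the USC shader and runs the PDS_GENERATE_SIZES pass so
 * that code_size/data_size are known before any buffer is allocated.
 */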
919 static void pvr_pds_compute_program_setup(
920 const struct pvr_device_info *dev_info,
921 const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
922 const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
923 uint32_t barrier_coefficient,
924 bool add_base_workgroup,
925 uint32_t usc_temps,
926 pvr_dev_addr_t usc_shader_dev_addr,
927 struct pvr_pds_compute_shader_program *const program)
928 {
929 pvr_pds_compute_shader_program_init(program);
930 program->local_input_regs[0] = local_input_regs[0];
931 program->local_input_regs[1] = local_input_regs[1];
932 program->local_input_regs[2] = local_input_regs[2];
933 program->work_group_input_regs[0] = work_group_input_regs[0];
934 program->work_group_input_regs[1] = work_group_input_regs[1];
935 program->work_group_input_regs[2] = work_group_input_regs[2];
936 program->barrier_coefficient = barrier_coefficient;
937 program->add_base_workgroup = add_base_workgroup;
938 program->flattened_work_groups = true;
939 program->kick_usc = true;
940
941 STATIC_ASSERT(ARRAY_SIZE(program->local_input_regs) ==
942 PVR_WORKGROUP_DIMENSIONS);
943 STATIC_ASSERT(ARRAY_SIZE(program->work_group_input_regs) ==
944 PVR_WORKGROUP_DIMENSIONS);
945 STATIC_ASSERT(ARRAY_SIZE(program->global_input_regs) ==
946 PVR_WORKGROUP_DIMENSIONS);
947
948 pvr_pds_setup_doutu(&program->usc_task_control,
949 usc_shader_dev_addr.addr,
950 usc_temps,
951 PVRX(PDSINST_DOUTU_SAMPLE_RATE_INSTANCE),
952 false);
953
954 pvr_pds_compute_shader(program, NULL, PDS_GENERATE_SIZES, dev_info);
955 }
956
957 /* FIXME: See if pvr_device_init_compute_pds_program() and this could be merged.
958 */
959 static VkResult pvr_pds_compute_program_create_and_upload(
960 struct pvr_device *const device,
961 const VkAllocationCallbacks *const allocator,
962 const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
963 const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
964 uint32_t barrier_coefficient,
965 uint32_t usc_temps,
966 pvr_dev_addr_t usc_shader_dev_addr,
967 struct pvr_pds_upload *const pds_upload_out,
968 struct pvr_pds_info *const pds_info_out)
969 {
970 struct pvr_device_info *dev_info = &device->pdevice->dev_info;
971 struct pvr_pds_compute_shader_program program;
972 uint32_t staging_buffer_size;
973 uint32_t *staging_buffer;
974 VkResult result;
975
976 pvr_pds_compute_program_setup(dev_info,
977 local_input_regs,
978 work_group_input_regs,
979 barrier_coefficient,
980 false,
981 usc_temps,
982 usc_shader_dev_addr,
983 &program);
984
985 /* FIXME: According to pvr_device_init_compute_pds_program() the code size
986 * is in bytes. Investigate this.
987 */
988 staging_buffer_size = PVR_DW_TO_BYTES(program.code_size + program.data_size);
989
990 staging_buffer = vk_alloc2(&device->vk.alloc,
991 allocator,
992 staging_buffer_size,
993 8,
994 VK_SYSTEM_ALLOCATION_SCOPE_COMMAND);
995 if (!staging_buffer)
996 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
997
998 /* FIXME: pvr_pds_compute_shader doesn't implement
999 * PDS_GENERATE_CODEDATA_SEGMENTS.
1000 */
1001 pvr_pds_compute_shader(&program,
1002 &staging_buffer[0],
1003 PDS_GENERATE_CODE_SEGMENT,
1004 dev_info);
1005
1006 pvr_pds_compute_shader(&program,
1007 &staging_buffer[program.code_size],
1008 PDS_GENERATE_DATA_SEGMENT,
1009 dev_info);
1010
1011 /* FIXME: Figure out the define for alignment of 16. */
1012 result = pvr_gpu_upload_pds(device,
1013 &staging_buffer[program.code_size],
1014 program.data_size,
1015 16,
1016 &staging_buffer[0],
1017 program.code_size,
1018 16,
1019 16,
1020 pds_upload_out);
1021 if (result != VK_SUCCESS) {
1022 vk_free2(&device->vk.alloc, allocator, staging_buffer);
1023 return result;
1024 }
1025
1026 *pds_info_out = (struct pvr_pds_info){
1027 .temps_required = program.highest_temp,
1028 .code_size_in_dwords = program.code_size,
1029 .data_size_in_dwords = program.data_size,
1030 };
1031
1032 vk_free2(&device->vk.alloc, allocator, staging_buffer);
1033
1034 return VK_SUCCESS;
1035 }
1036
1037 static void pvr_pds_compute_program_destroy(
1038 struct pvr_device *const device,
1039 const struct VkAllocationCallbacks *const allocator,
1040 struct pvr_pds_upload *const pds_program,
1041 struct pvr_pds_info *const pds_info)
1042 {
1043 /* We don't allocate an entries buffer, so we don't need to free it. */
1044 pvr_bo_suballoc_free(pds_program->pvr_bo);
1045 }
1046
1047 /* This only uploads the code segment. The data segment will need to be patched
1048 * with the base workgroup before uploading.
1049 */
1050 static VkResult pvr_pds_compute_base_workgroup_variant_program_init(
1051 struct pvr_device *const device,
1052 const VkAllocationCallbacks *const allocator,
1053 const uint32_t local_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
1054 const uint32_t work_group_input_regs[static const PVR_WORKGROUP_DIMENSIONS],
1055 uint32_t barrier_coefficient,
1056 uint32_t usc_temps,
1057 pvr_dev_addr_t usc_shader_dev_addr,
1058 struct pvr_pds_base_workgroup_program *program_out)
1059 {
1060 struct pvr_device_info *dev_info = &device->pdevice->dev_info;
1061 struct pvr_pds_compute_shader_program program;
1062 uint32_t buffer_size;
1063 uint32_t *buffer;
1064 VkResult result;
1065
1066 pvr_pds_compute_program_setup(dev_info,
1067 local_input_regs,
1068 work_group_input_regs,
1069 barrier_coefficient,
1070 true,
1071 usc_temps,
1072 usc_shader_dev_addr,
1073 &program);
1074
1075 /* FIXME: According to pvr_device_init_compute_pds_program() the code size
1076 * is in bytes. Investigate this.
1077 */
1078 buffer_size = PVR_DW_TO_BYTES(MAX2(program.code_size, program.data_size));
1079
1080 buffer = vk_alloc2(&device->vk.alloc,
1081 allocator,
1082 buffer_size,
1083 8,
1084 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
1085 if (!buffer)
1086 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1087
1088 pvr_pds_compute_shader(&program,
1089 &buffer[0],
1090 PDS_GENERATE_CODE_SEGMENT,
1091 dev_info);
1092
1093 /* FIXME: Figure out the define for alignment of 16. */
1094 result = pvr_gpu_upload_pds(device,
1095 NULL,
1096 0,
1097 0,
1098 buffer,
1099 program.code_size,
1100 16,
1101 16,
1102 &program_out->code_upload);
1103 if (result != VK_SUCCESS) {
1104 vk_free2(&device->vk.alloc, allocator, buffer);
1105 return result;
1106 }
1107
1108 pvr_pds_compute_shader(&program, buffer, PDS_GENERATE_DATA_SEGMENT, dev_info);
1109
1110 program_out->data_section = buffer;
1111
1112 /* We'll need to patch the base workgroup in the PDS data section before
1113 * dispatch so we save the offsets at which to patch. We only need to save
1114 * the offset for the first workgroup id since the workgroup ids are stored
1115 * contiguously in the data segment.
1116 */
1117 program_out->base_workgroup_data_patching_offset =
1118 program.base_workgroup_constant_offset_in_dwords[0];
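/* A minimal sketch of the patching expected at dispatch time (the names here
 * are illustrative, not the actual command buffer code):
 *
 *    uint32_t *data = <copy of program_out->data_section>;
 *    uint32_t off = program_out->base_workgroup_data_patching_offset;
 *
 *    data[off + 0] = base_workgroup_x;
 *    data[off + 1] = base_workgroup_y;
 *    data[off + 2] = base_workgroup_z;
 */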
1119
1120 program_out->info = (struct pvr_pds_info){
1121 .temps_required = program.highest_temp,
1122 .code_size_in_dwords = program.code_size,
1123 .data_size_in_dwords = program.data_size,
1124 };
1125
1126 return VK_SUCCESS;
1127 }
1128
1129 static void pvr_pds_compute_base_workgroup_variant_program_finish(
1130 struct pvr_device *device,
1131 const VkAllocationCallbacks *const allocator,
1132 struct pvr_pds_base_workgroup_program *const state)
1133 {
1134 pvr_bo_suballoc_free(state->code_upload.pvr_bo);
1135 vk_free2(&device->vk.alloc, allocator, state->data_section);
1136 }
1137
1138 /******************************************************************************
1139 Generic pipeline functions
1140 ******************************************************************************/
1141
1142 static void pvr_pipeline_init(struct pvr_device *device,
1143 enum pvr_pipeline_type type,
1144 struct pvr_pipeline *const pipeline)
1145 {
1146 assert(!pipeline->layout);
1147
1148 vk_object_base_init(&device->vk, &pipeline->base, VK_OBJECT_TYPE_PIPELINE);
1149
1150 pipeline->type = type;
1151 }
1152
1153 static void pvr_pipeline_finish(struct pvr_pipeline *pipeline)
1154 {
1155 vk_object_base_finish(&pipeline->base);
1156 }
1157
1158 /* How many shared regs it takes to store a pvr_dev_addr_t.
1159 * Each shared reg is 32 bits.
1160 */
1161 #define PVR_DEV_ADDR_SIZE_IN_SH_REGS \
1162 DIV_ROUND_UP(sizeof(pvr_dev_addr_t), sizeof(uint32_t))
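/* With 64-bit device addresses this comes to 2 shared registers. */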
1163
1164 /**
1165 * \brief Allocates shared registers.
1166 *
1167 * \return How many sh regs are required.
1168 */
1169 static uint32_t
1170 pvr_pipeline_alloc_shareds(const struct pvr_device *device,
1171 const struct pvr_pipeline_layout *layout,
1172 enum pvr_stage_allocation stage,
1173 struct pvr_sh_reg_layout *const sh_reg_layout_out)
1174 {
1175 ASSERTED const uint64_t reserved_shared_size =
1176 device->pdevice->dev_runtime_info.reserved_shared_size;
1177 ASSERTED const uint64_t max_coeff =
1178 device->pdevice->dev_runtime_info.max_coeffs;
1179
1180 struct pvr_sh_reg_layout reg_layout = { 0 };
1181 uint32_t next_free_sh_reg = 0;
1182
1183 reg_layout.descriptor_set_addrs_table.present =
1184 !!(layout->shader_stage_mask & BITFIELD_BIT(stage));
1185
1186 if (reg_layout.descriptor_set_addrs_table.present) {
1187 reg_layout.descriptor_set_addrs_table.offset = next_free_sh_reg;
1188 next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
1189 }
1190
1191 reg_layout.push_consts.present =
1192 !!(layout->push_constants_shader_stages & BITFIELD_BIT(stage));
1193
1194 if (reg_layout.push_consts.present) {
1195 reg_layout.push_consts.offset = next_free_sh_reg;
1196 next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
1197 }
1198
1199 *sh_reg_layout_out = reg_layout;
1200
1201 /* FIXME: We might need to take more things into consideration.
1202 * See pvr_calc_fscommon_size_and_tiles_in_flight().
1203 */
1204 assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
1205
1206 return next_free_sh_reg;
1207 }
1208
1209 /******************************************************************************
1210 Compute pipeline functions
1211 ******************************************************************************/
1212
1213 /* Compiles and uploads shaders and PDS programs. */
1214 static VkResult pvr_compute_pipeline_compile(
1215 struct pvr_device *const device,
1216 struct vk_pipeline_cache *cache,
1217 const VkComputePipelineCreateInfo *pCreateInfo,
1218 const VkAllocationCallbacks *const allocator,
1219 struct pvr_compute_pipeline *const compute_pipeline)
1220 {
1221 struct pvr_pipeline_layout *layout = compute_pipeline->base.layout;
1222 struct pvr_sh_reg_layout *sh_reg_layout =
1223 &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_COMPUTE];
1224 struct rogue_compile_time_consts_data compile_time_consts_data;
1225 uint32_t work_group_input_regs[PVR_WORKGROUP_DIMENSIONS];
1226 struct pvr_explicit_constant_usage explicit_const_usage;
1227 uint32_t local_input_regs[PVR_WORKGROUP_DIMENSIONS];
1228 struct rogue_ubo_data ubo_data;
1229 uint32_t barrier_coefficient;
1230 uint32_t usc_temps;
1231 VkResult result;
1232
1233 if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info)) {
1234 struct pvr_hard_code_compute_build_info build_info;
1235
1236 result = pvr_hard_code_compute_pipeline(device,
1237 &compute_pipeline->shader_state,
1238 &build_info);
1239 if (result != VK_SUCCESS)
1240 return result;
1241
1242 ubo_data = build_info.ubo_data;
1243 compile_time_consts_data = build_info.compile_time_consts_data;
1244
1245 /* We make sure that the compiler's unused reg value is compatible with
1246 * the pds api.
1247 */
1248 STATIC_ASSERT(ROGUE_REG_UNUSED == PVR_PDS_COMPUTE_INPUT_REG_UNUSED);
1249
1250 barrier_coefficient = build_info.barrier_reg;
1251
1252 /* TODO: Maybe change the pds api to use pointers so we avoid the copy. */
1253 local_input_regs[0] = build_info.local_invocation_regs[0];
1254 local_input_regs[1] = build_info.local_invocation_regs[1];
1255 /* This is not a mistake. We want to assign element 1 to element 2. */
1256 local_input_regs[2] = build_info.local_invocation_regs[1];
1257
1258 STATIC_ASSERT(
1259 __same_type(work_group_input_regs, build_info.work_group_regs));
1260 typed_memcpy(work_group_input_regs,
1261 build_info.work_group_regs,
1262 PVR_WORKGROUP_DIMENSIONS);
1263
1264 usc_temps = build_info.usc_temps;
1265
1266 explicit_const_usage = build_info.explicit_conts_usage;
1267
1268 } else {
1269 uint32_t sh_count;
1270 sh_count = pvr_pipeline_alloc_shareds(device,
1271 layout,
1272 PVR_STAGE_ALLOCATION_COMPUTE,
1273 sh_reg_layout);
1274
1275 compute_pipeline->shader_state.const_shared_reg_count = sh_count;
1276
1277 /* FIXME: Compile and upload the shader. */
1278 /* FIXME: Initialize the shader state and setup build info. */
1279 abort();
1280 };
1281
1282 result = pvr_pds_descriptor_program_create_and_upload(
1283 device,
1284 allocator,
1285 &compile_time_consts_data,
1286 &ubo_data,
1287 &explicit_const_usage,
1288 layout,
1289 PVR_STAGE_ALLOCATION_COMPUTE,
1290 sh_reg_layout,
1291 &compute_pipeline->descriptor_state);
1292 if (result != VK_SUCCESS)
1293 goto err_free_shader;
1294
1295 result = pvr_pds_compute_program_create_and_upload(
1296 device,
1297 allocator,
1298 local_input_regs,
1299 work_group_input_regs,
1300 barrier_coefficient,
1301 usc_temps,
1302 compute_pipeline->shader_state.bo->dev_addr,
1303 &compute_pipeline->primary_program,
1304 &compute_pipeline->primary_program_info);
1305 if (result != VK_SUCCESS)
1306 goto err_free_descriptor_program;
1307
1308 /* If the workgroup ID is required, then we require the base workgroup
1309 * variant of the PDS compute program as well.
1310 */
1311 compute_pipeline->flags.base_workgroup =
1312 work_group_input_regs[0] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
1313 work_group_input_regs[1] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED ||
1314 work_group_input_regs[2] != PVR_PDS_COMPUTE_INPUT_REG_UNUSED;
1315
1316 if (compute_pipeline->flags.base_workgroup) {
1317 result = pvr_pds_compute_base_workgroup_variant_program_init(
1318 device,
1319 allocator,
1320 local_input_regs,
1321 work_group_input_regs,
1322 barrier_coefficient,
1323 usc_temps,
1324 compute_pipeline->shader_state.bo->dev_addr,
1325 &compute_pipeline->primary_base_workgroup_variant_program);
1326 if (result != VK_SUCCESS)
1327 goto err_destroy_compute_program;
1328 }
1329
1330 return VK_SUCCESS;
1331
1332 err_destroy_compute_program:
1333 pvr_pds_compute_program_destroy(device,
1334 allocator,
1335 &compute_pipeline->primary_program,
1336 &compute_pipeline->primary_program_info);
1337
1338 err_free_descriptor_program:
1339 pvr_pds_descriptor_program_destroy(device,
1340 allocator,
1341 &compute_pipeline->descriptor_state);
1342
1343 err_free_shader:
1344 pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
1345
1346 return result;
1347 }
1348
1349 static VkResult
1350 pvr_compute_pipeline_init(struct pvr_device *device,
1351 struct vk_pipeline_cache *cache,
1352 const VkComputePipelineCreateInfo *pCreateInfo,
1353 const VkAllocationCallbacks *allocator,
1354 struct pvr_compute_pipeline *compute_pipeline)
1355 {
1356 VkResult result;
1357
1358 pvr_pipeline_init(device,
1359 PVR_PIPELINE_TYPE_COMPUTE,
1360 &compute_pipeline->base);
1361
1362 compute_pipeline->base.layout =
1363 pvr_pipeline_layout_from_handle(pCreateInfo->layout);
1364
1365 result = pvr_compute_pipeline_compile(device,
1366 cache,
1367 pCreateInfo,
1368 allocator,
1369 compute_pipeline);
1370 if (result != VK_SUCCESS) {
1371 pvr_pipeline_finish(&compute_pipeline->base);
1372 return result;
1373 }
1374
1375 return VK_SUCCESS;
1376 }
1377
1378 static VkResult
1379 pvr_compute_pipeline_create(struct pvr_device *device,
1380 struct vk_pipeline_cache *cache,
1381 const VkComputePipelineCreateInfo *pCreateInfo,
1382 const VkAllocationCallbacks *allocator,
1383 VkPipeline *const pipeline_out)
1384 {
1385 struct pvr_compute_pipeline *compute_pipeline;
1386 VkResult result;
1387
1388 compute_pipeline = vk_zalloc2(&device->vk.alloc,
1389 allocator,
1390 sizeof(*compute_pipeline),
1391 8,
1392 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
1393 if (!compute_pipeline)
1394 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1395
1396 /* Compiles and uploads shaders and PDS programs. */
1397 result = pvr_compute_pipeline_init(device,
1398 cache,
1399 pCreateInfo,
1400 allocator,
1401 compute_pipeline);
1402 if (result != VK_SUCCESS) {
1403 vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1404 return result;
1405 }
1406
1407 *pipeline_out = pvr_pipeline_to_handle(&compute_pipeline->base);
1408
1409 return VK_SUCCESS;
1410 }
1411
1412 static void pvr_compute_pipeline_destroy(
1413 struct pvr_device *const device,
1414 const VkAllocationCallbacks *const allocator,
1415 struct pvr_compute_pipeline *const compute_pipeline)
1416 {
1417 if (compute_pipeline->flags.base_workgroup) {
1418 pvr_pds_compute_base_workgroup_variant_program_finish(
1419 device,
1420 allocator,
1421 &compute_pipeline->primary_base_workgroup_variant_program);
1422 }
1423
1424 pvr_pds_compute_program_destroy(device,
1425 allocator,
1426 &compute_pipeline->primary_program,
1427 &compute_pipeline->primary_program_info);
1428 pvr_pds_descriptor_program_destroy(device,
1429 allocator,
1430 &compute_pipeline->descriptor_state);
1431 pvr_bo_suballoc_free(compute_pipeline->shader_state.bo);
1432
1433 pvr_pipeline_finish(&compute_pipeline->base);
1434
1435 vk_free2(&device->vk.alloc, allocator, compute_pipeline);
1436 }
1437
1438 VkResult
1439 pvr_CreateComputePipelines(VkDevice _device,
1440 VkPipelineCache pipelineCache,
1441 uint32_t createInfoCount,
1442 const VkComputePipelineCreateInfo *pCreateInfos,
1443 const VkAllocationCallbacks *pAllocator,
1444 VkPipeline *pPipelines)
1445 {
1446 VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
1447 PVR_FROM_HANDLE(pvr_device, device, _device);
1448 VkResult result = VK_SUCCESS;
1449
1450 for (uint32_t i = 0; i < createInfoCount; i++) {
1451 const VkResult local_result =
1452 pvr_compute_pipeline_create(device,
1453 cache,
1454 &pCreateInfos[i],
1455 pAllocator,
1456 &pPipelines[i]);
1457 if (local_result != VK_SUCCESS) {
1458 result = local_result;
1459 pPipelines[i] = VK_NULL_HANDLE;
1460 }
1461 }
1462
1463 return result;
1464 }
1465
1466 /******************************************************************************
1467 Graphics pipeline functions
1468 ******************************************************************************/
1469
1470 static void
1471 pvr_graphics_pipeline_destroy(struct pvr_device *const device,
1472 const VkAllocationCallbacks *const allocator,
1473 struct pvr_graphics_pipeline *const gfx_pipeline)
1474 {
1475 const uint32_t num_vertex_attrib_programs =
1476 ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs);
1477
1478 pvr_pds_descriptor_program_destroy(
1479 device,
1480 allocator,
1481 &gfx_pipeline->shader_state.fragment.descriptor_state);
1482
1483 pvr_pds_descriptor_program_destroy(
1484 device,
1485 allocator,
1486 &gfx_pipeline->shader_state.vertex.descriptor_state);
1487
1488 for (uint32_t i = 0; i < num_vertex_attrib_programs; i++) {
1489 struct pvr_pds_attrib_program *const attrib_program =
1490 &gfx_pipeline->shader_state.vertex.pds_attrib_programs[i];
1491
1492 pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
1493 }
1494
1495 pvr_bo_suballoc_free(
1496 gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
1497 pvr_bo_suballoc_free(
1498 gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo);
1499
1500 pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo);
1501 pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo);
1502
1503 pvr_pipeline_finish(&gfx_pipeline->base);
1504
1505 vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
1506 }
1507
1508 static void
1509 pvr_vertex_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
1510 const struct rogue_common_build_data *common_data,
1511 uint32_t vtxin_regs_used,
1512 const struct rogue_vs_build_data *vs_data)
1513 {
1514 struct pvr_vertex_shader_state *vertex_state =
1515 &gfx_pipeline->shader_state.vertex;
1516
1517 /* TODO: Hard coding these for now. These should be populated based on the
1518 * information returned by the compiler.
1519 */
1520 vertex_state->stage_state.const_shared_reg_count = common_data->shareds;
1521 vertex_state->stage_state.const_shared_reg_offset = 0;
1522 vertex_state->stage_state.coefficient_size = common_data->coeffs;
1523 vertex_state->stage_state.uses_atomic_ops = false;
1524 vertex_state->stage_state.uses_texture_rw = false;
1525 vertex_state->stage_state.uses_barrier = false;
1526 vertex_state->stage_state.has_side_effects = false;
1527 vertex_state->stage_state.empty_program = false;
1528
1529 /* This ends up unused since we'll use the temp_usage for the PDS program we
1530 * end up selecting, and the descriptor PDS program doesn't use any temps.
1531 * Let's set it to ~0 in case it ever gets used.
1532 */
1533 vertex_state->stage_state.pds_temps_count = ~0;
1534
1535 vertex_state->vertex_input_size = vtxin_regs_used;
1536 vertex_state->vertex_output_size =
1537 vs_data->num_vertex_outputs * ROGUE_REG_SIZE_BYTES;
1538 vertex_state->user_clip_planes_mask = 0;
1539 vertex_state->entry_offset = 0;
1540
1541 /* TODO: The number of varyings should be checked against the fragment
1542 * shader inputs and assigned in the place where that happens.
1543 * There will also be an opportunity to cull unused fs inputs/vs outputs.
1544 */
1545 pvr_csb_pack (&gfx_pipeline->shader_state.vertex.varying[0],
1546 TA_STATE_VARYING0,
1547 varying0) {
1548 varying0.f32_linear = vs_data->num_varyings;
1549 varying0.f32_flat = 0;
1550 varying0.f32_npc = 0;
1551 }
1552
1553 pvr_csb_pack (&gfx_pipeline->shader_state.vertex.varying[1],
1554 TA_STATE_VARYING1,
1555 varying1) {
1556 varying1.f16_linear = 0;
1557 varying1.f16_flat = 0;
1558 varying1.f16_npc = 0;
1559 }
1560 }
1561
1562 static void
1563 pvr_fragment_state_init(struct pvr_graphics_pipeline *gfx_pipeline,
1564 const struct rogue_common_build_data *common_data)
1565 {
1566 struct pvr_fragment_shader_state *fragment_state =
1567 &gfx_pipeline->shader_state.fragment;
1568
1569 /* TODO: Hard coding these for now. These should be populated based on the
1570 * information returned by the compiler.
1571 */
1572 fragment_state->stage_state.const_shared_reg_count = 0;
1573 fragment_state->stage_state.const_shared_reg_offset = 0;
1574 fragment_state->stage_state.coefficient_size = common_data->coeffs;
1575 fragment_state->stage_state.uses_atomic_ops = false;
1576 fragment_state->stage_state.uses_texture_rw = false;
1577 fragment_state->stage_state.uses_barrier = false;
1578 fragment_state->stage_state.has_side_effects = false;
1579 fragment_state->stage_state.empty_program = false;
1580
1581 fragment_state->pass_type = PVRX(TA_PASSTYPE_OPAQUE);
1582 fragment_state->entry_offset = 0;
1583
1584 /* We can't initialize it yet since we still need to generate the PDS
1585 * programs, so set it to `~0` to make sure that we set this up later on.
1586 */
1587 fragment_state->stage_state.pds_temps_count = ~0;
1588 }
1589
1590 static bool pvr_blend_factor_requires_consts(VkBlendFactor factor)
1591 {
1592 switch (factor) {
1593 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1594 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1595 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1596 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1597 return true;
1598
1599 default:
1600 return false;
1601 }
1602 }
1603
1604 /**
1605 * \brief Indicates whether dynamic blend constants are needed.
1606 *
1607 * Even when the blend constants are specified as dynamic state, the pipeline
1608 * might not actually use them. This function checks whether any enabled blend
1609 * attachment actually references the constants, so we know whether we need to
1610 * upload them later on for the shader to access.
1611 */
1612 static bool pvr_graphics_pipeline_requires_dynamic_blend_consts(
1613 const struct pvr_graphics_pipeline *gfx_pipeline)
1614 {
1615 const struct vk_dynamic_graphics_state *const state =
1616 &gfx_pipeline->dynamic_state;
1617
1618 if (BITSET_TEST(state->set, MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS))
1619 return false;
1620
1621 for (uint32_t i = 0; i < state->cb.attachment_count; i++) {
1622 const struct vk_color_blend_attachment_state *attachment =
1623 &state->cb.attachments[i];
1624
1625 const bool has_color_write =
1626 attachment->write_mask &
1627 (VK_COLOR_COMPONENT_R_BIT | VK_COLOR_COMPONENT_G_BIT |
1628 VK_COLOR_COMPONENT_B_BIT);
1629 const bool has_alpha_write = attachment->write_mask &
1630 VK_COLOR_COMPONENT_A_BIT;
1631
1632 if (!attachment->blend_enable || attachment->write_mask == 0)
1633 continue;
1634
1635 if (has_color_write) {
1636 const uint8_t src_color_blend_factor =
1637 attachment->src_color_blend_factor;
1638 const uint8_t dst_color_blend_factor =
1639 attachment->dst_color_blend_factor;
1640
1641 if (pvr_blend_factor_requires_consts(src_color_blend_factor) ||
1642 pvr_blend_factor_requires_consts(dst_color_blend_factor)) {
1643 return true;
1644 }
1645 }
1646
1647 if (has_alpha_write) {
1648 const uint8_t src_alpha_blend_factor =
1649 attachment->src_alpha_blend_factor;
1650 const uint8_t dst_alpha_blend_factor =
1651 attachment->dst_alpha_blend_factor;
1652
1653 if (pvr_blend_factor_requires_consts(src_alpha_blend_factor) ||
1654 pvr_blend_factor_requires_consts(dst_alpha_blend_factor)) {
1655 return true;
1656 }
1657 }
1658 }
1659
1660 return false;
1661 }
1662
1663 static uint32_t pvr_graphics_pipeline_alloc_shareds(
1664 const struct pvr_device *device,
1665 const struct pvr_graphics_pipeline *gfx_pipeline,
1666 enum pvr_stage_allocation stage,
1667 struct pvr_sh_reg_layout *const sh_reg_layout_out)
1668 {
1669 ASSERTED const uint64_t reserved_shared_size =
1670 device->pdevice->dev_runtime_info.reserved_shared_size;
1671 ASSERTED const uint64_t max_coeff =
1672 device->pdevice->dev_runtime_info.max_coeffs;
1673
1674 const struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
1675 struct pvr_sh_reg_layout reg_layout = { 0 };
1676 uint32_t next_free_sh_reg = 0;
1677
1678 next_free_sh_reg =
1679 pvr_pipeline_alloc_shareds(device, layout, stage, &reg_layout);
1680
1681 reg_layout.blend_consts.present =
1682 (stage == PVR_STAGE_ALLOCATION_FRAGMENT &&
1683 pvr_graphics_pipeline_requires_dynamic_blend_consts(gfx_pipeline));
1684 if (reg_layout.blend_consts.present) {
1685 reg_layout.blend_consts.offset = next_free_sh_reg;
1686 next_free_sh_reg += PVR_DEV_ADDR_SIZE_IN_SH_REGS;
1687 }
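/* Illustrative layout, assuming a 64 bit device address occupies two 32 bit
 * shared registers: if the descriptor sets consumed shared regs [0..N), the
 * blend constants' address takes regs N and N + 1, and next_free_sh_reg ends
 * up at N + 2.
 */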
1688
1689 *sh_reg_layout_out = reg_layout;
1690
1691 /* FIXME: We might need to take more things into consideration.
1692 * See pvr_calc_fscommon_size_and_tiles_in_flight().
1693 */
1694 assert(next_free_sh_reg <= reserved_shared_size - max_coeff);
1695
1696 return next_free_sh_reg;
1697 }
1698
1699 #undef PVR_DEV_ADDR_SIZE_IN_SH_REGS
1700
1701 static void pvr_graphics_pipeline_alloc_vertex_inputs(
1702 const VkPipelineVertexInputStateCreateInfo *const vs_data,
1703 rogue_vertex_inputs *const vertex_input_layout_out,
1704 unsigned *num_vertex_input_regs_out,
1705 pvr_pds_attrib_dma_descriptions_array_ptr dma_descriptions_out_ptr,
1706 uint32_t *const dma_count_out)
1707 {
1708 const VkVertexInputBindingDescription
1709 *sorted_bindings[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
1710 const VkVertexInputAttributeDescription
1711 *sorted_attributes[PVR_MAX_VERTEX_INPUT_BINDINGS] = { 0 };
1712
1713 rogue_vertex_inputs build_data = {
1714 .num_input_vars = vs_data->vertexAttributeDescriptionCount,
1715 };
1716 uint32_t next_reg_offset = 0;
1717
1718 struct pvr_pds_vertex_dma *const dma_descriptions =
1719 *dma_descriptions_out_ptr;
1720 uint32_t dma_count = 0;
1721
1722 /* Vertex attributes map to the `layout(location = x)` annotation in the
1723 * shader where `x` is the attribute's location.
1724 * Vertex bindings have NO relation to the shader. They have nothing to do
1725 * with the `layout(set = x, binding = y)` notation. They instead indicate
1726 * where the data for a collection of vertex attributes comes from. The
1727 * application binds a VkBuffer with vkCmdBindVertexBuffers() to a specific
1728 * binding number and based on that we'll know which buffer to DMA the data
1729 * from, to fill in the collection of vertex attributes.
1730 */
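/* Illustrative example (not from any particular application): a single
 * binding carrying an interleaved position + normal layout could be
 * described as
 *
 *    const VkVertexInputBindingDescription binding = {
 *       .binding = 0,
 *       .stride = 6 * sizeof(float),
 *       .inputRate = VK_VERTEX_INPUT_RATE_VERTEX,
 *    };
 *    const VkVertexInputAttributeDescription attribs[2] = {
 *       { .location = 0, .binding = 0,
 *         .format = VK_FORMAT_R32G32B32_SFLOAT, .offset = 0 },
 *       { .location = 1, .binding = 0,
 *         .format = VK_FORMAT_R32G32B32_SFLOAT,
 *         .offset = 3 * sizeof(float) },
 *    };
 *
 * i.e. both `layout(location = 0)` and `layout(location = 1)` in the vertex
 * shader get their data from the VkBuffer bound to binding 0.
 */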
1731
1732 for (uint32_t i = 0; i < vs_data->vertexBindingDescriptionCount; i++) {
1733 const VkVertexInputBindingDescription *binding_desc =
1734 &vs_data->pVertexBindingDescriptions[i];
1735
1736 sorted_bindings[binding_desc->binding] = binding_desc;
1737 }
1738
1739 for (uint32_t i = 0; i < vs_data->vertexAttributeDescriptionCount; i++) {
1740 const VkVertexInputAttributeDescription *attribute_desc =
1741 &vs_data->pVertexAttributeDescriptions[i];
1742
1743 sorted_attributes[attribute_desc->location] = attribute_desc;
1744 }
1745
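/* Compact the location-indexed sorted_attributes array so that its first
 * vertexAttributeDescriptionCount entries are the present attributes in
 * increasing location order.
 */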
1746 for (uint32_t i = 0, j = 0; i < ARRAY_SIZE(sorted_attributes); i++) {
1747 if (sorted_attributes[i])
1748 sorted_attributes[j++] = sorted_attributes[i];
1749 }
1750
1751 for (uint32_t i = 0; i < vs_data->vertexAttributeDescriptionCount; i++) {
1752 const VkVertexInputAttributeDescription *attribute = sorted_attributes[i];
1753 const VkVertexInputBindingDescription *binding =
1754 sorted_bindings[attribute->binding];
1755 const struct util_format_description *fmt_description =
1756 vk_format_description(attribute->format);
1757 struct pvr_pds_vertex_dma *dma_desc = &dma_descriptions[dma_count];
1758 unsigned vtxin_reg_offset;
1759
1760 /* Reg allocation. */
1761
1762 vtxin_reg_offset = next_reg_offset;
1763 build_data.base[i] = vtxin_reg_offset;
1764
1765 if (fmt_description->colorspace != UTIL_FORMAT_COLORSPACE_RGB ||
1766 fmt_description->layout != UTIL_FORMAT_LAYOUT_PLAIN ||
1767 fmt_description->block.bits % 32 != 0 || !fmt_description->is_array) {
1768 /* For now we only support formats with 32 bit components since we
1769 * don't need to pack/unpack them.
1770 */
1771 /* TODO: Support any other format with VERTEX_BUFFER_BIT set that
1772 * doesn't have 32 bit components if we're advertising any.
1773 */
1774 assert(false);
1775 }
1776
1777 /* TODO: Check if this is fine with the compiler. Does it want the number
1778 * of components, or a size in dwords, to figure out how many vtxin regs
1779 * are covered? For formats with 32 bit components the distinction doesn't
1780 * change anything.
1781 */
1782 build_data.components[i] =
1783 util_format_get_nr_components(fmt_description->format);
1784
1785 next_reg_offset += build_data.components[i];
1786
1787 /* DMA setup. */
1788
1789 /* The PDS program sets up DDMADs to DMA attributes into vtxin regs.
1790 *
1791 * DDMAD -> Multiply, add, and DOUTD (i.e. DMA from that address).
1792 * DMA source addr = src0 * src1 + src2
1793 * DMA params = src3
1794 *
1795 * In the PDS program we set up src0 with the binding's stride and src1
1796 * with either the instance id or vertex id (both of which get filled in
1797 * by the hardware). src2 is set up later on, once we know which VkBuffer
1798 * to DMA the data from, and so it gets saved for when we patch the data
1799 * section.
1800 */
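/* Worked example with illustrative numbers only: for a per-vertex attribute
 * in a binding with a 16 byte stride, vertex id 10 gives
 * src0 * src1 + src2 = 16 * 10 + base, i.e. the DMA reads from 160 bytes
 * past the base address patched into the data section, with src3 holding
 * the DMA parameters (size and destination).
 */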
1801
1802 /* TODO: Right now we're setting up a DMA per attribute. In a case where
1803 * there are multiple attributes packed into a single binding with
1804 * adjacent locations we'd still be DMAing them separately. This is not
1805 * great so the DMA setup should be smarter and could do with some
1806 * optimization.
1807 */
1808
1809 *dma_desc = (struct pvr_pds_vertex_dma){ 0 };
1810
1811 /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
1812 * this corresponds to `attribDesc.offset`.
1813 * The PDS program doesn't do anything with it other than save it in the
1814 * PDS program entry.
1815 */
1816 dma_desc->offset = attribute->offset;
1817
1818 /* In relation to the Vulkan spec. 22.4. Vertex Input Address Calculation
1819 * this corresponds to `bindingDesc.stride`.
1820 * The PDS program will calculate the `effectiveVertexOffset` with this
1821 * and add it to the address provided in the patched data segment.
1822 */
1823 dma_desc->stride = binding->stride;
1824
1825 if (binding->inputRate == VK_VERTEX_INPUT_RATE_INSTANCE)
1826 dma_desc->flags = PVR_PDS_VERTEX_DMA_FLAGS_INSTANCE_RATE;
1827 else
1828 dma_desc->flags = 0;
1829
1830 /* Size to DMA per vertex attribute. Used to set up src3 in the DDMAD. */
1831 assert(fmt_description->block.bits != 0); /* Likely an unsupported fmt. */
1832 dma_desc->size_in_dwords = fmt_description->block.bits / 32;
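/* E.g. VK_FORMAT_R32G32B32A32_SFLOAT has a 128 bit block, so 4 dwords get
 * DMAed per vertex for that attribute.
 */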
1833
1834 /* Vtxin reg offset to start DMAing into. */
1835 dma_desc->destination = vtxin_reg_offset;
1836
1837 /* Will be used by the driver to figure out buffer address to patch in the
1838 * data section. I.e. which binding we should DMA from.
1839 */
1840 dma_desc->binding_index = attribute->binding;
1841
1842 /* We don't currently support VK_EXT_vertex_attribute_divisor so no
1843 * repeating of instance-rate vertex attributes needed. We should always
1844 * move on to the next vertex attribute.
1845 */
1846 dma_desc->divisor = 1;
1847
1848 /* Will be used to generate PDS code that takes care of robust buffer
1849 * access, and later on by the driver to write the correct robustness
1850 * buffer address to DMA the fallback values from.
1851 */
1852 dma_desc->robustness_buffer_offset =
1853 pvr_get_robustness_buffer_format_offset(attribute->format);
1854
1855 /* Used later on by the driver to figure out if the buffer is being
1856 * accessed out of bounds, for robust buffer access.
1857 */
1858 dma_desc->component_size_in_bytes =
1859 fmt_description->block.bits / fmt_description->nr_channels / 8;
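/* E.g. for VK_FORMAT_R32G32B32A32_SFLOAT this is 128 / 4 / 8 = 4 bytes per
 * component.
 */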
1860
1861 dma_count++;
1862 }
1863
1864 *vertex_input_layout_out = build_data;
1865 *num_vertex_input_regs_out = next_reg_offset;
1866 *dma_count_out = dma_count;
1867 }
1868
1869 static void pvr_graphics_pipeline_alloc_vertex_special_vars(
1870 unsigned *num_vertex_input_regs,
1871 struct pvr_vertex_special_vars *special_vars_layout_out)
1872 {
1873 unsigned next_free_reg = *num_vertex_input_regs;
1874 struct pvr_vertex_special_vars layout;
1875
1876 /* We don't support VK_KHR_shader_draw_parameters or Vulkan 1.1 so no
1877 * BaseInstance, BaseVertex, DrawIndex.
1878 */
1879
1880 /* TODO: The shader might not actually use these, in which case we'd just be
1881 * wasting regs. Get the info from the compiler about whether or not the
1882 * shader uses them and allocate them accordingly. For now we'll set them up
1883 * regardless.
1884 */
1885
1886 layout.vertex_id_offset = (int16_t)next_free_reg;
1887 next_free_reg++;
1888
1889 layout.instance_id_offset = (int16_t)next_free_reg;
1890 next_free_reg++;
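/* The resulting vtxin layout is: the vertex attribute regs first, then one
 * reg for the vertex id and one for the instance id.
 */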
1891
1892 *num_vertex_input_regs = next_free_reg;
1893 *special_vars_layout_out = layout;
1894 }
1895
1896 /* Compiles and uploads shaders and PDS programs. */
1897 static VkResult
1898 pvr_graphics_pipeline_compile(struct pvr_device *const device,
1899 struct vk_pipeline_cache *cache,
1900 const VkGraphicsPipelineCreateInfo *pCreateInfo,
1901 const VkAllocationCallbacks *const allocator,
1902 struct pvr_graphics_pipeline *const gfx_pipeline)
1903 {
1904 /* FIXME: Remove this hard coding. */
1905 struct pvr_explicit_constant_usage vert_explicit_const_usage = {
1906 .start_offset = 16,
1907 };
1908 struct pvr_explicit_constant_usage frag_explicit_const_usage = {
1909 .start_offset = 0,
1910 };
1911 static uint32_t hard_code_pipeline_n = 0;
1912
1913 struct pvr_pipeline_layout *layout = gfx_pipeline->base.layout;
1914 struct pvr_sh_reg_layout *sh_reg_layout_vert =
1915 &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
1916 struct pvr_sh_reg_layout *sh_reg_layout_frag =
1917 &layout->sh_reg_layout_per_stage[PVR_STAGE_ALLOCATION_FRAGMENT];
1918 const VkPipelineVertexInputStateCreateInfo *const vertex_input_state =
1919 pCreateInfo->pVertexInputState;
1920 const uint32_t cache_line_size =
1921 rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
1922 struct rogue_compiler *compiler = device->pdevice->compiler;
1923 struct rogue_build_ctx *ctx;
1924 VkResult result;
1925
1926 const bool old_path = pvr_has_hard_coded_shaders(&device->pdevice->dev_info);
1927
1928 /* Vars needed for the new path. */
1929 struct pvr_pds_vertex_dma vtx_dma_descriptions[PVR_MAX_VERTEX_ATTRIB_DMAS];
1930 uint32_t vtx_dma_count = 0;
1931 rogue_vertex_inputs *vertex_input_layout;
1932 unsigned *vertex_input_reg_count;
1933
1934 /* TODO: The compiler should be making use of this to determine where
1935 * specific special variables are located in the vtxin reg set.
1936 */
1937 struct pvr_vertex_special_vars special_vars_layout = { 0 };
1938
1939 uint32_t sh_count[PVR_STAGE_ALLOCATION_COUNT] = { 0 };
1940
1941 /* Set up the shared build context. */
1942 ctx = rogue_build_context_create(compiler, layout);
1943 if (!ctx)
1944 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1945
1946 vertex_input_layout = &ctx->stage_data.vs.inputs;
1947 vertex_input_reg_count = &ctx->stage_data.vs.num_vertex_input_regs;
1948
1949 if (!old_path) {
1950 pvr_graphics_pipeline_alloc_vertex_inputs(vertex_input_state,
1951 vertex_input_layout,
1952 vertex_input_reg_count,
1953 &vtx_dma_descriptions,
1954 &vtx_dma_count);
1955
1956 pvr_graphics_pipeline_alloc_vertex_special_vars(vertex_input_reg_count,
1957 &special_vars_layout);
1958
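/* Allocate shared registers for the vertex/geometry and fragment allocation
 * stages only; compute pipelines allocate theirs in their own path.
 */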
1959 for (enum pvr_stage_allocation pvr_stage =
1960 PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY;
1961 pvr_stage < PVR_STAGE_ALLOCATION_COMPUTE;
1962 ++pvr_stage)
1963 sh_count[pvr_stage] = pvr_pipeline_alloc_shareds(
1964 device,
1965 layout,
1966 pvr_stage,
1967 &layout->sh_reg_layout_per_stage[pvr_stage]);
1968 }
1969
1970 /* NIR middle-end translation. */
1971 for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
1972 stage--) {
1973 const VkPipelineShaderStageCreateInfo *create_info;
1974 size_t stage_index = gfx_pipeline->stage_indices[stage];
1975
1976 if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info)) {
1977 if (pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
1978 BITFIELD_BIT(stage)) {
1979 continue;
1980 }
1981 }
1982
1983 /* Skip unused/inactive stages. */
1984 if (stage_index == ~0)
1985 continue;
1986
1987 create_info = &pCreateInfo->pStages[stage_index];
1988
1989 /* SPIR-V to NIR. */
1990 ctx->nir[stage] = pvr_spirv_to_nir(ctx, stage, create_info);
1991 if (!ctx->nir[stage]) {
1992 ralloc_free(ctx);
1993 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
1994 }
1995 }
1996
1997 /* Pre-back-end analysis and optimization, driver data extraction. */
1998 /* TODO: Analyze and cull unused I/O between stages. */
1999 /* TODO: Allocate UBOs between stages;
2000 * pipeline->layout->set_{count,layout}.
2001 */
2002
2003 /* Back-end translation. */
2004 for (gl_shader_stage stage = MESA_SHADER_FRAGMENT; stage > MESA_SHADER_NONE;
2005 stage--) {
2006 if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info) &&
2007 pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
2008 BITFIELD_BIT(stage)) {
2009 const struct pvr_device_info *const dev_info =
2010 &device->pdevice->dev_info;
2011 struct pvr_explicit_constant_usage *explicit_const_usage;
2012
2013 switch (stage) {
2014 case MESA_SHADER_VERTEX:
2015 explicit_const_usage = &vert_explicit_const_usage;
2016 break;
2017
2018 case MESA_SHADER_FRAGMENT:
2019 explicit_const_usage = &frag_explicit_const_usage;
2020 break;
2021
2022 default:
2023 unreachable("Unsupported stage.");
2024 }
2025
2026 pvr_hard_code_graphics_shader(dev_info,
2027 hard_code_pipeline_n,
2028 stage,
2029 &ctx->binary[stage]);
2030
2031 pvr_hard_code_graphics_get_build_info(dev_info,
2032 hard_code_pipeline_n,
2033 stage,
2034 &ctx->common_data[stage],
2035 &ctx->stage_data,
2036 explicit_const_usage);
2037
2038 continue;
2039 }
2040
2041 if (!ctx->nir[stage])
2042 continue;
2043
2044 ctx->rogue[stage] = pvr_nir_to_rogue(ctx, ctx->nir[stage]);
2045 if (!ctx->rogue[stage]) {
2046 ralloc_free(ctx);
2047 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2048 }
2049
2050 pvr_rogue_to_binary(ctx, ctx->rogue[stage], &ctx->binary[stage]);
2051 if (!ctx->binary[stage].size) {
2052 ralloc_free(ctx);
2053 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2054 }
2055 }
2056
2057 if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info) &&
2058 pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
2059 BITFIELD_BIT(MESA_SHADER_VERTEX)) {
2060 pvr_hard_code_graphics_vertex_state(&device->pdevice->dev_info,
2061 hard_code_pipeline_n,
2062 &gfx_pipeline->shader_state.vertex);
2063 } else {
2064 pvr_vertex_state_init(gfx_pipeline,
2065 &ctx->common_data[MESA_SHADER_VERTEX],
2066 *vertex_input_reg_count,
2067 &ctx->stage_data.vs);
2068
2069 if (!old_path) {
2070 struct pvr_vertex_shader_state *vertex_state =
2071 &gfx_pipeline->shader_state.vertex;
2072
2073 /* FIXME: For now we just overwrite it but the compiler shouldn't be
2074 * returning the sh count since the driver is in charge of allocating
2075 * them.
2076 */
2077 vertex_state->stage_state.const_shared_reg_count =
2078 sh_count[PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY];
2079
2080 gfx_pipeline->shader_state.vertex.vertex_input_size =
2081 ctx->stage_data.vs.num_vertex_input_regs;
2082 }
2083 }
2084
2085 result =
2086 pvr_gpu_upload_usc(device,
2087 util_dynarray_begin(&ctx->binary[MESA_SHADER_VERTEX]),
2088 ctx->binary[MESA_SHADER_VERTEX].size,
2089 cache_line_size,
2090 &gfx_pipeline->shader_state.vertex.bo);
2091 if (result != VK_SUCCESS)
2092 goto err_free_build_context;
2093
2094 if (ctx->nir[MESA_SHADER_FRAGMENT]) {
2095 struct pvr_fragment_shader_state *fragment_state =
2096 &gfx_pipeline->shader_state.fragment;
2097
2098 if (pvr_has_hard_coded_shaders(&device->pdevice->dev_info) &&
2099 pvr_hard_code_graphics_get_flags(&device->pdevice->dev_info) &
2100 BITFIELD_BIT(MESA_SHADER_FRAGMENT)) {
2101 pvr_hard_code_graphics_fragment_state(
2102 &device->pdevice->dev_info,
2103 hard_code_pipeline_n,
2104 &gfx_pipeline->shader_state.fragment);
2105 } else {
2106 pvr_fragment_state_init(gfx_pipeline,
2107 &ctx->common_data[MESA_SHADER_FRAGMENT]);
2108
2109 if (!old_path) {
2110 /* FIXME: For now we just overwrite it but the compiler shouldn't be
2111 * returning the sh count since the driver is in charge of
2112 * allocating them.
2113 */
2114 fragment_state->stage_state.const_shared_reg_count =
2115 sh_count[PVR_STAGE_ALLOCATION_FRAGMENT];
2116 }
2117 }
2118
2119 result = pvr_gpu_upload_usc(
2120 device,
2121 util_dynarray_begin(&ctx->binary[MESA_SHADER_FRAGMENT]),
2122 ctx->binary[MESA_SHADER_FRAGMENT].size,
2123 cache_line_size,
2124 &gfx_pipeline->shader_state.fragment.bo);
2125 if (result != VK_SUCCESS)
2126 goto err_free_vertex_bo;
2127
2128 /* TODO: powervr has an optimization where it attempts to recompile
2129 * shaders. See PipelineCompileNoISPFeedbackFragmentStage. Unimplemented
2130 * since in our case the optimization doesn't happen.
2131 */
2132
2133 result = pvr_pds_coeff_program_create_and_upload(
2134 device,
2135 allocator,
2136 ctx->stage_data.fs.iterator_args.fpu_iterators,
2137 ctx->stage_data.fs.iterator_args.num_fpu_iterators,
2138 ctx->stage_data.fs.iterator_args.destination,
2139 &fragment_state->pds_coeff_program,
2140 &fragment_state->stage_state.pds_temps_count);
2141 if (result != VK_SUCCESS)
2142 goto err_free_fragment_bo;
2143
2144 result = pvr_pds_fragment_program_create_and_upload(
2145 device,
2146 allocator,
2147 gfx_pipeline->shader_state.fragment.bo,
2148 ctx->common_data[MESA_SHADER_FRAGMENT].temps,
2149 ctx->stage_data.fs.msaa_mode,
2150 ctx->stage_data.fs.phas,
2151 &fragment_state->pds_fragment_program);
2152 if (result != VK_SUCCESS)
2153 goto err_free_coeff_program;
2154
2155 /* FIXME: For now we pass in the same explicit_const_usage since it
2156 * contains all invalid entries. Fix this by hooking it up to the
2157 * compiler.
2158 */
2159 result = pvr_pds_descriptor_program_create_and_upload(
2160 device,
2161 allocator,
2162 &ctx->common_data[MESA_SHADER_FRAGMENT].compile_time_consts_data,
2163 &ctx->common_data[MESA_SHADER_FRAGMENT].ubo_data,
2164 &frag_explicit_const_usage,
2165 layout,
2166 PVR_STAGE_ALLOCATION_FRAGMENT,
2167 sh_reg_layout_frag,
2168 &fragment_state->descriptor_state);
2169 if (result != VK_SUCCESS)
2170 goto err_free_frag_program;
2171
2172 /* If not, we need to MAX2() and set
2173 * `fragment_state->stage_state.pds_temps_count` appropriately.
2174 */
2175 assert(fragment_state->descriptor_state.pds_info.temps_required == 0);
2176 }
2177
2178 result = pvr_pds_vertex_attrib_programs_create_and_upload(
2179 device,
2180 allocator,
2181 vertex_input_state,
2182 ctx->common_data[MESA_SHADER_VERTEX].temps,
2183 &ctx->stage_data.vs,
2184 vtx_dma_descriptions,
2185 vtx_dma_count,
2186 &special_vars_layout,
2187 &gfx_pipeline->shader_state.vertex.pds_attrib_programs);
2188 if (result != VK_SUCCESS)
2189 goto err_free_frag_descriptor_program;
2190
2191 result = pvr_pds_descriptor_program_create_and_upload(
2192 device,
2193 allocator,
2194 &ctx->common_data[MESA_SHADER_VERTEX].compile_time_consts_data,
2195 &ctx->common_data[MESA_SHADER_VERTEX].ubo_data,
2196 &vert_explicit_const_usage,
2197 layout,
2198 PVR_STAGE_ALLOCATION_VERTEX_GEOMETRY,
2199 sh_reg_layout_vert,
2200 &gfx_pipeline->shader_state.vertex.descriptor_state);
2201 if (result != VK_SUCCESS)
2202 goto err_free_vertex_attrib_program;
2203
2204 /* FIXME: When the temp_buffer_total_size is non-zero we need to allocate a
2205 * scratch buffer for both vertex and fragment stage.
2206 * Figure out the best place to do this.
2207 */
2208 /* assert(pvr_pds_descriptor_program_variables.temp_buff_total_size == 0); */
2209 /* TODO: Implement spilling with the above. */
2210
2211 ralloc_free(ctx);
2212
2213 hard_code_pipeline_n++;
2214
2215 return VK_SUCCESS;
2216
2217 err_free_vertex_attrib_program:
2218 for (uint32_t i = 0;
2219 i < ARRAY_SIZE(gfx_pipeline->shader_state.vertex.pds_attrib_programs);
2220 i++) {
2221 struct pvr_pds_attrib_program *const attrib_program =
2222 &gfx_pipeline->shader_state.vertex.pds_attrib_programs[i];
2223
2224 pvr_pds_vertex_attrib_program_destroy(device, allocator, attrib_program);
2225 }
2226 err_free_frag_descriptor_program:
2227 pvr_pds_descriptor_program_destroy(
2228 device,
2229 allocator,
2230 &gfx_pipeline->shader_state.fragment.descriptor_state);
2231 err_free_frag_program:
2232 pvr_bo_suballoc_free(
2233 gfx_pipeline->shader_state.fragment.pds_fragment_program.pvr_bo);
2234 err_free_coeff_program:
2235 pvr_bo_suballoc_free(
2236 gfx_pipeline->shader_state.fragment.pds_coeff_program.pvr_bo);
2237 err_free_fragment_bo:
2238 pvr_bo_suballoc_free(gfx_pipeline->shader_state.fragment.bo);
2239 err_free_vertex_bo:
2240 pvr_bo_suballoc_free(gfx_pipeline->shader_state.vertex.bo);
2241 err_free_build_context:
2242 ralloc_free(ctx);
2243 return result;
2244 }
2245
2246 static struct vk_render_pass_state
2247 pvr_create_renderpass_state(const VkGraphicsPipelineCreateInfo *const info)
2248 {
2249 PVR_FROM_HANDLE(pvr_render_pass, pass, info->renderPass);
2250 const struct pvr_render_subpass *const subpass =
2251 &pass->subpasses[info->subpass];
2252
2253 enum vk_rp_attachment_flags attachments = 0;
2254
2255 assert(info->subpass < pass->subpass_count);
2256
2257 for (uint32_t i = 0; i < subpass->color_count; i++) {
2258 if (pass->attachments[subpass->color_attachments[i]].aspects)
2259 attachments |= MESA_VK_RP_ATTACHMENT_COLOR_0_BIT << i;
2260 }
2261
2262 if (subpass->depth_stencil_attachment != VK_ATTACHMENT_UNUSED) {
2263 VkImageAspectFlags ds_aspects =
2264 pass->attachments[subpass->depth_stencil_attachment].aspects;
2265 if (ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)
2266 attachments |= MESA_VK_RP_ATTACHMENT_DEPTH_BIT;
2267 if (ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)
2268 attachments |= MESA_VK_RP_ATTACHMENT_STENCIL_BIT;
2269 }
2270
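/* E.g. a subpass with two used color attachments and a depth-only
 * depth/stencil attachment sets the first two color attachment bits and
 * MESA_VK_RP_ATTACHMENT_DEPTH_BIT, but not MESA_VK_RP_ATTACHMENT_STENCIL_BIT.
 */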
2271 return (struct vk_render_pass_state){
2272 .attachments = attachments,
2273
2274 /* TODO: This is only needed for VK_KHR_create_renderpass2 (or core 1.2),
2275 * which is not currently supported.
2276 */
2277 .view_mask = 0,
2278 };
2279 }
2280
2281 static VkResult
2282 pvr_graphics_pipeline_init(struct pvr_device *device,
2283 struct vk_pipeline_cache *cache,
2284 const VkGraphicsPipelineCreateInfo *pCreateInfo,
2285 const VkAllocationCallbacks *allocator,
2286 struct pvr_graphics_pipeline *gfx_pipeline)
2287 {
2288 struct vk_dynamic_graphics_state *const dynamic_state =
2289 &gfx_pipeline->dynamic_state;
2290 const struct vk_render_pass_state rp_state =
2291 pvr_create_renderpass_state(pCreateInfo);
2292
2293 struct vk_graphics_pipeline_all_state all_state;
2294 struct vk_graphics_pipeline_state state = { 0 };
2295
2296 VkResult result;
2297
2298 pvr_pipeline_init(device, PVR_PIPELINE_TYPE_GRAPHICS, &gfx_pipeline->base);
2299
2300 result = vk_graphics_pipeline_state_fill(&device->vk,
2301 &state,
2302 pCreateInfo,
2303 &rp_state,
2304 0,
2305 &all_state,
2306 NULL,
2307 0,
2308 NULL);
2309 if (result != VK_SUCCESS)
2310 goto err_pipeline_finish;
2311
2312 vk_dynamic_graphics_state_init(dynamic_state);
2313
2314 /* Load static state into base dynamic state holder. */
2315 vk_dynamic_graphics_state_fill(dynamic_state, &state);
2316
2317 /* The value of ms.rasterization_samples is undefined when
2318 * rasterizer_discard_enable is set, but we need a specific value.
2319 * Fill that in here.
2320 */
2321 if (state.rs->rasterizer_discard_enable)
2322 dynamic_state->ms.rasterization_samples = VK_SAMPLE_COUNT_1_BIT;
2323
2324 memset(gfx_pipeline->stage_indices, ~0, sizeof(gfx_pipeline->stage_indices));
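/* ~0 marks a stage as unused; pvr_graphics_pipeline_compile() skips such
 * stages.
 */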
2325
2326 for (uint32_t i = 0; i < pCreateInfo->stageCount; i++) {
2327 VkShaderStageFlagBits vk_stage = pCreateInfo->pStages[i].stage;
2328 gl_shader_stage gl_stage = vk_to_mesa_shader_stage(vk_stage);
2329 /* From the Vulkan 1.2.192 spec for VkPipelineShaderStageCreateInfo:
2330 *
2331 * "stage must not be VK_SHADER_STAGE_ALL_GRAPHICS,
2332 * or VK_SHADER_STAGE_ALL."
2333 *
2334 * So we don't handle that.
2335 *
2336 * We also don't handle VK_SHADER_STAGE_TESSELLATION_* and
2337 * VK_SHADER_STAGE_GEOMETRY_BIT stages as 'tessellationShader' and
2338 * 'geometryShader' are set to false in the VkPhysicalDeviceFeatures
2339 * structure returned by the driver.
2340 */
2341 switch (pCreateInfo->pStages[i].stage) {
2342 case VK_SHADER_STAGE_VERTEX_BIT:
2343 case VK_SHADER_STAGE_FRAGMENT_BIT:
2344 gfx_pipeline->stage_indices[gl_stage] = i;
2345 break;
2346 default:
2347 unreachable("Unsupported stage.");
2348 }
2349 }
2350
2351 gfx_pipeline->base.layout =
2352 pvr_pipeline_layout_from_handle(pCreateInfo->layout);
2353
2354 /* Compiles and uploads shaders and PDS programs. */
2355 result = pvr_graphics_pipeline_compile(device,
2356 cache,
2357 pCreateInfo,
2358 allocator,
2359 gfx_pipeline);
2360 if (result != VK_SUCCESS)
2361 goto err_pipeline_finish;
2362
2363 return VK_SUCCESS;
2364
2365 err_pipeline_finish:
2366 pvr_pipeline_finish(&gfx_pipeline->base);
2367
2368 return result;
2369 }
2370
2371 /* If allocator == NULL, the internal one will be used. */
2372 static VkResult
2373 pvr_graphics_pipeline_create(struct pvr_device *device,
2374 struct vk_pipeline_cache *cache,
2375 const VkGraphicsPipelineCreateInfo *pCreateInfo,
2376 const VkAllocationCallbacks *allocator,
2377 VkPipeline *const pipeline_out)
2378 {
2379 struct pvr_graphics_pipeline *gfx_pipeline;
2380 VkResult result;
2381
2382 gfx_pipeline = vk_zalloc2(&device->vk.alloc,
2383 allocator,
2384 sizeof(*gfx_pipeline),
2385 8,
2386 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
2387 if (!gfx_pipeline)
2388 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2389
2390 /* Compiles and uploads shaders and PDS programs too. */
2391 result = pvr_graphics_pipeline_init(device,
2392 cache,
2393 pCreateInfo,
2394 allocator,
2395 gfx_pipeline);
2396 if (result != VK_SUCCESS) {
2397 vk_free2(&device->vk.alloc, allocator, gfx_pipeline);
2398 return result;
2399 }
2400
2401 *pipeline_out = pvr_pipeline_to_handle(&gfx_pipeline->base);
2402
2403 return VK_SUCCESS;
2404 }
2405
2406 VkResult
2407 pvr_CreateGraphicsPipelines(VkDevice _device,
2408 VkPipelineCache pipelineCache,
2409 uint32_t createInfoCount,
2410 const VkGraphicsPipelineCreateInfo *pCreateInfos,
2411 const VkAllocationCallbacks *pAllocator,
2412 VkPipeline *pPipelines)
2413 {
2414 VK_FROM_HANDLE(vk_pipeline_cache, cache, pipelineCache);
2415 PVR_FROM_HANDLE(pvr_device, device, _device);
2416 VkResult result = VK_SUCCESS;
2417
2418 for (uint32_t i = 0; i < createInfoCount; i++) {
2419 const VkResult local_result =
2420 pvr_graphics_pipeline_create(device,
2421 cache,
2422 &pCreateInfos[i],
2423 pAllocator,
2424 &pPipelines[i]);
2425 if (local_result != VK_SUCCESS) {
2426 result = local_result;
2427 pPipelines[i] = VK_NULL_HANDLE;
2428 }
2429 }
2430
2431 return result;
2432 }
2433
2434 /*****************************************************************************
2435 Other functions
2436 *****************************************************************************/
2437
2438 void pvr_DestroyPipeline(VkDevice _device,
2439 VkPipeline _pipeline,
2440 const VkAllocationCallbacks *pAllocator)
2441 {
2442 PVR_FROM_HANDLE(pvr_pipeline, pipeline, _pipeline);
2443 PVR_FROM_HANDLE(pvr_device, device, _device);
2444
2445 if (!pipeline)
2446 return;
2447
2448 switch (pipeline->type) {
2449 case PVR_PIPELINE_TYPE_GRAPHICS: {
2450 struct pvr_graphics_pipeline *const gfx_pipeline =
2451 to_pvr_graphics_pipeline(pipeline);
2452
2453 pvr_graphics_pipeline_destroy(device, pAllocator, gfx_pipeline);
2454 break;
2455 }
2456
2457 case PVR_PIPELINE_TYPE_COMPUTE: {
2458 struct pvr_compute_pipeline *const compute_pipeline =
2459 to_pvr_compute_pipeline(pipeline);
2460
2461 pvr_compute_pipeline_destroy(device, pAllocator, compute_pipeline);
2462 break;
2463 }
2464
2465 default:
2466 unreachable("Unknown pipeline type.");
2467 }
2468 }
2469