1 /*
2 * Copyright © 2022 Imagination Technologies Ltd.
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a copy
5 * of this software and associated documentation files (the "Software"), to deal
6 * in the Software without restriction, including without limitation the rights
7 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 * copies of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
23
24 #include <assert.h>
25 #include <stdbool.h>
26 #include <stdint.h>
27 #include <vulkan/vulkan.h>
28
29 #include "hwdef/rogue_hw_defs.h"
30 #include "hwdef/rogue_hw_utils.h"
31 #include "pvr_bo.h"
32 #include "pvr_csb.h"
33 #include "pvr_debug.h"
34 #include "pvr_csb_enum_helpers.h"
35 #include "pvr_debug.h"
36 #include "pvr_job_common.h"
37 #include "pvr_job_context.h"
38 #include "pvr_job_render.h"
39 #include "pvr_pds.h"
40 #include "pvr_private.h"
41 #include "pvr_rogue_fw.h"
42 #include "pvr_types.h"
43 #include "pvr_winsys.h"
44 #include "util/compiler.h"
45 #include "util/format/format_utils.h"
46 #include "util/macros.h"
47 #include "util/u_math.h"
48 #include "vk_alloc.h"
49 #include "vk_log.h"
50 #include "vk_util.h"
51
52 #define ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE 16U
53
54 /* FIXME: Is there a hardware define we can use instead? */
55 /* 1 DWord per PM physical page stored in the free list */
56 #define ROGUE_FREE_LIST_ENTRY_SIZE ((uint32_t)sizeof(uint32_t))
57
58 /* FIXME: The three defines below, for the number of PC, PD and PT entries in a
59 * 4KB page, come from rgxmmudefs_km.h (meaning they're part of the
60 * auto-generated hwdefs). Should these be defined in rogue_mmu.xml? Keeping in
61 * mind that we probably only need these three values. */
62 #define ROGUE_NUM_PC_ENTRIES_PER_PAGE 0x400U
63
64 #define ROGUE_NUM_PD_ENTRIES_PER_PAGE 0x200U
65
66 #define ROGUE_NUM_PT_ENTRIES_PER_PAGE 0x200U
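
/* A hedged sanity check of the three values above, assuming the usual Rogue
 * MMU entry sizes (4-byte page catalogue entries, 8-byte page directory and
 * page table entries) rather than values taken from the hwdefs:
 *
 *    4096 / 4 = 0x400 PC entries per 4KB page
 *    4096 / 8 = 0x200 PD entries per 4KB page
 *    4096 / 8 = 0x200 PT entries per 4KB page
 */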
67
68 struct pvr_free_list {
69 struct pvr_device *device;
70
71 uint64_t size;
72
73 struct pvr_bo *bo;
74
75 struct pvr_winsys_free_list *ws_free_list;
76 };
77
78 struct pvr_rt_dataset {
79 struct pvr_device *device;
80
81 /* RT dataset information */
82 uint32_t width;
83 uint32_t height;
84 uint32_t samples;
85 uint32_t layers;
86
87 struct pvr_free_list *global_free_list;
88 struct pvr_free_list *local_free_list;
89
90 struct pvr_bo *vheap_rtc_bo;
91 pvr_dev_addr_t vheap_dev_addr;
92 pvr_dev_addr_t rtc_dev_addr;
93
94 struct pvr_bo *tpc_bo;
95 uint64_t tpc_stride;
96 uint64_t tpc_size;
97
98 struct pvr_winsys_rt_dataset *ws_rt_dataset;
99
100 /* RT data information */
101 struct pvr_bo *mta_mlist_bo;
102
103 struct pvr_bo *rgn_headers_bo;
104 uint64_t rgn_headers_stride;
105
106 bool need_frag;
107
108 uint8_t rt_data_idx;
109
110 struct {
111 pvr_dev_addr_t mta_dev_addr;
112 pvr_dev_addr_t mlist_dev_addr;
113 pvr_dev_addr_t rgn_headers_dev_addr;
114 } rt_datas[ROGUE_NUM_RTDATAS];
115 };
116
117 VkResult pvr_free_list_create(struct pvr_device *device,
118 uint32_t initial_size,
119 uint32_t max_size,
120 uint32_t grow_size,
121 uint32_t grow_threshold,
122 struct pvr_free_list *parent_free_list,
123 struct pvr_free_list **const free_list_out)
124 {
125 const struct pvr_device_runtime_info *runtime_info =
126 &device->pdevice->dev_runtime_info;
127 struct pvr_winsys_free_list *parent_ws_free_list =
128 parent_free_list ? parent_free_list->ws_free_list : NULL;
129 const uint64_t bo_flags = PVR_BO_ALLOC_FLAG_GPU_UNCACHED |
130 PVR_BO_ALLOC_FLAG_PM_FW_PROTECT;
131 struct pvr_free_list *free_list;
132 uint32_t cache_line_size;
133 uint32_t initial_num_pages;
134 uint32_t grow_num_pages;
135 uint32_t max_num_pages;
136 uint64_t addr_alignment;
137 uint64_t size_alignment;
138 uint64_t size;
139 VkResult result;
140
141 assert((initial_size + grow_size) <= max_size);
142 assert(max_size != 0);
143 assert(grow_threshold <= 100);
144
145 /* Make sure the free list is created with at least a single page. */
146 if (initial_size == 0)
147 initial_size = ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
148
149 /* The freelist sizes must respect the PM freelist base address alignment
150 * requirement. As the freelist entries are cached by the SLC, it's also
151 * necessary to ensure the sizes respect the SLC cache line size to avoid
152 * invalid entries appearing in the cache, which would be problematic after
153 * a grow operation, as the SLC entries aren't invalidated. We do this by
154 * making sure the freelist values are appropriately aligned.
155 *
156 * To calculate the alignment, we first take the largest of the freelist
157 * base address alignment and the SLC cache line size. We then divide this
158 * by the freelist entry size to determine the number of freelist entries
159 * required by the PM. Finally, as each entry holds a single PM physical
160 * page, we multiply the number of entries by the page size.
161 *
162 * As an example, if the base address alignment is 16 bytes, the SLC cache
163 * line size is 64 bytes and the freelist entry size is 4 bytes then 16
164 * entries are required, as we take the SLC cacheline size (being the larger
165 * of the two values) and divide this by 4. If the PM page size is 4096
166 * bytes then we end up with an alignment of 65536 bytes.
167 */
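/* In code form, the example above works out as (values taken from the example
 * itself, not queried from the device):
 *
 *    addr_alignment = MAX2(16, 64) = 64 bytes
 *    size_alignment = (64 / ROGUE_FREE_LIST_ENTRY_SIZE) *
 *                     ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE
 *                   = 16 * 4096 = 65536 bytes
 */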
168 cache_line_size = rogue_get_slc_cache_line_size(&device->pdevice->dev_info);
169
170 addr_alignment =
171 MAX2(ROGUE_BIF_PM_FREELIST_BASE_ADDR_ALIGNSIZE, cache_line_size);
172 size_alignment = (addr_alignment / ROGUE_FREE_LIST_ENTRY_SIZE) *
173 ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
174
175 assert(util_is_power_of_two_nonzero64(size_alignment));
176
177 initial_size = align64(initial_size, size_alignment);
178 max_size = align64(max_size, size_alignment);
179 grow_size = align64(grow_size, size_alignment);
180
181 /* Make sure the 'max' size doesn't exceed what the firmware supports and
182 * adjust the other sizes accordingly.
183 */
184 if (max_size > runtime_info->max_free_list_size) {
185 max_size = runtime_info->max_free_list_size;
186 assert(align64(max_size, size_alignment) == max_size);
187 }
188
189 if (initial_size > max_size)
190 initial_size = max_size;
191
192 if (initial_size == max_size)
193 grow_size = 0;
194
195 initial_num_pages = initial_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
196 max_num_pages = max_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
197 grow_num_pages = grow_size >> ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
198
199 /* Calculate the size of the buffer needed to store the free list entries
200 * based on the maximum number of pages we can have.
201 */
202 size = max_num_pages * ROGUE_FREE_LIST_ENTRY_SIZE;
203 assert(align64(size, addr_alignment) == size);
204
205 free_list = vk_alloc(&device->vk.alloc,
206 sizeof(*free_list),
207 8,
208 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
209 if (!free_list)
210 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
211
212 /* FIXME: The memory is mapped GPU uncached, but this seems to contradict
213 * the comment above about aligning to the SLC cache line size.
214 */
215 result = pvr_bo_alloc(device,
216 device->heaps.general_heap,
217 size,
218 addr_alignment,
219 bo_flags,
220 &free_list->bo);
221 if (result != VK_SUCCESS)
222 goto err_vk_free_free_list;
223
224 result = device->ws->ops->free_list_create(device->ws,
225 free_list->bo->vma,
226 initial_num_pages,
227 max_num_pages,
228 grow_num_pages,
229 grow_threshold,
230 parent_ws_free_list,
231 &free_list->ws_free_list);
232 if (result != VK_SUCCESS)
233 goto err_pvr_bo_free_bo;
234
235 free_list->device = device;
236 free_list->size = size;
237
238 *free_list_out = free_list;
239
240 return VK_SUCCESS;
241
242 err_pvr_bo_free_bo:
243 pvr_bo_free(device, free_list->bo);
244
245 err_vk_free_free_list:
246 vk_free(&device->vk.alloc, free_list);
247
248 return result;
249 }
250
251 void pvr_free_list_destroy(struct pvr_free_list *free_list)
252 {
253 struct pvr_device *device = free_list->device;
254
255 device->ws->ops->free_list_destroy(free_list->ws_free_list);
256 pvr_bo_free(device, free_list->bo);
257 vk_free(&device->vk.alloc, free_list);
258 }
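
/* Illustrative usage only, mirroring the local freelist setup in
 * pvr_render_target_dataset_create() further down (not a prescriptive API
 * example):
 *
 *    struct pvr_free_list *free_list;
 *    VkResult result =
 *       pvr_free_list_create(device,
 *                            runtime_info->min_free_list_size,
 *                            runtime_info->min_free_list_size,
 *                            0,                       (grow_size)
 *                            0,                       (grow_threshold)
 *                            device->global_free_list,
 *                            &free_list);
 *    ...
 *    pvr_free_list_destroy(free_list);
 */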
259
260 static inline void pvr_get_samples_in_xy(uint32_t samples,
261 uint32_t *const x_out,
262 uint32_t *const y_out)
263 {
264 switch (samples) {
265 case 1:
266 *x_out = 1;
267 *y_out = 1;
268 break;
269 case 2:
270 *x_out = 1;
271 *y_out = 2;
272 break;
273 case 4:
274 *x_out = 2;
275 *y_out = 2;
276 break;
277 case 8:
278 *x_out = 2;
279 *y_out = 4;
280 break;
281 default:
282 unreachable("Unsupported number of samples");
283 }
284 }
285
286 void pvr_rt_mtile_info_init(const struct pvr_device_info *dev_info,
287 struct pvr_rt_mtile_info *info,
288 uint32_t width,
289 uint32_t height,
290 uint32_t samples)
291 {
292 uint32_t samples_in_x;
293 uint32_t samples_in_y;
294
295 pvr_get_samples_in_xy(samples, &samples_in_x, &samples_in_y);
296
297 info->tile_size_x = PVR_GET_FEATURE_VALUE(dev_info, tile_size_x, 1);
298 info->tile_size_y = PVR_GET_FEATURE_VALUE(dev_info, tile_size_y, 1);
299
300 info->num_tiles_x = DIV_ROUND_UP(width, info->tile_size_x);
301 info->num_tiles_y = DIV_ROUND_UP(height, info->tile_size_y);
302
303 rogue_get_num_macrotiles_xy(dev_info, &info->mtiles_x, &info->mtiles_y);
304
305 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
306 assert(PVR_GET_FEATURE_VALUE(dev_info,
307 simple_parameter_format_version,
308 0) == 2);
309 /* Set up 16 macrotiles with a multiple of 2x2 tiles per macrotile,
310 * which is aligned to a tile group.
311 */
312 info->mtile_x1 = DIV_ROUND_UP(info->num_tiles_x, 8) * 2;
313 info->mtile_y1 = DIV_ROUND_UP(info->num_tiles_y, 8) * 2;
314 info->mtile_x2 = 0;
315 info->mtile_y2 = 0;
316 info->mtile_x3 = 0;
317 info->mtile_y3 = 0;
318 info->x_tile_max = ALIGN_POT(info->num_tiles_x, 2) - 1;
319 info->y_tile_max = ALIGN_POT(info->num_tiles_y, 2) - 1;
320 } else {
321 /* Set up 16 macrotiles with a multiple of 4x4 tiles per macrotile. */
322 info->mtile_x1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_x, 4), 4);
323 info->mtile_y1 = ALIGN_POT(DIV_ROUND_UP(info->num_tiles_y, 4), 4);
324 info->mtile_x2 = info->mtile_x1 * 2;
325 info->mtile_y2 = info->mtile_y1 * 2;
326 info->mtile_x3 = info->mtile_x1 * 3;
327 info->mtile_y3 = info->mtile_y1 * 3;
328 info->x_tile_max = info->num_tiles_x - 1;
329 info->y_tile_max = info->num_tiles_y - 1;
330 }
331
332 info->tiles_per_mtile_x = info->mtile_x1 * samples_in_x;
333 info->tiles_per_mtile_y = info->mtile_y1 * samples_in_y;
334 }
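
/* Worked example (hedged: assumes a core without
 * simple_internal_parameter_format, 32x32 tiles, a 1920x1080 render and 4
 * samples; the real values come from the hwdefs):
 *
 *    num_tiles_x = DIV_ROUND_UP(1920, 32) = 60
 *    num_tiles_y = DIV_ROUND_UP(1080, 32) = 34
 *    mtile_x1    = ALIGN_POT(DIV_ROUND_UP(60, 4), 4) = 16
 *    mtile_y1    = ALIGN_POT(DIV_ROUND_UP(34, 4), 4) = 12
 *    mtile_x2/y2 = 32/24, mtile_x3/y3 = 48/36
 *    tiles_per_mtile_x = 16 * 2 = 32  (samples_in_x = 2 for 4x MSAA)
 *    tiles_per_mtile_y = 12 * 2 = 24  (samples_in_y = 2 for 4x MSAA)
 */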
335
336 /* Note that the unit of the return value depends on the GPU. For cores with the
337 * simple_internal_parameter_format feature the returned size is interpreted as
338 * the number of region headers. For cores without this feature, it's interpreted
339 * as the size in dwords.
340 */
341 static uint64_t
342 pvr_rt_get_isp_region_size(struct pvr_device *device,
343 const struct pvr_rt_mtile_info *mtile_info)
344 {
345 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
346 uint64_t rgn_size =
347 (uint64_t)mtile_info->tiles_per_mtile_x * mtile_info->tiles_per_mtile_y;
348
349 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
350 uint32_t version;
351
352 rgn_size *= (uint64_t)mtile_info->mtiles_x * mtile_info->mtiles_y;
353
354 if (PVR_FEATURE_VALUE(dev_info,
355 simple_parameter_format_version,
356 &version)) {
357 version = 0;
358 }
359
360 if (version == 2) {
361 /* One region header per 2x2 tile group. */
362 rgn_size /= (2U * 2U);
363 }
364 } else {
365 const uint64_t single_rgn_header_size =
366 rogue_get_region_header_size(dev_info);
367
368 /* Round up to the next dword to prevent IPF overrun and convert to dwords.
369 */
370 rgn_size = DIV_ROUND_UP(rgn_size * single_rgn_header_size, 4);
371 }
372
373 return rgn_size;
374 }
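
/* With the worked macrotile example following pvr_rt_mtile_info_init() above
 * (non-SIPF core), this gives:
 *
 *    rgn_size = 32 * 24 = 768 region headers
 *             -> DIV_ROUND_UP(768 * single_rgn_header_size, 4) dwords
 */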
375
376 static VkResult pvr_rt_vheap_rtc_data_init(struct pvr_device *device,
377 struct pvr_rt_dataset *rt_dataset,
378 uint32_t layers)
379 {
380 uint64_t vheap_size;
381 uint32_t alignment;
382 uint64_t rtc_size;
383 VkResult result;
384
385 vheap_size = ROGUE_CR_PM_VHEAP_TABLE_SIZE * ROGUE_PM_VHEAP_ENTRY_SIZE;
386
387 if (layers > 1) {
388 uint64_t rtc_entries;
389
390 vheap_size = ALIGN_POT(vheap_size, PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));
391
392 rtc_entries = ROGUE_NUM_TEAC + ROGUE_NUM_TE + ROGUE_NUM_VCE;
393 if (PVR_HAS_QUIRK(&device->pdevice->dev_info, 48545))
394 rtc_entries += ROGUE_NUM_TE;
395
396 rtc_size = rtc_entries * ROGUE_RTC_SIZE_IN_BYTES;
397 } else {
398 rtc_size = 0;
399 }
400
401 alignment = MAX2(PVRX(CR_PM_VHEAP_TABLE_BASE_ADDR_ALIGNMENT),
402 PVRX(CR_TA_RTC_ADDR_BASE_ALIGNMENT));
403
404 result = pvr_bo_alloc(device,
405 device->heaps.general_heap,
406 vheap_size + rtc_size,
407 alignment,
408 PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
409 &rt_dataset->vheap_rtc_bo);
410 if (result != VK_SUCCESS)
411 return result;
412
413 rt_dataset->vheap_dev_addr = rt_dataset->vheap_rtc_bo->vma->dev_addr;
414
415 if (rtc_size > 0) {
416 rt_dataset->rtc_dev_addr =
417 PVR_DEV_ADDR_OFFSET(rt_dataset->vheap_dev_addr, vheap_size);
418 } else {
419 rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
420 }
421
422 return VK_SUCCESS;
423 }
424
425 static void pvr_rt_vheap_rtc_data_fini(struct pvr_rt_dataset *rt_dataset)
426 {
427 rt_dataset->rtc_dev_addr = PVR_DEV_ADDR_INVALID;
428
429 pvr_bo_free(rt_dataset->device, rt_dataset->vheap_rtc_bo);
430 rt_dataset->vheap_rtc_bo = NULL;
431 }
432
433 static void
434 pvr_rt_get_tail_ptr_stride_size(const struct pvr_device *device,
435 const struct pvr_rt_mtile_info *mtile_info,
436 uint32_t layers,
437 uint64_t *const stride_out,
438 uint64_t *const size_out)
439 {
440 uint32_t max_num_mtiles;
441 uint32_t num_mtiles_x;
442 uint32_t num_mtiles_y;
443 uint32_t version;
444 uint64_t size;
445
446 num_mtiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
447 num_mtiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;
448
449 max_num_mtiles = MAX2(util_next_power_of_two64(num_mtiles_x),
450 util_next_power_of_two64(num_mtiles_y));
451
452 size = (uint64_t)max_num_mtiles * max_num_mtiles;
453
454 if (PVR_FEATURE_VALUE(&device->pdevice->dev_info,
455 simple_parameter_format_version,
456 &version)) {
457 version = 0;
458 }
459
460 if (version == 2) {
461 /* One tail pointer cache entry per 2x2 tile group. */
462 size /= (2U * 2U);
463 }
464
465 size *= ROGUE_TAIL_POINTER_SIZE;
466
467 if (layers > 1) {
468 size = ALIGN_POT(size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
469
470 *stride_out = size / ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE;
471 *size_out = size * layers;
472 } else {
473 *stride_out = 0;
474 *size_out = size;
475 }
476 }
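
/* Continuing the worked example from pvr_rt_mtile_info_init() (hedged:
 * additionally assumes 4x4 macrotiles from rogue_get_num_macrotiles_xy() and a
 * single layer):
 *
 *    num_mtiles_x   = 4 * 32 = 128
 *    num_mtiles_y   = 4 * 24 = 96
 *    max_num_mtiles = MAX2(128, 128) = 128
 *    size           = 128 * 128 * ROGUE_TAIL_POINTER_SIZE
 *    stride         = 0, since layers == 1
 */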
477
478 static VkResult pvr_rt_tpc_data_init(struct pvr_device *device,
479 struct pvr_rt_dataset *rt_dataset,
480 const struct pvr_rt_mtile_info *mtile_info,
481 uint32_t layers)
482 {
483 uint64_t tpc_size;
484
485 pvr_rt_get_tail_ptr_stride_size(device,
486 mtile_info,
487 layers,
488 &rt_dataset->tpc_stride,
489 &rt_dataset->tpc_size);
490 tpc_size = ALIGN_POT(rt_dataset->tpc_size, ROGUE_TE_TPC_CACHE_LINE_SIZE);
491
492 return pvr_bo_alloc(device,
493 device->heaps.general_heap,
494 tpc_size,
495 PVRX(CR_TE_TPC_ADDR_BASE_ALIGNMENT),
496 PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
497 &rt_dataset->tpc_bo);
498 }
499
500 static void pvr_rt_tpc_data_fini(struct pvr_rt_dataset *rt_dataset)
501 {
502 pvr_bo_free(rt_dataset->device, rt_dataset->tpc_bo);
503 rt_dataset->tpc_bo = NULL;
504 }
505
506 static uint32_t
507 pvr_rt_get_mlist_size(const struct pvr_free_list *global_free_list,
508 const struct pvr_free_list *local_free_list)
509 {
510 uint32_t num_pte_pages;
511 uint32_t num_pde_pages;
512 uint32_t num_pce_pages;
513 uint64_t total_pages;
514 uint32_t mlist_size;
515
516 assert(global_free_list->size + local_free_list->size <=
517 ROGUE_PM_MAX_PB_VIRT_ADDR_SPACE);
518
519 total_pages = (global_free_list->size + local_free_list->size) >>
520 ROGUE_BIF_PM_PHYSICAL_PAGE_SHIFT;
521
522 /* Calculate the total number of physical pages required to hold the page
523 * table, directory and catalog entries for the freelist pages.
524 */
525 num_pte_pages = DIV_ROUND_UP(total_pages, ROGUE_NUM_PT_ENTRIES_PER_PAGE);
526 num_pde_pages = DIV_ROUND_UP(num_pte_pages, ROGUE_NUM_PD_ENTRIES_PER_PAGE);
527 num_pce_pages = DIV_ROUND_UP(num_pde_pages, ROGUE_NUM_PC_ENTRIES_PER_PAGE);
528
529 /* Calculate the MList size considering the total number of pages in the PB
530 * are shared among all the PM address spaces.
531 */
532 mlist_size = (num_pce_pages + num_pde_pages + num_pte_pages) *
533 ROGUE_NUM_PM_ADDRESS_SPACES * ROGUE_MLIST_ENTRY_STRIDE;
534
535 return ALIGN_POT(mlist_size, ROGUE_BIF_PM_PHYSICAL_PAGE_SIZE);
536 }
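
/* Worked example (hedged: assumes the global and local freelists sum to
 * 256 MiB and 4KB PM physical pages; the PM constants keep their symbolic
 * names):
 *
 *    total_pages   = 0x10000000 >> 12 = 65536
 *    num_pte_pages = DIV_ROUND_UP(65536, 0x200) = 128
 *    num_pde_pages = DIV_ROUND_UP(128, 0x200)   = 1
 *    num_pce_pages = DIV_ROUND_UP(1, 0x400)     = 1
 *    mlist_size    = (1 + 1 + 128) * ROGUE_NUM_PM_ADDRESS_SPACES *
 *                    ROGUE_MLIST_ENTRY_STRIDE, rounded up to a page multiple
 */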
537
538 static void pvr_rt_get_region_headers_stride_size(
539 const struct pvr_device *device,
540 const struct pvr_rt_mtile_info *mtile_info,
541 uint32_t layers,
542 uint64_t *const stride_out,
543 uint64_t *const size_out)
544 {
545 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
546 const uint32_t single_rgn_header_size =
547 rogue_get_region_header_size(dev_info);
548 uint64_t rgn_headers_size;
549 uint32_t num_tiles_x;
550 uint32_t num_tiles_y;
551 uint32_t group_size;
552 uint32_t version;
553
554 if (PVR_FEATURE_VALUE(dev_info, simple_parameter_format_version, &version))
555 version = 0;
556
557 group_size = version == 2 ? 2 : 1;
558
559 num_tiles_x = mtile_info->mtiles_x * mtile_info->tiles_per_mtile_x;
560 num_tiles_y = mtile_info->mtiles_y * mtile_info->tiles_per_mtile_y;
561
562 rgn_headers_size = (uint64_t)num_tiles_x / group_size;
563 /* Careful here. We want the division to happen first. */
564 rgn_headers_size *= num_tiles_y / group_size;
565 rgn_headers_size *= single_rgn_header_size;
566
567 if (PVR_HAS_FEATURE(dev_info, simple_internal_parameter_format)) {
568 rgn_headers_size =
569 ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT));
570 }
571
572 if (layers > 1) {
573 rgn_headers_size =
574 ALIGN_POT(rgn_headers_size, PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE));
575 }
576
577 *stride_out = rgn_headers_size;
578 *size_out = rgn_headers_size * layers;
579 }
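
/* Continuing the same worked example (non-SIPF core so group_size == 1, 4x4
 * macrotiles, single layer):
 *
 *    num_tiles_x      = 4 * 32 = 128
 *    num_tiles_y      = 4 * 24 = 96
 *    rgn_headers_size = 128 * 96 * single_rgn_header_size bytes
 *    stride           = rgn_headers_size, size = rgn_headers_size * 1
 */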
580
581 static VkResult
582 pvr_rt_mta_mlist_data_init(struct pvr_device *device,
583 struct pvr_rt_dataset *rt_dataset,
584 const struct pvr_free_list *global_free_list,
585 const struct pvr_free_list *local_free_list,
586 const struct pvr_rt_mtile_info *mtile_info)
587 {
588 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
589 const uint32_t mlist_size =
590 pvr_rt_get_mlist_size(global_free_list, local_free_list);
591 uint32_t mta_size = rogue_get_macrotile_array_size(dev_info);
592 const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
593 uint32_t rt_datas_mlist_size;
594 uint32_t rt_datas_mta_size;
595 pvr_dev_addr_t dev_addr;
596 VkResult result;
597
598 /* Allocate memory for macrotile array and Mlist for all RT datas.
599 *
600 * Allocation layout: MTA[0..N] + Mlist alignment padding + Mlist[0..N].
601 *
602 * N is number of RT datas.
603 */
604 rt_datas_mta_size = ALIGN_POT(mta_size * num_rt_datas,
605 PVRX(CR_PM_MLIST0_BASE_ADDR_ALIGNMENT));
606 rt_datas_mlist_size = mlist_size * num_rt_datas;
607
608 result = pvr_bo_alloc(device,
609 device->heaps.general_heap,
610 rt_datas_mta_size + rt_datas_mlist_size,
611 PVRX(CR_PM_MTILE_ARRAY_BASE_ADDR_ALIGNMENT),
612 PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
613 &rt_dataset->mta_mlist_bo);
614 if (result != VK_SUCCESS)
615 return result;
616
617 dev_addr = rt_dataset->mta_mlist_bo->vma->dev_addr;
618
619 for (uint32_t i = 0; i < num_rt_datas; i++) {
620 if (mta_size != 0) {
621 rt_dataset->rt_datas[i].mta_dev_addr = dev_addr;
622 dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mta_size);
623 } else {
624 rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
625 }
626 }
627
628 dev_addr = PVR_DEV_ADDR_OFFSET(rt_dataset->mta_mlist_bo->vma->dev_addr,
629 rt_datas_mta_size);
630
631 for (uint32_t i = 0; i < num_rt_datas; i++) {
632 if (mlist_size != 0) {
633 rt_dataset->rt_datas[i].mlist_dev_addr = dev_addr;
634 dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, mlist_size);
635 } else {
636 rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
637 }
638 }
639
640 return VK_SUCCESS;
641 }
642
643 static void pvr_rt_mta_mlist_data_fini(struct pvr_rt_dataset *rt_dataset)
644 {
645 for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++) {
646 rt_dataset->rt_datas[i].mlist_dev_addr = PVR_DEV_ADDR_INVALID;
647 rt_dataset->rt_datas[i].mta_dev_addr = PVR_DEV_ADDR_INVALID;
648 }
649
650 pvr_bo_free(rt_dataset->device, rt_dataset->mta_mlist_bo);
651 rt_dataset->mta_mlist_bo = NULL;
652 }
653
654 static VkResult
655 pvr_rt_rgn_headers_data_init(struct pvr_device *device,
656 struct pvr_rt_dataset *rt_dataset,
657 const struct pvr_rt_mtile_info *mtile_info,
658 uint32_t layers)
659 {
660 const uint32_t num_rt_datas = ARRAY_SIZE(rt_dataset->rt_datas);
661 uint64_t rgn_headers_size;
662 pvr_dev_addr_t dev_addr;
663 VkResult result;
664
665 pvr_rt_get_region_headers_stride_size(device,
666 mtile_info,
667 layers,
668 &rt_dataset->rgn_headers_stride,
669 &rgn_headers_size);
670
671 result = pvr_bo_alloc(device,
672 device->heaps.rgn_hdr_heap,
673 rgn_headers_size * num_rt_datas,
674 PVRX(CR_TE_PSGREGION_ADDR_BASE_ALIGNMENT),
675 PVR_BO_ALLOC_FLAG_GPU_UNCACHED,
676 &rt_dataset->rgn_headers_bo);
677 if (result != VK_SUCCESS)
678 return result;
679
680 dev_addr = rt_dataset->rgn_headers_bo->vma->dev_addr;
681
682 for (uint32_t i = 0; i < num_rt_datas; i++) {
683 rt_dataset->rt_datas[i].rgn_headers_dev_addr = dev_addr;
684 dev_addr = PVR_DEV_ADDR_OFFSET(dev_addr, rgn_headers_size);
685 }
686
687 return VK_SUCCESS;
688 }
689
690 static void pvr_rt_rgn_headers_data_fini(struct pvr_rt_dataset *rt_dataset)
691 {
692 for (uint32_t i = 0; i < ARRAY_SIZE(rt_dataset->rt_datas); i++)
693 rt_dataset->rt_datas[i].rgn_headers_dev_addr = PVR_DEV_ADDR_INVALID;
694
695 pvr_bo_free(rt_dataset->device, rt_dataset->rgn_headers_bo);
696 rt_dataset->rgn_headers_bo = NULL;
697 }
698
699 static VkResult pvr_rt_datas_init(struct pvr_device *device,
700 struct pvr_rt_dataset *rt_dataset,
701 const struct pvr_free_list *global_free_list,
702 const struct pvr_free_list *local_free_list,
703 const struct pvr_rt_mtile_info *mtile_info,
704 uint32_t layers)
705 {
706 VkResult result;
707
708 result = pvr_rt_mta_mlist_data_init(device,
709 rt_dataset,
710 global_free_list,
711 local_free_list,
712 mtile_info);
713 if (result != VK_SUCCESS)
714 return result;
715
716 result =
717 pvr_rt_rgn_headers_data_init(device, rt_dataset, mtile_info, layers);
718 if (result != VK_SUCCESS)
719 goto err_pvr_rt_mta_mlist_data_fini;
720
721 return VK_SUCCESS;
722
723 err_pvr_rt_mta_mlist_data_fini:
724 pvr_rt_mta_mlist_data_fini(rt_dataset);
725
726 return result;
727 }
728
729 static void pvr_rt_datas_fini(struct pvr_rt_dataset *rt_dataset)
730 {
731 pvr_rt_rgn_headers_data_fini(rt_dataset);
732 pvr_rt_mta_mlist_data_fini(rt_dataset);
733 }
734
735 static void pvr_rt_dataset_ws_create_info_init(
736 struct pvr_rt_dataset *rt_dataset,
737 const struct pvr_rt_mtile_info *mtile_info,
738 struct pvr_winsys_rt_dataset_create_info *create_info)
739 {
740 struct pvr_device *device = rt_dataset->device;
741 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
742
743 memset(create_info, 0, sizeof(*create_info));
744
745 /* Local freelist. */
746 create_info->local_free_list = rt_dataset->local_free_list->ws_free_list;
747
748 create_info->width = rt_dataset->width;
749 create_info->height = rt_dataset->height;
750 create_info->samples = rt_dataset->samples;
751 create_info->layers = rt_dataset->layers;
752
753 /* ISP register values. */
754 if (PVR_HAS_ERN(dev_info, 42307) &&
755 !(PVR_HAS_FEATURE(dev_info, roguexe) && mtile_info->tile_size_x == 16)) {
756 float value;
757
758 if (rt_dataset->width != 0) {
759 value =
760 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->width;
761 create_info->isp_merge_lower_x = fui(value);
762
763 value =
764 ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->width;
765 create_info->isp_merge_upper_x = fui(value);
766 }
767
768 if (rt_dataset->height != 0) {
769 value =
770 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR / (float)rt_dataset->height;
771 create_info->isp_merge_lower_y = fui(value);
772
773 value =
774 ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR / (float)rt_dataset->height;
775 create_info->isp_merge_upper_y = fui(value);
776 }
777
778 value = ((float)rt_dataset->width * ROGUE_ISP_MERGE_SCALE_FACTOR) /
779 (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
780 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
781 create_info->isp_merge_scale_x = fui(value);
782
783 value = ((float)rt_dataset->height * ROGUE_ISP_MERGE_SCALE_FACTOR) /
784 (ROGUE_ISP_MERGE_UPPER_LIMIT_NUMERATOR -
785 ROGUE_ISP_MERGE_LOWER_LIMIT_NUMERATOR);
786 create_info->isp_merge_scale_y = fui(value);
787 }
788
789 /* Allocations and associated information. */
790 create_info->vheap_table_dev_addr = rt_dataset->vheap_dev_addr;
791 create_info->rtc_dev_addr = rt_dataset->rtc_dev_addr;
792
793 create_info->tpc_dev_addr = rt_dataset->tpc_bo->vma->dev_addr;
794 create_info->tpc_stride = rt_dataset->tpc_stride;
795 create_info->tpc_size = rt_dataset->tpc_size;
796
797 STATIC_ASSERT(ARRAY_SIZE(create_info->rt_datas) ==
798 ARRAY_SIZE(rt_dataset->rt_datas));
799 for (uint32_t i = 0; i < ARRAY_SIZE(create_info->rt_datas); i++) {
800 create_info->rt_datas[i].pm_mlist_dev_addr =
801 rt_dataset->rt_datas[i].mlist_dev_addr;
802 create_info->rt_datas[i].macrotile_array_dev_addr =
803 rt_dataset->rt_datas[i].mta_dev_addr;
804 create_info->rt_datas[i].rgn_header_dev_addr =
805 rt_dataset->rt_datas[i].rgn_headers_dev_addr;
806 }
807
808 create_info->rgn_header_size =
809 pvr_rt_get_isp_region_size(device, mtile_info);
810 }
811
812 VkResult
813 pvr_render_target_dataset_create(struct pvr_device *device,
814 uint32_t width,
815 uint32_t height,
816 uint32_t samples,
817 uint32_t layers,
818 struct pvr_rt_dataset **const rt_dataset_out)
819 {
820 struct pvr_device_runtime_info *runtime_info =
821 &device->pdevice->dev_runtime_info;
822 const struct pvr_device_info *dev_info = &device->pdevice->dev_info;
823 struct pvr_winsys_rt_dataset_create_info rt_dataset_create_info;
824 struct pvr_rt_mtile_info mtile_info;
825 struct pvr_rt_dataset *rt_dataset;
826 VkResult result;
827
828 assert(device->global_free_list);
829 assert(width <= rogue_get_render_size_max_x(dev_info));
830 assert(height <= rogue_get_render_size_max_y(dev_info));
831 assert(layers > 0 && layers <= PVR_MAX_FRAMEBUFFER_LAYERS);
832
833 pvr_rt_mtile_info_init(dev_info, &mtile_info, width, height, samples);
834
835 rt_dataset = vk_zalloc(&device->vk.alloc,
836 sizeof(*rt_dataset),
837 8,
838 VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
839 if (!rt_dataset)
840 return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
841
842 rt_dataset->device = device;
843 rt_dataset->width = width;
844 rt_dataset->height = height;
845 rt_dataset->samples = samples;
846 rt_dataset->layers = layers;
847 rt_dataset->global_free_list = device->global_free_list;
848
849 /* The maximum supported free list size is based on the assumption that this
850 * freelist (the "local" freelist) is always the minimum size required by
851 * the hardware. See the documentation of ROGUE_FREE_LIST_MAX_SIZE for more
852 * details.
853 */
854 result = pvr_free_list_create(device,
855 runtime_info->min_free_list_size,
856 runtime_info->min_free_list_size,
857 0 /* grow_size */,
858 0 /* grow_threshold */,
859 rt_dataset->global_free_list,
860 &rt_dataset->local_free_list);
861 if (result != VK_SUCCESS)
862 goto err_vk_free_rt_dataset;
863
864 result = pvr_rt_vheap_rtc_data_init(device, rt_dataset, layers);
865 if (result != VK_SUCCESS)
866 goto err_pvr_free_list_destroy;
867
868 result = pvr_rt_tpc_data_init(device, rt_dataset, &mtile_info, layers);
869 if (result != VK_SUCCESS)
870 goto err_pvr_rt_vheap_rtc_data_fini;
871
872 result = pvr_rt_datas_init(device,
873 rt_dataset,
874 rt_dataset->global_free_list,
875 rt_dataset->local_free_list,
876 &mtile_info,
877 layers);
878 if (result != VK_SUCCESS)
879 goto err_pvr_rt_tpc_data_fini;
880
881 /* rt_dataset must be fully initialized by this point since
882 * pvr_rt_dataset_ws_create_info_init() depends on this.
883 */
884 pvr_rt_dataset_ws_create_info_init(rt_dataset,
885 &mtile_info,
886 &rt_dataset_create_info);
887
888 result =
889 device->ws->ops->render_target_dataset_create(device->ws,
890 &rt_dataset_create_info,
891 dev_info,
892 &rt_dataset->ws_rt_dataset);
893 if (result != VK_SUCCESS)
894 goto err_pvr_rt_datas_fini;
895
896 *rt_dataset_out = rt_dataset;
897
898 return VK_SUCCESS;
899
900 err_pvr_rt_datas_fini:
901 pvr_rt_datas_fini(rt_dataset);
902
903 err_pvr_rt_tpc_data_fini:
904 pvr_rt_tpc_data_fini(rt_dataset);
905
906 err_pvr_rt_vheap_rtc_data_fini:
907 pvr_rt_vheap_rtc_data_fini(rt_dataset);
908
909 err_pvr_free_list_destroy:
910 pvr_free_list_destroy(rt_dataset->local_free_list);
911
912 err_vk_free_rt_dataset:
913 vk_free(&device->vk.alloc, rt_dataset);
914
915 return result;
916 }
917
918 void pvr_render_target_dataset_destroy(struct pvr_rt_dataset *rt_dataset)
919 {
920 struct pvr_device *device = rt_dataset->device;
921
922 device->ws->ops->render_target_dataset_destroy(rt_dataset->ws_rt_dataset);
923
924 pvr_rt_datas_fini(rt_dataset);
925 pvr_rt_tpc_data_fini(rt_dataset);
926 pvr_rt_vheap_rtc_data_fini(rt_dataset);
927
928 pvr_free_list_destroy(rt_dataset->local_free_list);
929
930 vk_free(&device->vk.alloc, rt_dataset);
931 }
932
933 static void pvr_geom_state_stream_init(struct pvr_render_ctx *ctx,
934 struct pvr_render_job *job,
935 struct pvr_winsys_geometry_state *state)
936 {
937 const struct pvr_device *const device = ctx->device;
938 const struct pvr_device_info *const dev_info = &device->pdevice->dev_info;
939
940 uint32_t *stream_ptr = (uint32_t *)state->fw_stream;
941 uint32_t *stream_len_ptr = stream_ptr;
942
943 /* Leave space for stream header. */
944 stream_ptr += pvr_cmd_length(KMD_STREAM_HDR);
945
946 pvr_csb_pack ((uint64_t *)stream_ptr, CR_VDM_CTRL_STREAM_BASE, value) {
947 value.addr = job->ctrl_stream_addr;
948 }
949 stream_ptr += pvr_cmd_length(CR_VDM_CTRL_STREAM_BASE);
950
951 pvr_csb_pack ((uint64_t *)stream_ptr,
952 CR_TPU_BORDER_COLOUR_TABLE_VDM,
953 value) {
954 value.border_colour_table_address =
955 device->border_color_table.table->vma->dev_addr;
956 }
957 stream_ptr += pvr_cmd_length(CR_TPU_BORDER_COLOUR_TABLE_VDM);
958
959 pvr_csb_pack (stream_ptr, CR_PPP_CTRL, value) {
960 value.wclampen = true;
961 value.fixed_point_format = 1;
962 }
963 stream_ptr += pvr_cmd_length(CR_PPP_CTRL);
964
965 pvr_csb_pack (stream_ptr, CR_TE_PSG, value) {
966 value.completeonterminate = job->geometry_terminate;
967
968 value.region_stride = job->rt_dataset->rgn_headers_stride /
969 PVRX(CR_TE_PSG_REGION_STRIDE_UNIT_SIZE);
970
971 value.forcenewstate = PVR_HAS_QUIRK(dev_info, 52942);
972 }
973 stream_ptr += pvr_cmd_length(CR_TE_PSG);
974
975 /* Set up the USC common size for the context switch resume/load program
976 * (ctx->ctx_switch.programs[i].sr->pds_load_program), which was created
977 * as part of the render context.
978 */
979 pvr_csb_pack (stream_ptr, VDMCTRL_PDS_STATE0, value) {
980 /* Calculate the size in bytes. */
981 const uint16_t shared_registers_size = job->max_shared_registers * 4;
982
983 value.usc_common_size =
984 DIV_ROUND_UP(shared_registers_size,
985 PVRX(VDMCTRL_PDS_STATE0_USC_COMMON_SIZE_UNIT_SIZE));
986 }
987 stream_ptr += pvr_cmd_length(VDMCTRL_PDS_STATE0);
988
989 /* clang-format off */
990 pvr_csb_pack (stream_ptr, KMD_STREAM_VIEW_IDX, value);
991 /* clang-format on */
992 stream_ptr += pvr_cmd_length(KMD_STREAM_VIEW_IDX);
993
994 state->fw_stream_len = (uint8_t *)stream_ptr - (uint8_t *)state->fw_stream;
995 assert(state->fw_stream_len <= ARRAY_SIZE(state->fw_stream));
996
997 pvr_csb_pack ((uint64_t *)stream_len_ptr, KMD_STREAM_HDR, value) {
998 value.length = state->fw_stream_len;
999 }
1000 }
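
/* The geometry FW stream emitted above is therefore laid out as (a sketch, in
 * emission order):
 *
 *    KMD_STREAM_HDR | CR_VDM_CTRL_STREAM_BASE | CR_TPU_BORDER_COLOUR_TABLE_VDM
 *    | CR_PPP_CTRL | CR_TE_PSG | VDMCTRL_PDS_STATE0 | KMD_STREAM_VIEW_IDX
 *
 * with the header's length field patched last, once the total byte length is
 * known. pvr_geom_state_stream_ext_init() below may then append an extension
 * header (and CR_TPU on cores with quirk 49927).
 */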
1001
1002 static void
1003 pvr_geom_state_stream_ext_init(struct pvr_render_ctx *ctx,
1004 struct pvr_render_job *job,
1005 struct pvr_winsys_geometry_state *state)
1006 {
1007 const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
1008
1009 uint32_t main_stream_len =
1010 pvr_csb_unpack((uint64_t *)state->fw_stream, KMD_STREAM_HDR).length;
1011 uint32_t *ext_stream_ptr =
1012 (uint32_t *)((uint8_t *)state->fw_stream + main_stream_len);
1013 uint32_t *header0_ptr;
1014
1015 header0_ptr = ext_stream_ptr;
1016 ext_stream_ptr += pvr_cmd_length(KMD_STREAM_EXTHDR_GEOM0);
1017
1018 pvr_csb_pack (header0_ptr, KMD_STREAM_EXTHDR_GEOM0, header0) {
1019 if (PVR_HAS_QUIRK(dev_info, 49927)) {
1020 header0.has_brn49927 = true;
1021
1022 /* The setup of CR_TPU must be identical to
1023 * pvr_frag_state_stream_ext_init().
1024 */
1025 pvr_csb_pack (ext_stream_ptr, CR_TPU, value) {
1026 value.tag_cem_4k_face_packing = true;
1027 }
1028 ext_stream_ptr += pvr_cmd_length(CR_TPU);
1029 }
1030 }
1031
1032 if ((*header0_ptr & PVRX(KMD_STREAM_EXTHDR_DATA_MASK)) != 0) {
1033 state->fw_stream_len =
1034 (uint8_t *)ext_stream_ptr - (uint8_t *)state->fw_stream;
1035 assert(state->fw_stream_len <= ARRAY_SIZE(state->fw_stream));
1036 }
1037 }
1038
1039 static void
1040 pvr_geom_state_flags_init(const struct pvr_render_job *const job,
1041 struct pvr_winsys_geometry_state_flags *flags)
1042 {
1043 *flags = (struct pvr_winsys_geometry_state_flags){
1044 .is_first_geometry = !job->rt_dataset->need_frag,
1045 .is_last_geometry = job->geometry_terminate,
1046 .use_single_core = job->frag_uses_atomic_ops,
1047 };
1048 }
1049
1050 static void
1051 pvr_render_job_ws_geometry_state_init(struct pvr_render_ctx *ctx,
1052 struct pvr_render_job *job,
1053 struct vk_sync *wait,
1054 struct pvr_winsys_geometry_state *state)
1055 {
1056 pvr_geom_state_stream_init(ctx, job, state);
1057 pvr_geom_state_stream_ext_init(ctx, job, state);
1058
1059 state->wait = wait;
1060 pvr_geom_state_flags_init(job, &state->flags);
1061 }
1062
1063 static inline uint32_t pvr_frag_km_stream_pbe_reg_words_offset(
1064 const struct pvr_device_info *const dev_info)
1065 {
1066 uint32_t offset = 0;
1067
1068 offset += pvr_cmd_length(KMD_STREAM_HDR);
1069 offset += pvr_cmd_length(CR_ISP_SCISSOR_BASE);
1070 offset += pvr_cmd_length(CR_ISP_DBIAS_BASE);
1071 offset += pvr_cmd_length(CR_ISP_OCLQRY_BASE);
1072 offset += pvr_cmd_length(CR_ISP_ZLSCTL);
1073 offset += pvr_cmd_length(CR_ISP_ZLOAD_BASE);
1074 offset += pvr_cmd_length(CR_ISP_STENCIL_LOAD_BASE);
1075
1076 if (PVR_HAS_FEATURE(dev_info, requires_fb_cdc_zls_setup))
1077 offset += pvr_cmd_length(CR_FB_CDC_ZLS);
1078
1079 return PVR_DW_TO_BYTES(offset);
1080 }
1081
1082 #define DWORDS_PER_U64 2
1083
1084 static inline uint32_t pvr_frag_km_stream_pds_eot_data_addr_offset(
1085 const struct pvr_device_info *const dev_info)
1086 {
1087 uint32_t offset = 0;
1088
1089 offset += pvr_frag_km_stream_pbe_reg_words_offset(dev_info) / 4U;
1090 offset +=
1091 PVR_MAX_COLOR_ATTACHMENTS * ROGUE_NUM_PBESTATE_REG_WORDS * DWORDS_PER_U64;
1092 offset += pvr_cmd_length(CR_TPU_BORDER_COLOUR_TABLE_PDM);
1093 offset += ROGUE_NUM_CR_PDS_BGRND_WORDS * DWORDS_PER_U64;
1094 offset += ROGUE_NUM_CR_PDS_BGRND_WORDS * DWORDS_PER_U64;
1095 offset += PVRX(KMD_STREAM_USC_CLEAR_REGISTER_COUNT) *
1096 pvr_cmd_length(CR_USC_CLEAR_REGISTER);
1097 offset += pvr_cmd_length(CR_USC_PIXEL_OUTPUT_CTRL);
1098 offset += pvr_cmd_length(CR_ISP_BGOBJDEPTH);
1099 offset += pvr_cmd_length(CR_ISP_BGOBJVALS);
1100 offset += pvr_cmd_length(CR_ISP_AA);
1101 offset += pvr_cmd_length(CR_ISP_CTL);
1102 offset += pvr_cmd_length(CR_EVENT_PIXEL_PDS_INFO);
1103
1104 if (PVR_HAS_FEATURE(dev_info, cluster_grouping))
1105 offset += pvr_cmd_length(KMD_STREAM_PIXEL_PHANTOM);
1106
1107 offset += pvr_cmd_length(KMD_STREAM_VIEW_IDX);
1108
1109 return PVR_DW_TO_BYTES(offset);
1110 }
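
/* Both byte offsets above exist so that
 * pvr_render_job_ws_fragment_pr_init_based_on_fragment_state() further down
 * can patch a copied fragment stream in place, roughly:
 *
 *    memcpy(&state->fw_stream[pvr_frag_km_stream_pbe_reg_words_offset(dev_info)],
 *           job->pr_pbe_reg_words, sizeof(job->pr_pbe_reg_words));
 *
 * followed by repacking CR_EVENT_PIXEL_PDS_DATA at
 * pvr_frag_km_stream_pds_eot_data_addr_offset(). The asserts in
 * pvr_frag_state_stream_init() keep these offsets in sync with the stream
 * layout.
 */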
1111
1112 static void pvr_frag_state_stream_init(struct pvr_render_ctx *ctx,
1113 struct pvr_render_job *job,
1114 struct pvr_winsys_fragment_state *state)
1115 {
1116 const struct pvr_device *const device = ctx->device;
1117 const struct pvr_physical_device *const pdevice = device->pdevice;
1118 const struct pvr_device_runtime_info *dev_runtime_info =
1119 &pdevice->dev_runtime_info;
1120 const struct pvr_device_info *dev_info = &pdevice->dev_info;
1121 const enum PVRX(CR_ISP_AA_MODE_TYPE)
1122 isp_aa_mode = pvr_cr_isp_aa_mode_type(job->samples);
1123
1124 enum PVRX(CR_ZLS_FORMAT_TYPE) zload_format = PVRX(CR_ZLS_FORMAT_TYPE_F32Z);
1125 uint32_t *stream_ptr = (uint32_t *)state->fw_stream;
1126 uint32_t *stream_len_ptr = stream_ptr;
1127 uint32_t pixel_ctl;
1128 uint32_t isp_ctl;
1129
1130 /* Leave space for stream header. */
1131 stream_ptr += pvr_cmd_length(KMD_STREAM_HDR);
1132
1133 /* FIXME: pass in the number of samples rather than isp_aa_mode? */
1134 pvr_setup_tiles_in_flight(dev_info,
1135 dev_runtime_info,
1136 isp_aa_mode,
1137 job->pixel_output_width,
1138 false,
1139 job->max_tiles_in_flight,
1140 &isp_ctl,
1141 &pixel_ctl);
1142
1143 pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_SCISSOR_BASE, value) {
1144 value.addr = job->scissor_table_addr;
1145 }
1146 stream_ptr += pvr_cmd_length(CR_ISP_SCISSOR_BASE);
1147
1148 pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_DBIAS_BASE, value) {
1149 value.addr = job->depth_bias_table_addr;
1150 }
1151 stream_ptr += pvr_cmd_length(CR_ISP_DBIAS_BASE);
1152
1153 pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_OCLQRY_BASE, value) {
1154 const struct pvr_sub_cmd_gfx *sub_cmd =
1155 container_of(job, const struct pvr_sub_cmd_gfx, job);
1156
1157 if (sub_cmd->query_pool)
1158 value.addr = sub_cmd->query_pool->result_buffer->dev_addr;
1159 else
1160 value.addr = PVR_DEV_ADDR_INVALID;
1161 }
1162 stream_ptr += pvr_cmd_length(CR_ISP_OCLQRY_BASE);
1163
1164 pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_ZLSCTL, value) {
1165 if (job->has_depth_attachment || job->has_stencil_attachment) {
1166 uint32_t alignment_x;
1167 uint32_t alignment_y;
1168
1169 if (job->ds.has_alignment_transfers) {
1170 rogue_get_zls_tile_size_xy(dev_info, &alignment_x, &alignment_y);
1171 } else {
1172 alignment_x = ROGUE_IPF_TILE_SIZE_PIXELS;
1173 alignment_y = ROGUE_IPF_TILE_SIZE_PIXELS;
1174 }
1175
1176 rogue_get_isp_num_tiles_xy(
1177 dev_info,
1178 job->samples,
1179 ALIGN_POT(job->ds.physical_extent.width, alignment_x),
1180 ALIGN_POT(job->ds.physical_extent.height, alignment_y),
1181 &value.zlsextent_x_z,
1182 &value.zlsextent_y_z);
1183
1184 value.zlsextent_x_z -= 1;
1185 value.zlsextent_y_z -= 1;
1186
1187 if (job->ds.memlayout == PVR_MEMLAYOUT_TWIDDLED &&
1188 !job->ds.has_alignment_transfers) {
1189 value.loadtwiddled = true;
1190 value.storetwiddled = true;
1191 }
1192
1193 value.zloadformat = job->ds.zls_format;
1194 value.zstoreformat = job->ds.zls_format;
1195
1196 zload_format = value.zloadformat;
1197 }
1198
1199 if (job->has_depth_attachment) {
1200 value.zloaden = job->ds.load.d;
1201 value.zstoreen = job->ds.store.d;
1202 }
1203
1204 if (job->has_stencil_attachment) {
1205 value.sloaden = job->ds.load.s;
1206 value.sstoreen = job->ds.store.s;
1207 }
1208
1209 value.forcezload = value.zloaden || value.sloaden;
1210 value.forcezstore = value.zstoreen || value.sstoreen;
1211 }
1212 stream_ptr += pvr_cmd_length(CR_ISP_ZLSCTL);
1213
1214 pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_ZLOAD_BASE, value) {
1215 if (job->has_depth_attachment)
1216 value.addr = job->ds.addr;
1217 }
1218 stream_ptr += pvr_cmd_length(CR_ISP_ZLOAD_BASE);
1219
1220 pvr_csb_pack ((uint64_t *)stream_ptr, CR_ISP_STENCIL_LOAD_BASE, value) {
1221 if (job->has_stencil_attachment) {
1222 value.addr = job->ds.addr;
1223
1224 /* Enable separate stencil. This should be enabled iff the buffer set
1225 * in CR_ISP_STENCIL_LOAD_BASE does not contain a depth component.
1226 */
1227 assert(job->has_depth_attachment ||
1228 !pvr_zls_format_type_is_packed(job->ds.zls_format));
1229 value.enable = !job->has_depth_attachment;
1230 }
1231 }
1232 stream_ptr += pvr_cmd_length(CR_ISP_STENCIL_LOAD_BASE);
1233
1234 if (PVR_HAS_FEATURE(dev_info, requires_fb_cdc_zls_setup)) {
1235 /* Currently no support for FBC, so just go ahead and set the default
1236 * values.
1237 */
1238 pvr_csb_pack ((uint64_t *)stream_ptr, CR_FB_CDC_ZLS, value) {
1239 value.fbdc_depth_fmt = PVRX(TEXSTATE_FORMAT_F32);
1240 value.fbdc_stencil_fmt = PVRX(TEXSTATE_FORMAT_U8);
1241 }
1242 stream_ptr += pvr_cmd_length(CR_FB_CDC_ZLS);
1243 }
1244
1245 /* Make sure that the pvr_frag_km_...() function is returning the correct
1246 * offset.
1247 */
1248 assert((uint8_t *)stream_ptr - (uint8_t *)state->fw_stream ==
1249 pvr_frag_km_stream_pbe_reg_words_offset(dev_info));
1250
1251 STATIC_ASSERT(ARRAY_SIZE(job->pbe_reg_words) == PVR_MAX_COLOR_ATTACHMENTS);
1252 STATIC_ASSERT(ARRAY_SIZE(job->pbe_reg_words[0]) ==
1253 ROGUE_NUM_PBESTATE_REG_WORDS);
1254 STATIC_ASSERT(sizeof(job->pbe_reg_words[0][0]) == sizeof(uint64_t));
1255 memcpy(stream_ptr, job->pbe_reg_words, sizeof(job->pbe_reg_words));
1256 stream_ptr +=
1257 PVR_MAX_COLOR_ATTACHMENTS * ROGUE_NUM_PBESTATE_REG_WORDS * DWORDS_PER_U64;
1258
1259 pvr_csb_pack ((uint64_t *)stream_ptr,
1260 CR_TPU_BORDER_COLOUR_TABLE_PDM,
1261 value) {
1262 value.border_colour_table_address =
1263 device->border_color_table.table->vma->dev_addr;
1264 }
1265 stream_ptr += pvr_cmd_length(CR_TPU_BORDER_COLOUR_TABLE_PDM);
1266
1267 STATIC_ASSERT(ARRAY_SIZE(job->pds_bgnd_reg_values) ==
1268 ROGUE_NUM_CR_PDS_BGRND_WORDS);
1269 STATIC_ASSERT(sizeof(job->pds_bgnd_reg_values[0]) == sizeof(uint64_t));
1270 memcpy(stream_ptr,
1271 job->pds_bgnd_reg_values,
1272 sizeof(job->pds_bgnd_reg_values));
1273 stream_ptr += ROGUE_NUM_CR_PDS_BGRND_WORDS * DWORDS_PER_U64;
1274
1275 STATIC_ASSERT(ARRAY_SIZE(job->pds_pr_bgnd_reg_values) ==
1276 ROGUE_NUM_CR_PDS_BGRND_WORDS);
1277 STATIC_ASSERT(sizeof(job->pds_pr_bgnd_reg_values[0]) == sizeof(uint64_t));
1278 memcpy(stream_ptr,
1279 job->pds_pr_bgnd_reg_values,
1280 sizeof(job->pds_pr_bgnd_reg_values));
1281 stream_ptr += ROGUE_NUM_CR_PDS_BGRND_WORDS * DWORDS_PER_U64;
1282
1283 #undef DWORDS_PER_U64
1284
1285 memset(stream_ptr,
1286 0,
1287 PVRX(KMD_STREAM_USC_CLEAR_REGISTER_COUNT) *
1288 PVR_DW_TO_BYTES(pvr_cmd_length(CR_USC_CLEAR_REGISTER)));
1289 stream_ptr += PVRX(KMD_STREAM_USC_CLEAR_REGISTER_COUNT) *
1290 pvr_cmd_length(CR_USC_CLEAR_REGISTER);
1291
1292 *stream_ptr = pixel_ctl;
1293 stream_ptr += pvr_cmd_length(CR_USC_PIXEL_OUTPUT_CTRL);
1294
1295 pvr_csb_pack (stream_ptr, CR_ISP_BGOBJDEPTH, value) {
1296 const float depth_clear = job->ds_clear_value.depth;
1297
1298 /* This is valid even when we don't have a depth attachment because:
1299 * - zload_format is set to a sensible default above, and
1300 * - job->ds_clear_value.depth is set to a sensible default in that case.
1301 */
1302 switch (zload_format) {
1303 case PVRX(CR_ZLS_FORMAT_TYPE_F32Z):
1304 value.value = fui(depth_clear);
1305 break;
1306
1307 case PVRX(CR_ZLS_FORMAT_TYPE_16BITINT):
1308 value.value = _mesa_float_to_unorm(depth_clear, 16);
1309 break;
1310
1311 case PVRX(CR_ZLS_FORMAT_TYPE_24BITINT):
1312 value.value = _mesa_float_to_unorm(depth_clear, 24);
1313 break;
1314
1315 default:
1316 unreachable("Unsupported depth format");
1317 }
1318 }
1319 stream_ptr += pvr_cmd_length(CR_ISP_BGOBJDEPTH);
1320
1321 pvr_csb_pack (stream_ptr, CR_ISP_BGOBJVALS, value) {
1322 value.enablebgtag = job->enable_bg_tag;
1323
1324 value.mask = true;
1325
1326 value.stencil = job->ds_clear_value.stencil & 0xFF;
1327 }
1328 stream_ptr += pvr_cmd_length(CR_ISP_BGOBJVALS);
1329
1330 pvr_csb_pack (stream_ptr, CR_ISP_AA, value) {
1331 value.mode = isp_aa_mode;
1332 }
1333 stream_ptr += pvr_cmd_length(CR_ISP_AA);
1334
1335 pvr_csb_pack (stream_ptr, CR_ISP_CTL, value) {
1336 value.sample_pos = true;
1337 value.process_empty_tiles = job->process_empty_tiles;
1338
1339 /* For integer depth formats we'll convert the specified floating point
1340 * depth bias values and specify them as integers. In this mode a depth
1341 * bias factor of 1.0 equates to 1 ULP of increase to the depth value.
1342 */
1343 value.dbias_is_int = PVR_HAS_ERN(dev_info, 42307) &&
1344 pvr_zls_format_type_is_int(job->ds.zls_format);
1345 }
1346 /* FIXME: When pvr_setup_tiles_in_flight() is refactored it might be
1347 * possible to fully pack CR_ISP_CTL above rather than having to OR in part
1348 * of the value.
1349 */
1350 *stream_ptr |= isp_ctl;
1351 stream_ptr += pvr_cmd_length(CR_ISP_CTL);
1352
1353 pvr_csb_pack (stream_ptr, CR_EVENT_PIXEL_PDS_INFO, value) {
1354 value.const_size =
1355 DIV_ROUND_UP(ctx->device->pixel_event_data_size_in_dwords,
1356 PVRX(CR_EVENT_PIXEL_PDS_INFO_CONST_SIZE_UNIT_SIZE));
1357 value.temp_stride = 0;
1358 value.usc_sr_size =
1359 DIV_ROUND_UP(PVR_STATE_PBE_DWORDS,
1360 PVRX(CR_EVENT_PIXEL_PDS_INFO_USC_SR_SIZE_UNIT_SIZE));
1361 }
1362 stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_INFO);
1363
1364 if (PVR_HAS_FEATURE(dev_info, cluster_grouping)) {
1365 pvr_csb_pack (stream_ptr, KMD_STREAM_PIXEL_PHANTOM, value) {
1366 /* Each phantom has its own MCU, so atomicity can only be guaranteed
1367 * when all work items are processed on the same phantom. This means
1368 * we need to disable all USCs other than those of the first
1369 * phantom, which has 4 clusters. Note that we only need to do this
1370 * for atomic operations in fragment shaders, since hardware
1371 * prevents the TA from running on more than one phantom anyway.
1372 */
1373 /* Note that leaving all phantoms disabled (as csbgen will do by
1374 * default since it will zero out things) will set them to their
1375 * default state (i.e. enabled) instead of disabling them.
1376 */
1377 if (PVR_HAS_FEATURE(dev_info, slc_mcu_cache_controls) &&
1378 dev_runtime_info->num_phantoms > 1 && job->frag_uses_atomic_ops) {
1379 value.phantom_0 = PVRX(KMD_STREAM_PIXEL_PHANTOM_STATE_ENABLED);
1380 }
1381 }
1382 stream_ptr += pvr_cmd_length(KMD_STREAM_PIXEL_PHANTOM);
1383 }
1384
1385 /* clang-format off */
1386 pvr_csb_pack (stream_ptr, KMD_STREAM_VIEW_IDX, value);
1387 /* clang-format on */
1388 stream_ptr += pvr_cmd_length(KMD_STREAM_VIEW_IDX);
1389
1390 /* Make sure that the pvr_frag_km_...() function is returning the correct
1391 * offset.
1392 */
1393 assert((uint8_t *)stream_ptr - (uint8_t *)state->fw_stream ==
1394 pvr_frag_km_stream_pds_eot_data_addr_offset(dev_info));
1395
1396 pvr_csb_pack (stream_ptr, CR_EVENT_PIXEL_PDS_DATA, value) {
1397 value.addr = PVR_DEV_ADDR(job->pds_pixel_event_data_offset);
1398 }
1399 stream_ptr += pvr_cmd_length(CR_EVENT_PIXEL_PDS_DATA);
1400
1401 if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support)) {
1402 pvr_finishme(
1403 "Emit isp_oclqry_stride when feature gpu_multicore_support is present");
1404 *stream_ptr = 0;
1405 stream_ptr++;
1406 }
1407
1408 if (PVR_HAS_FEATURE(dev_info, zls_subtile)) {
1409 pvr_csb_pack (stream_ptr, CR_ISP_ZLS_PIXELS, value) {
1410 if (job->has_depth_attachment) {
1411 if (job->ds.has_alignment_transfers) {
1412 value.x = job->ds.physical_extent.width - 1;
1413 value.y = job->ds.physical_extent.height - 1;
1414 } else {
1415 value.x = job->ds.stride - 1;
1416 value.y = job->ds.height - 1;
1417 }
1418 }
1419 }
1420 stream_ptr += pvr_cmd_length(CR_ISP_ZLS_PIXELS);
1421 }
1422
1423 /* zls_stride */
1424 *stream_ptr = job->has_depth_attachment ? job->ds.layer_size : 0;
1425 stream_ptr++;
1426
1427 /* sls_stride */
1428 *stream_ptr = job->has_stencil_attachment ? job->ds.layer_size : 0;
1429 stream_ptr++;
1430
1431 if (PVR_HAS_FEATURE(dev_info, gpu_multicore_support)) {
1432 pvr_finishme(
1433 "Emit execute_count when feature gpu_multicore_support is present");
1434 *stream_ptr = 0;
1435 stream_ptr++;
1436 }
1437
1438 state->fw_stream_len = (uint8_t *)stream_ptr - (uint8_t *)state->fw_stream;
1439 assert(state->fw_stream_len <= ARRAY_SIZE(state->fw_stream));
1440
1441 pvr_csb_pack ((uint64_t *)stream_len_ptr, KMD_STREAM_HDR, value) {
1442 value.length = state->fw_stream_len;
1443 }
1444 }
1445
1447
1448 static void
1449 pvr_frag_state_stream_ext_init(struct pvr_render_ctx *ctx,
1450 struct pvr_render_job *job,
1451 struct pvr_winsys_fragment_state *state)
1452 {
1453 const struct pvr_device_info *dev_info = &ctx->device->pdevice->dev_info;
1454
1455 uint32_t main_stream_len =
1456 pvr_csb_unpack((uint64_t *)state->fw_stream, KMD_STREAM_HDR).length;
1457 uint32_t *ext_stream_ptr =
1458 (uint32_t *)((uint8_t *)state->fw_stream + main_stream_len);
1459 uint32_t *header0_ptr;
1460
1461 header0_ptr = ext_stream_ptr;
1462 ext_stream_ptr += pvr_cmd_length(KMD_STREAM_EXTHDR_FRAG0);
1463
1464 pvr_csb_pack (header0_ptr, KMD_STREAM_EXTHDR_FRAG0, header0) {
1465 if (PVR_HAS_QUIRK(dev_info, 49927)) {
1466 header0.has_brn49927 = true;
1467
1468 /* The setup of CR_TPU must be identical to
1469 * pvr_geom_state_stream_ext_init().
1470 */
1471 pvr_csb_pack (ext_stream_ptr, CR_TPU, value) {
1472 value.tag_cem_4k_face_packing = true;
1473 }
1474 ext_stream_ptr += pvr_cmd_length(CR_TPU);
1475 }
1476 }
1477
1478 if ((*header0_ptr & PVRX(KMD_STREAM_EXTHDR_DATA_MASK)) != 0) {
1479 state->fw_stream_len =
1480 (uint8_t *)ext_stream_ptr - (uint8_t *)state->fw_stream;
1481 assert(state->fw_stream_len <= ARRAY_SIZE(state->fw_stream));
1482 }
1483 }
1484
1485 static void
1486 pvr_frag_state_flags_init(const struct pvr_render_job *const job,
1487 struct pvr_winsys_fragment_state_flags *flags)
1488 {
1489 *flags = (struct pvr_winsys_fragment_state_flags){
1490 .has_depth_buffer = job->has_depth_attachment,
1491 .has_stencil_buffer = job->has_stencil_attachment,
1492 .prevent_cdm_overlap = job->disable_compute_overlap,
1493 .use_single_core = job->frag_uses_atomic_ops,
1494 .get_vis_results = job->get_vis_results,
1495 .has_spm_scratch_buffer = job->requires_spm_scratch_buffer,
1496 };
1497 }
1498
1499 static void
1500 pvr_render_job_ws_fragment_state_init(struct pvr_render_ctx *ctx,
1501 struct pvr_render_job *job,
1502 struct vk_sync *wait,
1503 struct pvr_winsys_fragment_state *state)
1504 {
1505 pvr_frag_state_stream_init(ctx, job, state);
1506 pvr_frag_state_stream_ext_init(ctx, job, state);
1507
1508 state->wait = wait;
1509 pvr_frag_state_flags_init(job, &state->flags);
1510 }
1511
1512 /**
1513 * \brief Sets up the fragment state for a Partial Render (PR) based on the
1514 * state for a normal fragment job.
1515 *
1516 * The state of a fragment PR is almost the same as of that for a normal
1517 * fragment job apart the PBE words and the EOT program, both of which are
1518 * necessary for the render to use the SPM scratch buffer instead of the final
1519 * render targets.
1520 *
1521 * By basing the fragment PR state on that of a normal fragment state,
1522 * repacking of the same words can be avoided as we end up mostly doing copies
1523 * instead.
1524 */
1525 static void pvr_render_job_ws_fragment_pr_init_based_on_fragment_state(
1526 const struct pvr_render_ctx *ctx,
1527 struct pvr_render_job *job,
1528 struct vk_sync *wait,
1529 struct pvr_winsys_fragment_state *frag,
1530 struct pvr_winsys_fragment_state *state)
1531 {
1532 const struct pvr_device_info *const dev_info =
1533 &ctx->device->pdevice->dev_info;
1534 const uint32_t pbe_reg_byte_offset =
1535 pvr_frag_km_stream_pbe_reg_words_offset(dev_info);
1536 const uint32_t eot_data_addr_byte_offset =
1537 pvr_frag_km_stream_pds_eot_data_addr_offset(dev_info);
1538
1539 /* Massive copy :( */
1540 *state = *frag;
1541
1542 assert(state->fw_stream_len >=
1543 pbe_reg_byte_offset + sizeof(job->pr_pbe_reg_words));
1544 memcpy(&state->fw_stream[pbe_reg_byte_offset],
1545 job->pr_pbe_reg_words,
1546 sizeof(job->pr_pbe_reg_words));
1547
1548 /* TODO: Update this when csbgen is byte instead of dword granular. */
1549 assert(state->fw_stream_len >=
1550 eot_data_addr_byte_offset +
1551 PVR_DW_TO_BYTES(pvr_cmd_length(CR_EVENT_PIXEL_PDS_DATA)));
1552 pvr_csb_pack ((uint32_t *)&state->fw_stream[eot_data_addr_byte_offset],
1553 CR_EVENT_PIXEL_PDS_DATA,
1554 eot_pds_data) {
1555 eot_pds_data.addr = PVR_DEV_ADDR(job->pr_pds_pixel_event_data_offset);
1556 }
1557 }
1558
1559 static void pvr_render_job_ws_submit_info_init(
1560 struct pvr_render_ctx *ctx,
1561 struct pvr_render_job *job,
1562 struct vk_sync *wait_geom,
1563 struct vk_sync *wait_frag,
1564 struct pvr_winsys_render_submit_info *submit_info)
1565 {
1566 memset(submit_info, 0, sizeof(*submit_info));
1567
1568 submit_info->rt_dataset = job->rt_dataset->ws_rt_dataset;
1569 submit_info->rt_data_idx = job->rt_dataset->rt_data_idx;
1570
1571 submit_info->frame_num = ctx->device->global_queue_present_count;
1572 submit_info->job_num = ctx->device->global_cmd_buffer_submit_count;
1573
1574 pvr_render_job_ws_geometry_state_init(ctx,
1575 job,
1576 wait_geom,
1577 &submit_info->geometry);
1578
1579 submit_info->has_fragment_job = job->run_frag;
1580
1581 /* TODO: Move the job setup from queue submit into cmd_buffer if possible. */
1582
1583 /* TODO: See if it's worth avoiding setting up the fragment state and setup
1584 * the pr state directly if `!job->run_frag`. For now we'll always set it up.
1585 */
1586 pvr_render_job_ws_fragment_state_init(ctx,
1587 job,
1588 wait_frag,
1589 &submit_info->fragment);
1590
1591 /* TODO: In some cases we could eliminate the pr and use the frag directly in
1592 * case we enter SPM. There's likely some performance improvement to be had
1593 * there. For now we'll always setup the pr.
1594 */
1595 pvr_render_job_ws_fragment_pr_init_based_on_fragment_state(
1596 ctx,
1597 job,
1598 wait_frag,
1599 &submit_info->fragment,
1600 &submit_info->fragment_pr);
1601 }
1602
1603 VkResult pvr_render_job_submit(struct pvr_render_ctx *ctx,
1604 struct pvr_render_job *job,
1605 struct vk_sync *wait_geom,
1606 struct vk_sync *wait_frag,
1607 struct vk_sync *signal_sync_geom,
1608 struct vk_sync *signal_sync_frag)
1609 {
1610 struct pvr_rt_dataset *rt_dataset = job->rt_dataset;
1611 struct pvr_winsys_render_submit_info submit_info;
1612 struct pvr_device *device = ctx->device;
1613 VkResult result;
1614
1615 pvr_render_job_ws_submit_info_init(ctx,
1616 job,
1617 wait_geom,
1618 wait_frag,
1619 &submit_info);
1620
1621 if (PVR_IS_DEBUG_SET(DUMP_CONTROL_STREAM)) {
1622 /* FIXME: This isn't an ideal method of accessing the information we
1623 * need, but it's considered good enough for a debug code path. It can be
1624 * streamlined and made more correct if/when pvr_render_job becomes a
1625 * subclass of pvr_sub_cmd.
1626 */
1627 const struct pvr_sub_cmd *sub_cmd =
1628 container_of(job, const struct pvr_sub_cmd, gfx.job);
1629
1630 pvr_csb_dump(&sub_cmd->gfx.control_stream,
1631 submit_info.frame_num,
1632 submit_info.job_num);
1633 }
1634
1635 result = device->ws->ops->render_submit(ctx->ws_ctx,
1636 &submit_info,
1637 &device->pdevice->dev_info,
1638 signal_sync_geom,
1639 signal_sync_frag);
1640 if (result != VK_SUCCESS)
1641 return result;
1642
1643 if (job->run_frag) {
1644 /* Move to the next render target data now that a fragment job has been
1645 * successfully submitted. This will allow the next geometry job to be
1646 * submitted and run in parallel with it.
1647 */
1648 rt_dataset->rt_data_idx =
1649 (rt_dataset->rt_data_idx + 1) % ARRAY_SIZE(rt_dataset->rt_datas);
1650
1651 rt_dataset->need_frag = false;
1652 } else {
1653 rt_dataset->need_frag = true;
1654 }
1655
1656 return VK_SUCCESS;
1657 }
1658