1 /*
2 * Copyright © 2021 Raspberry Pi Ltd
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "v3dv_private.h"
25 #include "broadcom/common/v3d_macros.h"
26 #include "broadcom/common/v3d_util.h"
27 #include "broadcom/cle/v3dx_pack.h"
28 #include "broadcom/compiler/v3d_compiler.h"
29
30 #include "util/half_float.h"
31 #include "util/u_pack_color.h"
32 #include "vk_format.h"
33
34 void
v3dX(job_emit_binning_flush)35 v3dX(job_emit_binning_flush)(struct v3dv_job *job)
36 {
37 assert(job);
38
39 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(FLUSH));
40 v3dv_return_if_oom(NULL, job);
41
42 cl_emit(&job->bcl, FLUSH, flush);
43 }
44
45 void
v3dX(job_emit_enable_double_buffer)46 v3dX(job_emit_enable_double_buffer)(struct v3dv_job *job)
47 {
48 assert(job->can_use_double_buffer);
49 assert(job->frame_tiling.double_buffer);
50 assert(!job->frame_tiling.msaa);
51 assert(job->bcl_tile_binning_mode_ptr);
52
53 const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
54 struct cl_packet_struct(TILE_BINNING_MODE_CFG) config = {
55 cl_packet_header(TILE_BINNING_MODE_CFG),
56 };
57 config.width_in_pixels = tiling->width;
58 config.height_in_pixels = tiling->height;
59 #if V3D_VERSION == 42
60 config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
61 config.multisample_mode_4x = tiling->msaa;
62 config.double_buffer_in_non_ms_mode = tiling->double_buffer;
63 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
64 #endif
65 #if V3D_VERSION >= 71
66 unreachable("HW generation 71 not supported yet.");
67 #endif
68
69 uint8_t *rewrite_addr = (uint8_t *)job->bcl_tile_binning_mode_ptr;
70 cl_packet_pack(TILE_BINNING_MODE_CFG)(NULL, rewrite_addr, &config);
71 }
72
73 void
v3dX(job_emit_binning_prolog)74 v3dX(job_emit_binning_prolog)(struct v3dv_job *job,
75 const struct v3dv_frame_tiling *tiling,
76 uint32_t layers)
77 {
78 /* This must go before the binning mode configuration. It is
79 * required for layered framebuffers to work.
80 */
81 cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
82 config.number_of_layers = layers;
83 }
84
85 assert(!tiling->double_buffer || !tiling->msaa);
86 job->bcl_tile_binning_mode_ptr = cl_start(&job->bcl);
87 cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
88 config.width_in_pixels = tiling->width;
89 config.height_in_pixels = tiling->height;
90 #if V3D_VERSION == 42
91 config.number_of_render_targets = MAX2(tiling->render_target_count, 1);
92 config.multisample_mode_4x = tiling->msaa;
93 config.double_buffer_in_non_ms_mode = tiling->double_buffer;
94 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
95 #endif
96 #if V3D_VERSION >= 71
97 config.log2_tile_width = log2_tile_size(tiling->tile_width);
98 config.log2_tile_height = log2_tile_size(tiling->tile_height);
99 /* FIXME: ideally we would like next assert on the packet header (as is
100 * general, so also applies to GL). We would need to expand
101 * gen_pack_header for that.
102 */
103 assert(config.log2_tile_width == config.log2_tile_height ||
104 config.log2_tile_width == config.log2_tile_height + 1);
105 #endif
106 }
107
108 /* There's definitely nothing in the VCD cache we want. */
109 cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
110
111 /* "Binning mode lists must have a Start Tile Binning item (6) after
112 * any prefix state data before the binning list proper starts."
113 */
114 cl_emit(&job->bcl, START_TILE_BINNING, bin);
115 }
116
117 void
v3dX(cmd_buffer_end_render_pass_secondary)118 v3dX(cmd_buffer_end_render_pass_secondary)(struct v3dv_cmd_buffer *cmd_buffer)
119 {
120 assert(cmd_buffer->state.job);
121 v3dv_cl_ensure_space_with_branch(&cmd_buffer->state.job->bcl,
122 cl_packet_length(RETURN_FROM_SUB_LIST));
123 v3dv_return_if_oom(cmd_buffer, NULL);
124 cl_emit(&cmd_buffer->state.job->bcl, RETURN_FROM_SUB_LIST, ret);
125 }
126
127 void
v3dX(job_emit_clip_window)128 v3dX(job_emit_clip_window)(struct v3dv_job *job, const VkRect2D *rect)
129 {
130 assert(job);
131
132 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CLIP_WINDOW));
133 v3dv_return_if_oom(NULL, job);
134
135 cl_emit(&job->bcl, CLIP_WINDOW, clip) {
136 clip.clip_window_left_pixel_coordinate = rect->offset.x;
137 clip.clip_window_bottom_pixel_coordinate = rect->offset.y;
138 clip.clip_window_width_in_pixels = rect->extent.width;
139 clip.clip_window_height_in_pixels = rect->extent.height;
140 }
141 }
142
143 static void
cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_cl * cl,struct v3dv_image_view * iview,uint32_t layer,uint32_t buffer)144 cmd_buffer_render_pass_emit_load(struct v3dv_cmd_buffer *cmd_buffer,
145 struct v3dv_cl *cl,
146 struct v3dv_image_view *iview,
147 uint32_t layer,
148 uint32_t buffer)
149 {
150 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
151
152 /* We don't support rendering to ycbcr images, so the image view should be
153 * single-plane, and using a single-plane format. But note that the underlying
154 * image can be a ycbcr format, as we support rendering to a specific plane
155 * of an image. This is used for example on some meta_copy code paths, in
156 * order to copy from/to a plane of a ycbcr image.
157 */
158 assert(iview->plane_count == 1);
159 assert(iview->format->plane_count == 1);
160
161 uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
162 const struct v3d_resource_slice *slice =
163 &image->planes[image_plane].slices[iview->vk.base_mip_level];
164
165 uint32_t layer_offset =
166 v3dv_layer_offset(image, iview->vk.base_mip_level,
167 iview->vk.base_array_layer + layer, image_plane);
168
169 cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
170 load.buffer_to_load = buffer;
171 load.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
172
173 load.input_image_format = iview->format->planes[0].rt_type;
174
175 /* If we create an image view with only the stencil format, we
176 * re-interpret the format as RGBA8_UINT, as it is want we want in
177 * general (see CreateImageView).
178 *
179 * However, when we are loading/storing tiles from the ZSTENCIL tile
180 * buffer, we need to use the underlying DS format.
181 */
182 if (buffer == ZSTENCIL &&
183 iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
184 assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
185 load.input_image_format = image->format->planes[image_plane].rt_type;
186 }
187
188 load.r_b_swap = iview->planes[0].swap_rb;
189 load.channel_reverse = iview->planes[0].channel_reverse;
190 load.memory_format = slice->tiling;
191
192 if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
193 slice->tiling == V3D_TILING_UIF_XOR) {
194 load.height_in_ub_or_stride =
195 slice->padded_height_of_output_image_in_uif_blocks;
196 } else if (slice->tiling == V3D_TILING_RASTER) {
197 load.height_in_ub_or_stride = slice->stride;
198 }
199
200 if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
201 load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
202 else
203 load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
204 }
205 }
206
207 static inline uint32_t
v3dv_zs_buffer(bool depth,bool stencil)208 v3dv_zs_buffer(bool depth, bool stencil)
209 {
210 if (depth && stencil)
211 return ZSTENCIL;
212 else if (depth)
213 return Z;
214 else if (stencil)
215 return STENCIL;
216 return NONE;
217 }
218
219 static void
cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_cl * cl,uint32_t layer)220 cmd_buffer_render_pass_emit_loads(struct v3dv_cmd_buffer *cmd_buffer,
221 struct v3dv_cl *cl,
222 uint32_t layer)
223 {
224 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
225 const struct v3dv_render_pass *pass = state->pass;
226 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
227
228 assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT);
229
230 for (uint32_t i = 0; i < subpass->color_count; i++) {
231 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
232
233 if (attachment_idx == VK_ATTACHMENT_UNUSED)
234 continue;
235
236 const struct v3dv_render_pass_attachment *attachment =
237 &state->pass->attachments[attachment_idx];
238
239 /* According to the Vulkan spec:
240 *
241 * "The load operation for each sample in an attachment happens before
242 * any recorded command which accesses the sample in the first subpass
243 * where the attachment is used."
244 *
245 * If the load operation is CLEAR, we must only clear once on the first
246 * subpass that uses the attachment (and in that case we don't LOAD).
247 * After that, we always want to load so we don't lose any rendering done
248 * by a previous subpass to the same attachment. We also want to load
249 * if the current job is continuing subpass work started by a previous
250 * job, for the same reason.
251 *
252 * If the render area is not aligned to tile boundaries then we have
253 * tiles which are partially covered by it. In this case, we need to
254 * load the tiles so we can preserve the pixels that are outside the
255 * render area for any such tiles.
256 */
257 uint32_t first_subpass = !pass->multiview_enabled ?
258 attachment->first_subpass :
259 attachment->views[layer].first_subpass;
260
261 uint32_t last_subpass = !pass->multiview_enabled ?
262 attachment->last_subpass :
263 attachment->views[layer].last_subpass;
264
265 bool needs_load =
266 v3dv_cmd_buffer_check_needs_load(state,
267 VK_IMAGE_ASPECT_COLOR_BIT,
268 first_subpass,
269 attachment->desc.loadOp,
270 last_subpass,
271 attachment->desc.storeOp);
272 if (needs_load) {
273 struct v3dv_image_view *iview =
274 state->attachments[attachment_idx].image_view;
275 cmd_buffer_render_pass_emit_load(cmd_buffer, cl, iview,
276 layer, RENDER_TARGET_0 + i);
277 }
278 }
279
280 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
281 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
282 const struct v3dv_render_pass_attachment *ds_attachment =
283 &state->pass->attachments[ds_attachment_idx];
284
285 const VkImageAspectFlags ds_aspects =
286 vk_format_aspects(ds_attachment->desc.format);
287
288 uint32_t ds_first_subpass = !pass->multiview_enabled ?
289 ds_attachment->first_subpass :
290 ds_attachment->views[layer].first_subpass;
291
292 uint32_t ds_last_subpass = !pass->multiview_enabled ?
293 ds_attachment->last_subpass :
294 ds_attachment->views[layer].last_subpass;
295
296 const bool needs_depth_load =
297 v3dv_cmd_buffer_check_needs_load(state,
298 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
299 ds_first_subpass,
300 ds_attachment->desc.loadOp,
301 ds_last_subpass,
302 ds_attachment->desc.storeOp);
303
304 const bool needs_stencil_load =
305 v3dv_cmd_buffer_check_needs_load(state,
306 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
307 ds_first_subpass,
308 ds_attachment->desc.stencilLoadOp,
309 ds_last_subpass,
310 ds_attachment->desc.stencilStoreOp);
311
312 if (needs_depth_load || needs_stencil_load) {
313 struct v3dv_image_view *iview =
314 state->attachments[ds_attachment_idx].image_view;
315 /* From the Vulkan spec:
316 *
317 * "When an image view of a depth/stencil image is used as a
318 * depth/stencil framebuffer attachment, the aspectMask is ignored
319 * and both depth and stencil image subresources are used."
320 *
321 * So we ignore the aspects from the subresource range of the image
322 * view for the depth/stencil attachment, but we still need to restrict
323 * the to aspects compatible with the render pass and the image.
324 */
325 const uint32_t zs_buffer =
326 v3dv_zs_buffer(needs_depth_load, needs_stencil_load);
327 cmd_buffer_render_pass_emit_load(cmd_buffer, cl,
328 iview, layer, zs_buffer);
329 }
330 }
331
332 cl_emit(cl, END_OF_LOADS, end);
333 }
334
335 static void
cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_cl * cl,uint32_t attachment_idx,uint32_t layer,uint32_t buffer,bool clear,bool is_multisample_resolve)336 cmd_buffer_render_pass_emit_store(struct v3dv_cmd_buffer *cmd_buffer,
337 struct v3dv_cl *cl,
338 uint32_t attachment_idx,
339 uint32_t layer,
340 uint32_t buffer,
341 bool clear,
342 bool is_multisample_resolve)
343 {
344 const struct v3dv_image_view *iview =
345 cmd_buffer->state.attachments[attachment_idx].image_view;
346 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
347
348 /* We don't support rendering to ycbcr images, so the image view should be
349 * one-plane, and using a single-plane format. But note that the underlying
350 * image can be a ycbcr format, as we support rendering to a specific plane
351 * of an image. This is used for example on some meta_copy code paths, in
352 * order to copy from/to a plane of a ycbcr image.
353 */
354 assert(iview->plane_count == 1);
355 assert(iview->format->plane_count == 1);
356
357 uint8_t image_plane = v3dv_plane_from_aspect(iview->vk.aspects);
358 const struct v3d_resource_slice *slice =
359 &image->planes[image_plane].slices[iview->vk.base_mip_level];
360 uint32_t layer_offset = v3dv_layer_offset(image,
361 iview->vk.base_mip_level,
362 iview->vk.base_array_layer + layer,
363 image_plane);
364
365 /* The Clear Buffer bit is not supported for Z/Stencil stores in 7.x and it
366 * is broken in earlier V3D versions.
367 */
368 assert((buffer != Z && buffer != STENCIL && buffer != ZSTENCIL) || !clear);
369
370 cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
371 store.buffer_to_store = buffer;
372 store.address = v3dv_cl_address(image->planes[image_plane].mem->bo, layer_offset);
373 store.clear_buffer_being_stored = clear;
374
375 store.output_image_format = iview->format->planes[0].rt_type;
376
377 /* If we create an image view with only the stencil format, we
378 * re-interpret the format as RGBA8_UINT, as it is want we want in
379 * general (see CreateImageView).
380 *
381 * However, when we are loading/storing tiles from the ZSTENCIL tile
382 * buffer, we need to use the underlying DS format.
383 */
384 if (buffer == ZSTENCIL &&
385 iview->format->planes[0].rt_type == V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI) {
386 assert(image->format->planes[image_plane].rt_type == V3D_OUTPUT_IMAGE_FORMAT_D24S8);
387 store.output_image_format = image->format->planes[image_plane].rt_type;
388 }
389
390 store.r_b_swap = iview->planes[0].swap_rb;
391 store.channel_reverse = iview->planes[0].channel_reverse;
392 store.memory_format = slice->tiling;
393
394 if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
395 slice->tiling == V3D_TILING_UIF_XOR) {
396 store.height_in_ub_or_stride =
397 slice->padded_height_of_output_image_in_uif_blocks;
398 } else if (slice->tiling == V3D_TILING_RASTER) {
399 store.height_in_ub_or_stride = slice->stride;
400 }
401
402 if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
403 store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
404 else if (is_multisample_resolve)
405 store.decimate_mode = V3D_DECIMATE_MODE_4X;
406 else
407 store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
408 }
409 }
410
411 static bool
check_needs_clear(const struct v3dv_cmd_buffer_state * state,VkImageAspectFlags aspect,uint32_t first_subpass_idx,VkAttachmentLoadOp load_op,bool do_clear_with_draw)412 check_needs_clear(const struct v3dv_cmd_buffer_state *state,
413 VkImageAspectFlags aspect,
414 uint32_t first_subpass_idx,
415 VkAttachmentLoadOp load_op,
416 bool do_clear_with_draw)
417 {
418 /* We call this with image->vk.aspects & aspect, so 0 means the aspect we are
419 * testing does not exist in the image.
420 */
421 if (!aspect)
422 return false;
423
424 /* If the aspect needs to be cleared with a draw call then we won't emit
425 * the clear here.
426 */
427 if (do_clear_with_draw)
428 return false;
429
430 /* If this is resuming a subpass started with another job, then attachment
431 * load operations don't apply.
432 */
433 if (state->job->is_subpass_continue)
434 return false;
435
436 /* If the render area is not aligned to tile boundaries we can't use the
437 * TLB for a clear.
438 */
439 if (!state->tile_aligned_render_area)
440 return false;
441
442 /* If this job is running in a subpass other than the first subpass in
443 * which this attachment (or view) is used then attachment load operations
444 * don't apply.
445 */
446 if (state->job->first_subpass != first_subpass_idx)
447 return false;
448
449 /* The attachment load operation must be CLEAR */
450 return load_op == VK_ATTACHMENT_LOAD_OP_CLEAR;
451 }
452
453 static void
cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer * cmd_buffer,struct v3dv_cl * cl,uint32_t layer)454 cmd_buffer_render_pass_emit_stores(struct v3dv_cmd_buffer *cmd_buffer,
455 struct v3dv_cl *cl,
456 uint32_t layer)
457 {
458 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
459 struct v3dv_render_pass *pass = state->pass;
460 const struct v3dv_subpass *subpass =
461 &pass->subpasses[state->subpass_idx];
462
463 bool has_stores = false;
464 bool use_global_zs_clear = false;
465 bool use_global_rt_clear = false;
466
467 assert(!pass->multiview_enabled || layer < MAX_MULTIVIEW_VIEW_COUNT);
468
469 /* FIXME: separate stencil */
470 uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
471 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
472 const struct v3dv_render_pass_attachment *ds_attachment =
473 &state->pass->attachments[ds_attachment_idx];
474
475 assert(state->job->first_subpass >= ds_attachment->first_subpass);
476 assert(state->subpass_idx >= ds_attachment->first_subpass);
477 assert(state->subpass_idx <= ds_attachment->last_subpass);
478
479 /* From the Vulkan spec, VkImageSubresourceRange:
480 *
481 * "When an image view of a depth/stencil image is used as a
482 * depth/stencil framebuffer attachment, the aspectMask is ignored
483 * and both depth and stencil image subresources are used."
484 *
485 * So we ignore the aspects from the subresource range of the image
486 * view for the depth/stencil attachment, but we still need to restrict
487 * the to aspects compatible with the render pass and the image.
488 */
489 const VkImageAspectFlags aspects =
490 vk_format_aspects(ds_attachment->desc.format);
491
492 #if V3D_VERSION <= 42
493 /* GFXH-1689: The per-buffer store command's clear buffer bit is broken
494 * for depth/stencil.
495 *
496 * There used to be some confusion regarding the Clear Tile Buffers
497 * Z/S bit also being broken, but we confirmed with Broadcom that this
498 * is not the case, it was just that some other hardware bugs (that we
499 * need to work around, such as GFXH-1461) could cause this bit to behave
500 * incorrectly.
501 *
502 * There used to be another issue where the RTs bit in the Clear Tile
503 * Buffers packet also cleared Z/S, but Broadcom confirmed this is
504 * fixed since V3D 4.1.
505 *
506 * So if we have to emit a clear of depth or stencil we don't use
507 * the per-buffer store clear bit, even if we need to store the buffers,
508 * instead we always have to use the Clear Tile Buffers Z/S bit.
509 * If we have configured the job to do early Z/S clearing, then we
510 * don't want to emit any Clear Tile Buffers command at all here.
511 *
512 * Note that GFXH-1689 is not reproduced in the simulator, where
513 * using the clear buffer bit in depth/stencil stores works fine.
514 */
515
516 /* Only clear once on the first subpass that uses the attachment */
517 uint32_t ds_first_subpass = !state->pass->multiview_enabled ?
518 ds_attachment->first_subpass :
519 ds_attachment->views[layer].first_subpass;
520
521 bool needs_depth_clear =
522 check_needs_clear(state,
523 aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
524 ds_first_subpass,
525 ds_attachment->desc.loadOp,
526 subpass->do_depth_clear_with_draw);
527
528 bool needs_stencil_clear =
529 check_needs_clear(state,
530 aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
531 ds_first_subpass,
532 ds_attachment->desc.stencilLoadOp,
533 subpass->do_stencil_clear_with_draw);
534
535 use_global_zs_clear = !state->job->early_zs_clear &&
536 (needs_depth_clear || needs_stencil_clear);
537 #endif
538 #if V3D_VERSION >= 71
539 /* The store command's clear buffer bit cannot be used for Z/S stencil:
540 * since V3D 4.5.6 Z/S buffers are automatically cleared between tiles,
541 * so we don't want to emit redundant clears here.
542 */
543 use_global_zs_clear = false;
544 #endif
545
546 /* Skip the last store if it is not required */
547 uint32_t ds_last_subpass = !pass->multiview_enabled ?
548 ds_attachment->last_subpass :
549 ds_attachment->views[layer].last_subpass;
550
551 bool needs_depth_store =
552 v3dv_cmd_buffer_check_needs_store(state,
553 aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
554 ds_last_subpass,
555 ds_attachment->desc.storeOp);
556
557 bool needs_stencil_store =
558 v3dv_cmd_buffer_check_needs_store(state,
559 aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
560 ds_last_subpass,
561 ds_attachment->desc.stencilStoreOp);
562
563 /* If we have a resolve, handle it before storing the tile */
564 const struct v3dv_cmd_buffer_attachment_state *ds_att_state =
565 &state->attachments[ds_attachment_idx];
566 if (ds_att_state->use_tlb_resolve) {
567 assert(ds_att_state->has_resolve);
568 assert(subpass->resolve_depth || subpass->resolve_stencil);
569 const uint32_t resolve_attachment_idx =
570 subpass->ds_resolve_attachment.attachment;
571 assert(resolve_attachment_idx != VK_ATTACHMENT_UNUSED);
572
573 const uint32_t zs_buffer =
574 v3dv_zs_buffer(subpass->resolve_depth, subpass->resolve_stencil);
575 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
576 resolve_attachment_idx, layer,
577 zs_buffer,
578 false, false);
579 has_stores = true;
580 } else if (ds_att_state->has_resolve) {
581 /* If we can't use the TLB to implement the resolve we will need to
582 * store the attachment so we can implement it later using a blit.
583 */
584 needs_depth_store = subpass->resolve_depth;
585 needs_stencil_store = subpass->resolve_stencil;
586 }
587
588 if (needs_depth_store || needs_stencil_store) {
589 const uint32_t zs_buffer =
590 v3dv_zs_buffer(needs_depth_store, needs_stencil_store);
591 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
592 ds_attachment_idx, layer,
593 zs_buffer, false, false);
594 has_stores = true;
595 }
596 }
597
598 for (uint32_t i = 0; i < subpass->color_count; i++) {
599 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
600
601 if (attachment_idx == VK_ATTACHMENT_UNUSED)
602 continue;
603
604 const struct v3dv_render_pass_attachment *attachment =
605 &state->pass->attachments[attachment_idx];
606
607 assert(state->job->first_subpass >= attachment->first_subpass);
608 assert(state->subpass_idx >= attachment->first_subpass);
609 assert(state->subpass_idx <= attachment->last_subpass);
610
611 /* Only clear once on the first subpass that uses the attachment */
612 uint32_t first_subpass = !pass->multiview_enabled ?
613 attachment->first_subpass :
614 attachment->views[layer].first_subpass;
615
616 bool needs_clear =
617 check_needs_clear(state,
618 VK_IMAGE_ASPECT_COLOR_BIT,
619 first_subpass,
620 attachment->desc.loadOp,
621 false);
622
623 /* Skip the last store if it is not required */
624 uint32_t last_subpass = !pass->multiview_enabled ?
625 attachment->last_subpass :
626 attachment->views[layer].last_subpass;
627
628 bool needs_store =
629 v3dv_cmd_buffer_check_needs_store(state,
630 VK_IMAGE_ASPECT_COLOR_BIT,
631 last_subpass,
632 attachment->desc.storeOp);
633
634 /* If we need to resolve this attachment emit that store first. Notice
635 * that we must not request a tile buffer clear here in that case, since
636 * that would clear the tile buffer before we get to emit the actual
637 * color attachment store below, since the clear happens after the
638 * store is completed.
639 *
640 * If the attachment doesn't support TLB resolves (or the render area
641 * is not aligned to tile boundaries) then we will have to fallback to
642 * doing the resolve in a shader separately after this job, so we will
643 * need to store the multisampled attachment even if that wasn't
644 * requested by the client.
645 */
646 const struct v3dv_cmd_buffer_attachment_state *att_state =
647 &state->attachments[attachment_idx];
648 if (att_state->use_tlb_resolve) {
649 assert(att_state->has_resolve);
650 const uint32_t resolve_attachment_idx =
651 subpass->resolve_attachments[i].attachment;
652 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
653 resolve_attachment_idx, layer,
654 RENDER_TARGET_0 + i,
655 false, true);
656 has_stores = true;
657 } else if (att_state->has_resolve) {
658 needs_store = true;
659 }
660
661 /* Emit the color attachment store if needed */
662 if (needs_store) {
663 cmd_buffer_render_pass_emit_store(cmd_buffer, cl,
664 attachment_idx, layer,
665 RENDER_TARGET_0 + i,
666 needs_clear && !use_global_rt_clear,
667 false);
668 has_stores = true;
669 } else if (needs_clear) {
670 use_global_rt_clear = true;
671 }
672 }
673
674 /* We always need to emit at least one dummy store */
675 if (!has_stores) {
676 cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
677 store.buffer_to_store = NONE;
678 }
679 }
680
681 /* If we have any depth/stencil clears we can't use the per-buffer clear
682 * bit and instead we have to emit a single clear of all tile buffers.
683 */
684 if (use_global_zs_clear || use_global_rt_clear) {
685 #if V3D_VERSION == 42
686 cl_emit(cl, CLEAR_TILE_BUFFERS, clear) {
687 clear.clear_z_stencil_buffer = use_global_zs_clear;
688 clear.clear_all_render_targets = use_global_rt_clear;
689 }
690 #endif
691 #if V3D_VERSION >= 71
692 cl_emit(cl, CLEAR_RENDER_TARGETS, clear);
693 #endif
694 }
695 }
696
697 static void
cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer * cmd_buffer,uint32_t layer)698 cmd_buffer_render_pass_emit_per_tile_rcl(struct v3dv_cmd_buffer *cmd_buffer,
699 uint32_t layer)
700 {
701 struct v3dv_job *job = cmd_buffer->state.job;
702 assert(job);
703
704 /* Emit the generic list in our indirect state -- the rcl will just
705 * have pointers into it.
706 */
707 struct v3dv_cl *cl = &job->indirect;
708 v3dv_cl_ensure_space(cl, 200, 1);
709 v3dv_return_if_oom(cmd_buffer, NULL);
710
711 struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);
712
713 cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);
714
715 cmd_buffer_render_pass_emit_loads(cmd_buffer, cl, layer);
716
717 /* The binner starts out writing tiles assuming that the initial mode
718 * is triangles, so make sure that's the case.
719 */
720 cl_emit(cl, PRIM_LIST_FORMAT, fmt) {
721 fmt.primitive_type = LIST_TRIANGLES;
722 }
723
724 /* PTB assumes that value to be 0, but hw will not set it. */
725 cl_emit(cl, SET_INSTANCEID, set) {
726 set.instance_id = 0;
727 }
728
729 cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
730
731 cmd_buffer_render_pass_emit_stores(cmd_buffer, cl, layer);
732
733 cl_emit(cl, END_OF_TILE_MARKER, end);
734
735 cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
736
737 cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
738 branch.start = tile_list_start;
739 branch.end = v3dv_cl_get_address(cl);
740 }
741 }
742
743 static void
cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer * cmd_buffer,uint32_t layer)744 cmd_buffer_emit_render_pass_layer_rcl(struct v3dv_cmd_buffer *cmd_buffer,
745 uint32_t layer)
746 {
747 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
748
749 struct v3dv_job *job = cmd_buffer->state.job;
750 struct v3dv_cl *rcl = &job->rcl;
751
752 /* If doing multicore binning, we would need to initialize each
753 * core's tile list here.
754 */
755 const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
756 const uint32_t tile_alloc_offset =
757 64 * layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
758 cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
759 list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
760 }
761
762 cmd_buffer_render_pass_emit_per_tile_rcl(cmd_buffer, layer);
763
764 uint32_t supertile_w_in_pixels =
765 tiling->tile_width * tiling->supertile_width;
766 uint32_t supertile_h_in_pixels =
767 tiling->tile_height * tiling->supertile_height;
768 const uint32_t min_x_supertile =
769 state->render_area.offset.x / supertile_w_in_pixels;
770 const uint32_t min_y_supertile =
771 state->render_area.offset.y / supertile_h_in_pixels;
772
773 uint32_t max_render_x = state->render_area.offset.x;
774 if (state->render_area.extent.width > 0)
775 max_render_x += state->render_area.extent.width - 1;
776 uint32_t max_render_y = state->render_area.offset.y;
777 if (state->render_area.extent.height > 0)
778 max_render_y += state->render_area.extent.height - 1;
779 const uint32_t max_x_supertile = max_render_x / supertile_w_in_pixels;
780 const uint32_t max_y_supertile = max_render_y / supertile_h_in_pixels;
781
782 for (int y = min_y_supertile; y <= max_y_supertile; y++) {
783 for (int x = min_x_supertile; x <= max_x_supertile; x++) {
784 cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
785 coords.column_number_in_supertiles = x;
786 coords.row_number_in_supertiles = y;
787 }
788 }
789 }
790 }
791
792 static void
set_rcl_early_z_config(struct v3dv_job * job,bool * early_z_disable,uint32_t * early_z_test_and_update_direction)793 set_rcl_early_z_config(struct v3dv_job *job,
794 bool *early_z_disable,
795 uint32_t *early_z_test_and_update_direction)
796 {
797 /* Disable if none of the draw calls in this job enabled EZ */
798 if (!job->has_ez_draws) {
799 *early_z_disable = true;
800 return;
801 }
802
803 switch (job->first_ez_state) {
804 case V3D_EZ_UNDECIDED:
805 case V3D_EZ_LT_LE:
806 *early_z_disable = false;
807 *early_z_test_and_update_direction = EARLY_Z_DIRECTION_LT_LE;
808 break;
809 case V3D_EZ_GT_GE:
810 *early_z_disable = false;
811 *early_z_test_and_update_direction = EARLY_Z_DIRECTION_GT_GE;
812 break;
813 case V3D_EZ_DISABLED:
814 *early_z_disable = true;
815 break;
816 }
817 }
818
819 /* Note that for v71, render target cfg packets has just one field that
820 * combined the internal type and clamp mode. For simplicity we keep just one
821 * helper.
822 *
823 * Note: rt_type is in fact a "enum V3DX(Internal_Type)".
824 *
825 * FIXME: for v71 we are not returning all the possible combinations for
826 * render target internal type and clamp. For example for int types we are
827 * always using clamp int, and for 16f we are using clamp none or pos (that
828 * seems to be the equivalent for no-clamp on 4.2), but not pq or hlg. In
829 * summary right now we are just porting what we were doing on 4.2
830 */
831 uint32_t
v3dX(clamp_for_format_and_type)832 v3dX(clamp_for_format_and_type)(uint32_t rt_type,
833 VkFormat vk_format)
834 {
835 #if V3D_VERSION == 42
836 if (vk_format_is_int(vk_format))
837 return V3D_RENDER_TARGET_CLAMP_INT;
838 else if (vk_format_is_srgb(vk_format))
839 return V3D_RENDER_TARGET_CLAMP_NORM;
840 else
841 return V3D_RENDER_TARGET_CLAMP_NONE;
842 #endif
843 #if V3D_VERSION >= 71
844 switch (rt_type) {
845 case V3D_INTERNAL_TYPE_8I:
846 return V3D_RENDER_TARGET_TYPE_CLAMP_8I_CLAMPED;
847 case V3D_INTERNAL_TYPE_8UI:
848 return V3D_RENDER_TARGET_TYPE_CLAMP_8UI_CLAMPED;
849 case V3D_INTERNAL_TYPE_8:
850 return V3D_RENDER_TARGET_TYPE_CLAMP_8;
851 case V3D_INTERNAL_TYPE_16I:
852 return V3D_RENDER_TARGET_TYPE_CLAMP_16I_CLAMPED;
853 case V3D_INTERNAL_TYPE_16UI:
854 return V3D_RENDER_TARGET_TYPE_CLAMP_16UI_CLAMPED;
855 case V3D_INTERNAL_TYPE_16F:
856 return vk_format_is_srgb(vk_format) ?
857 V3D_RENDER_TARGET_TYPE_CLAMP_16F_CLAMP_NORM :
858 V3D_RENDER_TARGET_TYPE_CLAMP_16F;
859 case V3D_INTERNAL_TYPE_32I:
860 return V3D_RENDER_TARGET_TYPE_CLAMP_32I_CLAMPED;
861 case V3D_INTERNAL_TYPE_32UI:
862 return V3D_RENDER_TARGET_TYPE_CLAMP_32UI_CLAMPED;
863 case V3D_INTERNAL_TYPE_32F:
864 return V3D_RENDER_TARGET_TYPE_CLAMP_32F;
865 default:
866 unreachable("Unknown internal render target type");
867 }
868
869 return V3D_RENDER_TARGET_TYPE_CLAMP_INVALID;
870 #endif
871 }
872
873 static void
cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer * cmd_buffer,int rt,uint32_t * rt_bpp,uint32_t * rt_type,uint32_t * rt_clamp)874 cmd_buffer_render_pass_setup_render_target(struct v3dv_cmd_buffer *cmd_buffer,
875 int rt,
876 uint32_t *rt_bpp,
877 #if V3D_VERSION == 42
878 uint32_t *rt_type,
879 uint32_t *rt_clamp)
880 #else
881 uint32_t *rt_type_clamp)
882 #endif
883 {
884 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
885
886 assert(state->subpass_idx < state->pass->subpass_count);
887 const struct v3dv_subpass *subpass =
888 &state->pass->subpasses[state->subpass_idx];
889
890 if (rt >= subpass->color_count)
891 return;
892
893 struct v3dv_subpass_attachment *attachment = &subpass->color_attachments[rt];
894 const uint32_t attachment_idx = attachment->attachment;
895 if (attachment_idx == VK_ATTACHMENT_UNUSED)
896 return;
897
898 assert(attachment_idx < state->framebuffer->attachment_count &&
899 attachment_idx < state->attachment_alloc_count);
900 struct v3dv_image_view *iview = state->attachments[attachment_idx].image_view;
901 assert(vk_format_is_color(iview->vk.format));
902
903 assert(iview->plane_count == 1);
904 *rt_bpp = iview->planes[0].internal_bpp;
905 #if V3D_VERSION == 42
906 *rt_type = iview->planes[0].internal_type;
907 *rt_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
908 iview->vk.format);
909 #endif
910 #if V3D_VERSION >= 71
911 *rt_type_clamp = v3dX(clamp_for_format_and_type)(iview->planes[0].internal_type,
912 iview->vk.format);
913 #endif
914 }
915
916 void
v3dX(cmd_buffer_emit_render_pass_rcl)917 v3dX(cmd_buffer_emit_render_pass_rcl)(struct v3dv_cmd_buffer *cmd_buffer)
918 {
919 struct v3dv_job *job = cmd_buffer->state.job;
920 assert(job);
921
922 const struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
923 const struct v3dv_framebuffer *framebuffer = state->framebuffer;
924
925 /* We can't emit the RCL until we have a framebuffer, which we may not have
926 * if we are recording a secondary command buffer. In that case, we will
927 * have to wait until vkCmdExecuteCommands is called from a primary command
928 * buffer.
929 */
930 if (!framebuffer) {
931 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
932 return;
933 }
934
935 const struct v3dv_frame_tiling *tiling = &job->frame_tiling;
936
937 const uint32_t fb_layers = job->frame_tiling.layers;
938
939 v3dv_cl_ensure_space_with_branch(&job->rcl, 200 +
940 MAX2(fb_layers, 1) * 256 *
941 cl_packet_length(SUPERTILE_COORDINATES));
942 v3dv_return_if_oom(cmd_buffer, NULL);
943
944 assert(state->subpass_idx < state->pass->subpass_count);
945 const struct v3dv_render_pass *pass = state->pass;
946 const struct v3dv_subpass *subpass = &pass->subpasses[state->subpass_idx];
947 struct v3dv_cl *rcl = &job->rcl;
948
949 /* Common config must be the first TILE_RENDERING_MODE_CFG and
950 * Z_STENCIL_CLEAR_VALUES must be last. The ones in between are optional
951 * updates to the previous HW state.
952 */
953 bool do_early_zs_clear = false;
954 const uint32_t ds_attachment_idx = subpass->ds_attachment.attachment;
955 assert(!tiling->msaa || !tiling->double_buffer);
956 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
957 config.image_width_pixels = framebuffer->width;
958 config.image_height_pixels = framebuffer->height;
959 config.number_of_render_targets = MAX2(subpass->color_count, 1);
960 config.multisample_mode_4x = tiling->msaa;
961 config.double_buffer_in_non_ms_mode = tiling->double_buffer;
962 #if V3D_VERSION == 42
963 config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
964 #endif
965 #if V3D_VERSION >= 71
966 config.log2_tile_width = log2_tile_size(tiling->tile_width);
967 config.log2_tile_height = log2_tile_size(tiling->tile_height);
968 /* FIXME: ideallly we would like next assert on the packet header (as is
969 * general, so also applies to GL). We would need to expand
970 * gen_pack_header for that.
971 */
972 assert(config.log2_tile_width == config.log2_tile_height ||
973 config.log2_tile_width == config.log2_tile_height + 1);
974 #endif
975
976 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
977 const struct v3dv_image_view *iview =
978 state->attachments[ds_attachment_idx].image_view;
979
980 /* At this point the image view should be single-plane. But note that
981 * the underlying image can be multi-plane, and the image view refer
982 * to one specific plane.
983 */
984 assert(iview->plane_count == 1);
985 assert(iview->format->plane_count == 1);
986 config.internal_depth_type = iview->planes[0].internal_type;
987
988 set_rcl_early_z_config(job,
989 &config.early_z_disable,
990 &config.early_z_test_and_update_direction);
991
992 /* Early-Z/S clear can be enabled if the job is clearing and not
993 * storing (or loading) depth. If a stencil aspect is also present
994 * we have the same requirements for it, however, in this case we
995 * can accept stencil loadOp DONT_CARE as well, so instead of
996 * checking that stencil is cleared we check that is not loaded.
997 *
998 * Early-Z/S clearing is independent of Early Z/S testing, so it is
999 * possible to enable one but not the other so long as their
1000 * respective requirements are met.
1001 *
1002 * From V3D 4.5.6, Z/S buffers are always cleared automatically
1003 * between tiles, but we still want to enable early ZS clears
1004 * when Z/S are not loaded or stored.
1005 */
1006 struct v3dv_render_pass_attachment *ds_attachment =
1007 &pass->attachments[ds_attachment_idx];
1008
1009 const VkImageAspectFlags ds_aspects =
1010 vk_format_aspects(ds_attachment->desc.format);
1011
1012 bool needs_depth_store =
1013 v3dv_cmd_buffer_check_needs_store(state,
1014 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1015 ds_attachment->last_subpass,
1016 ds_attachment->desc.storeOp) ||
1017 subpass->resolve_depth;
1018 #if V3D_VERSION <= 42
1019 bool needs_depth_clear =
1020 check_needs_clear(state,
1021 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1022 ds_attachment->first_subpass,
1023 ds_attachment->desc.loadOp,
1024 subpass->do_depth_clear_with_draw);
1025
1026 do_early_zs_clear = needs_depth_clear && !needs_depth_store;
1027 #endif
1028 #if V3D_VERSION >= 71
1029 bool needs_depth_load =
1030 v3dv_cmd_buffer_check_needs_load(state,
1031 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1032 ds_attachment->first_subpass,
1033 ds_attachment->desc.loadOp,
1034 ds_attachment->last_subpass,
1035 ds_attachment->desc.storeOp);
1036 do_early_zs_clear = !needs_depth_load && !needs_depth_store;
1037 #endif
1038
1039 if (do_early_zs_clear &&
1040 vk_format_has_stencil(ds_attachment->desc.format)) {
1041 bool needs_stencil_load =
1042 v3dv_cmd_buffer_check_needs_load(state,
1043 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1044 ds_attachment->first_subpass,
1045 ds_attachment->desc.stencilLoadOp,
1046 ds_attachment->last_subpass,
1047 ds_attachment->desc.stencilStoreOp);
1048
1049 bool needs_stencil_store =
1050 v3dv_cmd_buffer_check_needs_store(state,
1051 ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT,
1052 ds_attachment->last_subpass,
1053 ds_attachment->desc.stencilStoreOp) ||
1054 subpass->resolve_stencil;
1055
1056 do_early_zs_clear = !needs_stencil_load && !needs_stencil_store;
1057 }
1058
1059 config.early_depth_stencil_clear = do_early_zs_clear;
1060 } else {
1061 config.early_z_disable = true;
1062 }
1063 }
1064
1065 /* If we enabled early Z/S clear, then we can't emit any "Clear Tile Buffers"
1066 * commands with the Z/S bit set, so keep track of whether we enabled this
1067 * in the job so we can skip these later.
1068 */
1069 job->early_zs_clear = do_early_zs_clear;
1070
1071 #if V3D_VERSION >= 71
1072 uint32_t base_addr = 0;
1073 #endif
1074 for (uint32_t i = 0; i < subpass->color_count; i++) {
1075 uint32_t attachment_idx = subpass->color_attachments[i].attachment;
1076 if (attachment_idx == VK_ATTACHMENT_UNUSED) {
1077 #if V3D_VERSION >= 71
1078 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
1079 rt.render_target_number = i;
1080 rt.stride = 1; /* Unused */
1081 }
1082 #endif
1083 continue;
1084 }
1085
1086 struct v3dv_image_view *iview =
1087 state->attachments[attachment_idx].image_view;
1088 assert(iview->plane_count == 1);
1089
1090 const struct v3dv_image *image = (struct v3dv_image *) iview->vk.image;
1091
1092 uint8_t plane = v3dv_plane_from_aspect(iview->vk.aspects);
1093 const struct v3d_resource_slice *slice =
1094 &image->planes[plane].slices[iview->vk.base_mip_level];
1095
1096 UNUSED const uint32_t *clear_color =
1097 &state->attachments[attachment_idx].clear_value.color[0];
1098
1099 UNUSED uint32_t clear_pad = 0;
1100 if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
1101 slice->tiling == V3D_TILING_UIF_XOR) {
1102 int uif_block_height = v3d_utile_height(image->planes[plane].cpp) * 2;
1103
1104 uint32_t implicit_padded_height =
1105 align(framebuffer->height, uif_block_height) / uif_block_height;
1106
1107 if (slice->padded_height_of_output_image_in_uif_blocks -
1108 implicit_padded_height >= 15) {
1109 clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
1110 }
1111 }
1112
1113 #if V3D_VERSION == 42
1114 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
1115 clear.clear_color_low_32_bits = clear_color[0];
1116 clear.clear_color_next_24_bits = clear_color[1] & 0xffffff;
1117 clear.render_target_number = i;
1118 };
1119
1120 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
1121 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
1122 clear.clear_color_mid_low_32_bits =
1123 ((clear_color[1] >> 24) | (clear_color[2] << 8));
1124 clear.clear_color_mid_high_24_bits =
1125 ((clear_color[2] >> 24) | ((clear_color[3] & 0xffff) << 8));
1126 clear.render_target_number = i;
1127 };
1128 }
1129
1130 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
1131 cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
1132 clear.uif_padded_height_in_uif_blocks = clear_pad;
1133 clear.clear_color_high_16_bits = clear_color[3] >> 16;
1134 clear.render_target_number = i;
1135 };
1136 }
1137 #endif
1138
1139 #if V3D_VERSION >= 71
1140 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
1141 rt.clear_color_low_bits = clear_color[0];
1142 cmd_buffer_render_pass_setup_render_target(cmd_buffer, i, &rt.internal_bpp,
1143 &rt.internal_type_and_clamping);
1144 rt.stride =
1145 v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
1146 v3d_internal_bpp_words(rt.internal_bpp));
1147 rt.base_address = base_addr;
1148 rt.render_target_number = i;
1149
1150 /* base_addr in multiples of 512 bits. We divide by 8 because stride
1151 * is in 128-bit units, but it is packing 2 rows worth of data, so we
1152 * need to divide it by 2 so it is only 1 row, and then again by 4 so
1153 * it is in 512-bit units.
1154 */
1155 base_addr += (tiling->tile_height * rt.stride) / 8;
1156 }
1157
1158 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_64) {
1159 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
1160 rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
1161 ((uint64_t) clear_color[1]) |
1162 (((uint64_t) (clear_color[2] & 0xff)) << 32);
1163 rt.render_target_number = i;
1164 }
1165 }
1166
1167 if (iview->planes[0].internal_bpp >= V3D_INTERNAL_BPP_128) {
1168 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
1169 rt.clear_color_top_bits = /* 56 bits (24 + 32) */
1170 (((uint64_t) (clear_color[2] & 0xffffff00)) >> 8) |
1171 (((uint64_t) (clear_color[3])) << 24);
1172 rt.render_target_number = i;
1173 }
1174 }
1175 #endif
1176 }
1177
1178 #if V3D_VERSION >= 71
1179 /* If we don't have any color RTs, we still need to emit one and flag
1180 * it as not used using stride = 1.
1181 */
1182 if (subpass->color_count == 0) {
1183 cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
1184 rt.stride = 1;
1185 }
1186 }
1187 #endif
1188
1189 #if V3D_VERSION == 42
1190 cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
1191 cmd_buffer_render_pass_setup_render_target
1192 (cmd_buffer, 0, &rt.render_target_0_internal_bpp,
1193 &rt.render_target_0_internal_type, &rt.render_target_0_clamp);
1194 cmd_buffer_render_pass_setup_render_target
1195 (cmd_buffer, 1, &rt.render_target_1_internal_bpp,
1196 &rt.render_target_1_internal_type, &rt.render_target_1_clamp);
1197 cmd_buffer_render_pass_setup_render_target
1198 (cmd_buffer, 2, &rt.render_target_2_internal_bpp,
1199 &rt.render_target_2_internal_type, &rt.render_target_2_clamp);
1200 cmd_buffer_render_pass_setup_render_target
1201 (cmd_buffer, 3, &rt.render_target_3_internal_bpp,
1202 &rt.render_target_3_internal_type, &rt.render_target_3_clamp);
1203 }
1204 #endif
1205
1206 /* Ends rendering mode config. */
1207 if (ds_attachment_idx != VK_ATTACHMENT_UNUSED) {
1208 cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
1209 clear.z_clear_value =
1210 state->attachments[ds_attachment_idx].clear_value.z;
1211 clear.stencil_clear_value =
1212 state->attachments[ds_attachment_idx].clear_value.s;
1213 };
1214 } else {
1215 cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
1216 clear.z_clear_value = 1.0f;
1217 clear.stencil_clear_value = 0;
1218 };
1219 }
1220
1221 /* Always set initial block size before the first branch, which needs
1222 * to match the value from binning mode config.
1223 */
1224 cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
1225 init.use_auto_chained_tile_lists = true;
1226 init.size_of_first_block_in_chained_tile_lists =
1227 TILE_ALLOCATION_BLOCK_SIZE_64B;
1228 }
1229
1230 cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
1231 config.number_of_bin_tile_lists = 1;
1232 config.total_frame_width_in_tiles = tiling->draw_tiles_x;
1233 config.total_frame_height_in_tiles = tiling->draw_tiles_y;
1234
1235 config.supertile_width_in_tiles = tiling->supertile_width;
1236 config.supertile_height_in_tiles = tiling->supertile_height;
1237
1238 config.total_frame_width_in_supertiles =
1239 tiling->frame_width_in_supertiles;
1240 config.total_frame_height_in_supertiles =
1241 tiling->frame_height_in_supertiles;
1242 }
1243
1244 /* Emit an initial clear of the tile buffers. This is necessary
1245 * for any buffers that should be cleared (since clearing
1246 * normally happens at the *end* of the generic tile list), but
1247 * it's also nice to clear everything so the first tile doesn't
1248 * inherit any contents from some previous frame.
1249 *
1250 * Also, implement the GFXH-1742 workaround. There's a race in
1251 * the HW between the RCL updating the TLB's internal type/size
1252 * and the spawning of the QPU instances using the TLB's current
1253 * internal type/size. To make sure the QPUs get the right
1254 * state, we need 1 dummy store in between internal type/size
1255 * changes on V3D 3.x, and 2 dummy stores on 4.x.
1256 */
1257 for (int i = 0; i < 2; i++) {
1258 cl_emit(rcl, TILE_COORDINATES, coords);
1259 cl_emit(rcl, END_OF_LOADS, end);
1260 cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
1261 store.buffer_to_store = NONE;
1262 }
1263 if (cmd_buffer->state.tile_aligned_render_area &&
1264 (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
1265 #if V3D_VERSION == 42
1266 cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
1267 clear.clear_z_stencil_buffer = !job->early_zs_clear;
1268 clear.clear_all_render_targets = true;
1269 }
1270 #endif
1271 #if V3D_VERSION >= 71
1272 cl_emit(rcl, CLEAR_RENDER_TARGETS, clear_rt);
1273 #endif
1274 }
1275 cl_emit(rcl, END_OF_TILE_MARKER, end);
1276 }
1277
1278 cl_emit(rcl, FLUSH_VCD_CACHE, flush);
1279
1280 for (int layer = 0; layer < MAX2(1, fb_layers); layer++) {
1281 if (subpass->view_mask == 0 || (subpass->view_mask & (1u << layer)))
1282 cmd_buffer_emit_render_pass_layer_rcl(cmd_buffer, layer);
1283 }
1284
1285 cl_emit(rcl, END_OF_RENDERING, end);
1286 }
1287
1288 void
v3dX(viewport_compute_xform)1289 v3dX(viewport_compute_xform)(const VkViewport *viewport,
1290 float scale[3],
1291 float translate[3])
1292 {
1293 float x = viewport->x;
1294 float y = viewport->y;
1295 float half_width = 0.5f * viewport->width;
1296 float half_height = 0.5f * viewport->height;
1297 double n = viewport->minDepth;
1298 double f = viewport->maxDepth;
1299
1300 scale[0] = half_width;
1301 translate[0] = half_width + x;
1302 scale[1] = half_height;
1303 translate[1] = half_height + y;
1304
1305 scale[2] = (f - n);
1306 translate[2] = n;
1307
1308 /* It seems that if the scale is small enough the hardware won't clip
1309 * correctly so we work around this my choosing the smallest scale that
1310 * seems to work.
1311 *
1312 * This case is exercised by CTS:
1313 * dEQP-VK.draw.renderpass.inverted_depth_ranges.nodepthclamp_deltazero
1314 *
1315 * V3D 7.x fixes this by using the new
1316 * CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND.
1317 */
1318 #if V3D_VERSION <= 42
1319 const float min_abs_scale = 0.0005f;
1320 if (fabs(scale[2]) < min_abs_scale)
1321 scale[2] = scale[2] < 0 ? -min_abs_scale : min_abs_scale;
1322 #endif
1323 }
1324
1325 void
v3dX(cmd_buffer_emit_viewport)1326 v3dX(cmd_buffer_emit_viewport)(struct v3dv_cmd_buffer *cmd_buffer)
1327 {
1328 struct v3dv_dynamic_state *dynamic = &cmd_buffer->state.dynamic;
1329 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1330 assert(pipeline);
1331
1332 /* FIXME: right now we don't support multiViewport so viewports[0] would
1333 * work now, but would need to change if we allow multiple viewports.
1334 */
1335 float *vptranslate = dynamic->viewport.translate[0];
1336 float *vpscale = dynamic->viewport.scale[0];
1337
1338 struct v3dv_job *job = cmd_buffer->state.job;
1339 assert(job);
1340
1341 const uint32_t required_cl_size =
1342 cl_packet_length(CLIPPER_XY_SCALING) +
1343 cl_packet_length(CLIPPER_Z_SCALE_AND_OFFSET) +
1344 cl_packet_length(CLIPPER_Z_MIN_MAX_CLIPPING_PLANES) +
1345 cl_packet_length(VIEWPORT_OFFSET);
1346 v3dv_cl_ensure_space_with_branch(&job->bcl, required_cl_size);
1347 v3dv_return_if_oom(cmd_buffer, NULL);
1348
1349 #if V3D_VERSION == 42
1350 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
1351 clip.viewport_half_width_in_1_256th_of_pixel = vpscale[0] * 256.0f;
1352 clip.viewport_half_height_in_1_256th_of_pixel = vpscale[1] * 256.0f;
1353 }
1354 #endif
1355 #if V3D_VERSION >= 71
1356 cl_emit(&job->bcl, CLIPPER_XY_SCALING, clip) {
1357 clip.viewport_half_width_in_1_64th_of_pixel = vpscale[0] * 64.0f;
1358 clip.viewport_half_height_in_1_64th_of_pixel = vpscale[1] * 64.0f;
1359 }
1360 #endif
1361
1362 float translate_z, scale_z;
1363 v3dv_cmd_buffer_state_get_viewport_z_xform(cmd_buffer, 0,
1364 &translate_z, &scale_z);
1365
1366 #if V3D_VERSION == 42
1367 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
1368 clip.viewport_z_offset_zc_to_zs = translate_z;
1369 clip.viewport_z_scale_zc_to_zs = scale_z;
1370 }
1371 #endif
1372
1373 #if V3D_VERSION >= 71
1374 /* If the Z scale is too small guardband clipping may not clip correctly */
1375 if (fabsf(scale_z) < 0.01f) {
1376 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET_NO_GUARDBAND, clip) {
1377 clip.viewport_z_offset_zc_to_zs = translate_z;
1378 clip.viewport_z_scale_zc_to_zs = scale_z;
1379 }
1380 } else {
1381 cl_emit(&job->bcl, CLIPPER_Z_SCALE_AND_OFFSET, clip) {
1382 clip.viewport_z_offset_zc_to_zs = translate_z;
1383 clip.viewport_z_scale_zc_to_zs = scale_z;
1384 }
1385 }
1386 #endif
1387
1388 cl_emit(&job->bcl, CLIPPER_Z_MIN_MAX_CLIPPING_PLANES, clip) {
1389 /* Vulkan's default Z NDC is [0..1]. If 'negative_one_to_one' is enabled,
1390 * we are using OpenGL's [-1, 1] instead.
1391 */
1392 float z1 = pipeline->negative_one_to_one ? translate_z - scale_z :
1393 translate_z;
1394 float z2 = translate_z + scale_z;
1395 clip.minimum_zw = MIN2(z1, z2);
1396 clip.maximum_zw = MAX2(z1, z2);
1397 }
1398
1399 cl_emit(&job->bcl, VIEWPORT_OFFSET, vp) {
1400 float vp_fine_x = vptranslate[0];
1401 float vp_fine_y = vptranslate[1];
1402 int32_t vp_coarse_x = 0;
1403 int32_t vp_coarse_y = 0;
1404
1405 /* The fine coordinates must be unsigned, but coarse can be signed */
1406 if (unlikely(vp_fine_x < 0)) {
1407 int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_x), 64);
1408 vp_fine_x += 64.0f * blocks_64;
1409 vp_coarse_x -= blocks_64;
1410 }
1411
1412 if (unlikely(vp_fine_y < 0)) {
1413 int32_t blocks_64 = DIV_ROUND_UP(fabsf(vp_fine_y), 64);
1414 vp_fine_y += 64.0f * blocks_64;
1415 vp_coarse_y -= blocks_64;
1416 }
1417
1418 vp.fine_x = vp_fine_x;
1419 vp.fine_y = vp_fine_y;
1420 vp.coarse_x = vp_coarse_x;
1421 vp.coarse_y = vp_coarse_y;
1422 }
1423
1424 BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
1425 MESA_VK_DYNAMIC_VP_VIEWPORTS);
1426 }
1427
1428 void
v3dX(cmd_buffer_emit_stencil)1429 v3dX(cmd_buffer_emit_stencil)(struct v3dv_cmd_buffer *cmd_buffer)
1430 {
1431 struct v3dv_job *job = cmd_buffer->state.job;
1432 assert(job);
1433
1434 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1435 struct vk_dynamic_graphics_state *dyn =
1436 &cmd_buffer->vk.dynamic_graphics_state;
1437 bool has_stencil =
1438 pipeline->rendering_info.stencil_attachment_format != VK_FORMAT_UNDEFINED;
1439
1440 if (!(dyn->ds.stencil.test_enable && has_stencil))
1441 return;
1442
1443 v3dv_cl_ensure_space_with_branch(&job->bcl,
1444 2 * cl_packet_length(STENCIL_CFG));
1445 v3dv_return_if_oom(cmd_buffer, NULL);
1446
1447 bool any_dynamic_stencil_state =
1448 BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK) ||
1449 BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK) ||
1450 BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE) ||
1451 BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP);
1452
1453 bool emitted_stencil = false;
1454 const struct vk_stencil_test_face_state *front = &dyn->ds.stencil.front;
1455 const struct vk_stencil_test_face_state *back = &dyn->ds.stencil.back;
1456
1457 const bool needs_front_and_back = any_dynamic_stencil_state ?
1458 memcmp(front, back, sizeof(*front)) != 0 :
1459 pipeline->emit_stencil_cfg[1] == true;
1460
1461 for (uint32_t i = 0; i < 2; i++) {
1462 if (any_dynamic_stencil_state) {
1463 const struct vk_stencil_test_face_state *stencil_state =
1464 i == 0 ? front : back;
1465 /* If we have any dynamic stencil state we just emit the entire
1466 * packet since for simplicity
1467 */
1468 cl_emit(&job->bcl, STENCIL_CFG, config) {
1469 config.front_config = !needs_front_and_back || i == 0;
1470 config.back_config = !needs_front_and_back || i == 1;
1471 config.stencil_test_mask = stencil_state->compare_mask & 0xff;
1472 config.stencil_write_mask = stencil_state->write_mask & 0xff;
1473 config.stencil_ref_value = stencil_state->reference & 0xff;
1474 config.stencil_test_function = stencil_state->op.compare;
1475 config.stencil_pass_op =
1476 v3dX(translate_stencil_op)(stencil_state->op.pass);
1477 config.depth_test_fail_op =
1478 v3dX(translate_stencil_op)(stencil_state->op.depth_fail);
1479 config.stencil_test_fail_op =
1480 v3dX(translate_stencil_op)(stencil_state->op.fail);
1481 }
1482 } else {
1483 assert(pipeline->emit_stencil_cfg[i]);
1484 cl_emit_prepacked(&job->bcl, &pipeline->stencil_cfg[i]);
1485 }
1486 emitted_stencil = true;
1487
1488 if (!needs_front_and_back)
1489 break;
1490 }
1491 if (emitted_stencil) {
1492 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_COMPARE_MASK);
1493 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_REFERENCE);
1494 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_WRITE_MASK);
1495 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_OP);
1496 }
1497 }
1498
1499 void
v3dX(cmd_buffer_emit_depth_bias)1500 v3dX(cmd_buffer_emit_depth_bias)(struct v3dv_cmd_buffer *cmd_buffer)
1501 {
1502 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1503 assert(pipeline);
1504 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1505
1506 if (!dyn->rs.depth_bias.enable)
1507 return;
1508
1509 struct v3dv_job *job = cmd_buffer->state.job;
1510 assert(job);
1511
1512 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_OFFSET));
1513 v3dv_return_if_oom(cmd_buffer, NULL);
1514
1515 cl_emit(&job->bcl, DEPTH_OFFSET, bias) {
1516 bias.depth_offset_factor = dyn->rs.depth_bias.slope;
1517 bias.depth_offset_units = dyn->rs.depth_bias.constant;
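/* A plausible reading of the D16 special case below (an assumption, not
 * taken from this file): the hardware expresses the offset units in terms
 * of its internal 24-bit depth precision, so for a 16-bit attachment the
 * constant is scaled by 2^(24-16) = 256 to produce an equivalent bias.
 */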
1518 #if V3D_VERSION <= 42
1519 if (pipeline->rendering_info.depth_attachment_format == VK_FORMAT_D16_UNORM)
1520 bias.depth_offset_units *= 256.0f;
1521 #endif
1522 bias.limit = dyn->rs.depth_bias.clamp;
1523 }
1524
1525 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_FACTORS);
1526 }
1527
1528 void
1529 v3dX(cmd_buffer_emit_depth_bounds)(struct v3dv_cmd_buffer *cmd_buffer)
1530 {
1531 /* No depthBounds support for v42, so this method is empty in that case.
1532 *
1533 * Note that this method still gets called because v3dv_job_init flags all
1534 * state as dirty. See the FIXME note in v3dv_job_init.
1535 */
1536 #if V3D_VERSION >= 71
1537 struct vk_dynamic_graphics_state *dyn =
1538 &cmd_buffer->vk.dynamic_graphics_state;
1539
1540 if (!dyn->ds.depth.bounds_test.enable)
1541 return;
1542
1543 struct v3dv_job *job = cmd_buffer->state.job;
1544 assert(job);
1545
1546 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(DEPTH_BOUNDS_TEST_LIMITS));
1547 v3dv_return_if_oom(cmd_buffer, NULL);
1548
1549 cl_emit(&job->bcl, DEPTH_BOUNDS_TEST_LIMITS, bounds) {
1550 bounds.lower_test_limit = dyn->ds.depth.bounds_test.min;
1551 bounds.upper_test_limit = dyn->ds.depth.bounds_test.max;
1552 }
1553 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_BOUNDS);
1554 #endif
1555 }
1556
1557 void
1558 v3dX(cmd_buffer_emit_line_width)(struct v3dv_cmd_buffer *cmd_buffer)
1559 {
1560 struct v3dv_job *job = cmd_buffer->state.job;
1561 assert(job);
1562
1563 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1564
1565 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(LINE_WIDTH));
1566 v3dv_return_if_oom(cmd_buffer, NULL);
1567
1568 cl_emit(&job->bcl, LINE_WIDTH, line) {
1569 line.line_width = v3dv_get_aa_line_width(cmd_buffer->state.gfx.pipeline,
1570 cmd_buffer);
1571 }
1572
1573 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_LINE_WIDTH);
1574 }
1575
1576 void
1577 v3dX(cmd_buffer_emit_default_point_size)(struct v3dv_cmd_buffer *cmd_buffer)
1578 {
1579 struct v3dv_job *job = cmd_buffer->state.job;
1580 assert(job);
1581
1582 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(POINT_SIZE));
1583 v3dv_return_if_oom(cmd_buffer, NULL);
1584
1585 cl_emit(&job->bcl, POINT_SIZE, point) {
1586 point.point_size = 1.0f;
1587 }
1588
1589 job->emitted_default_point_size = true;
1590 }
1591
1592 void
1593 v3dX(cmd_buffer_emit_sample_state)(struct v3dv_cmd_buffer *cmd_buffer)
1594 {
1595 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1596 assert(pipeline);
1597
1598 struct v3dv_job *job = cmd_buffer->state.job;
1599 assert(job);
1600
1601 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(SAMPLE_STATE));
1602 v3dv_return_if_oom(cmd_buffer, NULL);
1603
1604 cl_emit(&job->bcl, SAMPLE_STATE, state) {
1605 state.coverage = 1.0f;
1606 state.mask = pipeline->sample_mask;
1607 }
1608 }
1609
1610 void
1611 v3dX(cmd_buffer_emit_blend)(struct v3dv_cmd_buffer *cmd_buffer)
1612 {
1613 struct v3dv_job *job = cmd_buffer->state.job;
1614 assert(job);
1615
1616 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1617 assert(pipeline);
1618
1619 const struct v3d_device_info *devinfo = &cmd_buffer->device->devinfo;
1620 const uint32_t max_color_rts = V3D_MAX_RENDER_TARGETS(devinfo->ver);
1621
1622 const uint32_t blend_packets_size =
1623 cl_packet_length(BLEND_ENABLES) +
1624 cl_packet_length(BLEND_CONSTANT_COLOR) +
1625 cl_packet_length(BLEND_CFG) * max_color_rts;
1626
1627 v3dv_cl_ensure_space_with_branch(&job->bcl, blend_packets_size);
1628 v3dv_return_if_oom(cmd_buffer, NULL);
1629
1630 if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
1631 if (pipeline->blend.enables) {
1632 cl_emit(&job->bcl, BLEND_ENABLES, enables) {
1633 enables.mask = pipeline->blend.enables;
1634 }
1635 }
1636
1637 for (uint32_t i = 0; i < max_color_rts; i++) {
1638 if (pipeline->blend.enables & (1 << i))
1639 cl_emit_prepacked(&job->bcl, &pipeline->blend.cfg[i]);
1640 }
1641 }
1642
1643 if (pipeline->blend.needs_color_constants) {
1644 const struct vk_dynamic_graphics_state *dyn =
1645 &cmd_buffer->vk.dynamic_graphics_state;
1646
1647 cl_emit(&job->bcl, BLEND_CONSTANT_COLOR, color) {
1648 color.red_f16 = _mesa_float_to_half(dyn->cb.blend_constants[0]);
1649 color.green_f16 = _mesa_float_to_half(dyn->cb.blend_constants[1]);
1650 color.blue_f16 = _mesa_float_to_half(dyn->cb.blend_constants[2]);
1651 color.alpha_f16 = _mesa_float_to_half(dyn->cb.blend_constants[3]);
1652 }
1653 }
1654
1655 BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
1656 MESA_VK_DYNAMIC_CB_BLEND_CONSTANTS);
1657 }
1658
1659 void
1660 v3dX(cmd_buffer_emit_color_write_mask)(struct v3dv_cmd_buffer *cmd_buffer)
1661 {
1662 struct v3dv_job *job = cmd_buffer->state.job;
1663 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(COLOR_WRITE_MASKS));
1664
1665 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1666 struct v3dv_dynamic_state *v3dv_dyn = &cmd_buffer->state.dynamic;
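/* The dynamic color_write_enable mask appears to use set bits for enabled
 * channels, while the pipeline masks and the COLOR_WRITE_MASKS packet use
 * set bits to disable writes, hence the negation before OR-ing them below.
 */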
1667 uint32_t color_write_mask = ~v3dv_dyn->color_write_enable |
1668 pipeline->blend.color_write_masks;
1669
1670 #if V3D_VERSION <= 42
1671 /* Only 4 RTs */
1672 color_write_mask &= 0xffff;
1673 #endif
1674
1675 cl_emit(&job->bcl, COLOR_WRITE_MASKS, mask) {
1676 mask.mask = color_write_mask;
1677 }
1678
1679 BITSET_CLEAR(cmd_buffer->vk.dynamic_graphics_state.dirty,
1680 MESA_VK_DYNAMIC_CB_COLOR_WRITE_ENABLES);
1681 }
1682
1683 static void
1684 emit_flat_shade_flags(struct v3dv_job *job,
1685 int varying_offset,
1686 uint32_t varyings,
1687 enum V3DX(Varying_Flags_Action) lower,
1688 enum V3DX(Varying_Flags_Action) higher)
1689 {
1690 v3dv_cl_ensure_space_with_branch(&job->bcl,
1691 cl_packet_length(FLAT_SHADE_FLAGS));
1692 v3dv_return_if_oom(NULL, job);
1693
1694 cl_emit(&job->bcl, FLAT_SHADE_FLAGS, flags) {
1695 flags.varying_offset_v0 = varying_offset;
1696 flags.flat_shade_flags_for_varyings_v024 = varyings;
1697 flags.action_for_flat_shade_flags_of_lower_numbered_varyings = lower;
1698 flags.action_for_flat_shade_flags_of_higher_numbered_varyings = higher;
1699 }
1700 }
1701
1702 static void
1703 emit_noperspective_flags(struct v3dv_job *job,
1704 int varying_offset,
1705 uint32_t varyings,
1706 enum V3DX(Varying_Flags_Action) lower,
1707 enum V3DX(Varying_Flags_Action) higher)
1708 {
1709 v3dv_cl_ensure_space_with_branch(&job->bcl,
1710 cl_packet_length(NON_PERSPECTIVE_FLAGS));
1711 v3dv_return_if_oom(NULL, job);
1712
1713 cl_emit(&job->bcl, NON_PERSPECTIVE_FLAGS, flags) {
1714 flags.varying_offset_v0 = varying_offset;
1715 flags.non_perspective_flags_for_varyings_v024 = varyings;
1716 flags.action_for_non_perspective_flags_of_lower_numbered_varyings = lower;
1717 flags.action_for_non_perspective_flags_of_higher_numbered_varyings = higher;
1718 }
1719 }
1720
1721 static void
1722 emit_centroid_flags(struct v3dv_job *job,
1723 int varying_offset,
1724 uint32_t varyings,
1725 enum V3DX(Varying_Flags_Action) lower,
1726 enum V3DX(Varying_Flags_Action) higher)
1727 {
1728 v3dv_cl_ensure_space_with_branch(&job->bcl,
1729 cl_packet_length(CENTROID_FLAGS));
1730 v3dv_return_if_oom(NULL, job);
1731
1732 cl_emit(&job->bcl, CENTROID_FLAGS, flags) {
1733 flags.varying_offset_v0 = varying_offset;
1734 flags.centroid_flags_for_varyings_v024 = varyings;
1735 flags.action_for_centroid_flags_of_lower_numbered_varyings = lower;
1736 flags.action_for_centroid_flags_of_higher_numbered_varyings = higher;
1737 }
1738 }
1739
1740 static bool
1741 emit_varying_flags(struct v3dv_job *job,
1742 uint32_t num_flags,
1743 const uint32_t *flags,
1744 void (*flag_emit_callback)(struct v3dv_job *job,
1745 int varying_offset,
1746 uint32_t flags,
1747 enum V3DX(Varying_Flags_Action) lower,
1748 enum V3DX(Varying_Flags_Action) higher))
1749 {
1750 bool emitted_any = false;
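/* Worked example (hypothetical flags, for illustration only): with
 * num_flags = 3 and flags = { 0x0, 0x6, 0x1 }, offset 0 is skipped,
 * offset 1 is emitted with lower and higher flags ZEROED (clearing every
 * other offset), and offset 2 is emitted with both actions UNCHANGED so it
 * does not clobber what offset 1 just programmed.
 */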
1751 for (int i = 0; i < num_flags; i++) {
1752 if (!flags[i])
1753 continue;
1754
1755 if (emitted_any) {
1756 flag_emit_callback(job, i, flags[i],
1757 V3D_VARYING_FLAGS_ACTION_UNCHANGED,
1758 V3D_VARYING_FLAGS_ACTION_UNCHANGED);
1759 } else if (i == 0) {
1760 flag_emit_callback(job, i, flags[i],
1761 V3D_VARYING_FLAGS_ACTION_UNCHANGED,
1762 V3D_VARYING_FLAGS_ACTION_ZEROED);
1763 } else {
1764 flag_emit_callback(job, i, flags[i],
1765 V3D_VARYING_FLAGS_ACTION_ZEROED,
1766 V3D_VARYING_FLAGS_ACTION_ZEROED);
1767 }
1768
1769 emitted_any = true;
1770 }
1771
1772 return emitted_any;
1773 }
1774
1775 void
1776 v3dX(cmd_buffer_emit_varyings_state)(struct v3dv_cmd_buffer *cmd_buffer)
1777 {
1778 struct v3dv_job *job = cmd_buffer->state.job;
1779 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1780
1781 struct v3d_fs_prog_data *prog_data_fs =
1782 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT]->prog_data.fs;
1783
1784 const uint32_t num_flags =
1785 ARRAY_SIZE(prog_data_fs->flat_shade_flags);
1786 const uint32_t *flat_shade_flags = prog_data_fs->flat_shade_flags;
1787 const uint32_t *noperspective_flags = prog_data_fs->noperspective_flags;
1788 const uint32_t *centroid_flags = prog_data_fs->centroid_flags;
1789
1790 if (!emit_varying_flags(job, num_flags, flat_shade_flags,
1791 emit_flat_shade_flags)) {
1792 v3dv_cl_ensure_space_with_branch(
1793 &job->bcl, cl_packet_length(ZERO_ALL_FLAT_SHADE_FLAGS));
1794 v3dv_return_if_oom(cmd_buffer, NULL);
1795
1796 cl_emit(&job->bcl, ZERO_ALL_FLAT_SHADE_FLAGS, flags);
1797 }
1798
1799 if (!emit_varying_flags(job, num_flags, noperspective_flags,
1800 emit_noperspective_flags)) {
1801 v3dv_cl_ensure_space_with_branch(
1802 &job->bcl, cl_packet_length(ZERO_ALL_NON_PERSPECTIVE_FLAGS));
1803 v3dv_return_if_oom(cmd_buffer, NULL);
1804
1805 cl_emit(&job->bcl, ZERO_ALL_NON_PERSPECTIVE_FLAGS, flags);
1806 }
1807
1808 if (!emit_varying_flags(job, num_flags, centroid_flags,
1809 emit_centroid_flags)) {
1810 v3dv_cl_ensure_space_with_branch(
1811 &job->bcl, cl_packet_length(ZERO_ALL_CENTROID_FLAGS));
1812 v3dv_return_if_oom(cmd_buffer, NULL);
1813
1814 cl_emit(&job->bcl, ZERO_ALL_CENTROID_FLAGS, flags);
1815 }
1816 }
1817
1818 #if V3D_VERSION == 42
1819 /* Updates the early-Z state tracking in the cmd_buffer and its job.
1820 * Returns false if EZ must be disabled for the current draw call.
1821 */
1822 static bool
1823 cmd_buffer_update_ez_state(struct v3dv_cmd_buffer *cmd_buffer,
1824 struct v3dv_pipeline *pipeline)
1825 {
1826 struct vk_dynamic_graphics_state *dyn = &cmd_buffer->vk.dynamic_graphics_state;
1827 /* First, update the cmd_buffer's ez_state tracking. If possible we reuse
1828 * the values from the pipeline.
1829 */
1830 if (!BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_OP) &&
1831 !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE) &&
1832 !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_TEST_ENABLE) &&
1833 !BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_DS_DEPTH_COMPARE_OP)) {
1834 cmd_buffer->state.ez_state = pipeline->ez_state;
1835 cmd_buffer->state.incompatible_ez_test =
1836 pipeline->incompatible_ez_test;
1837 } else {
1838 v3dv_compute_ez_state(dyn, pipeline,
1839 &cmd_buffer->state.ez_state,
1840 &cmd_buffer->state.incompatible_ez_test);
1841 }
1842
1843 struct v3dv_job *job = cmd_buffer->state.job;
1844 assert(job);
1845 /* If first_ez_state is V3D_EZ_DISABLED it means that we have already
1846 * determined that we should disable EZ completely for all draw calls in
1847 * this job. This will cause us to disable EZ for the entire job in the
1848 * Tile Rendering Mode RCL packet and when we do that we need to make sure
1849 * we never emit a draw call in the job with EZ enabled in the CFG_BITS
1850 * packet, so ez_state must also be V3D_EZ_DISABLED.
1851 */
1852 if (job->first_ez_state == V3D_EZ_DISABLED) {
1853 assert(job->ez_state == V3D_EZ_DISABLED);
1854 return false;
1855 }
1856
1857 /* If ez_state is V3D_EZ_DISABLED it means that we have already decided
1858 * that EZ must be disabled for the remainder of the frame.
1859 */
1860 if (job->ez_state == V3D_EZ_DISABLED)
1861 return false;
1862
1863 /* This is part of the pre draw call handling, so we should be inside a
1864 * render pass.
1865 */
1866 assert(cmd_buffer->state.pass);
1867
1868 /* If this is the first time we update EZ state for this job we first check
1869 * if there is anything that requires disabling it completely for the entire
1870 * job (based on state that is not related to the current draw call and
1871 * pipeline/cmd_buffer state).
1872 */
1873 if (!job->decided_global_ez_enable) {
1874 job->decided_global_ez_enable = true;
1875
1876 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
1877 assert(state->subpass_idx < state->pass->subpass_count);
1878 struct v3dv_subpass *subpass = &state->pass->subpasses[state->subpass_idx];
1879 if (subpass->ds_attachment.attachment == VK_ATTACHMENT_UNUSED) {
1880 job->first_ez_state = V3D_EZ_DISABLED;
1881 job->ez_state = V3D_EZ_DISABLED;
1882 return false;
1883 }
1884
1885 /* GFXH-1918: the early-z buffer may load incorrect depth values if the
1886 * frame has odd width or height, or if the buffer is 16-bit and
1887 * multisampled.
1888 *
1889 * So we need to disable EZ in these cases.
1890 */
1891 const struct v3dv_render_pass_attachment *ds_attachment =
1892 &state->pass->attachments[subpass->ds_attachment.attachment];
1893
1894 const VkImageAspectFlags ds_aspects =
1895 vk_format_aspects(ds_attachment->desc.format);
1896
1897 bool needs_depth_load =
1898 v3dv_cmd_buffer_check_needs_load(state,
1899 ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT,
1900 ds_attachment->first_subpass,
1901 ds_attachment->desc.loadOp,
1902 ds_attachment->last_subpass,
1903 ds_attachment->desc.storeOp);
1904
1905 if (needs_depth_load) {
1906 if (ds_attachment->desc.format == VK_FORMAT_D16_UNORM &&
1907 ds_attachment->desc.samples != VK_SAMPLE_COUNT_1_BIT) {
1908 perf_debug("Loading depth aspect from a multisampled 16-bit "
1909 "depth buffer disables early-Z tests.\n");
1910 job->first_ez_state = V3D_EZ_DISABLED;
1911 job->ez_state = V3D_EZ_DISABLED;
1912 return false;
1913 }
1914
1915 struct v3dv_framebuffer *fb = state->framebuffer;
1916
1917 if (!fb) {
1918 assert(cmd_buffer->vk.level == VK_COMMAND_BUFFER_LEVEL_SECONDARY);
1919 perf_debug("Loading depth aspect in a secondary command buffer "
1920 "without framebuffer info disables early-z tests.\n");
1921 job->first_ez_state = V3D_EZ_DISABLED;
1922 job->ez_state = V3D_EZ_DISABLED;
1923 return false;
1924 }
1925
1926 if (((fb->width % 2) != 0 || (fb->height % 2) != 0)) {
1927 perf_debug("Loading depth aspect for framebuffer with odd width "
1928 "or height disables early-Z tests.\n");
1929 job->first_ez_state = V3D_EZ_DISABLED;
1930 job->ez_state = V3D_EZ_DISABLED;
1931 return false;
1932 }
1933 }
1934 }
1935
1936 /* Otherwise, we can decide to selectively enable or disable EZ for draw
1937 * calls using the CFG_BITS packet based on the bound pipeline state, or
1938 * cmd_buffer state if some stencil/depth flags were dynamic.
1939 */
1940 bool disable_ez = false;
1941 bool incompatible_test = false;
1942 switch (cmd_buffer->state.ez_state) {
1943 case V3D_EZ_UNDECIDED:
1944 /* If the pipeline didn't pick a direction but didn't disable, then go
1945 * along with the current EZ state. This allows EZ optimization for Z
1946 * func == EQUAL or NEVER.
1947 */
1948 break;
1949
1950 case V3D_EZ_LT_LE:
1951 case V3D_EZ_GT_GE:
1952 /* If the pipeline picked a direction, then it needs to match the current
1953 * direction if we've decided on one.
1954 */
1955 if (job->ez_state == V3D_EZ_UNDECIDED) {
1956 job->ez_state = cmd_buffer->state.ez_state;
1957 } else if (job->ez_state != pipeline->ez_state) {
1958 disable_ez = true;
1959 incompatible_test = true;
1960 }
1961 break;
1962
1963 case V3D_EZ_DISABLED:
1964 disable_ez = true;
1965 incompatible_test = cmd_buffer->state.incompatible_ez_test;
1966 break;
1967 }
1968
1969 if (job->first_ez_state == V3D_EZ_UNDECIDED && !disable_ez) {
1970 assert(job->ez_state != V3D_EZ_DISABLED);
1971 job->first_ez_state = job->ez_state;
1972 }
1973
1974 /* If we had to disable EZ because of an incompatible test direction and
1975 * the cmd buffer writes depth, then we need to disable EZ for the rest
1976 * of the frame.
1977 */
1978 if (incompatible_test && cmd_buffer->state.z_updates_enable) {
1979 assert(disable_ez);
1980 job->ez_state = V3D_EZ_DISABLED;
1981 }
1982
1983 if (!disable_ez)
1984 job->has_ez_draws = true;
1985
1986 return !disable_ez;
1987 }
1988 #endif
1989
1990 void
1991 v3dX(cmd_buffer_emit_configuration_bits)(struct v3dv_cmd_buffer *cmd_buffer)
1992 {
1993 struct v3dv_job *job = cmd_buffer->state.job;
1994 assert(job);
1995
1996 struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline;
1997 assert(pipeline);
1998
1999 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(CFG_BITS));
2000 v3dv_return_if_oom(cmd_buffer, NULL);
2001
2002 struct vk_dynamic_graphics_state *dyn =
2003 &cmd_buffer->vk.dynamic_graphics_state;
2004
2005 /* Disable depth/stencil if we don't have a D/S attachment */
2006 bool has_depth =
2007 pipeline->rendering_info.depth_attachment_format != VK_FORMAT_UNDEFINED;
2008 bool has_stencil =
2009 pipeline->rendering_info.stencil_attachment_format != VK_FORMAT_UNDEFINED;
2010
2011 cl_emit_with_prepacked(&job->bcl, CFG_BITS, pipeline->cfg_bits, config) {
2012 if (dyn->ds.depth.test_enable && has_depth) {
2013 config.z_updates_enable = dyn->ds.depth.write_enable;
2014 config.depth_test_function = dyn->ds.depth.compare_op;
2015 } else {
2016 config.depth_test_function = VK_COMPARE_OP_ALWAYS;
2017 }
2018
2019 config.stencil_enable = dyn->ds.stencil.test_enable && has_stencil;
2020
2021 cmd_buffer->state.z_updates_enable = config.z_updates_enable;
2022 #if V3D_VERSION == 42
2023 bool enable_ez = cmd_buffer_update_ez_state(cmd_buffer, pipeline);
2024 config.early_z_enable = enable_ez;
2025 config.early_z_updates_enable = config.early_z_enable &&
2026 cmd_buffer->state.z_updates_enable;
2027 #endif
2028
2029 if (!dyn->rs.rasterizer_discard_enable) {
2030 assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_CULL_MODE));
2031 assert(BITSET_TEST(dyn->set, MESA_VK_DYNAMIC_RS_FRONT_FACE));
2032 config.enable_forward_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_FRONT_BIT);
2033 config.enable_reverse_facing_primitive = !(dyn->rs.cull_mode & VK_CULL_MODE_BACK_BIT);
2034 /* Seems like the hardware is backwards regarding this setting... */
2035 config.clockwise_primitives = dyn->rs.front_face == VK_FRONT_FACE_COUNTER_CLOCKWISE;
2036 }
2037
2038 /* V3D 4.2 doesn't support depth bounds testing so we don't advertise that
2039 * feature and it shouldn't be used by any pipeline.
2040 */
2041 assert(cmd_buffer->device->devinfo.ver >= 71 ||
2042 !dyn->ds.depth.bounds_test.enable);
2043 #if V3D_VERSION >= 71
2044 config.depth_bounds_test_enable =
2045 dyn->ds.depth.bounds_test.enable && has_depth;
2046 #endif
2047
2048 config.enable_depth_offset = dyn->rs.depth_bias.enable;
2049 }
2050
2051 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_CULL_MODE);
2052 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_FRONT_FACE);
2053 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_DEPTH_BOUNDS_TEST_ENABLE);
2054 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_DS_STENCIL_TEST_ENABLE);
2055 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_DEPTH_BIAS_ENABLE);
2056 BITSET_CLEAR(dyn->dirty, MESA_VK_DYNAMIC_RS_RASTERIZER_DISCARD_ENABLE);
2057 }
2058
2059 void
2060 v3dX(cmd_buffer_emit_occlusion_query)(struct v3dv_cmd_buffer *cmd_buffer)
2061 {
2062 struct v3dv_job *job = cmd_buffer->state.job;
2063 assert(job);
2064
2065 v3dv_cl_ensure_space_with_branch(&job->bcl,
2066 cl_packet_length(OCCLUSION_QUERY_COUNTER));
2067 v3dv_return_if_oom(cmd_buffer, NULL);
2068
2069 cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter) {
2070 if (cmd_buffer->state.query.active_query.bo) {
2071 counter.address =
2072 v3dv_cl_address(cmd_buffer->state.query.active_query.bo,
2073 cmd_buffer->state.query.active_query.offset);
2074 }
2075 }
2076
2077 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_OCCLUSION_QUERY;
2078 }
2079
2080 static struct v3dv_job *
2081 cmd_buffer_subpass_split_for_barrier(struct v3dv_cmd_buffer *cmd_buffer,
2082 bool is_bcl_barrier)
2083 {
2084 assert(cmd_buffer->state.subpass_idx != -1);
2085 v3dv_cmd_buffer_finish_job(cmd_buffer);
2086 struct v3dv_job *job =
2087 v3dv_cmd_buffer_subpass_resume(cmd_buffer,
2088 cmd_buffer->state.subpass_idx);
2089 if (!job)
2090 return NULL;
2091
2092 /* FIXME: we can do better than all barriers */
2093 job->serialize = V3DV_BARRIER_ALL;
2094 job->needs_bcl_sync = is_bcl_barrier;
2095 return job;
2096 }
2097
2098 static void
2099 cmd_buffer_copy_secondary_end_query_state(struct v3dv_cmd_buffer *primary,
2100 struct v3dv_cmd_buffer *secondary)
2101 {
2102 struct v3dv_cmd_buffer_state *p_state = &primary->state;
2103 struct v3dv_cmd_buffer_state *s_state = &secondary->state;
2104
2105 const uint32_t total_state_count =
2106 p_state->query.end.used_count + s_state->query.end.used_count;
2107 v3dv_cmd_buffer_ensure_array_state(primary,
2108 sizeof(struct v3dv_end_query_info),
2109 total_state_count,
2110 &p_state->query.end.alloc_count,
2111 (void **) &p_state->query.end.states);
2112 v3dv_return_if_oom(primary, NULL);
2113
2114 for (uint32_t i = 0; i < s_state->query.end.used_count; i++) {
2115 const struct v3dv_end_query_info *s_qstate =
2116 &secondary->state.query.end.states[i];
2117
2118 struct v3dv_end_query_info *p_qstate =
2119 &p_state->query.end.states[p_state->query.end.used_count++];
2120
2121 memcpy(p_qstate, s_qstate, sizeof(struct v3dv_end_query_info));
2122 }
2123 }
2124
2125 void
2126 v3dX(cmd_buffer_execute_inside_pass)(struct v3dv_cmd_buffer *primary,
2127 uint32_t cmd_buffer_count,
2128 const VkCommandBuffer *cmd_buffers)
2129 {
2130 assert(primary->state.job);
2131
2132 /* Typically we postpone applying binning syncs until we see a draw call
2133 * that may actually access protected resources in the binning stage. However,
2134 * if the draw calls are recorded in a secondary command buffer and the
2135 * barriers were recorded in a primary command buffer, that won't work
2136 * and we will have to check if we need a binning sync when executing the
2137 * secondary.
2138 */
2139 struct v3dv_job *primary_job = primary->state.job;
2140 if (primary_job->serialize &&
2141 (primary->state.barrier.bcl_buffer_access ||
2142 primary->state.barrier.bcl_image_access)) {
2143 v3dv_cmd_buffer_consume_bcl_sync(primary, primary_job);
2144 }
2145
2146 /* Emit occlusion query state if needed so the draw calls inside our
2147 * secondaries update the counters.
2148 */
2149 bool has_occlusion_query =
2150 primary->state.dirty & V3DV_CMD_DIRTY_OCCLUSION_QUERY;
2151 if (has_occlusion_query)
2152 v3dX(cmd_buffer_emit_occlusion_query)(primary);
2153
2154 /* FIXME: if our primary job tiling doesn't enable MSAA but any of the
2155 * pipelines used by the secondaries do, we need to re-start the primary
2156 * job to enable MSAA. See cmd_buffer_restart_job_for_msaa_if_needed.
2157 */
2158 struct v3dv_barrier_state pending_barrier = { 0 };
2159 for (uint32_t i = 0; i < cmd_buffer_count; i++) {
2160 V3DV_FROM_HANDLE(v3dv_cmd_buffer, secondary, cmd_buffers[i]);
2161
2162 assert(secondary->usage_flags &
2163 VK_COMMAND_BUFFER_USAGE_RENDER_PASS_CONTINUE_BIT);
2164
2165 list_for_each_entry(struct v3dv_job, secondary_job,
2166 &secondary->jobs, list_link) {
2167 if (secondary_job->type == V3DV_JOB_TYPE_GPU_CL_INCOMPLETE) {
2168 /* If the job is a CL, then we branch to it from the primary BCL.
2169 * In this case the secondary's BCL is finished with a
2170 * RETURN_FROM_SUB_LIST command to return back to the primary BCL
2171 * once we are done executing it.
2172 */
2173 assert(v3dv_cl_offset(&secondary_job->rcl) == 0);
2174 assert(secondary_job->bcl.bo);
2175
2176 /* Sanity check that secondary BCL ends with RETURN_FROM_SUB_LIST */
2177 STATIC_ASSERT(cl_packet_length(RETURN_FROM_SUB_LIST) == 1);
2178 assert(v3dv_cl_offset(&secondary_job->bcl) >= 1);
2179 assert(*(((uint8_t *)secondary_job->bcl.next) - 1) ==
2180 V3DX(RETURN_FROM_SUB_LIST_opcode));
2181
2182 /* If this secondary has any barriers (or we had any pending barrier
2183 * to apply), then we can't just branch to it from the primary, we
2184 * need to split the primary to create a new job that can consume
2185 * the barriers first.
2186 *
2187 * FIXME: in this case, maybe just copy the secondary BCL without
2188 * the RETURN_FROM_SUB_LIST into the primary job to skip the
2189 * branch?
2190 */
2191 primary_job = primary->state.job;
2192 if (!primary_job || secondary_job->serialize ||
2193 pending_barrier.dst_mask) {
2194 const bool needs_bcl_barrier =
2195 secondary_job->needs_bcl_sync ||
2196 pending_barrier.bcl_buffer_access ||
2197 pending_barrier.bcl_image_access;
2198
2199 primary_job =
2200 cmd_buffer_subpass_split_for_barrier(primary,
2201 needs_bcl_barrier);
2202 v3dv_return_if_oom(primary, NULL);
2203
2204 /* Since we have created a new primary we need to re-emit
2205 * occlusion query state.
2206 */
2207 if (has_occlusion_query)
2208 v3dX(cmd_buffer_emit_occlusion_query)(primary);
2209 }
2210
2211 /* Make sure our primary job has all required BO references */
2212 set_foreach(secondary_job->bos, entry) {
2213 struct v3dv_bo *bo = (struct v3dv_bo *)entry->key;
2214 v3dv_job_add_bo(primary_job, bo);
2215 }
2216
2217 /* Emit required branch instructions. We expect each of these
2218 * to end with a corresponding 'return from sub list' item.
2219 */
2220 list_for_each_entry(struct v3dv_bo, bcl_bo,
2221 &secondary_job->bcl.bo_list, list_link) {
2222 v3dv_cl_ensure_space_with_branch(&primary_job->bcl,
2223 cl_packet_length(BRANCH_TO_SUB_LIST));
2224 v3dv_return_if_oom(primary, NULL);
2225 cl_emit(&primary_job->bcl, BRANCH_TO_SUB_LIST, branch) {
2226 branch.address = v3dv_cl_address(bcl_bo, 0);
2227 }
2228 }
2229
2230 if (!secondary_job->can_use_double_buffer) {
2231 primary_job->can_use_double_buffer = false;
2232 } else {
2233 primary_job->double_buffer_score.geom +=
2234 secondary_job->double_buffer_score.geom;
2235 primary_job->double_buffer_score.render +=
2236 secondary_job->double_buffer_score.render;
2237 }
2238 primary_job->tmu_dirty_rcl |= secondary_job->tmu_dirty_rcl;
2239 } else {
2240 /* This is a regular job (CPU or GPU), so just finish the current
2241 * primary job (if any) and then add the secondary job to the
2242 * primary's job list right after it.
2243 */
2244 v3dv_cmd_buffer_finish_job(primary);
2245 v3dv_job_clone_in_cmd_buffer(secondary_job, primary);
2246 if (pending_barrier.dst_mask) {
2247 /* FIXME: do the same we do for primaries and only choose the
2248 * relevant src masks.
2249 */
2250 secondary_job->serialize = pending_barrier.src_mask_graphics |
2251 pending_barrier.src_mask_transfer |
2252 pending_barrier.src_mask_compute;
2253 if (pending_barrier.bcl_buffer_access ||
2254 pending_barrier.bcl_image_access) {
2255 secondary_job->needs_bcl_sync = true;
2256 }
2257 }
2258 }
2259
2260 memset(&pending_barrier, 0, sizeof(pending_barrier));
2261 }
2262
2263 /* If the secondary has recorded any vkCmdEndQuery commands, we need to
2264 * copy this state to the primary so it is processed properly when the
2265 * current primary job is finished.
2266 */
2267 cmd_buffer_copy_secondary_end_query_state(primary, secondary);
2268
2269 /* If this secondary had any pending barrier state we will need that
2270 * barrier state consumed with whatever comes next in the primary.
2271 */
2272 assert(secondary->state.barrier.dst_mask ||
2273 (!secondary->state.barrier.bcl_buffer_access &&
2274 !secondary->state.barrier.bcl_image_access));
2275
2276 pending_barrier = secondary->state.barrier;
2277 }
2278
2279 if (pending_barrier.dst_mask) {
2280 v3dv_cmd_buffer_merge_barrier_state(&primary->state.barrier,
2281 &pending_barrier);
2282 }
2283 }
2284
2285 static void
2286 emit_gs_shader_state_record(struct v3dv_job *job,
2287 struct v3dv_bo *assembly_bo,
2288 struct v3dv_shader_variant *gs_bin,
2289 struct v3dv_cl_reloc gs_bin_uniforms,
2290 struct v3dv_shader_variant *gs,
2291 struct v3dv_cl_reloc gs_render_uniforms)
2292 {
2293 cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) {
2294 shader.geometry_bin_mode_shader_code_address =
2295 v3dv_cl_address(assembly_bo, gs_bin->assembly_offset);
2296 shader.geometry_bin_mode_shader_4_way_threadable =
2297 gs_bin->prog_data.gs->base.threads == 4;
2298 shader.geometry_bin_mode_shader_start_in_final_thread_section =
2299 gs_bin->prog_data.gs->base.single_seg;
2300 #if V3D_VERSION <= 42
2301 shader.geometry_bin_mode_shader_propagate_nans = true;
2302 #endif
2303 shader.geometry_bin_mode_shader_uniforms_address =
2304 gs_bin_uniforms;
2305
2306 shader.geometry_render_mode_shader_code_address =
2307 v3dv_cl_address(assembly_bo, gs->assembly_offset);
2308 shader.geometry_render_mode_shader_4_way_threadable =
2309 gs->prog_data.gs->base.threads == 4;
2310 shader.geometry_render_mode_shader_start_in_final_thread_section =
2311 gs->prog_data.gs->base.single_seg;
2312 #if V3D_VERSION <= 42
2313 shader.geometry_render_mode_shader_propagate_nans = true;
2314 #endif
2315 shader.geometry_render_mode_shader_uniforms_address =
2316 gs_render_uniforms;
2317 }
2318 }
2319
2320 static uint8_t
2321 v3d_gs_output_primitive(enum mesa_prim prim_type)
2322 {
2323 switch (prim_type) {
2324 case MESA_PRIM_POINTS:
2325 return GEOMETRY_SHADER_POINTS;
2326 case MESA_PRIM_LINE_STRIP:
2327 return GEOMETRY_SHADER_LINE_STRIP;
2328 case MESA_PRIM_TRIANGLE_STRIP:
2329 return GEOMETRY_SHADER_TRI_STRIP;
2330 default:
2331 unreachable("Unsupported primitive type");
2332 }
2333 }
2334
2335 static void
2336 emit_tes_gs_common_params(struct v3dv_job *job,
2337 uint8_t gs_out_prim_type,
2338 uint8_t gs_num_invocations)
2339 {
2340 cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) {
2341 shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE;
2342 shader.tessellation_point_mode = false;
2343 shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN;
2344 shader.tessellation_clockwise = true;
2345 shader.tessellation_invocations = 1;
2346
2347 shader.geometry_shader_output_format =
2348 v3d_gs_output_primitive(gs_out_prim_type);
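/* The instances field below is presumably 5 bits wide, hence the 0x1F
 * mask; with the Vulkan maximum of 32 GS invocations the encoded value
 * wraps to 0.
 */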
2349 shader.geometry_shader_instances = gs_num_invocations & 0x1F;
2350 }
2351 }
2352
2353 static uint8_t
2354 simd_width_to_gs_pack_mode(uint32_t width)
2355 {
2356 switch (width) {
2357 case 16:
2358 return V3D_PACK_MODE_16_WAY;
2359 case 8:
2360 return V3D_PACK_MODE_8_WAY;
2361 case 4:
2362 return V3D_PACK_MODE_4_WAY;
2363 case 1:
2364 return V3D_PACK_MODE_1_WAY;
2365 default:
2366 unreachable("Invalid SIMD width");
2367 };
2368 }
2369
2370 static void
2371 emit_tes_gs_shader_params(struct v3dv_job *job,
2372 uint32_t gs_simd,
2373 uint32_t gs_vpm_output_size,
2374 uint32_t gs_max_vpm_input_size_per_batch)
2375 {
2376 cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) {
2377 shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED;
2378 shader.per_patch_data_column_depth = 1;
2379 shader.tcs_output_segment_size_in_sectors = 1;
2380 shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
2381 shader.tes_output_segment_size_in_sectors = 1;
2382 shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
2383 shader.gs_output_segment_size_in_sectors = gs_vpm_output_size;
2384 shader.gs_output_segment_pack_mode =
2385 simd_width_to_gs_pack_mode(gs_simd);
2386 shader.tbg_max_patches_per_tcs_batch = 1;
2387 shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0;
2388 shader.tbg_min_tcs_output_segments_required_in_play = 1;
2389 shader.tbg_min_per_patch_data_segments_required_in_play = 1;
2390 shader.tpg_max_patches_per_tes_batch = 1;
2391 shader.tpg_max_vertex_segments_per_tes_batch = 0;
2392 shader.tpg_max_tcs_output_segments_per_tes_batch = 1;
2393 shader.tpg_min_tes_output_segments_required_in_play = 1;
2394 shader.gbg_max_tes_output_vertex_segments_per_gs_batch =
2395 gs_max_vpm_input_size_per_batch;
2396 shader.gbg_min_gs_output_segments_required_in_play = 1;
2397 }
2398 }
2399
2400 void
2401 v3dX(cmd_buffer_emit_gl_shader_state)(struct v3dv_cmd_buffer *cmd_buffer)
2402 {
2403 struct v3dv_job *job = cmd_buffer->state.job;
2404 assert(job);
2405
2406 struct v3dv_cmd_buffer_state *state = &cmd_buffer->state;
2407 struct v3dv_pipeline *pipeline = state->gfx.pipeline;
2408 assert(pipeline);
2409
2410 struct v3dv_shader_variant *vs_variant =
2411 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX];
2412 struct v3d_vs_prog_data *prog_data_vs = vs_variant->prog_data.vs;
2413
2414 struct v3dv_shader_variant *vs_bin_variant =
2415 pipeline->shared_data->variants[BROADCOM_SHADER_VERTEX_BIN];
2416 struct v3d_vs_prog_data *prog_data_vs_bin = vs_bin_variant->prog_data.vs;
2417
2418 struct v3dv_shader_variant *fs_variant =
2419 pipeline->shared_data->variants[BROADCOM_SHADER_FRAGMENT];
2420 struct v3d_fs_prog_data *prog_data_fs = fs_variant->prog_data.fs;
2421
2422 struct v3dv_shader_variant *gs_variant = NULL;
2423 struct v3dv_shader_variant *gs_bin_variant = NULL;
2424 struct v3d_gs_prog_data *prog_data_gs = NULL;
2425 struct v3d_gs_prog_data *prog_data_gs_bin = NULL;
2426 if (pipeline->has_gs) {
2427 gs_variant =
2428 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY];
2429 prog_data_gs = gs_variant->prog_data.gs;
2430
2431 gs_bin_variant =
2432 pipeline->shared_data->variants[BROADCOM_SHADER_GEOMETRY_BIN];
2433 prog_data_gs_bin = gs_bin_variant->prog_data.gs;
2434 }
2435
2436 /* Update the cache dirty flag based on the shader progs data */
2437 job->tmu_dirty_rcl |= prog_data_vs_bin->base.tmu_dirty_rcl;
2438 job->tmu_dirty_rcl |= prog_data_vs->base.tmu_dirty_rcl;
2439 job->tmu_dirty_rcl |= prog_data_fs->base.tmu_dirty_rcl;
2440 if (pipeline->has_gs) {
2441 job->tmu_dirty_rcl |= prog_data_gs_bin->base.tmu_dirty_rcl;
2442 job->tmu_dirty_rcl |= prog_data_gs->base.tmu_dirty_rcl;
2443 }
2444
2445 /* See GFXH-930 workaround below */
2446 uint32_t num_elements_to_emit = MAX2(pipeline->va_count, 1);
2447
2448 uint32_t shader_state_record_length =
2449 cl_packet_length(GL_SHADER_STATE_RECORD);
2450 #if V3D_VERSION >= 71
2451 if (v3d_device_has_draw_index(&pipeline->device->devinfo)) {
2452 shader_state_record_length =
2453 cl_packet_length(GL_SHADER_STATE_RECORD_DRAW_INDEX);
2454 }
2455 #endif
2456
2457 if (pipeline->has_gs) {
2458 shader_state_record_length +=
2459 cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) +
2460 cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) +
2461 2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS);
2462 }
2463
2464 uint32_t shader_rec_offset =
2465 v3dv_cl_ensure_space(&job->indirect,
2466 shader_state_record_length +
2467 num_elements_to_emit *
2468 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
2469 32);
2470 v3dv_return_if_oom(cmd_buffer, NULL);
2471
2472 struct v3dv_bo *assembly_bo = pipeline->shared_data->assembly_bo;
2473
2474 if (pipeline->has_gs) {
2475 emit_gs_shader_state_record(job,
2476 assembly_bo,
2477 gs_bin_variant,
2478 cmd_buffer->state.uniforms.gs_bin,
2479 gs_variant,
2480 cmd_buffer->state.uniforms.gs);
2481
2482 emit_tes_gs_common_params(job,
2483 prog_data_gs->out_prim_type,
2484 prog_data_gs->num_invocations);
2485
2486 emit_tes_gs_shader_params(job,
2487 pipeline->vpm_cfg_bin.gs_width,
2488 pipeline->vpm_cfg_bin.Gd,
2489 pipeline->vpm_cfg_bin.Gv);
2490
2491 emit_tes_gs_shader_params(job,
2492 pipeline->vpm_cfg.gs_width,
2493 pipeline->vpm_cfg.Gd,
2494 pipeline->vpm_cfg.Gv);
2495 }
2496
2497 #if V3D_VERSION == 42
2498 struct v3dv_bo *default_attribute_values =
2499 pipeline->default_attribute_values != NULL ?
2500 pipeline->default_attribute_values :
2501 pipeline->device->default_attribute_float;
2502 #endif
2503
2504 #if V3D_VERSION >= 71
2505 if (v3d_device_has_draw_index(&pipeline->device->devinfo)) {
2506 cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD_DRAW_INDEX,
2507 pipeline->shader_state_record, shader) {
2508 shader.min_coord_shader_input_segments_required_in_play =
2509 pipeline->vpm_cfg_bin.As;
2510 shader.min_vertex_shader_input_segments_required_in_play =
2511 pipeline->vpm_cfg.As;
2512 shader.coordinate_shader_code_address =
2513 v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
2514 shader.vertex_shader_code_address =
2515 v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
2516 shader.fragment_shader_code_address =
2517 v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);
2518 shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
2519 shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
2520 shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
2521 shader.any_shader_reads_hardware_written_primitive_id =
2522 (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
2523 shader.insert_primitive_id_as_first_varying_to_fragment_shader =
2524 !pipeline->has_gs && prog_data_fs->uses_pid;
2525 }
2526 } else
2527 #endif
2528 {
2529 cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_RECORD,
2530 pipeline->shader_state_record, shader) {
2531 /* FIXME: we are setting these values here and during the
2532 * prepacking. This is because both cl_emit_with_prepacked and v3dvx_pack
2533 * assert on minimum values of these. It would be good to get
2534 * v3dvx_pack to assert on the final value if possible.
2535 */
2536 shader.min_coord_shader_input_segments_required_in_play =
2537 pipeline->vpm_cfg_bin.As;
2538 shader.min_vertex_shader_input_segments_required_in_play =
2539 pipeline->vpm_cfg.As;
2540
2541 shader.coordinate_shader_code_address =
2542 v3dv_cl_address(assembly_bo, vs_bin_variant->assembly_offset);
2543 shader.vertex_shader_code_address =
2544 v3dv_cl_address(assembly_bo, vs_variant->assembly_offset);
2545 shader.fragment_shader_code_address =
2546 v3dv_cl_address(assembly_bo, fs_variant->assembly_offset);
2547
2548 shader.coordinate_shader_uniforms_address = cmd_buffer->state.uniforms.vs_bin;
2549 shader.vertex_shader_uniforms_address = cmd_buffer->state.uniforms.vs;
2550 shader.fragment_shader_uniforms_address = cmd_buffer->state.uniforms.fs;
2551
2552 #if V3D_VERSION == 42
2553 shader.address_of_default_attribute_values =
2554 v3dv_cl_address(default_attribute_values, 0);
2555 #endif
2556
2557 shader.any_shader_reads_hardware_written_primitive_id =
2558 (pipeline->has_gs && prog_data_gs->uses_pid) || prog_data_fs->uses_pid;
2559 shader.insert_primitive_id_as_first_varying_to_fragment_shader =
2560 !pipeline->has_gs && prog_data_fs->uses_pid;
2561 }
2562 }
2563
2564 /* Upload vertex element attributes (SHADER_STATE_ATTRIBUTE_RECORD) */
2565 bool cs_loaded_any = false;
2566 const bool cs_uses_builtins = prog_data_vs_bin->uses_iid ||
2567 prog_data_vs_bin->uses_biid ||
2568 prog_data_vs_bin->uses_vid;
2569 const uint32_t packet_length =
2570 cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
2571
2572 uint32_t emitted_va_count = 0;
2573 for (uint32_t i = 0; emitted_va_count < pipeline->va_count; i++) {
2574 assert(i < MAX_VERTEX_ATTRIBS);
2575
2576 if (pipeline->va[i].vk_format == VK_FORMAT_UNDEFINED)
2577 continue;
2578
2579 const uint32_t binding = pipeline->va[i].binding;
2580
2581 /* We store each vertex attribute in the array using its driver location
2582 * as index.
2583 */
2584 const uint32_t location = i;
2585
2586 struct v3dv_vertex_binding *c_vb = &cmd_buffer->state.vertex_bindings[binding];
2587
2588 cl_emit_with_prepacked(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD,
2589 &pipeline->vertex_attrs[i * packet_length], attr) {
2590
2591 assert(c_vb->buffer->mem->bo);
2592 attr.address = v3dv_cl_address(c_vb->buffer->mem->bo,
2593 c_vb->buffer->mem_offset +
2594 pipeline->va[i].offset +
2595 c_vb->offset);
2596
2597 attr.number_of_values_read_by_coordinate_shader =
2598 prog_data_vs_bin->vattr_sizes[location];
2599 attr.number_of_values_read_by_vertex_shader =
2600 prog_data_vs->vattr_sizes[location];
2601
2602 /* GFXH-930: At least one attribute must be enabled and read by CS
2603 * and VS. If we have attributes being consumed by the VS but not
2604 * the CS, then set up a dummy load of the last attribute into the
2605 * CS's VPM inputs. (Since CS is just dead-code-elimination compared
2606 * to VS, we can't have CS loading but not VS).
2607 *
2608 * GFXH-1602: first attribute must be active if using builtins.
2609 */
2610 if (prog_data_vs_bin->vattr_sizes[location])
2611 cs_loaded_any = true;
2612
2613 if (i == 0 && cs_uses_builtins && !cs_loaded_any) {
2614 attr.number_of_values_read_by_coordinate_shader = 1;
2615 cs_loaded_any = true;
2616 } else if (i == pipeline->va_count - 1 && !cs_loaded_any) {
2617 attr.number_of_values_read_by_coordinate_shader = 1;
2618 cs_loaded_any = true;
2619 }
2620
2621 attr.stride =
2622 cmd_buffer->vk.dynamic_graphics_state.vi_binding_strides[binding];
2623
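/* maximum_index bounds what the HW may fetch for this attribute: with a
 * zero stride every vertex reads the same record so 1 is enough; otherwise
 * we clamp to the number of whole records that fit in the bound range
 * (0xffffff assuming a 24-bit field).
 */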
2624 attr.maximum_index = attr.stride == 0 ?
2625 1u : MIN2(0xffffffu, c_vb->size / attr.stride);
2626 }
2627
2628 emitted_va_count++;
2629 }
2630
2631 if (pipeline->va_count == 0) {
2632 /* GFXH-930: At least one attribute must be enabled and read
2633 * by CS and VS. If we have no attributes being consumed by
2634 * the shader, set up a dummy to be loaded into the VPM.
2635 */
2636 cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
2637 /* Valid address of data whose value will be unused. */
2638 attr.address = v3dv_cl_address(job->indirect.bo, 0);
2639
2640 attr.type = ATTRIBUTE_FLOAT;
2641 attr.stride = 0;
2642 attr.vec_size = 1;
2643
2644 attr.number_of_values_read_by_coordinate_shader = 1;
2645 attr.number_of_values_read_by_vertex_shader = 1;
2646 }
2647 }
2648
2649 if (cmd_buffer->state.dirty & V3DV_CMD_DIRTY_PIPELINE) {
2650 v3dv_cl_ensure_space_with_branch(&job->bcl,
2651 sizeof(pipeline->vcm_cache_size));
2652 v3dv_return_if_oom(cmd_buffer, NULL);
2653
2654 cl_emit_prepacked(&job->bcl, &pipeline->vcm_cache_size);
2655 }
2656
2657 v3dv_cl_ensure_space_with_branch(&job->bcl,
2658 cl_packet_length(GL_SHADER_STATE));
2659 v3dv_return_if_oom(cmd_buffer, NULL);
2660
2661 if (pipeline->has_gs) {
2662 cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) {
2663 state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
2664 state.number_of_attribute_arrays = num_elements_to_emit;
2665 }
2666 } else {
2667 cl_emit(&job->bcl, GL_SHADER_STATE, state) {
2668 state.address = v3dv_cl_address(job->indirect.bo, shader_rec_offset);
2669 state.number_of_attribute_arrays = num_elements_to_emit;
2670 }
2671 }
2672
2673 /* Clearing push constants and descriptor sets for all stages is not quite
2674 * correct (some shader stages may not be used at all or they may not be
2675 * consuming push constants); however, this is not relevant because if we
2676 * bind a different pipeline we always have to rebuild the uniform streams.
2677 */
2678 cmd_buffer->state.dirty &= ~(V3DV_CMD_DIRTY_VERTEX_BUFFER |
2679 V3DV_CMD_DIRTY_DESCRIPTOR_SETS |
2680 V3DV_CMD_DIRTY_PUSH_CONSTANTS);
2681 cmd_buffer->state.dirty_descriptor_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
2682 cmd_buffer->state.dirty_push_constants_stages &= ~VK_SHADER_STAGE_ALL_GRAPHICS;
2683 }
2684
2685 void
2686 v3dX(cmd_buffer_emit_draw)(struct v3dv_cmd_buffer *cmd_buffer,
2687 struct v3dv_draw_info *info)
2688 {
2689 struct v3dv_job *job = cmd_buffer->state.job;
2690 assert(job);
2691 const struct vk_dynamic_graphics_state *dyn =
2692 &cmd_buffer->vk.dynamic_graphics_state;
2693 uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
2694
2695 if (info->first_instance > 0) {
2696 v3dv_cl_ensure_space_with_branch(
2697 &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
2698 v3dv_return_if_oom(cmd_buffer, NULL);
2699
2700 cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
2701 base.base_instance = info->first_instance;
2702 base.base_vertex = 0;
2703 }
2704 }
2705
2706 if (info->instance_count > 1) {
2707 v3dv_cl_ensure_space_with_branch(
2708 &job->bcl, cl_packet_length(VERTEX_ARRAY_INSTANCED_PRIMS));
2709 v3dv_return_if_oom(cmd_buffer, NULL);
2710
2711 cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
2712 prim.mode = hw_prim_type;
2713 prim.index_of_first_vertex = info->first_vertex;
2714 prim.number_of_instances = info->instance_count;
2715 prim.instance_length = info->vertex_count;
2716 }
2717 } else {
2718 v3dv_cl_ensure_space_with_branch(
2719 &job->bcl, cl_packet_length(VERTEX_ARRAY_PRIMS));
2720 v3dv_return_if_oom(cmd_buffer, NULL);
2721 cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
2722 prim.mode = hw_prim_type;
2723 prim.length = info->vertex_count;
2724 prim.index_of_first_vertex = info->first_vertex;
2725 }
2726 }
2727 }
2728
2729 void
2730 v3dX(cmd_buffer_emit_index_buffer)(struct v3dv_cmd_buffer *cmd_buffer)
2731 {
2732 struct v3dv_job *job = cmd_buffer->state.job;
2733 assert(job);
2734
2735 /* We flag all state as dirty when we create a new job so make sure we
2736 * have a valid index buffer before attempting to emit state for it.
2737 */
2738 struct v3dv_buffer *ibuffer =
2739 v3dv_buffer_from_handle(cmd_buffer->state.index_buffer.buffer);
2740 if (ibuffer) {
2741 v3dv_cl_ensure_space_with_branch(
2742 &job->bcl, cl_packet_length(INDEX_BUFFER_SETUP));
2743 v3dv_return_if_oom(cmd_buffer, NULL);
2744
2745 const uint32_t offset = ibuffer->mem_offset +
2746 cmd_buffer->state.index_buffer.offset;
2747 assert(ibuffer->mem->bo->size >= offset);
2748 cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
2749 ib.address = v3dv_cl_address(ibuffer->mem->bo, offset);
2750 ib.size = cmd_buffer->state.index_buffer.size;
2751 }
2752 }
2753
2754 cmd_buffer->state.dirty &= ~V3DV_CMD_DIRTY_INDEX_BUFFER;
2755 }
2756
2757 void
2758 v3dX(cmd_buffer_emit_draw_indexed)(struct v3dv_cmd_buffer *cmd_buffer,
2759 uint32_t indexCount,
2760 uint32_t instanceCount,
2761 uint32_t firstIndex,
2762 int32_t vertexOffset,
2763 uint32_t firstInstance)
2764 {
2765 struct v3dv_job *job = cmd_buffer->state.job;
2766 assert(job);
2767
2768 const struct vk_dynamic_graphics_state *dyn =
2769 &cmd_buffer->vk.dynamic_graphics_state;
2770 uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
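/* Vulkan index sizes are 1, 2 or 4 bytes, so ffs(size) - 1 yields 0, 1 or
 * 2 respectively, e.g. ffs(4) - 1 == 2 for VK_INDEX_TYPE_UINT32, which is
 * the encoding the index_type field expects here.
 */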
2771 uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
2772 uint32_t index_offset = firstIndex * cmd_buffer->state.index_buffer.index_size;
2773
2774 if (vertexOffset != 0 || firstInstance != 0) {
2775 v3dv_cl_ensure_space_with_branch(
2776 &job->bcl, cl_packet_length(BASE_VERTEX_BASE_INSTANCE));
2777 v3dv_return_if_oom(cmd_buffer, NULL);
2778
2779 cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
2780 base.base_instance = firstInstance;
2781 base.base_vertex = vertexOffset;
2782 }
2783 }
2784
2785 if (instanceCount == 1) {
2786 v3dv_cl_ensure_space_with_branch(
2787 &job->bcl, cl_packet_length(INDEXED_PRIM_LIST));
2788 v3dv_return_if_oom(cmd_buffer, NULL);
2789
2790 cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) {
2791 prim.index_type = index_type;
2792 prim.length = indexCount;
2793 prim.index_offset = index_offset;
2794 prim.mode = hw_prim_type;
2795 prim.enable_primitive_restarts = dyn->ia.primitive_restart_enable;
2796 }
2797 } else if (instanceCount > 1) {
2798 v3dv_cl_ensure_space_with_branch(
2799 &job->bcl, cl_packet_length(INDEXED_INSTANCED_PRIM_LIST));
2800 v3dv_return_if_oom(cmd_buffer, NULL);
2801
2802 cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) {
2803 prim.index_type = index_type;
2804 prim.index_offset = index_offset;
2805 prim.mode = hw_prim_type;
2806 prim.enable_primitive_restarts = dyn->ia.primitive_restart_enable;
2807 prim.number_of_instances = instanceCount;
2808 prim.instance_length = indexCount;
2809 }
2810 }
2811 }
2812
2813 void
2814 v3dX(cmd_buffer_emit_draw_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
2815 struct v3dv_buffer *buffer,
2816 VkDeviceSize offset,
2817 uint32_t drawCount,
2818 uint32_t stride)
2819 {
2820 struct v3dv_job *job = cmd_buffer->state.job;
2821 assert(job);
2822
2823 const struct vk_dynamic_graphics_state *dyn =
2824 &cmd_buffer->vk.dynamic_graphics_state;
2825 uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
2826
2827 v3dv_cl_ensure_space_with_branch(
2828 &job->bcl, cl_packet_length(INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS));
2829 v3dv_return_if_oom(cmd_buffer, NULL);
2830
2831 cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
2832 prim.mode = hw_prim_type;
2833 prim.number_of_draw_indirect_array_records = drawCount;
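/* The stride is encoded in 4-byte units (hence the >> 2); Vulkan already
 * requires indirect draw strides to be multiples of 4.
 */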
2834 prim.stride_in_multiples_of_4_bytes = stride >> 2;
2835 prim.address = v3dv_cl_address(buffer->mem->bo,
2836 buffer->mem_offset + offset);
2837 }
2838 }
2839
2840 void
2841 v3dX(cmd_buffer_emit_indexed_indirect)(struct v3dv_cmd_buffer *cmd_buffer,
2842 struct v3dv_buffer *buffer,
2843 VkDeviceSize offset,
2844 uint32_t drawCount,
2845 uint32_t stride)
2846 {
2847 struct v3dv_job *job = cmd_buffer->state.job;
2848 assert(job);
2849
2850 const struct vk_dynamic_graphics_state *dyn =
2851 &cmd_buffer->vk.dynamic_graphics_state;
2852 uint32_t hw_prim_type = v3dv_pipeline_primitive(dyn->ia.primitive_topology);
2853 uint8_t index_type = ffs(cmd_buffer->state.index_buffer.index_size) - 1;
2854
2855 v3dv_cl_ensure_space_with_branch(
2856 &job->bcl, cl_packet_length(INDIRECT_INDEXED_INSTANCED_PRIM_LIST));
2857 v3dv_return_if_oom(cmd_buffer, NULL);
2858
2859 cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) {
2860 prim.index_type = index_type;
2861 prim.mode = hw_prim_type;
2862 prim.enable_primitive_restarts = dyn->ia.primitive_restart_enable;
2863 prim.number_of_draw_indirect_indexed_records = drawCount;
2864 prim.stride_in_multiples_of_4_bytes = stride >> 2;
2865 prim.address = v3dv_cl_address(buffer->mem->bo,
2866 buffer->mem_offset + offset);
2867 }
2868 }
2869
2870 void
2871 v3dX(cmd_buffer_suspend)(struct v3dv_cmd_buffer *cmd_buffer)
2872 {
2873 struct v3dv_job *job = cmd_buffer->state.job;
2874 assert(job);
2875
2876 job->suspending = true;
2877
2878 v3dv_cl_ensure_space_with_branch(&job->bcl, cl_packet_length(BRANCH));
2879
2880 job->suspend_branch_inst_ptr = cl_start(&job->bcl);
2881 cl_emit(&job->bcl, BRANCH, branch) {
2882 branch.address = v3dv_cl_address(NULL, 0);
2883 }
2884
2885 /* The sim complains if the command list ends with a branch */
2886 cl_emit(&job->bcl, NOP, nop);
2887 }
2888
2889 void
2890 v3dX(job_patch_resume_address)(struct v3dv_job *first_suspend,
2891 struct v3dv_job *suspend,
2892 struct v3dv_job *resume)
2893 {
2894 assert(resume && resume->resuming);
2895 assert(first_suspend && first_suspend->suspending);
2896 assert(suspend && suspend->suspending);
2897 assert(suspend->suspend_branch_inst_ptr != NULL);
2898
2899 struct v3dv_bo *resume_bo =
2900 list_first_entry(&resume->bcl.bo_list, struct v3dv_bo, list_link);
2901 struct cl_packet_struct(BRANCH) branch = {
2902 cl_packet_header(BRANCH),
2903 };
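/* We pack the BRANCH directly over the placeholder emitted in
 * v3dX(cmd_buffer_suspend) rather than going through cl_emit, since the
 * suspend job's BCL is already closed; resume_bo->offset is taken to be
 * the GPU virtual address of the resume job's first BCL buffer.
 */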
2904 branch.address = v3dv_cl_address(NULL, resume_bo->offset);
2905
2906 uint8_t *rewrite_addr = (uint8_t *) suspend->suspend_branch_inst_ptr;
2907 cl_packet_pack(BRANCH)(NULL, rewrite_addr, &branch);
2908
2909 if (resume != first_suspend) {
2910 set_foreach(resume->bos, entry) {
2911 struct v3dv_bo *bo = (void *)entry->key;
2912 v3dv_job_add_bo(first_suspend, bo);
2913 }
2914 }
2915
2916 first_suspend->suspended_bcl_end = resume->bcl.bo->offset +
2917 v3dv_cl_offset(&resume->bcl);
2918 }
2919
2920 static void
2921 job_destroy_cb(VkDevice device, uint64_t pobj, VkAllocationCallbacks *allocb)
2922 {
2923 struct v3dv_job *clone = (struct v3dv_job *) (uintptr_t) pobj;
2924 v3dv_job_destroy(clone);
2925 }
2926
2927 /**
2928 * This checks if the command buffer has been created with
2929 * VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT, in which case we won't be
2930 * able to safely patch the resume address into the job (since we could have
2931 * another instance of this job running on the GPU, potentially resuming at a
2932 * different address). In that case, we clone the job and make the clone have
2933 * its own BCL copied from the original job so we can later patch the resume
2934 * address into it safely.
2935 */
2936 struct v3dv_job *
2937 v3dX(cmd_buffer_prepare_suspend_job_for_submit)(struct v3dv_job *job)
2938 {
2939 assert(job->suspending);
2940 assert(job->cmd_buffer);
2941 assert(job->type == V3DV_JOB_TYPE_GPU_CL);
2942
2943 if (!(job->cmd_buffer->usage_flags & VK_COMMAND_BUFFER_USAGE_SIMULTANEOUS_USE_BIT))
2944 return job;
2945
2946 /* Create the clone job, but skip the BCL since we are going to create
2947 * our own below.
2948 */
2949 struct v3dv_job *clone = v3dv_job_clone(job, true);
2950 if (!clone)
2951 return NULL;
2952
2953 /* Compute total size of BCL we need to copy */
2954 uint32_t bcl_size = 0;
2955 list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link)
2956 bcl_size += bo->size;
2957
2958 /* Prepare the BCL for the cloned job. For this we go over the BOs in the
2959 * BCL of the original job and we copy their contents into the single BO
2960 * in the BCL of the cloned job.
2961 */
2962 clone->clone_owns_bcl = true;
2963 v3dv_cl_init(clone, &clone->bcl);
2964 v3dv_cl_ensure_space(&clone->bcl, bcl_size, 4);
2965 if (!clone->bcl.bo)
2966 return NULL;
2967
2968 assert(clone->bcl.base);
2969 assert(clone->bcl.base == clone->bcl.next);
2970
2971 /* Unlink this job from the command buffer's execution list */
2972 list_inithead(&clone->list_link);
2973
2974 /* Copy the contents of each BO in the original job's BCL into the single
2975 * BO we have in the clone's BCL.
2976 *
2977 * If the BO is the last in the BCL (which we can tell because it wouldn't
2978 * have emitted a BRANCH instruction to link to another BO) we need to copy
2979 * up to the current BCL offset, otherwise we need to copy up to the BRANCH
2980 * instruction (excluded, since we are putting everything together into a
2981 * single BO here).
2982 */
2983 list_for_each_entry(struct v3dv_bo, bo, &job->bcl.bo_list, list_link) {
2984 assert(bo->map);
2985 uint32_t copy_size;
2986 if (bo->cl_branch_offset == 0xffffffff) { /* Last BO in BCL */
2987 assert(bo == list_last_entry(&job->bcl.bo_list, struct v3dv_bo, list_link));
2988 copy_size = v3dv_cl_offset(&job->bcl);
2989 } else {
2990 assert(bo->cl_branch_offset >= cl_packet_length(BRANCH));
2991 copy_size = bo->cl_branch_offset - cl_packet_length(BRANCH);
2992 }
2993
2994 assert(v3dv_cl_offset(&job->bcl) + copy_size < bcl_size);
2995 memcpy(cl_start(&clone->bcl), bo->map, copy_size);
2996 cl_advance_and_end(&clone->bcl, copy_size);
2997 }
2998
2999 /* Now we need to fixup the pointer to the suspend BRANCH instruction at the
3000 * end of the BCL so it points to the address in the new BCL. We know that
3001 * to suspend a command buffer we always emit a BRANCH+NOP combo, so we just
3002 * need to go back that many bytes into the BCL to find the instruction.
3003 */
3004 uint32_t suspend_terminator_size =
3005 cl_packet_length(BRANCH) + cl_packet_length(NOP);
3006 clone->suspend_branch_inst_ptr = (struct v3dv_cl_out *)
3007 (((uint8_t *)cl_start(&clone->bcl)) - suspend_terminator_size);
3008 assert(*(((uint8_t *)clone->suspend_branch_inst_ptr)) == V3DX(BRANCH_opcode));
3009
3010 /* This job is not in the execution list of the command buffer so it
3011 * won't be destroyed with it; add it as a private object to get it freed.
3012 *
3013 * FIXME: every time this job is submitted we clone the job and we only
3014 * destroy it when the command buffer is destroyed. If the user keeps the
3015 * command buffer for the entire lifetime of the application, this command
3016 * buffer could grow significantly, so maybe we want to do something smarter
3017 * like having a syncobj bound to these jobs and every time we submit the
3018 * command buffer again we first check these syncobjs to see if we can free
3019 * some of these clones so we avoid blowing up memory.
3020 */
3021 v3dv_cmd_buffer_add_private_obj(
3022 job->cmd_buffer, (uintptr_t)clone,
3023 (v3dv_cmd_buffer_private_obj_destroy_cb)job_destroy_cb);
3024
3025 return clone;
3026 }
3027