xref: /aosp_15_r20/external/mesa3d/src/broadcom/vulkan/v3dvx_meta_common.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
/*
 * Copyright © 2021 Raspberry Pi Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "v3dv_private.h"
#include "v3dv_meta_common.h"

#include "broadcom/common/v3d_macros.h"
#include "broadcom/common/v3d_tfu.h"
#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"
#include "broadcom/compiler/v3d_compiler.h"

struct rcl_clear_info {
   const union v3dv_clear_value *clear_value;
   struct v3dv_image *image;
   VkImageAspectFlags aspects;
   uint32_t level;
};

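/* Emits the common prologue of a meta operation's render control list: the
 * TILE_RENDERING_MODE_CFG packets (including the clear colors when
 * clear_info is provided), the Z/S clear values, and the initial tile list
 * block size. Returns the RCL so callers can keep appending to it, or NULL
 * on OOM.
 */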
static struct v3dv_cl *
emit_rcl_prologue(struct v3dv_job *job,
                  struct v3dv_meta_framebuffer *fb,
                  const struct rcl_clear_info *clear_info)
{
   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   struct v3dv_cl *rcl = &job->rcl;
   v3dv_cl_ensure_space_with_branch(rcl, 200 +
                                    tiling->layers * 256 *
                                    cl_packet_length(SUPERTILE_COORDINATES));
   if (job->cmd_buffer->state.oom)
      return NULL;

   assert(!tiling->msaa || !tiling->double_buffer);
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COMMON, config) {
      config.early_z_disable = true;
      config.image_width_pixels = tiling->width;
      config.image_height_pixels = tiling->height;
      config.number_of_render_targets = 1;
      config.multisample_mode_4x = tiling->msaa;
      config.double_buffer_in_non_ms_mode = tiling->double_buffer;
#if V3D_VERSION == 42
      config.maximum_bpp_of_all_render_targets = tiling->internal_bpp;
#endif
#if V3D_VERSION >= 71
      config.log2_tile_width = log2_tile_size(tiling->tile_width);
      config.log2_tile_height = log2_tile_size(tiling->tile_height);
      /* FIXME: ideally we would like the next assert on the packet header
       * (as it is generic, it would also apply to GL). We would need to
       * expand gen_pack_header for that.
       */
      assert(config.log2_tile_width == config.log2_tile_height ||
             config.log2_tile_width == config.log2_tile_height + 1);
#endif
      config.internal_depth_type = fb->internal_depth_type;
   }

   const uint32_t *color = NULL;
   if (clear_info && (clear_info->aspects & VK_IMAGE_ASPECT_COLOR_BIT)) {
      UNUSED uint32_t clear_pad = 0;
      if (clear_info->image) {
         const struct v3dv_image *image = clear_info->image;

         /* From vkCmdClearColorImage:
          *   "image must not use any of the formats that require a sampler
          *    YCBCR conversion"
          */
         assert(image->plane_count == 1);
         const struct v3d_resource_slice *slice =
            &image->planes[0].slices[clear_info->level];
         if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
             slice->tiling == V3D_TILING_UIF_XOR) {
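            /* A sketch of the rationale (our assumption, not from the
             * original comments): the hardware derives an implicit UIF
             * padded height from the frame height, so when the slice's
             * actual padding diverges far enough from that guess (15+ UIF
             * blocks here) we pass the real padded height explicitly via
             * the clear-color packets below.
             */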
            int uif_block_height = v3d_utile_height(image->planes[0].cpp) * 2;

            uint32_t implicit_padded_height =
               align(tiling->height, uif_block_height) / uif_block_height;

            if (slice->padded_height_of_output_image_in_uif_blocks -
                implicit_padded_height >= 15) {
               clear_pad = slice->padded_height_of_output_image_in_uif_blocks;
            }
         }
      }

      color = &clear_info->clear_value->color[0];

#if V3D_VERSION == 42
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART1, clear) {
         clear.clear_color_low_32_bits = color[0];
         clear.clear_color_next_24_bits = color[1] & 0x00ffffff;
         clear.render_target_number = 0;
      };

      if (tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART2, clear) {
            clear.clear_color_mid_low_32_bits =
              ((color[1] >> 24) | (color[2] << 8));
            clear.clear_color_mid_high_24_bits =
              ((color[2] >> 24) | ((color[3] & 0xffff) << 8));
            clear.render_target_number = 0;
         };
      }

      if (tiling->internal_bpp >= V3D_INTERNAL_BPP_128 || clear_pad) {
         cl_emit(rcl, TILE_RENDERING_MODE_CFG_CLEAR_COLORS_PART3, clear) {
            clear.uif_padded_height_in_uif_blocks = clear_pad;
            clear.clear_color_high_16_bits = color[3] >> 16;
            clear.render_target_number = 0;
         };
      }
#endif
   }

#if V3D_VERSION == 42
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_COLOR, rt) {
      rt.render_target_0_internal_bpp = tiling->internal_bpp;
      rt.render_target_0_internal_type = fb->internal_type;
      rt.render_target_0_clamp = V3D_RENDER_TARGET_CLAMP_NONE;
   }
#endif

#if V3D_VERSION >= 71
   cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART1, rt) {
      if (color)
         rt.clear_color_low_bits = color[0];
      rt.internal_bpp = tiling->internal_bpp;
      rt.internal_type_and_clamping = v3dX(clamp_for_format_and_type)(fb->internal_type,
                                                                      fb->vk_format);
      rt.stride =
         v3d_compute_rt_row_row_stride_128_bits(tiling->tile_width,
                                                v3d_internal_bpp_words(rt.internal_bpp));
      rt.base_address = 0;
      rt.render_target_number = 0;
   }

   if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_64) {
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART2, rt) {
         rt.clear_color_mid_bits = /* 40 bits (32 + 8) */
            ((uint64_t) color[1]) |
            (((uint64_t) (color[2] & 0xff)) << 32);
         rt.render_target_number = 0;
      }
   }

   if (color && tiling->internal_bpp >= V3D_INTERNAL_BPP_128) {
      cl_emit(rcl, TILE_RENDERING_MODE_CFG_RENDER_TARGET_PART3, rt) {
         rt.clear_color_top_bits = /* 56 bits (24 + 32) */
            (((uint64_t) (color[2] & 0xffffff00)) >> 8) |
            (((uint64_t) (color[3])) << 24);
         rt.render_target_number = 0;
      }
   }
#endif

   cl_emit(rcl, TILE_RENDERING_MODE_CFG_ZS_CLEAR_VALUES, clear) {
      clear.z_clear_value = clear_info ? clear_info->clear_value->z : 1.0f;
      clear.stencil_clear_value = clear_info ? clear_info->clear_value->s : 0;
   };

   cl_emit(rcl, TILE_LIST_INITIAL_BLOCK_SIZE, init) {
      init.use_auto_chained_tile_lists = true;
      init.size_of_first_block_in_chained_tile_lists =
         TILE_ALLOCATION_BLOCK_SIZE_64B;
   }

   return rcl;
}

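/* Emits the per-frame RCL setup common to all the meta operations below:
 * the tile allocation base address for the first layer to process, the
 * supertile configuration, and the initial dummy tile stores that implement
 * the GFXH-1742 workaround (which also perform the TLB clear, when
 * requested).
 */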
static void
emit_frame_setup(struct v3dv_job *job,
                 uint32_t min_layer,
                 const union v3dv_clear_value *clear_value)
{
   v3dv_return_if_oom(NULL, job);

   const struct v3dv_frame_tiling *tiling = &job->frame_tiling;

   struct v3dv_cl *rcl = &job->rcl;

   const uint32_t tile_alloc_offset =
      64 * min_layer * tiling->draw_tiles_x * tiling->draw_tiles_y;
   cl_emit(rcl, MULTICORE_RENDERING_TILE_LIST_SET_BASE, list) {
      list.address = v3dv_cl_address(job->tile_alloc, tile_alloc_offset);
   }

   cl_emit(rcl, MULTICORE_RENDERING_SUPERTILE_CFG, config) {
      config.number_of_bin_tile_lists = 1;
      config.total_frame_width_in_tiles = tiling->draw_tiles_x;
      config.total_frame_height_in_tiles = tiling->draw_tiles_y;

      config.supertile_width_in_tiles = tiling->supertile_width;
      config.supertile_height_in_tiles = tiling->supertile_height;

      config.total_frame_width_in_supertiles =
         tiling->frame_width_in_supertiles;
      config.total_frame_height_in_supertiles =
         tiling->frame_height_in_supertiles;
   }

   /* Implement the GFXH-1742 workaround. Also, if we are clearing, we have
    * to do it here.
    */
   for (int i = 0; i < 2; i++) {
      cl_emit(rcl, TILE_COORDINATES, coords);
      cl_emit(rcl, END_OF_LOADS, end);
      cl_emit(rcl, STORE_TILE_BUFFER_GENERAL, store) {
         store.buffer_to_store = NONE;
      }
      /* When using double-buffering, we need to clear both buffers (unless
       * we only have a single tile to render).
       */
      if (clear_value &&
          (i == 0 || v3dv_do_double_initial_tile_clear(tiling))) {
#if V3D_VERSION == 42
         cl_emit(rcl, CLEAR_TILE_BUFFERS, clear) {
            clear.clear_z_stencil_buffer = true;
            clear.clear_all_render_targets = true;
         }
#endif
#if V3D_VERSION >= 71
         cl_emit(rcl, CLEAR_RENDER_TARGETS, clear);
#endif
      }
      cl_emit(rcl, END_OF_TILE_MARKER, end);
   }

   cl_emit(rcl, FLUSH_VCD_CACHE, flush);
}

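/* Walks the framebuffer's supertile grid and emits one SUPERTILE_COORDINATES
 * packet per supertile; the RCL executes the generic tile list once for each
 * of these coordinates.
 */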
static void
emit_supertile_coordinates(struct v3dv_job *job,
                           struct v3dv_meta_framebuffer *framebuffer)
{
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl *rcl = &job->rcl;

   const uint32_t min_y = framebuffer->min_y_supertile;
   const uint32_t max_y = framebuffer->max_y_supertile;
   const uint32_t min_x = framebuffer->min_x_supertile;
   const uint32_t max_x = framebuffer->max_x_supertile;

   for (int y = min_y; y <= max_y; y++) {
      for (int x = min_x; x <= max_x; x++) {
         cl_emit(rcl, SUPERTILE_COORDINATES, coords) {
            coords.column_number_in_supertiles = x;
            coords.row_number_in_supertiles = y;
         }
      }
   }
}

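/* Buffers have no tiling layout, so buffer copies load/store the TLB in
 * raster order, treating the buffer as a raster image with an explicit
 * stride.
 */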
static void
emit_linear_load(struct v3dv_cl *cl,
                 uint32_t buffer,
                 struct v3dv_bo *bo,
                 uint32_t offset,
                 uint32_t stride,
                 uint32_t format)
{
   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
      load.buffer_to_load = buffer;
      load.address = v3dv_cl_address(bo, offset);
      load.input_image_format = format;
      load.memory_format = V3D_TILING_RASTER;
      load.height_in_ub_or_stride = stride;
      load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}

static void
emit_linear_store(struct v3dv_cl *cl,
                  uint32_t buffer,
                  struct v3dv_bo *bo,
                  uint32_t offset,
                  uint32_t stride,
                  bool msaa,
                  uint32_t format)
{
   cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
      store.buffer_to_store = RENDER_TARGET_0;
      store.address = v3dv_cl_address(bo, offset);
      store.clear_buffer_being_stored = false;
      store.output_image_format = format;
      store.memory_format = V3D_TILING_RASTER;
      store.height_in_ub_or_stride = stride;
      store.decimate_mode = msaa ? V3D_DECIMATE_MODE_ALL_SAMPLES :
                                   V3D_DECIMATE_MODE_SAMPLE_0;
   }
}

/* This chooses a tile buffer format that is appropriate for the copy
 * operation. Typically, this is the image render target type; however, if we
 * are copying depth/stencil to/from a buffer the hardware can't do raster
 * loads/stores, so we need to load and store to/from a tile color buffer
 * using a compatible color format.
 */
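/* For example, for VK_FORMAT_D24_UNORM_S8_UINT the function below resolves
 * to:
 *
 *    copy direction     aspect    load format   store format
 *    image -> buffer    depth     RGBA8UI       RGBA8UI
 *    image -> buffer    stencil   RGBA8UI       R8UI
 *    buffer -> image    depth     RGBA8UI       RGBA8UI
 *    buffer -> image    stencil   R8UI          RGBA8UI
 */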
static uint32_t
choose_tlb_format(struct v3dv_meta_framebuffer *framebuffer,
                  VkImageAspectFlags aspect,
                  bool for_store,
                  bool is_copy_to_buffer,
                  bool is_copy_from_buffer)
{
   /* At this point the framebuffer was already lowered to single-plane */
   assert(framebuffer->format->plane_count == 1);

   if (is_copy_to_buffer || is_copy_from_buffer) {
      switch (framebuffer->vk_format) {
      case VK_FORMAT_D16_UNORM:
         return V3D_OUTPUT_IMAGE_FORMAT_R16UI;
      case VK_FORMAT_D32_SFLOAT:
         return V3D_OUTPUT_IMAGE_FORMAT_R32F;
      case VK_FORMAT_X8_D24_UNORM_PACK32:
         return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
      case VK_FORMAT_D24_UNORM_S8_UINT:
         /* When storing the stencil aspect of a combined depth/stencil image
          * to a buffer, the Vulkan spec states that the output buffer must
          * have packed stencil values, so we choose an R8UI format for our
          * store outputs. For the load input we still want RGBA8UI since the
          * source image contains 4 channels (including the 3 channels
          * containing the 24-bit depth value).
          *
          * When loading the stencil aspect of a combined depth/stencil image
          * from a buffer, we read packed 8-bit stencil values from the buffer
          * that we need to put into the LSB of the 32-bit format (the R
          * channel), so we use R8UI. For the store, if we used R8UI then we
          * would write 8-bit stencil values consecutively over depth channels,
          * so we need to use RGBA8UI. This will write each stencil value in
          * its correct position, but will overwrite the depth values (channels
          * G, B, A) with undefined values. To fix this, we will have to
          * restore the depth aspect from the Z tile buffer (which we should
          * pre-load from the image before the store).
          */
         if (aspect & VK_IMAGE_ASPECT_DEPTH_BIT) {
            return V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
         } else {
            assert(aspect & VK_IMAGE_ASPECT_STENCIL_BIT);
            if (is_copy_to_buffer) {
               return for_store ? V3D_OUTPUT_IMAGE_FORMAT_R8UI :
                                  V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
            } else {
               assert(is_copy_from_buffer);
               return for_store ? V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI :
                                  V3D_OUTPUT_IMAGE_FORMAT_R8UI;
            }
         }
      default: /* Color formats */
         return framebuffer->format->planes[0].rt_type;
      }
   } else {
      return framebuffer->format->planes[0].rt_type;
   }
}

static inline bool
format_needs_rb_swap(struct v3dv_device *device,
                     VkFormat format)
{
   /* We call these helpers for framebuffer formats, which at this point
    * should be single-plane.
    */
   assert(vk_format_get_plane_count(format) == 1);
   const uint8_t *swizzle = v3dv_get_format_swizzle(device, format, 0);
   return v3dv_format_swizzle_needs_rb_swap(swizzle);
}

static inline bool
format_needs_reverse(struct v3dv_device *device,
                     VkFormat format)
{
   /* We call these helpers for framebuffer formats, which at this point
    * should be single-plane.
    */
   assert(vk_format_get_plane_count(format) == 1);
   const uint8_t *swizzle = v3dv_get_format_swizzle(device, format, 0);
   return v3dv_format_swizzle_needs_reverse(swizzle);
}

static void
emit_image_load(struct v3dv_device *device,
                struct v3dv_cl *cl,
                struct v3dv_meta_framebuffer *framebuffer,
                struct v3dv_image *image,
                VkImageAspectFlags aspect,
                uint32_t layer,
                uint32_t mip_level,
                bool is_copy_to_buffer,
                bool is_copy_from_buffer)
{
   uint8_t plane = v3dv_plane_from_aspect(aspect);
   uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer, plane);

   /* For multi-plane formats we copy plane by plane to the color TLB. The
    * framebuffer format was already selected to be a TLB-compatible
    * single-plane format. We still need to use the real plane to get the
    * address etc. from the source image.
    */
   assert(framebuffer->format->plane_count == 1);
   /* For image to/from buffer copies we always load to and store from RT0,
    * even for depth/stencil aspects, because the hardware can't do raster
    * stores or loads from/to the depth/stencil tile buffers.
    */
   bool load_to_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
                            image->format->plane_count > 1 ||
                            aspect == VK_IMAGE_ASPECT_COLOR_BIT;

   const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level];
   cl_emit(cl, LOAD_TILE_BUFFER_GENERAL, load) {
      load.buffer_to_load = load_to_color_tlb ?
         RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);

      load.address = v3dv_cl_address(image->planes[plane].mem->bo, layer_offset);
      load.input_image_format = choose_tlb_format(framebuffer, aspect, false,
                                                  is_copy_to_buffer,
                                                  is_copy_from_buffer);
      load.memory_format = slice->tiling;

      /* When copying depth/stencil images to a buffer, for D24 formats Vulkan
       * expects the depth value in the LSB bits of each 32-bit pixel.
       * Unfortunately, the hardware seems to put the S8/X8 bits there and the
       * depth bits in the MSBs. To work around that we can reverse the channel
       * order and then swap the R/B channels to get what we want.
       *
       * NOTE: reversing and swapping only gets us the behavior we want if the
       * operations happen in that exact order, which seems to be the case when
       * done on the tile buffer load operations. On the store, it seems the
       * order is not the same. The order on the store is probably reversed so
       * that reversing and swapping on both the load and the store preserves
       * the original order of the channels in memory.
       *
       * Notice that we only need to do this when copying to a buffer, where
       * depth and stencil aspects are copied as separate regions and
       * the spec expects them to be tightly packed.
       */
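      /* For example, when copying the depth aspect of a
       * VK_FORMAT_X8_D24_UNORM_PACK32 image to a buffer, both flags are set
       * below so that the 24 depth bits end up in the LSBs of each 32-bit
       * pixel, as the spec requires.
       */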
      bool needs_rb_swap = false;
      bool needs_chan_reverse = false;
      if (is_copy_to_buffer &&
         (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
          (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
           (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
         needs_rb_swap = true;
         needs_chan_reverse = true;
      } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
                 (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
         /* This is not a raw data copy (i.e. we are clearing the image),
          * so we need to make sure we respect the format swizzle.
          */
         needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
         needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format);
      }

      load.r_b_swap = needs_rb_swap;
      load.channel_reverse = needs_chan_reverse;

      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         load.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         load.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         load.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else
         load.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}

static void
emit_image_store(struct v3dv_device *device,
                 struct v3dv_cl *cl,
                 struct v3dv_meta_framebuffer *framebuffer,
                 struct v3dv_image *image,
                 VkImageAspectFlags aspect,
                 uint32_t layer,
                 uint32_t mip_level,
                 bool is_copy_to_buffer,
                 bool is_copy_from_buffer,
                 bool is_multisample_resolve)
{
   uint8_t plane = v3dv_plane_from_aspect(aspect);
   uint32_t layer_offset = v3dv_layer_offset(image, mip_level, layer, plane);

   /* For multi-plane formats we copy plane by plane to the color TLB. The
    * framebuffer format was already selected to be a TLB-compatible
    * single-plane format. We still need to use the real plane to get the
    * address etc.
    */
   assert(framebuffer->format->plane_count == 1);

   bool store_from_color_tlb = is_copy_to_buffer || is_copy_from_buffer ||
                               image->format->plane_count > 1 ||
                               aspect == VK_IMAGE_ASPECT_COLOR_BIT;

   const struct v3d_resource_slice *slice = &image->planes[plane].slices[mip_level];
   cl_emit(cl, STORE_TILE_BUFFER_GENERAL, store) {
      store.buffer_to_store = store_from_color_tlb ?
         RENDER_TARGET_0 : v3dX(zs_buffer_from_aspect_bits)(aspect);

      store.address = v3dv_cl_address(image->planes[plane].mem->bo, layer_offset);

      store.clear_buffer_being_stored = false;

      /* See rationale in emit_image_load() */
      bool needs_rb_swap = false;
      bool needs_chan_reverse = false;
      if (is_copy_from_buffer &&
         (framebuffer->vk_format == VK_FORMAT_X8_D24_UNORM_PACK32 ||
          (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT &&
           (aspect & VK_IMAGE_ASPECT_DEPTH_BIT)))) {
         needs_rb_swap = true;
         needs_chan_reverse = true;
      } else if (!is_copy_from_buffer && !is_copy_to_buffer &&
                 (aspect & VK_IMAGE_ASPECT_COLOR_BIT)) {
         needs_rb_swap = format_needs_rb_swap(device, framebuffer->vk_format);
         needs_chan_reverse = format_needs_reverse(device, framebuffer->vk_format);
      }

      store.r_b_swap = needs_rb_swap;
      store.channel_reverse = needs_chan_reverse;

      store.output_image_format = choose_tlb_format(framebuffer, aspect, true,
                                                    is_copy_to_buffer,
                                                    is_copy_from_buffer);
      store.memory_format = slice->tiling;
      if (slice->tiling == V3D_TILING_UIF_NO_XOR ||
          slice->tiling == V3D_TILING_UIF_XOR) {
         store.height_in_ub_or_stride =
            slice->padded_height_of_output_image_in_uif_blocks;
      } else if (slice->tiling == V3D_TILING_RASTER) {
         store.height_in_ub_or_stride = slice->stride;
      }

      if (image->vk.samples > VK_SAMPLE_COUNT_1_BIT)
         store.decimate_mode = V3D_DECIMATE_MODE_ALL_SAMPLES;
      else if (is_multisample_resolve)
         store.decimate_mode = V3D_DECIMATE_MODE_4X;
      else
         store.decimate_mode = V3D_DECIMATE_MODE_SAMPLE_0;
   }
}

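/* All the *_per_tile_list() helpers below share the same structure: they
 * build a generic tile list in the job's indirect CL (implicit tile
 * coordinates, the TLB loads, END_OF_LOADS, a branch to the implicit tile
 * list, the TLB stores, END_OF_TILE_MARKER and RETURN_FROM_SUB_LIST), and
 * then hook it into the RCL with START_ADDRESS_OF_GENERIC_TILE_LIST so that
 * it runs once per supertile coordinate emitted afterwards.
 */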
static void
emit_copy_layer_to_buffer_per_tile_list(struct v3dv_job *job,
                                        struct v3dv_meta_framebuffer *framebuffer,
                                        struct v3dv_buffer *buffer,
                                        struct v3dv_image *image,
                                        uint32_t layer_offset,
                                        const VkBufferImageCopy2 *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   /* Load image to TLB */
   assert((image->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < image->vk.array_layers) ||
          layer_offset < image->vk.extent.depth);

   const uint32_t image_layer = image->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->imageSubresource.baseArrayLayer + layer_offset :
      region->imageOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, image,
                   region->imageSubresource.aspectMask,
                   image_layer,
                   region->imageSubresource.mipLevel,
                   true, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   /* Store TLB to buffer */
   uint32_t width, height;
   if (region->bufferRowLength == 0)
      width = region->imageExtent.width;
   else
      width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      height = region->imageExtent.height;
   else
      height = region->bufferImageHeight;

   /* Handle copy from compressed format */
   width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format));
   height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format));

   /* If we are storing stencil from a combined depth/stencil format the
    * Vulkan spec states that the output buffer must have packed stencil
    * values, where each stencil value is 1 byte.
    */
   uint8_t plane = v3dv_plane_from_aspect(region->imageSubresource.aspectMask);
   uint32_t cpp =
      region->imageSubresource.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
      1 : image->planes[plane].cpp;
   uint32_t buffer_stride = width * cpp;
   uint32_t buffer_offset = buffer->mem_offset + region->bufferOffset +
                            height * buffer_stride * layer_offset;
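   /* Illustrative numbers (hypothetical): a 256x128 copy of an R8G8B8A8
    * image (cpp = 4) with bufferRowLength = 0 gives buffer_stride =
    * 256 * 4 = 1024 bytes, and layer 2 starts at
    * bufferOffset + 128 * 1024 * 2 bytes into the buffer.
    */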

   uint32_t format = choose_tlb_format(framebuffer,
                                       region->imageSubresource.aspectMask,
                                       true, true, false);
   bool msaa = image->vk.samples > VK_SAMPLE_COUNT_1_BIT;

   emit_linear_store(cl, RENDER_TARGET_0, buffer->mem->bo,
                     buffer_offset, buffer_stride, msaa, format);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}

static void
emit_copy_layer_to_buffer(struct v3dv_job *job,
                          struct v3dv_buffer *buffer,
                          struct v3dv_image *image,
                          struct v3dv_meta_framebuffer *framebuffer,
                          uint32_t layer,
                          const VkBufferImageCopy2 *region)
{
   emit_copy_layer_to_buffer_per_tile_list(job, framebuffer, buffer,
                                           image, layer, region);
   emit_supertile_coordinates(job, framebuffer);
}

void
v3dX(meta_emit_copy_image_to_buffer_rcl)(struct v3dv_job *job,
                                         struct v3dv_buffer *buffer,
                                         struct v3dv_image *image,
                                         struct v3dv_meta_framebuffer *framebuffer,
                                         const VkBufferImageCopy2 *region)
{
   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, NULL);
   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
      emit_copy_layer_to_buffer(job, buffer, image, framebuffer, layer, region);
   cl_emit(rcl, END_OF_RENDERING, end);
}

static void
emit_resolve_image_layer_per_tile_list(struct v3dv_job *job,
                                       struct v3dv_meta_framebuffer *framebuffer,
                                       struct v3dv_image *dst,
                                       struct v3dv_image *src,
                                       uint32_t layer_offset,
                                       const VkImageResolve2 *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((src->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < src->vk.array_layers) ||
          layer_offset < src->vk.extent.depth);

   const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer + layer_offset :
      region->srcOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, src,
                   region->srcSubresource.aspectMask,
                   src_layer,
                   region->srcSubresource.mipLevel,
                   false, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   assert((dst->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < dst->vk.array_layers) ||
          layer_offset < dst->vk.extent.depth);

   const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer + layer_offset :
      region->dstOffset.z + layer_offset;

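   /* The resolve destination is single-sampled: for color aspects we request
    * a 4x decimated store from emit_image_store(), while depth/stencil
    * resolves store sample 0 only.
    */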
   bool is_depth_or_stencil =
      region->dstSubresource.aspectMask &
      (VK_IMAGE_ASPECT_DEPTH_BIT | VK_IMAGE_ASPECT_STENCIL_BIT);
   emit_image_store(job->device, cl, framebuffer, dst,
                    region->dstSubresource.aspectMask,
                    dst_layer,
                    region->dstSubresource.mipLevel,
                    false, false, !is_depth_or_stencil);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}

static void
emit_resolve_image_layer(struct v3dv_job *job,
                         struct v3dv_image *dst,
                         struct v3dv_image *src,
                         struct v3dv_meta_framebuffer *framebuffer,
                         uint32_t layer,
                         const VkImageResolve2 *region)
{
   emit_resolve_image_layer_per_tile_list(job, framebuffer,
                                          dst, src, layer, region);
   emit_supertile_coordinates(job, framebuffer);
}

void
v3dX(meta_emit_resolve_image_rcl)(struct v3dv_job *job,
                                  struct v3dv_image *dst,
                                  struct v3dv_image *src,
                                  struct v3dv_meta_framebuffer *framebuffer,
                                  const VkImageResolve2 *region)
{
   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, NULL);
   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
      emit_resolve_image_layer(job, dst, src, framebuffer, layer, region);
   cl_emit(rcl, END_OF_RENDERING, end);
}

static void
emit_copy_buffer_per_tile_list(struct v3dv_job *job,
                               struct v3dv_bo *dst,
                               struct v3dv_bo *src,
                               uint32_t dst_offset,
                               uint32_t src_offset,
                               uint32_t stride,
                               uint32_t format)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   emit_linear_load(cl, RENDER_TARGET_0, src, src_offset, stride, format);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   emit_linear_store(cl, RENDER_TARGET_0,
                     dst, dst_offset, stride, false, format);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}

void
v3dX(meta_emit_copy_buffer)(struct v3dv_job *job,
                            struct v3dv_bo *dst,
                            struct v3dv_bo *src,
                            uint32_t dst_offset,
                            uint32_t src_offset,
                            struct v3dv_meta_framebuffer *framebuffer,
                            uint32_t format,
                            uint32_t item_size)
{
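   /* The frame tiling is presumably sized by the caller (see
    * framebuffer_size_for_pixel_count() at the end of this file) so that
    * width * height covers the copy; each TLB row then spans
    * item_size * frame_tiling.width bytes of both buffers.
    */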
   const uint32_t stride = job->frame_tiling.width * item_size;
   emit_copy_buffer_per_tile_list(job, dst, src,
                                  dst_offset, src_offset,
                                  stride, format);
   emit_supertile_coordinates(job, framebuffer);
}

void
v3dX(meta_emit_copy_buffer_rcl)(struct v3dv_job *job,
                                struct v3dv_bo *dst,
                                struct v3dv_bo *src,
                                uint32_t dst_offset,
                                uint32_t src_offset,
                                struct v3dv_meta_framebuffer *framebuffer,
                                uint32_t format,
                                uint32_t item_size)
{
   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, NULL);

   v3dX(meta_emit_copy_buffer)(job, dst, src, dst_offset, src_offset,
                               framebuffer, format, item_size);

   cl_emit(rcl, END_OF_RENDERING, end);
}

static void
emit_copy_image_layer_per_tile_list(struct v3dv_job *job,
                                    struct v3dv_meta_framebuffer *framebuffer,
                                    struct v3dv_image *dst,
                                    struct v3dv_image *src,
                                    uint32_t layer_offset,
                                    const VkImageCopy2 *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((src->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < src->vk.array_layers) ||
          layer_offset < src->vk.extent.depth);

   const uint32_t src_layer = src->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->srcSubresource.baseArrayLayer + layer_offset :
      region->srcOffset.z + layer_offset;

   emit_image_load(job->device, cl, framebuffer, src,
                   region->srcSubresource.aspectMask,
                   src_layer,
                   region->srcSubresource.mipLevel,
                   false, false);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   assert((dst->vk.image_type != VK_IMAGE_TYPE_3D &&
           layer_offset < dst->vk.array_layers) ||
          layer_offset < dst->vk.extent.depth);

   const uint32_t dst_layer = dst->vk.image_type != VK_IMAGE_TYPE_3D ?
      region->dstSubresource.baseArrayLayer + layer_offset :
      region->dstOffset.z + layer_offset;

   emit_image_store(job->device, cl, framebuffer, dst,
                    region->dstSubresource.aspectMask,
                    dst_layer,
                    region->dstSubresource.mipLevel,
                    false, false, false);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}

static void
emit_copy_image_layer(struct v3dv_job *job,
                      struct v3dv_image *dst,
                      struct v3dv_image *src,
                      struct v3dv_meta_framebuffer *framebuffer,
                      uint32_t layer,
                      const VkImageCopy2 *region)
{
   emit_copy_image_layer_per_tile_list(job, framebuffer, dst, src, layer, region);
   emit_supertile_coordinates(job, framebuffer);
}

void
v3dX(meta_emit_copy_image_rcl)(struct v3dv_job *job,
                               struct v3dv_image *dst,
                               struct v3dv_image *src,
                               struct v3dv_meta_framebuffer *framebuffer,
                               const VkImageCopy2 *region)
{
   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, NULL);
   for (int layer = 0; layer < job->frame_tiling.layers; layer++)
      emit_copy_image_layer(job, dst, src, framebuffer, layer, region);
   cl_emit(rcl, END_OF_RENDERING, end);
}

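/* Builds a TFU (Texture Formatting Unit) job that converts a width x height
 * region between the given source and destination tilings. The job is
 * submitted through its own DRM interface (struct drm_v3d_submit_tfu) via
 * v3dv_cmd_buffer_add_tfu_job() rather than being emitted into a CL.
 */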
void
v3dX(meta_emit_tfu_job)(struct v3dv_cmd_buffer *cmd_buffer,
                        uint32_t dst_bo_handle,
                        uint32_t dst_offset,
                        enum v3d_tiling_mode dst_tiling,
                        uint32_t dst_padded_height_or_stride,
                        uint32_t dst_cpp,
                        uint32_t src_bo_handle,
                        uint32_t src_offset,
                        enum v3d_tiling_mode src_tiling,
                        uint32_t src_padded_height_or_stride,
                        uint32_t src_cpp,
                        uint32_t width,
                        uint32_t height,
                        const struct v3dv_format_plane *format_plane)
{
   struct drm_v3d_submit_tfu tfu = {
      .ios = (height << 16) | width,
      .bo_handles = {
         dst_bo_handle,
         src_bo_handle != dst_bo_handle ? src_bo_handle : 0
      },
   };

   tfu.iia |= src_offset;

#if V3D_VERSION <= 42
   if (src_tiling == V3D_TILING_RASTER) {
      tfu.icfg = V3D33_TFU_ICFG_FORMAT_RASTER << V3D33_TFU_ICFG_FORMAT_SHIFT;
   } else {
      tfu.icfg = (V3D33_TFU_ICFG_FORMAT_LINEARTILE +
                  (src_tiling - V3D_TILING_LINEARTILE)) <<
                   V3D33_TFU_ICFG_FORMAT_SHIFT;
   }
   tfu.icfg |= format_plane->tex_type << V3D33_TFU_ICFG_TTYPE_SHIFT;
#endif
#if V3D_VERSION >= 71
   if (src_tiling == V3D_TILING_RASTER) {
      tfu.icfg = V3D71_TFU_ICFG_FORMAT_RASTER << V3D71_TFU_ICFG_IFORMAT_SHIFT;
   } else {
      tfu.icfg = (V3D71_TFU_ICFG_FORMAT_LINEARTILE +
                  (src_tiling - V3D_TILING_LINEARTILE)) <<
                   V3D71_TFU_ICFG_IFORMAT_SHIFT;
   }
   tfu.icfg |= format_plane->tex_type << V3D71_TFU_ICFG_OTYPE_SHIFT;
#endif

   tfu.ioa = dst_offset;

#if V3D_VERSION <= 42
   tfu.ioa |= (V3D33_TFU_IOA_FORMAT_LINEARTILE +
               (dst_tiling - V3D_TILING_LINEARTILE)) <<
                V3D33_TFU_IOA_FORMAT_SHIFT;
#endif

#if V3D_VERSION >= 71
   tfu.v71.ioc = (V3D71_TFU_IOC_FORMAT_LINEARTILE +
                  (dst_tiling - V3D_TILING_LINEARTILE)) <<
                   V3D71_TFU_IOC_FORMAT_SHIFT;

   switch (dst_tiling) {
   case V3D_TILING_UIF_NO_XOR:
   case V3D_TILING_UIF_XOR:
      tfu.v71.ioc |=
         (dst_padded_height_or_stride / (2 * v3d_utile_height(dst_cpp))) <<
         V3D71_TFU_IOC_STRIDE_SHIFT;
      break;
   case V3D_TILING_RASTER:
      tfu.v71.ioc |= (dst_padded_height_or_stride / dst_cpp) <<
                      V3D71_TFU_IOC_STRIDE_SHIFT;
      break;
   default:
      break;
   }
#endif

   switch (src_tiling) {
   case V3D_TILING_UIF_NO_XOR:
   case V3D_TILING_UIF_XOR:
      tfu.iis |= src_padded_height_or_stride / (2 * v3d_utile_height(src_cpp));
      break;
   case V3D_TILING_RASTER:
      tfu.iis |= src_padded_height_or_stride / src_cpp;
      break;
   default:
      break;
   }

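   /* Note the unit conversions above: for UIF layouts the stride fields take
    * UIF blocks (a UIF block is two utiles tall, hence the division by
    * 2 * v3d_utile_height(cpp)), while for raster they take pixels (bytes
    * divided by cpp).
    */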
   /* The TFU can handle raster sources but always produces UIF results */
   assert(dst_tiling != V3D_TILING_RASTER);

#if V3D_VERSION <= 42
   /* If we're writing level 0 (!IOA_DIMTW), then we need to supply the
    * OPAD field for the destination (how many extra UIF blocks beyond
    * those necessary to cover the height).
    */
   if (dst_tiling == V3D_TILING_UIF_NO_XOR || dst_tiling == V3D_TILING_UIF_XOR) {
      uint32_t uif_block_h = 2 * v3d_utile_height(dst_cpp);
      uint32_t implicit_padded_height = align(height, uif_block_h);
      uint32_t icfg = (dst_padded_height_or_stride - implicit_padded_height) /
                      uif_block_h;
      tfu.icfg |= icfg << V3D33_TFU_ICFG_OPAD_SHIFT;
   }
#endif

   v3dv_cmd_buffer_add_tfu_job(cmd_buffer, &tfu);
}

static void
emit_clear_image_layer_per_tile_list(struct v3dv_job *job,
                                     struct v3dv_meta_framebuffer *framebuffer,
                                     struct v3dv_image *image,
                                     VkImageAspectFlags aspects,
                                     uint32_t layer,
                                     uint32_t level)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   emit_image_store(job->device, cl, framebuffer, image, aspects,
                    layer, level, false, false, false);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}

static void
emit_clear_image_layers(struct v3dv_job *job,
                        struct v3dv_image *image,
                        struct v3dv_meta_framebuffer *framebuffer,
                        VkImageAspectFlags aspects,
                        uint32_t min_layer,
                        uint32_t max_layer,
                        uint32_t level)
{
   for (uint32_t layer = min_layer; layer < max_layer; layer++) {
      emit_clear_image_layer_per_tile_list(job, framebuffer, image, aspects,
                                           layer, level);
      emit_supertile_coordinates(job, framebuffer);
   }
}

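/* Clearing works by programming the clear value in the RCL prologue and
 * then storing the tile buffer, which was never loaded and therefore holds
 * the clear value, to every layer and supertile of the image.
 */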
void
v3dX(meta_emit_clear_image_rcl)(struct v3dv_job *job,
                                struct v3dv_image *image,
                                struct v3dv_meta_framebuffer *framebuffer,
                                const union v3dv_clear_value *clear_value,
                                VkImageAspectFlags aspects,
                                uint32_t min_layer,
                                uint32_t max_layer,
                                uint32_t level)
{
   const struct rcl_clear_info clear_info = {
      .clear_value = clear_value,
      .image = image,
      .aspects = aspects,
      .level = level,
   };

   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, clear_value);
   emit_clear_image_layers(job, image, framebuffer, aspects,
                           min_layer, max_layer, level);
   cl_emit(rcl, END_OF_RENDERING, end);
}

static void
emit_fill_buffer_per_tile_list(struct v3dv_job *job,
                               struct v3dv_bo *bo,
                               uint32_t offset,
                               uint32_t stride)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   cl_emit(cl, END_OF_LOADS, end);

   cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);

   emit_linear_store(cl, RENDER_TARGET_0, bo, offset, stride, false,
                     V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI);

   cl_emit(cl, END_OF_TILE_MARKER, end);

   cl_emit(cl, RETURN_FROM_SUB_LIST, ret);

   cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
      branch.start = tile_list_start;
      branch.end = v3dv_cl_get_address(cl);
   }
}

static void
emit_fill_buffer(struct v3dv_job *job,
                 struct v3dv_bo *bo,
                 uint32_t offset,
                 struct v3dv_meta_framebuffer *framebuffer)
{
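   /* The fill is stored as an RGBA8UI raster image (see
    * emit_fill_buffer_per_tile_list), so each pixel covers 4 bytes of the
    * buffer.
    */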
   const uint32_t stride = job->frame_tiling.width * 4;
   emit_fill_buffer_per_tile_list(job, bo, offset, stride);
   emit_supertile_coordinates(job, framebuffer);
}

void
v3dX(meta_emit_fill_buffer_rcl)(struct v3dv_job *job,
                                struct v3dv_bo *bo,
                                uint32_t offset,
                                struct v3dv_meta_framebuffer *framebuffer,
                                uint32_t data)
{
   const union v3dv_clear_value clear_value = {
      .color = { data, 0, 0, 0 },
   };

   const struct rcl_clear_info clear_info = {
      .clear_value = &clear_value,
      .image = NULL,
      .aspects = VK_IMAGE_ASPECT_COLOR_BIT,
      .level = 0,
   };

   struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, &clear_info);
   v3dv_return_if_oom(NULL, job);

   emit_frame_setup(job, 0, &clear_value);
   emit_fill_buffer(job, bo, offset, framebuffer);
   cl_emit(rcl, END_OF_RENDERING, end);
}

static void
emit_copy_buffer_to_layer_per_tile_list(struct v3dv_job *job,
                                        struct v3dv_meta_framebuffer *framebuffer,
                                        struct v3dv_image *image,
                                        struct v3dv_buffer *buffer,
                                        uint32_t layer,
                                        const VkBufferImageCopy2 *region)
{
   struct v3dv_cl *cl = &job->indirect;
   v3dv_cl_ensure_space(cl, 200, 1);
   v3dv_return_if_oom(NULL, job);

   struct v3dv_cl_reloc tile_list_start = v3dv_cl_get_address(cl);

   cl_emit(cl, TILE_COORDINATES_IMPLICIT, coords);

   assert((image->vk.image_type != VK_IMAGE_TYPE_3D && layer < image->vk.array_layers) ||
          layer < image->vk.extent.depth);

   /* Load TLB from buffer */
   uint32_t width, height;
   if (region->bufferRowLength == 0)
      width = region->imageExtent.width;
   else
      width = region->bufferRowLength;

   if (region->bufferImageHeight == 0)
      height = region->imageExtent.height;
   else
      height = region->bufferImageHeight;

   /* Handle copy to compressed format using a compatible format */
   width = DIV_ROUND_UP(width, vk_format_get_blockwidth(image->vk.format));
   height = DIV_ROUND_UP(height, vk_format_get_blockheight(image->vk.format));

   const VkImageSubresourceLayers *imgrsc = &region->imageSubresource;
   uint8_t plane = v3dv_plane_from_aspect(imgrsc->aspectMask);
   uint32_t cpp = imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT ?
                  1 : image->planes[plane].cpp;
   uint32_t buffer_stride = width * cpp;
   uint32_t buffer_offset =
      buffer->mem_offset + region->bufferOffset + height * buffer_stride * layer;

   uint32_t format = choose_tlb_format(framebuffer, imgrsc->aspectMask,
                                       false, false, true);

   uint32_t image_layer = layer + (image->vk.image_type != VK_IMAGE_TYPE_3D ?
      imgrsc->baseArrayLayer : region->imageOffset.z);

   emit_linear_load(cl, RENDER_TARGET_0, buffer->mem->bo,
                    buffer_offset, buffer_stride, format);

   /* Because we can't do raster loads/stores of Z/S formats we need to
    * use a color tile buffer with a compatible RGBA color format instead.
    * However, when we are uploading a single aspect to a combined
    * depth/stencil image we have the problem that our tile buffer stores don't
    * allow us to mask out the other aspect, so we always write all four RGBA
    * channels to the image and we end up overwriting that other aspect with
    * undefined values. To work around that, we first load the aspect we are
    * not copying from the image memory into a proper Z/S tile buffer. Then we
    * do our store from the color buffer for the aspect we are copying, and
    * after that, we do another store from the Z/S tile buffer to restore the
    * other aspect to its original value.
    */
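   /* Concretely, for a stencil upload to a D24S8 image: we have already
    * loaded the packed stencil bytes from the buffer into RT0 above, the
    * code below loads the depth aspect from the image into the Z tile
    * buffer, and after the tile renders we store RT0 to the image (which
    * clobbers the depth bytes) and then store the Z tile buffer to put the
    * original depth values back.
    */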
1243    if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1244       if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1245          emit_image_load(job->device, cl, framebuffer, image,
1246                          VK_IMAGE_ASPECT_STENCIL_BIT,
1247                          image_layer, imgrsc->mipLevel,
1248                          false, false);
1249       } else {
1250          assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
1251          emit_image_load(job->device, cl, framebuffer, image,
1252                          VK_IMAGE_ASPECT_DEPTH_BIT,
1253                          image_layer, imgrsc->mipLevel,
1254                          false, false);
1255       }
1256    }
1257 
1258    cl_emit(cl, END_OF_LOADS, end);
1259 
1260    cl_emit(cl, BRANCH_TO_IMPLICIT_TILE_LIST, branch);
1261 
1262    /* Store TLB to image */
1263    emit_image_store(job->device, cl, framebuffer, image, imgrsc->aspectMask,
1264                     image_layer, imgrsc->mipLevel,
1265                     false, true, false);
1266 
1267    if (framebuffer->vk_format == VK_FORMAT_D24_UNORM_S8_UINT) {
1268       if (imgrsc->aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT) {
1269          emit_image_store(job->device, cl, framebuffer, image,
1270                           VK_IMAGE_ASPECT_STENCIL_BIT,
1271                           image_layer, imgrsc->mipLevel,
1272                           false, false, false);
1273       } else {
1274          assert(imgrsc->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT);
1275          emit_image_store(job->device, cl, framebuffer, image,
1276                           VK_IMAGE_ASPECT_DEPTH_BIT,
1277                           image_layer, imgrsc->mipLevel,
1278                           false, false, false);
1279       }
1280    }
1281 
1282    cl_emit(cl, END_OF_TILE_MARKER, end);
1283 
1284    cl_emit(cl, RETURN_FROM_SUB_LIST, ret);
1285 
1286    cl_emit(&job->rcl, START_ADDRESS_OF_GENERIC_TILE_LIST, branch) {
1287       branch.start = tile_list_start;
1288       branch.end = v3dv_cl_get_address(cl);
1289    }
1290 }
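
/* Note: the per-tile list built above is recorded in the job's indirect CL;
 * the closing START_ADDRESS_OF_GENERIC_TILE_LIST packet records its bounds
 * in the RCL so the hardware executes it once per tile, bracketed by the
 * implicit coordinates, branch, and return packets emitted above.
 */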
1291 
1292 static void
1293 emit_copy_buffer_to_layer(struct v3dv_job *job,
1294                           struct v3dv_image *image,
1295                           struct v3dv_buffer *buffer,
1296                           struct v3dv_meta_framebuffer *framebuffer,
1297                           uint32_t layer,
1298                           const VkBufferImageCopy2 *region)
1299 {
1300    emit_copy_buffer_to_layer_per_tile_list(job, framebuffer, image, buffer,
1301                                            layer, region);
1302    emit_supertile_coordinates(job, framebuffer);
1303 }
1304 
1305 void
1306 v3dX(meta_emit_copy_buffer_to_image_rcl)(struct v3dv_job *job,
1307                                          struct v3dv_image *image,
1308                                          struct v3dv_buffer *buffer,
1309                                          struct v3dv_meta_framebuffer *framebuffer,
1310                                          const VkBufferImageCopy2 *region)
1311 {
1312    struct v3dv_cl *rcl = emit_rcl_prologue(job, framebuffer, NULL);
1313    v3dv_return_if_oom(NULL, job);
1314 
1315    emit_frame_setup(job, 0, NULL);
1316    for (int layer = 0; layer < job->frame_tiling.layers; layer++)
1317       emit_copy_buffer_to_layer(job, image, buffer, framebuffer, layer, region);
1318    cl_emit(rcl, END_OF_RENDERING, end);
1319 }
1320 
1321 /* Figure out a TLB size configuration for a number of pixels to process.
1322  * Beware that we can't "render" more than MAX_DIM x MAX_DIM pixels in a
1323  * single job; if the pixel count is larger than that, the caller needs to
1324  * split the work and call this function multiple times.
1325  */
1326 static void
1327 framebuffer_size_for_pixel_count(uint32_t num_pixels,
1328                                  uint32_t *width,
1329                                  uint32_t *height)
1330 {
1331    assert(num_pixels > 0);
1332 
1333    const uint32_t max_dim_pixels = V3D_MAX_IMAGE_DIMENSION;
1334    const uint32_t max_pixels = max_dim_pixels * max_dim_pixels;
1335 
1336    uint32_t w, h;
1337    if (num_pixels > max_pixels) {
1338       w = max_dim_pixels;
1339       h = max_dim_pixels;
1340    } else {
1341       w = num_pixels;
1342       h = 1;
1343       while (w > max_dim_pixels || ((w % 2) == 0 && w > 2 * h)) {
1344          w >>= 1;
1345          h <<= 1;
1346       }
1347    }
1348    assert(w <= max_dim_pixels && h <= max_dim_pixels);
1349    assert(w * h <= num_pixels);
1350    assert(w > 0 && h > 0);
1351 
1352    *width = w;
1353    *height = h;
1354 }
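
/* Worked example (a sketch assuming V3D_MAX_IMAGE_DIMENSION is 4096):
 * num_pixels = 8192 walks (8192,1) -> (4096,2) -> ... -> (128,64), and
 * 128 * 64 == 8192 covers every pixel in one job. An odd count such as
 * 4097 settles at (64,64), covering 4096 pixels and leaving the last one
 * for a follow-up job, as the comment above warns.
 */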
1355 
1356 struct v3dv_job *
1357 v3dX(meta_copy_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
1358                        struct v3dv_bo *dst,
1359                        uint32_t dst_offset,
1360                        struct v3dv_bo *src,
1361                        uint32_t src_offset,
1362                        const VkBufferCopy2 *region)
1363 {
1364    const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
1365    const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
1366 
1367    /* Select appropriate pixel format for the copy operation based on the
1368     * size to copy and the alignment of the source and destination offsets.
1369     */
1370    src_offset += region->srcOffset;
1371    dst_offset += region->dstOffset;
1372    uint32_t item_size = 4;
1373    while (item_size > 1 &&
1374           (src_offset % item_size != 0 || dst_offset % item_size != 0)) {
1375       item_size /= 2;
1376    }
1377 
1378    while (item_size > 1 && region->size % item_size != 0)
1379       item_size /= 2;
1380 
1381    assert(region->size % item_size == 0);
1382    uint32_t num_items = region->size / item_size;
1383    assert(num_items > 0);
1384 
1385    uint32_t format;
1386    VkFormat vk_format;
1387    switch (item_size) {
1388    case 4:
1389       format = V3D_OUTPUT_IMAGE_FORMAT_RGBA8UI;
1390       vk_format = VK_FORMAT_R8G8B8A8_UINT;
1391       break;
1392    case 2:
1393       format = V3D_OUTPUT_IMAGE_FORMAT_RG8UI;
1394       vk_format = VK_FORMAT_R8G8_UINT;
1395       break;
1396    default:
1397       format = V3D_OUTPUT_IMAGE_FORMAT_R8UI;
1398       vk_format = VK_FORMAT_R8_UINT;
1399       break;
1400    }
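
   /* Worked example (sketch): src_offset = 6, dst_offset = 10, size = 12
    * drops item_size from 4 to 2 (since 6 % 4 != 0); both offsets and the
    * size are even, so the copy runs as num_items = 6 RG8UI items of 2
    * bytes each.
    */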
1401 
1402    struct v3dv_job *job = NULL;
1403    while (num_items > 0) {
1404       job = v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1405       if (!job)
1406          return NULL;
1407 
1408       uint32_t width, height;
1409       framebuffer_size_for_pixel_count(num_items, &width, &height);
1410 
1411       v3dv_job_start_frame(job, width, height, 1, true, true, 1,
1412                            internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
1413                            false);
1414 
1415       struct v3dv_meta_framebuffer framebuffer;
1416       v3dX(meta_framebuffer_init)(&framebuffer, vk_format, internal_type,
1417                                   &job->frame_tiling);
1418 
1419       v3dX(job_emit_binning_flush)(job);
1420 
1421       v3dX(meta_emit_copy_buffer_rcl)(job, dst, src, dst_offset, src_offset,
1422                                       &framebuffer, format, item_size);
1423 
1424       v3dv_cmd_buffer_finish_job(cmd_buffer);
1425 
1426       const uint32_t items_copied = width * height;
1427       const uint32_t bytes_copied = items_copied * item_size;
1428       num_items -= items_copied;
1429       src_offset += bytes_copied;
1430       dst_offset += bytes_copied;
1431    }
1432 
1433    return job;
1434 }
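
/* Usage sketch (hypothetical caller, illustrative names): a small, aligned
 * copy resolves to a single job:
 *
 *    VkBufferCopy2 region = {
 *       .sType = VK_STRUCTURE_TYPE_BUFFER_COPY_2,
 *       .srcOffset = 0, .dstOffset = 0, .size = 256,
 *    };
 *    v3dX(meta_copy_buffer)(cmd_buffer, dst_bo, 0, src_bo, 0, &region);
 *
 * 256 bytes / 4 = 64 RGBA8UI items, which
 * framebuffer_size_for_pixel_count() shapes into an 8x8 frame.
 */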
1435 
1436 void
1437 v3dX(meta_fill_buffer)(struct v3dv_cmd_buffer *cmd_buffer,
1438                        struct v3dv_bo *bo,
1439                        uint32_t offset,
1440                        uint32_t size,
1441                        uint32_t data)
1442 {
1443    assert(size > 0 && size % 4 == 0);
1444    assert(offset + size <= bo->size);
1445 
1446    const uint32_t internal_bpp = V3D_INTERNAL_BPP_32;
1447    const uint32_t internal_type = V3D_INTERNAL_TYPE_8UI;
1448    uint32_t num_items = size / 4;
1449 
1450    while (num_items > 0) {
1451       struct v3dv_job *job =
1452          v3dv_cmd_buffer_start_job(cmd_buffer, -1, V3DV_JOB_TYPE_GPU_CL);
1453       if (!job)
1454          return;
1455 
1456       uint32_t width, height;
1457       framebuffer_size_for_pixel_count(num_items, &width, &height);
1458 
1459       v3dv_job_start_frame(job, width, height, 1, true, true, 1,
1460                            internal_bpp, 4 * v3d_internal_bpp_words(internal_bpp),
1461                            false);
1462 
1463       struct v3dv_meta_framebuffer framebuffer;
1464       v3dX(meta_framebuffer_init)(&framebuffer, VK_FORMAT_R8G8B8A8_UINT,
1465                                   internal_type, &job->frame_tiling);
1466 
1467       v3dX(job_emit_binning_flush)(job);
1468 
1469       v3dX(meta_emit_fill_buffer_rcl)(job, bo, offset, &framebuffer, data);
1470 
1471       v3dv_cmd_buffer_finish_job(cmd_buffer);
1472 
1473       const uint32_t items_copied = width * height;
1474       const uint32_t bytes_copied = items_copied * 4;
1475       num_items -= items_copied;
1476       offset += bytes_copied;
1477    }
1478 }
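
/* Worked example (sketch): a 64 KiB fill is 16384 32-bit items, which
 * framebuffer_size_for_pixel_count() shapes into a single 128x128 job;
 * 'data' is the 32-bit pattern written to every item, matching the
 * assert above that size is a multiple of 4.
 */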
1479 
1480 void
1481 v3dX(meta_framebuffer_init)(struct v3dv_meta_framebuffer *fb,
1482                             VkFormat vk_format,
1483                             uint32_t internal_type,
1484                             const struct v3dv_frame_tiling *tiling)
1485 {
1486    fb->internal_type = internal_type;
1487 
1488    /* Supertile coverage always starts at 0,0 */
1489    uint32_t supertile_w_in_pixels =
1490       tiling->tile_width * tiling->supertile_width;
1491    uint32_t supertile_h_in_pixels =
1492       tiling->tile_height * tiling->supertile_height;
1493 
1494    fb->min_x_supertile = 0;
1495    fb->min_y_supertile = 0;
1496    fb->max_x_supertile = (tiling->width - 1) / supertile_w_in_pixels;
1497    fb->max_y_supertile = (tiling->height - 1) / supertile_h_in_pixels;
1498 
1499    fb->vk_format = vk_format;
1500    fb->format = v3dX(get_format)(vk_format);
1501 
1502    fb->internal_depth_type = V3D_INTERNAL_TYPE_DEPTH_32F;
1503    if (vk_format_is_depth_or_stencil(vk_format))
1504       fb->internal_depth_type = v3dX(get_internal_depth_type)(vk_format);
1505 }
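
/* Worked example (a sketch with assumed tiling values): for an 800x600
 * frame with 64x64 tiles and 2x2 supertiles, one supertile covers 128x128
 * pixels, so max_x_supertile = (800 - 1) / 128 = 6 and
 * max_y_supertile = (600 - 1) / 128 = 4, i.e. a 7x5 grid of supertiles.
 */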
1506