/*
 * Copyright (c) 2011-2013 Luc Verhaegen <[email protected]>
 * Copyright (c) 2018 Alyssa Rosenzweig <[email protected]>
 * Copyright (c) 2018 Vasily Khoruzhick <[email protected]>
 * Copyright (c) 2019 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 *
 */

#include "pan_tiling.h"
#include <stdbool.h>
#include "util/bitscan.h"
#include "util/macros.h"

/*
 * This file implements software encode/decode of u-interleaved textures.
 * See docs/drivers/panfrost.rst for details on the format.
 *
 * The tricky bit is ordering along the space-filling curve:
 *
 *    | y3 | (x3 ^ y3) | y2 | (y2 ^ x2) | y1 | (y1 ^ x1) | y0 | (y0 ^ x0) |
 *
 * While interleaving bits is trivial in hardware, it is nontrivial in software.
 * The trick is to divide the pattern up:
 *
 *      | y3 | y3 | y2 | y2 | y1 | y1 | y0 | y0 |
 *    ^ |  0 | x3 |  0 | x2 |  0 | x1 |  0 | x0 |
 *
 * That is, duplicate the bits of the Y and space out the bits of the X. The top
 * line is a function only of Y, so it can be calculated once per row and stored
 * in a register. The bottom line is simply X with the bits spaced out. Spacing
 * out the X is easy enough with a LUT, or by subtracting+ANDing the mask
 * pattern (abusing carry bits).
 *
 */
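
/*
 * A worked example of the split, for the pixel at (x, y) = (3, 5) in a tile:
 *
 *    y = 5 = 0b0101  ->  duplicated:  0b00110011
 *    x = 3 = 0b0011  ->  spaced out:  0b00000101
 *                        XOR:         0b00110110 = 54
 *
 * so that pixel lives at index 54 of the 256 pixels in its 16x16 tile. This
 * matches the curve above: | y3 | y3^x3 | y2 | y2^x2 | y1 | y1^x1 | y0 | y0^x0 |
 * = 0 0 1 1 0 1 1 0.
 */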

/*
 * Given the lower 4 bits of the Y coordinate, we would like to duplicate
 * every bit over. So instead of 0b1010, we would like 0b11001100. The idea
 * is that the bits in the Y-only positions get a copy of Y, and the bits in
 * the XOR positions *also* get a copy of Y, ready to be XORed with the
 * spaced-out X.
 */
/* clang-format off */
const uint32_t bit_duplication[16] = {
   0b00000000,
   0b00000011,
   0b00001100,
   0b00001111,
   0b00110000,
   0b00110011,
   0b00111100,
   0b00111111,
   0b11000000,
   0b11000011,
   0b11001100,
   0b11001111,
   0b11110000,
   0b11110011,
   0b11111100,
   0b11111111,
};
/* clang-format on */

/*
 * Space out the bits of a 4-bit nibble into the even bit positions, so
 * 0b1111 becomes 0b1010101.
 */
/* clang-format off */
const unsigned space_4[16] = {
   0b0000000,
   0b0000001,
   0b0000100,
   0b0000101,
   0b0010000,
   0b0010001,
   0b0010100,
   0b0010101,
   0b1000000,
   0b1000001,
   0b1000100,
   0b1000101,
   0b1010000,
   0b1010001,
   0b1010100,
   0b1010101
};
/* clang-format on */
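
/*
 * Combining the two tables, the byte-for-byte index of a pixel within a
 * 16x16 tile is:
 *
 *    index = bit_duplication[y & 0xF] ^ space_4[x & 0xF];
 *
 * e.g. (x, y) = (3, 5) gives 0b00110011 ^ 0b0000101 = 54, as derived above.
 */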

/* The scheme uses 16x16 tiles */

#define TILE_WIDTH      16
#define TILE_HEIGHT     16
#define PIXELS_PER_TILE (TILE_WIDTH * TILE_HEIGHT)

/* We need a 128-bit type for idiomatically tiling bpp128 formats. The type
 * need only support copies and sizeof, so emulating it with a packed structure
 * works well enough, but if there's a native 128-bit type we may as well
 * prefer that. */

#ifdef __SIZEOF_INT128__
typedef __uint128_t pan_uint128_t;
#else
typedef struct {
   uint64_t lo;
   uint64_t hi;
} __attribute__((packed)) pan_uint128_t;
#endif
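
/* Pixel sizes that are not powers of two (24, 48 and 96 bits per pixel) get
 * similar packed carrier types; they are only used by the generic (unaligned)
 * path below. */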

typedef struct {
   uint16_t lo;
   uint8_t hi;
} __attribute__((packed)) pan_uint24_t;

typedef struct {
   uint32_t lo;
   uint16_t hi;
} __attribute__((packed)) pan_uint48_t;

typedef struct {
   uint64_t lo;
   uint32_t hi;
} __attribute__((packed)) pan_uint96_t;

/* Optimized routine to tile an aligned texture, one with ((w & 0xF) == 0).
 * Explanation:
 *
 * dest_start precomputes the offset to the beginning of the first horizontal
 * tile we're writing to, knowing that x is 16-aligned. Tiles themselves are
 * stored linearly, so we get the X tile number by shifting and then multiply
 * by the bytes per tile.
 *
 * We iterate across the pixels we're trying to store in source-order. For each
 * row in the destination image, we figure out which row of 16x16 blocks we're
 * in by slicing off the lower 4 bits (block_y).
 *
 * dest then precomputes the location of the top-left corner of the block the
 * row starts in. In pixel coordinates (where the origin is the top-left),
 * (block_y, 0) is the top-left corner of the leftmost tile in this row. While
 * pixels are reordered within a block, the blocks themselves are stored
 * linearly, so multiplying block_y by the pixel stride of the destination
 * image equals the byte offset of the top-left corner of the block this row
 * is in.
 *
 * The source, on the other hand, is linear, so we compute the locations of
 * the start and end of the row in the source by simple linear addressing.
 *
 * For indexing within the tile, we need to XOR with the [y3 y3 y2 y2 y1 y1 y0
 * y0] value. Since this is constant across a row, we look it up per-row and
 * store it in expanded_y.
 *
 * Finally, we iterate each row in source order. The outer loop walks the
 * 16-pixel-wide tiles; within each tile, we iterate over the 16 pixels (this
 * should be unrolled), calculating the index within the tile and writing.
 */
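
/*
 * As a hedged sketch (not used by the driver), the 32bpp instantiation below
 * behaves like this scalar loop, where dst_stride is the byte stride between
 * successive rows of tiles and src_pixels is a hypothetical pixels-per-row
 * for the linear source:
 *
 *    for (uint16_t y = 0; y < h; ++y) {
 *       uint8_t *row = (uint8_t *)dst + (y >> 4) * dst_stride;
 *       unsigned expanded_y = bit_duplication[y & 0xF] << 2;
 *       for (uint16_t x = 0; x < w; ++x) {
 *          unsigned tile = (x >> 4) * PIXELS_PER_TILE * sizeof(uint32_t);
 *          unsigned index = expanded_y ^ (space_4[x & 0xF] << 2);
 *          *(uint32_t *)(row + tile + index) = src[y * src_pixels + x];
 *       }
 *    }
 *
 * The real macro hoists the tile offset into the outer loop, advancing dest
 * by a whole tile per 16-pixel chunk, and unrolls the inner 16 pixels.
 */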

#define TILED_ACCESS_TYPE(pixel_t, shift)                                      \
   static ALWAYS_INLINE void panfrost_access_tiled_image_##pixel_t(            \
      void *dst, void *src, uint16_t sx, uint16_t sy, uint16_t w, uint16_t h,  \
      uint32_t dst_stride, uint32_t src_stride, bool is_store)                 \
   {                                                                           \
      uint8_t *dest_start =                                                    \
         dst + ((sx >> 4) * PIXELS_PER_TILE * sizeof(pixel_t));                \
      for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {                   \
         uint8_t *dest = (uint8_t *)(dest_start + ((y >> 4) * dst_stride));    \
         pixel_t *source = src + (src_y * src_stride);                         \
         pixel_t *source_end = source + w;                                     \
         unsigned expanded_y = bit_duplication[y & 0xF] << shift;              \
         for (; source < source_end; dest += (PIXELS_PER_TILE << shift)) {     \
            for (uint8_t i = 0; i < 16; ++i) {                                 \
               unsigned index = expanded_y ^ (space_4[i] << shift);            \
               if (is_store)                                                   \
                  *((pixel_t *)(dest + index)) = *(source++);                  \
               else                                                            \
                  *(source++) = *((pixel_t *)(dest + index));                  \
            }                                                                  \
         }                                                                     \
      }                                                                        \
   }

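/* Instantiate one specialized accessor per power-of-two pixel size. is_store
 * is a literal at every call site below, so with ALWAYS_INLINE the compiler
 * can fold the branch away, leaving each instantiation a pure load or pure
 * store loop. */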
TILED_ACCESS_TYPE(uint8_t, 0);
TILED_ACCESS_TYPE(uint16_t, 1);
TILED_ACCESS_TYPE(uint32_t, 2);
TILED_ACCESS_TYPE(uint64_t, 3);
TILED_ACCESS_TYPE(pan_uint128_t, 4);

#define TILED_UNALIGNED_TYPE(pixel_t, is_store, tile_shift)                    \
   {                                                                           \
      const unsigned mask = (1 << tile_shift) - 1;                             \
      for (int y = sy, src_y = 0; src_y < h; ++y, ++src_y) {                   \
         unsigned block_start_s = (y >> tile_shift) * dst_stride;              \
         unsigned source_start = src_y * src_stride;                           \
         unsigned expanded_y = bit_duplication[y & mask];                      \
                                                                               \
         for (int x = sx, src_x = 0; src_x < w; ++x, ++src_x) {                \
            unsigned block_x_s = (x >> tile_shift) * (1 << (tile_shift * 2));  \
            unsigned index = expanded_y ^ space_4[x & mask];                   \
            uint8_t *source = src + source_start + sizeof(pixel_t) * src_x;    \
            uint8_t *dest =                                                    \
               dst + block_start_s + sizeof(pixel_t) * (block_x_s + index);    \
                                                                               \
            pixel_t *outp = (pixel_t *)(is_store ? dest : source);             \
            pixel_t *inp = (pixel_t *)(is_store ? source : dest);              \
            *outp = *inp;                                                      \
         }                                                                     \
      }                                                                        \
   }

#define TILED_UNALIGNED_TYPES(store, shift)                                    \
   {                                                                           \
      if (bpp == 8)                                                            \
         TILED_UNALIGNED_TYPE(uint8_t, store, shift)                           \
      else if (bpp == 16)                                                      \
         TILED_UNALIGNED_TYPE(uint16_t, store, shift)                          \
      else if (bpp == 24)                                                      \
         TILED_UNALIGNED_TYPE(pan_uint24_t, store, shift)                      \
      else if (bpp == 32)                                                      \
         TILED_UNALIGNED_TYPE(uint32_t, store, shift)                          \
      else if (bpp == 48)                                                      \
         TILED_UNALIGNED_TYPE(pan_uint48_t, store, shift)                      \
      else if (bpp == 64)                                                      \
         TILED_UNALIGNED_TYPE(uint64_t, store, shift)                          \
      else if (bpp == 96)                                                      \
         TILED_UNALIGNED_TYPE(pan_uint96_t, store, shift)                      \
      else if (bpp == 128)                                                     \
         TILED_UNALIGNED_TYPE(pan_uint128_t, store, shift)                     \
   }

/*
 * Perform a generic access to a tiled image with a given format. This works
 * even for block-compressed images, operating on whole blocks at a time.
 * sx/sy/w/h are specified in pixels, not blocks, but our internal routines
 * work in blocks, so we divide here. Alignment is assumed.
 */
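/* For example, with a hypothetical 4x4 block-compressed format, an access at
 * sx=32, sy=16, w=64, h=48 (in pixels) becomes sx=8, sy=4, w=16, h=12 in
 * blocks, and the "pixel" copied per iteration below is one whole compressed
 * block. */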
static void
panfrost_access_tiled_image_generic(void *dst, void *src, unsigned sx,
                                    unsigned sy, unsigned w, unsigned h,
                                    uint32_t dst_stride, uint32_t src_stride,
                                    const struct util_format_description *desc,
                                    bool _is_store)
{
   unsigned bpp = desc->block.bits;

   /* Convert units */
   sx /= desc->block.width;
   sy /= desc->block.height;
   w = DIV_ROUND_UP(w, desc->block.width);
   h = DIV_ROUND_UP(h, desc->block.height);

   if (desc->block.width > 1) {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 2)
      else
         TILED_UNALIGNED_TYPES(false, 2)
   } else {
      if (_is_store)
         TILED_UNALIGNED_TYPES(true, 4)
      else
         TILED_UNALIGNED_TYPES(false, 4)
   }
}

/* Compute the address of (_x, _y) in the linear buffer, relative to the
 * (orig_x, orig_y) origin the caller captured before splitting the region. */
#define OFFSET(src, _x, _y)                                                    \
   (void *)((uint8_t *)src + ((_y)-orig_y) * src_stride +                      \
            (((_x)-orig_x) * (bpp / 8)))

static ALWAYS_INLINE void
panfrost_access_tiled_image(void *dst, void *src, unsigned x, unsigned y,
                            unsigned w, unsigned h, uint32_t dst_stride,
                            uint32_t src_stride, enum pipe_format format,
                            bool is_store)
{
   const struct util_format_description *desc = util_format_description(format);
   unsigned bpp = desc->block.bits;

   /* Our optimized routines cannot handle unaligned blocks (without depending
    * on platform-specific behaviour), and there is no good reason to do so. If
    * these assertions fail, there is either a driver bug or a non-portable unit
    * test.
    */
   assert((dst_stride % (bpp / 8)) == 0 && "unaligned destination stride");
   assert((src_stride % (bpp / 8)) == 0 && "unaligned source stride");

   if (desc->block.width > 1 ||
       !util_is_power_of_two_nonzero(desc->block.bits)) {
      panfrost_access_tiled_image_generic(
         dst, (void *)src, x, y, w, h, dst_stride, src_stride, desc, is_store);

      return;
   }

   unsigned first_full_tile_x = DIV_ROUND_UP(x, TILE_WIDTH) * TILE_WIDTH;
   unsigned first_full_tile_y = DIV_ROUND_UP(y, TILE_HEIGHT) * TILE_HEIGHT;
   unsigned last_full_tile_x = ((x + w) / TILE_WIDTH) * TILE_WIDTH;
   unsigned last_full_tile_y = ((y + h) / TILE_HEIGHT) * TILE_HEIGHT;
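
   /* The edges not covered by whole tiles take the generic path, leaving the
    * aligned interior for the fast path. E.g. x=5, y=7, w=40, h=30 (so
    * x+w=45, y+h=37) peels off rows 7..15 and 32..36 and columns 5..15 and
    * 32..44, leaving the single full tile spanning 16..31 x 16..31. */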

   /* First, tile the top portion */

   unsigned orig_x = x, orig_y = y;

   if (first_full_tile_y != y) {
      unsigned dist = MIN2(first_full_tile_y - y, h);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), x, y, w, dist,
                                          dst_stride, src_stride, desc,
                                          is_store);

      if (dist == h)
         return;

      y += dist;
      h -= dist;
   }

   /* Next, the bottom portion */
   if (last_full_tile_y != (y + h)) {
      unsigned dist = (y + h) - last_full_tile_y;

      panfrost_access_tiled_image_generic(
         dst, OFFSET(src, x, last_full_tile_y), x, last_full_tile_y, w, dist,
         dst_stride, src_stride, desc, is_store);

      h -= dist;
   }

   /* The left portion */
   if (first_full_tile_x != x) {
      unsigned dist = MIN2(first_full_tile_x - x, w);

      panfrost_access_tiled_image_generic(dst, OFFSET(src, x, y), x, y, dist, h,
                                          dst_stride, src_stride, desc,
                                          is_store);

      if (dist == w)
         return;

      x += dist;
      w -= dist;
   }

   /* Finally, the right portion */
   if (last_full_tile_x != (x + w)) {
      unsigned dist = (x + w) - last_full_tile_x;

      panfrost_access_tiled_image_generic(
         dst, OFFSET(src, last_full_tile_x, y), last_full_tile_x, y, dist, h,
         dst_stride, src_stride, desc, is_store);

      w -= dist;
   }

   if (bpp == 8)
      panfrost_access_tiled_image_uint8_t(dst, OFFSET(src, x, y), x, y, w, h,
                                          dst_stride, src_stride, is_store);
   else if (bpp == 16)
      panfrost_access_tiled_image_uint16_t(dst, OFFSET(src, x, y), x, y, w, h,
                                           dst_stride, src_stride, is_store);
   else if (bpp == 32)
      panfrost_access_tiled_image_uint32_t(dst, OFFSET(src, x, y), x, y, w, h,
                                           dst_stride, src_stride, is_store);
   else if (bpp == 64)
      panfrost_access_tiled_image_uint64_t(dst, OFFSET(src, x, y), x, y, w, h,
                                           dst_stride, src_stride, is_store);
   else if (bpp == 128)
      panfrost_access_tiled_image_pan_uint128_t(
         dst, OFFSET(src, x, y), x, y, w, h, dst_stride, src_stride, is_store);
}

/**
 * Access a tiled image (load or store). Note: the region of interest (x, y, w,
 * h) is specified in pixels, not blocks. It is expected that these quantities
 * are aligned to the block size.
 */
void
panfrost_store_tiled_image(void *dst, const void *src, unsigned x, unsigned y,
                           unsigned w, unsigned h, uint32_t dst_stride,
                           uint32_t src_stride, enum pipe_format format)
{
   panfrost_access_tiled_image(dst, (void *)src, x, y, w, h, dst_stride,
                               src_stride, format, true);
}

void
panfrost_load_tiled_image(void *dst, const void *src, unsigned x, unsigned y,
                          unsigned w, unsigned h, uint32_t dst_stride,
                          uint32_t src_stride, enum pipe_format format)
{
   panfrost_access_tiled_image((void *)src, dst, x, y, w, h, src_stride,
                               dst_stride, format, false);
}