1 /*
2  * Mesa 3-D graphics library
3  *
4  * Copyright 2012 Intel Corporation
5  * Copyright 2013 Google
6  *
7  * Permission is hereby granted, free of charge, to any person obtaining a
8  * copy of this software and associated documentation files (the
9  * "Software"), to deal in the Software without restriction, including
10  * without limitation the rights to use, copy, modify, merge, publish,
11  * distribute, sublicense, and/or sell copies of the Software, and to
12  * permit persons to whom the Software is furnished to do so, subject to
13  * the following conditions:
14  *
15  * The above copyright notice and this permission notice (including the
16  * next paragraph) shall be included in all copies or substantial portions
17  * of the Software.
18  *
19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
22  * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
23  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
24  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
25  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
26  *
27  * Authors:
28  *    Chad Versace <[email protected]>
29  *    Frank Henigman <[email protected]>
30  */
31 
32 #include <string.h>
33 
34 #include "util/macros.h"
35 #include "util/u_math.h"
36 #include "util/rounding.h"
37 #include "isl_priv.h"
38 
39 #if defined(__SSSE3__)
40 #include <tmmintrin.h>
41 #elif defined(__SSE2__)
42 #include <emmintrin.h>
43 #endif
44 
45 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
46 
47 #define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
48 #define ALIGN_UP(a, b) ALIGN(a, b)
49 
/* Tile dimensions.  Width and span are in bytes, height is in pixels (i.e.
 * unitless).  A "span" is the maximum number of bytes we can copy from linear
 * to tiled without needing to calculate a new destination address.
 */
54 static const uint32_t xtile_width = 512;
55 static const uint32_t xtile_height = 8;
56 static const uint32_t xtile_span = 64;
57 static const uint32_t ytile_width = 128;
58 static const uint32_t ytile_height = 32;
59 static const uint32_t ytile_span = 16;
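/* With these dimensions both X and Y tiles cover 4KB: 512B x 8 rows and
 * 128B x 32 rows respectively (512 * 8 = 128 * 32 = 4096).
 */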
60 
61 static inline uint32_t
ror(uint32_t n, uint32_t d)
63 {
64    return (n >> d) | (n << (32 - d));
65 }
66 
67 // bswap32 already exists as a macro on some platforms (FreeBSD)
68 #ifndef bswap32
69 static inline uint32_t
bswap32(uint32_t n)
71 {
72 #if defined(HAVE___BUILTIN_BSWAP32)
73    return __builtin_bswap32(n);
74 #else
75    return (n >> 24) |
76           ((n >> 8) & 0x0000ff00) |
77           ((n << 8) & 0x00ff0000) |
78           (n << 24);
79 #endif
80 }
81 #endif
82 
83 /**
84  * Copy RGBA to BGRA - swap R and B.
85  */
86 static inline void *
rgba8_copy(void *dst, const void *src, size_t bytes)
88 {
89    uint32_t *d = dst;
90    uint32_t const *s = src;
91 
92    assert(bytes % 4 == 0);
93 
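   /* Per pixel on a little-endian CPU: memory bytes R,G,B,A load as
    * 0xAABBGGRR; bswap32() gives 0xRRGGBBAA and ror(..., 8) yields
    * 0xAARRGGBB, which stores back as B,G,R,A.
    */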
94    while (bytes >= 4) {
95       *d = ror(bswap32(*s), 8);
96       d += 1;
97       s += 1;
98       bytes -= 4;
99    }
100    return dst;
101 }
102 
103 #ifdef __SSSE3__
104 static const uint8_t rgba8_permutation[16] =
105    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
106 
107 static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
109 {
110    _mm_store_si128(dst,
111                    _mm_shuffle_epi8(_mm_loadu_si128(src),
112                                     *(__m128i *)rgba8_permutation));
113 }
114 
115 static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
117 {
118    _mm_storeu_si128(dst,
119                     _mm_shuffle_epi8(_mm_load_si128(src),
120                                      *(__m128i *)rgba8_permutation));
121 }
122 
123 #elif defined(__SSE2__)
124 static inline void
rgba8_copy_16_aligned_dst(void *dst, const void *src)
126 {
127    __m128i srcreg, dstreg, agmask, ag, rb, br;
128 
129    agmask = _mm_set1_epi32(0xFF00FF00);
130    srcreg = _mm_loadu_si128((__m128i *)src);
131 
132    rb = _mm_andnot_si128(agmask, srcreg);
133    ag = _mm_and_si128(agmask, srcreg);
134    br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
135                             _MM_SHUFFLE(2, 3, 0, 1));
136    dstreg = _mm_or_si128(ag, br);
137 
138    _mm_store_si128((__m128i *)dst, dstreg);
139 }
140 
141 static inline void
rgba8_copy_16_aligned_src(void *dst, const void *src)
143 {
144    __m128i srcreg, dstreg, agmask, ag, rb, br;
145 
146    agmask = _mm_set1_epi32(0xFF00FF00);
147    srcreg = _mm_load_si128((__m128i *)src);
148 
149    rb = _mm_andnot_si128(agmask, srcreg);
150    ag = _mm_and_si128(agmask, srcreg);
151    br = _mm_shufflehi_epi16(_mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
152                             _MM_SHUFFLE(2, 3, 0, 1));
153    dstreg = _mm_or_si128(ag, br);
154 
155    _mm_storeu_si128((__m128i *)dst, dstreg);
156 }
157 #endif
158 
159 /**
160  * Copy RGBA to BGRA - swap R and B, with the destination 16-byte aligned.
161  */
162 static inline void *
rgba8_copy_aligned_dst(void *dst, const void *src, size_t bytes)
164 {
165    assert(bytes == 0 || !(((uintptr_t)dst) & 0xf));
166 
167 #if defined(__SSSE3__) || defined(__SSE2__)
168    if (bytes == 64) {
169       rgba8_copy_16_aligned_dst(dst +  0, src +  0);
170       rgba8_copy_16_aligned_dst(dst + 16, src + 16);
171       rgba8_copy_16_aligned_dst(dst + 32, src + 32);
172       rgba8_copy_16_aligned_dst(dst + 48, src + 48);
173       return dst;
174    }
175 
176    while (bytes >= 16) {
177       rgba8_copy_16_aligned_dst(dst, src);
178       src += 16;
179       dst += 16;
180       bytes -= 16;
181    }
182 #endif
183 
184    rgba8_copy(dst, src, bytes);
185 
186    return dst;
187 }
188 
189 /**
190  * Copy RGBA to BGRA - swap R and B, with the source 16-byte aligned.
191  */
192 static inline void *
rgba8_copy_aligned_src(void *dst, const void *src, size_t bytes)
194 {
195    assert(bytes == 0 || !(((uintptr_t)src) & 0xf));
196 
197 #if defined(__SSSE3__) || defined(__SSE2__)
198    if (bytes == 64) {
199       rgba8_copy_16_aligned_src(dst +  0, src +  0);
200       rgba8_copy_16_aligned_src(dst + 16, src + 16);
201       rgba8_copy_16_aligned_src(dst + 32, src + 32);
202       rgba8_copy_16_aligned_src(dst + 48, src + 48);
203       return dst;
204    }
205 
206    while (bytes >= 16) {
207       rgba8_copy_16_aligned_src(dst, src);
208       src += 16;
209       dst += 16;
210       bytes -= 16;
211    }
212 #endif
213 
214    rgba8_copy(dst, src, bytes);
215 
216    return dst;
217 }
218 
219 /**
220  * Each row from y0 to y1 is copied in three parts: [x0,x1), [x1,x2), [x2,x3).
221  * These ranges are in bytes, i.e. pixels * bytes-per-pixel.
222  * The first and last ranges must be shorter than a "span" (the longest linear
223  * stretch within a tile) and the middle must equal a whole number of spans.
224  * Ranges may be empty.  The region copied must land entirely within one tile.
225  * 'dst' is the start of the tile and 'src' is the corresponding
226  * address to copy from, though copying begins at (x0, y0).
227  * To enable swizzling 'swizzle_bit' must be 1<<6, otherwise zero.
228  * Swizzling flips bit 6 in the copy destination offset, when certain other
229  * bits are set in it.
230  */
231 typedef void (*tile_copy_fn)(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
232                              uint32_t y0, uint32_t y1,
233                              char *dst, const char *src,
234                              int32_t linear_pitch,
235                              uint32_t swizzle_bit,
236                              isl_memcpy_type copy_type);
237 
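/* For example, with the X-tile span of 64 bytes, copying bytes [10, 200) of
 * one row within a tile would use x0 = 10, x1 = 64, x2 = 192, x3 = 200: the
 * first and last ranges are partial spans and the middle is two whole spans.
 */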
238 /**
239  * Copy texture data from linear to X tile layout.
240  *
241  * \copydoc tile_copy_fn
242  *
243  * The mem_copy parameters allow the user to specify an alternative mem_copy
244  * function that, for instance, may do RGBA -> BGRA swizzling.  The first
245  * function must handle any memory alignment while the second function must
246  * only handle 16-byte alignment in whichever side (source or destination) is
247  * tiled.
248  */
249 static inline void
linear_to_xtiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
251                  uint32_t y0, uint32_t y1,
252                  char *dst, const char *src,
253                  int32_t src_pitch,
254                  uint32_t swizzle_bit,
255                  isl_mem_copy_fn mem_copy,
256                  isl_mem_copy_fn mem_copy_align16)
257 {
258    /* The copy destination offset for each range copied is the sum of
259     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
260     */
261    uint32_t xo, yo;
262 
263    src += (ptrdiff_t)y0 * src_pitch;
264 
265    for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
266       /* Bits 9 and 10 of the copy destination offset control swizzling.
267        * Only 'yo' contributes to those bits in the total offset,
268        * so calculate 'swizzle' just once per row.
269        * Move bits 9 and 10 three and four places respectively down
270        * to bit 6 and xor them.
271        */
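      /* E.g. with swizzle_bit == 1 << 6, yo == 0x200 (bit 9 set) gives
       * ((0x200 >> 3) ^ (0x200 >> 4)) & 0x40 == 0x40.
       */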
272       uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
273 
274       mem_copy(dst + ((x0 + yo) ^ swizzle), src + x0, x1 - x0);
275 
276       for (xo = x1; xo < x2; xo += xtile_span) {
277          mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + xo, xtile_span);
278       }
279 
280       mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
281 
282       src += src_pitch;
283    }
284 }
285 
286 /**
287  * Copy texture data from linear to Y tile layout.
288  *
289  * \copydoc tile_copy_fn
290  */
291 static inline void
linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
293                  uint32_t y0, uint32_t y3,
294                  char *dst, const char *src,
295                  int32_t src_pitch,
296                  uint32_t swizzle_bit,
297                  isl_mem_copy_fn mem_copy,
298                  isl_mem_copy_fn mem_copy_align16)
299 {
300    /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
301     * as the tile).  Thus the destination offset for (x,y) is the sum of:
302     *   (x % column_width)                    // position within column
303     *   (x / column_width) * bytes_per_column // column number * bytes per column
304     *   y * column_width
305     *
306     * The copy destination offset for each range copied is the sum of
307     * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
308     */
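   /* Worked example with the values above: for x = 40 bytes and y = 5 rows,
    * the offset is (40 % 16) + (40 / 16) * (16 * 32) + 5 * 16
    *             = 8 + 1024 + 80 = 1112.
    */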
309    const uint32_t column_width = ytile_span;
310    const uint32_t bytes_per_column = column_width * ytile_height;
311 
312    uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
313    uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
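   /* y1 and y2 bracket the 4-row-aligned middle section, e.g. y0 = 3 and
    * y3 = 30 give y1 = 4 and y2 = 28, so rows [4, 28) take the 4-rows-at-a-time
    * loop below.
    */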
314 
315    uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
316    uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
317 
   /* Bit 9 of the destination offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
323    uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
324    uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
325 
326    uint32_t x, yo;
327 
328    src += (ptrdiff_t)y0 * src_pitch;
329 
330    if (y0 != y1) {
331       for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
332          uint32_t xo = xo1;
333          uint32_t swizzle = swizzle1;
334 
335          mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
336 
337          /* Step by spans/columns.  As it happens, the swizzle bit flips
338           * at each step so we don't need to calculate it explicitly.
339           */
340          for (x = x1; x < x2; x += ytile_span) {
341             mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
342             xo += bytes_per_column;
343             swizzle ^= swizzle_bit;
344          }
345 
346          mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
347 
348          src += src_pitch;
349       }
350    }
351 
352    for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
353       uint32_t xo = xo1;
354       uint32_t swizzle = swizzle1;
355 
356       if (x0 != x1) {
357          mem_copy(dst + ((xo0 + yo + 0 * column_width) ^ swizzle0), src + x0 + 0 * src_pitch, x1 - x0);
358          mem_copy(dst + ((xo0 + yo + 1 * column_width) ^ swizzle0), src + x0 + 1 * src_pitch, x1 - x0);
359          mem_copy(dst + ((xo0 + yo + 2 * column_width) ^ swizzle0), src + x0 + 2 * src_pitch, x1 - x0);
360          mem_copy(dst + ((xo0 + yo + 3 * column_width) ^ swizzle0), src + x0 + 3 * src_pitch, x1 - x0);
361       }
362 
363       /* Step by spans/columns.  As it happens, the swizzle bit flips
364        * at each step so we don't need to calculate it explicitly.
365        */
366       for (x = x1; x < x2; x += ytile_span) {
367          mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x + 0 * src_pitch, ytile_span);
368          mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x + 1 * src_pitch, ytile_span);
369          mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x + 2 * src_pitch, ytile_span);
370          mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x + 3 * src_pitch, ytile_span);
371          xo += bytes_per_column;
372          swizzle ^= swizzle_bit;
373       }
374 
375       if (x2 != x3) {
376          mem_copy_align16(dst + ((xo + yo + 0 * column_width) ^ swizzle), src + x2 + 0 * src_pitch, x3 - x2);
377          mem_copy_align16(dst + ((xo + yo + 1 * column_width) ^ swizzle), src + x2 + 1 * src_pitch, x3 - x2);
378          mem_copy_align16(dst + ((xo + yo + 2 * column_width) ^ swizzle), src + x2 + 2 * src_pitch, x3 - x2);
379          mem_copy_align16(dst + ((xo + yo + 3 * column_width) ^ swizzle), src + x2 + 3 * src_pitch, x3 - x2);
380       }
381 
382       src += 4 * src_pitch;
383    }
384 
385    if (y2 != y3) {
386       for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
387          uint32_t xo = xo1;
388          uint32_t swizzle = swizzle1;
389 
390          mem_copy(dst + ((xo0 + yo) ^ swizzle0), src + x0, x1 - x0);
391 
392          /* Step by spans/columns.  As it happens, the swizzle bit flips
393           * at each step so we don't need to calculate it explicitly.
394           */
395          for (x = x1; x < x2; x += ytile_span) {
396             mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x, ytile_span);
397             xo += bytes_per_column;
398             swizzle ^= swizzle_bit;
399          }
400 
401          mem_copy_align16(dst + ((xo + yo) ^ swizzle), src + x2, x3 - x2);
402 
403          src += src_pitch;
404       }
405    }
406 }
407 
408 /**
409  * Copy texture data from linear to Tile-4 layout.
410  *
411  * \copydoc tile_copy_fn
412  */
413 static inline void
linear_to_tile4(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
415                 uint32_t y0, uint32_t y3,
416                 char *dst, const char *src,
417                 int32_t src_pitch,
418                 uint32_t swizzle_bit,
419                 isl_mem_copy_fn mem_copy,
420                 isl_mem_copy_fn mem_copy_align16)
421 {
   /* Tile 4 consists of columns that are 'ytile_span' wide, and each 64B tile
    * block consists of 4 rows of Y-tile ordered data.
    * Each 512B block within a 4kB tile contains 8 such blocks.
    *
    * To calculate the tiled offset, we need to identify:
    * the Block X and Block Y offsets at each 512B block boundary in the X
    * and Y directions.
429     *
430     * A Tile4 has the following layout :
431     *
432     *                |<------------- 128 B-------------------|
433     *                _________________________________________
434     * 512B blk(Blk0)^|  0 |  1 |  2 |  3 |  8 |  9 | 10 | 11 | ^ 512B blk(Blk1)
435     * (cell 0..7))  v|  4 |  5 |  6 |  7 | 12 | 13 | 14 | 15 | v (cell 8..15))
436     *                -----------------------------------------
437     *                | 16 | 17 | 18 | 19 | 24 | 25 | 26 | 27 |
438     *                | 20 | 21 | 22 | 23 | 28 | 29 | 30 | 31 |
439     *                -----------------------------------------
440     *                | 32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 |
441     *                | 36 | 37 | 38 | 39 | 44 | 45 | 46 | 47 |
442     *                -----------------------------------------
443     *                | 48 | 49 | 50 | 51 | 56 | 57 | 58 | 59 |
444     *                | 52 | 53 | 54 | 55 | 60 | 61 | 62 | 63 |
445     *                -----------------------------------------
446     *
447     * The tile is divided in 512B blocks[Blk0..Blk7], themselves made of 2
448     * rows of 256B sub-blocks.
449     *
450     * Each sub-block is composed of 4 64B elements[cell(0)-cell(3)] (a cell
451     * in the figure above).
452     *
453     * Each 64B cell represents 4 rows of data.[cell(0), cell(1), .., cell(63)]
454     *
455     *
456     *   Block X - Adds 256B to offset when we encounter block boundary in
457     *             X direction.(Ex: Blk 0 --> Blk 1(BlkX_off = 256))
458     *   Block Y - Adds 512B to offset when we encounter block boundary in
459     *             Y direction.(Ex: Blk 0 --> Blk 3(BlkY_off = 512))
460     *
461     *   (x / ytile_span) * cacheline_size_B //Byte offset in the X dir of
462     *                                         the containing 64B block
463     *   x % ytile_span //Byte offset in X dir within a 64B block/cacheline
464     *
465     *   (y % 4) * 16 // Byte offset of the Y dir within a 64B block/cacheline
466     *   (y / 4) * 256// Byte offset of the Y dir within 512B block after 1 row
467     *                   of 64B blocks/cachelines
468     *
469     * The copy destination offset for each range copied is the sum of
470     * Block X offset 'BlkX_off', Block Y offset 'BlkY_off', X offset 'xo'
471     * and a Y offset 'yo.'
472     */
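   /* Worked example: for x = 80 bytes and y = 10 rows, xo = 80 % 16 +
    * (80 / 16) * 64 = 320 and BlkX_off = 256, while yo = (10 / 4) * 256 +
    * (10 % 4) * 16 = 544 and BlkY_off = (10 / 8) * 512 = 512, giving a tiled
    * offset of 320 + 256 + 544 + 512 = 1632, i.e. byte 32 of cell 25 in the
    * diagram above.
    */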
473    const uint32_t column_width = ytile_span;
474    const uint32_t tile4_blkh = 4;
475 
476    assert(ytile_span * tile4_blkh == 64);
477    const uint32_t cacheline_size_B = 64;
478 
479    /* Find intermediate Y offsets that are aligned to a 64B element
480     * (4 rows), so that we can do fully 64B memcpys on those.
481     */
482    uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
483    uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
484 
485    /* xsb0 and xsb1 are the byte offset within a 256B sub block for x0 and x1 */
486    uint32_t xsb0 = (x0 % ytile_span) + (x0 / ytile_span) * cacheline_size_B;
487    uint32_t xsb1 = (x1 % ytile_span) + (x1 / ytile_span) * cacheline_size_B;
488 
489    uint32_t Blkxsb0_off = ALIGN_DOWN(xsb0, 256);
490    uint32_t Blky0_off = (y0 / 8) * 512;
491 
492    uint32_t BlkX_off, BlkY_off;
493 
494    uint32_t x, yo, Y0, Y2;
495 
496    /* Y0 determines the initial byte offset in the Y direction */
497    Y0 = (y0 / 4) * 256 + (y0 % 4) * ytile_span;
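   /* E.g. y0 = 6 gives Y0 = 1 * 256 + 2 * 16 = 288. */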
498 
499    /* Y2 determines the byte offset required for reaching y2 if y2 doesn't map
500     * exactly to 512B block boundary
501     */
502    Y2 = y2 * 4 * column_width;
503 
504    src += (ptrdiff_t)y0 * src_pitch;
505 
506    /* To maximize memcpy speed, we do the copy in 3 parts :
507     *   - copy the first lines that are not aligned to the 64B cell's height (4 rows)
508     *   - copy the lines that are aligned to 64B cell's height
509     *   - copy the remaining lines not making up for a full 64B cell's height
510     */
511    if (y0 != y1) {
512       for (yo = Y0; yo < Y0 + (y1 - y0) * column_width; yo += column_width) {
513          uint32_t xo = xsb1;
514 
515          if (x0 != x1)
516             mem_copy(dst + (Blky0_off + Blkxsb0_off) + (xsb0 + yo), src + x0, x1 - x0);
517 
518          for (x = x1; x < x2; x += ytile_span) {
519             BlkX_off = ALIGN_DOWN(xo, 256);
520 
521             mem_copy_align16(dst + (Blky0_off + BlkX_off) + (xo + yo), src + x, ytile_span);
522             xo += cacheline_size_B;
523          }
524 
525          if (x3 != x2) {
526             BlkX_off = ALIGN_DOWN(xo, 256);
527             mem_copy_align16(dst + (Blky0_off + BlkX_off) + (xo + yo), src + x2, x3 - x2);
528          }
529 
530          src += src_pitch;
531       }
532    }
533 
534    for (yo = y1 * 4 * column_width; yo < y2 * 4 * column_width; yo += 16 * column_width) {
535       uint32_t xo = xsb1;
536       BlkY_off = ALIGN_DOWN(yo, 512);
537 
538       if (x0 != x1) {
539          mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 0 * column_width),
540                   src + x0 + 0 * src_pitch, x1 - x0);
541          mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 1 * column_width),
542                   src + x0 + 1 * src_pitch, x1 - x0);
543          mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 2 * column_width),
544                   src + x0 + 2 * src_pitch, x1 - x0);
545          mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 3 * column_width),
546                   src + x0 + 3 * src_pitch, x1 - x0);
547       }
548 
549       for (x = x1; x < x2; x += ytile_span) {
550          BlkX_off = ALIGN_DOWN(xo, 256);
551 
552          mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo+ 0 * column_width),
553                           src + x + 0 * src_pitch, ytile_span);
554          mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
555                           src + x + 1 * src_pitch, ytile_span);
556          mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
557                           src + x + 2 * src_pitch, ytile_span);
558          mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
559                           src + x + 3 * src_pitch, ytile_span);
560 
561          xo += cacheline_size_B;
562       }
563 
564       if (x2 != x3) {
565          BlkX_off = ALIGN_DOWN(xo, 256);
566 
567          mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
568                   src + x2 + 0 * src_pitch, x3 - x2);
569          mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
570                   src + x2 + 1 * src_pitch, x3 - x2);
571          mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
572                   src + x2 + 2 * src_pitch, x3 - x2);
573          mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
574                   src + x2 + 3 * src_pitch, x3 - x2);
575       }
576 
577       src += 4 * src_pitch;
578    }
579 
580    if (y2 != y3) {
581       for (yo = Y2; yo < Y2 + (y3 - y2) * column_width; yo += column_width) {
582          uint32_t xo = xsb1;
583          BlkY_off = ALIGN_DOWN(yo, 512);
584 
585          if (x0 != x1)
586             mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo), src + x0, x1 - x0);
587 
588          for (x = x1; x < x2; x += ytile_span) {
589             BlkX_off = ALIGN_DOWN(xo, 256);
590 
591             mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo), src + x, ytile_span);
592             xo += cacheline_size_B;
593          }
594 
595          if (x3 != x2) {
596             BlkX_off = ALIGN_DOWN(xo, 256);
597             mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo), src + x2, x3 - x2);
598          }
599 
600          src += src_pitch;
601       }
602    }
603 }
604 
605 /**
606  * Copy texture data from X tile layout to linear.
607  *
608  * \copydoc tile_copy_fn
609  */
610 static inline void
xtiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
612                  uint32_t y0, uint32_t y1,
613                  char *dst, const char *src,
614                  int32_t dst_pitch,
615                  uint32_t swizzle_bit,
616                  isl_mem_copy_fn mem_copy,
617                  isl_mem_copy_fn mem_copy_align16)
618 {
619    /* The copy destination offset for each range copied is the sum of
620     * an X offset 'x0' or 'xo' and a Y offset 'yo.'
621     */
622    uint32_t xo, yo;
623 
624    dst += (ptrdiff_t)y0 * dst_pitch;
625 
626    for (yo = y0 * xtile_width; yo < y1 * xtile_width; yo += xtile_width) {
627       /* Bits 9 and 10 of the copy destination offset control swizzling.
628        * Only 'yo' contributes to those bits in the total offset,
629        * so calculate 'swizzle' just once per row.
630        * Move bits 9 and 10 three and four places respectively down
631        * to bit 6 and xor them.
632        */
633       uint32_t swizzle = ((yo >> 3) ^ (yo >> 4)) & swizzle_bit;
634 
635       mem_copy(dst + x0, src + ((x0 + yo) ^ swizzle), x1 - x0);
636 
637       for (xo = x1; xo < x2; xo += xtile_span) {
638          mem_copy_align16(dst + xo, src + ((xo + yo) ^ swizzle), xtile_span);
639       }
640 
641       mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
642 
643       dst += dst_pitch;
644    }
645 }
646 
647  /**
648  * Copy texture data from Y tile layout to linear.
649  *
650  * \copydoc tile_copy_fn
651  */
652 static inline void
ytiled_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
654                  uint32_t y0, uint32_t y3,
655                  char *dst, const char *src,
656                  int32_t dst_pitch,
657                  uint32_t swizzle_bit,
658                  isl_mem_copy_fn mem_copy,
659                  isl_mem_copy_fn mem_copy_align16)
660 {
661    /* Y tiles consist of columns that are 'ytile_span' wide (and the same height
662     * as the tile).  Thus the destination offset for (x,y) is the sum of:
663     *   (x % column_width)                    // position within column
664     *   (x / column_width) * bytes_per_column // column number * bytes per column
665     *   y * column_width
666     *
667     * The copy destination offset for each range copied is the sum of
668     * an X offset 'xo0' or 'xo' and a Y offset 'yo.'
669     */
670    const uint32_t column_width = ytile_span;
671    const uint32_t bytes_per_column = column_width * ytile_height;
672 
673    uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
674    uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
675 
676    uint32_t xo0 = (x0 % ytile_span) + (x0 / ytile_span) * bytes_per_column;
677    uint32_t xo1 = (x1 % ytile_span) + (x1 / ytile_span) * bytes_per_column;
678 
   /* Bit 9 of the destination offset controls swizzling.
    * Only the X offset contributes to bit 9 of the total offset,
    * so swizzle can be calculated in advance for these X positions.
    * Move bit 9 three places down to bit 6.
    */
684    uint32_t swizzle0 = (xo0 >> 3) & swizzle_bit;
685    uint32_t swizzle1 = (xo1 >> 3) & swizzle_bit;
686 
687    uint32_t x, yo;
688 
689    dst += (ptrdiff_t)y0 * dst_pitch;
690 
691    if (y0 != y1) {
692       for (yo = y0 * column_width; yo < y1 * column_width; yo += column_width) {
693          uint32_t xo = xo1;
694          uint32_t swizzle = swizzle1;
695 
696          mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
697 
698          /* Step by spans/columns.  As it happens, the swizzle bit flips
699           * at each step so we don't need to calculate it explicitly.
700           */
701          for (x = x1; x < x2; x += ytile_span) {
702             mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
703             xo += bytes_per_column;
704             swizzle ^= swizzle_bit;
705          }
706 
707          mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
708 
709          dst += dst_pitch;
710       }
711    }
712 
713    for (yo = y1 * column_width; yo < y2 * column_width; yo += 4 * column_width) {
714       uint32_t xo = xo1;
715       uint32_t swizzle = swizzle1;
716 
717       if (x0 != x1) {
718          mem_copy(dst + x0 + 0 * dst_pitch, src + ((xo0 + yo + 0 * column_width) ^ swizzle0), x1 - x0);
719          mem_copy(dst + x0 + 1 * dst_pitch, src + ((xo0 + yo + 1 * column_width) ^ swizzle0), x1 - x0);
720          mem_copy(dst + x0 + 2 * dst_pitch, src + ((xo0 + yo + 2 * column_width) ^ swizzle0), x1 - x0);
721          mem_copy(dst + x0 + 3 * dst_pitch, src + ((xo0 + yo + 3 * column_width) ^ swizzle0), x1 - x0);
722       }
723 
724       /* Step by spans/columns.  As it happens, the swizzle bit flips
725        * at each step so we don't need to calculate it explicitly.
726        */
727       for (x = x1; x < x2; x += ytile_span) {
728          mem_copy_align16(dst + x + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), ytile_span);
729          mem_copy_align16(dst + x + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), ytile_span);
730          mem_copy_align16(dst + x + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), ytile_span);
731          mem_copy_align16(dst + x + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), ytile_span);
732          xo += bytes_per_column;
733          swizzle ^= swizzle_bit;
734       }
735 
736       if (x2 != x3) {
737          mem_copy_align16(dst + x2 + 0 * dst_pitch, src + ((xo + yo + 0 * column_width) ^ swizzle), x3 - x2);
738          mem_copy_align16(dst + x2 + 1 * dst_pitch, src + ((xo + yo + 1 * column_width) ^ swizzle), x3 - x2);
739          mem_copy_align16(dst + x2 + 2 * dst_pitch, src + ((xo + yo + 2 * column_width) ^ swizzle), x3 - x2);
740          mem_copy_align16(dst + x2 + 3 * dst_pitch, src + ((xo + yo + 3 * column_width) ^ swizzle), x3 - x2);
741       }
742 
743       dst += 4 * dst_pitch;
744    }
745 
746    if (y2 != y3) {
747       for (yo = y2 * column_width; yo < y3 * column_width; yo += column_width) {
748          uint32_t xo = xo1;
749          uint32_t swizzle = swizzle1;
750 
751          mem_copy(dst + x0, src + ((xo0 + yo) ^ swizzle0), x1 - x0);
752 
753          /* Step by spans/columns.  As it happens, the swizzle bit flips
754           * at each step so we don't need to calculate it explicitly.
755           */
756          for (x = x1; x < x2; x += ytile_span) {
757             mem_copy_align16(dst + x, src + ((xo + yo) ^ swizzle), ytile_span);
758             xo += bytes_per_column;
759             swizzle ^= swizzle_bit;
760          }
761 
762          mem_copy_align16(dst + x2, src + ((xo + yo) ^ swizzle), x3 - x2);
763 
764          dst += dst_pitch;
765       }
766    }
767 }
768 
769 
770 /**
 * Copy texture data from Tile-4 layout to linear.
772  *
773  * \copydoc tile_copy_fn
774  */
775 static inline void
tile4_to_linear(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
777                 uint32_t y0, uint32_t y3,
778                 char *dst, const char *src,
779                 int32_t dst_pitch,
780                 uint32_t swizzle_bit,
781                 isl_mem_copy_fn mem_copy,
782                 isl_mem_copy_fn mem_copy_align16)
783 {
784 
   /* Tile 4 consists of columns that are 'ytile_span' wide, and each 64B tile
    * block consists of 4 rows of Y-tile ordered data.
    * Each 512B block within a 4kB tile contains 8 such blocks.
    *
    * To calculate the tiled offset, we need to identify:
    * the Block X and Block Y offsets at each 512B block boundary in the X
    * and Y directions.
791     *
792     * Refer to the Tile4 layout diagram in linear_to_tile4() function.
793     *
794     * The tile is divided in 512B blocks[Blk0..Blk7], themselves made of 2
795     * rows of 256B sub-blocks
796     *
797     * Each sub-block is composed of 4 64B elements[cell(0)-cell(3)].
798     *
799     * Each 64B cell represents 4 rows of data.[cell(0), cell(1), .., cell(63)]
800     *
801     *
802     *   Block X - Adds 256B to offset when we encounter block boundary in
803     *             X direction.(Ex: Blk 0 --> Blk 1(BlkX_off = 256))
804     *   Block Y - Adds 512B to offset when we encounter block boundary in
805     *             Y direction.(Ex: Blk 0 --> Blk 3(BlkY_off = 512))
806     *
807     *   (x / ytile_span) * cacheline_size_B //Byte offset in the X dir of the
808     *                                         containing 64B block
809     *   x % ytile_span //Byte offset in X dir within a 64B block/cacheline
810     *
811     *   (y % 4) * 16 // Byte offset of the Y dir within a 64B block/cacheline
812     *   (y / 4) * 256// Byte offset of the Y dir within 512B block after 1 row
813     *                   of 64B blocks/cachelines
814     *
815     * The copy destination offset for each range copied is the sum of
816     * Block X offset 'BlkX_off', Block Y offset 'BlkY_off', X offset 'xo'
817     * and a Y offset 'yo.'
818     */
819 
820    const uint32_t column_width = ytile_span;
821    const uint32_t tile4_blkh = 4;
822 
823    assert(ytile_span * tile4_blkh == 64);
824    const uint32_t cacheline_size_B = 64;
825 
826    /* Find intermediate Y offsets that are aligned to a 64B element
827     * (4 rows), so that we can do fully 64B memcpys on those.
828     */
829    uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4));
830    uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4));
831 
832    /* xsb0 and xsb1 are the byte offset within a 256B sub block for x0 and x1 */
833    uint32_t xsb0 = (x0 % ytile_span) + (x0 / ytile_span) * cacheline_size_B;
834    uint32_t xsb1 = (x1 % ytile_span) + (x1 / ytile_span) * cacheline_size_B;
835 
836    uint32_t Blkxsb0_off = ALIGN_DOWN(xsb0, 256);
837    uint32_t Blky0_off = (y0 / 8) * 512;
838 
839    uint32_t BlkX_off, BlkY_off;
840 
841    uint32_t x, yo, Y0, Y2;
842 
843    /* Y0 determines the initial byte offset in the Y direction */
844    Y0 = (y0 / 4) * 256 + (y0 % 4) * 16;
845 
846    /* Y2 determines the byte offset required for reaching y2 if y2 doesn't map
847     * exactly to 512B block boundary
848     */
849    Y2 = y2 * 4 * column_width;
850 
851    dst += (ptrdiff_t)y0 * dst_pitch;
852 
853    /* To maximize memcpy speed, we do the copy in 3 parts :
854     *   - copy the first lines that are not aligned to the 64B cell's height (4 rows)
855     *   - copy the lines that are aligned to 64B cell's height
856     *   - copy the remaining lines not making up for a full 64B cell's height
857     */
858    if (y0 != y1) {
859       for (yo = Y0; yo < Y0 + (y1 - y0) * column_width; yo += column_width) {
860          uint32_t xo = xsb1;
861 
862          if (x0 != x1)
863             mem_copy(dst + x0, src + (Blky0_off + Blkxsb0_off) + (xsb0 + yo), x1 - x0);
864 
865          for (x = x1; x < x2; x += ytile_span) {
866             BlkX_off = ALIGN_DOWN(xo, 256);
867 
868             mem_copy_align16(dst + x, src + (Blky0_off + BlkX_off) + (xo + yo), ytile_span);
869             xo += cacheline_size_B;
870          }
871 
872          if (x3 != x2) {
873             BlkX_off = ALIGN_DOWN(xo, 256);
874             mem_copy_align16(dst + x2, src + (Blky0_off + BlkX_off) + (xo + yo), x3 - x2);
875          }
876 
877          dst += dst_pitch;
878       }
879    }
880 
881    for (yo = y1 * 4 * column_width; yo < y2 * 4 * column_width; yo += 16 * column_width) {
882       uint32_t xo = xsb1;
883       BlkY_off = ALIGN_DOWN(yo, 512);
884 
885       if (x0 != x1) {
886          mem_copy(dst + x0 + 0 * dst_pitch,
887                   src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 0 * column_width),
888                   x1 - x0);
889          mem_copy(dst + x0 + 1 * dst_pitch,
890                   src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 1 * column_width),
891                   x1 - x0);
892          mem_copy(dst + x0 + 2 * dst_pitch,
893                   src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 2 * column_width),
894                   x1 - x0);
895          mem_copy(dst + x0 + 3 * dst_pitch,
896                   src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 3 * column_width),
897                   x1 - x0);
898       }
899 
900       for (x = x1; x < x2; x += ytile_span) {
901          BlkX_off = ALIGN_DOWN(xo, 256);
902 
903          mem_copy_align16(dst + x + 0 * dst_pitch,
904                           src + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
905                           ytile_span);
906          mem_copy_align16(dst + x + 1 * dst_pitch,
907                           src + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
908                           ytile_span);
909          mem_copy_align16(dst + x + 2 * dst_pitch,
910                           src + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
911                           ytile_span);
912          mem_copy_align16(dst + x + 3 * dst_pitch,
913                           src + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
914                           ytile_span);
915 
916          xo += cacheline_size_B;
917       }
918 
919       if (x2 != x3) {
920          BlkX_off = ALIGN_DOWN(xo, 256);
921 
922          mem_copy(dst + x2 + 0 * dst_pitch,
923                   src + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width),
924                   x3 - x2);
925          mem_copy(dst + x2 + 1 * dst_pitch,
926                   src + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width),
927                   x3 - x2);
928          mem_copy(dst + x2 + 2 * dst_pitch,
929                   src + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width),
930                   x3 - x2);
931          mem_copy(dst + x2 + 3 * dst_pitch,
932                   src + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width),
933                   x3 - x2);
934       }
935 
936       dst += 4 * dst_pitch;
937    }
938 
939    if (y2 != y3) {
940       for (yo = Y2; yo < Y2 + (y3 - y2) * column_width; yo += column_width) {
941          uint32_t xo = xsb1;
942          BlkY_off = ALIGN_DOWN(yo, 512);
943 
944          if (x0 != x1)
945             mem_copy(dst + x0, src + (BlkY_off + Blkxsb0_off) + (xsb0 + yo), x1 - x0);
946 
947          for (x = x1; x < x2; x += ytile_span) {
948             BlkX_off = ALIGN_DOWN(xo, 256);
949 
950             mem_copy_align16(dst + x, src + (BlkY_off + BlkX_off) + (xo + yo), ytile_span);
951             xo += cacheline_size_B;
952          }
953 
954          if (x3 != x2) {
955             BlkX_off = ALIGN_DOWN(xo, 256);
956             mem_copy_align16(dst + x2, src + (BlkY_off + BlkX_off) + (xo + yo), x3 - x2);
957          }
958 
959          dst += dst_pitch;
960       }
961    }
962 }
963 
964 #if defined(INLINE_SSE41)
965 static ALWAYS_INLINE void *
_memcpy_streaming_load(void *dest, const void *src, size_t count)
967 {
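   /* _mm_stream_load_si128() (MOVNTDQA) needs a 16-byte aligned source, so
    * this helper is only suitable as the aligned (tiled-side) copy; the
    * odd-sized tail falls back to a regular memcpy below.
    */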
968    if (count == 16) {
969       __m128i val = _mm_stream_load_si128((__m128i *)src);
970       _mm_storeu_si128((__m128i *)dest, val);
971       return dest;
972    } else if (count == 64) {
973       __m128i val0 = _mm_stream_load_si128(((__m128i *)src) + 0);
974       __m128i val1 = _mm_stream_load_si128(((__m128i *)src) + 1);
975       __m128i val2 = _mm_stream_load_si128(((__m128i *)src) + 2);
976       __m128i val3 = _mm_stream_load_si128(((__m128i *)src) + 3);
977       _mm_storeu_si128(((__m128i *)dest) + 0, val0);
978       _mm_storeu_si128(((__m128i *)dest) + 1, val1);
979       _mm_storeu_si128(((__m128i *)dest) + 2, val2);
980       _mm_storeu_si128(((__m128i *)dest) + 3, val3);
981       return dest;
982    } else {
983       assert(count < 64); /* and (count < 16) for ytiled */
984       return memcpy(dest, src, count);
985    }
986 }
987 #endif
988 
989 static isl_mem_copy_fn
choose_copy_function(isl_memcpy_type copy_type)
991 {
992    switch(copy_type) {
993    case ISL_MEMCPY:
994       return memcpy;
995    case ISL_MEMCPY_BGRA8:
996       return rgba8_copy;
997    case ISL_MEMCPY_STREAMING_LOAD:
998 #if defined(INLINE_SSE41)
999       return _memcpy_streaming_load;
1000 #else
      unreachable("ISL_MEMCPY_STREAMING_LOAD requires sse4.1");
1002 #endif
1003    case ISL_MEMCPY_INVALID:
1004       unreachable("invalid copy_type");
1005    }
1006    unreachable("unhandled copy_type");
1007    return NULL;
1008 }
1009 
1010 /**
1011  * Copy texture data from linear to X tile layout, faster.
1012  *
1013  * Same as \ref linear_to_xtiled but faster, because it passes constant
1014  * parameters for common cases, allowing the compiler to inline code
1015  * optimized for those cases.
1016  *
1017  * \copydoc tile_copy_fn
1018  */
1019 static FLATTEN void
linear_to_xtiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1021                         uint32_t y0, uint32_t y1,
1022                         char *dst, const char *src,
1023                         int32_t src_pitch,
1024                         uint32_t swizzle_bit,
1025                         isl_memcpy_type copy_type)
1026 {
1027    isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1028 
1029    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
1030       if (mem_copy == memcpy)
1031          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
1032                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1033       else if (mem_copy == rgba8_copy)
1034          return linear_to_xtiled(0, 0, xtile_width, xtile_width, 0, xtile_height,
1035                                  dst, src, src_pitch, swizzle_bit,
1036                                  rgba8_copy, rgba8_copy_aligned_dst);
1037       else
1038          unreachable("not reached");
1039    } else {
1040       if (mem_copy == memcpy)
1041          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
1042                                  dst, src, src_pitch, swizzle_bit,
1043                                  memcpy, memcpy);
1044       else if (mem_copy == rgba8_copy)
1045          return linear_to_xtiled(x0, x1, x2, x3, y0, y1,
1046                                  dst, src, src_pitch, swizzle_bit,
1047                                  rgba8_copy, rgba8_copy_aligned_dst);
1048       else
1049          unreachable("not reached");
1050    }
1051    linear_to_xtiled(x0, x1, x2, x3, y0, y1,
1052                     dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
1053 }
1054 
1055 /**
1056  * Copy texture data from linear to Y tile layout, faster.
1057  *
1058  * Same as \ref linear_to_ytiled but faster, because it passes constant
1059  * parameters for common cases, allowing the compiler to inline code
1060  * optimized for those cases.
1061  *
1062  * \copydoc tile_copy_fn
1063  */
1064 static FLATTEN void
linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1066                         uint32_t y0, uint32_t y1,
1067                         char *dst, const char *src,
1068                         int32_t src_pitch,
1069                         uint32_t swizzle_bit,
1070                         isl_memcpy_type copy_type)
1071 {
1072    isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1073 
1074    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
1075       if (mem_copy == memcpy)
1076          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
1077                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1078       else if (mem_copy == rgba8_copy)
1079          return linear_to_ytiled(0, 0, ytile_width, ytile_width, 0, ytile_height,
1080                                  dst, src, src_pitch, swizzle_bit,
1081                                  rgba8_copy, rgba8_copy_aligned_dst);
1082       else
1083          unreachable("not reached");
1084    } else {
1085       if (mem_copy == memcpy)
1086          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
1087                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1088       else if (mem_copy == rgba8_copy)
1089          return linear_to_ytiled(x0, x1, x2, x3, y0, y1,
1090                                  dst, src, src_pitch, swizzle_bit,
1091                                  rgba8_copy, rgba8_copy_aligned_dst);
1092       else
1093          unreachable("not reached");
1094    }
1095    linear_to_ytiled(x0, x1, x2, x3, y0, y1,
1096                     dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy);
1097 }
1098 
1099 /**
1100  * Copy texture data from linear to tile 4 layout, faster.
1101  *
1102  * Same as \ref linear_to_tile4 but faster, because it passes constant
1103  * parameters for common cases, allowing the compiler to inline code
1104  * optimized for those cases.
1105  *
1106  * \copydoc tile_copy_fn
1107  */
1108 static FLATTEN void
linear_to_tile4_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1110                         uint32_t y0, uint32_t y1,
1111                         char *dst, const char *src,
1112                         int32_t src_pitch,
1113                         uint32_t swizzle_bit,
1114                         isl_memcpy_type copy_type)
1115 {
1116    isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1117    assert(swizzle_bit == 0);
1118 
1119    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
1120       if (mem_copy == memcpy)
1121          return linear_to_tile4(0, 0, ytile_width, ytile_width, 0, ytile_height,
1122                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1123       else if (mem_copy == rgba8_copy)
1124          return linear_to_tile4(0, 0, ytile_width, ytile_width, 0, ytile_height,
1125                                  dst, src, src_pitch, swizzle_bit,
1126                                  rgba8_copy, rgba8_copy_aligned_dst);
1127       else
1128          unreachable("not reached");
1129    } else {
1130       if (mem_copy == memcpy)
1131          return linear_to_tile4(x0, x1, x2, x3, y0, y1,
1132                                  dst, src, src_pitch, swizzle_bit, memcpy, memcpy);
1133       else if (mem_copy == rgba8_copy)
1134          return linear_to_tile4(x0, x1, x2, x3, y0, y1,
1135                                  dst, src, src_pitch, swizzle_bit,
1136                                  rgba8_copy, rgba8_copy_aligned_dst);
1137       else
1138          unreachable("not reached");
1139    }
1140 }
1141 
1142 /**
1143  * Copy texture data from X tile layout to linear, faster.
1144  *
 * Same as \ref xtiled_to_linear but faster, because it passes constant
1146  * parameters for common cases, allowing the compiler to inline code
1147  * optimized for those cases.
1148  *
1149  * \copydoc tile_copy_fn
1150  */
1151 static FLATTEN void
xtiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1153                         uint32_t y0, uint32_t y1,
1154                         char *dst, const char *src,
1155                         int32_t dst_pitch,
1156                         uint32_t swizzle_bit,
1157                         isl_memcpy_type copy_type)
1158 {
1159    isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1160 
1161    if (x0 == 0 && x3 == xtile_width && y0 == 0 && y1 == xtile_height) {
1162       if (mem_copy == memcpy)
1163          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
1164                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1165       else if (mem_copy == rgba8_copy)
1166          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
1167                                  dst, src, dst_pitch, swizzle_bit,
1168                                  rgba8_copy, rgba8_copy_aligned_src);
1169 #if defined(INLINE_SSE41)
1170       else if (mem_copy == _memcpy_streaming_load)
1171          return xtiled_to_linear(0, 0, xtile_width, xtile_width, 0, xtile_height,
1172                                  dst, src, dst_pitch, swizzle_bit,
1173                                  memcpy, _memcpy_streaming_load);
1174 #endif
1175       else
1176          unreachable("not reached");
1177    } else {
1178       if (mem_copy == memcpy)
1179          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
1180                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1181       else if (mem_copy == rgba8_copy)
1182          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
1183                                  dst, src, dst_pitch, swizzle_bit,
1184                                  rgba8_copy, rgba8_copy_aligned_src);
1185 #if defined(INLINE_SSE41)
1186       else if (mem_copy == _memcpy_streaming_load)
1187          return xtiled_to_linear(x0, x1, x2, x3, y0, y1,
1188                                  dst, src, dst_pitch, swizzle_bit,
1189                                  memcpy, _memcpy_streaming_load);
1190 #endif
1191       else
1192          unreachable("not reached");
1193    }
1194    xtiled_to_linear(x0, x1, x2, x3, y0, y1,
1195                     dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
1196 }
1197 
1198 /**
1199  * Copy texture data from Y tile layout to linear, faster.
1200  *
 * Same as \ref ytiled_to_linear but faster, because it passes constant
1202  * parameters for common cases, allowing the compiler to inline code
1203  * optimized for those cases.
1204  *
1205  * \copydoc tile_copy_fn
1206  */
1207 static FLATTEN void
ytiled_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1209                         uint32_t y0, uint32_t y1,
1210                         char *dst, const char *src,
1211                         int32_t dst_pitch,
1212                         uint32_t swizzle_bit,
1213                         isl_memcpy_type copy_type)
1214 {
1215    isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1216 
1217    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
1218       if (mem_copy == memcpy)
1219          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1220                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1221       else if (mem_copy == rgba8_copy)
1222          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1223                                  dst, src, dst_pitch, swizzle_bit,
1224                                  rgba8_copy, rgba8_copy_aligned_src);
1225 #if defined(INLINE_SSE41)
1226       else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
1227          return ytiled_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1228                                  dst, src, dst_pitch, swizzle_bit,
1229                                  memcpy, _memcpy_streaming_load);
1230 #endif
1231       else
1232          unreachable("not reached");
1233    } else {
1234       if (mem_copy == memcpy)
1235          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
1236                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1237       else if (mem_copy == rgba8_copy)
1238          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
1239                                  dst, src, dst_pitch, swizzle_bit,
1240                                  rgba8_copy, rgba8_copy_aligned_src);
1241 #if defined(INLINE_SSE41)
1242       else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
1243          return ytiled_to_linear(x0, x1, x2, x3, y0, y1,
1244                                  dst, src, dst_pitch, swizzle_bit,
1245                                  memcpy, _memcpy_streaming_load);
1246 #endif
1247       else
1248          unreachable("not reached");
1249    }
1250    ytiled_to_linear(x0, x1, x2, x3, y0, y1,
1251                     dst, src, dst_pitch, swizzle_bit, mem_copy, mem_copy);
1252 }
1253 
1254 /**
1255  * Copy texture data from tile4 layout to linear, faster.
1256  *
1257  * Same as \ref tile4_to_linear but faster, because it passes constant
1258  * parameters for common cases, allowing the compiler to inline code
1259  * optimized for those cases.
1260  *
1261  * \copydoc tile_copy_fn
1262  */
1263 static FLATTEN void
tile4_to_linear_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3,
1265                         uint32_t y0, uint32_t y1,
1266                         char *dst, const char *src,
1267                         int32_t dst_pitch,
1268                         uint32_t swizzle_bit,
1269                         isl_memcpy_type copy_type)
1270 {
1271    isl_mem_copy_fn mem_copy = choose_copy_function(copy_type);
1272    assert(swizzle_bit == 0);
1273 
1274    if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) {
1275       if (mem_copy == memcpy)
1276          return tile4_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1277                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1278       else if (mem_copy == rgba8_copy)
1279          return tile4_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1280                                  dst, src, dst_pitch, swizzle_bit,
1281                                  rgba8_copy, rgba8_copy_aligned_src);
1282 #if defined(INLINE_SSE41)
1283       else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
1284          return tile4_to_linear(0, 0, ytile_width, ytile_width, 0, ytile_height,
1285                                  dst, src, dst_pitch, swizzle_bit,
1286                                  memcpy, _memcpy_streaming_load);
1287 #endif
1288       else
1289          unreachable("not reached");
1290    } else {
1291       if (mem_copy == memcpy)
1292          return tile4_to_linear(x0, x1, x2, x3, y0, y1,
1293                                  dst, src, dst_pitch, swizzle_bit, memcpy, memcpy);
1294       else if (mem_copy == rgba8_copy)
1295          return tile4_to_linear(x0, x1, x2, x3, y0, y1,
1296                                  dst, src, dst_pitch, swizzle_bit,
1297                                  rgba8_copy, rgba8_copy_aligned_src);
1298 #if defined(INLINE_SSE41)
1299       else if (copy_type == ISL_MEMCPY_STREAMING_LOAD)
1300          return tile4_to_linear(x0, x1, x2, x3, y0, y1,
1301                                  dst, src, dst_pitch, swizzle_bit,
1302                                  memcpy, _memcpy_streaming_load);
1303 #endif
1304       else
1305          unreachable("not reached");
1306    }
1307 }
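
/* Illustrative sketch using hypothetical helpers (example_copy_fn,
 * example_copy_rows, example_copy_rows_memcpy; none of these exist elsewhere
 * in the file): the *_faster() wrappers above work because calling a generic
 * worker with compile-time-constant arguments lets the compiler clone and
 * specialize it.  The same pattern in isolation:
 */
typedef void *(*example_copy_fn)(void *dst, const void *src, size_t n);

static inline void
example_copy_rows(char *dst, const char *src, size_t width, size_t rows,
                  size_t pitch, example_copy_fn copy)
{
   /* Generic worker: the copy routine is an indirect call here. */
   for (size_t y = 0; y < rows; y++)
      copy(dst + y * pitch, src + y * pitch, width);
}

static inline void
example_copy_rows_memcpy(char *dst, const char *src, size_t width,
                         size_t rows, size_t pitch)
{
   /* Specialized wrapper: 'copy' is the constant memcpy, so the compiler can
    * inline example_copy_rows() and turn the indirect call into a direct
    * (and further inlinable) one, just as the tiled copy routines are
    * specialized for memcpy/rgba8_copy and for full-tile bounds.
    */
   example_copy_rows(dst, src, width, rows, pitch, memcpy);
}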
1308 
1309 /**
1310  * Copy from linear to tiled texture.
1311  *
1312  * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
1313  * pieces that do not cross tile boundaries and copy each piece with a tile
1314  * copy function (\ref tile_copy_fn).
1315  * The X range is in bytes, i.e. pixels * bytes-per-pixel.
1316  * The Y range is in pixels (i.e. unitless).
1317  * 'dst' is the address of (0, 0) in the destination tiled texture.
1318  * 'src' is the address of (xt1, yt1) in the source linear texture.
1319  */
1320 static void
1321 linear_to_tiled(uint32_t xt1, uint32_t xt2,
1322                       uint32_t yt1, uint32_t yt2,
1323                       char *dst, const char *src,
1324                       uint32_t dst_pitch, int32_t src_pitch,
1325                       bool has_swizzling,
1326                       enum isl_tiling tiling,
1327                       isl_memcpy_type copy_type)
1328 {
1329    tile_copy_fn tile_copy;
1330    uint32_t xt0, xt3;
1331    uint32_t yt0, yt3;
1332    uint32_t xt, yt;
1333    uint32_t tw, th, span;
1334    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
1335 
1336    if (tiling == ISL_TILING_X) {
1337       tw = xtile_width;
1338       th = xtile_height;
1339       span = xtile_span;
1340       tile_copy = linear_to_xtiled_faster;
1341    } else if (tiling == ISL_TILING_Y0) {
1342       tw = ytile_width;
1343       th = ytile_height;
1344       span = ytile_span;
1345       tile_copy = linear_to_ytiled_faster;
1346    } else if (tiling == ISL_TILING_4) {
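      /* Tile4 has the same 128Bx32 footprint as the legacy Y tile; only the
       * ordering of bytes within the tile differs, so the Y-tile dimensions
       * and span are reused.
       */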
1347       tw = ytile_width;
1348       th = ytile_height;
1349       span = ytile_span;
1350       tile_copy = linear_to_tile4_faster;
1351    } else {
1352       unreachable("unsupported tiling");
1353    }
1354 
1355    /* Round out to tile boundaries. */
1356    xt0 = ALIGN_DOWN(xt1, tw);
1357    xt3 = ALIGN_UP  (xt2, tw);
1358    yt0 = ALIGN_DOWN(yt1, th);
1359    yt3 = ALIGN_UP  (yt2, th);
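   /* For example, with X tiling (tw = 512) a byte range [xt1, xt2) = [70, 600)
    * rounds out to [xt0, xt3) = [0, 1024), i.e. the copy touches two tiles
    * horizontally.
    */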
1360 
1361    /* Loop over all tiles to which we have something to copy.
1362     * 'xt' and 'yt' are the origin of the destination tile, whether copying
1363     * a full or partial tile.
1364     * tile_copy() copies one tile or partial tile.
1365     * Looping x inside y is the faster memory access pattern.
1366     */
1367    for (yt = yt0; yt < yt3; yt += th) {
1368       for (xt = xt0; xt < xt3; xt += tw) {
1369          /* The area to update is [x0,x3) x [y0,y1).
1370           * May not want the whole tile, hence the min and max.
1371           */
1372          uint32_t x0 = MAX2(xt1, xt);
1373          uint32_t y0 = MAX2(yt1, yt);
1374          uint32_t x3 = MIN2(xt2, xt + tw);
1375          uint32_t y1 = MIN2(yt2, yt + th);
1376 
1377          /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
1378           * the middle interval is the longest span-aligned part.
1379           * The sub-ranges could be empty.
1380           */
1381          uint32_t x1, x2;
1382          x1 = ALIGN_UP(x0, span);
1383          if (x1 > x3)
1384             x1 = x2 = x3;
1385          else
1386             x2 = ALIGN_DOWN(x3, span);
1387 
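         /* For example, with the 16-byte Y-tile span, x0 = 70 and x3 = 120
          * give x1 = 80 and x2 = 112, splitting the copy into [70,80),
          * [80,112) and [112,120).
          */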
1388          assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
1389          assert(x1 - x0 < span && x3 - x2 < span);
1390          assert(x3 - x0 <= tw);
1391          assert((x2 - x1) % span == 0);
1392 
1393          /* Translate by (xt,yt) for single-tile copier. */
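         /* On the tiled (dst) side a row of tiles occupies dst_pitch * th
          * bytes and each tw-byte-wide tile within it occupies tw * th bytes,
          * so the tile at (xt, yt) starts xt * th + yt * dst_pitch bytes into
          * the surface; the linear (src) side is offset relative to (xt1, yt1).
          */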
1394          tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
1395                    y0-yt, y1-yt,
1396                    dst + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * dst_pitch,
1397                    src + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * src_pitch,
1398                    src_pitch,
1399                    swizzle_bit,
1400                    copy_type);
1401       }
1402    }
1403 }
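
/* Hypothetical usage sketch (example_upload_rect and all values here are
 * only illustrative): upload a 16x16-pixel RGBA8 sub-rectangle whose
 * top-left pixel is at (10, 20) into an X-tiled surface.  X coordinates are
 * scaled to bytes by the 4-byte pixel size; 'tiled' is the start of the
 * tiled surface, while 'linear' must already point at the first pixel of the
 * sub-rectangle, matching the 'src' convention documented above.
 */
static inline void
example_upload_rect(char *tiled, const char *linear,
                    uint32_t tiled_pitch, int32_t linear_pitch)
{
   const uint32_t cpp = 4;                  /* bytes per RGBA8 pixel */
   const uint32_t x = 10, y = 20, w = 16, h = 16;

   linear_to_tiled(x * cpp, (x + w) * cpp,  /* [xt1, xt2) in bytes */
                   y, y + h,                /* [yt1, yt2) in rows  */
                   tiled, linear,
                   tiled_pitch, linear_pitch,
                   false /* no bit-6 swizzling */,
                   ISL_TILING_X,
                   ISL_MEMCPY);
}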
1404 
1405 /**
1406  * Copy from tiled to linear texture.
1407  *
1408  * Divide the region given by X range [xt1, xt2) and Y range [yt1, yt2) into
1409  * pieces that do not cross tile boundaries and copy each piece with a tile
1410  * copy function (\ref tile_copy_fn).
1411  * The X range is in bytes, i.e. pixels * bytes-per-pixel.
1412  * The Y range is in pixels (i.e. unitless).
1413  * 'dst' is the address of (xt1, yt1) in the destination linear texture.
1414  * 'src' is the address of (0, 0) in the source tiled texture.
1415  */
1416 static void
1417 tiled_to_linear(uint32_t xt1, uint32_t xt2,
1418                       uint32_t yt1, uint32_t yt2,
1419                       char *dst, const char *src,
1420                       int32_t dst_pitch, uint32_t src_pitch,
1421                       bool has_swizzling,
1422                       enum isl_tiling tiling,
1423                       isl_memcpy_type copy_type)
1424 {
1425    tile_copy_fn tile_copy;
1426    uint32_t xt0, xt3;
1427    uint32_t yt0, yt3;
1428    uint32_t xt, yt;
1429    uint32_t tw, th, span;
1430    uint32_t swizzle_bit = has_swizzling ? 1<<6 : 0;
1431 
1432    if (tiling == ISL_TILING_X) {
1433       tw = xtile_width;
1434       th = xtile_height;
1435       span = xtile_span;
1436       tile_copy = xtiled_to_linear_faster;
1437    } else if (tiling == ISL_TILING_Y0) {
1438       tw = ytile_width;
1439       th = ytile_height;
1440       span = ytile_span;
1441       tile_copy = ytiled_to_linear_faster;
1442    } else if (tiling == ISL_TILING_4) {
1443       tw = ytile_width;
1444       th = ytile_height;
1445       span = ytile_span;
1446       tile_copy = tile4_to_linear_faster;
1447    } else {
1448       unreachable("unsupported tiling");
1449    }
1450 
1451 #if defined(INLINE_SSE41)
1452    if (copy_type == ISL_MEMCPY_STREAMING_LOAD) {
1453       /* The hidden cacheline-sized register used by movntdqa can apparently
1454        * return stale data, so issue an mfence to invalidate it.
1455        */
1456       _mm_mfence();
1457    }
1458 #endif
1459 
1460    /* Round out to tile boundaries. */
1461    xt0 = ALIGN_DOWN(xt1, tw);
1462    xt3 = ALIGN_UP  (xt2, tw);
1463    yt0 = ALIGN_DOWN(yt1, th);
1464    yt3 = ALIGN_UP  (yt2, th);
1465 
1466    /* Loop over all tiles to which we have something to copy.
1467     * 'xt' and 'yt' are the origin of the source tile, whether copying
1468     * a full or partial tile.
1469     * tile_copy() copies one tile or partial tile.
1470     * Looping x inside y is the faster memory access pattern.
1471     */
1472    for (yt = yt0; yt < yt3; yt += th) {
1473       for (xt = xt0; xt < xt3; xt += tw) {
1474          /* The area to update is [x0,x3) x [y0,y1).
1475           * May not want the whole tile, hence the min and max.
1476           */
1477          uint32_t x0 = MAX2(xt1, xt);
1478          uint32_t y0 = MAX2(yt1, yt);
1479          uint32_t x3 = MIN2(xt2, xt + tw);
1480          uint32_t y1 = MIN2(yt2, yt + th);
1481 
1482          /* [x0,x3) is split into [x0,x1), [x1,x2), [x2,x3) such that
1483           * the middle interval is the longest span-aligned part.
1484           * The sub-ranges could be empty.
1485           */
1486          uint32_t x1, x2;
1487          x1 = ALIGN_UP(x0, span);
1488          if (x1 > x3)
1489             x1 = x2 = x3;
1490          else
1491             x2 = ALIGN_DOWN(x3, span);
1492 
1493          assert(x0 <= x1 && x1 <= x2 && x2 <= x3);
1494          assert(x1 - x0 < span && x3 - x2 < span);
1495          assert(x3 - x0 <= tw);
1496          assert((x2 - x1) % span == 0);
1497 
1498          /* Translate by (xt,yt) for single-tile copier. */
1499          tile_copy(x0-xt, x1-xt, x2-xt, x3-xt,
1500                    y0-yt, y1-yt,
1501                    dst + (ptrdiff_t)xt - xt1 + ((ptrdiff_t)yt - yt1) * dst_pitch,
1502                    src + (ptrdiff_t)xt * th  +  (ptrdiff_t)yt        * src_pitch,
1503                    dst_pitch,
1504                    swizzle_bit,
1505                    copy_type);
1506       }
1507    }
1508 }
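
/* Hypothetical usage sketch (example_readback_rect and all values here are
 * only illustrative): read a 16x16-pixel RGBA8 sub-rectangle at (10, 20)
 * back out of a Y-tiled surface, swapping R and B on the way.
 * ISL_MEMCPY_BGRA8 is expected to make choose_copy_function() pick
 * rgba8_copy(); 'linear' must point at where pixel (10, 20) lands in the
 * destination, matching the 'dst' convention documented above.
 */
static inline void
example_readback_rect(char *linear, const char *tiled,
                      int32_t linear_pitch, uint32_t tiled_pitch)
{
   const uint32_t cpp = 4;                  /* bytes per RGBA8 pixel */
   const uint32_t x = 10, y = 20, w = 16, h = 16;

   tiled_to_linear(x * cpp, (x + w) * cpp,  /* [xt1, xt2) in bytes */
                   y, y + h,                /* [yt1, yt2) in rows  */
                   linear, tiled,
                   linear_pitch, tiled_pitch,
                   false /* no bit-6 swizzling */,
                   ISL_TILING_Y0,
                   ISL_MEMCPY_BGRA8);
}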
1509