xref: /aosp_15_r20/external/gmmlib/Source/GmmLib/Utility/CpuSwizzleBlt/CpuSwizzleBlt.c (revision 35ffd701415c9e32e53136d61a677a8d0a8fc4a5)
1 /*==============================================================================
2 Copyright(c) 2017 Intel Corporation
3 
4 Permission is hereby granted, free of charge, to any person obtaining a
5 copy of this software and associated documentation files(the "Software"),
6 to deal in the Software without restriction, including without limitation
7 the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 and / or sell copies of the Software, and to permit persons to whom the
9 Software is furnished to do so, subject to the following conditions:
10 
11 The above copyright notice and this permission notice shall be included
12 in all copies or substantial portions of the Software.
13 
14 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 OTHER DEALINGS IN THE SOFTWARE.
21 ============================================================================*/
22 // clang-format off
23 // CpuSwizzleBlt.c - Surface swizzling definitions and BLT functionality.
24 
25 // [!] File serves as its own header:
26 //      #define INCLUDE_CpuSwizzleBlt_c_AS_HEADER
27 //      #include "CpuSwizzleBlt.c"
28 
29 #define SUB_ELEMENT_SUPPORT         // Support for Partial Element Transfer (e.g. separating/merging depth-stencil).
30 #define INTEL_TILE_W_SUPPORT        // Stencil Only;
31 
32 #ifndef CpuSwizzleBlt_INCLUDED
33 
34 #ifdef __cplusplus
35 extern "C" {
36 #endif
37 
38 // Background ##################################################################
39 
40 /* Pixel-based surfaces commonly stored in memory row-by-row. This convention
41 has simple "y * Pitch + x" addressing but has spatial locality only in
42 horizontal direction--i.e. horizontal pixel neighbors stored next to each other
43 but vertical neighbors stored entire pitch away.
44 
45 Since many graphics operations involve multi-dimensional data access, to
46 improve cache/memory access performance it is often more beneficial to use
47 alternative storage conventions which have multi-dimensional spatial locality--
48 i.e. where pixels tend to be stored near both their horizontal and vertical
49 neighbors.
50 
51 "Tiling/Swizzling" is storage convention that increases multi-dimensional
52 spatial locality by treating surface as series of smaller regions/"tiles",
53 laid out in row-major order across surface, with entire content of each tile
54 stored contiguously. Data within each tile is stored in pattern that further
55 maximizes the locality. */
56 
57 
58 // Swizzle Descriptors #########################################################
59 
60 /* Tile sizes always powers of 2 and chosen to be architecturally convenient--
61 e.g. 4KB to match physical page size. Tile dimensions also powers of 2, usually
62 chosen to produce square tiles for targeted pixel size--e.g. 4KB = 128 bytes x
63 32 rows = 32 x 32 pixels @ 4 bytes-per-pixel.
64 
65 Since tile size and dimensions all powers of two, the spatial-to-linear mapping
66 required to store a tile can be trivial: spatial indexing bits can simply be
67 mapped to linear offset bits--e.g. for a 4KB, 128x32 tile...each byte within
68 tile can be referenced with a 7-bit X index and 5-bit Y index--and each of
69 those 12 index bits can be individually mapped to a bit in the 12-bit offset of
70 the tile's linear storage.
71 
72 The order in which spatial index bits are mapped to linear offset bits
73 determines the spatial locality properties of the surface data. E.g. the
74 following mapping...
75 
76     Linear[11:0] = Y4 Y3 Y2 Y1 Y0 X6 X5 X4 X3 X2 X1 X0
77                    \-- Y[4:0] --/ \----- X[6:0] -----/
78 
79 ...stores bytes of tile in row-major order, with horizontal neighbors stored
80 contiguously and vertical neighbors stored 128 bytes away. If instead, Y index
81 bits were mapped to the low-order...
82 
83     Linear[11:0] = X6 X5 X4 X3 X2 X1 X0 Y4 Y3 Y2 Y1 Y0
84                    \----- X[6:0] -----/ \-- Y[4:0] --/
85 
86 ...bytes of tile would be stored in column-major order, with vertical neighbors
87 stored contiguously and horizontal neighbors stored 32 bytes away.
88 
89 Individual X and Y bits can be separated and interspersed in mapping to
90 increase locality via sub-tiling--e.g...
91 
92     Linear[11:0] = Y4 Y3 Y2 X6 X5 X4 Y1 Y0 X3 X2 X1 X0
93                                      \-- Sub-Tile ---/
94 
95 ...subdivies tile into 16x4 sub-tiles laid out in row-major order across tile,
96 with sub-tile content further stored in row-major order, with horizontal byte
97 neighbors within sub-tile stored contiguously and vertical neighbors only 16
98 bytes away. This means single 64-byte cache line contains 4x4 group of 32bpp
99 pixels--which is powerful spatial locality for graphics processing.
100 
101 If mappings restricted to being "parallel" for index bits (i.e. bits of given
102 index can change position but not relative order during mapping), then bit
103 indexes need not be explicitly denoted--e.g. the previous sub-tiling mapping
104 can be represented as...
105 
106     Linear[11:0] = Y Y Y X X X Y Y X X X X
107 
108 ...where X and Y index bits are implied to be zero-based-counted in order they
109 are encountered.
110 
111 In software, spatial-to-linear mapping conveniently described with bit mask for
112 each dimension, where a set bit indicates the next bit of that dimension's
113 index is mapped to that position in the linear offset--e.g....
114 
115     Linear[11:0] = Y Y Y X X X Y Y X X X X
116     MaskX =        0 0 0 1 1 1 0 0 1 1 1 1
117     MaskY =        1 1 1 0 0 0 1 1 0 0 0 0
118 
119 Such dimensional masks all that's needed to describe given tiling/swizzling
120 convention, since tile size and dimensions can be derived from the masks:
121 
122     TileWidth =  2 ^ NumberOfSetBits(MaskX)
123     TileHeight = 2 ^ NumberOfSetBits(MaskY)
124     TileSize =   2 ^ NumberOfSetBits(MaskX OR MaskY)
125 
126 Tiling/swizzling is not limited to 2D. With addition of another tile dimension,
127 spatial locality for 3D or MSAA sample neighbors can be controlled, also. */
128 
129     typedef struct  _SWIZZLE_DESCRIPTOR {
130         struct          _SWIZZLE_DESCRIPTOR_MASKS {
131             int             x, y, z;
132         }               Mask;
133     }               SWIZZLE_DESCRIPTOR;
134 
135     typedef enum _EXTERNAL_SWIZZLE_NAME
136     {
137         TILEX = 0,
138         TILEY,
139         TILEW,
140         TILEYS,
141         TILEYF
142     }EXTERNAL_SWIZZLE_NAME;
143 
144     typedef enum  _EXTERNAL_RES_TYPE{
145         Res_2D = 0,
146         Res_3D = 1,
147         MSAA_2X,
148         MSAA_4X,
149         MSAA_8X,
150         MSAA_16X
151     }EXTERNAL_RES_TYPE;
152 
153     // Definition Helper Macros...
154     #define X ,'x'
155     #define Y ,'y'
156     #define Z ,'z'
157     #define S ,'z' // S = MSAA Sample Index
158     #define o ,0   // o = N/A Swizzle Bit
159     #ifdef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
160         #define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
161             extern const SWIZZLE_DESCRIPTOR Name;
162     #else // C Compile...
163         #define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
164             const SWIZZLE_DESCRIPTOR Name = \
165                 { (b15 == 'x' ? 0x8000 : 0) + (b14 == 'x' ? 0x4000 : 0) + (b13 == 'x' ? 0x2000 : 0) + (b12 == 'x' ? 0x1000 : 0) + (b11 == 'x' ? 0x0800 : 0) + (b10 == 'x' ? 0x0400 : 0) + (b9 == 'x' ? 0x0200 : 0) + (b8 == 'x' ? 0x0100 : 0) + (b7 == 'x' ? 0x0080 : 0) + (b6 == 'x' ? 0x0040 : 0) + (b5 == 'x' ? 0x0020 : 0) + (b4 == 'x' ? 0x0010 : 0) + (b3 == 'x' ? 0x0008 : 0) + (b2 == 'x' ? 0x0004 : 0) + (b1 == 'x' ? 0x0002 : 0) + (b0 == 'x' ? 0x0001 : 0), \
166                   (b15 == 'y' ? 0x8000 : 0) + (b14 == 'y' ? 0x4000 : 0) + (b13 == 'y' ? 0x2000 : 0) + (b12 == 'y' ? 0x1000 : 0) + (b11 == 'y' ? 0x0800 : 0) + (b10 == 'y' ? 0x0400 : 0) + (b9 == 'y' ? 0x0200 : 0) + (b8 == 'y' ? 0x0100 : 0) + (b7 == 'y' ? 0x0080 : 0) + (b6 == 'y' ? 0x0040 : 0) + (b5 == 'y' ? 0x0020 : 0) + (b4 == 'y' ? 0x0010 : 0) + (b3 == 'y' ? 0x0008 : 0) + (b2 == 'y' ? 0x0004 : 0) + (b1 == 'y' ? 0x0002 : 0) + (b0 == 'y' ? 0x0001 : 0), \
167                   (b15 == 'z' ? 0x8000 : 0) + (b14 == 'z' ? 0x4000 : 0) + (b13 == 'z' ? 0x2000 : 0) + (b12 == 'z' ? 0x1000 : 0) + (b11 == 'z' ? 0x0800 : 0) + (b10 == 'z' ? 0x0400 : 0) + (b9 == 'z' ? 0x0200 : 0) + (b8 == 'z' ? 0x0100 : 0) + (b7 == 'z' ? 0x0080 : 0) + (b6 == 'z' ? 0x0040 : 0) + (b5 == 'z' ? 0x0020 : 0) + (b4 == 'z' ? 0x0010 : 0) + (b3 == 'z' ? 0x0008 : 0) + (b2 == 'z' ? 0x0004 : 0) + (b1 == 'z' ? 0x0002 : 0) + (b0 == 'z' ? 0x0001 : 0) }
168 #endif
169     #define SWIZZLE(__SWIZZLE_Args) __SWIZZLE __SWIZZLE_Args
170 
171     // Legacy Intel Tiling Swizzles...
172     SWIZZLE(( INTEL_TILE_X              o o o o Y Y Y X X X X X X X X X ));
173     SWIZZLE(( INTEL_TILE_Y              o o o o X X X Y Y Y Y Y X X X X ));
174 
175     #ifdef INTEL_TILE_W_SUPPORT
176         SWIZZLE(( INTEL_TILE_W          o o o o X X X Y Y Y Y X Y X Y X ));
177     #endif
178 // Gen9 Swizzles...
179     SWIZZLE(( INTEL_TILE_YF_128         o o o o X Y X Y X X Y Y X X X X ));
180     SWIZZLE(( INTEL_TILE_YF_64          o o o o X Y X Y X X Y Y X X X X ));
181     SWIZZLE(( INTEL_TILE_YF_32          o o o o X Y X Y X Y Y Y X X X X ));
182     SWIZZLE(( INTEL_TILE_YF_16          o o o o X Y X Y X Y Y Y X X X X ));
183     SWIZZLE(( INTEL_TILE_YF_8           o o o o X Y X Y Y Y Y Y X X X X ));
184 
185     SWIZZLE(( INTEL_TILE_YS_128         X Y X Y X Y X Y X X Y Y X X X X ));
186     SWIZZLE(( INTEL_TILE_YS_64          X Y X Y X Y X Y X X Y Y X X X X ));
187     SWIZZLE(( INTEL_TILE_YS_32          X Y X Y X Y X Y X Y Y Y X X X X ));
188     SWIZZLE(( INTEL_TILE_YS_16          X Y X Y X Y X Y X Y Y Y X X X X ));
189     SWIZZLE(( INTEL_TILE_YS_8           X Y X Y X Y X Y Y Y Y Y X X X X ));
190 
191     SWIZZLE(( INTEL_TILE_YF_MSAA2_128   o o o o S Y X Y X X Y Y X X X X ));
192     SWIZZLE(( INTEL_TILE_YF_MSAA2_64    o o o o S Y X Y X X Y Y X X X X ));
193     SWIZZLE(( INTEL_TILE_YF_MSAA2_32    o o o o S Y X Y X Y Y Y X X X X ));
194     SWIZZLE(( INTEL_TILE_YF_MSAA2_16    o o o o S Y X Y X Y Y Y X X X X ));
195     SWIZZLE(( INTEL_TILE_YF_MSAA2_8     o o o o S Y X Y Y Y Y Y X X X X ));
196 
197     SWIZZLE(( INTEL_TILE_YS_MSAA2_128   S Y X Y X Y X Y X X Y Y X X X X ));
198     SWIZZLE(( INTEL_TILE_YS_MSAA2_64    S Y X Y X Y X Y X X Y Y X X X X ));
199     SWIZZLE(( INTEL_TILE_YS_MSAA2_32    S Y X Y X Y X Y X Y Y Y X X X X ));
200     SWIZZLE(( INTEL_TILE_YS_MSAA2_16    S Y X Y X Y X Y X Y Y Y X X X X ));
201     SWIZZLE(( INTEL_TILE_YS_MSAA2_8     S Y X Y X Y X Y Y Y Y Y X X X X ));
202 
203     SWIZZLE(( INTEL_TILE_YF_MSAA4_128   o o o o S S X Y X X Y Y X X X X ));
204     SWIZZLE(( INTEL_TILE_YF_MSAA4_64    o o o o S S X Y X X Y Y X X X X ));
205     SWIZZLE(( INTEL_TILE_YF_MSAA4_32    o o o o S S X Y X Y Y Y X X X X ));
206     SWIZZLE(( INTEL_TILE_YF_MSAA4_16    o o o o S S X Y X Y Y Y X X X X ));
207     SWIZZLE(( INTEL_TILE_YF_MSAA4_8     o o o o S S X Y Y Y Y Y X X X X ));
208 
209     SWIZZLE(( INTEL_TILE_YS_MSAA4_128   S S X Y X Y X Y X X Y Y X X X X ));
210     SWIZZLE(( INTEL_TILE_YS_MSAA4_64    S S X Y X Y X Y X X Y Y X X X X ));
211     SWIZZLE(( INTEL_TILE_YS_MSAA4_32    S S X Y X Y X Y X Y Y Y X X X X ));
212     SWIZZLE(( INTEL_TILE_YS_MSAA4_16    S S X Y X Y X Y X Y Y Y X X X X ));
213     SWIZZLE(( INTEL_TILE_YS_MSAA4_8     S S X Y X Y X Y Y Y Y Y X X X X ));
214 
215     SWIZZLE(( INTEL_TILE_YF_MSAA8_128   o o o o S S S Y X X Y Y X X X X ));
216     SWIZZLE(( INTEL_TILE_YF_MSAA8_64    o o o o S S S Y X X Y Y X X X X ));
217     SWIZZLE(( INTEL_TILE_YF_MSAA8_32    o o o o S S S Y X Y Y Y X X X X ));
218     SWIZZLE(( INTEL_TILE_YF_MSAA8_16    o o o o S S S Y X Y Y Y X X X X ));
219     SWIZZLE(( INTEL_TILE_YF_MSAA8_8     o o o o S S S Y Y Y Y Y X X X X ));
220 
221     SWIZZLE(( INTEL_TILE_YS_MSAA8_128   S S S Y X Y X Y X X Y Y X X X X ));
222     SWIZZLE(( INTEL_TILE_YS_MSAA8_64    S S S Y X Y X Y X X Y Y X X X X ));
223     SWIZZLE(( INTEL_TILE_YS_MSAA8_32    S S S Y X Y X Y X Y Y Y X X X X ));
224     SWIZZLE(( INTEL_TILE_YS_MSAA8_16    S S S Y X Y X Y X Y Y Y X X X X ));
225     SWIZZLE(( INTEL_TILE_YS_MSAA8_8     S S S Y X Y X Y Y Y Y Y X X X X ));
226 
227     SWIZZLE(( INTEL_TILE_YF_MSAA16_128  o o o o S S S S X X Y Y X X X X ));
228     SWIZZLE(( INTEL_TILE_YF_MSAA16_64   o o o o S S S S X X Y Y X X X X ));
229     SWIZZLE(( INTEL_TILE_YF_MSAA16_32   o o o o S S S S X Y Y Y X X X X ));
230     SWIZZLE(( INTEL_TILE_YF_MSAA16_16   o o o o S S S S X Y Y Y X X X X ));
231     SWIZZLE(( INTEL_TILE_YF_MSAA16_8    o o o o S S S S Y Y Y Y X X X X ));
232 
233     SWIZZLE(( INTEL_TILE_YS_MSAA16_128  S S S S X Y X Y X X Y Y X X X X ));
234     SWIZZLE(( INTEL_TILE_YS_MSAA16_64   S S S S X Y X Y X X Y Y X X X X ));
235     SWIZZLE(( INTEL_TILE_YS_MSAA16_32   S S S S X Y X Y X Y Y Y X X X X ));
236     SWIZZLE(( INTEL_TILE_YS_MSAA16_16   S S S S X Y X Y X Y Y Y X X X X ));
237     SWIZZLE(( INTEL_TILE_YS_MSAA16_8    S S S S X Y X Y Y Y Y Y X X X X ));
238 
239     SWIZZLE(( INTEL_TILE_YF_3D_128      o o o o Y Z X X Z Z Y Y X X X X ));
240     SWIZZLE(( INTEL_TILE_YF_3D_64       o o o o Y Z X X Z Z Y Y X X X X ));
241     SWIZZLE(( INTEL_TILE_YF_3D_32       o o o o Y Z X Y Z Z Y Y X X X X ));
242     SWIZZLE(( INTEL_TILE_YF_3D_16       o o o o Y Z Y Z Z Z Y Y X X X X ));
243     SWIZZLE(( INTEL_TILE_YF_3D_8        o o o o Y Z Y Z Z Z Y Y X X X X ));
244 
245     SWIZZLE(( INTEL_TILE_YS_3D_128      X Y Z X Y Z X X Z Z Y Y X X X X ));
246     SWIZZLE(( INTEL_TILE_YS_3D_64       X Y Z X Y Z X X Z Z Y Y X X X X ));
247     SWIZZLE(( INTEL_TILE_YS_3D_32       X Y Z X Y Z X Y Z Z Y Y X X X X ));
248     SWIZZLE(( INTEL_TILE_YS_3D_16       X Y Z X Y Z Y Z Z Z Y Y X X X X ));
249     SWIZZLE(( INTEL_TILE_YS_3D_8        X Y Z X Y Z Y Z Z Z Y Y X X X X ));
250 
251     // XE_HP_SDV Swizzles...
252     SWIZZLE(( INTEL_TILE_4              o o o o Y Y X Y X X Y Y X X X X ));
253 
254     SWIZZLE(( INTEL_TILE_64_128         Y X X X Y Y X Y X X Y Y X X X X ));
255     SWIZZLE(( INTEL_TILE_64_64          Y X X X Y Y X Y X X Y Y X X X X ));
256     SWIZZLE(( INTEL_TILE_64_32          Y Y X X Y Y X Y X X Y Y X X X X ));
257     SWIZZLE(( INTEL_TILE_64_16          Y Y X X Y Y X Y X X Y Y X X X X ));
258     SWIZZLE(( INTEL_TILE_64_8           Y Y Y X Y Y X Y X X Y Y X X X X ));
259 
260     SWIZZLE(( INTEL_TILE_64_MSAA2_128   Y X X X Y Y X Y S X Y Y X X X X ));
261     SWIZZLE(( INTEL_TILE_64_MSAA2_64    Y X X X Y Y X Y S X Y Y X X X X ));
262     SWIZZLE(( INTEL_TILE_64_MSAA2_32    Y Y X X Y Y X Y S X Y Y X X X X ));
263     SWIZZLE(( INTEL_TILE_64_MSAA2_16    Y Y X X Y Y X Y S X Y Y X X X X ));
264     SWIZZLE(( INTEL_TILE_64_MSAA2_8     Y Y Y X Y Y X Y S X Y Y X X X X ));
265 
266     SWIZZLE(( INTEL_TILE_64_MSAA_128    Y X X X Y Y X S S X Y Y X X X X ));
267     SWIZZLE(( INTEL_TILE_64_MSAA_64     Y X X X Y Y X S S X Y Y X X X X ));
268     SWIZZLE(( INTEL_TILE_64_MSAA_32     Y Y X X Y Y X S S X Y Y X X X X ));
269     SWIZZLE(( INTEL_TILE_64_MSAA_16     Y Y X X Y Y X S S X Y Y X X X X ));
270     SWIZZLE(( INTEL_TILE_64_MSAA_8      Y Y Y X Y Y X S S X Y Y X X X X ));
271 
272     SWIZZLE(( INTEL_TILE_64_3D_128      Z Z Y X X X Z Y Z X Y Y X X X X ));
273     SWIZZLE(( INTEL_TILE_64_3D_64       Z Z Y X X X Z Y Z X Y Y X X X X ));
274     SWIZZLE(( INTEL_TILE_64_3D_32       Z Z Y X Y X Z Y Z X Y Y X X X X ));
275     SWIZZLE(( INTEL_TILE_64_3D_16       Z Z Z Y Y X Z Y Z X Y Y X X X X ));
276     SWIZZLE(( INTEL_TILE_64_3D_8        Z Z Z X Y Y Z Y Z X Y Y X X X X ));
277 
278     //Tile64 updated layout for Render Compression 256B and Physical L3
279 
280     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_128   Y X X X Y Y X S X X Y Y X X X X ));
281     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_64    Y Y X X Y Y X S X X Y Y X X X X ));
282     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_32    Y Y Y X Y Y X S X X Y Y X X X X ));
283     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_16    Y Y Y X Y Y X S X X Y Y X X X X ));
284     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_8     Y Y Y Y Y Y X S X X Y Y X X X X ));
285 
286     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_128   Y X X X Y Y S S X X Y Y X X X X ));
287     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_64    Y X X X Y Y S S X X Y Y X X X X ));
288     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_32    Y Y X X Y Y S S X X Y Y X X X X ));
289     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_16    Y Y X X Y Y S S X X Y Y X X X X ));
290     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_8     Y Y Y X Y Y S S X X Y Y X X X X ));
291 
292     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_128   Y Y X X Y X S S S X Y Y X X X X ));
293     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_64    Y Y X X Y X S S S X Y Y X X X X ));
294     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_32    Y Y X X Y X S S S X Y Y X X X X ));
295     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_16    Y Y Y X Y X S S S X Y Y X X X X ));
296     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_8     Y Y Y X Y X S S S X Y Y X X X X ));
297 
298     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_128   Y X X X Y X S S S S Y Y X X X X ));
299     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_64    Y Y X X Y X S S S S Y Y X X X X ));
300     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_32    Y Y X X Y X S S S S Y Y X X X X ));
301     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_16    Y Y X X Y X S S S S Y Y X X X X ));
302     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_8     Y Y Y X Y X S S S S Y Y X X X X ));
303 
304     SWIZZLE(( INTEL_TILE_64_V2_3D_128      Z Z Y X X Y Z Z X X Y Y X X X X ));
305     SWIZZLE(( INTEL_TILE_64_V2_3D_64       Z Z Y X X Y Z Z X X Y Y X X X X ));
306     SWIZZLE(( INTEL_TILE_64_V2_3D_32       Z Z Y X Y Y Z Z X X Y Y X X X X ));
307     SWIZZLE(( INTEL_TILE_64_V2_3D_16       Z Z Z Y Y Y Z Z X X Y Y X X X X ));
308     SWIZZLE(( INTEL_TILE_64_V2_3D_8        Z Z Z Y Y Y Z Z X X Y Y X X X X ));
309 
310 
311     #undef X
312     #undef Y
313     #undef Z
314     #undef S
315     #undef o
316     #undef __SWIZZLE
317     #undef SWIZZLE
318 
319 // Accessing Swizzled Surface ##################################################
320 
321 /* While graphics hardware prefers to access surfaces stored in tiled/swizzled
322 formats, logically accessing such surfaces with CPU-based software is non-
323 trivial when high throughput is goal.
324 
325 This file implements (1) SwizzleOffset function to compute swizzled offset of
326 dimensionally-specified surface byte, and (2) CpuSwizzleBlt function to BLT
327 between linear ("y * pitch + x") and swizzled surfaces--with goal of providing
328 high-performance, swizzling BLT implementation to be used both in production
329 and as a guide for those seeking to understand swizzled access or implement
330 functionality beyond the simple BLT. */
331 
332 // Surface Descriptor for CpuSwizzleBlt function...
333 typedef struct _CPU_SWIZZLE_BLT_SURFACE
334 {
335     void                        *pBase;         // Pointer to surface base.
336     int                         Pitch, Height;  // Row-pitch in bytes, and height, of surface.
337     const SWIZZLE_DESCRIPTOR    *pSwizzle;      // Pointer to surface's swizzle descriptor, or NULL if unswizzled.
338     int                         OffsetX;        // Horizontal offset into surface for BLT rectangle, in bytes.
339     int                         OffsetY;        // Vertical offset into surface for BLT rectangle, in physical/pitch rows.
340     int                         OffsetZ;        // Zero if N/A, or 3D offset into surface for BLT rectangle, in 3D slices or MSAA samples as appropriate.
341 
342     #ifdef SUB_ELEMENT_SUPPORT
343         struct _CPU_SWIZZLE_BLT_SURFACE_ELEMENT
344         {
345             int                     Pitch, Size; // Zero if full-pixel BLT, or pitch and size, in bytes, of pixel element being BLT'ed.
346         }                       Element;
347 
348         /* e.g. to BLT only stencil data from S8D24 surface to S8 surface...
349             Dest.Element.Size = Src.Element.Size = sizeof(S8) = 1;
350             Dest.Element.Pitch = sizeof(S8) = 1;
351             Src.Element.Pitch = sizeof(S8D24) = 4;
352             Src.OffsetX += BYTE_OFFSET_OF_S8_WITHIN_S8D24; */
353     #endif
354 } CPU_SWIZZLE_BLT_SURFACE;
355 
356 extern int SwizzleOffset(const SWIZZLE_DESCRIPTOR *pSwizzle, int Pitch, int OffsetX, int OffsetY, int OffsetZ);
357 extern void CpuSwizzleBlt(CPU_SWIZZLE_BLT_SURFACE *pDest, CPU_SWIZZLE_BLT_SURFACE *pSrc, int CopyWidthBytes, int CopyHeight);
358 
359 #ifdef __cplusplus
360 }
361 #endif
362 
363 #define CpuSwizzleBlt_INCLUDED
364 
365 #endif
366 
367 
368 #ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
369 
370 //#define MINIMALIST                // Use minimalist, unoptimized implementation.
371 
372 #include "assert.h" // Quoted to allow local-directory override.
373 
374 #if(_MSC_VER >= 1400)
375     #include <intrin.h>
376 #elif defined(__ARM_ARCH)
377     #include <sse2neon.h>
378 #elif((defined __clang__) ||(__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
379     #include <cpuid.h>
380     #include <x86intrin.h>
381 #else
382     #error "Unexpected compiler!"
383 #endif
384 
385 
386 // POPCNT: Count Lit Bits...                 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
387 static unsigned char PopCnt4[16] =          {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
388 #define POPCNT4(x)  (PopCnt4[(x) & 0xf])
389 #define POPCNT16(x) (POPCNT4((x) >> 12) + POPCNT4((x) >> 8) + POPCNT4((x) >> 4) + POPCNT4(x))
390 
391 
SwizzleOffset(const SWIZZLE_DESCRIPTOR * pSwizzle,int Pitch,int OffsetX,int OffsetY,int OffsetZ)392 int SwizzleOffset( // ##########################################################
393 
394     /* Return swizzled offset of dimensionally-specified surface byte. */
395 
396     const SWIZZLE_DESCRIPTOR    *pSwizzle,  // Pointer to applicable swizzle descriptor.
397     int                         Pitch,      // Pointer to applicable surface row-pitch.
398     int                         OffsetX,    // Horizontal offset into surface of the target byte, in bytes.
399     int                         OffsetY,    // Vertical offset into surface of the target byte, in physical/pitch rows.
400     int                         OffsetZ)    // Zero if N/A, or 3D offset into surface of the target byte, in 3D slices or MSAA samples as appropriate.
401 
402     /* Given logically-specified (x, y, z) byte within swizzled surface,
403     function returns byte's linear/memory offset from surface's base--i.e. it
404     performs the swizzled, spatial-to-linear mapping.
405 
406     Function makes no real effort to perform optimally, since should only used
407     outside loops in CpuSwizzleBlt and similar functions. If any of this
408     functionality was needed in performance path, a custom implementation
409     should be used that limits itself to functionality specifically needed
410     (probably single-dimension, intra-tile offsets) and uses a fast computation
411     (e.g. LUT's, hard-codings, PDEP). */
412 
413 { // ###########################################################################
414 
415     char PDepSupported = -1; // AVX2/BMI2 PDEP (Parallel Deposit) Instruction
416 
417     int SwizzledOffset; // Return value being computed.
418 
419     int TileWidthBits =  POPCNT16(pSwizzle->Mask.x); // Log2(Tile Width in Bytes)
420     int TileHeightBits = POPCNT16(pSwizzle->Mask.y); // Log2(Tile Height)
421     int TileDepthBits =  POPCNT16(pSwizzle->Mask.z); // Log2(Tile Depth or MSAA Samples)
422     int TileSizeBits =   TileWidthBits + TileHeightBits + TileDepthBits; // Log2(Tile Size in Bytes)
423     int TilesPerRow =    Pitch >> TileWidthBits;     // Surface Width in Tiles
424 
425     int Row, Col;   // Tile grid position on surface, of tile containing specified byte.
426     int x, y, z;    // Position of specified byte within tile that contains it.
427 
428     if(PDepSupported == -1)
429     {
430         #if(_MSC_VER >= 1700)
431             #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
432             int CpuInfo[4];
433             __cpuidex(CpuInfo, 7, 0);
434             PDepSupported = ((CpuInfo[1] & (1 << 8)) != 0); // EBX[8] = BMI2
435         #elif ( defined (__BMI2__ ))
436             #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
437             unsigned int eax, ebx, ecx, edx;
438             __cpuid_count(7, 0, eax, ebx, ecx, edx);
439             PDepSupported = ((ebx & (1 << 8)) != 0); // EBX[8] = BMI2
440         #else
441             #define PDEP(Src, Mask) 0
442             PDepSupported = 0;
443         #endif
444     }
445 
446     assert( // Mutually Exclusive Swizzle Positions...
447         (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) ==
448         (pSwizzle->Mask.x + pSwizzle->Mask.y + pSwizzle->Mask.z));
449 
450     assert( // Swizzle Limited to 16-bit (else expand POPCNT'ing)...
451         (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) < (1 << 16));
452 
453     assert( // Pitch is Multiple of Tile Width...
454         Pitch == ((Pitch >> TileWidthBits) << TileWidthBits));
455 
456     { // Break Positioning into Tile-Granular and Intra-Tile Components...
457         assert((OffsetZ >>       TileDepthBits) == 0); // When dealing with 3D tiling, treat as separate single-tile-deep planes.
458         z =     OffsetZ & ((1 << TileDepthBits) - 1);
459 
460         Row =   OffsetY >>       TileHeightBits;
461         y =     OffsetY & ((1 << TileHeightBits) - 1);
462 
463         Col =   OffsetX >>       TileWidthBits;
464         x =     OffsetX & ((1 << TileWidthBits) - 1);
465     }
466 
467     SwizzledOffset = // Start with surface offset of given tile...
468         (Row * TilesPerRow + Col) << TileSizeBits; // <-- Tiles laid across surface in row-major order.
469 
470     // ...then OR swizzled offset of byte within tile...
471     if(PDepSupported)
472     {
473         SwizzledOffset +=
474             PDEP(x, pSwizzle->Mask.x) +
475             PDEP(y, pSwizzle->Mask.y) +
476             PDEP(z, pSwizzle->Mask.z);
477     }
478     else // PDEP workalike...
479     {
480         int bitIndex = 0, bitMask = 1;
481         int terminationMask = pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z;
482         while(bitMask < terminationMask)
483         {
484             int MaskQ;
485             #define PROCESS(Q) {                    \
486                 MaskQ = bitMask & pSwizzle->Mask.Q; \
487                 SwizzledOffset += Q & MaskQ;        \
488                 Q <<= 1 ^ (MaskQ >> bitIndex);      \
489             }
490             PROCESS(x);
491             PROCESS(y);
492             PROCESS(z);
493 
494             bitIndex++;
495             bitMask <<= 1;
496 
497             #undef PROCESS
498         }
499     }
500 
501     return(SwizzledOffset);
502 }
503 
504 
CpuSwizzleBlt(CPU_SWIZZLE_BLT_SURFACE * pDest,CPU_SWIZZLE_BLT_SURFACE * pSrc,int CopyWidthBytes,int CopyHeight)505 void CpuSwizzleBlt( // #########################################################
506 
507     /* Performs specified swizzling BLT between two given surfaces. */
508 
509     CPU_SWIZZLE_BLT_SURFACE *pDest,         // Pointer to destination surface descriptor.
510     CPU_SWIZZLE_BLT_SURFACE *pSrc,          // Pointer to source surface descriptor.
511     int                     CopyWidthBytes, // Width of BLT rectangle, in bytes.
512     int                     CopyHeight)     // Height of BLT rectangle, in physical/pitch rows.
513 
514     #ifdef SUB_ELEMENT_SUPPORT
515 
516         /* When copying between surfaces with different pixel pitches, specify
517         CopyWidthBytes in terms of unswizzled surface's element-pitches:
518 
519             CopyWidthBytes = CopyWidthPixels * pLinearSurface.Element.Pitch; */
520 
521     #endif
522 
523 { // ###########################################################################
524 
525     CPU_SWIZZLE_BLT_SURFACE *pLinearSurface, *pSwizzledSurface;
526     int LinearToSwizzled;
527 
528     { // One surface swizzled, the other unswizzled (aka "linear")...
529         assert((pDest->pSwizzle != NULL) ^ (pSrc->pSwizzle != NULL));
530 
531         LinearToSwizzled = !pSrc->pSwizzle;
532         if(LinearToSwizzled)
533         {
534             pSwizzledSurface =  pDest;
535             pLinearSurface =    pSrc;
536         }
537         else // Swizzled-to-Linear...
538         {
539             pSwizzledSurface =  pSrc;
540             pLinearSurface =    pDest;
541         }
542     }
543 
544     #ifdef SUB_ELEMENT_SUPPORT
545     {
546         assert( // Either both or neither specified...
547             (pDest->Element.Pitch != 0) == (pSrc->Element.Pitch != 0));
548 
549         assert( // Surfaces agree on transfer element size...
550             pDest->Element.Size == pSrc->Element.Size);
551 
552         assert( // Element pitch not specified without element size...
553             !(pDest->Element.Pitch && !pDest->Element.Size));
554 
555         assert( // Legit element sizes...
556             (pDest->Element.Size <= pDest->Element.Pitch) &&
557             (pSrc->Element.Size <= pSrc->Element.Pitch));
558 
559         assert( // Sub-element CopyWidthBytes in terms of LinearSurface pitch...
560             (pLinearSurface->Element.Pitch == 0) ||
561             ((CopyWidthBytes % pLinearSurface->Element.Pitch) == 0));
562     }
563     #endif
564 
565     { // No surface overrun...
566         int NoOverrun =
567             #ifdef SUB_ELEMENT_SUPPORT
568             (
569                 // Sub-element transfer...
570                 ((pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
571                     (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch)) &&
572                 // No overrun...
573                 ((pLinearSurface->OffsetX + CopyWidthBytes) <=
574                     (pLinearSurface->Pitch +
575                      // CopyWidthBytes's inclusion of uncopied bytes...
576                      (pLinearSurface->Element.Pitch - pLinearSurface->Element.Size))) &&
577                 ((pLinearSurface->OffsetY + CopyHeight) <= pLinearSurface->Height) &&
578                 ((pSwizzledSurface->OffsetX +
579                     // Adjust CopyWidthBytes from being in terms of LinearSurface pitch...
580                     (CopyWidthBytes / pLinearSurface->Element.Pitch * pSwizzledSurface->Element.Pitch)
581                     ) <=
582                     (pSwizzledSurface->Pitch +
583                      // CopyWidthBytes's inclusion of uncopied bytes...
584                      (pSwizzledSurface->Element.Pitch - pSwizzledSurface->Element.Size))) &&
585                 ((pSwizzledSurface->OffsetY + CopyHeight) <= pSwizzledSurface->Height)
586             ) ||
587             #endif
588 
589             ((pDest->OffsetX + CopyWidthBytes) <= pDest->Pitch) &&
590             ((pDest->OffsetY + CopyHeight) <= pDest->Height) &&
591             ((pSrc->OffsetX + CopyWidthBytes) <= pSrc->Pitch) &&
592             ((pSrc->OffsetY + CopyHeight) <= pSrc->Height);
593 
594         assert(NoOverrun);
595     }
596 
597     { // No surface overlap...
598         char *pDest0 = (char *) pDest->pBase;
599         char *pDest1 = (char *) pDest->pBase + pDest->Pitch * CopyHeight;
600         char *pSrc0 =  (char *)  pSrc->pBase;
601         char *pSrc1 =  (char *)  pSrc->pBase +  pSrc->Pitch * CopyHeight;
602 
603         assert(!(
604             ((pDest0 >= pSrc0) && (pDest0 < pSrc1)) ||
605             ((pSrc0 >= pDest0) && (pSrc0 < pDest1))));
606     }
607 
608     {
609         /* BLT will have pointer in each surface between which data will be
610         copied from source to destination. Each pointer will be appropriately
611         incremented/positioned through its surface, as BLT rectangle is
612         traversed. */
613 
614         char *pLinearAddress, *pSwizzledAddress;
615 
616         // Convenient to track traversal in swizzled surface offsets...
617         int x0 = pSwizzledSurface->OffsetX;
618         int x1 = x0 + CopyWidthBytes;
619         int y0 = pSwizzledSurface->OffsetY;
620         int y1 = y0 + CopyHeight;
621         int x, y;
622 
623         // Start linear pointer at specified base...
624         pLinearAddress =
625             (char *) pLinearSurface->pBase +
626             pLinearSurface->OffsetY * pLinearSurface->Pitch +
627             pLinearSurface->OffsetX;
628 
629         #ifdef MINIMALIST // Simple implementation for functional understanding/testing/etc.
630         {
631             #ifdef SUB_ELEMENT_SUPPORT
632                 assert( // No Sub-Element Transfer...
633                     (pLinearSurface->Element.Size == pLinearSurface->Element.Pitch) &&
634                     (pSwizzledSurface->Element.Size == pSwizzledSurface->Element.Pitch));
635             #endif
636 
637             for(y = y0; y < y1; y++)
638             {
639                 for(x = x0; x < x1; x++)
640                 {
641                     pSwizzledAddress =
642                         (char *) pSwizzledSurface->pBase +
643                         SwizzleOffset(
644                             pSwizzledSurface->pSwizzle,
645                             pSwizzledSurface->Pitch,
646                             x, y, pSwizzledSurface->OffsetZ);
647 
648                     if(LinearToSwizzled)
649                     {
650                         *pSwizzledAddress = *pLinearAddress;
651                     }
652                     else
653                     {
654                         *pLinearAddress = *pSwizzledAddress;
655                     }
656 
657                     pLinearAddress++;
658                 }
659 
660                 pLinearAddress += pLinearSurface->Pitch - CopyWidthBytes;
661             }
662         }
663         #else // Production/Performance Implementation...
664         {
665             /* Key Performance Gains from...
666                 (1) Efficient Memory Transfers (Ordering + Instruction)
667                 (2) Minimizing Work in Inner Loops */
668 
669             #if(_MSC_VER >= 1600)
670                 #include <stdint.h>
671 
672                 #pragma warning(push)
673                 #pragma warning(disable:4127) // Constant Conditional Expressions
674 
675                 unsigned long LOW_BIT_Index;
676                 #define LOW_BIT(x)  (_BitScanForward(&LOW_BIT_Index, (x)), LOW_BIT_Index)
677 
678                 unsigned long HIGH_BIT_Index;
679                 #define HIGH_BIT(x) (_BitScanReverse(&HIGH_BIT_Index, (x)), HIGH_BIT_Index)
680             #elif(__GNUC__ >= 4)
681                 #include <stdint.h>
682 
683                 #define LOW_BIT(x)  __builtin_ctz(x)
684                 #define HIGH_BIT(x) ((sizeof(x) * CHAR_BIT - 1) - __builtin_clz(x))
685             #else
686                 #error "Unexpected compiler!"
687             #endif
688 
689             typedef struct ___m24
690             {
691                 uint8_t byte[3];
692             } __m24; // 24-bit/3-byte memory element.
693 
694             // Macros intended to compile to various types of "load register from memory" instructions...
695             #define MOVB_R(  Reg, Src) (*(uint8_t  *)&(Reg) = *(uint8_t  *)(Src))
696             #define MOVW_R(  Reg, Src) (*(uint16_t *)&(Reg) = *(uint16_t *)(Src))
697             #define MOV3_R(  Reg, Src) (*(__m24    *)&(Reg) = *(__m24 *)(Src))
698             #define MOVD_R(  Reg, Src) (*(uint32_t *)&(Reg) = *(uint32_t *)(Src))
699 
700             #define MOVQ_R(  Reg, Src) ((Reg) = _mm_loadl_epi64((__m128i *)(Src)))
701             #define MOVDQ_R( Reg, Src) ((Reg) = _mm_load_si128( (__m128i *)(Src)))
702             #define MOVDQU_R(Reg, Src) ((Reg) = _mm_loadu_si128((__m128i *)(Src)))
703 
704             // As above, but the other half: "store to memory from register"...
705             #define MOVB_M(    Dest, Reg)(*(uint8_t  *)(Dest) = *(uint8_t  *)&(Reg))
706             #define MOVW_M(    Dest, Reg)(*(uint16_t *)(Dest) = *(uint16_t *)&(Reg))
707             #define MOV3_M(    Dest, Reg)(*(__m24    *)(Dest) = *(__m24    *)&(Reg))
708             #define MOVD_M(    Dest, Reg)(*(uint32_t *)(Dest) = *(uint32_t *)&(Reg))
709 
710             #define MOVQ_M(    Dest, Reg)(_mm_storel_epi64((__m128i *)(Dest), (Reg)))
711             #define MOVDQ_M(   Dest, Reg)(_mm_store_si128( (__m128i *)(Dest), (Reg)))
712             #define MOVDQU_M(  Dest, Reg)(_mm_storeu_si128((__m128i *)(Dest), (Reg)))
713             #define MOVNTDQ_M( Dest, Reg)(_mm_stream_si128((__m128i *)(Dest), (Reg)))
714 
715 
716             #define MIN_CONTAINED_POW2_BELOW_CAP(x, Cap) (1 << LOW_BIT((1 << LOW_BIT(x)) | (1 << HIGH_BIT(Cap))))
717 
718             #define SWIZZLE_OFFSET(OffsetX, OffsetY, OffsetZ) \
719                 SwizzleOffset(pSwizzledSurface->pSwizzle, pSwizzledSurface->Pitch, OffsetX, OffsetY, OffsetZ)
720 
721             #define MAX_XFER_WIDTH  16  // See "Compute Transfer Dimensions".
722             #define MAX_XFER_HEIGHT 4   // "
723 
724             char StreamingLoadSupported = -1; // SSE4.1: MOVNTDQA
725 
726             int TileWidthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.x);   // Log2(Tile Width in Bytes)
727             int TileHeightBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.y);  // Log2(Tile Height)
728             int TileDepthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.z);   // Log2(Tile Depth or MSAA Samples)
729             int BytesPerRowOfTiles = pSwizzledSurface->Pitch << (TileDepthBits + TileHeightBits);
730 
731             struct { int LeftCrust, MainRun, RightCrust; } CopyWidth;
732             int MaskX[MAX_XFER_WIDTH + 1], MaskY[MAX_XFER_HEIGHT + 1];
733             int SwizzledOffsetX0, SwizzledOffsetY;
734             struct { int Width, Height; } SwizzleMaxXfer;
735 
736             char *pSwizzledAddressCopyBase =
737                 (char *) pSwizzledSurface->pBase +
738                 SWIZZLE_OFFSET(0, 0, pSwizzledSurface->OffsetZ);
739 
740             assert(sizeof(__m24) == 3);
741 
742             if(StreamingLoadSupported == -1)
743             {
744                 #if(_MSC_VER >= 1500)
745                     #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
746                     int CpuInfo[4];
747                     __cpuid(CpuInfo, 1);
748                     StreamingLoadSupported = ((CpuInfo[2] & (1 << 19)) != 0); // ECX[19] = SSE4.1
749                 #elif(defined(__ARM_ARCH))
750                     #define MOVNTDQA_R(Reg, Src) ((Reg) = (Reg))
751                     StreamingLoadSupported = 0;
752                 #elif((defined __clang__) || (__GNUC__ > 4) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))
753                     #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
754                     unsigned int eax, ebx, ecx, edx;
755                     __cpuid(1, eax, ebx, ecx, edx);
756                     StreamingLoadSupported = ((ecx & (1 << 19)) != 0); // ECX[19] = SSE4.1
757                 #else
758                     #define MOVNTDQA_R(Reg, Src) ((Reg) = (Reg))
759                     StreamingLoadSupported = 0;
760                 #endif
761             }
762 
763             { // Compute Transfer Dimensions...
764 
765                 /* When transferring between linear and swizzled surfaces, we
766                 can't traverse linearly through memory of both since they have
767                 drastically different memory orderings--Moving linearly through
768                 one means bouncing around the other.
769 
770                 Moving linearly through linear surface is more programmatically
771                 convenient--especially when BLT rectangles not constrained to
772                 tile boundaries. But moving linearly through swizzled surface
773                 memory is often more performance-friendly--especially when that
774                 memory is CPU-mapped as WC (Write Combining), which is often
775                 the case for graphics memory.
776 
777                 Fortunately, we can avoid shortcomings of both extremes by
778                 using hybrid traversal: Traverse mostly linearly through linear
779                 surface, but have innermost loop transfer small 2D chunks sized
780                 to use critical runs of linearity in the swizzled memory.
781 
782                 The "critical runs of linearity" that we want to hit in the
783                 sizzled memory are aligned, cache-line-sized memory chunks. If
784                 we bounce around with finer granularity we'll incur penalties
785                 of partial WC buffer use (whether from WC memory use or non-
786                 temporal stores).
787 
788                 The size of 2D chunks with cache-line-sized linearity in
789                 swizzled memory is determined by swizzle mapping's low-order
790                 six bits (for 64-byte cache lines). Most swizzles use
791                 "Y Y X X X X" in their low-order bits, which means their cache
792                 lines store 16x4 chunks--So our implementation will use those
793                 dimensions as our target/maximum 2D transfer chunk. If we had
794                 any 8x8 (or taller) swizzles, we should add such support and
795                 increase our maximum chunk height. If we had any 32x2 swizzles,
796                 we should add such support and increase our maximum chunk width.
797 
798                 Our implementation only bothers optimizing for 2D transfer
799                 chunks stored in row-major order--i.e. those whose swizzle
800                 mapping bits have a series of X's in the low-order, followed by
801                 Y's in the higher-order. Where a swizzle mapping inflection
802                 from Y back to X occurs, contiguous row-ordering is lost, and
803                 we would use that smaller, row-ordered chunk size. */
804 
805                 int TargetMask;
806 
807                 // Narrow optimized transfer Width by looking for inflection from X's...
808                 SwizzleMaxXfer.Width = MAX_XFER_WIDTH;
809                 while(  (TargetMask = SwizzleMaxXfer.Width - 1) &&
810                         ((pSwizzledSurface->pSwizzle->Mask.x & TargetMask) != TargetMask))
811                 {
812                     SwizzleMaxXfer.Width >>= 1;
813                 }
814 
815                 // Narrow optimized transfer height by looking for inflection from Y's...
816                 SwizzleMaxXfer.Height = MAX_XFER_HEIGHT;
817 
818                 while(  (TargetMask = (SwizzleMaxXfer.Height - 1) * SwizzleMaxXfer.Width) &&
819                         ((pSwizzledSurface->pSwizzle->Mask.y & TargetMask) != TargetMask))
820                 {
821                     SwizzleMaxXfer.Height >>= 1;
822                 }
823             }
824 
825             { // Separate CopyWidthBytes into unaligned left/right "crust" and aligned "MainRun"...
826                 int MaxXferWidth = MIN_CONTAINED_POW2_BELOW_CAP(SwizzleMaxXfer.Width, CopyWidthBytes);
827 
828                 CopyWidth.LeftCrust = // i.e. "bytes to xfer-aligned boundary"
829                     (MaxXferWidth - x0) & (MaxXferWidth - 1); // Simplification of ((MaxXferWidth - (x0 % MaxXferWidth)) % MaxXferWidth)
830 
831                 CopyWidth.MainRun =
832                     (CopyWidthBytes - CopyWidth.LeftCrust) & ~(SwizzleMaxXfer.Width - 1); // MainRun is of SwizzleMaxXfer.Width's--not MaxXferWidth's.
833 
834                 CopyWidth.RightCrust = CopyWidthBytes - (CopyWidth.LeftCrust + CopyWidth.MainRun);
835 
836                 #ifdef SUB_ELEMENT_SUPPORT
837                 {
838                     // For partial-pixel transfers, there is no crust and MainRun is done pixel-by-pixel...
839                     if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
840                         (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
841                     {
842                         CopyWidth.LeftCrust = CopyWidth.RightCrust = 0;
843                         CopyWidth.MainRun = CopyWidthBytes;
844                     }
845                 }
846                 #endif
847             }
848 
849 
850             /* Unlike in MINIMALIST implementation, which fully computes
851             swizzled offset for each transfer element, we want to minimize work
852             done in our inner loops.
853 
854             One way we'll reduce work is to separate pSwizzledAddress into
855             dimensional components--e.g. so Y-swizzling doesn't have to be
856             recomputed in X-loop.
857 
858             But a more powerful way we'll reduce work is...Instead of linearly
859             incrementing spatial offsets and then converting to their swizzled
860             counterparts, we'll compute swizzled bases outside the loops and
861             keep them swizzled using swizzled incrementing inside the loops--
862             since swizzled incrementing can be much cheaper than repeatedly
863             swizzling spatial offsets.
864 
865             Intra-tile swizzled incrementing can be done by using the inverse
866             of a spatial component's swizzle mask to ripple-carry a +1 to and
867             across the bits of a currently swizzled value--e.g. with...
868 
869                 SwizzledOffsetY:   Y X Y X Y Y X X X X
870                          ~MaskY:   0 1 0 1 0 0 1 1 1 1
871                                  +                   1
872                                 -----------------------
873 
874             ...set low-order ~MaskY bits will always ripple-carry the
875             incrementing +1 to wherever Y0 happens to be, and wherever there is
876             an arithmetic carry out of one Y position, set ~MaskY bits will
877             carry it across any gaps to the next Y position.
878 
879             The above algorithm only works for adding one, but the mask used
880             can be modified to deliver the +1 to any bit location, so any power
881             of two increment can be achieved.
882 
883             After swizzled increment, residue from mask addition and undesired
884             carries outside targeted fields must be removed using the natural
885             mask--So the final intra-tile swizzled increment is...
886 
887                 SwizzledOffsetQ = (SwizzledOffsetQ + ~MaskQ + 1) & MaskQ
888                     ...where Q is the applicable X/Y/Z dimensional component.
889 
890                 Or since in two's compliment, (~MaskQ + 1) = -MaskQ...
891 
892                 SwizzledOffsetQ = (SwizzledOffsetQ - MaskQ) & MaskQ
893 
894             Since tile sizes are powers of two and tiles laid out in row-major
895             order across surface, the above swizzled incrementing can
896             additionally be used for inter-tile incrementing of X component by
897             extending applicable mask to include offset bits beyond the tile--
898             so arithmetic carries out of intra-tile X component will ripple to
899             advance swizzled inter-tile X offset to next tile. Same is not true
900             of inter-tile Y incrementing since surface pitches not restricted
901             to powers of two. */
902 
903             { // Compute Mask[IncSize] for Needed Increment Values...
904                 int ExtendedMaskX = // Bits beyond the tile (so X incrementing can operate inter-tile)...
905                     ~(pSwizzledSurface->pSwizzle->Mask.x |
906                       pSwizzledSurface->pSwizzle->Mask.y |
907                       pSwizzledSurface->pSwizzle->Mask.z);
908 
909                 /* Subtraction below delivers natural mask for +1 increment,
910                 and appropriately altered mask to deliver +1 to higher bit
911                 positions for +2/4/8/etc. increments. */
912 
913                 for(x = SwizzleMaxXfer.Width; x >= 1; x >>= 1)
914                 {
915                     MaskX[x] = SWIZZLE_OFFSET((1 << TileWidthBits) - x, 0, 0) | ExtendedMaskX;
916                 }
917 
918                 for(y = SwizzleMaxXfer.Height; y >= 1; y >>= 1)
919                 {
920                     MaskY[y] = SWIZZLE_OFFSET(0, (1 << TileHeightBits) - y, 0);
921                 }
922             }
923 
924             { // Base Dimensional Swizzled Offsets...
925                 int IntraTileY = y0 & ((1 << TileHeightBits) - 1);
926                 int TileAlignedY = y0 - IntraTileY;
927 
928                 SwizzledOffsetY = SWIZZLE_OFFSET(0, IntraTileY, 0);
929 
930                 SwizzledOffsetX0 =
931                     SWIZZLE_OFFSET(
932                         x0,
933                         TileAlignedY, // <-- Since SwizzledOffsetX will include "bits beyond the tile".
934                         0);
935             }
936 
937             // BLT Loops ///////////////////////////////////////////////////////
938 
939             /* Traverse BLT rectangle, transferring small, optimally-aligned 2D
940             chunks, as appropriate for given swizzle format. Use swizzled
941             incrementing of dimensional swizzled components. */
942 
943             for(y = y0; y < y1; )
944             {
945                 char *pSwizzledAddressLine = pSwizzledAddressCopyBase + SwizzledOffsetY;
946                 int xferHeight =
947                     // Largest pow2 xfer height that alignment, MaxXfer, and lines left will permit...
948                     MIN_CONTAINED_POW2_BELOW_CAP(y | SwizzleMaxXfer.Height, y1 - y);
949                 int SwizzledOffsetX = SwizzledOffsetX0;
950 
951                 __m128i xmm[MAX_XFER_HEIGHT];
952                 char *pLinearAddressEnd;
953                 int _MaskX;
954 
955                 // XFER Macros /////////////////////////////////////////////////
956 
957                 /* We'll define "XFER" macro to contain BLT X-loop work.
958 
959                 In simple implementation, XFER would be WHILE loop that does
960                 SSE transfer and performs pointer and swizzled offset
961                 incrementing.
962 
963                 ...but we have multiple conditions to handle...
964                   - Transfer Direction (Linear <--> Swizzled)
965                   - Optimal 2D Transfer Chunk Size
966                   - Available/Desired CPU Transfer Instructions
967                   - Unaligned Crust
968 
969                 Don't want X-loop to have conditional logic to handle
970                 variations since would retard performance--but neither do we
971                 want messy multitude of slightly different, copy-pasted code
972                 paths. So instead, XFER macro will provide common code template
973                 allowing instantiation of multiple X-loop variations--i.e. XFER
974                 calls from conditional Y-loop code will expand into separate,
975                 conditional-free, "lean and mean" X-loops.
976 
977                 Some conditional logic remains in XFER chain--but only outside
978                 X-loop. The two IF statements that remain in X-loop (i.e. those
979                 in XFER_LOAD/STORE) expand to compile-time constant conditional
980                 expressions, so with optimizing compiler, no runtime-
981                 conditional code will be generated--i.e. constant conditionals
982                 will simply decide whether given instantiation has that code or
983                 not. */
984 
985                 #define XFER(XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
986                 {                                                                                                   \
987                          XFER_LINES(4, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
988                     else XFER_LINES(2, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
989                     else XFER_LINES(1, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust);\
990                 }
991 
992                 #define XFER_LINES(XFER_LINES_Lines, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
993                     if(xferHeight == (XFER_LINES_Lines))    \
994                     {                                       \
995                         if(XFER_Crust)                      \
996                         {                                   \
997                             XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.LeftCrust  & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
998                             XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.LeftCrust  & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
999                             XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.LeftCrust  & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1000                             XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.LeftCrust  & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1001                         }                                   \
1002                                                             \
1003                         XFER_SPAN(XFER_Store, XFER_Load, CopyWidth.MainRun, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch);\
1004                                                             \
1005                         if(XFER_Crust)                      \
1006                         {                                   \
1007                             XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.RightCrust & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1008                             XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.RightCrust & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1009                             XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.RightCrust & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1010                             XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.RightCrust & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1011                         }                                   \
1012                     }
1013 
1014                 #define XFER_SPAN(XFER_Store, XFER_Load, XFER_CopyWidthBytes, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_Height, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch) \
1015                 {                                                                           \
1016                     pLinearAddressEnd = pLinearAddress + (XFER_CopyWidthBytes);             \
1017                     _MaskX = MaskX[XFER_Pitch_Swizzled];                                    \
1018                     while(pLinearAddress < pLinearAddressEnd)                               \
1019                     {                                                                       \
1020                         pSwizzledAddress = pSwizzledAddressLine + SwizzledOffsetX;          \
1021                                                                                             \
1022                         XFER_LOAD(0, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
1023                         XFER_LOAD(1, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
1024                         XFER_LOAD(2, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
1025                         XFER_LOAD(3, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
1026                         XFER_STORE(0, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1027                         XFER_STORE(1, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1028                         XFER_STORE(2, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1029                         XFER_STORE(3, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1030                                                                                             \
1031                         SwizzledOffsetX = (SwizzledOffsetX - _MaskX) & _MaskX;              \
1032                         pLinearAddress += (XFER_Pitch_Linear);                              \
1033                     }                                                                       \
1034                 }
1035 
1036                 #define XFER_LOAD(XFER_Line, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height) \
1037                 {                                                           \
1038                     if((XFER_Line) < (XFER_Height))                         \
1039                     {                                                       \
1040                         XFER_Load(                                          \
1041                             xmm[XFER_Line],                                 \
1042                             (XFER_pSrc) + (XFER_Line) * (XFER_SrcPitch));   \
1043                     }                                                       \
1044                 }
1045 
1046                 #define XFER_STORE(XFER_Line, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height) \
1047                 {                                                           \
1048                     if((XFER_Line) < (XFER_Height))                         \
1049                     {                                                       \
1050                         XFER_Store(                                         \
1051                             (XFER_pDest) + (XFER_Line) * (XFER_DestPitch),  \
1052                             xmm[XFER_Line]);                                \
1053                     }                                                       \
1054                 }
1055 
1056                 // Perform Applicable Transfer /////////////////////////////////
1057                 assert( // DQ Alignment...
1058                     ((intptr_t) pSwizzledSurface->pBase % 16 == 0) &&
1059                     (pSwizzledSurface->Pitch % 16 == 0));
1060 
1061                 #ifdef SUB_ELEMENT_SUPPORT
1062                     if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
1063                         (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
1064                     {
1065                         if(LinearToSwizzled)
1066                         {
1067                             switch(pLinearSurface->Element.Size)
1068                             {
1069                                 case 16: XFER(MOVNTDQ_M, MOVDQU_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1070                                 case  8: XFER(   MOVQ_M,   MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1071                                 case  4: XFER(   MOVD_M,   MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1072                                 case  3: XFER(   MOV3_M,   MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1073                                 case  2: XFER(   MOVW_M,   MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1074                                 case  1: XFER(   MOVB_M,   MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1075                                 default: assert(0);
1076                             }
1077                         }
1078                         else
1079                         {
1080                             switch(pLinearSurface->Element.Size)
1081                             {
1082                                 case 16:
1083                                 {
1084                                     if(StreamingLoadSupported)
1085                                     {
1086                                         XFER(MOVDQU_M, MOVNTDQA_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
1087                                     }
1088                                     else
1089                                     {
1090                                         XFER(MOVDQU_M,    MOVDQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
1091                                     }
1092                                     break;
1093                                 }
1094                                 case  8: XFER(   MOVQ_M,   MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1095                                 case  4: XFER(   MOVD_M,   MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1096                                 case  3: XFER(   MOV3_M,   MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1097                                 case  2: XFER(   MOVW_M,   MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1098                                 case  1: XFER(   MOVB_M,   MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1099                                 default: assert(0);
1100                             }
1101                         }
1102                     } else
1103                 #endif // SUB_ELEMENT_SUPPORT
1104                 if(LinearToSwizzled)
1105                 {
1106                     switch(SwizzleMaxXfer.Width)
1107                     {
1108                         case 16: XFER(MOVNTDQ_M, MOVDQU_R, 16, 16, pSwizzledAddress, 16, pLinearAddress, pLinearSurface->Pitch, 1); break;
1109                         #ifdef INTEL_TILE_W_SUPPORT
1110                             case  2: XFER(MOVW_M,  MOVW_R,  2,  2, pSwizzledAddress,  2, pLinearAddress, pLinearSurface->Pitch, 1); break;
1111                         #endif
1112                         default: assert(0); // Unexpected cases excluded to save compile time/size of multiplying instantiations.
1113                     }
1114                 }
1115                 else
1116                 {
1117                     switch(SwizzleMaxXfer.Width)
1118                     {
1119                         case 16:
1120                         {
1121                             if(StreamingLoadSupported)
1122                             {
1123                                 XFER(MOVDQU_M, MOVNTDQA_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
1124                             }
1125                             else
1126                             {
1127                                 XFER(MOVDQU_M,    MOVDQ_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
1128                             }
1129                             break;
1130                         }
1131                         #ifdef INTEL_TILE_W_SUPPORT
1132                             case 2: XFER(MOVW_M,   MOVW_R,  2,  2, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress,  2, 1); break;
1133                         #endif
1134                         default: assert(0);
1135                     }
1136                 }
1137 
1138 
1139                 // Swizzled inc of SwizzledOffsetY...
1140                 SwizzledOffsetY = (SwizzledOffsetY - MaskY[xferHeight]) & MaskY[xferHeight];
1141                 if(!SwizzledOffsetY) SwizzledOffsetX0 += BytesPerRowOfTiles; // Wraps advance SwizzledOffsetX0, since that includes "bits beyond the tile".
1142 
1143                 y += xferHeight;
1144 
1145                 /* X-loop only advanced pLinearAddress by CopyWidthBytes--even
1146                 when transferred multiple lines. Advance rest of way: */
1147                 pLinearAddress += xferHeight * pLinearSurface->Pitch - CopyWidthBytes;
1148 
1149             } // foreach(y)
1150 
1151             _mm_sfence(); // Flush Non-Temporal Writes
1152 
1153             #if(_MSC_VER)
1154                 #pragma warning(pop)
1155             #endif
1156         }
1157         #endif
1158     }
1159 } // CpuSwizzleBlt
1160 
1161 #endif // #ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
1162 // clang-format on
1163