xref: /aosp_15_r20/external/gmmlib/Source/GmmLib/Utility/CpuSwizzleBlt/CpuSwizzleBlt.c (revision 35ffd701415c9e32e53136d61a677a8d0a8fc4a5)
1*35ffd701SAndroid Build Coastguard Worker /*==============================================================================
2*35ffd701SAndroid Build Coastguard Worker Copyright(c) 2017 Intel Corporation
3*35ffd701SAndroid Build Coastguard Worker 
4*35ffd701SAndroid Build Coastguard Worker Permission is hereby granted, free of charge, to any person obtaining a
5*35ffd701SAndroid Build Coastguard Worker copy of this software and associated documentation files(the "Software"),
6*35ffd701SAndroid Build Coastguard Worker to deal in the Software without restriction, including without limitation
7*35ffd701SAndroid Build Coastguard Worker the rights to use, copy, modify, merge, publish, distribute, sublicense,
8*35ffd701SAndroid Build Coastguard Worker and / or sell copies of the Software, and to permit persons to whom the
9*35ffd701SAndroid Build Coastguard Worker Software is furnished to do so, subject to the following conditions:
10*35ffd701SAndroid Build Coastguard Worker 
11*35ffd701SAndroid Build Coastguard Worker The above copyright notice and this permission notice shall be included
12*35ffd701SAndroid Build Coastguard Worker in all copies or substantial portions of the Software.
13*35ffd701SAndroid Build Coastguard Worker 
14*35ffd701SAndroid Build Coastguard Worker THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15*35ffd701SAndroid Build Coastguard Worker OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16*35ffd701SAndroid Build Coastguard Worker FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17*35ffd701SAndroid Build Coastguard Worker THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18*35ffd701SAndroid Build Coastguard Worker OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19*35ffd701SAndroid Build Coastguard Worker ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20*35ffd701SAndroid Build Coastguard Worker OTHER DEALINGS IN THE SOFTWARE.
21*35ffd701SAndroid Build Coastguard Worker ============================================================================*/
22*35ffd701SAndroid Build Coastguard Worker // clang-format off
23*35ffd701SAndroid Build Coastguard Worker // CpuSwizzleBlt.c - Surface swizzling definitions and BLT functionality.
24*35ffd701SAndroid Build Coastguard Worker 
25*35ffd701SAndroid Build Coastguard Worker // [!] File serves as its own header:
26*35ffd701SAndroid Build Coastguard Worker //      #define INCLUDE_CpuSwizzleBlt_c_AS_HEADER
27*35ffd701SAndroid Build Coastguard Worker //      #include "CpuSwizzleBlt.c"
28*35ffd701SAndroid Build Coastguard Worker 
29*35ffd701SAndroid Build Coastguard Worker #define SUB_ELEMENT_SUPPORT         // Support for Partial Element Transfer (e.g. separating/merging depth-stencil).
30*35ffd701SAndroid Build Coastguard Worker #define INTEL_TILE_W_SUPPORT        // Stencil Only;
31*35ffd701SAndroid Build Coastguard Worker 
32*35ffd701SAndroid Build Coastguard Worker #ifndef CpuSwizzleBlt_INCLUDED
33*35ffd701SAndroid Build Coastguard Worker 
34*35ffd701SAndroid Build Coastguard Worker #ifdef __cplusplus
35*35ffd701SAndroid Build Coastguard Worker extern "C" {
36*35ffd701SAndroid Build Coastguard Worker #endif
37*35ffd701SAndroid Build Coastguard Worker 
38*35ffd701SAndroid Build Coastguard Worker // Background ##################################################################
39*35ffd701SAndroid Build Coastguard Worker 
40*35ffd701SAndroid Build Coastguard Worker /* Pixel-based surfaces commonly stored in memory row-by-row. This convention
41*35ffd701SAndroid Build Coastguard Worker has simple "y * Pitch + x" addressing but has spatial locality only in
42*35ffd701SAndroid Build Coastguard Worker horizontal direction--i.e. horizontal pixel neighbors stored next to each other
43*35ffd701SAndroid Build Coastguard Worker but vertical neighbors stored entire pitch away.
44*35ffd701SAndroid Build Coastguard Worker 
45*35ffd701SAndroid Build Coastguard Worker Since many graphics operations involve multi-dimensional data access, to
46*35ffd701SAndroid Build Coastguard Worker improve cache/memory access performance it is often more beneficial to use
47*35ffd701SAndroid Build Coastguard Worker alternative storage conventions which have multi-dimensional spatial locality--
48*35ffd701SAndroid Build Coastguard Worker i.e. where pixels tend to be stored near both their horizontal and vertical
49*35ffd701SAndroid Build Coastguard Worker neighbors.
50*35ffd701SAndroid Build Coastguard Worker 
51*35ffd701SAndroid Build Coastguard Worker "Tiling/Swizzling" is storage convention that increases multi-dimensional
52*35ffd701SAndroid Build Coastguard Worker spatial locality by treating surface as series of smaller regions/"tiles",
53*35ffd701SAndroid Build Coastguard Worker laid out in row-major order across surface, with entire content of each tile
54*35ffd701SAndroid Build Coastguard Worker stored contiguously. Data within each tile is stored in pattern that further
55*35ffd701SAndroid Build Coastguard Worker maximizes the locality. */
56*35ffd701SAndroid Build Coastguard Worker 
57*35ffd701SAndroid Build Coastguard Worker 
58*35ffd701SAndroid Build Coastguard Worker // Swizzle Descriptors #########################################################
59*35ffd701SAndroid Build Coastguard Worker 
60*35ffd701SAndroid Build Coastguard Worker /* Tile sizes always powers of 2 and chosen to be architecturally convenient--
61*35ffd701SAndroid Build Coastguard Worker e.g. 4KB to match physical page size. Tile dimensions also powers of 2, usually
62*35ffd701SAndroid Build Coastguard Worker chosen to produce square tiles for targeted pixel size--e.g. 4KB = 128 bytes x
63*35ffd701SAndroid Build Coastguard Worker 32 rows = 32 x 32 pixels @ 4 bytes-per-pixel.
64*35ffd701SAndroid Build Coastguard Worker 
65*35ffd701SAndroid Build Coastguard Worker Since tile size and dimensions all powers of two, the spatial-to-linear mapping
66*35ffd701SAndroid Build Coastguard Worker required to store a tile can be trivial: spatial indexing bits can simply be
67*35ffd701SAndroid Build Coastguard Worker mapped to linear offset bits--e.g. for a 4KB, 128x32 tile...each byte within
68*35ffd701SAndroid Build Coastguard Worker tile can be referenced with a 7-bit X index and 5-bit Y index--and each of
69*35ffd701SAndroid Build Coastguard Worker those 12 index bits can be individually mapped to a bit in the 12-bit offset of
70*35ffd701SAndroid Build Coastguard Worker the tile's linear storage.
71*35ffd701SAndroid Build Coastguard Worker 
72*35ffd701SAndroid Build Coastguard Worker The order in which spatial index bits are mapped to linear offset bits
73*35ffd701SAndroid Build Coastguard Worker determines the spatial locality properties of the surface data. E.g. the
74*35ffd701SAndroid Build Coastguard Worker following mapping...
75*35ffd701SAndroid Build Coastguard Worker 
76*35ffd701SAndroid Build Coastguard Worker     Linear[11:0] = Y4 Y3 Y2 Y1 Y0 X6 X5 X4 X3 X2 X1 X0
77*35ffd701SAndroid Build Coastguard Worker                    \-- Y[4:0] --/ \----- X[6:0] -----/
78*35ffd701SAndroid Build Coastguard Worker 
79*35ffd701SAndroid Build Coastguard Worker ...stores bytes of tile in row-major order, with horizontal neighbors stored
80*35ffd701SAndroid Build Coastguard Worker contiguously and vertical neighbors stored 128 bytes away. If instead, Y index
81*35ffd701SAndroid Build Coastguard Worker bits were mapped to the low-order...
82*35ffd701SAndroid Build Coastguard Worker 
83*35ffd701SAndroid Build Coastguard Worker     Linear[11:0] = X6 X5 X4 X3 X2 X1 X0 Y4 Y3 Y2 Y1 Y0
84*35ffd701SAndroid Build Coastguard Worker                    \----- X[6:0] -----/ \-- Y[4:0] --/
85*35ffd701SAndroid Build Coastguard Worker 
86*35ffd701SAndroid Build Coastguard Worker ...bytes of tile would be stored in column-major order, with vertical neighbors
87*35ffd701SAndroid Build Coastguard Worker stored contiguously and horizontal neighbors stored 32 bytes away.
88*35ffd701SAndroid Build Coastguard Worker 
89*35ffd701SAndroid Build Coastguard Worker Individual X and Y bits can be separated and interspersed in mapping to
90*35ffd701SAndroid Build Coastguard Worker increase locality via sub-tiling--e.g...
91*35ffd701SAndroid Build Coastguard Worker 
92*35ffd701SAndroid Build Coastguard Worker     Linear[11:0] = Y4 Y3 Y2 X6 X5 X4 Y1 Y0 X3 X2 X1 X0
93*35ffd701SAndroid Build Coastguard Worker                                      \-- Sub-Tile ---/
94*35ffd701SAndroid Build Coastguard Worker 
95*35ffd701SAndroid Build Coastguard Worker ...subdivides tile into 16x4 sub-tiles laid out in row-major order across tile,
96*35ffd701SAndroid Build Coastguard Worker with sub-tile content further stored in row-major order, with horizontal byte
97*35ffd701SAndroid Build Coastguard Worker neighbors within sub-tile stored contiguously and vertical neighbors only 16
98*35ffd701SAndroid Build Coastguard Worker bytes away. This means single 64-byte cache line contains 4x4 group of 32bpp
99*35ffd701SAndroid Build Coastguard Worker pixels--which is powerful spatial locality for graphics processing.
100*35ffd701SAndroid Build Coastguard Worker 
101*35ffd701SAndroid Build Coastguard Worker If mappings restricted to being "parallel" for index bits (i.e. bits of given
102*35ffd701SAndroid Build Coastguard Worker index can change position but not relative order during mapping), then bit
103*35ffd701SAndroid Build Coastguard Worker indexes need not be explicitly denoted--e.g. the previous sub-tiling mapping
104*35ffd701SAndroid Build Coastguard Worker can be represented as...
105*35ffd701SAndroid Build Coastguard Worker 
106*35ffd701SAndroid Build Coastguard Worker     Linear[11:0] = Y Y Y X X X Y Y X X X X
107*35ffd701SAndroid Build Coastguard Worker 
108*35ffd701SAndroid Build Coastguard Worker ...where X and Y index bits are implied to be zero-based-counted in order they
109*35ffd701SAndroid Build Coastguard Worker are encountered.
110*35ffd701SAndroid Build Coastguard Worker 
111*35ffd701SAndroid Build Coastguard Worker In software, spatial-to-linear mapping conveniently described with bit mask for
112*35ffd701SAndroid Build Coastguard Worker each dimension, where a set bit indicates the next bit of that dimension's
113*35ffd701SAndroid Build Coastguard Worker index is mapped to that position in the linear offset--e.g....
114*35ffd701SAndroid Build Coastguard Worker 
115*35ffd701SAndroid Build Coastguard Worker     Linear[11:0] = Y Y Y X X X Y Y X X X X
116*35ffd701SAndroid Build Coastguard Worker     MaskX =        0 0 0 1 1 1 0 0 1 1 1 1
117*35ffd701SAndroid Build Coastguard Worker     MaskY =        1 1 1 0 0 0 1 1 0 0 0 0
118*35ffd701SAndroid Build Coastguard Worker 
119*35ffd701SAndroid Build Coastguard Worker Such dimensional masks all that's needed to describe given tiling/swizzling
120*35ffd701SAndroid Build Coastguard Worker convention, since tile size and dimensions can be derived from the masks:
121*35ffd701SAndroid Build Coastguard Worker 
122*35ffd701SAndroid Build Coastguard Worker     TileWidth =  2 ^ NumberOfSetBits(MaskX)
123*35ffd701SAndroid Build Coastguard Worker     TileHeight = 2 ^ NumberOfSetBits(MaskY)
124*35ffd701SAndroid Build Coastguard Worker     TileSize =   2 ^ NumberOfSetBits(MaskX OR MaskY)
125*35ffd701SAndroid Build Coastguard Worker 
126*35ffd701SAndroid Build Coastguard Worker Tiling/swizzling is not limited to 2D. With addition of another tile dimension,
127*35ffd701SAndroid Build Coastguard Worker spatial locality for 3D or MSAA sample neighbors can be controlled, also. */
128*35ffd701SAndroid Build Coastguard Worker 
129*35ffd701SAndroid Build Coastguard Worker     typedef struct  _SWIZZLE_DESCRIPTOR { // Describes one tiling/swizzling convention purely by its per-dimension offset-bit masks.
130*35ffd701SAndroid Build Coastguard Worker         struct          _SWIZZLE_DESCRIPTOR_MASKS {
131*35ffd701SAndroid Build Coastguard Worker             int             x, y, z; // Set bits mark which tile-offset bit positions are fed by the X, Y and Z (3D slice or MSAA sample) index bits.
132*35ffd701SAndroid Build Coastguard Worker         }               Mask; // Tile width/height/size are derivable from the set-bit counts of these masks.
133*35ffd701SAndroid Build Coastguard Worker     }               SWIZZLE_DESCRIPTOR;
134*35ffd701SAndroid Build Coastguard Worker 
135*35ffd701SAndroid Build Coastguard Worker     typedef enum _EXTERNAL_SWIZZLE_NAME // Swizzle family identifiers; values run consecutively from TILEX = 0.
136*35ffd701SAndroid Build Coastguard Worker     {
137*35ffd701SAndroid Build Coastguard Worker         TILEX = 0, // NOTE(review): presumably selects the INTEL_TILE_X layout -- the mapping to descriptors is not visible in this part of the file.
138*35ffd701SAndroid Build Coastguard Worker         TILEY, // = 1
139*35ffd701SAndroid Build Coastguard Worker         TILEW, // = 2
140*35ffd701SAndroid Build Coastguard Worker         TILEYS, // = 3
141*35ffd701SAndroid Build Coastguard Worker         TILEYF // = 4
142*35ffd701SAndroid Build Coastguard Worker     }EXTERNAL_SWIZZLE_NAME;
143*35ffd701SAndroid Build Coastguard Worker 
144*35ffd701SAndroid Build Coastguard Worker     typedef enum  _EXTERNAL_RES_TYPE{ // Resource kind identifiers: 2D, 3D, or one of four MSAA variants.
145*35ffd701SAndroid Build Coastguard Worker         Res_2D = 0,
146*35ffd701SAndroid Build Coastguard Worker         Res_3D = 1,
147*35ffd701SAndroid Build Coastguard Worker         MSAA_2X, // = 2 (implicit); NOTE(review): the 2X/4X/8X/16X suffixes presumably give the sample count -- confirm against users of this enum.
148*35ffd701SAndroid Build Coastguard Worker         MSAA_4X, // = 3
149*35ffd701SAndroid Build Coastguard Worker         MSAA_8X, // = 4
150*35ffd701SAndroid Build Coastguard Worker         MSAA_16X // = 5
151*35ffd701SAndroid Build Coastguard Worker     }EXTERNAL_RES_TYPE;
152*35ffd701SAndroid Build Coastguard Worker 
153*35ffd701SAndroid Build Coastguard Worker     // Definition Helper Macros... Each single-letter token below expands to ",<tag>", so a row of 16 tokens after a name becomes the 16 bit-tag arguments (b15..b0) of __SWIZZLE.
154*35ffd701SAndroid Build Coastguard Worker     #define X ,'x' // X = offset bit fed by the X index
155*35ffd701SAndroid Build Coastguard Worker     #define Y ,'y' // Y = offset bit fed by the Y index
156*35ffd701SAndroid Build Coastguard Worker     #define Z ,'z' // Z = offset bit fed by the Z index
157*35ffd701SAndroid Build Coastguard Worker     #define S ,'z' // S = MSAA Sample Index (recorded in the same z mask as Z)
158*35ffd701SAndroid Build Coastguard Worker     #define o ,0   // o = N/A Swizzle Bit (contributes to none of the masks)
159*35ffd701SAndroid Build Coastguard Worker     #ifdef INCLUDE_CpuSwizzleBlt_c_AS_HEADER // Header mode: only declare each descriptor; the bit-tag arguments are ignored.
160*35ffd701SAndroid Build Coastguard Worker         #define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
161*35ffd701SAndroid Build Coastguard Worker             extern const SWIZZLE_DESCRIPTOR Name; // NOTE(review): this ';' plus the ';' written after each SWIZZLE((...)) use leaves an empty file-scope declaration in header mode.
162*35ffd701SAndroid Build Coastguard Worker     #else // C Compile... Define each descriptor: the three initializer sums build Mask.x, Mask.y and Mask.z, with argument bN contributing bit N (b15 = 0x8000 ... b0 = 0x0001) to the mask whose tag it carries.
163*35ffd701SAndroid Build Coastguard Worker         #define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
164*35ffd701SAndroid Build Coastguard Worker             const SWIZZLE_DESCRIPTOR Name = \
165*35ffd701SAndroid Build Coastguard Worker                 { (b15 == 'x' ? 0x8000 : 0) + (b14 == 'x' ? 0x4000 : 0) + (b13 == 'x' ? 0x2000 : 0) + (b12 == 'x' ? 0x1000 : 0) + (b11 == 'x' ? 0x0800 : 0) + (b10 == 'x' ? 0x0400 : 0) + (b9 == 'x' ? 0x0200 : 0) + (b8 == 'x' ? 0x0100 : 0) + (b7 == 'x' ? 0x0080 : 0) + (b6 == 'x' ? 0x0040 : 0) + (b5 == 'x' ? 0x0020 : 0) + (b4 == 'x' ? 0x0010 : 0) + (b3 == 'x' ? 0x0008 : 0) + (b2 == 'x' ? 0x0004 : 0) + (b1 == 'x' ? 0x0002 : 0) + (b0 == 'x' ? 0x0001 : 0), \
166*35ffd701SAndroid Build Coastguard Worker                   (b15 == 'y' ? 0x8000 : 0) + (b14 == 'y' ? 0x4000 : 0) + (b13 == 'y' ? 0x2000 : 0) + (b12 == 'y' ? 0x1000 : 0) + (b11 == 'y' ? 0x0800 : 0) + (b10 == 'y' ? 0x0400 : 0) + (b9 == 'y' ? 0x0200 : 0) + (b8 == 'y' ? 0x0100 : 0) + (b7 == 'y' ? 0x0080 : 0) + (b6 == 'y' ? 0x0040 : 0) + (b5 == 'y' ? 0x0020 : 0) + (b4 == 'y' ? 0x0010 : 0) + (b3 == 'y' ? 0x0008 : 0) + (b2 == 'y' ? 0x0004 : 0) + (b1 == 'y' ? 0x0002 : 0) + (b0 == 'y' ? 0x0001 : 0), \
167*35ffd701SAndroid Build Coastguard Worker                   (b15 == 'z' ? 0x8000 : 0) + (b14 == 'z' ? 0x4000 : 0) + (b13 == 'z' ? 0x2000 : 0) + (b12 == 'z' ? 0x1000 : 0) + (b11 == 'z' ? 0x0800 : 0) + (b10 == 'z' ? 0x0400 : 0) + (b9 == 'z' ? 0x0200 : 0) + (b8 == 'z' ? 0x0100 : 0) + (b7 == 'z' ? 0x0080 : 0) + (b6 == 'z' ? 0x0040 : 0) + (b5 == 'z' ? 0x0020 : 0) + (b4 == 'z' ? 0x0010 : 0) + (b3 == 'z' ? 0x0008 : 0) + (b2 == 'z' ? 0x0004 : 0) + (b1 == 'z' ? 0x0002 : 0) + (b0 == 'z' ? 0x0001 : 0) }
168*35ffd701SAndroid Build Coastguard Worker #endif // INCLUDE_CpuSwizzleBlt_c_AS_HEADER
169*35ffd701SAndroid Build Coastguard Worker     #define SWIZZLE(__SWIZZLE_Args) __SWIZZLE __SWIZZLE_Args // Takes one parenthesized argument and re-emits it as __SWIZZLE's argument list, so the X/Y/Z/S/o tokens expand into commas before __SWIZZLE is invoked.
170*35ffd701SAndroid Build Coastguard Worker 
171*35ffd701SAndroid Build Coastguard Worker     // Legacy Intel Tiling Swizzles... (each maps 12 offset bits, i.e. 4KB tiles)
172*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_X              o o o o Y Y Y X X X X X X X X X )); // 9 X bits, 3 Y bits: 512 bytes x 8 rows, plain row-major within tile.
173*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_Y              o o o o X X X Y Y Y Y Y X X X X )); // 7 X bits, 5 Y bits: 128 bytes x 32 rows, stored as 16-byte-wide columns.
174*35ffd701SAndroid Build Coastguard Worker 
175*35ffd701SAndroid Build Coastguard Worker     #ifdef INTEL_TILE_W_SUPPORT
176*35ffd701SAndroid Build Coastguard Worker         SWIZZLE(( INTEL_TILE_W          o o o o X X X Y Y Y Y X Y X Y X )); // 6 X bits, 6 Y bits: 64 bytes x 64 rows, low bits interleaved X/Y.
177*35ffd701SAndroid Build Coastguard Worker     #endif // INTEL_TILE_W_SUPPORT
178*35ffd701SAndroid Build Coastguard Worker // Gen9 Swizzles... (YF rows map 12 offset bits = 4KB tiles; YS rows map all 16 = 64KB tiles. S bits are recorded in the z mask. NOTE(review): the _128.._8 name suffix is presumably bits-per-element -- confirm.)
179*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_128         o o o o X Y X Y X X Y Y X X X X ));
180*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_64          o o o o X Y X Y X X Y Y X X X X ));
181*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_32          o o o o X Y X Y X Y Y Y X X X X ));
182*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_16          o o o o X Y X Y X Y Y Y X X X X ));
183*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_8           o o o o X Y X Y Y Y Y Y X X X X ));
184*35ffd701SAndroid Build Coastguard Worker 
185*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_128         X Y X Y X Y X Y X X Y Y X X X X ));
186*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_64          X Y X Y X Y X Y X X Y Y X X X X ));
187*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_32          X Y X Y X Y X Y X Y Y Y X X X X ));
188*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_16          X Y X Y X Y X Y X Y Y Y X X X X ));
189*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_8           X Y X Y X Y X Y Y Y Y Y X X X X ));
190*35ffd701SAndroid Build Coastguard Worker 
191*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA2_128   o o o o S Y X Y X X Y Y X X X X ));
192*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA2_64    o o o o S Y X Y X X Y Y X X X X ));
193*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA2_32    o o o o S Y X Y X Y Y Y X X X X ));
194*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA2_16    o o o o S Y X Y X Y Y Y X X X X ));
195*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA2_8     o o o o S Y X Y Y Y Y Y X X X X ));
196*35ffd701SAndroid Build Coastguard Worker 
197*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA2_128   S Y X Y X Y X Y X X Y Y X X X X ));
198*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA2_64    S Y X Y X Y X Y X X Y Y X X X X ));
199*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA2_32    S Y X Y X Y X Y X Y Y Y X X X X ));
200*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA2_16    S Y X Y X Y X Y X Y Y Y X X X X ));
201*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA2_8     S Y X Y X Y X Y Y Y Y Y X X X X ));
202*35ffd701SAndroid Build Coastguard Worker 
203*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA4_128   o o o o S S X Y X X Y Y X X X X ));
204*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA4_64    o o o o S S X Y X X Y Y X X X X ));
205*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA4_32    o o o o S S X Y X Y Y Y X X X X ));
206*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA4_16    o o o o S S X Y X Y Y Y X X X X ));
207*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA4_8     o o o o S S X Y Y Y Y Y X X X X ));
208*35ffd701SAndroid Build Coastguard Worker 
209*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA4_128   S S X Y X Y X Y X X Y Y X X X X ));
210*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA4_64    S S X Y X Y X Y X X Y Y X X X X ));
211*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA4_32    S S X Y X Y X Y X Y Y Y X X X X ));
212*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA4_16    S S X Y X Y X Y X Y Y Y X X X X ));
213*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA4_8     S S X Y X Y X Y Y Y Y Y X X X X ));
214*35ffd701SAndroid Build Coastguard Worker 
215*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA8_128   o o o o S S S Y X X Y Y X X X X ));
216*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA8_64    o o o o S S S Y X X Y Y X X X X ));
217*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA8_32    o o o o S S S Y X Y Y Y X X X X ));
218*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA8_16    o o o o S S S Y X Y Y Y X X X X ));
219*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA8_8     o o o o S S S Y Y Y Y Y X X X X ));
220*35ffd701SAndroid Build Coastguard Worker 
221*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA8_128   S S S Y X Y X Y X X Y Y X X X X ));
222*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA8_64    S S S Y X Y X Y X X Y Y X X X X ));
223*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA8_32    S S S Y X Y X Y X Y Y Y X X X X ));
224*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA8_16    S S S Y X Y X Y X Y Y Y X X X X ));
225*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA8_8     S S S Y X Y X Y Y Y Y Y X X X X ));
226*35ffd701SAndroid Build Coastguard Worker 
227*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA16_128  o o o o S S S S X X Y Y X X X X ));
228*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA16_64   o o o o S S S S X X Y Y X X X X ));
229*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA16_32   o o o o S S S S X Y Y Y X X X X ));
230*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA16_16   o o o o S S S S X Y Y Y X X X X ));
231*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_MSAA16_8    o o o o S S S S Y Y Y Y X X X X ));
232*35ffd701SAndroid Build Coastguard Worker 
233*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA16_128  S S S S X Y X Y X X Y Y X X X X ));
234*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA16_64   S S S S X Y X Y X X Y Y X X X X ));
235*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA16_32   S S S S X Y X Y X Y Y Y X X X X ));
236*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA16_16   S S S S X Y X Y X Y Y Y X X X X ));
237*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_MSAA16_8    S S S S X Y X Y Y Y Y Y X X X X ));
238*35ffd701SAndroid Build Coastguard Worker 
239*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_3D_128      o o o o Y Z X X Z Z Y Y X X X X ));
240*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_3D_64       o o o o Y Z X X Z Z Y Y X X X X ));
241*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_3D_32       o o o o Y Z X Y Z Z Y Y X X X X ));
242*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_3D_16       o o o o Y Z Y Z Z Z Y Y X X X X ));
243*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YF_3D_8        o o o o Y Z Y Z Z Z Y Y X X X X ));
244*35ffd701SAndroid Build Coastguard Worker 
245*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_3D_128      X Y Z X Y Z X X Z Z Y Y X X X X ));
246*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_3D_64       X Y Z X Y Z X X Z Z Y Y X X X X ));
247*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_3D_32       X Y Z X Y Z X Y Z Z Y Y X X X X ));
248*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_3D_16       X Y Z X Y Z Y Z Z Z Y Y X X X X ));
249*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_YS_3D_8        X Y Z X Y Z Y Z Z Z Y Y X X X X ));
250*35ffd701SAndroid Build Coastguard Worker 
251*35ffd701SAndroid Build Coastguard Worker     // XE_HP_SDV Swizzles... (INTEL_TILE_4 maps 12 offset bits = 4KB tile; every INTEL_TILE_64_* row maps all 16 = 64KB tile)
252*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_4              o o o o Y Y X Y X X Y Y X X X X )); // 7 X bits, 5 Y bits: 128 bytes x 32 rows.
253*35ffd701SAndroid Build Coastguard Worker 
254*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_128         Y X X X Y Y X Y X X Y Y X X X X ));
255*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_64          Y X X X Y Y X Y X X Y Y X X X X ));
256*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_32          Y Y X X Y Y X Y X X Y Y X X X X ));
257*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_16          Y Y X X Y Y X Y X X Y Y X X X X ));
258*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_8           Y Y Y X Y Y X Y X X Y Y X X X X ));
259*35ffd701SAndroid Build Coastguard Worker 
260*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA2_128   Y X X X Y Y X Y S X Y Y X X X X ));
261*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA2_64    Y X X X Y Y X Y S X Y Y X X X X ));
262*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA2_32    Y Y X X Y Y X Y S X Y Y X X X X ));
263*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA2_16    Y Y X X Y Y X Y S X Y Y X X X X ));
264*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA2_8     Y Y Y X Y Y X Y S X Y Y X X X X ));
265*35ffd701SAndroid Build Coastguard Worker 
266*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA_128    Y X X X Y Y X S S X Y Y X X X X ));
267*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA_64     Y X X X Y Y X S S X Y Y X X X X ));
268*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA_32     Y Y X X Y Y X S S X Y Y X X X X ));
269*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA_16     Y Y X X Y Y X S S X Y Y X X X X ));
270*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_MSAA_8      Y Y Y X Y Y X S S X Y Y X X X X ));
271*35ffd701SAndroid Build Coastguard Worker 
272*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_3D_128      Z Z Y X X X Z Y Z X Y Y X X X X ));
273*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_3D_64       Z Z Y X X X Z Y Z X Y Y X X X X ));
274*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_3D_32       Z Z Y X Y X Z Y Z X Y Y X X X X ));
275*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_3D_16       Z Z Z Y Y X Z Y Z X Y Y X X X X ));
276*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_3D_8        Z Z Z X Y Y Z Y Z X Y Y X X X X ));
277*35ffd701SAndroid Build Coastguard Worker 
278*35ffd701SAndroid Build Coastguard Worker     //Tile64 updated layout for Render Compression 256B and Physical L3 (every row maps all 16 offset bits, i.e. 64KB tiles; S bits are recorded in the z mask)
279*35ffd701SAndroid Build Coastguard Worker 
280*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_128   Y X X X Y Y X S X X Y Y X X X X ));
281*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_64    Y Y X X Y Y X S X X Y Y X X X X ));
282*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_32    Y Y Y X Y Y X S X X Y Y X X X X ));
283*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_16    Y Y Y X Y Y X S X X Y Y X X X X ));
284*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA2_8     Y Y Y Y Y Y X S X X Y Y X X X X ));
285*35ffd701SAndroid Build Coastguard Worker 
286*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_128   Y X X X Y Y S S X X Y Y X X X X ));
287*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_64    Y X X X Y Y S S X X Y Y X X X X ));
288*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_32    Y Y X X Y Y S S X X Y Y X X X X ));
289*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_16    Y Y X X Y Y S S X X Y Y X X X X ));
290*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA4_8     Y Y Y X Y Y S S X X Y Y X X X X ));
291*35ffd701SAndroid Build Coastguard Worker 
292*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_128   Y Y X X Y X S S S X Y Y X X X X ));
293*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_64    Y Y X X Y X S S S X Y Y X X X X ));
294*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_32    Y Y X X Y X S S S X Y Y X X X X ));
295*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_16    Y Y Y X Y X S S S X Y Y X X X X ));
296*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA8_8     Y Y Y X Y X S S S X Y Y X X X X ));
297*35ffd701SAndroid Build Coastguard Worker 
298*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_128   Y X X X Y X S S S S Y Y X X X X ));
299*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_64    Y Y X X Y X S S S S Y Y X X X X ));
300*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_32    Y Y X X Y X S S S S Y Y X X X X ));
301*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_16    Y Y X X Y X S S S S Y Y X X X X ));
302*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_MSAA16_8     Y Y Y X Y X S S S S Y Y X X X X ));
303*35ffd701SAndroid Build Coastguard Worker 
304*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_3D_128      Z Z Y X X Y Z Z X X Y Y X X X X ));
305*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_3D_64       Z Z Y X X Y Z Z X X Y Y X X X X ));
306*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_3D_32       Z Z Y X Y Y Z Z X X Y Y X X X X ));
307*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_3D_16       Z Z Z Y Y Y Z Z X X Y Y X X X X ));
308*35ffd701SAndroid Build Coastguard Worker     SWIZZLE(( INTEL_TILE_64_V2_3D_8        Z Z Z Y Y Y Z Z X X Y Y X X X X ));
309*35ffd701SAndroid Build Coastguard Worker 
310*35ffd701SAndroid Build Coastguard Worker 
311*35ffd701SAndroid Build Coastguard Worker     #undef X // Retire the table-definition helper macros here so the single-letter names and SWIZZLE/__SWIZZLE are not visible to anything that follows.
312*35ffd701SAndroid Build Coastguard Worker     #undef Y
313*35ffd701SAndroid Build Coastguard Worker     #undef Z
314*35ffd701SAndroid Build Coastguard Worker     #undef S
315*35ffd701SAndroid Build Coastguard Worker     #undef o
316*35ffd701SAndroid Build Coastguard Worker     #undef __SWIZZLE
317*35ffd701SAndroid Build Coastguard Worker     #undef SWIZZLE
318*35ffd701SAndroid Build Coastguard Worker 
319*35ffd701SAndroid Build Coastguard Worker // Accessing Swizzled Surface ##################################################
320*35ffd701SAndroid Build Coastguard Worker 
/* While graphics hardware prefers to access surfaces stored in tiled/swizzled
formats, logically accessing such surfaces with CPU-based software is non-
trivial when high throughput is the goal.

This file implements (1) the SwizzleOffset function, which computes the
swizzled offset of a dimensionally-specified surface byte, and (2) the
CpuSwizzleBlt function, which BLTs between linear ("y * pitch + x") and
swizzled surfaces--with the goal of providing a high-performance, swizzling
BLT implementation to be used both in production and as a guide for those
seeking to understand swizzled access or implement functionality beyond the
simple BLT. */
331*35ffd701SAndroid Build Coastguard Worker 
332*35ffd701SAndroid Build Coastguard Worker // Surface Descriptor for CpuSwizzleBlt function...
333*35ffd701SAndroid Build Coastguard Worker typedef struct _CPU_SWIZZLE_BLT_SURFACE
334*35ffd701SAndroid Build Coastguard Worker {
335*35ffd701SAndroid Build Coastguard Worker     void                        *pBase;         // Pointer to surface base.
336*35ffd701SAndroid Build Coastguard Worker     int                         Pitch, Height;  // Row-pitch in bytes, and height, of surface.
337*35ffd701SAndroid Build Coastguard Worker     const SWIZZLE_DESCRIPTOR    *pSwizzle;      // Pointer to surface's swizzle descriptor, or NULL if unswizzled.
338*35ffd701SAndroid Build Coastguard Worker     int                         OffsetX;        // Horizontal offset into surface for BLT rectangle, in bytes.
339*35ffd701SAndroid Build Coastguard Worker     int                         OffsetY;        // Vertical offset into surface for BLT rectangle, in physical/pitch rows.
340*35ffd701SAndroid Build Coastguard Worker     int                         OffsetZ;        // Zero if N/A, or 3D offset into surface for BLT rectangle, in 3D slices or MSAA samples as appropriate.
341*35ffd701SAndroid Build Coastguard Worker 
342*35ffd701SAndroid Build Coastguard Worker     #ifdef SUB_ELEMENT_SUPPORT
343*35ffd701SAndroid Build Coastguard Worker         struct _CPU_SWIZZLE_BLT_SURFACE_ELEMENT
344*35ffd701SAndroid Build Coastguard Worker         {
345*35ffd701SAndroid Build Coastguard Worker             int                     Pitch, Size; // Zero if full-pixel BLT, or pitch and size, in bytes, of pixel element being BLT'ed.
346*35ffd701SAndroid Build Coastguard Worker         }                       Element;
347*35ffd701SAndroid Build Coastguard Worker 
348*35ffd701SAndroid Build Coastguard Worker         /* e.g. to BLT only stencil data from S8D24 surface to S8 surface...
349*35ffd701SAndroid Build Coastguard Worker             Dest.Element.Size = Src.Element.Size = sizeof(S8) = 1;
350*35ffd701SAndroid Build Coastguard Worker             Dest.Element.Pitch = sizeof(S8) = 1;
351*35ffd701SAndroid Build Coastguard Worker             Src.Element.Pitch = sizeof(S8D24) = 4;
352*35ffd701SAndroid Build Coastguard Worker             Src.OffsetX += BYTE_OFFSET_OF_S8_WITHIN_S8D24; */
353*35ffd701SAndroid Build Coastguard Worker     #endif
354*35ffd701SAndroid Build Coastguard Worker } CPU_SWIZZLE_BLT_SURFACE;
355*35ffd701SAndroid Build Coastguard Worker 
356*35ffd701SAndroid Build Coastguard Worker extern int SwizzleOffset(const SWIZZLE_DESCRIPTOR *pSwizzle, int Pitch, int OffsetX, int OffsetY, int OffsetZ);
357*35ffd701SAndroid Build Coastguard Worker extern void CpuSwizzleBlt(CPU_SWIZZLE_BLT_SURFACE *pDest, CPU_SWIZZLE_BLT_SURFACE *pSrc, int CopyWidthBytes, int CopyHeight);
358*35ffd701SAndroid Build Coastguard Worker 
359*35ffd701SAndroid Build Coastguard Worker #ifdef __cplusplus
360*35ffd701SAndroid Build Coastguard Worker }
361*35ffd701SAndroid Build Coastguard Worker #endif
362*35ffd701SAndroid Build Coastguard Worker 
363*35ffd701SAndroid Build Coastguard Worker #define CpuSwizzleBlt_INCLUDED
364*35ffd701SAndroid Build Coastguard Worker 
365*35ffd701SAndroid Build Coastguard Worker #endif
366*35ffd701SAndroid Build Coastguard Worker 
367*35ffd701SAndroid Build Coastguard Worker 
368*35ffd701SAndroid Build Coastguard Worker #ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
369*35ffd701SAndroid Build Coastguard Worker 
370*35ffd701SAndroid Build Coastguard Worker //#define MINIMALIST                // Use minimalist, unoptimized implementation.
371*35ffd701SAndroid Build Coastguard Worker 
372*35ffd701SAndroid Build Coastguard Worker #include "assert.h" // Quoted to allow local-directory override.
373*35ffd701SAndroid Build Coastguard Worker 
374*35ffd701SAndroid Build Coastguard Worker #if(_MSC_VER >= 1400)
375*35ffd701SAndroid Build Coastguard Worker     #include <intrin.h>
376*35ffd701SAndroid Build Coastguard Worker #elif defined(__ARM_ARCH)
377*35ffd701SAndroid Build Coastguard Worker     #include <sse2neon.h>
378*35ffd701SAndroid Build Coastguard Worker #elif((defined __clang__) ||(__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
379*35ffd701SAndroid Build Coastguard Worker     #include <cpuid.h>
380*35ffd701SAndroid Build Coastguard Worker     #include <x86intrin.h>
381*35ffd701SAndroid Build Coastguard Worker #else
382*35ffd701SAndroid Build Coastguard Worker     #error "Unexpected compiler!"
383*35ffd701SAndroid Build Coastguard Worker #endif
384*35ffd701SAndroid Build Coastguard Worker 
385*35ffd701SAndroid Build Coastguard Worker 
// POPCNT: Count Lit Bits...
/* PopCnt4[n] is the number of set bits in the 4-bit value n. The table is
read-only, hence const. */
static const unsigned char PopCnt4[16] =
{
 /* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
};

/* Number of set bits in the low 4 bits of x (higher bits are ignored). */
#define POPCNT4(x)  (PopCnt4[(x) & 0xf])

/* Number of set bits in the low 16 bits of x (higher bits are ignored).
NOTE: x is evaluated four times--do not pass expressions with side effects. */
#define POPCNT16(x) (POPCNT4((x) >> 12) + POPCNT4((x) >> 8) + POPCNT4((x) >> 4) + POPCNT4(x))
390*35ffd701SAndroid Build Coastguard Worker 
391*35ffd701SAndroid Build Coastguard Worker 
SwizzleOffset(const SWIZZLE_DESCRIPTOR * pSwizzle,int Pitch,int OffsetX,int OffsetY,int OffsetZ)392*35ffd701SAndroid Build Coastguard Worker int SwizzleOffset( // ##########################################################
393*35ffd701SAndroid Build Coastguard Worker 
394*35ffd701SAndroid Build Coastguard Worker     /* Return swizzled offset of dimensionally-specified surface byte. */
395*35ffd701SAndroid Build Coastguard Worker 
396*35ffd701SAndroid Build Coastguard Worker     const SWIZZLE_DESCRIPTOR    *pSwizzle,  // Pointer to applicable swizzle descriptor.
397*35ffd701SAndroid Build Coastguard Worker     int                         Pitch,      // Pointer to applicable surface row-pitch.
398*35ffd701SAndroid Build Coastguard Worker     int                         OffsetX,    // Horizontal offset into surface of the target byte, in bytes.
399*35ffd701SAndroid Build Coastguard Worker     int                         OffsetY,    // Vertical offset into surface of the target byte, in physical/pitch rows.
400*35ffd701SAndroid Build Coastguard Worker     int                         OffsetZ)    // Zero if N/A, or 3D offset into surface of the target byte, in 3D slices or MSAA samples as appropriate.
401*35ffd701SAndroid Build Coastguard Worker 
402*35ffd701SAndroid Build Coastguard Worker     /* Given logically-specified (x, y, z) byte within swizzled surface,
403*35ffd701SAndroid Build Coastguard Worker     function returns byte's linear/memory offset from surface's base--i.e. it
404*35ffd701SAndroid Build Coastguard Worker     performs the swizzled, spatial-to-linear mapping.
405*35ffd701SAndroid Build Coastguard Worker 
406*35ffd701SAndroid Build Coastguard Worker     Function makes no real effort to perform optimally, since should only used
407*35ffd701SAndroid Build Coastguard Worker     outside loops in CpuSwizzleBlt and similar functions. If any of this
408*35ffd701SAndroid Build Coastguard Worker     functionality was needed in performance path, a custom implementation
409*35ffd701SAndroid Build Coastguard Worker     should be used that limits itself to functionality specifically needed
410*35ffd701SAndroid Build Coastguard Worker     (probably single-dimension, intra-tile offsets) and uses a fast computation
411*35ffd701SAndroid Build Coastguard Worker     (e.g. LUT's, hard-codings, PDEP). */
412*35ffd701SAndroid Build Coastguard Worker 
413*35ffd701SAndroid Build Coastguard Worker { // ###########################################################################
414*35ffd701SAndroid Build Coastguard Worker 
415*35ffd701SAndroid Build Coastguard Worker     char PDepSupported = -1; // AVX2/BMI2 PDEP (Parallel Deposit) Instruction
416*35ffd701SAndroid Build Coastguard Worker 
417*35ffd701SAndroid Build Coastguard Worker     int SwizzledOffset; // Return value being computed.
418*35ffd701SAndroid Build Coastguard Worker 
419*35ffd701SAndroid Build Coastguard Worker     int TileWidthBits =  POPCNT16(pSwizzle->Mask.x); // Log2(Tile Width in Bytes)
420*35ffd701SAndroid Build Coastguard Worker     int TileHeightBits = POPCNT16(pSwizzle->Mask.y); // Log2(Tile Height)
421*35ffd701SAndroid Build Coastguard Worker     int TileDepthBits =  POPCNT16(pSwizzle->Mask.z); // Log2(Tile Depth or MSAA Samples)
422*35ffd701SAndroid Build Coastguard Worker     int TileSizeBits =   TileWidthBits + TileHeightBits + TileDepthBits; // Log2(Tile Size in Bytes)
423*35ffd701SAndroid Build Coastguard Worker     int TilesPerRow =    Pitch >> TileWidthBits;     // Surface Width in Tiles
424*35ffd701SAndroid Build Coastguard Worker 
425*35ffd701SAndroid Build Coastguard Worker     int Row, Col;   // Tile grid position on surface, of tile containing specified byte.
426*35ffd701SAndroid Build Coastguard Worker     int x, y, z;    // Position of specified byte within tile that contains it.
427*35ffd701SAndroid Build Coastguard Worker 
428*35ffd701SAndroid Build Coastguard Worker     if(PDepSupported == -1)
429*35ffd701SAndroid Build Coastguard Worker     {
430*35ffd701SAndroid Build Coastguard Worker         #if(_MSC_VER >= 1700)
431*35ffd701SAndroid Build Coastguard Worker             #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
432*35ffd701SAndroid Build Coastguard Worker             int CpuInfo[4];
433*35ffd701SAndroid Build Coastguard Worker             __cpuidex(CpuInfo, 7, 0);
434*35ffd701SAndroid Build Coastguard Worker             PDepSupported = ((CpuInfo[1] & (1 << 8)) != 0); // EBX[8] = BMI2
435*35ffd701SAndroid Build Coastguard Worker         #elif ( defined (__BMI2__ ))
436*35ffd701SAndroid Build Coastguard Worker             #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
437*35ffd701SAndroid Build Coastguard Worker             unsigned int eax, ebx, ecx, edx;
438*35ffd701SAndroid Build Coastguard Worker             __cpuid_count(7, 0, eax, ebx, ecx, edx);
439*35ffd701SAndroid Build Coastguard Worker             PDepSupported = ((ebx & (1 << 8)) != 0); // EBX[8] = BMI2
440*35ffd701SAndroid Build Coastguard Worker         #else
441*35ffd701SAndroid Build Coastguard Worker             #define PDEP(Src, Mask) 0
442*35ffd701SAndroid Build Coastguard Worker             PDepSupported = 0;
443*35ffd701SAndroid Build Coastguard Worker         #endif
444*35ffd701SAndroid Build Coastguard Worker     }
445*35ffd701SAndroid Build Coastguard Worker 
446*35ffd701SAndroid Build Coastguard Worker     assert( // Mutually Exclusive Swizzle Positions...
447*35ffd701SAndroid Build Coastguard Worker         (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) ==
448*35ffd701SAndroid Build Coastguard Worker         (pSwizzle->Mask.x + pSwizzle->Mask.y + pSwizzle->Mask.z));
449*35ffd701SAndroid Build Coastguard Worker 
450*35ffd701SAndroid Build Coastguard Worker     assert( // Swizzle Limited to 16-bit (else expand POPCNT'ing)...
451*35ffd701SAndroid Build Coastguard Worker         (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) < (1 << 16));
452*35ffd701SAndroid Build Coastguard Worker 
453*35ffd701SAndroid Build Coastguard Worker     assert( // Pitch is Multiple of Tile Width...
454*35ffd701SAndroid Build Coastguard Worker         Pitch == ((Pitch >> TileWidthBits) << TileWidthBits));
455*35ffd701SAndroid Build Coastguard Worker 
456*35ffd701SAndroid Build Coastguard Worker     { // Break Positioning into Tile-Granular and Intra-Tile Components...
457*35ffd701SAndroid Build Coastguard Worker         assert((OffsetZ >>       TileDepthBits) == 0); // When dealing with 3D tiling, treat as separate single-tile-deep planes.
458*35ffd701SAndroid Build Coastguard Worker         z =     OffsetZ & ((1 << TileDepthBits) - 1);
459*35ffd701SAndroid Build Coastguard Worker 
460*35ffd701SAndroid Build Coastguard Worker         Row =   OffsetY >>       TileHeightBits;
461*35ffd701SAndroid Build Coastguard Worker         y =     OffsetY & ((1 << TileHeightBits) - 1);
462*35ffd701SAndroid Build Coastguard Worker 
463*35ffd701SAndroid Build Coastguard Worker         Col =   OffsetX >>       TileWidthBits;
464*35ffd701SAndroid Build Coastguard Worker         x =     OffsetX & ((1 << TileWidthBits) - 1);
465*35ffd701SAndroid Build Coastguard Worker     }
466*35ffd701SAndroid Build Coastguard Worker 
467*35ffd701SAndroid Build Coastguard Worker     SwizzledOffset = // Start with surface offset of given tile...
468*35ffd701SAndroid Build Coastguard Worker         (Row * TilesPerRow + Col) << TileSizeBits; // <-- Tiles laid across surface in row-major order.
469*35ffd701SAndroid Build Coastguard Worker 
470*35ffd701SAndroid Build Coastguard Worker     // ...then OR swizzled offset of byte within tile...
471*35ffd701SAndroid Build Coastguard Worker     if(PDepSupported)
472*35ffd701SAndroid Build Coastguard Worker     {
473*35ffd701SAndroid Build Coastguard Worker         SwizzledOffset +=
474*35ffd701SAndroid Build Coastguard Worker             PDEP(x, pSwizzle->Mask.x) +
475*35ffd701SAndroid Build Coastguard Worker             PDEP(y, pSwizzle->Mask.y) +
476*35ffd701SAndroid Build Coastguard Worker             PDEP(z, pSwizzle->Mask.z);
477*35ffd701SAndroid Build Coastguard Worker     }
478*35ffd701SAndroid Build Coastguard Worker     else // PDEP workalike...
479*35ffd701SAndroid Build Coastguard Worker     {
480*35ffd701SAndroid Build Coastguard Worker         int bitIndex = 0, bitMask = 1;
481*35ffd701SAndroid Build Coastguard Worker         int terminationMask = pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z;
482*35ffd701SAndroid Build Coastguard Worker         while(bitMask < terminationMask)
483*35ffd701SAndroid Build Coastguard Worker         {
484*35ffd701SAndroid Build Coastguard Worker             int MaskQ;
485*35ffd701SAndroid Build Coastguard Worker             #define PROCESS(Q) {                    \
486*35ffd701SAndroid Build Coastguard Worker                 MaskQ = bitMask & pSwizzle->Mask.Q; \
487*35ffd701SAndroid Build Coastguard Worker                 SwizzledOffset += Q & MaskQ;        \
488*35ffd701SAndroid Build Coastguard Worker                 Q <<= 1 ^ (MaskQ >> bitIndex);      \
489*35ffd701SAndroid Build Coastguard Worker             }
490*35ffd701SAndroid Build Coastguard Worker             PROCESS(x);
491*35ffd701SAndroid Build Coastguard Worker             PROCESS(y);
492*35ffd701SAndroid Build Coastguard Worker             PROCESS(z);
493*35ffd701SAndroid Build Coastguard Worker 
494*35ffd701SAndroid Build Coastguard Worker             bitIndex++;
495*35ffd701SAndroid Build Coastguard Worker             bitMask <<= 1;
496*35ffd701SAndroid Build Coastguard Worker 
497*35ffd701SAndroid Build Coastguard Worker             #undef PROCESS
498*35ffd701SAndroid Build Coastguard Worker         }
499*35ffd701SAndroid Build Coastguard Worker     }
500*35ffd701SAndroid Build Coastguard Worker 
501*35ffd701SAndroid Build Coastguard Worker     return(SwizzledOffset);
502*35ffd701SAndroid Build Coastguard Worker }
503*35ffd701SAndroid Build Coastguard Worker 
504*35ffd701SAndroid Build Coastguard Worker 
CpuSwizzleBlt(CPU_SWIZZLE_BLT_SURFACE * pDest,CPU_SWIZZLE_BLT_SURFACE * pSrc,int CopyWidthBytes,int CopyHeight)505*35ffd701SAndroid Build Coastguard Worker void CpuSwizzleBlt( // #########################################################
506*35ffd701SAndroid Build Coastguard Worker 
507*35ffd701SAndroid Build Coastguard Worker     /* Performs specified swizzling BLT between two given surfaces. */
508*35ffd701SAndroid Build Coastguard Worker 
509*35ffd701SAndroid Build Coastguard Worker     CPU_SWIZZLE_BLT_SURFACE *pDest,         // Pointer to destination surface descriptor.
510*35ffd701SAndroid Build Coastguard Worker     CPU_SWIZZLE_BLT_SURFACE *pSrc,          // Pointer to source surface descriptor.
511*35ffd701SAndroid Build Coastguard Worker     int                     CopyWidthBytes, // Width of BLT rectangle, in bytes.
512*35ffd701SAndroid Build Coastguard Worker     int                     CopyHeight)     // Height of BLT rectangle, in physical/pitch rows.
513*35ffd701SAndroid Build Coastguard Worker 
514*35ffd701SAndroid Build Coastguard Worker     #ifdef SUB_ELEMENT_SUPPORT
515*35ffd701SAndroid Build Coastguard Worker 
516*35ffd701SAndroid Build Coastguard Worker         /* When copying between surfaces with different pixel pitches, specify
517*35ffd701SAndroid Build Coastguard Worker         CopyWidthBytes in terms of unswizzled surface's element-pitches:
518*35ffd701SAndroid Build Coastguard Worker 
519*35ffd701SAndroid Build Coastguard Worker             CopyWidthBytes = CopyWidthPixels * pLinearSurface.Element.Pitch; */
520*35ffd701SAndroid Build Coastguard Worker 
521*35ffd701SAndroid Build Coastguard Worker     #endif
522*35ffd701SAndroid Build Coastguard Worker 
523*35ffd701SAndroid Build Coastguard Worker { // ###########################################################################
524*35ffd701SAndroid Build Coastguard Worker 
525*35ffd701SAndroid Build Coastguard Worker     CPU_SWIZZLE_BLT_SURFACE *pLinearSurface, *pSwizzledSurface;
526*35ffd701SAndroid Build Coastguard Worker     int LinearToSwizzled;
527*35ffd701SAndroid Build Coastguard Worker 
528*35ffd701SAndroid Build Coastguard Worker     { // One surface swizzled, the other unswizzled (aka "linear")...
529*35ffd701SAndroid Build Coastguard Worker         assert((pDest->pSwizzle != NULL) ^ (pSrc->pSwizzle != NULL));
530*35ffd701SAndroid Build Coastguard Worker 
531*35ffd701SAndroid Build Coastguard Worker         LinearToSwizzled = !pSrc->pSwizzle;
532*35ffd701SAndroid Build Coastguard Worker         if(LinearToSwizzled)
533*35ffd701SAndroid Build Coastguard Worker         {
534*35ffd701SAndroid Build Coastguard Worker             pSwizzledSurface =  pDest;
535*35ffd701SAndroid Build Coastguard Worker             pLinearSurface =    pSrc;
536*35ffd701SAndroid Build Coastguard Worker         }
537*35ffd701SAndroid Build Coastguard Worker         else // Swizzled-to-Linear...
538*35ffd701SAndroid Build Coastguard Worker         {
539*35ffd701SAndroid Build Coastguard Worker             pSwizzledSurface =  pSrc;
540*35ffd701SAndroid Build Coastguard Worker             pLinearSurface =    pDest;
541*35ffd701SAndroid Build Coastguard Worker         }
542*35ffd701SAndroid Build Coastguard Worker     }
543*35ffd701SAndroid Build Coastguard Worker 
544*35ffd701SAndroid Build Coastguard Worker     #ifdef SUB_ELEMENT_SUPPORT
545*35ffd701SAndroid Build Coastguard Worker     {
546*35ffd701SAndroid Build Coastguard Worker         assert( // Either both or neither specified...
547*35ffd701SAndroid Build Coastguard Worker             (pDest->Element.Pitch != 0) == (pSrc->Element.Pitch != 0));
548*35ffd701SAndroid Build Coastguard Worker 
549*35ffd701SAndroid Build Coastguard Worker         assert( // Surfaces agree on transfer element size...
550*35ffd701SAndroid Build Coastguard Worker             pDest->Element.Size == pSrc->Element.Size);
551*35ffd701SAndroid Build Coastguard Worker 
552*35ffd701SAndroid Build Coastguard Worker         assert( // Element pitch not specified without element size...
553*35ffd701SAndroid Build Coastguard Worker             !(pDest->Element.Pitch && !pDest->Element.Size));
554*35ffd701SAndroid Build Coastguard Worker 
555*35ffd701SAndroid Build Coastguard Worker         assert( // Legit element sizes...
556*35ffd701SAndroid Build Coastguard Worker             (pDest->Element.Size <= pDest->Element.Pitch) &&
557*35ffd701SAndroid Build Coastguard Worker             (pSrc->Element.Size <= pSrc->Element.Pitch));
558*35ffd701SAndroid Build Coastguard Worker 
559*35ffd701SAndroid Build Coastguard Worker         assert( // Sub-element CopyWidthBytes in terms of LinearSurface pitch...
560*35ffd701SAndroid Build Coastguard Worker             (pLinearSurface->Element.Pitch == 0) ||
561*35ffd701SAndroid Build Coastguard Worker             ((CopyWidthBytes % pLinearSurface->Element.Pitch) == 0));
562*35ffd701SAndroid Build Coastguard Worker     }
563*35ffd701SAndroid Build Coastguard Worker     #endif
564*35ffd701SAndroid Build Coastguard Worker 
565*35ffd701SAndroid Build Coastguard Worker     { // No surface overrun...
566*35ffd701SAndroid Build Coastguard Worker         int NoOverrun =
567*35ffd701SAndroid Build Coastguard Worker             #ifdef SUB_ELEMENT_SUPPORT
568*35ffd701SAndroid Build Coastguard Worker             (
569*35ffd701SAndroid Build Coastguard Worker                 // Sub-element transfer...
570*35ffd701SAndroid Build Coastguard Worker                 ((pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
571*35ffd701SAndroid Build Coastguard Worker                     (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch)) &&
572*35ffd701SAndroid Build Coastguard Worker                 // No overrun...
573*35ffd701SAndroid Build Coastguard Worker                 ((pLinearSurface->OffsetX + CopyWidthBytes) <=
574*35ffd701SAndroid Build Coastguard Worker                     (pLinearSurface->Pitch +
575*35ffd701SAndroid Build Coastguard Worker                      // CopyWidthBytes's inclusion of uncopied bytes...
576*35ffd701SAndroid Build Coastguard Worker                      (pLinearSurface->Element.Pitch - pLinearSurface->Element.Size))) &&
577*35ffd701SAndroid Build Coastguard Worker                 ((pLinearSurface->OffsetY + CopyHeight) <= pLinearSurface->Height) &&
578*35ffd701SAndroid Build Coastguard Worker                 ((pSwizzledSurface->OffsetX +
579*35ffd701SAndroid Build Coastguard Worker                     // Adjust CopyWidthBytes from being in terms of LinearSurface pitch...
580*35ffd701SAndroid Build Coastguard Worker                     (CopyWidthBytes / pLinearSurface->Element.Pitch * pSwizzledSurface->Element.Pitch)
581*35ffd701SAndroid Build Coastguard Worker                     ) <=
582*35ffd701SAndroid Build Coastguard Worker                     (pSwizzledSurface->Pitch +
583*35ffd701SAndroid Build Coastguard Worker                      // CopyWidthBytes's inclusion of uncopied bytes...
584*35ffd701SAndroid Build Coastguard Worker                      (pSwizzledSurface->Element.Pitch - pSwizzledSurface->Element.Size))) &&
585*35ffd701SAndroid Build Coastguard Worker                 ((pSwizzledSurface->OffsetY + CopyHeight) <= pSwizzledSurface->Height)
586*35ffd701SAndroid Build Coastguard Worker             ) ||
587*35ffd701SAndroid Build Coastguard Worker             #endif
588*35ffd701SAndroid Build Coastguard Worker 
589*35ffd701SAndroid Build Coastguard Worker             ((pDest->OffsetX + CopyWidthBytes) <= pDest->Pitch) &&
590*35ffd701SAndroid Build Coastguard Worker             ((pDest->OffsetY + CopyHeight) <= pDest->Height) &&
591*35ffd701SAndroid Build Coastguard Worker             ((pSrc->OffsetX + CopyWidthBytes) <= pSrc->Pitch) &&
592*35ffd701SAndroid Build Coastguard Worker             ((pSrc->OffsetY + CopyHeight) <= pSrc->Height);
593*35ffd701SAndroid Build Coastguard Worker 
594*35ffd701SAndroid Build Coastguard Worker         assert(NoOverrun);
595*35ffd701SAndroid Build Coastguard Worker     }
596*35ffd701SAndroid Build Coastguard Worker 
597*35ffd701SAndroid Build Coastguard Worker     { // No surface overlap...
598*35ffd701SAndroid Build Coastguard Worker         char *pDest0 = (char *) pDest->pBase;
599*35ffd701SAndroid Build Coastguard Worker         char *pDest1 = (char *) pDest->pBase + pDest->Pitch * CopyHeight;
600*35ffd701SAndroid Build Coastguard Worker         char *pSrc0 =  (char *)  pSrc->pBase;
601*35ffd701SAndroid Build Coastguard Worker         char *pSrc1 =  (char *)  pSrc->pBase +  pSrc->Pitch * CopyHeight;
602*35ffd701SAndroid Build Coastguard Worker 
603*35ffd701SAndroid Build Coastguard Worker         assert(!(
604*35ffd701SAndroid Build Coastguard Worker             ((pDest0 >= pSrc0) && (pDest0 < pSrc1)) ||
605*35ffd701SAndroid Build Coastguard Worker             ((pSrc0 >= pDest0) && (pSrc0 < pDest1))));
606*35ffd701SAndroid Build Coastguard Worker     }
607*35ffd701SAndroid Build Coastguard Worker 
608*35ffd701SAndroid Build Coastguard Worker     {
609*35ffd701SAndroid Build Coastguard Worker         /* BLT will have pointer in each surface between which data will be
610*35ffd701SAndroid Build Coastguard Worker         copied from source to destination. Each pointer will be appropriately
611*35ffd701SAndroid Build Coastguard Worker         incremented/positioned through its surface, as BLT rectangle is
612*35ffd701SAndroid Build Coastguard Worker         traversed. */
613*35ffd701SAndroid Build Coastguard Worker 
614*35ffd701SAndroid Build Coastguard Worker         char *pLinearAddress, *pSwizzledAddress;
615*35ffd701SAndroid Build Coastguard Worker 
616*35ffd701SAndroid Build Coastguard Worker         // Convenient to track traversal in swizzled surface offsets...
617*35ffd701SAndroid Build Coastguard Worker         int x0 = pSwizzledSurface->OffsetX;
618*35ffd701SAndroid Build Coastguard Worker         int x1 = x0 + CopyWidthBytes;
619*35ffd701SAndroid Build Coastguard Worker         int y0 = pSwizzledSurface->OffsetY;
620*35ffd701SAndroid Build Coastguard Worker         int y1 = y0 + CopyHeight;
621*35ffd701SAndroid Build Coastguard Worker         int x, y;
622*35ffd701SAndroid Build Coastguard Worker 
623*35ffd701SAndroid Build Coastguard Worker         // Start linear pointer at specified base...
624*35ffd701SAndroid Build Coastguard Worker         pLinearAddress =
625*35ffd701SAndroid Build Coastguard Worker             (char *) pLinearSurface->pBase +
626*35ffd701SAndroid Build Coastguard Worker             pLinearSurface->OffsetY * pLinearSurface->Pitch +
627*35ffd701SAndroid Build Coastguard Worker             pLinearSurface->OffsetX;
628*35ffd701SAndroid Build Coastguard Worker 
629*35ffd701SAndroid Build Coastguard Worker         #ifdef MINIMALIST // Simple implementation for functional understanding/testing/etc.
630*35ffd701SAndroid Build Coastguard Worker         {
631*35ffd701SAndroid Build Coastguard Worker             #ifdef SUB_ELEMENT_SUPPORT
632*35ffd701SAndroid Build Coastguard Worker                 assert( // No Sub-Element Transfer...
633*35ffd701SAndroid Build Coastguard Worker                     (pLinearSurface->Element.Size == pLinearSurface->Element.Pitch) &&
634*35ffd701SAndroid Build Coastguard Worker                     (pSwizzledSurface->Element.Size == pSwizzledSurface->Element.Pitch));
635*35ffd701SAndroid Build Coastguard Worker             #endif
636*35ffd701SAndroid Build Coastguard Worker 
637*35ffd701SAndroid Build Coastguard Worker             for(y = y0; y < y1; y++)
638*35ffd701SAndroid Build Coastguard Worker             {
639*35ffd701SAndroid Build Coastguard Worker                 for(x = x0; x < x1; x++)
640*35ffd701SAndroid Build Coastguard Worker                 {
641*35ffd701SAndroid Build Coastguard Worker                     pSwizzledAddress =
642*35ffd701SAndroid Build Coastguard Worker                         (char *) pSwizzledSurface->pBase +
643*35ffd701SAndroid Build Coastguard Worker                         SwizzleOffset(
644*35ffd701SAndroid Build Coastguard Worker                             pSwizzledSurface->pSwizzle,
645*35ffd701SAndroid Build Coastguard Worker                             pSwizzledSurface->Pitch,
646*35ffd701SAndroid Build Coastguard Worker                             x, y, pSwizzledSurface->OffsetZ);
647*35ffd701SAndroid Build Coastguard Worker 
648*35ffd701SAndroid Build Coastguard Worker                     if(LinearToSwizzled)
649*35ffd701SAndroid Build Coastguard Worker                     {
650*35ffd701SAndroid Build Coastguard Worker                         *pSwizzledAddress = *pLinearAddress;
651*35ffd701SAndroid Build Coastguard Worker                     }
652*35ffd701SAndroid Build Coastguard Worker                     else
653*35ffd701SAndroid Build Coastguard Worker                     {
654*35ffd701SAndroid Build Coastguard Worker                         *pLinearAddress = *pSwizzledAddress;
655*35ffd701SAndroid Build Coastguard Worker                     }
656*35ffd701SAndroid Build Coastguard Worker 
657*35ffd701SAndroid Build Coastguard Worker                     pLinearAddress++;
658*35ffd701SAndroid Build Coastguard Worker                 }
659*35ffd701SAndroid Build Coastguard Worker 
660*35ffd701SAndroid Build Coastguard Worker                 pLinearAddress += pLinearSurface->Pitch - CopyWidthBytes;
661*35ffd701SAndroid Build Coastguard Worker             }
662*35ffd701SAndroid Build Coastguard Worker         }
663*35ffd701SAndroid Build Coastguard Worker         #else // Production/Performance Implementation...
664*35ffd701SAndroid Build Coastguard Worker         {
665*35ffd701SAndroid Build Coastguard Worker             /* Key Performance Gains from...
666*35ffd701SAndroid Build Coastguard Worker                 (1) Efficient Memory Transfers (Ordering + Instruction)
667*35ffd701SAndroid Build Coastguard Worker                 (2) Minimizing Work in Inner Loops */
668*35ffd701SAndroid Build Coastguard Worker 
669*35ffd701SAndroid Build Coastguard Worker             #if(_MSC_VER >= 1600)
670*35ffd701SAndroid Build Coastguard Worker                 #include <stdint.h>
671*35ffd701SAndroid Build Coastguard Worker 
672*35ffd701SAndroid Build Coastguard Worker                 #pragma warning(push)
673*35ffd701SAndroid Build Coastguard Worker                 #pragma warning(disable:4127) // Constant Conditional Expressions
674*35ffd701SAndroid Build Coastguard Worker 
675*35ffd701SAndroid Build Coastguard Worker                 unsigned long LOW_BIT_Index;
676*35ffd701SAndroid Build Coastguard Worker                 #define LOW_BIT(x)  (_BitScanForward(&LOW_BIT_Index, (x)), LOW_BIT_Index)
677*35ffd701SAndroid Build Coastguard Worker 
678*35ffd701SAndroid Build Coastguard Worker                 unsigned long HIGH_BIT_Index;
679*35ffd701SAndroid Build Coastguard Worker                 #define HIGH_BIT(x) (_BitScanReverse(&HIGH_BIT_Index, (x)), HIGH_BIT_Index)
680*35ffd701SAndroid Build Coastguard Worker             #elif(__GNUC__ >= 4)
681*35ffd701SAndroid Build Coastguard Worker                 #include <stdint.h>
682*35ffd701SAndroid Build Coastguard Worker 
683*35ffd701SAndroid Build Coastguard Worker                 #define LOW_BIT(x)  __builtin_ctz(x)
684*35ffd701SAndroid Build Coastguard Worker                 #define HIGH_BIT(x) ((sizeof(x) * CHAR_BIT - 1) - __builtin_clz(x))
685*35ffd701SAndroid Build Coastguard Worker             #else
686*35ffd701SAndroid Build Coastguard Worker                 #error "Unexpected compiler!"
687*35ffd701SAndroid Build Coastguard Worker             #endif
688*35ffd701SAndroid Build Coastguard Worker 
689*35ffd701SAndroid Build Coastguard Worker             typedef struct ___m24
690*35ffd701SAndroid Build Coastguard Worker             {
691*35ffd701SAndroid Build Coastguard Worker                 uint8_t byte[3];
692*35ffd701SAndroid Build Coastguard Worker             } __m24; // 24-bit/3-byte memory element.
693*35ffd701SAndroid Build Coastguard Worker 
694*35ffd701SAndroid Build Coastguard Worker             // Macros intended to compile to various types of "load register from memory" instructions...
695*35ffd701SAndroid Build Coastguard Worker             #define MOVB_R(  Reg, Src) (*(uint8_t  *)&(Reg) = *(uint8_t  *)(Src))
696*35ffd701SAndroid Build Coastguard Worker             #define MOVW_R(  Reg, Src) (*(uint16_t *)&(Reg) = *(uint16_t *)(Src))
697*35ffd701SAndroid Build Coastguard Worker             #define MOV3_R(  Reg, Src) (*(__m24    *)&(Reg) = *(__m24 *)(Src))
698*35ffd701SAndroid Build Coastguard Worker             #define MOVD_R(  Reg, Src) (*(uint32_t *)&(Reg) = *(uint32_t *)(Src))
699*35ffd701SAndroid Build Coastguard Worker 
700*35ffd701SAndroid Build Coastguard Worker             #define MOVQ_R(  Reg, Src) ((Reg) = _mm_loadl_epi64((__m128i *)(Src)))
701*35ffd701SAndroid Build Coastguard Worker             #define MOVDQ_R( Reg, Src) ((Reg) = _mm_load_si128( (__m128i *)(Src)))
702*35ffd701SAndroid Build Coastguard Worker             #define MOVDQU_R(Reg, Src) ((Reg) = _mm_loadu_si128((__m128i *)(Src)))
703*35ffd701SAndroid Build Coastguard Worker 
704*35ffd701SAndroid Build Coastguard Worker             // As above, but the other half: "store to memory from register"...
705*35ffd701SAndroid Build Coastguard Worker             #define MOVB_M(    Dest, Reg)(*(uint8_t  *)(Dest) = *(uint8_t  *)&(Reg))
706*35ffd701SAndroid Build Coastguard Worker             #define MOVW_M(    Dest, Reg)(*(uint16_t *)(Dest) = *(uint16_t *)&(Reg))
707*35ffd701SAndroid Build Coastguard Worker             #define MOV3_M(    Dest, Reg)(*(__m24    *)(Dest) = *(__m24    *)&(Reg))
708*35ffd701SAndroid Build Coastguard Worker             #define MOVD_M(    Dest, Reg)(*(uint32_t *)(Dest) = *(uint32_t *)&(Reg))
709*35ffd701SAndroid Build Coastguard Worker 
710*35ffd701SAndroid Build Coastguard Worker             #define MOVQ_M(    Dest, Reg)(_mm_storel_epi64((__m128i *)(Dest), (Reg)))
711*35ffd701SAndroid Build Coastguard Worker             #define MOVDQ_M(   Dest, Reg)(_mm_store_si128( (__m128i *)(Dest), (Reg)))
712*35ffd701SAndroid Build Coastguard Worker             #define MOVDQU_M(  Dest, Reg)(_mm_storeu_si128((__m128i *)(Dest), (Reg)))
713*35ffd701SAndroid Build Coastguard Worker             #define MOVNTDQ_M( Dest, Reg)(_mm_stream_si128((__m128i *)(Dest), (Reg)))
714*35ffd701SAndroid Build Coastguard Worker 
715*35ffd701SAndroid Build Coastguard Worker 
716*35ffd701SAndroid Build Coastguard Worker             #define MIN_CONTAINED_POW2_BELOW_CAP(x, Cap) (1 << LOW_BIT((1 << LOW_BIT(x)) | (1 << HIGH_BIT(Cap))))
717*35ffd701SAndroid Build Coastguard Worker 
718*35ffd701SAndroid Build Coastguard Worker             #define SWIZZLE_OFFSET(OffsetX, OffsetY, OffsetZ) \
719*35ffd701SAndroid Build Coastguard Worker                 SwizzleOffset(pSwizzledSurface->pSwizzle, pSwizzledSurface->Pitch, OffsetX, OffsetY, OffsetZ)
720*35ffd701SAndroid Build Coastguard Worker 
721*35ffd701SAndroid Build Coastguard Worker             #define MAX_XFER_WIDTH  16  // See "Compute Transfer Dimensions".
722*35ffd701SAndroid Build Coastguard Worker             #define MAX_XFER_HEIGHT 4   // "
723*35ffd701SAndroid Build Coastguard Worker 
724*35ffd701SAndroid Build Coastguard Worker             signed char StreamingLoadSupported = -1; // SSE4.1: MOVNTDQA. -1 = not yet probed; explicitly signed so the -1 sentinel test below also holds where plain char is unsigned (e.g. ARM).
725*35ffd701SAndroid Build Coastguard Worker 
726*35ffd701SAndroid Build Coastguard Worker             int TileWidthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.x);   // Log2(Tile Width in Bytes)
727*35ffd701SAndroid Build Coastguard Worker             int TileHeightBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.y);  // Log2(Tile Height)
728*35ffd701SAndroid Build Coastguard Worker             int TileDepthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.z);   // Log2(Tile Depth or MSAA Samples)
729*35ffd701SAndroid Build Coastguard Worker             int BytesPerRowOfTiles = pSwizzledSurface->Pitch << (TileDepthBits + TileHeightBits);
730*35ffd701SAndroid Build Coastguard Worker 
731*35ffd701SAndroid Build Coastguard Worker             struct { int LeftCrust, MainRun, RightCrust; } CopyWidth;
732*35ffd701SAndroid Build Coastguard Worker             int MaskX[MAX_XFER_WIDTH + 1], MaskY[MAX_XFER_HEIGHT + 1];
733*35ffd701SAndroid Build Coastguard Worker             int SwizzledOffsetX0, SwizzledOffsetY;
734*35ffd701SAndroid Build Coastguard Worker             struct { int Width, Height; } SwizzleMaxXfer;
735*35ffd701SAndroid Build Coastguard Worker 
736*35ffd701SAndroid Build Coastguard Worker             char *pSwizzledAddressCopyBase =
737*35ffd701SAndroid Build Coastguard Worker                 (char *) pSwizzledSurface->pBase +
738*35ffd701SAndroid Build Coastguard Worker                 SWIZZLE_OFFSET(0, 0, pSwizzledSurface->OffsetZ);
739*35ffd701SAndroid Build Coastguard Worker 
740*35ffd701SAndroid Build Coastguard Worker             assert(sizeof(__m24) == 3);
741*35ffd701SAndroid Build Coastguard Worker 
742*35ffd701SAndroid Build Coastguard Worker             if(StreamingLoadSupported == -1)
743*35ffd701SAndroid Build Coastguard Worker             {
744*35ffd701SAndroid Build Coastguard Worker                 #if(_MSC_VER >= 1500)
745*35ffd701SAndroid Build Coastguard Worker                     #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
746*35ffd701SAndroid Build Coastguard Worker                     int CpuInfo[4];
747*35ffd701SAndroid Build Coastguard Worker                     __cpuid(CpuInfo, 1);
748*35ffd701SAndroid Build Coastguard Worker                     StreamingLoadSupported = ((CpuInfo[2] & (1 << 19)) != 0); // ECX[19] = SSE4.1
749*35ffd701SAndroid Build Coastguard Worker                 #elif(defined(__ARM_ARCH))
750*35ffd701SAndroid Build Coastguard Worker                     #define MOVNTDQA_R(Reg, Src) ((Reg) = (Reg))
751*35ffd701SAndroid Build Coastguard Worker                     StreamingLoadSupported = 0;
752*35ffd701SAndroid Build Coastguard Worker                 #elif((defined __clang__) || (__GNUC__ > 4) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))
753*35ffd701SAndroid Build Coastguard Worker                     #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
754*35ffd701SAndroid Build Coastguard Worker                     unsigned int eax, ebx, ecx, edx;
755*35ffd701SAndroid Build Coastguard Worker                     __cpuid(1, eax, ebx, ecx, edx);
756*35ffd701SAndroid Build Coastguard Worker                     StreamingLoadSupported = ((ecx & (1 << 19)) != 0); // ECX[19] = SSE4.1
757*35ffd701SAndroid Build Coastguard Worker                 #else
758*35ffd701SAndroid Build Coastguard Worker                     #define MOVNTDQA_R(Reg, Src) ((Reg) = (Reg))
759*35ffd701SAndroid Build Coastguard Worker                     StreamingLoadSupported = 0;
760*35ffd701SAndroid Build Coastguard Worker                 #endif
761*35ffd701SAndroid Build Coastguard Worker             }
762*35ffd701SAndroid Build Coastguard Worker 
763*35ffd701SAndroid Build Coastguard Worker             { // Compute Transfer Dimensions...
764*35ffd701SAndroid Build Coastguard Worker 
765*35ffd701SAndroid Build Coastguard Worker                 /* When transferring between linear and swizzled surfaces, we
766*35ffd701SAndroid Build Coastguard Worker                 can't traverse linearly through memory of both since they have
767*35ffd701SAndroid Build Coastguard Worker                 drastically different memory orderings--Moving linearly through
768*35ffd701SAndroid Build Coastguard Worker                 one means bouncing around the other.
769*35ffd701SAndroid Build Coastguard Worker 
770*35ffd701SAndroid Build Coastguard Worker                 Moving linearly through linear surface is more programmatically
771*35ffd701SAndroid Build Coastguard Worker                 convenient--especially when BLT rectangles not constrained to
772*35ffd701SAndroid Build Coastguard Worker                 tile boundaries. But moving linearly through swizzled surface
773*35ffd701SAndroid Build Coastguard Worker                 memory is often more performance-friendly--especially when that
774*35ffd701SAndroid Build Coastguard Worker                 memory is CPU-mapped as WC (Write Combining), which is often
775*35ffd701SAndroid Build Coastguard Worker                 the case for graphics memory.
776*35ffd701SAndroid Build Coastguard Worker 
777*35ffd701SAndroid Build Coastguard Worker                 Fortunately, we can avoid shortcomings of both extremes by
778*35ffd701SAndroid Build Coastguard Worker                 using hybrid traversal: Traverse mostly linearly through linear
779*35ffd701SAndroid Build Coastguard Worker                 surface, but have innermost loop transfer small 2D chunks sized
780*35ffd701SAndroid Build Coastguard Worker                 to use critical runs of linearity in the swizzled memory.
781*35ffd701SAndroid Build Coastguard Worker 
782*35ffd701SAndroid Build Coastguard Worker                 The "critical runs of linearity" that we want to hit in the
783*35ffd701SAndroid Build Coastguard Worker                 swizzled memory are aligned, cache-line-sized memory chunks. If
784*35ffd701SAndroid Build Coastguard Worker                 we bounce around with finer granularity we'll incur penalties
785*35ffd701SAndroid Build Coastguard Worker                 of partial WC buffer use (whether from WC memory use or non-
786*35ffd701SAndroid Build Coastguard Worker                 temporal stores).
787*35ffd701SAndroid Build Coastguard Worker 
788*35ffd701SAndroid Build Coastguard Worker                 The size of 2D chunks with cache-line-sized linearity in
789*35ffd701SAndroid Build Coastguard Worker                 swizzled memory is determined by swizzle mapping's low-order
790*35ffd701SAndroid Build Coastguard Worker                 six bits (for 64-byte cache lines). Most swizzles use
791*35ffd701SAndroid Build Coastguard Worker                 "Y Y X X X X" in their low-order bits, which means their cache
792*35ffd701SAndroid Build Coastguard Worker                 lines store 16x4 chunks--So our implementation will use those
793*35ffd701SAndroid Build Coastguard Worker                 dimensions as our target/maximum 2D transfer chunk. If we had
794*35ffd701SAndroid Build Coastguard Worker                 any 8x8 (or taller) swizzles, we should add such support and
795*35ffd701SAndroid Build Coastguard Worker                 increase our maximum chunk height. If we had any 32x2 swizzles,
796*35ffd701SAndroid Build Coastguard Worker                 we should add such support and increase our maximum chunk width.
797*35ffd701SAndroid Build Coastguard Worker 
798*35ffd701SAndroid Build Coastguard Worker                 Our implementation only bothers optimizing for 2D transfer
799*35ffd701SAndroid Build Coastguard Worker                 chunks stored in row-major order--i.e. those whose swizzle
800*35ffd701SAndroid Build Coastguard Worker                 mapping bits have a series of X's in the low-order, followed by
801*35ffd701SAndroid Build Coastguard Worker                 Y's in the higher-order. Where a swizzle mapping inflection
802*35ffd701SAndroid Build Coastguard Worker                 from Y back to X occurs, contiguous row-ordering is lost, and
803*35ffd701SAndroid Build Coastguard Worker                 we would use that smaller, row-ordered chunk size. */
804*35ffd701SAndroid Build Coastguard Worker 
805*35ffd701SAndroid Build Coastguard Worker                 int TargetMask;
806*35ffd701SAndroid Build Coastguard Worker 
807*35ffd701SAndroid Build Coastguard Worker                 // Narrow optimized transfer Width by looking for inflection from X's...
808*35ffd701SAndroid Build Coastguard Worker                 SwizzleMaxXfer.Width = MAX_XFER_WIDTH;
809*35ffd701SAndroid Build Coastguard Worker                 while(  (TargetMask = SwizzleMaxXfer.Width - 1) &&
810*35ffd701SAndroid Build Coastguard Worker                         ((pSwizzledSurface->pSwizzle->Mask.x & TargetMask) != TargetMask))
811*35ffd701SAndroid Build Coastguard Worker                 {
812*35ffd701SAndroid Build Coastguard Worker                     SwizzleMaxXfer.Width >>= 1;
813*35ffd701SAndroid Build Coastguard Worker                 }
814*35ffd701SAndroid Build Coastguard Worker 
815*35ffd701SAndroid Build Coastguard Worker                 // Narrow optimized transfer height by looking for inflection from Y's...
816*35ffd701SAndroid Build Coastguard Worker                 SwizzleMaxXfer.Height = MAX_XFER_HEIGHT;
817*35ffd701SAndroid Build Coastguard Worker 
818*35ffd701SAndroid Build Coastguard Worker                 while(  (TargetMask = (SwizzleMaxXfer.Height - 1) * SwizzleMaxXfer.Width) &&
819*35ffd701SAndroid Build Coastguard Worker                         ((pSwizzledSurface->pSwizzle->Mask.y & TargetMask) != TargetMask))
820*35ffd701SAndroid Build Coastguard Worker                 {
821*35ffd701SAndroid Build Coastguard Worker                     SwizzleMaxXfer.Height >>= 1;
822*35ffd701SAndroid Build Coastguard Worker                 }
823*35ffd701SAndroid Build Coastguard Worker             }
824*35ffd701SAndroid Build Coastguard Worker 
825*35ffd701SAndroid Build Coastguard Worker             { // Separate CopyWidthBytes into unaligned left/right "crust" and aligned "MainRun"...
826*35ffd701SAndroid Build Coastguard Worker                 int MaxXferWidth = MIN_CONTAINED_POW2_BELOW_CAP(SwizzleMaxXfer.Width, CopyWidthBytes);
827*35ffd701SAndroid Build Coastguard Worker 
828*35ffd701SAndroid Build Coastguard Worker                 CopyWidth.LeftCrust = // i.e. "bytes to xfer-aligned boundary"
829*35ffd701SAndroid Build Coastguard Worker                     (MaxXferWidth - x0) & (MaxXferWidth - 1); // Simplification of ((MaxXferWidth - (x0 % MaxXferWidth)) % MaxXferWidth)
830*35ffd701SAndroid Build Coastguard Worker 
831*35ffd701SAndroid Build Coastguard Worker                 CopyWidth.MainRun =
832*35ffd701SAndroid Build Coastguard Worker                     (CopyWidthBytes - CopyWidth.LeftCrust) & ~(SwizzleMaxXfer.Width - 1); // MainRun is of SwizzleMaxXfer.Width's--not MaxXferWidth's.
833*35ffd701SAndroid Build Coastguard Worker 
834*35ffd701SAndroid Build Coastguard Worker                 CopyWidth.RightCrust = CopyWidthBytes - (CopyWidth.LeftCrust + CopyWidth.MainRun);
835*35ffd701SAndroid Build Coastguard Worker 
836*35ffd701SAndroid Build Coastguard Worker                 #ifdef SUB_ELEMENT_SUPPORT
837*35ffd701SAndroid Build Coastguard Worker                 {
838*35ffd701SAndroid Build Coastguard Worker                     // For partial-pixel transfers, there is no crust and MainRun is done pixel-by-pixel...
839*35ffd701SAndroid Build Coastguard Worker                     if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
840*35ffd701SAndroid Build Coastguard Worker                         (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
841*35ffd701SAndroid Build Coastguard Worker                     {
842*35ffd701SAndroid Build Coastguard Worker                         CopyWidth.LeftCrust = CopyWidth.RightCrust = 0;
843*35ffd701SAndroid Build Coastguard Worker                         CopyWidth.MainRun = CopyWidthBytes;
844*35ffd701SAndroid Build Coastguard Worker                     }
845*35ffd701SAndroid Build Coastguard Worker                 }
846*35ffd701SAndroid Build Coastguard Worker                 #endif
847*35ffd701SAndroid Build Coastguard Worker             }
848*35ffd701SAndroid Build Coastguard Worker 
849*35ffd701SAndroid Build Coastguard Worker 
850*35ffd701SAndroid Build Coastguard Worker             /* Unlike in MINIMALIST implementation, which fully computes
851*35ffd701SAndroid Build Coastguard Worker             swizzled offset for each transfer element, we want to minimize work
852*35ffd701SAndroid Build Coastguard Worker             done in our inner loops.
853*35ffd701SAndroid Build Coastguard Worker 
854*35ffd701SAndroid Build Coastguard Worker             One way we'll reduce work is to separate pSwizzledAddress into
855*35ffd701SAndroid Build Coastguard Worker             dimensional components--e.g. so Y-swizzling doesn't have to be
856*35ffd701SAndroid Build Coastguard Worker             recomputed in X-loop.
857*35ffd701SAndroid Build Coastguard Worker 
858*35ffd701SAndroid Build Coastguard Worker             But a more powerful way we'll reduce work is...Instead of linearly
859*35ffd701SAndroid Build Coastguard Worker             incrementing spatial offsets and then converting to their swizzled
860*35ffd701SAndroid Build Coastguard Worker             counterparts, we'll compute swizzled bases outside the loops and
861*35ffd701SAndroid Build Coastguard Worker             keep them swizzled using swizzled incrementing inside the loops--
862*35ffd701SAndroid Build Coastguard Worker             since swizzled incrementing can be much cheaper than repeatedly
863*35ffd701SAndroid Build Coastguard Worker             swizzling spatial offsets.
864*35ffd701SAndroid Build Coastguard Worker 
865*35ffd701SAndroid Build Coastguard Worker             Intra-tile swizzled incrementing can be done by using the inverse
866*35ffd701SAndroid Build Coastguard Worker             of a spatial component's swizzle mask to ripple-carry a +1 to and
867*35ffd701SAndroid Build Coastguard Worker             across the bits of a currently swizzled value--e.g. with...
868*35ffd701SAndroid Build Coastguard Worker 
869*35ffd701SAndroid Build Coastguard Worker                 SwizzledOffsetY:   Y X Y X Y Y X X X X
870*35ffd701SAndroid Build Coastguard Worker                          ~MaskY:   0 1 0 1 0 0 1 1 1 1
871*35ffd701SAndroid Build Coastguard Worker                                  +                   1
872*35ffd701SAndroid Build Coastguard Worker                                 -----------------------
873*35ffd701SAndroid Build Coastguard Worker 
874*35ffd701SAndroid Build Coastguard Worker             ...set low-order ~MaskY bits will always ripple-carry the
875*35ffd701SAndroid Build Coastguard Worker             incrementing +1 to wherever Y0 happens to be, and wherever there is
876*35ffd701SAndroid Build Coastguard Worker             an arithmetic carry out of one Y position, set ~MaskY bits will
877*35ffd701SAndroid Build Coastguard Worker             carry it across any gaps to the next Y position.
878*35ffd701SAndroid Build Coastguard Worker 
879*35ffd701SAndroid Build Coastguard Worker             The above algorithm only works for adding one, but the mask used
880*35ffd701SAndroid Build Coastguard Worker             can be modified to deliver the +1 to any bit location, so any power
881*35ffd701SAndroid Build Coastguard Worker             of two increment can be achieved.
882*35ffd701SAndroid Build Coastguard Worker 
883*35ffd701SAndroid Build Coastguard Worker             After swizzled increment, residue from mask addition and undesired
884*35ffd701SAndroid Build Coastguard Worker             carries outside targeted fields must be removed using the natural
885*35ffd701SAndroid Build Coastguard Worker             mask--So the final intra-tile swizzled increment is...
886*35ffd701SAndroid Build Coastguard Worker 
887*35ffd701SAndroid Build Coastguard Worker                 SwizzledOffsetQ = (SwizzledOffsetQ + ~MaskQ + 1) & MaskQ
888*35ffd701SAndroid Build Coastguard Worker                     ...where Q is the applicable X/Y/Z dimensional component.
889*35ffd701SAndroid Build Coastguard Worker 
890*35ffd701SAndroid Build Coastguard Worker                 Or since in two's complement, (~MaskQ + 1) = -MaskQ...
891*35ffd701SAndroid Build Coastguard Worker 
892*35ffd701SAndroid Build Coastguard Worker                 SwizzledOffsetQ = (SwizzledOffsetQ - MaskQ) & MaskQ
893*35ffd701SAndroid Build Coastguard Worker 
894*35ffd701SAndroid Build Coastguard Worker             Since tile sizes are powers of two and tiles laid out in row-major
895*35ffd701SAndroid Build Coastguard Worker             order across surface, the above swizzled incrementing can
896*35ffd701SAndroid Build Coastguard Worker             additionally be used for inter-tile incrementing of X component by
897*35ffd701SAndroid Build Coastguard Worker             extending applicable mask to include offset bits beyond the tile--
898*35ffd701SAndroid Build Coastguard Worker             so arithmetic carries out of intra-tile X component will ripple to
899*35ffd701SAndroid Build Coastguard Worker             advance swizzled inter-tile X offset to next tile. Same is not true
900*35ffd701SAndroid Build Coastguard Worker             of inter-tile Y incrementing since surface pitches not restricted
901*35ffd701SAndroid Build Coastguard Worker             to powers of two. */
902*35ffd701SAndroid Build Coastguard Worker 
903*35ffd701SAndroid Build Coastguard Worker             { // Compute Mask[IncSize] for Needed Increment Values...
904*35ffd701SAndroid Build Coastguard Worker                 int ExtendedMaskX = // Bits beyond the tile (so X incrementing can operate inter-tile)...
905*35ffd701SAndroid Build Coastguard Worker                     ~(pSwizzledSurface->pSwizzle->Mask.x |
906*35ffd701SAndroid Build Coastguard Worker                       pSwizzledSurface->pSwizzle->Mask.y |
907*35ffd701SAndroid Build Coastguard Worker                       pSwizzledSurface->pSwizzle->Mask.z);
908*35ffd701SAndroid Build Coastguard Worker 
909*35ffd701SAndroid Build Coastguard Worker                 /* Subtraction below delivers natural mask for +1 increment,
910*35ffd701SAndroid Build Coastguard Worker                 and appropriately altered mask to deliver +1 to higher bit
911*35ffd701SAndroid Build Coastguard Worker                 positions for +2/4/8/etc. increments. */
912*35ffd701SAndroid Build Coastguard Worker 
913*35ffd701SAndroid Build Coastguard Worker                 for(x = SwizzleMaxXfer.Width; x >= 1; x >>= 1)
914*35ffd701SAndroid Build Coastguard Worker                 {
915*35ffd701SAndroid Build Coastguard Worker                     MaskX[x] = SWIZZLE_OFFSET((1 << TileWidthBits) - x, 0, 0) | ExtendedMaskX;
916*35ffd701SAndroid Build Coastguard Worker                 }
917*35ffd701SAndroid Build Coastguard Worker 
918*35ffd701SAndroid Build Coastguard Worker                 for(y = SwizzleMaxXfer.Height; y >= 1; y >>= 1)
919*35ffd701SAndroid Build Coastguard Worker                 {
920*35ffd701SAndroid Build Coastguard Worker                     MaskY[y] = SWIZZLE_OFFSET(0, (1 << TileHeightBits) - y, 0);
921*35ffd701SAndroid Build Coastguard Worker                 }
922*35ffd701SAndroid Build Coastguard Worker             }
923*35ffd701SAndroid Build Coastguard Worker 
924*35ffd701SAndroid Build Coastguard Worker             { // Base Dimensional Swizzled Offsets...
925*35ffd701SAndroid Build Coastguard Worker                 int IntraTileY = y0 & ((1 << TileHeightBits) - 1);
926*35ffd701SAndroid Build Coastguard Worker                 int TileAlignedY = y0 - IntraTileY;
927*35ffd701SAndroid Build Coastguard Worker 
928*35ffd701SAndroid Build Coastguard Worker                 SwizzledOffsetY = SWIZZLE_OFFSET(0, IntraTileY, 0);
929*35ffd701SAndroid Build Coastguard Worker 
930*35ffd701SAndroid Build Coastguard Worker                 SwizzledOffsetX0 =
931*35ffd701SAndroid Build Coastguard Worker                     SWIZZLE_OFFSET(
932*35ffd701SAndroid Build Coastguard Worker                         x0,
933*35ffd701SAndroid Build Coastguard Worker                         TileAlignedY, // <-- Since SwizzledOffsetX will include "bits beyond the tile".
934*35ffd701SAndroid Build Coastguard Worker                         0);
935*35ffd701SAndroid Build Coastguard Worker             }
936*35ffd701SAndroid Build Coastguard Worker 
937*35ffd701SAndroid Build Coastguard Worker             // BLT Loops ///////////////////////////////////////////////////////
938*35ffd701SAndroid Build Coastguard Worker 
939*35ffd701SAndroid Build Coastguard Worker             /* Traverse BLT rectangle, transferring small, optimally-aligned 2D
940*35ffd701SAndroid Build Coastguard Worker             chunks, as appropriate for given swizzle format. Use swizzled
941*35ffd701SAndroid Build Coastguard Worker             incrementing of dimensional swizzled components. */
942*35ffd701SAndroid Build Coastguard Worker 
943*35ffd701SAndroid Build Coastguard Worker             for(y = y0; y < y1; )
944*35ffd701SAndroid Build Coastguard Worker             {
945*35ffd701SAndroid Build Coastguard Worker                 char *pSwizzledAddressLine = pSwizzledAddressCopyBase + SwizzledOffsetY;
946*35ffd701SAndroid Build Coastguard Worker                 int xferHeight =
947*35ffd701SAndroid Build Coastguard Worker                     // Largest pow2 xfer height that alignment, MaxXfer, and lines left will permit...
948*35ffd701SAndroid Build Coastguard Worker                     MIN_CONTAINED_POW2_BELOW_CAP(y | SwizzleMaxXfer.Height, y1 - y);
949*35ffd701SAndroid Build Coastguard Worker                 int SwizzledOffsetX = SwizzledOffsetX0;
950*35ffd701SAndroid Build Coastguard Worker 
951*35ffd701SAndroid Build Coastguard Worker                 __m128i xmm[MAX_XFER_HEIGHT];
952*35ffd701SAndroid Build Coastguard Worker                 char *pLinearAddressEnd;
953*35ffd701SAndroid Build Coastguard Worker                 int _MaskX;
954*35ffd701SAndroid Build Coastguard Worker 
955*35ffd701SAndroid Build Coastguard Worker                 // XFER Macros /////////////////////////////////////////////////
956*35ffd701SAndroid Build Coastguard Worker 
957*35ffd701SAndroid Build Coastguard Worker                 /* We'll define "XFER" macro to contain BLT X-loop work.
958*35ffd701SAndroid Build Coastguard Worker 
959*35ffd701SAndroid Build Coastguard Worker                 In simple implementation, XFER would be WHILE loop that does
960*35ffd701SAndroid Build Coastguard Worker                 SSE transfer and performs pointer and swizzled offset
961*35ffd701SAndroid Build Coastguard Worker                 incrementing.
962*35ffd701SAndroid Build Coastguard Worker 
963*35ffd701SAndroid Build Coastguard Worker                 ...but we have multiple conditions to handle...
964*35ffd701SAndroid Build Coastguard Worker                   - Transfer Direction (Linear <--> Swizzled)
965*35ffd701SAndroid Build Coastguard Worker                   - Optimal 2D Transfer Chunk Size
966*35ffd701SAndroid Build Coastguard Worker                   - Available/Desired CPU Transfer Instructions
967*35ffd701SAndroid Build Coastguard Worker                   - Unaligned Crust
968*35ffd701SAndroid Build Coastguard Worker 
969*35ffd701SAndroid Build Coastguard Worker                 Don't want X-loop to have conditional logic to handle
970*35ffd701SAndroid Build Coastguard Worker                 variations since would retard performance--but neither do we
971*35ffd701SAndroid Build Coastguard Worker                 want messy multitude of slightly different, copy-pasted code
972*35ffd701SAndroid Build Coastguard Worker                 paths. So instead, XFER macro will provide common code template
973*35ffd701SAndroid Build Coastguard Worker                 allowing instantiation of multiple X-loop variations--i.e. XFER
974*35ffd701SAndroid Build Coastguard Worker                 calls from conditional Y-loop code will expand into separate,
975*35ffd701SAndroid Build Coastguard Worker                 conditional-free, "lean and mean" X-loops.
976*35ffd701SAndroid Build Coastguard Worker 
977*35ffd701SAndroid Build Coastguard Worker                 Some conditional logic remains in XFER chain--but only outside
978*35ffd701SAndroid Build Coastguard Worker                 X-loop. The two IF statements that remain in X-loop (i.e. those
979*35ffd701SAndroid Build Coastguard Worker                 in XFER_LOAD/STORE) expand to compile-time constant conditional
980*35ffd701SAndroid Build Coastguard Worker                 expressions, so with optimizing compiler, no runtime-
981*35ffd701SAndroid Build Coastguard Worker                 conditional code will be generated--i.e. constant conditionals
982*35ffd701SAndroid Build Coastguard Worker                 will simply decide whether given instantiation has that code or
983*35ffd701SAndroid Build Coastguard Worker                 not. */
984*35ffd701SAndroid Build Coastguard Worker 
985*35ffd701SAndroid Build Coastguard Worker                 #define XFER(XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
986*35ffd701SAndroid Build Coastguard Worker                 {                                                                                                   \
987*35ffd701SAndroid Build Coastguard Worker                          XFER_LINES(4, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
988*35ffd701SAndroid Build Coastguard Worker                     else XFER_LINES(2, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
989*35ffd701SAndroid Build Coastguard Worker                     else XFER_LINES(1, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust);\
990*35ffd701SAndroid Build Coastguard Worker                 }
991*35ffd701SAndroid Build Coastguard Worker 
992*35ffd701SAndroid Build Coastguard Worker                 #define XFER_LINES(XFER_LINES_Lines, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
993*35ffd701SAndroid Build Coastguard Worker                     if(xferHeight == (XFER_LINES_Lines))    \
994*35ffd701SAndroid Build Coastguard Worker                     {                                       \
995*35ffd701SAndroid Build Coastguard Worker                         if(XFER_Crust)                      \
996*35ffd701SAndroid Build Coastguard Worker                         {                                   \
997*35ffd701SAndroid Build Coastguard Worker                             XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.LeftCrust  & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
998*35ffd701SAndroid Build Coastguard Worker                             XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.LeftCrust  & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
999*35ffd701SAndroid Build Coastguard Worker                             XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.LeftCrust  & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1000*35ffd701SAndroid Build Coastguard Worker                             XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.LeftCrust  & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1001*35ffd701SAndroid Build Coastguard Worker                         }                                   \
1002*35ffd701SAndroid Build Coastguard Worker                                                             \
1003*35ffd701SAndroid Build Coastguard Worker                         XFER_SPAN(XFER_Store, XFER_Load, CopyWidth.MainRun, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch);\
1004*35ffd701SAndroid Build Coastguard Worker                                                             \
1005*35ffd701SAndroid Build Coastguard Worker                         if(XFER_Crust)                      \
1006*35ffd701SAndroid Build Coastguard Worker                         {                                   \
1007*35ffd701SAndroid Build Coastguard Worker                             XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.RightCrust & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1008*35ffd701SAndroid Build Coastguard Worker                             XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.RightCrust & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1009*35ffd701SAndroid Build Coastguard Worker                             XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.RightCrust & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1010*35ffd701SAndroid Build Coastguard Worker                             XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.RightCrust & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1011*35ffd701SAndroid Build Coastguard Worker                         }                                   \
1012*35ffd701SAndroid Build Coastguard Worker                     }
1013*35ffd701SAndroid Build Coastguard Worker 
/* XFER_SPAN: move one horizontal span, at most four lines tall, between the
   linear and the swizzled surface.

   The macro expands in place and reads/writes variables of the enclosing
   scope rather than taking them as arguments: pLinearAddress,
   pLinearAddressEnd, pSwizzledAddress, pSwizzledAddressLine, SwizzledOffsetX,
   _MaskX, MaskX[] and (through XFER_LOAD / XFER_STORE) xmm[].

   Parameters:
     XFER_Store, XFER_Load - names of the store / load operations, forwarded
                             to XFER_STORE / XFER_LOAD.
     XFER_CopyWidthBytes   - added to pLinearAddress to form the loop end. A
                             value of 0 makes the whole span a no-op, which is
                             how invocations such as "CopyWidth.LeftCrust & 1"
                             become conditional.
     XFER_Pitch_Swizzled   - index into MaskX[] that selects the mask used to
                             step SwizzledOffsetX. NOTE(review): MaskX is
                             defined outside this view; presumably the index
                             is the per-step width in bytes -- confirm.
     XFER_Pitch_Linear     - amount added to pLinearAddress per iteration.
     XFER_Height           - line count; lines 0..3 are each attempted and
                             individually guarded by (line < XFER_Height), so
                             no more than four lines are ever moved.
     XFER_pDest, XFER_DestPitch - base and per-line stride used for stores.
     XFER_pSrc, XFER_SrcPitch   - base and per-line stride used for loads.

   Arguments are substituted textually, so XFER_pDest / XFER_pSrc are
   re-evaluated on every iteration. Callers in this file appear to pass
   pSwizzledAddress / pLinearAddress, which this loop updates as it goes.

   Each iteration: recompute pSwizzledAddress from the current
   SwizzledOffsetX, issue every load before any store, then advance
   SwizzledOffsetX with the subtract-then-mask step (the same form is
   labelled a "swizzled inc" where this function applies it to
   SwizzledOffsetY) and bump pLinearAddress by XFER_Pitch_Linear. */
1014*35ffd701SAndroid Build Coastguard Worker                 #define XFER_SPAN(XFER_Store, XFER_Load, XFER_CopyWidthBytes, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_Height, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch) \
1015*35ffd701SAndroid Build Coastguard Worker                 {                                                                           \
1016*35ffd701SAndroid Build Coastguard Worker                     pLinearAddressEnd = pLinearAddress + (XFER_CopyWidthBytes);             \
1017*35ffd701SAndroid Build Coastguard Worker                     _MaskX = MaskX[XFER_Pitch_Swizzled];                                    \
1018*35ffd701SAndroid Build Coastguard Worker                     while(pLinearAddress < pLinearAddressEnd)                               \
1019*35ffd701SAndroid Build Coastguard Worker                     {                                                                       \
1020*35ffd701SAndroid Build Coastguard Worker                         pSwizzledAddress = pSwizzledAddressLine + SwizzledOffsetX;          \
1021*35ffd701SAndroid Build Coastguard Worker                                                                                             \
1022*35ffd701SAndroid Build Coastguard Worker                         XFER_LOAD(0, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
1023*35ffd701SAndroid Build Coastguard Worker                         XFER_LOAD(1, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
1024*35ffd701SAndroid Build Coastguard Worker                         XFER_LOAD(2, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
1025*35ffd701SAndroid Build Coastguard Worker                         XFER_LOAD(3, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height);     \
1026*35ffd701SAndroid Build Coastguard Worker                         XFER_STORE(0, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1027*35ffd701SAndroid Build Coastguard Worker                         XFER_STORE(1, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1028*35ffd701SAndroid Build Coastguard Worker                         XFER_STORE(2, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1029*35ffd701SAndroid Build Coastguard Worker                         XFER_STORE(3, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1030*35ffd701SAndroid Build Coastguard Worker                                                                                             \
1031*35ffd701SAndroid Build Coastguard Worker                         SwizzledOffsetX = (SwizzledOffsetX - _MaskX) & _MaskX;              \
1032*35ffd701SAndroid Build Coastguard Worker                         pLinearAddress += (XFER_Pitch_Linear);                              \
1033*35ffd701SAndroid Build Coastguard Worker                     }                                                                       \
1034*35ffd701SAndroid Build Coastguard Worker                 }
1035*35ffd701SAndroid Build Coastguard Worker 
/* XFER_LOAD: conditionally load one line of the current span into xmm[].

   If XFER_Line is less than XFER_Height, invokes
       XFER_Load(xmm[XFER_Line], XFER_pSrc + XFER_Line * XFER_SrcPitch)
   i.e. the load operation receives the register slot first and the source
   address second. Otherwise nothing is emitted for this line.

   Parameters:
     XFER_Line     - line index within the span; XFER_SPAN passes the
                     literals 0..3. It also selects the xmm[] slot.
     XFER_Load     - name of the load operation to invoke.
     XFER_pSrc     - source address of line 0.
     XFER_SrcPitch - per-line multiplier applied to XFER_Line. NOTE(review):
                     the resulting offset is scaled by the pointed-to type of
                     XFER_pSrc, which is declared outside this view.
     XFER_Height   - number of valid lines; acts as the guard bound.

   xmm[] comes from the enclosing scope and is not a macro parameter. */
1036*35ffd701SAndroid Build Coastguard Worker                 #define XFER_LOAD(XFER_Line, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height) \
1037*35ffd701SAndroid Build Coastguard Worker                 {                                                           \
1038*35ffd701SAndroid Build Coastguard Worker                     if((XFER_Line) < (XFER_Height))                         \
1039*35ffd701SAndroid Build Coastguard Worker                     {                                                       \
1040*35ffd701SAndroid Build Coastguard Worker                         XFER_Load(                                          \
1041*35ffd701SAndroid Build Coastguard Worker                             xmm[XFER_Line],                                 \
1042*35ffd701SAndroid Build Coastguard Worker                             (XFER_pSrc) + (XFER_Line) * (XFER_SrcPitch));   \
1043*35ffd701SAndroid Build Coastguard Worker                     }                                                       \
1044*35ffd701SAndroid Build Coastguard Worker                 }
1045*35ffd701SAndroid Build Coastguard Worker 
/* XFER_STORE: conditionally store one line of the current span from xmm[].

   If XFER_Line is less than XFER_Height, invokes
       XFER_Store(XFER_pDest + XFER_Line * XFER_DestPitch, xmm[XFER_Line])
   i.e. the store operation receives the destination address first and the
   register slot second (the reverse of XFER_LOAD's argument order).
   Otherwise nothing is emitted for this line.

   Parameters:
     XFER_Line      - line index within the span; XFER_SPAN passes the
                      literals 0..3. It also selects the xmm[] slot, so it
                      pairs with the XFER_LOAD that used the same index.
     XFER_Store     - name of the store operation to invoke.
     XFER_pDest     - destination address of line 0.
     XFER_DestPitch - per-line multiplier applied to XFER_Line. NOTE(review):
                      the resulting offset is scaled by the pointed-to type
                      of XFER_pDest, which is declared outside this view.
     XFER_Height    - number of valid lines; acts as the guard bound.

   xmm[] comes from the enclosing scope and is not a macro parameter. */
1046*35ffd701SAndroid Build Coastguard Worker                 #define XFER_STORE(XFER_Line, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height) \
1047*35ffd701SAndroid Build Coastguard Worker                 {                                                           \
1048*35ffd701SAndroid Build Coastguard Worker                     if((XFER_Line) < (XFER_Height))                         \
1049*35ffd701SAndroid Build Coastguard Worker                     {                                                       \
1050*35ffd701SAndroid Build Coastguard Worker                         XFER_Store(                                         \
1051*35ffd701SAndroid Build Coastguard Worker                             (XFER_pDest) + (XFER_Line) * (XFER_DestPitch),  \
1052*35ffd701SAndroid Build Coastguard Worker                             xmm[XFER_Line]);                                \
1053*35ffd701SAndroid Build Coastguard Worker                     }                                                       \
1054*35ffd701SAndroid Build Coastguard Worker                 }
1055*35ffd701SAndroid Build Coastguard Worker 
1056*35ffd701SAndroid Build Coastguard Worker                 // Perform Applicable Transfer /////////////////////////////////
1057*35ffd701SAndroid Build Coastguard Worker                 assert( // DQ Alignment...
1058*35ffd701SAndroid Build Coastguard Worker                     ((intptr_t) pSwizzledSurface->pBase % 16 == 0) &&
1059*35ffd701SAndroid Build Coastguard Worker                     (pSwizzledSurface->Pitch % 16 == 0));
1060*35ffd701SAndroid Build Coastguard Worker 
1061*35ffd701SAndroid Build Coastguard Worker                 #ifdef SUB_ELEMENT_SUPPORT
1062*35ffd701SAndroid Build Coastguard Worker                     if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
1063*35ffd701SAndroid Build Coastguard Worker                         (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
1064*35ffd701SAndroid Build Coastguard Worker                     {
1065*35ffd701SAndroid Build Coastguard Worker                         if(LinearToSwizzled)
1066*35ffd701SAndroid Build Coastguard Worker                         {
1067*35ffd701SAndroid Build Coastguard Worker                             switch(pLinearSurface->Element.Size)
1068*35ffd701SAndroid Build Coastguard Worker                             {
1069*35ffd701SAndroid Build Coastguard Worker                                 case 16: XFER(MOVNTDQ_M, MOVDQU_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1070*35ffd701SAndroid Build Coastguard Worker                                 case  8: XFER(   MOVQ_M,   MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1071*35ffd701SAndroid Build Coastguard Worker                                 case  4: XFER(   MOVD_M,   MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1072*35ffd701SAndroid Build Coastguard Worker                                 case  3: XFER(   MOV3_M,   MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1073*35ffd701SAndroid Build Coastguard Worker                                 case  2: XFER(   MOVW_M,   MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1074*35ffd701SAndroid Build Coastguard Worker                                 case  1: XFER(   MOVB_M,   MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1075*35ffd701SAndroid Build Coastguard Worker                                 default: assert(0);
1076*35ffd701SAndroid Build Coastguard Worker                             }
1077*35ffd701SAndroid Build Coastguard Worker                         }
1078*35ffd701SAndroid Build Coastguard Worker                         else
1079*35ffd701SAndroid Build Coastguard Worker                         {
1080*35ffd701SAndroid Build Coastguard Worker                             switch(pLinearSurface->Element.Size)
1081*35ffd701SAndroid Build Coastguard Worker                             {
1082*35ffd701SAndroid Build Coastguard Worker                                 case 16:
1083*35ffd701SAndroid Build Coastguard Worker                                 {
1084*35ffd701SAndroid Build Coastguard Worker                                     if(StreamingLoadSupported)
1085*35ffd701SAndroid Build Coastguard Worker                                     {
1086*35ffd701SAndroid Build Coastguard Worker                                         XFER(MOVDQU_M, MOVNTDQA_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
1087*35ffd701SAndroid Build Coastguard Worker                                     }
1088*35ffd701SAndroid Build Coastguard Worker                                     else
1089*35ffd701SAndroid Build Coastguard Worker                                     {
1090*35ffd701SAndroid Build Coastguard Worker                                         XFER(MOVDQU_M,    MOVDQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
1091*35ffd701SAndroid Build Coastguard Worker                                     }
1092*35ffd701SAndroid Build Coastguard Worker                                     break;
1093*35ffd701SAndroid Build Coastguard Worker                                 }
1094*35ffd701SAndroid Build Coastguard Worker                                 case  8: XFER(   MOVQ_M,   MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1095*35ffd701SAndroid Build Coastguard Worker                                 case  4: XFER(   MOVD_M,   MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1096*35ffd701SAndroid Build Coastguard Worker                                 case  3: XFER(   MOV3_M,   MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1097*35ffd701SAndroid Build Coastguard Worker                                 case  2: XFER(   MOVW_M,   MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1098*35ffd701SAndroid Build Coastguard Worker                                 case  1: XFER(   MOVB_M,   MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1099*35ffd701SAndroid Build Coastguard Worker                                 default: assert(0);
1100*35ffd701SAndroid Build Coastguard Worker                             }
1101*35ffd701SAndroid Build Coastguard Worker                         }
1102*35ffd701SAndroid Build Coastguard Worker                     } else
1103*35ffd701SAndroid Build Coastguard Worker                 #endif // SUB_ELEMENT_SUPPORT
1104*35ffd701SAndroid Build Coastguard Worker                 if(LinearToSwizzled)
1105*35ffd701SAndroid Build Coastguard Worker                 {
1106*35ffd701SAndroid Build Coastguard Worker                     switch(SwizzleMaxXfer.Width)
1107*35ffd701SAndroid Build Coastguard Worker                     {
1108*35ffd701SAndroid Build Coastguard Worker                         case 16: XFER(MOVNTDQ_M, MOVDQU_R, 16, 16, pSwizzledAddress, 16, pLinearAddress, pLinearSurface->Pitch, 1); break;
1109*35ffd701SAndroid Build Coastguard Worker                         #ifdef INTEL_TILE_W_SUPPORT
1110*35ffd701SAndroid Build Coastguard Worker                             case  2: XFER(MOVW_M,  MOVW_R,  2,  2, pSwizzledAddress,  2, pLinearAddress, pLinearSurface->Pitch, 1); break;
1111*35ffd701SAndroid Build Coastguard Worker                         #endif
1112*35ffd701SAndroid Build Coastguard Worker                         default: assert(0); // Unexpected cases excluded to save compile time/size of multiplying instantiations.
1113*35ffd701SAndroid Build Coastguard Worker                     }
1114*35ffd701SAndroid Build Coastguard Worker                 }
1115*35ffd701SAndroid Build Coastguard Worker                 else
1116*35ffd701SAndroid Build Coastguard Worker                 {
1117*35ffd701SAndroid Build Coastguard Worker                     switch(SwizzleMaxXfer.Width)
1118*35ffd701SAndroid Build Coastguard Worker                     {
1119*35ffd701SAndroid Build Coastguard Worker                         case 16:
1120*35ffd701SAndroid Build Coastguard Worker                         {
1121*35ffd701SAndroid Build Coastguard Worker                             if(StreamingLoadSupported)
1122*35ffd701SAndroid Build Coastguard Worker                             {
1123*35ffd701SAndroid Build Coastguard Worker                                 XFER(MOVDQU_M, MOVNTDQA_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
1124*35ffd701SAndroid Build Coastguard Worker                             }
1125*35ffd701SAndroid Build Coastguard Worker                             else
1126*35ffd701SAndroid Build Coastguard Worker                             {
1127*35ffd701SAndroid Build Coastguard Worker                                 XFER(MOVDQU_M,    MOVDQ_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
1128*35ffd701SAndroid Build Coastguard Worker                             }
1129*35ffd701SAndroid Build Coastguard Worker                             break;
1130*35ffd701SAndroid Build Coastguard Worker                         }
1131*35ffd701SAndroid Build Coastguard Worker                         #ifdef INTEL_TILE_W_SUPPORT
1132*35ffd701SAndroid Build Coastguard Worker                             case 2: XFER(MOVW_M,   MOVW_R,  2,  2, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress,  2, 1); break;
1133*35ffd701SAndroid Build Coastguard Worker                         #endif
1134*35ffd701SAndroid Build Coastguard Worker                         default: assert(0);
1135*35ffd701SAndroid Build Coastguard Worker                     }
1136*35ffd701SAndroid Build Coastguard Worker                 }
1137*35ffd701SAndroid Build Coastguard Worker 
1138*35ffd701SAndroid Build Coastguard Worker 
1139*35ffd701SAndroid Build Coastguard Worker                 // Swizzled inc of SwizzledOffsetY...
1140*35ffd701SAndroid Build Coastguard Worker                 SwizzledOffsetY = (SwizzledOffsetY - MaskY[xferHeight]) & MaskY[xferHeight];
1141*35ffd701SAndroid Build Coastguard Worker                 if(!SwizzledOffsetY) SwizzledOffsetX0 += BytesPerRowOfTiles; // Wraps advance SwizzledOffsetX0, since that includes "bits beyond the tile".
1142*35ffd701SAndroid Build Coastguard Worker 
1143*35ffd701SAndroid Build Coastguard Worker                 y += xferHeight;
1144*35ffd701SAndroid Build Coastguard Worker 
1145*35ffd701SAndroid Build Coastguard Worker                 /* X-loop only advanced pLinearAddress by CopyWidthBytes--even
1146*35ffd701SAndroid Build Coastguard Worker                 when transferred multiple lines. Advance rest of way: */
1147*35ffd701SAndroid Build Coastguard Worker                 pLinearAddress += xferHeight * pLinearSurface->Pitch - CopyWidthBytes;
1148*35ffd701SAndroid Build Coastguard Worker 
1149*35ffd701SAndroid Build Coastguard Worker             } // foreach(y)
1150*35ffd701SAndroid Build Coastguard Worker 
1151*35ffd701SAndroid Build Coastguard Worker             _mm_sfence(); // Flush Non-Temporal Writes
1152*35ffd701SAndroid Build Coastguard Worker 
1153*35ffd701SAndroid Build Coastguard Worker             #if(_MSC_VER)
1154*35ffd701SAndroid Build Coastguard Worker                 #pragma warning(pop)
1155*35ffd701SAndroid Build Coastguard Worker             #endif
1156*35ffd701SAndroid Build Coastguard Worker         }
1157*35ffd701SAndroid Build Coastguard Worker         #endif
1158*35ffd701SAndroid Build Coastguard Worker     }
1159*35ffd701SAndroid Build Coastguard Worker } // CpuSwizzleBlt
1160*35ffd701SAndroid Build Coastguard Worker 
1161*35ffd701SAndroid Build Coastguard Worker #endif // #ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
1162*35ffd701SAndroid Build Coastguard Worker // clang-format on
1163