1*35ffd701SAndroid Build Coastguard Worker /*==============================================================================
2*35ffd701SAndroid Build Coastguard Worker Copyright(c) 2017 Intel Corporation
3*35ffd701SAndroid Build Coastguard Worker
4*35ffd701SAndroid Build Coastguard Worker Permission is hereby granted, free of charge, to any person obtaining a
5*35ffd701SAndroid Build Coastguard Worker copy of this software and associated documentation files(the "Software"),
6*35ffd701SAndroid Build Coastguard Worker to deal in the Software without restriction, including without limitation
7*35ffd701SAndroid Build Coastguard Worker the rights to use, copy, modify, merge, publish, distribute, sublicense,
8*35ffd701SAndroid Build Coastguard Worker and / or sell copies of the Software, and to permit persons to whom the
9*35ffd701SAndroid Build Coastguard Worker Software is furnished to do so, subject to the following conditions:
10*35ffd701SAndroid Build Coastguard Worker
11*35ffd701SAndroid Build Coastguard Worker The above copyright notice and this permission notice shall be included
12*35ffd701SAndroid Build Coastguard Worker in all copies or substantial portions of the Software.
13*35ffd701SAndroid Build Coastguard Worker
14*35ffd701SAndroid Build Coastguard Worker THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15*35ffd701SAndroid Build Coastguard Worker OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16*35ffd701SAndroid Build Coastguard Worker FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17*35ffd701SAndroid Build Coastguard Worker THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18*35ffd701SAndroid Build Coastguard Worker OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19*35ffd701SAndroid Build Coastguard Worker ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20*35ffd701SAndroid Build Coastguard Worker OTHER DEALINGS IN THE SOFTWARE.
21*35ffd701SAndroid Build Coastguard Worker ============================================================================*/
22*35ffd701SAndroid Build Coastguard Worker // clang-format off
23*35ffd701SAndroid Build Coastguard Worker // CpuSwizzleBlt.c - Surface swizzling definitions and BLT functionality.
24*35ffd701SAndroid Build Coastguard Worker
25*35ffd701SAndroid Build Coastguard Worker // [!] File serves as its own header:
26*35ffd701SAndroid Build Coastguard Worker // #define INCLUDE_CpuSwizzleBlt_c_AS_HEADER
27*35ffd701SAndroid Build Coastguard Worker // #include "CpuSwizzleBlt.c"
28*35ffd701SAndroid Build Coastguard Worker
// Build-time feature switches for this file...
// SUB_ELEMENT_SUPPORT: when defined, CPU_SWIZZLE_BLT_SURFACE gains its "Element" member (see the struct below).
#define SUB_ELEMENT_SUPPORT // Support for Partial Element Transfer (e.g. separating/merging depth-stencil).
// INTEL_TILE_W_SUPPORT: when defined, the INTEL_TILE_W swizzle descriptor is declared/defined.
#define INTEL_TILE_W_SUPPORT // Stencil Only;
31*35ffd701SAndroid Build Coastguard Worker
32*35ffd701SAndroid Build Coastguard Worker #ifndef CpuSwizzleBlt_INCLUDED
33*35ffd701SAndroid Build Coastguard Worker
34*35ffd701SAndroid Build Coastguard Worker #ifdef __cplusplus
35*35ffd701SAndroid Build Coastguard Worker extern "C" {
36*35ffd701SAndroid Build Coastguard Worker #endif
37*35ffd701SAndroid Build Coastguard Worker
38*35ffd701SAndroid Build Coastguard Worker // Background ##################################################################
39*35ffd701SAndroid Build Coastguard Worker
40*35ffd701SAndroid Build Coastguard Worker /* Pixel-based surfaces commonly stored in memory row-by-row. This convention
41*35ffd701SAndroid Build Coastguard Worker has simple "y * Pitch + x" addressing but has spatial locality only in
42*35ffd701SAndroid Build Coastguard Worker horizontal direction--i.e. horizontal pixel neighbors stored next to each other
43*35ffd701SAndroid Build Coastguard Worker but vertical neighbors stored entire pitch away.
44*35ffd701SAndroid Build Coastguard Worker
45*35ffd701SAndroid Build Coastguard Worker Since many graphics operations involve multi-dimensional data access, to
46*35ffd701SAndroid Build Coastguard Worker improve cache/memory access performance it is often more beneficial to use
47*35ffd701SAndroid Build Coastguard Worker alternative storage conventions which have multi-dimensional spatial locality--
48*35ffd701SAndroid Build Coastguard Worker i.e. where pixels tend to be stored near both their horizontal and vertical
49*35ffd701SAndroid Build Coastguard Worker neighbors.
50*35ffd701SAndroid Build Coastguard Worker
51*35ffd701SAndroid Build Coastguard Worker "Tiling/Swizzling" is storage convention that increases multi-dimensional
52*35ffd701SAndroid Build Coastguard Worker spatial locality by treating surface as series of smaller regions/"tiles",
53*35ffd701SAndroid Build Coastguard Worker laid out in row-major order across surface, with entire content of each tile
54*35ffd701SAndroid Build Coastguard Worker stored contiguously. Data within each tile is stored in pattern that further
55*35ffd701SAndroid Build Coastguard Worker maximizes the locality. */
56*35ffd701SAndroid Build Coastguard Worker
57*35ffd701SAndroid Build Coastguard Worker
58*35ffd701SAndroid Build Coastguard Worker // Swizzle Descriptors #########################################################
59*35ffd701SAndroid Build Coastguard Worker
60*35ffd701SAndroid Build Coastguard Worker /* Tile sizes always powers of 2 and chosen to be architecturally convenient--
61*35ffd701SAndroid Build Coastguard Worker e.g. 4KB to match physical page size. Tile dimensions also powers of 2, usually
62*35ffd701SAndroid Build Coastguard Worker chosen to produce square tiles for targeted pixel size--e.g. 4KB = 128 bytes x
63*35ffd701SAndroid Build Coastguard Worker 32 rows = 32 x 32 pixels @ 4 bytes-per-pixel.
64*35ffd701SAndroid Build Coastguard Worker
65*35ffd701SAndroid Build Coastguard Worker Since tile size and dimensions all powers of two, the spatial-to-linear mapping
66*35ffd701SAndroid Build Coastguard Worker required to store a tile can be trivial: spatial indexing bits can simply be
67*35ffd701SAndroid Build Coastguard Worker mapped to linear offset bits--e.g. for a 4KB, 128x32 tile...each byte within
68*35ffd701SAndroid Build Coastguard Worker tile can be referenced with a 7-bit X index and 5-bit Y index--and each of
69*35ffd701SAndroid Build Coastguard Worker those 12 index bits can be individually mapped to a bit in the 12-bit offset of
70*35ffd701SAndroid Build Coastguard Worker the tile's linear storage.
71*35ffd701SAndroid Build Coastguard Worker
72*35ffd701SAndroid Build Coastguard Worker The order in which spatial index bits are mapped to linear offset bits
73*35ffd701SAndroid Build Coastguard Worker determines the spatial locality properties of the surface data. E.g. the
74*35ffd701SAndroid Build Coastguard Worker following mapping...
75*35ffd701SAndroid Build Coastguard Worker
76*35ffd701SAndroid Build Coastguard Worker Linear[11:0] = Y4 Y3 Y2 Y1 Y0 X6 X5 X4 X3 X2 X1 X0
77*35ffd701SAndroid Build Coastguard Worker \-- Y[4:0] --/ \----- X[6:0] -----/
78*35ffd701SAndroid Build Coastguard Worker
79*35ffd701SAndroid Build Coastguard Worker ...stores bytes of tile in row-major order, with horizontal neighbors stored
80*35ffd701SAndroid Build Coastguard Worker contiguously and vertical neighbors stored 128 bytes away. If instead, Y index
81*35ffd701SAndroid Build Coastguard Worker bits were mapped to the low-order...
82*35ffd701SAndroid Build Coastguard Worker
83*35ffd701SAndroid Build Coastguard Worker Linear[11:0] = X6 X5 X4 X3 X2 X1 X0 Y4 Y3 Y2 Y1 Y0
84*35ffd701SAndroid Build Coastguard Worker \----- X[6:0] -----/ \-- Y[4:0] --/
85*35ffd701SAndroid Build Coastguard Worker
86*35ffd701SAndroid Build Coastguard Worker ...bytes of tile would be stored in column-major order, with vertical neighbors
87*35ffd701SAndroid Build Coastguard Worker stored contiguously and horizontal neighbors stored 32 bytes away.
88*35ffd701SAndroid Build Coastguard Worker
89*35ffd701SAndroid Build Coastguard Worker Individual X and Y bits can be separated and interspersed in mapping to
90*35ffd701SAndroid Build Coastguard Worker increase locality via sub-tiling--e.g...
91*35ffd701SAndroid Build Coastguard Worker
92*35ffd701SAndroid Build Coastguard Worker Linear[11:0] = Y4 Y3 Y2 X6 X5 X4 Y1 Y0 X3 X2 X1 X0
93*35ffd701SAndroid Build Coastguard Worker \-- Sub-Tile ---/
94*35ffd701SAndroid Build Coastguard Worker
...subdivides tile into 16x4 sub-tiles laid out in row-major order across tile,
96*35ffd701SAndroid Build Coastguard Worker with sub-tile content further stored in row-major order, with horizontal byte
97*35ffd701SAndroid Build Coastguard Worker neighbors within sub-tile stored contiguously and vertical neighbors only 16
98*35ffd701SAndroid Build Coastguard Worker bytes away. This means single 64-byte cache line contains 4x4 group of 32bpp
99*35ffd701SAndroid Build Coastguard Worker pixels--which is powerful spatial locality for graphics processing.
100*35ffd701SAndroid Build Coastguard Worker
101*35ffd701SAndroid Build Coastguard Worker If mappings restricted to being "parallel" for index bits (i.e. bits of given
102*35ffd701SAndroid Build Coastguard Worker index can change position but not relative order during mapping), then bit
103*35ffd701SAndroid Build Coastguard Worker indexes need not be explicitly denoted--e.g. the previous sub-tiling mapping
104*35ffd701SAndroid Build Coastguard Worker can be represented as...
105*35ffd701SAndroid Build Coastguard Worker
106*35ffd701SAndroid Build Coastguard Worker Linear[11:0] = Y Y Y X X X Y Y X X X X
107*35ffd701SAndroid Build Coastguard Worker
108*35ffd701SAndroid Build Coastguard Worker ...where X and Y index bits are implied to be zero-based-counted in order they
109*35ffd701SAndroid Build Coastguard Worker are encountered.
110*35ffd701SAndroid Build Coastguard Worker
111*35ffd701SAndroid Build Coastguard Worker In software, spatial-to-linear mapping conveniently described with bit mask for
112*35ffd701SAndroid Build Coastguard Worker each dimension, where a set bit indicates the next bit of that dimension's
113*35ffd701SAndroid Build Coastguard Worker index is mapped to that position in the linear offset--e.g....
114*35ffd701SAndroid Build Coastguard Worker
115*35ffd701SAndroid Build Coastguard Worker Linear[11:0] = Y Y Y X X X Y Y X X X X
116*35ffd701SAndroid Build Coastguard Worker MaskX = 0 0 0 1 1 1 0 0 1 1 1 1
117*35ffd701SAndroid Build Coastguard Worker MaskY = 1 1 1 0 0 0 1 1 0 0 0 0
118*35ffd701SAndroid Build Coastguard Worker
119*35ffd701SAndroid Build Coastguard Worker Such dimensional masks all that's needed to describe given tiling/swizzling
120*35ffd701SAndroid Build Coastguard Worker convention, since tile size and dimensions can be derived from the masks:
121*35ffd701SAndroid Build Coastguard Worker
122*35ffd701SAndroid Build Coastguard Worker TileWidth = 2 ^ NumberOfSetBits(MaskX)
123*35ffd701SAndroid Build Coastguard Worker TileHeight = 2 ^ NumberOfSetBits(MaskY)
124*35ffd701SAndroid Build Coastguard Worker TileSize = 2 ^ NumberOfSetBits(MaskX OR MaskY)
125*35ffd701SAndroid Build Coastguard Worker
126*35ffd701SAndroid Build Coastguard Worker Tiling/swizzling is not limited to 2D. With addition of another tile dimension,
127*35ffd701SAndroid Build Coastguard Worker spatial locality for 3D or MSAA sample neighbors can be controlled, also. */
128*35ffd701SAndroid Build Coastguard Worker
/* Describes one tiling/swizzling convention as a set of per-dimension bit
masks over the tile's linear offset: a set bit means "the next bit of this
dimension's index lands at this offset bit position". */
typedef struct _SWIZZLE_DESCRIPTOR
{
    struct _SWIZZLE_DESCRIPTOR_MASKS
    {
        int x; // Offset bit positions fed by the X index.
        int y; // Offset bit positions fed by the Y index.
        int z; // Offset bit positions fed by the third (Z) index.
    } Mask;
} SWIZZLE_DESCRIPTOR;
134*35ffd701SAndroid Build Coastguard Worker
// Swizzle family selectors. Values are consecutive, starting at zero.
typedef enum _EXTERNAL_SWIZZLE_NAME
{
    TILEX  = 0,
    TILEY  = 1,
    TILEW  = 2,
    TILEYS = 3,
    TILEYF = 4
} EXTERNAL_SWIZZLE_NAME;
143*35ffd701SAndroid Build Coastguard Worker
// Resource type selectors (2D, 3D, or an MSAA sample count). Values are
// consecutive, starting at zero.
typedef enum _EXTERNAL_RES_TYPE
{
    Res_2D   = 0,
    Res_3D   = 1,
    MSAA_2X  = 2,
    MSAA_4X  = 3,
    MSAA_8X  = 4,
    MSAA_16X = 5
} EXTERNAL_RES_TYPE;
152*35ffd701SAndroid Build Coastguard Worker
// Definition Helper Macros...
/* A swizzle is written as SWIZZLE(( Name t15 t14 ... t0 )); where each token
names the index that feeds that linear-offset bit (bit 15 first, bit 0 last).
Every token expands to ",<tag>", so the parenthesized list becomes the 17
comma-separated arguments of __SWIZZLE. The tags are compared against 'x', 'y'
and 'z' to build the three masks; a tag of 0 matches none of them.

These one-letter macros are only meant to live for the duration of the swizzle
table and are #undef'ed afterwards. */
#define X ,'x'
#define Y ,'y'
#define Z ,'z'
#define S ,'z' // S = MSAA Sample Index (recorded in the z mask)
#define o ,0 // o = N/A Swizzle Bit

/* Neither form of __SWIZZLE ends in ';'--the invocation supplies it. (A ';'
inside the macro plus the one at the invocation would leave an empty
declaration at file scope, which ISO C does not allow.) */
#ifdef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
// Header mode: only declare the descriptor.
#define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
    extern const SWIZZLE_DESCRIPTOR Name
#else // C Compile...
// Definition mode: emit the descriptor with its x, y and z masks (in that order).
#define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
    const SWIZZLE_DESCRIPTOR Name = \
    { { \
        ((b15) == 'x' ? 0x8000 : 0) | ((b14) == 'x' ? 0x4000 : 0) | ((b13) == 'x' ? 0x2000 : 0) | ((b12) == 'x' ? 0x1000 : 0) | \
        ((b11) == 'x' ? 0x0800 : 0) | ((b10) == 'x' ? 0x0400 : 0) | ((b9)  == 'x' ? 0x0200 : 0) | ((b8)  == 'x' ? 0x0100 : 0) | \
        ((b7)  == 'x' ? 0x0080 : 0) | ((b6)  == 'x' ? 0x0040 : 0) | ((b5)  == 'x' ? 0x0020 : 0) | ((b4)  == 'x' ? 0x0010 : 0) | \
        ((b3)  == 'x' ? 0x0008 : 0) | ((b2)  == 'x' ? 0x0004 : 0) | ((b1)  == 'x' ? 0x0002 : 0) | ((b0)  == 'x' ? 0x0001 : 0), \
        \
        ((b15) == 'y' ? 0x8000 : 0) | ((b14) == 'y' ? 0x4000 : 0) | ((b13) == 'y' ? 0x2000 : 0) | ((b12) == 'y' ? 0x1000 : 0) | \
        ((b11) == 'y' ? 0x0800 : 0) | ((b10) == 'y' ? 0x0400 : 0) | ((b9)  == 'y' ? 0x0200 : 0) | ((b8)  == 'y' ? 0x0100 : 0) | \
        ((b7)  == 'y' ? 0x0080 : 0) | ((b6)  == 'y' ? 0x0040 : 0) | ((b5)  == 'y' ? 0x0020 : 0) | ((b4)  == 'y' ? 0x0010 : 0) | \
        ((b3)  == 'y' ? 0x0008 : 0) | ((b2)  == 'y' ? 0x0004 : 0) | ((b1)  == 'y' ? 0x0002 : 0) | ((b0)  == 'y' ? 0x0001 : 0), \
        \
        ((b15) == 'z' ? 0x8000 : 0) | ((b14) == 'z' ? 0x4000 : 0) | ((b13) == 'z' ? 0x2000 : 0) | ((b12) == 'z' ? 0x1000 : 0) | \
        ((b11) == 'z' ? 0x0800 : 0) | ((b10) == 'z' ? 0x0400 : 0) | ((b9)  == 'z' ? 0x0200 : 0) | ((b8)  == 'z' ? 0x0100 : 0) | \
        ((b7)  == 'z' ? 0x0080 : 0) | ((b6)  == 'z' ? 0x0040 : 0) | ((b5)  == 'z' ? 0x0020 : 0) | ((b4)  == 'z' ? 0x0010 : 0) | \
        ((b3)  == 'z' ? 0x0008 : 0) | ((b2)  == 'z' ? 0x0004 : 0) | ((b1)  == 'z' ? 0x0002 : 0) | ((b0)  == 'z' ? 0x0001 : 0) \
    } }
#endif
// The double parentheses at the call site hand the whole token list over as a
// single argument, which is then re-scanned as the argument list of __SWIZZLE.
#define SWIZZLE(__SWIZZLE_Args) __SWIZZLE __SWIZZLE_Args
// Legacy Intel Tiling Swizzles...
// Tokens run from linear-offset bit 15 (leftmost) down to bit 0 (rightmost);
// 'o' marks a bit the swizzle does not use. All three legacy swizzles map 12
// bits. By the mask formulas above: TILE_X has 9 X bits and 3 Y bits (512 bytes
// x 8 rows), TILE_Y has 7 X and 5 Y (128 x 32), TILE_W has 6 X and 6 Y (64 x 64).
SWIZZLE(( INTEL_TILE_X o o o o Y Y Y X X X X X X X X X ));
SWIZZLE(( INTEL_TILE_Y o o o o X X X Y Y Y Y Y X X X X ));

// TILE_W is only present when INTEL_TILE_W_SUPPORT is defined.
#ifdef INTEL_TILE_W_SUPPORT
SWIZZLE(( INTEL_TILE_W o o o o X X X Y Y Y Y X Y X Y X ));
#endif
// Gen9 Swizzles...
// 2D variants. TILE_YF entries map 12 offset bits (upper four are 'o');
// TILE_YS entries map all 16. Tokens run from offset bit 15 down to bit 0.
// NOTE(review): the numeric name suffix (128/64/32/16/8) is presumably the
// element size in bits--confirm against the code that selects these descriptors.
SWIZZLE(( INTEL_TILE_YF_128 o o o o X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_64 o o o o X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_32 o o o o X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_16 o o o o X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_8 o o o o X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_128 X Y X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_64 X Y X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_32 X Y X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_16 X Y X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_8 X Y X Y X Y X Y Y Y Y Y X X X X ));
190*35ffd701SAndroid Build Coastguard Worker
// Gen9 MSAA variants. 'S' (MSAA sample index) is recorded in the z mask and
// occupies the topmost mapped offset bits: one S bit for MSAA2, two for MSAA4,
// three for MSAA8, four for MSAA16. YF entries map 12 bits, YS entries map 16.
SWIZZLE(( INTEL_TILE_YF_MSAA2_128 o o o o S Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA2_64 o o o o S Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA2_32 o o o o S Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA2_16 o o o o S Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA2_8 o o o o S Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_MSAA2_128 S Y X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA2_64 S Y X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA2_32 S Y X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA2_16 S Y X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA2_8 S Y X Y X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YF_MSAA4_128 o o o o S S X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA4_64 o o o o S S X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA4_32 o o o o S S X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA4_16 o o o o S S X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA4_8 o o o o S S X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_MSAA4_128 S S X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA4_64 S S X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA4_32 S S X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA4_16 S S X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA4_8 S S X Y X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YF_MSAA8_128 o o o o S S S Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA8_64 o o o o S S S Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA8_32 o o o o S S S Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA8_16 o o o o S S S Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA8_8 o o o o S S S Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_MSAA8_128 S S S Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA8_64 S S S Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA8_32 S S S Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA8_16 S S S Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA8_8 S S S Y X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YF_MSAA16_128 o o o o S S S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA16_64 o o o o S S S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA16_32 o o o o S S S S X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA16_16 o o o o S S S S X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA16_8 o o o o S S S S Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_MSAA16_128 S S S S X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA16_64 S S S S X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA16_32 S S S S X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA16_16 S S S S X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA16_8 S S S S X Y X Y Y Y Y Y X X X X ));
238*35ffd701SAndroid Build Coastguard Worker
// Gen9 3D variants. 'Z' bits are interleaved with X and Y bits in the offset.
// YF entries map 12 bits (upper four are 'o'); YS entries map all 16.
SWIZZLE(( INTEL_TILE_YF_3D_128 o o o o Y Z X X Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_3D_64 o o o o Y Z X X Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_3D_32 o o o o Y Z X Y Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_3D_16 o o o o Y Z Y Z Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_3D_8 o o o o Y Z Y Z Z Z Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_3D_128 X Y Z X Y Z X X Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_3D_64 X Y Z X Y Z X X Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_3D_32 X Y Z X Y Z X Y Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_3D_16 X Y Z X Y Z Y Z Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_3D_8 X Y Z X Y Z Y Z Z Z Y Y X X X X ));
250*35ffd701SAndroid Build Coastguard Worker
// XE_HP_SDV Swizzles...
// INTEL_TILE_4 maps 12 offset bits; every INTEL_TILE_64 entry maps all 16.
// Tokens run from offset bit 15 (leftmost) down to bit 0 (rightmost).
SWIZZLE(( INTEL_TILE_4 o o o o Y Y X Y X X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_128 Y X X X Y Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_64 Y X X X Y Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_32 Y Y X X Y Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_16 Y Y X X Y Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_8 Y Y Y X Y Y X Y X X Y Y X X X X ));

// MSAA2: a single S (sample) bit, at offset bit 7.
SWIZZLE(( INTEL_TILE_64_MSAA2_128 Y X X X Y Y X Y S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA2_64 Y X X X Y Y X Y S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA2_32 Y Y X X Y Y X Y S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA2_16 Y Y X X Y Y X Y S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA2_8 Y Y Y X Y Y X Y S X Y Y X X X X ));

// MSAA (no sample count in the name): two S bits, at offset bits 8 and 7.
SWIZZLE(( INTEL_TILE_64_MSAA_128 Y X X X Y Y X S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA_64 Y X X X Y Y X S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA_32 Y Y X X Y Y X S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA_16 Y Y X X Y Y X S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA_8 Y Y Y X Y Y X S S X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_3D_128 Z Z Y X X X Z Y Z X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_3D_64 Z Z Y X X X Z Y Z X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_3D_32 Z Z Y X Y X Z Y Z X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_3D_16 Z Z Z Y Y X Z Y Z X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_3D_8 Z Z Z X Y Y Z Y Z X Y Y X X X X ));
277*35ffd701SAndroid Build Coastguard Worker
//Tile64 updated layout for Render Compression 256B and Physical L3
// All V2 entries map all 16 offset bits and keep the low eight bits as
// "X X Y Y X X X X" (MSAA2/MSAA4/3D) or place S bits directly above
// "Y Y X X X X" (MSAA8/MSAA16). S bit count: 1 (MSAA2), 2 (MSAA4), 3 (MSAA8),
// 4 (MSAA16).

SWIZZLE(( INTEL_TILE_64_V2_MSAA2_128 Y X X X Y Y X S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA2_64 Y Y X X Y Y X S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA2_32 Y Y Y X Y Y X S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA2_16 Y Y Y X Y Y X S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA2_8 Y Y Y Y Y Y X S X X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_V2_MSAA4_128 Y X X X Y Y S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA4_64 Y X X X Y Y S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA4_32 Y Y X X Y Y S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA4_16 Y Y X X Y Y S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA4_8 Y Y Y X Y Y S S X X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_V2_MSAA8_128 Y Y X X Y X S S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA8_64 Y Y X X Y X S S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA8_32 Y Y X X Y X S S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA8_16 Y Y Y X Y X S S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA8_8 Y Y Y X Y X S S S X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_V2_MSAA16_128 Y X X X Y X S S S S Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA16_64 Y Y X X Y X S S S S Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA16_32 Y Y X X Y X S S S S Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA16_16 Y Y X X Y X S S S S Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA16_8 Y Y Y X Y X S S S S Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_V2_3D_128 Z Z Y X X Y Z Z X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_3D_64 Z Z Y X X Y Z Z X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_3D_32 Z Z Y X Y Y Z Z X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_3D_16 Z Z Z Y Y Y Z Z X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_3D_8 Z Z Z Y Y Y Z Z X X Y Y X X X X ));
309*35ffd701SAndroid Build Coastguard Worker
310*35ffd701SAndroid Build Coastguard Worker
// Swizzle table complete: retire the definition helpers so the one-letter
// token names do not leak into the rest of the translation unit.
#undef SWIZZLE
#undef __SWIZZLE
#undef o
#undef S
#undef Z
#undef Y
#undef X
318*35ffd701SAndroid Build Coastguard Worker
319*35ffd701SAndroid Build Coastguard Worker // Accessing Swizzled Surface ##################################################
320*35ffd701SAndroid Build Coastguard Worker
321*35ffd701SAndroid Build Coastguard Worker /* While graphics hardware prefers to access surfaces stored in tiled/swizzled
322*35ffd701SAndroid Build Coastguard Worker formats, logically accessing such surfaces with CPU-based software is non-
323*35ffd701SAndroid Build Coastguard Worker trivial when high throughput is goal.
324*35ffd701SAndroid Build Coastguard Worker
325*35ffd701SAndroid Build Coastguard Worker This file implements (1) SwizzleOffset function to compute swizzled offset of
326*35ffd701SAndroid Build Coastguard Worker dimensionally-specified surface byte, and (2) CpuSwizzleBlt function to BLT
327*35ffd701SAndroid Build Coastguard Worker between linear ("y * pitch + x") and swizzled surfaces--with goal of providing
328*35ffd701SAndroid Build Coastguard Worker high-performance, swizzling BLT implementation to be used both in production
329*35ffd701SAndroid Build Coastguard Worker and as a guide for those seeking to understand swizzled access or implement
330*35ffd701SAndroid Build Coastguard Worker functionality beyond the simple BLT. */
331*35ffd701SAndroid Build Coastguard Worker
332*35ffd701SAndroid Build Coastguard Worker // Surface Descriptor for CpuSwizzleBlt function...
333*35ffd701SAndroid Build Coastguard Worker typedef struct _CPU_SWIZZLE_BLT_SURFACE
334*35ffd701SAndroid Build Coastguard Worker {
335*35ffd701SAndroid Build Coastguard Worker void *pBase; // Pointer to surface base.
336*35ffd701SAndroid Build Coastguard Worker int Pitch, Height; // Row-pitch in bytes, and height, of surface.
337*35ffd701SAndroid Build Coastguard Worker const SWIZZLE_DESCRIPTOR *pSwizzle; // Pointer to surface's swizzle descriptor, or NULL if unswizzled.
338*35ffd701SAndroid Build Coastguard Worker int OffsetX; // Horizontal offset into surface for BLT rectangle, in bytes.
339*35ffd701SAndroid Build Coastguard Worker int OffsetY; // Vertical offset into surface for BLT rectangle, in physical/pitch rows.
340*35ffd701SAndroid Build Coastguard Worker int OffsetZ; // Zero if N/A, or 3D offset into surface for BLT rectangle, in 3D slices or MSAA samples as appropriate.
341*35ffd701SAndroid Build Coastguard Worker
342*35ffd701SAndroid Build Coastguard Worker #ifdef SUB_ELEMENT_SUPPORT
343*35ffd701SAndroid Build Coastguard Worker struct _CPU_SWIZZLE_BLT_SURFACE_ELEMENT
344*35ffd701SAndroid Build Coastguard Worker {
345*35ffd701SAndroid Build Coastguard Worker int Pitch, Size; // Zero if full-pixel BLT, or pitch and size, in bytes, of pixel element being BLT'ed.
346*35ffd701SAndroid Build Coastguard Worker } Element;
347*35ffd701SAndroid Build Coastguard Worker
348*35ffd701SAndroid Build Coastguard Worker /* e.g. to BLT only stencil data from S8D24 surface to S8 surface...
349*35ffd701SAndroid Build Coastguard Worker Dest.Element.Size = Src.Element.Size = sizeof(S8) = 1;
350*35ffd701SAndroid Build Coastguard Worker Dest.Element.Pitch = sizeof(S8) = 1;
351*35ffd701SAndroid Build Coastguard Worker Src.Element.Pitch = sizeof(S8D24) = 4;
352*35ffd701SAndroid Build Coastguard Worker Src.OffsetX += BYTE_OFFSET_OF_S8_WITHIN_S8D24; */
353*35ffd701SAndroid Build Coastguard Worker #endif
354*35ffd701SAndroid Build Coastguard Worker } CPU_SWIZZLE_BLT_SURFACE;
355*35ffd701SAndroid Build Coastguard Worker
356*35ffd701SAndroid Build Coastguard Worker extern int SwizzleOffset(const SWIZZLE_DESCRIPTOR *pSwizzle, int Pitch, int OffsetX, int OffsetY, int OffsetZ);
357*35ffd701SAndroid Build Coastguard Worker extern void CpuSwizzleBlt(CPU_SWIZZLE_BLT_SURFACE *pDest, CPU_SWIZZLE_BLT_SURFACE *pSrc, int CopyWidthBytes, int CopyHeight);
358*35ffd701SAndroid Build Coastguard Worker
359*35ffd701SAndroid Build Coastguard Worker #ifdef __cplusplus
360*35ffd701SAndroid Build Coastguard Worker }
361*35ffd701SAndroid Build Coastguard Worker #endif
362*35ffd701SAndroid Build Coastguard Worker
363*35ffd701SAndroid Build Coastguard Worker #define CpuSwizzleBlt_INCLUDED
364*35ffd701SAndroid Build Coastguard Worker
365*35ffd701SAndroid Build Coastguard Worker #endif
366*35ffd701SAndroid Build Coastguard Worker
367*35ffd701SAndroid Build Coastguard Worker
368*35ffd701SAndroid Build Coastguard Worker #ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
369*35ffd701SAndroid Build Coastguard Worker
370*35ffd701SAndroid Build Coastguard Worker //#define MINIMALIST // Use minimalist, unoptimized implementation.
371*35ffd701SAndroid Build Coastguard Worker
372*35ffd701SAndroid Build Coastguard Worker #include "assert.h" // Quoted to allow local-directory override.
373*35ffd701SAndroid Build Coastguard Worker
374*35ffd701SAndroid Build Coastguard Worker #if(_MSC_VER >= 1400)
375*35ffd701SAndroid Build Coastguard Worker #include <intrin.h>
376*35ffd701SAndroid Build Coastguard Worker #elif defined(__ARM_ARCH)
377*35ffd701SAndroid Build Coastguard Worker #include <sse2neon.h>
378*35ffd701SAndroid Build Coastguard Worker #elif((defined __clang__) ||(__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
379*35ffd701SAndroid Build Coastguard Worker #include <cpuid.h>
380*35ffd701SAndroid Build Coastguard Worker #include <x86intrin.h>
381*35ffd701SAndroid Build Coastguard Worker #else
382*35ffd701SAndroid Build Coastguard Worker #error "Unexpected compiler!"
383*35ffd701SAndroid Build Coastguard Worker #endif
384*35ffd701SAndroid Build Coastguard Worker
385*35ffd701SAndroid Build Coastguard Worker
// POPCNT: Count Lit Bits...
// PopCnt4[n] is the number of set bits in the 4-bit value n. The table is
// read-only lookup data, hence const.
static const unsigned char PopCnt4[16] =
{ // 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
};

// Number of set bits in the low nibble of x.
#define POPCNT4(x) (PopCnt4[(x) & 0xf])

// Number of set bits in the low 16 bits of x (any higher bits are ignored).
#define POPCNT16(x) (POPCNT4((x) >> 12) + POPCNT4((x) >> 8) + POPCNT4((x) >> 4) + POPCNT4(x))
391*35ffd701SAndroid Build Coastguard Worker
SwizzleOffset(const SWIZZLE_DESCRIPTOR * pSwizzle,int Pitch,int OffsetX,int OffsetY,int OffsetZ)392*35ffd701SAndroid Build Coastguard Worker int SwizzleOffset( // ##########################################################
393*35ffd701SAndroid Build Coastguard Worker
394*35ffd701SAndroid Build Coastguard Worker /* Return swizzled offset of dimensionally-specified surface byte. */
395*35ffd701SAndroid Build Coastguard Worker
396*35ffd701SAndroid Build Coastguard Worker const SWIZZLE_DESCRIPTOR *pSwizzle, // Pointer to applicable swizzle descriptor.
397*35ffd701SAndroid Build Coastguard Worker int Pitch, // Pointer to applicable surface row-pitch.
398*35ffd701SAndroid Build Coastguard Worker int OffsetX, // Horizontal offset into surface of the target byte, in bytes.
399*35ffd701SAndroid Build Coastguard Worker int OffsetY, // Vertical offset into surface of the target byte, in physical/pitch rows.
400*35ffd701SAndroid Build Coastguard Worker int OffsetZ) // Zero if N/A, or 3D offset into surface of the target byte, in 3D slices or MSAA samples as appropriate.
401*35ffd701SAndroid Build Coastguard Worker
402*35ffd701SAndroid Build Coastguard Worker /* Given logically-specified (x, y, z) byte within swizzled surface,
403*35ffd701SAndroid Build Coastguard Worker function returns byte's linear/memory offset from surface's base--i.e. it
404*35ffd701SAndroid Build Coastguard Worker performs the swizzled, spatial-to-linear mapping.
405*35ffd701SAndroid Build Coastguard Worker
406*35ffd701SAndroid Build Coastguard Worker Function makes no real effort to perform optimally, since should only used
407*35ffd701SAndroid Build Coastguard Worker outside loops in CpuSwizzleBlt and similar functions. If any of this
408*35ffd701SAndroid Build Coastguard Worker functionality was needed in performance path, a custom implementation
409*35ffd701SAndroid Build Coastguard Worker should be used that limits itself to functionality specifically needed
410*35ffd701SAndroid Build Coastguard Worker (probably single-dimension, intra-tile offsets) and uses a fast computation
411*35ffd701SAndroid Build Coastguard Worker (e.g. LUT's, hard-codings, PDEP). */
412*35ffd701SAndroid Build Coastguard Worker
413*35ffd701SAndroid Build Coastguard Worker { // ###########################################################################
414*35ffd701SAndroid Build Coastguard Worker
415*35ffd701SAndroid Build Coastguard Worker char PDepSupported = -1; // AVX2/BMI2 PDEP (Parallel Deposit) Instruction
416*35ffd701SAndroid Build Coastguard Worker
417*35ffd701SAndroid Build Coastguard Worker int SwizzledOffset; // Return value being computed.
418*35ffd701SAndroid Build Coastguard Worker
419*35ffd701SAndroid Build Coastguard Worker int TileWidthBits = POPCNT16(pSwizzle->Mask.x); // Log2(Tile Width in Bytes)
420*35ffd701SAndroid Build Coastguard Worker int TileHeightBits = POPCNT16(pSwizzle->Mask.y); // Log2(Tile Height)
421*35ffd701SAndroid Build Coastguard Worker int TileDepthBits = POPCNT16(pSwizzle->Mask.z); // Log2(Tile Depth or MSAA Samples)
422*35ffd701SAndroid Build Coastguard Worker int TileSizeBits = TileWidthBits + TileHeightBits + TileDepthBits; // Log2(Tile Size in Bytes)
423*35ffd701SAndroid Build Coastguard Worker int TilesPerRow = Pitch >> TileWidthBits; // Surface Width in Tiles
424*35ffd701SAndroid Build Coastguard Worker
425*35ffd701SAndroid Build Coastguard Worker int Row, Col; // Tile grid position on surface, of tile containing specified byte.
426*35ffd701SAndroid Build Coastguard Worker int x, y, z; // Position of specified byte within tile that contains it.
427*35ffd701SAndroid Build Coastguard Worker
428*35ffd701SAndroid Build Coastguard Worker if(PDepSupported == -1)
429*35ffd701SAndroid Build Coastguard Worker {
430*35ffd701SAndroid Build Coastguard Worker #if(_MSC_VER >= 1700)
431*35ffd701SAndroid Build Coastguard Worker #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
432*35ffd701SAndroid Build Coastguard Worker int CpuInfo[4];
433*35ffd701SAndroid Build Coastguard Worker __cpuidex(CpuInfo, 7, 0);
434*35ffd701SAndroid Build Coastguard Worker PDepSupported = ((CpuInfo[1] & (1 << 8)) != 0); // EBX[8] = BMI2
435*35ffd701SAndroid Build Coastguard Worker #elif ( defined (__BMI2__ ))
436*35ffd701SAndroid Build Coastguard Worker #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
437*35ffd701SAndroid Build Coastguard Worker unsigned int eax, ebx, ecx, edx;
438*35ffd701SAndroid Build Coastguard Worker __cpuid_count(7, 0, eax, ebx, ecx, edx);
439*35ffd701SAndroid Build Coastguard Worker PDepSupported = ((ebx & (1 << 8)) != 0); // EBX[8] = BMI2
440*35ffd701SAndroid Build Coastguard Worker #else
441*35ffd701SAndroid Build Coastguard Worker #define PDEP(Src, Mask) 0
442*35ffd701SAndroid Build Coastguard Worker PDepSupported = 0;
443*35ffd701SAndroid Build Coastguard Worker #endif
444*35ffd701SAndroid Build Coastguard Worker }
445*35ffd701SAndroid Build Coastguard Worker
446*35ffd701SAndroid Build Coastguard Worker assert( // Mutually Exclusive Swizzle Positions...
447*35ffd701SAndroid Build Coastguard Worker (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) ==
448*35ffd701SAndroid Build Coastguard Worker (pSwizzle->Mask.x + pSwizzle->Mask.y + pSwizzle->Mask.z));
449*35ffd701SAndroid Build Coastguard Worker
450*35ffd701SAndroid Build Coastguard Worker assert( // Swizzle Limited to 16-bit (else expand POPCNT'ing)...
451*35ffd701SAndroid Build Coastguard Worker (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) < (1 << 16));
452*35ffd701SAndroid Build Coastguard Worker
453*35ffd701SAndroid Build Coastguard Worker assert( // Pitch is Multiple of Tile Width...
454*35ffd701SAndroid Build Coastguard Worker Pitch == ((Pitch >> TileWidthBits) << TileWidthBits));
455*35ffd701SAndroid Build Coastguard Worker
456*35ffd701SAndroid Build Coastguard Worker { // Break Positioning into Tile-Granular and Intra-Tile Components...
457*35ffd701SAndroid Build Coastguard Worker assert((OffsetZ >> TileDepthBits) == 0); // When dealing with 3D tiling, treat as separate single-tile-deep planes.
458*35ffd701SAndroid Build Coastguard Worker z = OffsetZ & ((1 << TileDepthBits) - 1);
459*35ffd701SAndroid Build Coastguard Worker
460*35ffd701SAndroid Build Coastguard Worker Row = OffsetY >> TileHeightBits;
461*35ffd701SAndroid Build Coastguard Worker y = OffsetY & ((1 << TileHeightBits) - 1);
462*35ffd701SAndroid Build Coastguard Worker
463*35ffd701SAndroid Build Coastguard Worker Col = OffsetX >> TileWidthBits;
464*35ffd701SAndroid Build Coastguard Worker x = OffsetX & ((1 << TileWidthBits) - 1);
465*35ffd701SAndroid Build Coastguard Worker }
466*35ffd701SAndroid Build Coastguard Worker
467*35ffd701SAndroid Build Coastguard Worker SwizzledOffset = // Start with surface offset of given tile...
468*35ffd701SAndroid Build Coastguard Worker (Row * TilesPerRow + Col) << TileSizeBits; // <-- Tiles laid across surface in row-major order.
469*35ffd701SAndroid Build Coastguard Worker
470*35ffd701SAndroid Build Coastguard Worker // ...then OR swizzled offset of byte within tile...
471*35ffd701SAndroid Build Coastguard Worker if(PDepSupported)
472*35ffd701SAndroid Build Coastguard Worker {
473*35ffd701SAndroid Build Coastguard Worker SwizzledOffset +=
474*35ffd701SAndroid Build Coastguard Worker PDEP(x, pSwizzle->Mask.x) +
475*35ffd701SAndroid Build Coastguard Worker PDEP(y, pSwizzle->Mask.y) +
476*35ffd701SAndroid Build Coastguard Worker PDEP(z, pSwizzle->Mask.z);
477*35ffd701SAndroid Build Coastguard Worker }
478*35ffd701SAndroid Build Coastguard Worker else // PDEP workalike...
479*35ffd701SAndroid Build Coastguard Worker {
480*35ffd701SAndroid Build Coastguard Worker int bitIndex = 0, bitMask = 1;
481*35ffd701SAndroid Build Coastguard Worker int terminationMask = pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z;
482*35ffd701SAndroid Build Coastguard Worker while(bitMask < terminationMask)
483*35ffd701SAndroid Build Coastguard Worker {
484*35ffd701SAndroid Build Coastguard Worker int MaskQ;
485*35ffd701SAndroid Build Coastguard Worker #define PROCESS(Q) { \
486*35ffd701SAndroid Build Coastguard Worker MaskQ = bitMask & pSwizzle->Mask.Q; \
487*35ffd701SAndroid Build Coastguard Worker SwizzledOffset += Q & MaskQ; \
488*35ffd701SAndroid Build Coastguard Worker Q <<= 1 ^ (MaskQ >> bitIndex); \
489*35ffd701SAndroid Build Coastguard Worker }
490*35ffd701SAndroid Build Coastguard Worker PROCESS(x);
491*35ffd701SAndroid Build Coastguard Worker PROCESS(y);
492*35ffd701SAndroid Build Coastguard Worker PROCESS(z);
493*35ffd701SAndroid Build Coastguard Worker
494*35ffd701SAndroid Build Coastguard Worker bitIndex++;
495*35ffd701SAndroid Build Coastguard Worker bitMask <<= 1;
496*35ffd701SAndroid Build Coastguard Worker
497*35ffd701SAndroid Build Coastguard Worker #undef PROCESS
498*35ffd701SAndroid Build Coastguard Worker }
499*35ffd701SAndroid Build Coastguard Worker }
500*35ffd701SAndroid Build Coastguard Worker
501*35ffd701SAndroid Build Coastguard Worker return(SwizzledOffset);
502*35ffd701SAndroid Build Coastguard Worker }
503*35ffd701SAndroid Build Coastguard Worker
504*35ffd701SAndroid Build Coastguard Worker
CpuSwizzleBlt(CPU_SWIZZLE_BLT_SURFACE * pDest,CPU_SWIZZLE_BLT_SURFACE * pSrc,int CopyWidthBytes,int CopyHeight)505*35ffd701SAndroid Build Coastguard Worker void CpuSwizzleBlt( // #########################################################
506*35ffd701SAndroid Build Coastguard Worker
507*35ffd701SAndroid Build Coastguard Worker /* Performs specified swizzling BLT between two given surfaces. */
508*35ffd701SAndroid Build Coastguard Worker
509*35ffd701SAndroid Build Coastguard Worker CPU_SWIZZLE_BLT_SURFACE *pDest, // Pointer to destination surface descriptor.
510*35ffd701SAndroid Build Coastguard Worker CPU_SWIZZLE_BLT_SURFACE *pSrc, // Pointer to source surface descriptor.
511*35ffd701SAndroid Build Coastguard Worker int CopyWidthBytes, // Width of BLT rectangle, in bytes.
512*35ffd701SAndroid Build Coastguard Worker int CopyHeight) // Height of BLT rectangle, in physical/pitch rows.
513*35ffd701SAndroid Build Coastguard Worker
514*35ffd701SAndroid Build Coastguard Worker #ifdef SUB_ELEMENT_SUPPORT
515*35ffd701SAndroid Build Coastguard Worker
516*35ffd701SAndroid Build Coastguard Worker /* When copying between surfaces with different pixel pitches, specify
517*35ffd701SAndroid Build Coastguard Worker CopyWidthBytes in terms of unswizzled surface's element-pitches:
518*35ffd701SAndroid Build Coastguard Worker
519*35ffd701SAndroid Build Coastguard Worker CopyWidthBytes = CopyWidthPixels * pLinearSurface.Element.Pitch; */
520*35ffd701SAndroid Build Coastguard Worker
521*35ffd701SAndroid Build Coastguard Worker #endif
522*35ffd701SAndroid Build Coastguard Worker
523*35ffd701SAndroid Build Coastguard Worker { // ###########################################################################
524*35ffd701SAndroid Build Coastguard Worker
525*35ffd701SAndroid Build Coastguard Worker CPU_SWIZZLE_BLT_SURFACE *pLinearSurface, *pSwizzledSurface;
526*35ffd701SAndroid Build Coastguard Worker int LinearToSwizzled;
527*35ffd701SAndroid Build Coastguard Worker
528*35ffd701SAndroid Build Coastguard Worker { // One surface swizzled, the other unswizzled (aka "linear")...
529*35ffd701SAndroid Build Coastguard Worker assert((pDest->pSwizzle != NULL) ^ (pSrc->pSwizzle != NULL));
530*35ffd701SAndroid Build Coastguard Worker
531*35ffd701SAndroid Build Coastguard Worker LinearToSwizzled = !pSrc->pSwizzle;
532*35ffd701SAndroid Build Coastguard Worker if(LinearToSwizzled)
533*35ffd701SAndroid Build Coastguard Worker {
534*35ffd701SAndroid Build Coastguard Worker pSwizzledSurface = pDest;
535*35ffd701SAndroid Build Coastguard Worker pLinearSurface = pSrc;
536*35ffd701SAndroid Build Coastguard Worker }
537*35ffd701SAndroid Build Coastguard Worker else // Swizzled-to-Linear...
538*35ffd701SAndroid Build Coastguard Worker {
539*35ffd701SAndroid Build Coastguard Worker pSwizzledSurface = pSrc;
540*35ffd701SAndroid Build Coastguard Worker pLinearSurface = pDest;
541*35ffd701SAndroid Build Coastguard Worker }
542*35ffd701SAndroid Build Coastguard Worker }
543*35ffd701SAndroid Build Coastguard Worker
544*35ffd701SAndroid Build Coastguard Worker #ifdef SUB_ELEMENT_SUPPORT
545*35ffd701SAndroid Build Coastguard Worker {
546*35ffd701SAndroid Build Coastguard Worker assert( // Either both or neither specified...
547*35ffd701SAndroid Build Coastguard Worker (pDest->Element.Pitch != 0) == (pSrc->Element.Pitch != 0));
548*35ffd701SAndroid Build Coastguard Worker
549*35ffd701SAndroid Build Coastguard Worker assert( // Surfaces agree on transfer element size...
550*35ffd701SAndroid Build Coastguard Worker pDest->Element.Size == pSrc->Element.Size);
551*35ffd701SAndroid Build Coastguard Worker
552*35ffd701SAndroid Build Coastguard Worker assert( // Element pitch not specified without element size...
553*35ffd701SAndroid Build Coastguard Worker !(pDest->Element.Pitch && !pDest->Element.Size));
554*35ffd701SAndroid Build Coastguard Worker
555*35ffd701SAndroid Build Coastguard Worker assert( // Legit element sizes...
556*35ffd701SAndroid Build Coastguard Worker (pDest->Element.Size <= pDest->Element.Pitch) &&
557*35ffd701SAndroid Build Coastguard Worker (pSrc->Element.Size <= pSrc->Element.Pitch));
558*35ffd701SAndroid Build Coastguard Worker
559*35ffd701SAndroid Build Coastguard Worker assert( // Sub-element CopyWidthBytes in terms of LinearSurface pitch...
560*35ffd701SAndroid Build Coastguard Worker (pLinearSurface->Element.Pitch == 0) ||
561*35ffd701SAndroid Build Coastguard Worker ((CopyWidthBytes % pLinearSurface->Element.Pitch) == 0));
562*35ffd701SAndroid Build Coastguard Worker }
563*35ffd701SAndroid Build Coastguard Worker #endif
564*35ffd701SAndroid Build Coastguard Worker
565*35ffd701SAndroid Build Coastguard Worker { // No surface overrun...
566*35ffd701SAndroid Build Coastguard Worker int NoOverrun =
567*35ffd701SAndroid Build Coastguard Worker #ifdef SUB_ELEMENT_SUPPORT
568*35ffd701SAndroid Build Coastguard Worker (
569*35ffd701SAndroid Build Coastguard Worker // Sub-element transfer...
570*35ffd701SAndroid Build Coastguard Worker ((pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
571*35ffd701SAndroid Build Coastguard Worker (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch)) &&
572*35ffd701SAndroid Build Coastguard Worker // No overrun...
573*35ffd701SAndroid Build Coastguard Worker ((pLinearSurface->OffsetX + CopyWidthBytes) <=
574*35ffd701SAndroid Build Coastguard Worker (pLinearSurface->Pitch +
575*35ffd701SAndroid Build Coastguard Worker // CopyWidthBytes's inclusion of uncopied bytes...
576*35ffd701SAndroid Build Coastguard Worker (pLinearSurface->Element.Pitch - pLinearSurface->Element.Size))) &&
577*35ffd701SAndroid Build Coastguard Worker ((pLinearSurface->OffsetY + CopyHeight) <= pLinearSurface->Height) &&
578*35ffd701SAndroid Build Coastguard Worker ((pSwizzledSurface->OffsetX +
579*35ffd701SAndroid Build Coastguard Worker // Adjust CopyWidthBytes from being in terms of LinearSurface pitch...
580*35ffd701SAndroid Build Coastguard Worker (CopyWidthBytes / pLinearSurface->Element.Pitch * pSwizzledSurface->Element.Pitch)
581*35ffd701SAndroid Build Coastguard Worker ) <=
582*35ffd701SAndroid Build Coastguard Worker (pSwizzledSurface->Pitch +
583*35ffd701SAndroid Build Coastguard Worker // CopyWidthBytes's inclusion of uncopied bytes...
584*35ffd701SAndroid Build Coastguard Worker (pSwizzledSurface->Element.Pitch - pSwizzledSurface->Element.Size))) &&
585*35ffd701SAndroid Build Coastguard Worker ((pSwizzledSurface->OffsetY + CopyHeight) <= pSwizzledSurface->Height)
586*35ffd701SAndroid Build Coastguard Worker ) ||
587*35ffd701SAndroid Build Coastguard Worker #endif
588*35ffd701SAndroid Build Coastguard Worker
589*35ffd701SAndroid Build Coastguard Worker ((pDest->OffsetX + CopyWidthBytes) <= pDest->Pitch) &&
590*35ffd701SAndroid Build Coastguard Worker ((pDest->OffsetY + CopyHeight) <= pDest->Height) &&
591*35ffd701SAndroid Build Coastguard Worker ((pSrc->OffsetX + CopyWidthBytes) <= pSrc->Pitch) &&
592*35ffd701SAndroid Build Coastguard Worker ((pSrc->OffsetY + CopyHeight) <= pSrc->Height);
593*35ffd701SAndroid Build Coastguard Worker
594*35ffd701SAndroid Build Coastguard Worker assert(NoOverrun);
595*35ffd701SAndroid Build Coastguard Worker }
596*35ffd701SAndroid Build Coastguard Worker
597*35ffd701SAndroid Build Coastguard Worker { // No surface overlap...
598*35ffd701SAndroid Build Coastguard Worker char *pDest0 = (char *) pDest->pBase;
599*35ffd701SAndroid Build Coastguard Worker char *pDest1 = (char *) pDest->pBase + pDest->Pitch * CopyHeight;
600*35ffd701SAndroid Build Coastguard Worker char *pSrc0 = (char *) pSrc->pBase;
601*35ffd701SAndroid Build Coastguard Worker char *pSrc1 = (char *) pSrc->pBase + pSrc->Pitch * CopyHeight;
602*35ffd701SAndroid Build Coastguard Worker
603*35ffd701SAndroid Build Coastguard Worker assert(!(
604*35ffd701SAndroid Build Coastguard Worker ((pDest0 >= pSrc0) && (pDest0 < pSrc1)) ||
605*35ffd701SAndroid Build Coastguard Worker ((pSrc0 >= pDest0) && (pSrc0 < pDest1))));
606*35ffd701SAndroid Build Coastguard Worker }
607*35ffd701SAndroid Build Coastguard Worker
608*35ffd701SAndroid Build Coastguard Worker {
609*35ffd701SAndroid Build Coastguard Worker /* BLT will have pointer in each surface between which data will be
610*35ffd701SAndroid Build Coastguard Worker copied from source to destination. Each pointer will be appropriately
611*35ffd701SAndroid Build Coastguard Worker incremented/positioned through its surface, as BLT rectangle is
612*35ffd701SAndroid Build Coastguard Worker traversed. */
613*35ffd701SAndroid Build Coastguard Worker
614*35ffd701SAndroid Build Coastguard Worker char *pLinearAddress, *pSwizzledAddress;
615*35ffd701SAndroid Build Coastguard Worker
616*35ffd701SAndroid Build Coastguard Worker // Convenient to track traversal in swizzled surface offsets...
617*35ffd701SAndroid Build Coastguard Worker int x0 = pSwizzledSurface->OffsetX;
618*35ffd701SAndroid Build Coastguard Worker int x1 = x0 + CopyWidthBytes;
619*35ffd701SAndroid Build Coastguard Worker int y0 = pSwizzledSurface->OffsetY;
620*35ffd701SAndroid Build Coastguard Worker int y1 = y0 + CopyHeight;
621*35ffd701SAndroid Build Coastguard Worker int x, y;
622*35ffd701SAndroid Build Coastguard Worker
623*35ffd701SAndroid Build Coastguard Worker // Start linear pointer at specified base...
624*35ffd701SAndroid Build Coastguard Worker pLinearAddress =
625*35ffd701SAndroid Build Coastguard Worker (char *) pLinearSurface->pBase +
626*35ffd701SAndroid Build Coastguard Worker pLinearSurface->OffsetY * pLinearSurface->Pitch +
627*35ffd701SAndroid Build Coastguard Worker pLinearSurface->OffsetX;
628*35ffd701SAndroid Build Coastguard Worker
629*35ffd701SAndroid Build Coastguard Worker #ifdef MINIMALIST // Simple implementation for functional understanding/testing/etc.
630*35ffd701SAndroid Build Coastguard Worker {
631*35ffd701SAndroid Build Coastguard Worker #ifdef SUB_ELEMENT_SUPPORT
632*35ffd701SAndroid Build Coastguard Worker assert( // No Sub-Element Transfer...
633*35ffd701SAndroid Build Coastguard Worker (pLinearSurface->Element.Size == pLinearSurface->Element.Pitch) &&
634*35ffd701SAndroid Build Coastguard Worker (pSwizzledSurface->Element.Size == pSwizzledSurface->Element.Pitch));
635*35ffd701SAndroid Build Coastguard Worker #endif
636*35ffd701SAndroid Build Coastguard Worker
637*35ffd701SAndroid Build Coastguard Worker for(y = y0; y < y1; y++)
638*35ffd701SAndroid Build Coastguard Worker {
639*35ffd701SAndroid Build Coastguard Worker for(x = x0; x < x1; x++)
640*35ffd701SAndroid Build Coastguard Worker {
641*35ffd701SAndroid Build Coastguard Worker pSwizzledAddress =
642*35ffd701SAndroid Build Coastguard Worker (char *) pSwizzledSurface->pBase +
643*35ffd701SAndroid Build Coastguard Worker SwizzleOffset(
644*35ffd701SAndroid Build Coastguard Worker pSwizzledSurface->pSwizzle,
645*35ffd701SAndroid Build Coastguard Worker pSwizzledSurface->Pitch,
646*35ffd701SAndroid Build Coastguard Worker x, y, pSwizzledSurface->OffsetZ);
647*35ffd701SAndroid Build Coastguard Worker
648*35ffd701SAndroid Build Coastguard Worker if(LinearToSwizzled)
649*35ffd701SAndroid Build Coastguard Worker {
650*35ffd701SAndroid Build Coastguard Worker *pSwizzledAddress = *pLinearAddress;
651*35ffd701SAndroid Build Coastguard Worker }
652*35ffd701SAndroid Build Coastguard Worker else
653*35ffd701SAndroid Build Coastguard Worker {
654*35ffd701SAndroid Build Coastguard Worker *pLinearAddress = *pSwizzledAddress;
655*35ffd701SAndroid Build Coastguard Worker }
656*35ffd701SAndroid Build Coastguard Worker
657*35ffd701SAndroid Build Coastguard Worker pLinearAddress++;
658*35ffd701SAndroid Build Coastguard Worker }
659*35ffd701SAndroid Build Coastguard Worker
660*35ffd701SAndroid Build Coastguard Worker pLinearAddress += pLinearSurface->Pitch - CopyWidthBytes;
661*35ffd701SAndroid Build Coastguard Worker }
662*35ffd701SAndroid Build Coastguard Worker }
663*35ffd701SAndroid Build Coastguard Worker #else // Production/Performance Implementation...
664*35ffd701SAndroid Build Coastguard Worker {
665*35ffd701SAndroid Build Coastguard Worker /* Key Performance Gains from...
666*35ffd701SAndroid Build Coastguard Worker (1) Efficient Memory Transfers (Ordering + Instruction)
667*35ffd701SAndroid Build Coastguard Worker (2) Minimizing Work in Inner Loops */
668*35ffd701SAndroid Build Coastguard Worker
669*35ffd701SAndroid Build Coastguard Worker #if(_MSC_VER >= 1600)
670*35ffd701SAndroid Build Coastguard Worker #include <stdint.h>
671*35ffd701SAndroid Build Coastguard Worker
672*35ffd701SAndroid Build Coastguard Worker #pragma warning(push)
673*35ffd701SAndroid Build Coastguard Worker #pragma warning(disable:4127) // Constant Conditional Expressions
674*35ffd701SAndroid Build Coastguard Worker
675*35ffd701SAndroid Build Coastguard Worker unsigned long LOW_BIT_Index;
676*35ffd701SAndroid Build Coastguard Worker #define LOW_BIT(x) (_BitScanForward(&LOW_BIT_Index, (x)), LOW_BIT_Index)
677*35ffd701SAndroid Build Coastguard Worker
678*35ffd701SAndroid Build Coastguard Worker unsigned long HIGH_BIT_Index;
679*35ffd701SAndroid Build Coastguard Worker #define HIGH_BIT(x) (_BitScanReverse(&HIGH_BIT_Index, (x)), HIGH_BIT_Index)
680*35ffd701SAndroid Build Coastguard Worker #elif(__GNUC__ >= 4)
681*35ffd701SAndroid Build Coastguard Worker #include <stdint.h>
682*35ffd701SAndroid Build Coastguard Worker
683*35ffd701SAndroid Build Coastguard Worker #define LOW_BIT(x) __builtin_ctz(x)
684*35ffd701SAndroid Build Coastguard Worker #define HIGH_BIT(x) ((sizeof(x) * CHAR_BIT - 1) - __builtin_clz(x))
685*35ffd701SAndroid Build Coastguard Worker #else
686*35ffd701SAndroid Build Coastguard Worker #error "Unexpected compiler!"
687*35ffd701SAndroid Build Coastguard Worker #endif
688*35ffd701SAndroid Build Coastguard Worker
689*35ffd701SAndroid Build Coastguard Worker typedef struct ___m24
690*35ffd701SAndroid Build Coastguard Worker {
691*35ffd701SAndroid Build Coastguard Worker uint8_t byte[3];
692*35ffd701SAndroid Build Coastguard Worker } __m24; // 24-bit/3-byte memory element.
693*35ffd701SAndroid Build Coastguard Worker
694*35ffd701SAndroid Build Coastguard Worker // Macros intended to compile to various types of "load register from memory" instructions...
695*35ffd701SAndroid Build Coastguard Worker #define MOVB_R( Reg, Src) (*(uint8_t *)&(Reg) = *(uint8_t *)(Src))
696*35ffd701SAndroid Build Coastguard Worker #define MOVW_R( Reg, Src) (*(uint16_t *)&(Reg) = *(uint16_t *)(Src))
697*35ffd701SAndroid Build Coastguard Worker #define MOV3_R( Reg, Src) (*(__m24 *)&(Reg) = *(__m24 *)(Src))
698*35ffd701SAndroid Build Coastguard Worker #define MOVD_R( Reg, Src) (*(uint32_t *)&(Reg) = *(uint32_t *)(Src))
699*35ffd701SAndroid Build Coastguard Worker
700*35ffd701SAndroid Build Coastguard Worker #define MOVQ_R( Reg, Src) ((Reg) = _mm_loadl_epi64((__m128i *)(Src)))
701*35ffd701SAndroid Build Coastguard Worker #define MOVDQ_R( Reg, Src) ((Reg) = _mm_load_si128( (__m128i *)(Src)))
702*35ffd701SAndroid Build Coastguard Worker #define MOVDQU_R(Reg, Src) ((Reg) = _mm_loadu_si128((__m128i *)(Src)))
703*35ffd701SAndroid Build Coastguard Worker
704*35ffd701SAndroid Build Coastguard Worker // As above, but the other half: "store to memory from register"...
705*35ffd701SAndroid Build Coastguard Worker #define MOVB_M( Dest, Reg)(*(uint8_t *)(Dest) = *(uint8_t *)&(Reg))
706*35ffd701SAndroid Build Coastguard Worker #define MOVW_M( Dest, Reg)(*(uint16_t *)(Dest) = *(uint16_t *)&(Reg))
707*35ffd701SAndroid Build Coastguard Worker #define MOV3_M( Dest, Reg)(*(__m24 *)(Dest) = *(__m24 *)&(Reg))
708*35ffd701SAndroid Build Coastguard Worker #define MOVD_M( Dest, Reg)(*(uint32_t *)(Dest) = *(uint32_t *)&(Reg))
709*35ffd701SAndroid Build Coastguard Worker
710*35ffd701SAndroid Build Coastguard Worker #define MOVQ_M( Dest, Reg)(_mm_storel_epi64((__m128i *)(Dest), (Reg)))
711*35ffd701SAndroid Build Coastguard Worker #define MOVDQ_M( Dest, Reg)(_mm_store_si128( (__m128i *)(Dest), (Reg)))
712*35ffd701SAndroid Build Coastguard Worker #define MOVDQU_M( Dest, Reg)(_mm_storeu_si128((__m128i *)(Dest), (Reg)))
713*35ffd701SAndroid Build Coastguard Worker #define MOVNTDQ_M( Dest, Reg)(_mm_stream_si128((__m128i *)(Dest), (Reg)))
714*35ffd701SAndroid Build Coastguard Worker
715*35ffd701SAndroid Build Coastguard Worker
716*35ffd701SAndroid Build Coastguard Worker #define MIN_CONTAINED_POW2_BELOW_CAP(x, Cap) (1 << LOW_BIT((1 << LOW_BIT(x)) | (1 << HIGH_BIT(Cap))))
717*35ffd701SAndroid Build Coastguard Worker
718*35ffd701SAndroid Build Coastguard Worker #define SWIZZLE_OFFSET(OffsetX, OffsetY, OffsetZ) \
719*35ffd701SAndroid Build Coastguard Worker SwizzleOffset(pSwizzledSurface->pSwizzle, pSwizzledSurface->Pitch, OffsetX, OffsetY, OffsetZ)
720*35ffd701SAndroid Build Coastguard Worker
721*35ffd701SAndroid Build Coastguard Worker #define MAX_XFER_WIDTH 16 // See "Compute Transfer Dimensions".
722*35ffd701SAndroid Build Coastguard Worker #define MAX_XFER_HEIGHT 4 // "
723*35ffd701SAndroid Build Coastguard Worker
724*35ffd701SAndroid Build Coastguard Worker char StreamingLoadSupported = -1; // SSE4.1: MOVNTDQA
725*35ffd701SAndroid Build Coastguard Worker
726*35ffd701SAndroid Build Coastguard Worker int TileWidthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.x); // Log2(Tile Width in Bytes)
727*35ffd701SAndroid Build Coastguard Worker int TileHeightBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.y); // Log2(Tile Height)
728*35ffd701SAndroid Build Coastguard Worker int TileDepthBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.z); // Log2(Tile Depth or MSAA Samples)
729*35ffd701SAndroid Build Coastguard Worker int BytesPerRowOfTiles = pSwizzledSurface->Pitch << (TileDepthBits + TileHeightBits);
730*35ffd701SAndroid Build Coastguard Worker
731*35ffd701SAndroid Build Coastguard Worker struct { int LeftCrust, MainRun, RightCrust; } CopyWidth;
732*35ffd701SAndroid Build Coastguard Worker int MaskX[MAX_XFER_WIDTH + 1], MaskY[MAX_XFER_HEIGHT + 1];
733*35ffd701SAndroid Build Coastguard Worker int SwizzledOffsetX0, SwizzledOffsetY;
734*35ffd701SAndroid Build Coastguard Worker struct { int Width, Height; } SwizzleMaxXfer;
735*35ffd701SAndroid Build Coastguard Worker
736*35ffd701SAndroid Build Coastguard Worker char *pSwizzledAddressCopyBase =
737*35ffd701SAndroid Build Coastguard Worker (char *) pSwizzledSurface->pBase +
738*35ffd701SAndroid Build Coastguard Worker SWIZZLE_OFFSET(0, 0, pSwizzledSurface->OffsetZ);
739*35ffd701SAndroid Build Coastguard Worker
740*35ffd701SAndroid Build Coastguard Worker assert(sizeof(__m24) == 3);
741*35ffd701SAndroid Build Coastguard Worker
742*35ffd701SAndroid Build Coastguard Worker if(StreamingLoadSupported == -1)
743*35ffd701SAndroid Build Coastguard Worker {
744*35ffd701SAndroid Build Coastguard Worker #if(_MSC_VER >= 1500)
745*35ffd701SAndroid Build Coastguard Worker #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
746*35ffd701SAndroid Build Coastguard Worker int CpuInfo[4];
747*35ffd701SAndroid Build Coastguard Worker __cpuid(CpuInfo, 1);
748*35ffd701SAndroid Build Coastguard Worker StreamingLoadSupported = ((CpuInfo[2] & (1 << 19)) != 0); // ECX[19] = SSE4.1
749*35ffd701SAndroid Build Coastguard Worker #elif(defined(__ARM_ARCH))
750*35ffd701SAndroid Build Coastguard Worker #define MOVNTDQA_R(Reg, Src) ((Reg) = (Reg))
751*35ffd701SAndroid Build Coastguard Worker StreamingLoadSupported = 0;
752*35ffd701SAndroid Build Coastguard Worker #elif((defined __clang__) || (__GNUC__ > 4) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))
753*35ffd701SAndroid Build Coastguard Worker #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
754*35ffd701SAndroid Build Coastguard Worker unsigned int eax, ebx, ecx, edx;
755*35ffd701SAndroid Build Coastguard Worker __cpuid(1, eax, ebx, ecx, edx);
756*35ffd701SAndroid Build Coastguard Worker StreamingLoadSupported = ((ecx & (1 << 19)) != 0); // ECX[19] = SSE4.1
757*35ffd701SAndroid Build Coastguard Worker #else
758*35ffd701SAndroid Build Coastguard Worker #define MOVNTDQA_R(Reg, Src) ((Reg) = (Reg))
759*35ffd701SAndroid Build Coastguard Worker StreamingLoadSupported = 0;
760*35ffd701SAndroid Build Coastguard Worker #endif
761*35ffd701SAndroid Build Coastguard Worker }
762*35ffd701SAndroid Build Coastguard Worker
763*35ffd701SAndroid Build Coastguard Worker { // Compute Transfer Dimensions...
764*35ffd701SAndroid Build Coastguard Worker
765*35ffd701SAndroid Build Coastguard Worker /* When transferring between linear and swizzled surfaces, we
766*35ffd701SAndroid Build Coastguard Worker can't traverse linearly through memory of both since they have
767*35ffd701SAndroid Build Coastguard Worker drastically different memory orderings--Moving linearly through
768*35ffd701SAndroid Build Coastguard Worker one means bouncing around the other.
769*35ffd701SAndroid Build Coastguard Worker
770*35ffd701SAndroid Build Coastguard Worker Moving linearly through linear surface is more programmatically
771*35ffd701SAndroid Build Coastguard Worker convenient--especially when BLT rectangles not constrained to
772*35ffd701SAndroid Build Coastguard Worker tile boundaries. But moving linearly through swizzled surface
773*35ffd701SAndroid Build Coastguard Worker memory is often more performance-friendly--especially when that
774*35ffd701SAndroid Build Coastguard Worker memory is CPU-mapped as WC (Write Combining), which is often
775*35ffd701SAndroid Build Coastguard Worker the case for graphics memory.
776*35ffd701SAndroid Build Coastguard Worker
777*35ffd701SAndroid Build Coastguard Worker Fortunately, we can avoid shortcomings of both extremes by
778*35ffd701SAndroid Build Coastguard Worker using hybrid traversal: Traverse mostly linearly through linear
779*35ffd701SAndroid Build Coastguard Worker surface, but have innermost loop transfer small 2D chunks sized
780*35ffd701SAndroid Build Coastguard Worker to use critical runs of linearity in the swizzled memory.
781*35ffd701SAndroid Build Coastguard Worker
782*35ffd701SAndroid Build Coastguard Worker The "critical runs of linearity" that we want to hit in the
783*35ffd701SAndroid Build Coastguard Worker swizzled memory are aligned, cache-line-sized memory chunks. If
784*35ffd701SAndroid Build Coastguard Worker we bounce around with finer granularity we'll incur penalties
785*35ffd701SAndroid Build Coastguard Worker of partial WC buffer use (whether from WC memory use or non-
786*35ffd701SAndroid Build Coastguard Worker temporal stores).
787*35ffd701SAndroid Build Coastguard Worker
788*35ffd701SAndroid Build Coastguard Worker The size of 2D chunks with cache-line-sized linearity in
789*35ffd701SAndroid Build Coastguard Worker swizzled memory is determined by swizzle mapping's low-order
790*35ffd701SAndroid Build Coastguard Worker six bits (for 64-byte cache lines). Most swizzles use
791*35ffd701SAndroid Build Coastguard Worker "Y Y X X X X" in their low-order bits, which means their cache
792*35ffd701SAndroid Build Coastguard Worker lines store 16x4 chunks--So our implementation will use those
793*35ffd701SAndroid Build Coastguard Worker dimensions as our target/maximum 2D transfer chunk. If we had
794*35ffd701SAndroid Build Coastguard Worker any 8x8 (or taller) swizzles, we should add such support and
795*35ffd701SAndroid Build Coastguard Worker increase our maximum chunk height. If we had any 32x2 swizzles,
796*35ffd701SAndroid Build Coastguard Worker we should add such support and increase our maximum chunk width.
797*35ffd701SAndroid Build Coastguard Worker
798*35ffd701SAndroid Build Coastguard Worker Our implementation only bothers optimizing for 2D transfer
799*35ffd701SAndroid Build Coastguard Worker chunks stored in row-major order--i.e. those whose swizzle
800*35ffd701SAndroid Build Coastguard Worker mapping bits have a series of X's in the low-order, followed by
801*35ffd701SAndroid Build Coastguard Worker Y's in the higher-order. Where a swizzle mapping inflection
802*35ffd701SAndroid Build Coastguard Worker from Y back to X occurs, contiguous row-ordering is lost, and
803*35ffd701SAndroid Build Coastguard Worker we would use that smaller, row-ordered chunk size. */
804*35ffd701SAndroid Build Coastguard Worker
805*35ffd701SAndroid Build Coastguard Worker int TargetMask;
806*35ffd701SAndroid Build Coastguard Worker
807*35ffd701SAndroid Build Coastguard Worker // Narrow optimized transfer Width by looking for inflection from X's...
808*35ffd701SAndroid Build Coastguard Worker SwizzleMaxXfer.Width = MAX_XFER_WIDTH;
809*35ffd701SAndroid Build Coastguard Worker while( (TargetMask = SwizzleMaxXfer.Width - 1) &&
810*35ffd701SAndroid Build Coastguard Worker ((pSwizzledSurface->pSwizzle->Mask.x & TargetMask) != TargetMask))
811*35ffd701SAndroid Build Coastguard Worker {
812*35ffd701SAndroid Build Coastguard Worker SwizzleMaxXfer.Width >>= 1;
813*35ffd701SAndroid Build Coastguard Worker }
814*35ffd701SAndroid Build Coastguard Worker
815*35ffd701SAndroid Build Coastguard Worker // Narrow optimized transfer height by looking for inflection from Y's...
816*35ffd701SAndroid Build Coastguard Worker SwizzleMaxXfer.Height = MAX_XFER_HEIGHT;
817*35ffd701SAndroid Build Coastguard Worker
818*35ffd701SAndroid Build Coastguard Worker while( (TargetMask = (SwizzleMaxXfer.Height - 1) * SwizzleMaxXfer.Width) &&
819*35ffd701SAndroid Build Coastguard Worker ((pSwizzledSurface->pSwizzle->Mask.y & TargetMask) != TargetMask))
820*35ffd701SAndroid Build Coastguard Worker {
821*35ffd701SAndroid Build Coastguard Worker SwizzleMaxXfer.Height >>= 1;
822*35ffd701SAndroid Build Coastguard Worker }
823*35ffd701SAndroid Build Coastguard Worker }
824*35ffd701SAndroid Build Coastguard Worker
825*35ffd701SAndroid Build Coastguard Worker { // Separate CopyWidthBytes into unaligned left/right "crust" and aligned "MainRun"...
826*35ffd701SAndroid Build Coastguard Worker int MaxXferWidth = MIN_CONTAINED_POW2_BELOW_CAP(SwizzleMaxXfer.Width, CopyWidthBytes);
827*35ffd701SAndroid Build Coastguard Worker
828*35ffd701SAndroid Build Coastguard Worker CopyWidth.LeftCrust = // i.e. "bytes to xfer-aligned boundary"
829*35ffd701SAndroid Build Coastguard Worker (MaxXferWidth - x0) & (MaxXferWidth - 1); // Simplification of ((MaxXferWidth - (x0 % MaxXferWidth)) % MaxXferWidth)
830*35ffd701SAndroid Build Coastguard Worker
831*35ffd701SAndroid Build Coastguard Worker CopyWidth.MainRun =
832*35ffd701SAndroid Build Coastguard Worker (CopyWidthBytes - CopyWidth.LeftCrust) & ~(SwizzleMaxXfer.Width - 1); // MainRun is of SwizzleMaxXfer.Width's--not MaxXferWidth's.
833*35ffd701SAndroid Build Coastguard Worker
834*35ffd701SAndroid Build Coastguard Worker CopyWidth.RightCrust = CopyWidthBytes - (CopyWidth.LeftCrust + CopyWidth.MainRun);
835*35ffd701SAndroid Build Coastguard Worker
836*35ffd701SAndroid Build Coastguard Worker #ifdef SUB_ELEMENT_SUPPORT
837*35ffd701SAndroid Build Coastguard Worker {
838*35ffd701SAndroid Build Coastguard Worker // For partial-pixel transfers, there is no crust and MainRun is done pixel-by-pixel...
839*35ffd701SAndroid Build Coastguard Worker if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
840*35ffd701SAndroid Build Coastguard Worker (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
841*35ffd701SAndroid Build Coastguard Worker {
842*35ffd701SAndroid Build Coastguard Worker CopyWidth.LeftCrust = CopyWidth.RightCrust = 0;
843*35ffd701SAndroid Build Coastguard Worker CopyWidth.MainRun = CopyWidthBytes;
844*35ffd701SAndroid Build Coastguard Worker }
845*35ffd701SAndroid Build Coastguard Worker }
846*35ffd701SAndroid Build Coastguard Worker #endif
847*35ffd701SAndroid Build Coastguard Worker }
848*35ffd701SAndroid Build Coastguard Worker
849*35ffd701SAndroid Build Coastguard Worker
850*35ffd701SAndroid Build Coastguard Worker /* Unlike in MINIMALIST implementation, which fully computes
851*35ffd701SAndroid Build Coastguard Worker swizzled offset for each transfer element, we want to minimize work
852*35ffd701SAndroid Build Coastguard Worker done in our inner loops.
853*35ffd701SAndroid Build Coastguard Worker
854*35ffd701SAndroid Build Coastguard Worker One way we'll reduce work is to separate pSwizzledAddress into
855*35ffd701SAndroid Build Coastguard Worker dimensional components--e.g. so Y-swizzling doesn't have to be
856*35ffd701SAndroid Build Coastguard Worker recomputed in X-loop.
857*35ffd701SAndroid Build Coastguard Worker
858*35ffd701SAndroid Build Coastguard Worker But a more powerful way we'll reduce work is...Instead of linearly
859*35ffd701SAndroid Build Coastguard Worker incrementing spatial offsets and then converting to their swizzled
860*35ffd701SAndroid Build Coastguard Worker counterparts, we'll compute swizzled bases outside the loops and
861*35ffd701SAndroid Build Coastguard Worker keep them swizzled using swizzled incrementing inside the loops--
862*35ffd701SAndroid Build Coastguard Worker since swizzled incrementing can be much cheaper than repeatedly
863*35ffd701SAndroid Build Coastguard Worker swizzling spatial offsets.
864*35ffd701SAndroid Build Coastguard Worker
865*35ffd701SAndroid Build Coastguard Worker Intra-tile swizzled incrementing can be done by using the inverse
866*35ffd701SAndroid Build Coastguard Worker of a spatial component's swizzle mask to ripple-carry a +1 to and
867*35ffd701SAndroid Build Coastguard Worker across the bits of a currently swizzled value--e.g. with...
868*35ffd701SAndroid Build Coastguard Worker
869*35ffd701SAndroid Build Coastguard Worker SwizzledOffsetY: Y X Y X Y Y X X X X
870*35ffd701SAndroid Build Coastguard Worker ~MaskY: 0 1 0 1 0 0 1 1 1 1
871*35ffd701SAndroid Build Coastguard Worker + 1
872*35ffd701SAndroid Build Coastguard Worker -----------------------
873*35ffd701SAndroid Build Coastguard Worker
874*35ffd701SAndroid Build Coastguard Worker ...set low-order ~MaskY bits will always ripple-carry the
875*35ffd701SAndroid Build Coastguard Worker incrementing +1 to wherever Y0 happens to be, and wherever there is
876*35ffd701SAndroid Build Coastguard Worker an arithmetic carry out of one Y position, set ~MaskY bits will
877*35ffd701SAndroid Build Coastguard Worker carry it across any gaps to the next Y position.
878*35ffd701SAndroid Build Coastguard Worker
879*35ffd701SAndroid Build Coastguard Worker The above algorithm only works for adding one, but the mask used
880*35ffd701SAndroid Build Coastguard Worker can be modified to deliver the +1 to any bit location, so any power
881*35ffd701SAndroid Build Coastguard Worker of two increment can be achieved.
882*35ffd701SAndroid Build Coastguard Worker
883*35ffd701SAndroid Build Coastguard Worker After swizzled increment, residue from mask addition and undesired
884*35ffd701SAndroid Build Coastguard Worker carries outside targeted fields must be removed using the natural
885*35ffd701SAndroid Build Coastguard Worker mask--So the final intra-tile swizzled increment is...
886*35ffd701SAndroid Build Coastguard Worker
887*35ffd701SAndroid Build Coastguard Worker SwizzledOffsetQ = (SwizzledOffsetQ + ~MaskQ + 1) & MaskQ
888*35ffd701SAndroid Build Coastguard Worker ...where Q is the applicable X/Y/Z dimensional component.
889*35ffd701SAndroid Build Coastguard Worker
890*35ffd701SAndroid Build Coastguard Worker Or since in two's complement, (~MaskQ + 1) = -MaskQ...
891*35ffd701SAndroid Build Coastguard Worker
892*35ffd701SAndroid Build Coastguard Worker SwizzledOffsetQ = (SwizzledOffsetQ - MaskQ) & MaskQ
893*35ffd701SAndroid Build Coastguard Worker
894*35ffd701SAndroid Build Coastguard Worker Since tile sizes are powers of two and tiles laid out in row-major
895*35ffd701SAndroid Build Coastguard Worker order across surface, the above swizzled incrementing can
896*35ffd701SAndroid Build Coastguard Worker additionally be used for inter-tile incrementing of X component by
897*35ffd701SAndroid Build Coastguard Worker extending applicable mask to include offset bits beyond the tile--
898*35ffd701SAndroid Build Coastguard Worker so arithmetic carries out of intra-tile X component will ripple to
899*35ffd701SAndroid Build Coastguard Worker advance swizzled inter-tile X offset to next tile. Same is not true
900*35ffd701SAndroid Build Coastguard Worker of inter-tile Y incrementing since surface pitches not restricted
901*35ffd701SAndroid Build Coastguard Worker to powers of two. */
902*35ffd701SAndroid Build Coastguard Worker
903*35ffd701SAndroid Build Coastguard Worker { // Compute Mask[IncSize] for Needed Increment Values...
904*35ffd701SAndroid Build Coastguard Worker int ExtendedMaskX = // Bits beyond the tile (so X incrementing can operate inter-tile)...
905*35ffd701SAndroid Build Coastguard Worker ~(pSwizzledSurface->pSwizzle->Mask.x |
906*35ffd701SAndroid Build Coastguard Worker pSwizzledSurface->pSwizzle->Mask.y |
907*35ffd701SAndroid Build Coastguard Worker pSwizzledSurface->pSwizzle->Mask.z);
908*35ffd701SAndroid Build Coastguard Worker
909*35ffd701SAndroid Build Coastguard Worker /* Subtraction below delivers natural mask for +1 increment,
910*35ffd701SAndroid Build Coastguard Worker and appropriately altered mask to deliver +1 to higher bit
911*35ffd701SAndroid Build Coastguard Worker positions for +2/4/8/etc. increments. */
912*35ffd701SAndroid Build Coastguard Worker
913*35ffd701SAndroid Build Coastguard Worker for(x = SwizzleMaxXfer.Width; x >= 1; x >>= 1)
914*35ffd701SAndroid Build Coastguard Worker {
915*35ffd701SAndroid Build Coastguard Worker MaskX[x] = SWIZZLE_OFFSET((1 << TileWidthBits) - x, 0, 0) | ExtendedMaskX;
916*35ffd701SAndroid Build Coastguard Worker }
917*35ffd701SAndroid Build Coastguard Worker
918*35ffd701SAndroid Build Coastguard Worker for(y = SwizzleMaxXfer.Height; y >= 1; y >>= 1)
919*35ffd701SAndroid Build Coastguard Worker {
920*35ffd701SAndroid Build Coastguard Worker MaskY[y] = SWIZZLE_OFFSET(0, (1 << TileHeightBits) - y, 0);
921*35ffd701SAndroid Build Coastguard Worker }
922*35ffd701SAndroid Build Coastguard Worker }
923*35ffd701SAndroid Build Coastguard Worker
924*35ffd701SAndroid Build Coastguard Worker { // Base Dimensional Swizzled Offsets...
925*35ffd701SAndroid Build Coastguard Worker int IntraTileY = y0 & ((1 << TileHeightBits) - 1);
926*35ffd701SAndroid Build Coastguard Worker int TileAlignedY = y0 - IntraTileY;
927*35ffd701SAndroid Build Coastguard Worker
928*35ffd701SAndroid Build Coastguard Worker SwizzledOffsetY = SWIZZLE_OFFSET(0, IntraTileY, 0);
929*35ffd701SAndroid Build Coastguard Worker
930*35ffd701SAndroid Build Coastguard Worker SwizzledOffsetX0 =
931*35ffd701SAndroid Build Coastguard Worker SWIZZLE_OFFSET(
932*35ffd701SAndroid Build Coastguard Worker x0,
933*35ffd701SAndroid Build Coastguard Worker TileAlignedY, // <-- Since SwizzledOffsetX will include "bits beyond the tile".
934*35ffd701SAndroid Build Coastguard Worker 0);
935*35ffd701SAndroid Build Coastguard Worker }
936*35ffd701SAndroid Build Coastguard Worker
937*35ffd701SAndroid Build Coastguard Worker // BLT Loops ///////////////////////////////////////////////////////
938*35ffd701SAndroid Build Coastguard Worker
939*35ffd701SAndroid Build Coastguard Worker /* Traverse BLT rectangle, transferring small, optimally-aligned 2D
940*35ffd701SAndroid Build Coastguard Worker chunks, as appropriate for given swizzle format. Use swizzled
941*35ffd701SAndroid Build Coastguard Worker incrementing of dimensional swizzled components. */
942*35ffd701SAndroid Build Coastguard Worker
943*35ffd701SAndroid Build Coastguard Worker for(y = y0; y < y1; )
944*35ffd701SAndroid Build Coastguard Worker {
945*35ffd701SAndroid Build Coastguard Worker char *pSwizzledAddressLine = pSwizzledAddressCopyBase + SwizzledOffsetY;
946*35ffd701SAndroid Build Coastguard Worker int xferHeight =
947*35ffd701SAndroid Build Coastguard Worker // Largest pow2 xfer height that alignment, MaxXfer, and lines left will permit...
948*35ffd701SAndroid Build Coastguard Worker MIN_CONTAINED_POW2_BELOW_CAP(y | SwizzleMaxXfer.Height, y1 - y);
949*35ffd701SAndroid Build Coastguard Worker int SwizzledOffsetX = SwizzledOffsetX0;
950*35ffd701SAndroid Build Coastguard Worker
951*35ffd701SAndroid Build Coastguard Worker __m128i xmm[MAX_XFER_HEIGHT];
952*35ffd701SAndroid Build Coastguard Worker char *pLinearAddressEnd;
953*35ffd701SAndroid Build Coastguard Worker int _MaskX;
954*35ffd701SAndroid Build Coastguard Worker
955*35ffd701SAndroid Build Coastguard Worker // XFER Macros /////////////////////////////////////////////////
956*35ffd701SAndroid Build Coastguard Worker
957*35ffd701SAndroid Build Coastguard Worker /* We'll define "XFER" macro to contain BLT X-loop work.
958*35ffd701SAndroid Build Coastguard Worker
959*35ffd701SAndroid Build Coastguard Worker In simple implementation, XFER would be WHILE loop that does
960*35ffd701SAndroid Build Coastguard Worker SSE transfer and performs pointer and swizzled offset
961*35ffd701SAndroid Build Coastguard Worker incrementing.
962*35ffd701SAndroid Build Coastguard Worker
963*35ffd701SAndroid Build Coastguard Worker ...but we have multiple conditions to handle...
964*35ffd701SAndroid Build Coastguard Worker - Transfer Direction (Linear <--> Swizzled)
965*35ffd701SAndroid Build Coastguard Worker - Optimal 2D Transfer Chunk Size
966*35ffd701SAndroid Build Coastguard Worker - Available/Desired CPU Transfer Instructions
967*35ffd701SAndroid Build Coastguard Worker - Unaligned Crust
968*35ffd701SAndroid Build Coastguard Worker
969*35ffd701SAndroid Build Coastguard Worker Don't want X-loop to have conditional logic to handle
970*35ffd701SAndroid Build Coastguard Worker variations since would retard performance--but neither do we
971*35ffd701SAndroid Build Coastguard Worker want messy multitude of slightly different, copy-pasted code
972*35ffd701SAndroid Build Coastguard Worker paths. So instead, XFER macro will provide common code template
973*35ffd701SAndroid Build Coastguard Worker allowing instantiation of multiple X-loop variations--i.e. XFER
974*35ffd701SAndroid Build Coastguard Worker calls from conditional Y-loop code will expand into separate,
975*35ffd701SAndroid Build Coastguard Worker conditional-free, "lean and mean" X-loops.
976*35ffd701SAndroid Build Coastguard Worker
977*35ffd701SAndroid Build Coastguard Worker Some conditional logic remains in XFER chain--but only outside
978*35ffd701SAndroid Build Coastguard Worker X-loop. The two IF statements that remain in X-loop (i.e. those
979*35ffd701SAndroid Build Coastguard Worker in XFER_LOAD/STORE) expand to compile-time constant conditional
980*35ffd701SAndroid Build Coastguard Worker expressions, so with optimizing compiler, no runtime-
981*35ffd701SAndroid Build Coastguard Worker conditional code will be generated--i.e. constant conditionals
982*35ffd701SAndroid Build Coastguard Worker will simply decide whether given instantiation has that code or
983*35ffd701SAndroid Build Coastguard Worker not. */
984*35ffd701SAndroid Build Coastguard Worker
985*35ffd701SAndroid Build Coastguard Worker #define XFER(XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
986*35ffd701SAndroid Build Coastguard Worker { \
987*35ffd701SAndroid Build Coastguard Worker XFER_LINES(4, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
988*35ffd701SAndroid Build Coastguard Worker else XFER_LINES(2, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
989*35ffd701SAndroid Build Coastguard Worker else XFER_LINES(1, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust);\
990*35ffd701SAndroid Build Coastguard Worker }
991*35ffd701SAndroid Build Coastguard Worker
992*35ffd701SAndroid Build Coastguard Worker #define XFER_LINES(XFER_LINES_Lines, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
993*35ffd701SAndroid Build Coastguard Worker if(xferHeight == (XFER_LINES_Lines)) \
994*35ffd701SAndroid Build Coastguard Worker { \
995*35ffd701SAndroid Build Coastguard Worker if(XFER_Crust) \
996*35ffd701SAndroid Build Coastguard Worker { \
997*35ffd701SAndroid Build Coastguard Worker XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.LeftCrust & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
998*35ffd701SAndroid Build Coastguard Worker XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.LeftCrust & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
999*35ffd701SAndroid Build Coastguard Worker XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.LeftCrust & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1000*35ffd701SAndroid Build Coastguard Worker XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.LeftCrust & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1001*35ffd701SAndroid Build Coastguard Worker } \
1002*35ffd701SAndroid Build Coastguard Worker \
1003*35ffd701SAndroid Build Coastguard Worker XFER_SPAN(XFER_Store, XFER_Load, CopyWidth.MainRun, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch);\
1004*35ffd701SAndroid Build Coastguard Worker \
1005*35ffd701SAndroid Build Coastguard Worker if(XFER_Crust) \
1006*35ffd701SAndroid Build Coastguard Worker { \
1007*35ffd701SAndroid Build Coastguard Worker XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.RightCrust & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1008*35ffd701SAndroid Build Coastguard Worker XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.RightCrust & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1009*35ffd701SAndroid Build Coastguard Worker XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.RightCrust & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1010*35ffd701SAndroid Build Coastguard Worker XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.RightCrust & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
1011*35ffd701SAndroid Build Coastguard Worker } \
1012*35ffd701SAndroid Build Coastguard Worker }
1013*35ffd701SAndroid Build Coastguard Worker
1014*35ffd701SAndroid Build Coastguard Worker #define XFER_SPAN(XFER_Store, XFER_Load, XFER_CopyWidthBytes, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_Height, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch) \
1015*35ffd701SAndroid Build Coastguard Worker { \
1016*35ffd701SAndroid Build Coastguard Worker pLinearAddressEnd = pLinearAddress + (XFER_CopyWidthBytes); \
1017*35ffd701SAndroid Build Coastguard Worker _MaskX = MaskX[XFER_Pitch_Swizzled]; \
1018*35ffd701SAndroid Build Coastguard Worker while(pLinearAddress < pLinearAddressEnd) \
1019*35ffd701SAndroid Build Coastguard Worker { \
1020*35ffd701SAndroid Build Coastguard Worker pSwizzledAddress = pSwizzledAddressLine + SwizzledOffsetX; \
1021*35ffd701SAndroid Build Coastguard Worker \
1022*35ffd701SAndroid Build Coastguard Worker XFER_LOAD(0, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height); \
1023*35ffd701SAndroid Build Coastguard Worker XFER_LOAD(1, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height); \
1024*35ffd701SAndroid Build Coastguard Worker XFER_LOAD(2, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height); \
1025*35ffd701SAndroid Build Coastguard Worker XFER_LOAD(3, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height); \
1026*35ffd701SAndroid Build Coastguard Worker XFER_STORE(0, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1027*35ffd701SAndroid Build Coastguard Worker XFER_STORE(1, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1028*35ffd701SAndroid Build Coastguard Worker XFER_STORE(2, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1029*35ffd701SAndroid Build Coastguard Worker XFER_STORE(3, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
1030*35ffd701SAndroid Build Coastguard Worker \
1031*35ffd701SAndroid Build Coastguard Worker SwizzledOffsetX = (SwizzledOffsetX - _MaskX) & _MaskX; \
1032*35ffd701SAndroid Build Coastguard Worker pLinearAddress += (XFER_Pitch_Linear); \
1033*35ffd701SAndroid Build Coastguard Worker } \
1034*35ffd701SAndroid Build Coastguard Worker }
1035*35ffd701SAndroid Build Coastguard Worker
1036*35ffd701SAndroid Build Coastguard Worker #define XFER_LOAD(XFER_Line, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height) \
1037*35ffd701SAndroid Build Coastguard Worker { \
1038*35ffd701SAndroid Build Coastguard Worker if((XFER_Line) < (XFER_Height)) \
1039*35ffd701SAndroid Build Coastguard Worker { \
1040*35ffd701SAndroid Build Coastguard Worker XFER_Load( \
1041*35ffd701SAndroid Build Coastguard Worker xmm[XFER_Line], \
1042*35ffd701SAndroid Build Coastguard Worker (XFER_pSrc) + (XFER_Line) * (XFER_SrcPitch)); \
1043*35ffd701SAndroid Build Coastguard Worker } \
1044*35ffd701SAndroid Build Coastguard Worker }
1045*35ffd701SAndroid Build Coastguard Worker
1046*35ffd701SAndroid Build Coastguard Worker #define XFER_STORE(XFER_Line, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height) \
1047*35ffd701SAndroid Build Coastguard Worker { \
1048*35ffd701SAndroid Build Coastguard Worker if((XFER_Line) < (XFER_Height)) \
1049*35ffd701SAndroid Build Coastguard Worker { \
1050*35ffd701SAndroid Build Coastguard Worker XFER_Store( \
1051*35ffd701SAndroid Build Coastguard Worker (XFER_pDest) + (XFER_Line) * (XFER_DestPitch), \
1052*35ffd701SAndroid Build Coastguard Worker xmm[XFER_Line]); \
1053*35ffd701SAndroid Build Coastguard Worker } \
1054*35ffd701SAndroid Build Coastguard Worker }
1055*35ffd701SAndroid Build Coastguard Worker
1056*35ffd701SAndroid Build Coastguard Worker // Perform Applicable Transfer /////////////////////////////////
1057*35ffd701SAndroid Build Coastguard Worker assert( // DQ Alignment...
1058*35ffd701SAndroid Build Coastguard Worker ((intptr_t) pSwizzledSurface->pBase % 16 == 0) &&
1059*35ffd701SAndroid Build Coastguard Worker (pSwizzledSurface->Pitch % 16 == 0));
1060*35ffd701SAndroid Build Coastguard Worker
1061*35ffd701SAndroid Build Coastguard Worker #ifdef SUB_ELEMENT_SUPPORT
1062*35ffd701SAndroid Build Coastguard Worker if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
1063*35ffd701SAndroid Build Coastguard Worker (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
1064*35ffd701SAndroid Build Coastguard Worker {
1065*35ffd701SAndroid Build Coastguard Worker if(LinearToSwizzled)
1066*35ffd701SAndroid Build Coastguard Worker {
1067*35ffd701SAndroid Build Coastguard Worker switch(pLinearSurface->Element.Size)
1068*35ffd701SAndroid Build Coastguard Worker {
1069*35ffd701SAndroid Build Coastguard Worker case 16: XFER(MOVNTDQ_M, MOVDQU_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1070*35ffd701SAndroid Build Coastguard Worker case 8: XFER( MOVQ_M, MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1071*35ffd701SAndroid Build Coastguard Worker case 4: XFER( MOVD_M, MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1072*35ffd701SAndroid Build Coastguard Worker case 3: XFER( MOV3_M, MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1073*35ffd701SAndroid Build Coastguard Worker case 2: XFER( MOVW_M, MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1074*35ffd701SAndroid Build Coastguard Worker case 1: XFER( MOVB_M, MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
1075*35ffd701SAndroid Build Coastguard Worker default: assert(0);
1076*35ffd701SAndroid Build Coastguard Worker }
1077*35ffd701SAndroid Build Coastguard Worker }
1078*35ffd701SAndroid Build Coastguard Worker else
1079*35ffd701SAndroid Build Coastguard Worker {
1080*35ffd701SAndroid Build Coastguard Worker switch(pLinearSurface->Element.Size)
1081*35ffd701SAndroid Build Coastguard Worker {
1082*35ffd701SAndroid Build Coastguard Worker case 16:
1083*35ffd701SAndroid Build Coastguard Worker {
1084*35ffd701SAndroid Build Coastguard Worker if(StreamingLoadSupported)
1085*35ffd701SAndroid Build Coastguard Worker {
1086*35ffd701SAndroid Build Coastguard Worker XFER(MOVDQU_M, MOVNTDQA_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
1087*35ffd701SAndroid Build Coastguard Worker }
1088*35ffd701SAndroid Build Coastguard Worker else
1089*35ffd701SAndroid Build Coastguard Worker {
1090*35ffd701SAndroid Build Coastguard Worker XFER(MOVDQU_M, MOVDQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
1091*35ffd701SAndroid Build Coastguard Worker }
1092*35ffd701SAndroid Build Coastguard Worker break;
1093*35ffd701SAndroid Build Coastguard Worker }
1094*35ffd701SAndroid Build Coastguard Worker case 8: XFER( MOVQ_M, MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1095*35ffd701SAndroid Build Coastguard Worker case 4: XFER( MOVD_M, MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1096*35ffd701SAndroid Build Coastguard Worker case 3: XFER( MOV3_M, MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1097*35ffd701SAndroid Build Coastguard Worker case 2: XFER( MOVW_M, MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1098*35ffd701SAndroid Build Coastguard Worker case 1: XFER( MOVB_M, MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
1099*35ffd701SAndroid Build Coastguard Worker default: assert(0);
1100*35ffd701SAndroid Build Coastguard Worker }
1101*35ffd701SAndroid Build Coastguard Worker }
1102*35ffd701SAndroid Build Coastguard Worker } else
1103*35ffd701SAndroid Build Coastguard Worker #endif // SUB_ELEMENT_SUPPORT
1104*35ffd701SAndroid Build Coastguard Worker if(LinearToSwizzled)
1105*35ffd701SAndroid Build Coastguard Worker {
1106*35ffd701SAndroid Build Coastguard Worker switch(SwizzleMaxXfer.Width)
1107*35ffd701SAndroid Build Coastguard Worker {
1108*35ffd701SAndroid Build Coastguard Worker case 16: XFER(MOVNTDQ_M, MOVDQU_R, 16, 16, pSwizzledAddress, 16, pLinearAddress, pLinearSurface->Pitch, 1); break;
1109*35ffd701SAndroid Build Coastguard Worker #ifdef INTEL_TILE_W_SUPPORT
1110*35ffd701SAndroid Build Coastguard Worker case 2: XFER(MOVW_M, MOVW_R, 2, 2, pSwizzledAddress, 2, pLinearAddress, pLinearSurface->Pitch, 1); break;
1111*35ffd701SAndroid Build Coastguard Worker #endif
1112*35ffd701SAndroid Build Coastguard Worker default: assert(0); // Unexpected cases excluded to save compile time/size of multiplying instantiations.
1113*35ffd701SAndroid Build Coastguard Worker }
1114*35ffd701SAndroid Build Coastguard Worker }
1115*35ffd701SAndroid Build Coastguard Worker else
1116*35ffd701SAndroid Build Coastguard Worker {
1117*35ffd701SAndroid Build Coastguard Worker switch(SwizzleMaxXfer.Width)
1118*35ffd701SAndroid Build Coastguard Worker {
1119*35ffd701SAndroid Build Coastguard Worker case 16:
1120*35ffd701SAndroid Build Coastguard Worker {
1121*35ffd701SAndroid Build Coastguard Worker if(StreamingLoadSupported)
1122*35ffd701SAndroid Build Coastguard Worker {
1123*35ffd701SAndroid Build Coastguard Worker XFER(MOVDQU_M, MOVNTDQA_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
1124*35ffd701SAndroid Build Coastguard Worker }
1125*35ffd701SAndroid Build Coastguard Worker else
1126*35ffd701SAndroid Build Coastguard Worker {
1127*35ffd701SAndroid Build Coastguard Worker XFER(MOVDQU_M, MOVDQ_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
1128*35ffd701SAndroid Build Coastguard Worker }
1129*35ffd701SAndroid Build Coastguard Worker break;
1130*35ffd701SAndroid Build Coastguard Worker }
1131*35ffd701SAndroid Build Coastguard Worker #ifdef INTEL_TILE_W_SUPPORT
1132*35ffd701SAndroid Build Coastguard Worker case 2: XFER(MOVW_M, MOVW_R, 2, 2, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 2, 1); break;
1133*35ffd701SAndroid Build Coastguard Worker #endif
1134*35ffd701SAndroid Build Coastguard Worker default: assert(0);
1135*35ffd701SAndroid Build Coastguard Worker }
1136*35ffd701SAndroid Build Coastguard Worker }
1137*35ffd701SAndroid Build Coastguard Worker
1138*35ffd701SAndroid Build Coastguard Worker
1139*35ffd701SAndroid Build Coastguard Worker // Swizzled inc of SwizzledOffsetY...
1140*35ffd701SAndroid Build Coastguard Worker SwizzledOffsetY = (SwizzledOffsetY - MaskY[xferHeight]) & MaskY[xferHeight];
1141*35ffd701SAndroid Build Coastguard Worker if(!SwizzledOffsetY) SwizzledOffsetX0 += BytesPerRowOfTiles; // Wraps advance SwizzledOffsetX0, since that includes "bits beyond the tile".
1142*35ffd701SAndroid Build Coastguard Worker
1143*35ffd701SAndroid Build Coastguard Worker y += xferHeight;
1144*35ffd701SAndroid Build Coastguard Worker
1145*35ffd701SAndroid Build Coastguard Worker /* The X-loop advanced pLinearAddress by only CopyWidthBytes, even
1146*35ffd701SAndroid Build Coastguard Worker when it transferred multiple lines. Advance the rest of the way: */
1147*35ffd701SAndroid Build Coastguard Worker pLinearAddress += xferHeight * pLinearSurface->Pitch - CopyWidthBytes;
1148*35ffd701SAndroid Build Coastguard Worker
1149*35ffd701SAndroid Build Coastguard Worker } // foreach(y)
1150*35ffd701SAndroid Build Coastguard Worker
1151*35ffd701SAndroid Build Coastguard Worker _mm_sfence(); // Flush Non-Temporal Writes
1152*35ffd701SAndroid Build Coastguard Worker
1153*35ffd701SAndroid Build Coastguard Worker #if(_MSC_VER)
1154*35ffd701SAndroid Build Coastguard Worker #pragma warning(pop)
1155*35ffd701SAndroid Build Coastguard Worker #endif
1156*35ffd701SAndroid Build Coastguard Worker }
1157*35ffd701SAndroid Build Coastguard Worker #endif
1158*35ffd701SAndroid Build Coastguard Worker }
1159*35ffd701SAndroid Build Coastguard Worker } // CpuSwizzleBlt
1160*35ffd701SAndroid Build Coastguard Worker
1161*35ffd701SAndroid Build Coastguard Worker #endif // #ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
1162*35ffd701SAndroid Build Coastguard Worker // clang-format on
1163