/*==============================================================================
Copyright(c) 2017 Intel Corporation

Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files(the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and / or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included
in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
============================================================================*/
// clang-format off
// CpuSwizzleBlt.c - Surface swizzling definitions and BLT functionality.

// [!] File serves as its own header:
//     #define INCLUDE_CpuSwizzleBlt_c_AS_HEADER
//     #include "CpuSwizzleBlt.c"

#define SUB_ELEMENT_SUPPORT  // Support for Partial Element Transfer (e.g. separating/merging depth-stencil).
#define INTEL_TILE_W_SUPPORT // Stencil Only.

#ifndef CpuSwizzleBlt_INCLUDED

#ifdef __cplusplus
extern "C" {
#endif

// Background ##################################################################

/* Pixel-based surfaces are commonly stored in memory row-by-row. This
convention has simple "y * Pitch + x" addressing but has spatial locality only
in the horizontal direction--i.e. horizontal pixel neighbors are stored next to
each other but vertical neighbors are stored an entire pitch away.

Since many graphics operations involve multi-dimensional data access, to
improve cache/memory access performance it is often more beneficial to use
alternative storage conventions which have multi-dimensional spatial locality--
i.e. where pixels tend to be stored near both their horizontal and vertical
neighbors.

"Tiling/Swizzling" is a storage convention that increases multi-dimensional
spatial locality by treating a surface as a series of smaller regions/"tiles",
laid out in row-major order across the surface, with the entire content of each
tile stored contiguously. Data within each tile is stored in a pattern that
further maximizes the locality. */


// Swizzle Descriptors #########################################################

/* Tile sizes are always powers of 2 and chosen to be architecturally
convenient--e.g. 4KB to match the physical page size. Tile dimensions are also
powers of 2, usually chosen to produce square tiles for the targeted pixel
size--e.g. 4KB = 128 bytes x 32 rows = 32 x 32 pixels @ 4 bytes-per-pixel.

Since tile size and dimensions are all powers of two, the spatial-to-linear
mapping required to store a tile can be trivial: spatial indexing bits can
simply be mapped to linear offset bits--e.g. for a 4KB, 128x32 tile...each byte
within the tile can be referenced with a 7-bit X index and 5-bit Y index--and
each of those 12 index bits can be individually mapped to a bit in the 12-bit
offset of the tile's linear storage.

The order in which spatial index bits are mapped to linear offset bits
determines the spatial locality properties of the surface data. E.g. the
following mapping...

    Linear[11:0] = Y4 Y3 Y2 Y1 Y0 X6 X5 X4 X3 X2 X1 X0
                   \-- Y[4:0] --/ \----- X[6:0] -----/

...stores bytes of the tile in row-major order, with horizontal neighbors
stored contiguously and vertical neighbors stored 128 bytes away. If instead,
the Y index bits were mapped to the low-order...

    Linear[11:0] = X6 X5 X4 X3 X2 X1 X0 Y4 Y3 Y2 Y1 Y0
                   \----- X[6:0] -----/ \-- Y[4:0] --/

...bytes of the tile would be stored in column-major order, with vertical
neighbors stored contiguously and horizontal neighbors stored 32 bytes away.

Individual X and Y bits can be separated and interspersed in the mapping to
increase locality via sub-tiling--e.g...

    Linear[11:0] = Y4 Y3 Y2 X6 X5 X4 Y1 Y0 X3 X2 X1 X0
                                    \---- Sub-Tile ---/

...subdivides the tile into 16x4 sub-tiles laid out in row-major order across
the tile, with sub-tile content further stored in row-major order, with
horizontal byte neighbors within a sub-tile stored contiguously and vertical
neighbors only 16 bytes away. This means a single 64-byte cache line contains a
4x4 group of 32bpp pixels--which is powerful spatial locality for graphics
processing.

If mappings are restricted to being "parallel" for index bits (i.e. bits of a
given index can change position but not relative order during mapping), then
bit indexes need not be explicitly denoted--e.g. the previous sub-tiling
mapping can be represented as...

    Linear[11:0] = Y Y Y X X X Y Y X X X X

...where X and Y index bits are implied to be zero-based-counted in the order
they are encountered.

In software, the spatial-to-linear mapping is conveniently described with a bit
mask for each dimension, where a set bit indicates the next bit of that
dimension's index is mapped to that position in the linear offset--e.g....

    Linear[11:0] = Y Y Y X X X Y Y X X X X
    MaskX =        0 0 0 1 1 1 0 0 1 1 1 1
    MaskY =        1 1 1 0 0 0 1 1 0 0 0 0

Such dimensional masks are all that's needed to describe a given
tiling/swizzling convention, since tile size and dimensions can be derived from
the masks:

    TileWidth  = 2 ^ NumberOfSetBits(MaskX)
    TileHeight = 2 ^ NumberOfSetBits(MaskY)
    TileSize   = 2 ^ NumberOfSetBits(MaskX OR MaskY)

Tiling/swizzling is not limited to 2D. With the addition of another tile
dimension, spatial locality for 3D or MSAA sample neighbors can be controlled,
also. */

typedef struct _SWIZZLE_DESCRIPTOR {
    struct _SWIZZLE_DESCRIPTOR_MASKS {
        int x, y, z;
    } Mask;
} SWIZZLE_DESCRIPTOR;
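
/* Illustrative sketch (not part of the library interface): deriving tile
dimensions from a descriptor's masks, per the formulas above--PopCnt standing
for any population-count routine (e.g. the POPCNT16 macro defined later in
this file)...

    int TileWidthBytes = 1 << PopCnt(pSwizzle->Mask.x);
    int TileHeight     = 1 << PopCnt(pSwizzle->Mask.y);
    int TileSizeBytes  = 1 << PopCnt(pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z);

...which is exactly how SwizzleOffset (below) computes its TileWidthBits/
TileHeightBits/TileDepthBits values. */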

typedef enum _EXTERNAL_SWIZZLE_NAME
{
    TILEX = 0,
    TILEY,
    TILEW,
    TILEYS,
    TILEYF
} EXTERNAL_SWIZZLE_NAME;

typedef enum _EXTERNAL_RES_TYPE
{
    Res_2D = 0,
    Res_3D = 1,
    MSAA_2X,
    MSAA_4X,
    MSAA_8X,
    MSAA_16X
} EXTERNAL_RES_TYPE;

// Definition Helper Macros...
#define X ,'x'
#define Y ,'y'
#define Z ,'z'
#define S ,'z' // S = MSAA Sample Index
#define o ,0   // o = N/A Swizzle Bit
#ifdef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
    #define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
        extern const SWIZZLE_DESCRIPTOR Name;
#else // C Compile...
    #define __SWIZZLE(Name, b15, b14, b13, b12, b11, b10, b9, b8, b7, b6, b5, b4, b3, b2, b1, b0) \
        const SWIZZLE_DESCRIPTOR Name = \
        { (b15 == 'x' ? 0x8000 : 0) + (b14 == 'x' ? 0x4000 : 0) + (b13 == 'x' ? 0x2000 : 0) + (b12 == 'x' ? 0x1000 : 0) + (b11 == 'x' ? 0x0800 : 0) + (b10 == 'x' ? 0x0400 : 0) + (b9 == 'x' ? 0x0200 : 0) + (b8 == 'x' ? 0x0100 : 0) + (b7 == 'x' ? 0x0080 : 0) + (b6 == 'x' ? 0x0040 : 0) + (b5 == 'x' ? 0x0020 : 0) + (b4 == 'x' ? 0x0010 : 0) + (b3 == 'x' ? 0x0008 : 0) + (b2 == 'x' ? 0x0004 : 0) + (b1 == 'x' ? 0x0002 : 0) + (b0 == 'x' ? 0x0001 : 0), \
          (b15 == 'y' ? 0x8000 : 0) + (b14 == 'y' ? 0x4000 : 0) + (b13 == 'y' ? 0x2000 : 0) + (b12 == 'y' ? 0x1000 : 0) + (b11 == 'y' ? 0x0800 : 0) + (b10 == 'y' ? 0x0400 : 0) + (b9 == 'y' ? 0x0200 : 0) + (b8 == 'y' ? 0x0100 : 0) + (b7 == 'y' ? 0x0080 : 0) + (b6 == 'y' ? 0x0040 : 0) + (b5 == 'y' ? 0x0020 : 0) + (b4 == 'y' ? 0x0010 : 0) + (b3 == 'y' ? 0x0008 : 0) + (b2 == 'y' ? 0x0004 : 0) + (b1 == 'y' ? 0x0002 : 0) + (b0 == 'y' ? 0x0001 : 0), \
          (b15 == 'z' ? 0x8000 : 0) + (b14 == 'z' ? 0x4000 : 0) + (b13 == 'z' ? 0x2000 : 0) + (b12 == 'z' ? 0x1000 : 0) + (b11 == 'z' ? 0x0800 : 0) + (b10 == 'z' ? 0x0400 : 0) + (b9 == 'z' ? 0x0200 : 0) + (b8 == 'z' ? 0x0100 : 0) + (b7 == 'z' ? 0x0080 : 0) + (b6 == 'z' ? 0x0040 : 0) + (b5 == 'z' ? 0x0020 : 0) + (b4 == 'z' ? 0x0010 : 0) + (b3 == 'z' ? 0x0008 : 0) + (b2 == 'z' ? 0x0004 : 0) + (b1 == 'z' ? 0x0002 : 0) + (b0 == 'z' ? 0x0001 : 0) }
#endif
#define SWIZZLE(__SWIZZLE_Args) __SWIZZLE __SWIZZLE_Args
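
/* For example (illustration only): the INTEL_TILE_Y definition below expands
to the constant descriptor...

    const SWIZZLE_DESCRIPTOR INTEL_TILE_Y = { 0x0E0F, 0x01F0, 0x0000 };

...i.e. X index bits at linear offset positions [11:9] and [3:0], and Y index
bits at [8:4]--a 128-byte x 32-row (4KB) tile, per the derivations above. */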

// Legacy Intel Tiling Swizzles...
SWIZZLE(( INTEL_TILE_X                o o o o Y Y Y X X X X X X X X X ));
SWIZZLE(( INTEL_TILE_Y                o o o o X X X Y Y Y Y Y X X X X ));

#ifdef INTEL_TILE_W_SUPPORT
SWIZZLE(( INTEL_TILE_W                o o o o X X X Y Y Y Y X Y X Y X ));
#endif
// Gen9 Swizzles...
SWIZZLE(( INTEL_TILE_YF_128           o o o o X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_64            o o o o X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_32            o o o o X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_16            o o o o X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_8             o o o o X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_128           X Y X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_64            X Y X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_32            X Y X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_16            X Y X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_8             X Y X Y X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YF_MSAA2_128     o o o o S Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA2_64      o o o o S Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA2_32      o o o o S Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA2_16      o o o o S Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA2_8       o o o o S Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_MSAA2_128     S Y X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA2_64      S Y X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA2_32      S Y X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA2_16      S Y X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA2_8       S Y X Y X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YF_MSAA4_128     o o o o S S X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA4_64      o o o o S S X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA4_32      o o o o S S X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA4_16      o o o o S S X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA4_8       o o o o S S X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_MSAA4_128     S S X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA4_64      S S X Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA4_32      S S X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA4_16      S S X Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA4_8       S S X Y X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YF_MSAA8_128     o o o o S S S Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA8_64      o o o o S S S Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA8_32      o o o o S S S Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA8_16      o o o o S S S Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA8_8       o o o o S S S Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_MSAA8_128     S S S Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA8_64      S S S Y X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA8_32      S S S Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA8_16      S S S Y X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA8_8       S S S Y X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YF_MSAA16_128    o o o o S S S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA16_64     o o o o S S S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA16_32     o o o o S S S S X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA16_16     o o o o S S S S X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_MSAA16_8      o o o o S S S S Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_MSAA16_128    S S S S X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA16_64     S S S S X Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA16_32     S S S S X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA16_16     S S S S X Y X Y X Y Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_MSAA16_8      S S S S X Y X Y Y Y Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YF_3D_128        o o o o Y Z X X Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_3D_64         o o o o Y Z X X Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_3D_32         o o o o Y Z X Y Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_3D_16         o o o o Y Z Y Z Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YF_3D_8          o o o o Y Z Y Z Z Z Y Y X X X X ));

SWIZZLE(( INTEL_TILE_YS_3D_128        X Y Z X Y Z X X Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_3D_64         X Y Z X Y Z X X Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_3D_32         X Y Z X Y Z X Y Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_3D_16         X Y Z X Y Z Y Z Z Z Y Y X X X X ));
SWIZZLE(( INTEL_TILE_YS_3D_8          X Y Z X Y Z Y Z Z Z Y Y X X X X ));

// XE_HP_SDV Swizzles...
SWIZZLE(( INTEL_TILE_4                o o o o Y Y X Y X X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_128           Y X X X Y Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_64            Y X X X Y Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_32            Y Y X X Y Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_16            Y Y X X Y Y X Y X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_8             Y Y Y X Y Y X Y X X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_MSAA2_128     Y X X X Y Y X Y S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA2_64      Y X X X Y Y X Y S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA2_32      Y Y X X Y Y X Y S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA2_16      Y Y X X Y Y X Y S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA2_8       Y Y Y X Y Y X Y S X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_MSAA_128      Y X X X Y Y X S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA_64       Y X X X Y Y X S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA_32       Y Y X X Y Y X S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA_16       Y Y X X Y Y X S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_MSAA_8        Y Y Y X Y Y X S S X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_3D_128        Z Z Y X X X Z Y Z X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_3D_64         Z Z Y X X X Z Y Z X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_3D_32         Z Z Y X Y X Z Y Z X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_3D_16         Z Z Z Y Y X Z Y Z X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_3D_8          Z Z Z X Y Y Z Y Z X Y Y X X X X ));

// Tile64 updated layout for Render Compression 256B and Physical L3...

SWIZZLE(( INTEL_TILE_64_V2_MSAA2_128  Y X X X Y Y X S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA2_64   Y Y X X Y Y X S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA2_32   Y Y Y X Y Y X S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA2_16   Y Y Y X Y Y X S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA2_8    Y Y Y Y Y Y X S X X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_V2_MSAA4_128  Y X X X Y Y S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA4_64   Y X X X Y Y S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA4_32   Y Y X X Y Y S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA4_16   Y Y X X Y Y S S X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA4_8    Y Y Y X Y Y S S X X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_V2_MSAA8_128  Y Y X X Y X S S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA8_64   Y Y X X Y X S S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA8_32   Y Y X X Y X S S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA8_16   Y Y Y X Y X S S S X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA8_8    Y Y Y X Y X S S S X Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_V2_MSAA16_128 Y X X X Y X S S S S Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA16_64  Y Y X X Y X S S S S Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA16_32  Y Y X X Y X S S S S Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA16_16  Y Y X X Y X S S S S Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_MSAA16_8   Y Y Y X Y X S S S S Y Y X X X X ));

SWIZZLE(( INTEL_TILE_64_V2_3D_128     Z Z Y X X Y Z Z X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_3D_64      Z Z Y X X Y Z Z X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_3D_32      Z Z Y X Y Y Z Z X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_3D_16      Z Z Z Y Y Y Z Z X X Y Y X X X X ));
SWIZZLE(( INTEL_TILE_64_V2_3D_8       Z Z Z Y Y Y Z Z X X Y Y X X X X ));

#undef X
#undef Y
#undef Z
#undef S
#undef o
#undef __SWIZZLE
#undef SWIZZLE

// Accessing Swizzled Surface ##################################################

/* While graphics hardware prefers to access surfaces stored in tiled/swizzled
formats, logically accessing such surfaces with CPU-based software is non-
trivial when high throughput is the goal.

This file implements (1) a SwizzleOffset function to compute the swizzled
offset of a dimensionally-specified surface byte, and (2) a CpuSwizzleBlt
function to BLT between linear ("y * pitch + x") and swizzled surfaces--with
the goal of providing a high-performance swizzling BLT implementation to be
used both in production and as a guide for those seeking to understand
swizzled access or implement functionality beyond the simple BLT. */

// Surface Descriptor for CpuSwizzleBlt function...
typedef struct _CPU_SWIZZLE_BLT_SURFACE
{
    void                     *pBase;        // Pointer to surface base.
    int                      Pitch, Height; // Row-pitch in bytes, and height, of surface.
    const SWIZZLE_DESCRIPTOR *pSwizzle;     // Pointer to surface's swizzle descriptor, or NULL if unswizzled.
    int                      OffsetX;       // Horizontal offset into surface for BLT rectangle, in bytes.
    int                      OffsetY;       // Vertical offset into surface for BLT rectangle, in physical/pitch rows.
    int                      OffsetZ;       // Zero if N/A, or 3D offset into surface for BLT rectangle, in 3D slices or MSAA samples as appropriate.

    #ifdef SUB_ELEMENT_SUPPORT
        struct _CPU_SWIZZLE_BLT_SURFACE_ELEMENT
        {
            int Pitch, Size; // Zero if full-pixel BLT, or pitch and size, in bytes, of pixel element being BLT'ed.
        } Element;

        /* e.g. to BLT only stencil data from S8D24 surface to S8 surface...
            Dest.Element.Size = Src.Element.Size = sizeof(S8) = 1;
            Dest.Element.Pitch = sizeof(S8) = 1;
            Src.Element.Pitch = sizeof(S8D24) = 4;
            Src.OffsetX += BYTE_OFFSET_OF_S8_WITHIN_S8D24; */
    #endif
} CPU_SWIZZLE_BLT_SURFACE;

extern int SwizzleOffset(const SWIZZLE_DESCRIPTOR *pSwizzle, int Pitch, int OffsetX, int OffsetY, int OffsetZ);
extern void CpuSwizzleBlt(CPU_SWIZZLE_BLT_SURFACE *pDest, CPU_SWIZZLE_BLT_SURFACE *pSrc, int CopyWidthBytes, int CopyHeight);
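
/* Minimal usage sketch (illustration only--pSystemMemory/pTiledMemory are
hypothetical, suitably-allocated/aligned allocations): copy a 256x64-byte
rectangle from the top-left of a linear surface into an INTEL_TILE_Y-swizzled
surface...

    CPU_SWIZZLE_BLT_SURFACE Linear = {0}, Tiled = {0};

    Linear.pBase    = pSystemMemory; Linear.Pitch = 1024; Linear.Height = 256;
    Linear.pSwizzle = NULL; // i.e. unswizzled.

    Tiled.pBase     = pTiledMemory;  Tiled.Pitch  = 1024; Tiled.Height  = 256;
    Tiled.pSwizzle  = &INTEL_TILE_Y;

    CpuSwizzleBlt(&Tiled, &Linear, 256, 64); */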

#ifdef __cplusplus
}
#endif

#define CpuSwizzleBlt_INCLUDED

#endif


#ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER

//#define MINIMALIST // Use minimalist, unoptimized implementation.

#include "assert.h" // Quoted to allow local-directory override.

#if(_MSC_VER >= 1400)
    #include <intrin.h>
#elif defined(__ARM_ARCH)
    #include <sse2neon.h>
#elif((defined __clang__) || (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
    #include <cpuid.h>
    #include <x86intrin.h>
#else
    #error "Unexpected compiler!"
#endif


// POPCNT: Count Lit Bits...           0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
static unsigned char PopCnt4[16] =    {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
#define POPCNT4(x)  (PopCnt4[(x) & 0xf])
#define POPCNT16(x) (POPCNT4((x) >> 12) + POPCNT4((x) >> 8) + POPCNT4((x) >> 4) + POPCNT4(x))
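// e.g. POPCNT16(0x01F0) == 5 --the set-bit count of INTEL_TILE_Y's Mask.y.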


int SwizzleOffset( // ##########################################################

/* Return swizzled offset of dimensionally-specified surface byte. */

    const SWIZZLE_DESCRIPTOR *pSwizzle, // Pointer to applicable swizzle descriptor.
    int Pitch,                          // Applicable surface row-pitch, in bytes.
    int OffsetX,                        // Horizontal offset into surface of the target byte, in bytes.
    int OffsetY,                        // Vertical offset into surface of the target byte, in physical/pitch rows.
    int OffsetZ)                        // Zero if N/A, or 3D offset into surface of the target byte, in 3D slices or MSAA samples as appropriate.

/* Given the logically-specified (x, y, z) byte within a swizzled surface, this
function returns the byte's linear/memory offset from the surface's base--i.e.
it performs the swizzled, spatial-to-linear mapping.

The function makes no real effort to perform optimally, since it should only be
used outside loops in CpuSwizzleBlt and similar functions. If any of this
functionality was needed in a performance path, a custom implementation should
be used that limits itself to the functionality specifically needed (probably
single-dimension, intra-tile offsets) and uses a fast computation (e.g. LUT's,
hard-codings, PDEP). */

{ // ###########################################################################

    static char PDepSupported = -1; // Cached across calls--AVX2/BMI2 PDEP (Parallel Deposit) instruction support.

    int SwizzledOffset; // Return value being computed.

    int TileWidthBits  = POPCNT16(pSwizzle->Mask.x); // Log2(Tile Width in Bytes)
    int TileHeightBits = POPCNT16(pSwizzle->Mask.y); // Log2(Tile Height)
    int TileDepthBits  = POPCNT16(pSwizzle->Mask.z); // Log2(Tile Depth or MSAA Samples)
    int TileSizeBits   = TileWidthBits + TileHeightBits + TileDepthBits; // Log2(Tile Size in Bytes)
    int TilesPerRow    = Pitch >> TileWidthBits; // Surface Width in Tiles

    int Row, Col; // Tile grid position on surface, of tile containing specified byte.
    int x, y, z;  // Position of specified byte within tile that contains it.

    if(PDepSupported == -1)
    {
        #if(_MSC_VER >= 1700)
            #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
            int CpuInfo[4];
            __cpuidex(CpuInfo, 7, 0);
            PDepSupported = ((CpuInfo[1] & (1 << 8)) != 0); // EBX[8] = BMI2
        #elif(defined(__BMI2__))
            #define PDEP(Src, Mask) _pdep_u32((Src), (Mask))
            unsigned int eax, ebx, ecx, edx;
            __cpuid_count(7, 0, eax, ebx, ecx, edx);
            PDepSupported = ((ebx & (1 << 8)) != 0); // EBX[8] = BMI2
        #else
            #define PDEP(Src, Mask) 0
            PDepSupported = 0;
        #endif
    }
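
    /* For reference (illustration only): PDEP deposits ascending low-order
    source bits into the positions of the mask's set bits--e.g....

        PDEP(0b101, 0b11100) == 0b10100

    ...so PDEP(y, Mask.y) directly produces y's contribution to the swizzled
    intra-tile offset. */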

    assert( // Mutually Exclusive Swizzle Positions...
        (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) ==
        (pSwizzle->Mask.x + pSwizzle->Mask.y + pSwizzle->Mask.z));

    assert( // Swizzle Limited to 16-bit (else expand POPCNT'ing)...
        (pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z) < (1 << 16));

    assert( // Pitch is Multiple of Tile Width...
        Pitch == ((Pitch >> TileWidthBits) << TileWidthBits));

    { // Break Positioning into Tile-Granular and Intra-Tile Components...
        assert((OffsetZ >> TileDepthBits) == 0); // When dealing with 3D tiling, treat as separate single-tile-deep planes.
        z = OffsetZ & ((1 << TileDepthBits) - 1);

        Row = OffsetY >> TileHeightBits;
        y   = OffsetY & ((1 << TileHeightBits) - 1);

        Col = OffsetX >> TileWidthBits;
        x   = OffsetX & ((1 << TileWidthBits) - 1);
    }

    SwizzledOffset = // Start with surface offset of given tile...
        (Row * TilesPerRow + Col) << TileSizeBits; // <-- Tiles laid across surface in row-major order.

    // ...then add (equivalently, OR--the masks are disjoint) swizzled offset of byte within tile...
    if(PDepSupported)
    {
        SwizzledOffset +=
            PDEP(x, pSwizzle->Mask.x) +
            PDEP(y, pSwizzle->Mask.y) +
            PDEP(z, pSwizzle->Mask.z);
    }
    else // PDEP workalike...
    {
        // Walk the linear offset bits from LSB upward, depositing the next
        // bit of x/y/z wherever that dimension's mask has a set bit.
        int bitIndex = 0, bitMask = 1;
        int terminationMask = pSwizzle->Mask.x | pSwizzle->Mask.y | pSwizzle->Mask.z;
        while(bitMask < terminationMask)
        {
            int MaskQ;
            #define PROCESS(Q) { \
                MaskQ = bitMask & pSwizzle->Mask.Q; \
                SwizzledOffset += Q & MaskQ; /* Deposit Q's next index bit if this position belongs to Q. */ \
                Q <<= 1 ^ (MaskQ >> bitIndex); /* If not consumed, shift Q so its next bit tracks bitMask. */ \
            }
            PROCESS(x);
            PROCESS(y);
            PROCESS(z);

            bitIndex++;
            bitMask <<= 1;

            #undef PROCESS
        }
    }

    return(SwizzledOffset);
}


void CpuSwizzleBlt( // #########################################################

/* Performs specified swizzling BLT between two given surfaces. */

    CPU_SWIZZLE_BLT_SURFACE *pDest, // Pointer to destination surface descriptor.
    CPU_SWIZZLE_BLT_SURFACE *pSrc,  // Pointer to source surface descriptor.
    int CopyWidthBytes,             // Width of BLT rectangle, in bytes.
    int CopyHeight)                 // Height of BLT rectangle, in physical/pitch rows.

#ifdef SUB_ELEMENT_SUPPORT

    /* When copying between surfaces with different pixel pitches, specify
    CopyWidthBytes in terms of the unswizzled surface's element pitch:

        CopyWidthBytes = CopyWidthPixels * pLinearSurface.Element.Pitch; */

#endif

{ // ###########################################################################

    CPU_SWIZZLE_BLT_SURFACE *pLinearSurface, *pSwizzledSurface;
    int LinearToSwizzled;

    { // One surface swizzled, the other unswizzled (aka "linear")...
        assert((pDest->pSwizzle != NULL) ^ (pSrc->pSwizzle != NULL));

        LinearToSwizzled = !pSrc->pSwizzle;
        if(LinearToSwizzled)
        {
            pSwizzledSurface = pDest;
            pLinearSurface   = pSrc;
        }
        else // Swizzled-to-Linear...
        {
            pSwizzledSurface = pSrc;
            pLinearSurface   = pDest;
        }
    }

    #ifdef SUB_ELEMENT_SUPPORT
    {
        assert( // Either both or neither specified...
            (pDest->Element.Pitch != 0) == (pSrc->Element.Pitch != 0));

        assert( // Surfaces agree on transfer element size...
            pDest->Element.Size == pSrc->Element.Size);

        assert( // Element pitch not specified without element size...
            !(pDest->Element.Pitch && !pDest->Element.Size));

        assert( // Legit element sizes...
            (pDest->Element.Size <= pDest->Element.Pitch) &&
            (pSrc->Element.Size <= pSrc->Element.Pitch));

        assert( // Sub-element CopyWidthBytes in terms of LinearSurface pitch...
            (pLinearSurface->Element.Pitch == 0) ||
            ((CopyWidthBytes % pLinearSurface->Element.Pitch) == 0));
    }
    #endif

    { // No surface overrun...
        int NoOverrun =
            #ifdef SUB_ELEMENT_SUPPORT
            (
                // Sub-element transfer...
                ((pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
                 (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch)) &&
                // No overrun...
                ((pLinearSurface->OffsetX + CopyWidthBytes) <=
                 (pLinearSurface->Pitch +
                  // CopyWidthBytes's inclusion of uncopied bytes...
                  (pLinearSurface->Element.Pitch - pLinearSurface->Element.Size))) &&
                ((pLinearSurface->OffsetY + CopyHeight) <= pLinearSurface->Height) &&
                ((pSwizzledSurface->OffsetX +
                  // Adjust CopyWidthBytes from being in terms of LinearSurface pitch...
                  (CopyWidthBytes / pLinearSurface->Element.Pitch * pSwizzledSurface->Element.Pitch)
                 ) <=
                 (pSwizzledSurface->Pitch +
                  // CopyWidthBytes's inclusion of uncopied bytes...
                  (pSwizzledSurface->Element.Pitch - pSwizzledSurface->Element.Size))) &&
                ((pSwizzledSurface->OffsetY + CopyHeight) <= pSwizzledSurface->Height)
            ) ||
            #endif

            ((pDest->OffsetX + CopyWidthBytes) <= pDest->Pitch) &&
            ((pDest->OffsetY + CopyHeight) <= pDest->Height) &&
            ((pSrc->OffsetX + CopyWidthBytes) <= pSrc->Pitch) &&
            ((pSrc->OffsetY + CopyHeight) <= pSrc->Height);

        assert(NoOverrun);
    }

    { // No surface overlap...
        char *pDest0 = (char *) pDest->pBase;
        char *pDest1 = (char *) pDest->pBase + pDest->Pitch * CopyHeight;
        char *pSrc0  = (char *) pSrc->pBase;
        char *pSrc1  = (char *) pSrc->pBase + pSrc->Pitch * CopyHeight;

        assert(!(
            ((pDest0 >= pSrc0) && (pDest0 < pSrc1)) ||
            ((pSrc0 >= pDest0) && (pSrc0 < pDest1))));
    }

    {
        /* The BLT will have a pointer in each surface between which data will
        be copied from source to destination. Each pointer will be
        appropriately incremented/positioned through its surface, as the BLT
        rectangle is traversed. */

        char *pLinearAddress, *pSwizzledAddress;

        // Convenient to track traversal in swizzled surface offsets...
        int x0 = pSwizzledSurface->OffsetX;
        int x1 = x0 + CopyWidthBytes;
        int y0 = pSwizzledSurface->OffsetY;
        int y1 = y0 + CopyHeight;
        int x, y;

        // Start linear pointer at specified base...
        pLinearAddress =
            (char *) pLinearSurface->pBase +
            pLinearSurface->OffsetY * pLinearSurface->Pitch +
            pLinearSurface->OffsetX;

        #ifdef MINIMALIST // Simple implementation for functional understanding/testing/etc.
        {
            #ifdef SUB_ELEMENT_SUPPORT
                assert( // No Sub-Element Transfer...
                    (pLinearSurface->Element.Size == pLinearSurface->Element.Pitch) &&
                    (pSwizzledSurface->Element.Size == pSwizzledSurface->Element.Pitch));
            #endif

            for(y = y0; y < y1; y++)
            {
                for(x = x0; x < x1; x++)
                {
                    pSwizzledAddress =
                        (char *) pSwizzledSurface->pBase +
                        SwizzleOffset(
                            pSwizzledSurface->pSwizzle,
                            pSwizzledSurface->Pitch,
                            x, y, pSwizzledSurface->OffsetZ);

                    if(LinearToSwizzled)
                    {
                        *pSwizzledAddress = *pLinearAddress;
                    }
                    else
                    {
                        *pLinearAddress = *pSwizzledAddress;
                    }

                    pLinearAddress++;
                }

                pLinearAddress += pLinearSurface->Pitch - CopyWidthBytes;
            }
        }
        #else // Production/Performance Implementation...
        {
            /* Key Performance Gains from...
                (1) Efficient Memory Transfers (Ordering + Instruction)
                (2) Minimizing Work in Inner Loops */

            #if(_MSC_VER >= 1600)
                #include <stdint.h>

                #pragma warning(push)
                #pragma warning(disable:4127) // Constant Conditional Expressions

                unsigned long LOW_BIT_Index;
                #define LOW_BIT(x)  (_BitScanForward(&LOW_BIT_Index, (x)), LOW_BIT_Index)

                unsigned long HIGH_BIT_Index;
                #define HIGH_BIT(x) (_BitScanReverse(&HIGH_BIT_Index, (x)), HIGH_BIT_Index)
            #elif(__GNUC__ >= 4)
                #include <stdint.h>
                #include <limits.h> // CHAR_BIT (used by HIGH_BIT below).

                #define LOW_BIT(x)  __builtin_ctz(x)
                #define HIGH_BIT(x) ((sizeof(x) * CHAR_BIT - 1) - __builtin_clz(x))
            #else
                #error "Unexpected compiler!"
            #endif

            typedef struct ___m24
            {
                uint8_t byte[3];
            } __m24; // 24-bit/3-byte memory element.

            // Macros intended to compile to various types of "load register from memory" instructions...
            #define MOVB_R(  Reg, Src) (*(uint8_t  *)&(Reg) = *(uint8_t  *)(Src))
            #define MOVW_R(  Reg, Src) (*(uint16_t *)&(Reg) = *(uint16_t *)(Src))
            #define MOV3_R(  Reg, Src) (*(__m24    *)&(Reg) = *(__m24    *)(Src))
            #define MOVD_R(  Reg, Src) (*(uint32_t *)&(Reg) = *(uint32_t *)(Src))

            #define MOVQ_R(  Reg, Src) ((Reg) = _mm_loadl_epi64((__m128i *)(Src)))
            #define MOVDQ_R( Reg, Src) ((Reg) = _mm_load_si128( (__m128i *)(Src)))
            #define MOVDQU_R(Reg, Src) ((Reg) = _mm_loadu_si128((__m128i *)(Src)))

            // As above, but the other half: "store to memory from register"...
            #define MOVB_M(   Dest, Reg) (*(uint8_t  *)(Dest) = *(uint8_t  *)&(Reg))
            #define MOVW_M(   Dest, Reg) (*(uint16_t *)(Dest) = *(uint16_t *)&(Reg))
            #define MOV3_M(   Dest, Reg) (*(__m24    *)(Dest) = *(__m24    *)&(Reg))
            #define MOVD_M(   Dest, Reg) (*(uint32_t *)(Dest) = *(uint32_t *)&(Reg))

            #define MOVQ_M(   Dest, Reg) (_mm_storel_epi64((__m128i *)(Dest), (Reg)))
            #define MOVDQ_M(  Dest, Reg) (_mm_store_si128( (__m128i *)(Dest), (Reg)))
            #define MOVDQU_M( Dest, Reg) (_mm_storeu_si128((__m128i *)(Dest), (Reg)))
            #define MOVNTDQ_M(Dest, Reg) (_mm_stream_si128((__m128i *)(Dest), (Reg)))


            #define MIN_CONTAINED_POW2_BELOW_CAP(x, Cap) (1 << LOW_BIT((1 << LOW_BIT(x)) | (1 << HIGH_BIT(Cap))))
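            // ...i.e. the largest power of two that both divides x and does not exceed Cap (x != 0, Cap >= 1 assumed).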

            #define SWIZZLE_OFFSET(OffsetX, OffsetY, OffsetZ) \
                SwizzleOffset(pSwizzledSurface->pSwizzle, pSwizzledSurface->Pitch, OffsetX, OffsetY, OffsetZ)

            #define MAX_XFER_WIDTH  16 // See "Compute Transfer Dimensions".
            #define MAX_XFER_HEIGHT 4  // "

            static char StreamingLoadSupported = -1; // Cached across calls--SSE4.1: MOVNTDQA

            int TileWidthBits  = POPCNT16(pSwizzledSurface->pSwizzle->Mask.x); // Log2(Tile Width in Bytes)
            int TileHeightBits = POPCNT16(pSwizzledSurface->pSwizzle->Mask.y); // Log2(Tile Height)
            int TileDepthBits  = POPCNT16(pSwizzledSurface->pSwizzle->Mask.z); // Log2(Tile Depth or MSAA Samples)
            int BytesPerRowOfTiles = pSwizzledSurface->Pitch << (TileDepthBits + TileHeightBits);

            struct { int LeftCrust, MainRun, RightCrust; } CopyWidth;
            int MaskX[MAX_XFER_WIDTH + 1], MaskY[MAX_XFER_HEIGHT + 1];
            int SwizzledOffsetX0, SwizzledOffsetY;
            struct { int Width, Height; } SwizzleMaxXfer;

            char *pSwizzledAddressCopyBase =
                (char *) pSwizzledSurface->pBase +
                SWIZZLE_OFFSET(0, 0, pSwizzledSurface->OffsetZ);

            assert(sizeof(__m24) == 3);

            if(StreamingLoadSupported == -1)
            {
                #if(_MSC_VER >= 1500)
                    #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
                    int CpuInfo[4];
                    __cpuid(CpuInfo, 1);
                    StreamingLoadSupported = ((CpuInfo[2] & (1 << 19)) != 0); // ECX[19] = SSE4.1
                #elif(defined(__ARM_ARCH))
                    #define MOVNTDQA_R(Reg, Src) ((Reg) = (Reg))
                    StreamingLoadSupported = 0;
                #elif((defined __clang__) || (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5)))
                    #define MOVNTDQA_R(Reg, Src) ((Reg) = _mm_stream_load_si128((__m128i *)(Src)))
                    unsigned int eax, ebx, ecx, edx;
                    __cpuid(1, eax, ebx, ecx, edx);
                    StreamingLoadSupported = ((ecx & (1 << 19)) != 0); // ECX[19] = SSE4.1
                #else
                    #define MOVNTDQA_R(Reg, Src) ((Reg) = (Reg))
                    StreamingLoadSupported = 0;
                #endif
            }

            { // Compute Transfer Dimensions...

                /* When transferring between linear and swizzled surfaces, we
                can't traverse linearly through the memory of both, since they
                have drastically different memory orderings--moving linearly
                through one means bouncing around the other.

                Moving linearly through the linear surface is more
                programmatically convenient--especially when BLT rectangles are
                not constrained to tile boundaries. But moving linearly through
                the swizzled surface memory is often more performance-friendly--
                especially when that memory is CPU-mapped as WC (Write
                Combining), which is often the case for graphics memory.

                Fortunately, we can avoid the shortcomings of both extremes by
                using a hybrid traversal: Traverse mostly linearly through the
                linear surface, but have the innermost loop transfer small 2D
                chunks sized to use critical runs of linearity in the swizzled
                memory.

                The "critical runs of linearity" that we want to hit in the
                swizzled memory are aligned, cache-line-sized memory chunks. If
                we bounce around with finer granularity we'll incur penalties
                of partial WC buffer use (whether from WC memory use or non-
                temporal stores).

                The size of 2D chunks with cache-line-sized linearity in
                swizzled memory is determined by the swizzle mapping's low-order
                six bits (for 64-byte cache lines). Most swizzles use
                "Y Y X X X X" in their low-order bits, which means their cache
                lines store 16x4 chunks--so our implementation will use those
                dimensions as our target/maximum 2D transfer chunk. If we had
                any 8x8 (or taller) swizzles, we should add such support and
                increase our maximum chunk height. If we had any 32x2 swizzles,
                we should add such support and increase our maximum chunk width.

                Our implementation only bothers optimizing for 2D transfer
                chunks stored in row-major order--i.e. those whose swizzle
                mapping bits have a series of X's in the low-order, followed by
                Y's in the higher-order. Where a swizzle mapping inflection
                from Y back to X occurs, contiguous row-ordering is lost, and
                we use that smaller, row-ordered chunk size. */

                int TargetMask;

                // Narrow optimized transfer width by looking for inflection from X's...
                SwizzleMaxXfer.Width = MAX_XFER_WIDTH;
                while( (TargetMask = SwizzleMaxXfer.Width - 1) &&
                       ((pSwizzledSurface->pSwizzle->Mask.x & TargetMask) != TargetMask))
                {
                    SwizzleMaxXfer.Width >>= 1;
                }

                // Narrow optimized transfer height by looking for inflection from Y's...
                SwizzleMaxXfer.Height = MAX_XFER_HEIGHT;

                while( (TargetMask = (SwizzleMaxXfer.Height - 1) * SwizzleMaxXfer.Width) &&
                       ((pSwizzledSurface->pSwizzle->Mask.y & TargetMask) != TargetMask))
                {
                    SwizzleMaxXfer.Height >>= 1;
                }
            }
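
            /* E.g. (illustration only): INTEL_TILE_Y has "Y Y X X X X" in its
            low-order mapping bits, so the loops above leave the full 16x4
            chunk; INTEL_TILE_X (X's through bit 8) yields 16x1; and
            INTEL_TILE_W narrows all the way to 2-wide chunks. */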

            { // Separate CopyWidthBytes into unaligned left/right "crust" and aligned "MainRun"...
                int MaxXferWidth = MIN_CONTAINED_POW2_BELOW_CAP(SwizzleMaxXfer.Width, CopyWidthBytes);

                CopyWidth.LeftCrust = // i.e. "bytes to xfer-aligned boundary"
                    (MaxXferWidth - x0) & (MaxXferWidth - 1); // Simplification of ((MaxXferWidth - (x0 % MaxXferWidth)) % MaxXferWidth)

                CopyWidth.MainRun =
                    (CopyWidthBytes - CopyWidth.LeftCrust) & ~(SwizzleMaxXfer.Width - 1); // MainRun is of SwizzleMaxXfer.Width's--not MaxXferWidth's.

                CopyWidth.RightCrust = CopyWidthBytes - (CopyWidth.LeftCrust + CopyWidth.MainRun);

                #ifdef SUB_ELEMENT_SUPPORT
                {
                    // For partial-pixel transfers, there is no crust and MainRun is done pixel-by-pixel...
                    if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
                        (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
                    {
                        CopyWidth.LeftCrust = CopyWidth.RightCrust = 0;
                        CopyWidth.MainRun = CopyWidthBytes;
                    }
                }
                #endif
            }


            /* Unlike in the MINIMALIST implementation, which fully computes the
            swizzled offset for each transfer element, we want to minimize the
            work done in our inner loops.

            One way we'll reduce work is to separate pSwizzledAddress into
            dimensional components--e.g. so Y-swizzling doesn't have to be
            recomputed in the X-loop.

            But a more powerful way we'll reduce work is...Instead of linearly
            incrementing spatial offsets and then converting to their swizzled
            counterparts, we'll compute swizzled bases outside the loops and
            keep them swizzled using swizzled incrementing inside the loops--
            since swizzled incrementing can be much cheaper than repeatedly
            swizzling spatial offsets.

            Intra-tile swizzled incrementing can be done by using the inverse
            of a spatial component's swizzle mask to ripple-carry a +1 to and
            across the bits of a currently swizzled value--e.g. with...

                SwizzledOffsetY: Y X Y X Y Y X X X X
                         ~MaskY: 0 1 0 1 0 0 1 1 1 1
                                                 + 1
                ------------------------------------

            ...the set low-order ~MaskY bits will always ripple-carry the
            incrementing +1 to wherever Y0 happens to be, and wherever there is
            an arithmetic carry out of one Y position, set ~MaskY bits will
            carry it across any gaps to the next Y position.

            The above algorithm only works for adding one, but the mask used
            can be modified to deliver the +1 to any bit location, so any power
            of two increment can be achieved.

            After a swizzled increment, residue from the mask addition and
            undesired carries outside the targeted fields must be removed using
            the natural mask--so the final intra-tile swizzled increment is...

                SwizzledOffsetQ = (SwizzledOffsetQ + ~MaskQ + 1) & MaskQ

            ...where Q is the applicable X/Y/Z dimensional component.

            Or since in two's complement, (~MaskQ + 1) = -MaskQ...

                SwizzledOffsetQ = (SwizzledOffsetQ - MaskQ) & MaskQ

            Since tile sizes are powers of two and tiles are laid out in
            row-major order across the surface, the above swizzled incrementing
            can additionally be used for inter-tile incrementing of the X
            component by extending the applicable mask to include offset bits
            beyond the tile--so arithmetic carries out of the intra-tile X
            component will ripple to advance the swizzled inter-tile X offset
            to the next tile. The same is not true of inter-tile Y incrementing,
            since surface pitches are not restricted to powers of two. */
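
            /* Worked example (illustration only), using INTEL_TILE_Y's
            MaskY = 0x01F0: with SwizzledOffsetY = 0x0010 (i.e. y = 1)...

                (0x0010 - 0x01F0) & 0x01F0 == 0x0020  <-- y = 2

            ...the +1 rippled through the set ~MaskY bits below Y0; and from
            SwizzledOffsetY = 0x01F0 (y = 31, the tile's last row), the same
            computation yields 0, wrapping back to y = 0. */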

            { // Compute Mask[IncSize] for Needed Increment Values...
                int ExtendedMaskX = // Bits beyond the tile (so X incrementing can operate inter-tile)...
                    ~(pSwizzledSurface->pSwizzle->Mask.x |
                      pSwizzledSurface->pSwizzle->Mask.y |
                      pSwizzledSurface->pSwizzle->Mask.z);

                /* The subtraction below delivers the natural mask for a +1
                increment, and appropriately altered masks to deliver the +1
                to higher bit positions for +2/4/8/etc. increments. */

                for(x = SwizzleMaxXfer.Width; x >= 1; x >>= 1)
                {
                    MaskX[x] = SWIZZLE_OFFSET((1 << TileWidthBits) - x, 0, 0) | ExtendedMaskX;
                }

                for(y = SwizzleMaxXfer.Height; y >= 1; y >>= 1)
                {
                    MaskY[y] = SWIZZLE_OFFSET(0, (1 << TileHeightBits) - y, 0);
                }
            }

            { // Base Dimensional Swizzled Offsets...
                int IntraTileY   = y0 & ((1 << TileHeightBits) - 1);
                int TileAlignedY = y0 - IntraTileY;

                SwizzledOffsetY = SWIZZLE_OFFSET(0, IntraTileY, 0);

                SwizzledOffsetX0 =
                    SWIZZLE_OFFSET(
                        x0,
                        TileAlignedY, // <-- Since SwizzledOffsetX will include "bits beyond the tile".
                        0);
            }

            // BLT Loops ///////////////////////////////////////////////////////

            /* Traverse the BLT rectangle, transferring small, optimally-aligned
            2D chunks, as appropriate for the given swizzle format. Use swizzled
            incrementing of the dimensional swizzled components. */

            for(y = y0; y < y1; )
            {
                char *pSwizzledAddressLine = pSwizzledAddressCopyBase + SwizzledOffsetY;
                int xferHeight =
                    // Largest pow2 xfer height that alignment, MaxXfer, and lines left will permit...
                    MIN_CONTAINED_POW2_BELOW_CAP(y | SwizzleMaxXfer.Height, y1 - y);
                int SwizzledOffsetX = SwizzledOffsetX0;

                __m128i xmm[MAX_XFER_HEIGHT];
                char *pLinearAddressEnd;
                int _MaskX;

                // XFER Macros /////////////////////////////////////////////////

                /* We'll define an "XFER" macro to contain the BLT X-loop work.

                In a simple implementation, XFER would be a WHILE loop that does
                an SSE transfer and performs pointer and swizzled offset
                incrementing.

                ...but we have multiple conditions to handle...
                    - Transfer Direction (Linear <--> Swizzled)
                    - Optimal 2D Transfer Chunk Size
                    - Available/Desired CPU Transfer Instructions
                    - Unaligned Crust

                We don't want the X-loop to have conditional logic to handle the
                variations, since that would hurt performance--but neither do we
                want a messy multitude of slightly different, copy-pasted code
                paths. So instead, the XFER macro will provide a common code
                template allowing instantiation of multiple X-loop variations--
                i.e. XFER calls from conditional Y-loop code will expand into
                separate, conditional-free, "lean and mean" X-loops.

                Some conditional logic remains in the XFER chain--but only
                outside the X-loop. The two IF statements that remain in the
                X-loop (i.e. those in XFER_LOAD/STORE) expand to compile-time
                constant conditional expressions, so with an optimizing
                compiler, no runtime-conditional code will be generated--i.e.
                the constant conditionals will simply decide whether a given
                instantiation has that code or not. */

                #define XFER(XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
                { \
                    XFER_LINES(4, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
                    else XFER_LINES(2, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
                    else XFER_LINES(1, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust); \
                }

                #define XFER_LINES(XFER_LINES_Lines, XFER_Store, XFER_Load, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch, XFER_Crust) \
                    if(xferHeight == (XFER_LINES_Lines)) \
                    { \
                        if(XFER_Crust) \
                        { \
                            XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.LeftCrust & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
                            XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.LeftCrust & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
                            XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.LeftCrust & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
                            XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.LeftCrust & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
                        } \
                        \
                        XFER_SPAN(XFER_Store, XFER_Load, CopyWidth.MainRun, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
                        \
                        if(XFER_Crust) \
                        { \
                            XFER_SPAN(MOVQ_M, MOVQ_R, CopyWidth.RightCrust & 8, 8, 8, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
                            XFER_SPAN(MOVD_M, MOVD_R, CopyWidth.RightCrust & 4, 4, 4, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
                            XFER_SPAN(MOVW_M, MOVW_R, CopyWidth.RightCrust & 2, 2, 2, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
                            XFER_SPAN(MOVB_M, MOVB_R, CopyWidth.RightCrust & 1, 1, 1, XFER_LINES_Lines, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch); \
                        } \
                    }

                #define XFER_SPAN(XFER_Store, XFER_Load, XFER_CopyWidthBytes, XFER_Pitch_Swizzled, XFER_Pitch_Linear, XFER_Height, XFER_pDest, XFER_DestPitch, XFER_pSrc, XFER_SrcPitch) \
                { \
                    pLinearAddressEnd = pLinearAddress + (XFER_CopyWidthBytes); \
                    _MaskX = MaskX[XFER_Pitch_Swizzled]; \
                    while(pLinearAddress < pLinearAddressEnd) \
                    { \
                        pSwizzledAddress = pSwizzledAddressLine + SwizzledOffsetX; \
                        \
                        XFER_LOAD(0, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height); \
                        XFER_LOAD(1, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height); \
                        XFER_LOAD(2, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height); \
                        XFER_LOAD(3, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height); \
                        XFER_STORE(0, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
                        XFER_STORE(1, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
                        XFER_STORE(2, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
                        XFER_STORE(3, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height); \
                        \
                        SwizzledOffsetX = (SwizzledOffsetX - _MaskX) & _MaskX; \
                        pLinearAddress += (XFER_Pitch_Linear); \
                    } \
                }

                #define XFER_LOAD(XFER_Line, XFER_Load, XFER_pSrc, XFER_SrcPitch, XFER_Height) \
                { \
                    if((XFER_Line) < (XFER_Height)) \
                    { \
                        XFER_Load( \
                            xmm[XFER_Line], \
                            (XFER_pSrc) + (XFER_Line) * (XFER_SrcPitch)); \
                    } \
                }

                #define XFER_STORE(XFER_Line, XFER_Store, XFER_pDest, XFER_DestPitch, XFER_Height) \
                { \
                    if((XFER_Line) < (XFER_Height)) \
                    { \
                        XFER_Store( \
                            (XFER_pDest) + (XFER_Line) * (XFER_DestPitch), \
                            xmm[XFER_Line]); \
                    } \
                }

                // Perform Applicable Transfer /////////////////////////////////
                assert( // DQ Alignment...
                    ((intptr_t) pSwizzledSurface->pBase % 16 == 0) &&
                    (pSwizzledSurface->Pitch % 16 == 0));

                #ifdef SUB_ELEMENT_SUPPORT
                if( (pLinearSurface->Element.Size != pLinearSurface->Element.Pitch) ||
                    (pSwizzledSurface->Element.Size != pSwizzledSurface->Element.Pitch))
                {
                    if(LinearToSwizzled)
                    {
                        switch(pLinearSurface->Element.Size)
                        {
                            case 16: XFER(MOVNTDQ_M, MOVDQU_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
                            case  8: XFER(   MOVQ_M,   MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
                            case  4: XFER(   MOVD_M,   MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
                            case  3: XFER(   MOV3_M,   MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
                            case  2: XFER(   MOVW_M,   MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
                            case  1: XFER(   MOVB_M,   MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, pLinearAddress, pLinearSurface->Pitch, 0); break;
                            default: assert(0);
                        }
                    }
                    else
                    {
                        switch(pLinearSurface->Element.Size)
                        {
                            case 16:
                            {
                                if(StreamingLoadSupported)
                                {
                                    XFER(MOVDQU_M, MOVNTDQA_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
                                }
                                else
                                {
                                    XFER(MOVDQU_M, MOVDQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0);
                                }
                                break;
                            }
                            case  8: XFER(MOVQ_M, MOVQ_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
                            case  4: XFER(MOVD_M, MOVD_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
                            case  3: XFER(MOV3_M, MOV3_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
                            case  2: XFER(MOVW_M, MOVW_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
                            case  1: XFER(MOVB_M, MOVB_R, pSwizzledSurface->Element.Pitch, pLinearSurface->Element.Pitch, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, SwizzleMaxXfer.Width, 0); break;
                            default: assert(0);
                        }
                    }
                } else
                #endif // SUB_ELEMENT_SUPPORT
                if(LinearToSwizzled)
                {
                    switch(SwizzleMaxXfer.Width)
                    {
                        case 16: XFER(MOVNTDQ_M, MOVDQU_R, 16, 16, pSwizzledAddress, 16, pLinearAddress, pLinearSurface->Pitch, 1); break;
                        #ifdef INTEL_TILE_W_SUPPORT
                        case  2: XFER(   MOVW_M,   MOVW_R,  2,  2, pSwizzledAddress,  2, pLinearAddress, pLinearSurface->Pitch, 1); break;
                        #endif
                        default: assert(0); // Unexpected cases excluded to save compile time/size of multiplying instantiations.
                    }
                }
                else
                {
                    switch(SwizzleMaxXfer.Width)
                    {
                        case 16:
                        {
                            if(StreamingLoadSupported)
                            {
                                XFER(MOVDQU_M, MOVNTDQA_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
                            }
                            else
                            {
                                XFER(MOVDQU_M, MOVDQ_R, 16, 16, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 16, 1);
                            }
                            break;
                        }
                        #ifdef INTEL_TILE_W_SUPPORT
                        case  2: XFER(MOVW_M, MOVW_R, 2, 2, pLinearAddress, pLinearSurface->Pitch, pSwizzledAddress, 2, 1); break;
                        #endif
                        default: assert(0);
                    }
                }


                // Swizzled inc of SwizzledOffsetY...
                SwizzledOffsetY = (SwizzledOffsetY - MaskY[xferHeight]) & MaskY[xferHeight];
                if(!SwizzledOffsetY) SwizzledOffsetX0 += BytesPerRowOfTiles; // Y wraps advance SwizzledOffsetX0, since it includes "bits beyond the tile".

                y += xferHeight;

                /* The X-loop only advanced pLinearAddress by CopyWidthBytes--
                even when it transferred multiple lines. Advance the rest of
                the way: */
                pLinearAddress += xferHeight * pLinearSurface->Pitch - CopyWidthBytes;

            } // foreach(y)

            _mm_sfence(); // Flush Non-Temporal Writes

            #if(_MSC_VER)
                #pragma warning(pop)
            #endif
        }
        #endif
    }
} // CpuSwizzleBlt

#endif // #ifndef INCLUDE_CpuSwizzleBlt_c_AS_HEADER
// clang-format on