xref: /aosp_15_r20/external/swiftshader/src/Device/Blitter.cpp (revision 03ce13f70fcc45d86ee91b7ee4cab1936a95046e)
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "Blitter.hpp"
16 
17 #include "Pipeline/ShaderCore.hpp"
18 #include "Reactor/Reactor.hpp"
19 #include "System/CPUID.hpp"
20 #include "System/Debug.hpp"
21 #include "System/Half.hpp"
22 #include "System/Memory.hpp"
23 #include "Vulkan/VkImage.hpp"
24 #include "Vulkan/VkImageView.hpp"
25 
26 #include <utility>
27 
28 #if defined(__i386__) || defined(__x86_64__)
29 #	include <xmmintrin.h>
30 #	include <emmintrin.h>
31 #endif
32 
33 namespace sw {
34 
// Packs the four lanes of 'ints' into one 32-bit value: each lane is shifted
// left by its corresponding entry in 'shifts' and the results are OR-ed
// together. Callers mask the lanes beforehand so the fields don't overlap.
static rr::RValue<rr::Int> PackFields(const rr::Int4 &ints, const sw::int4 shifts)
{
	rr::Int packed = rr::Int(ints.x) << shifts[0];
	packed = packed | (rr::Int(ints.y) << shifts[1]);
	packed = packed | (rr::Int(ints.z) << shifts[2]);
	packed = packed | (rr::Int(ints.w) << shifts[3]);

	return packed;
}
42 
Blitter()43 Blitter::Blitter()
44     : blitMutex()
45     , blitCache(1024)
46     , cornerUpdateMutex()
47     , cornerUpdateCache(64)  // We only need one of these per format
48 {
49 }
50 
~Blitter()51 Blitter::~Blitter()
52 {
53 }
54 
// Clears the given subresource range of 'dest' to the value pointed to by
// 'pixel', which is encoded in 'format'. 'viewFormat' is the format of the
// image view being cleared (its aspect-specific format is the actual
// destination format). 'renderArea', when non-null, restricts the clear to a
// 2D rectangle and requires the range to cover a single mip level.
void Blitter::clear(const void *pixel, vk::Format format, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
{
	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
	vk::Format dstFormat = viewFormat.getAspectFormat(aspect);
	if(dstFormat == VK_FORMAT_UNDEFINED)
	{
		return;
	}

	// Normalized destination formats can only represent [0,1] (unsigned) or
	// [-1,1] (signed), so clamp the clear value up front; the clamped copy
	// lives in 'clampedPixel' and 'pixel' is redirected to it.
	VkClearValue clampedPixel;
	if(viewFormat.isSignedNormalized() || viewFormat.isUnsignedNormalized())
	{
		const float minValue = viewFormat.isSignedNormalized() ? -1.0f : 0.0f;

		if(aspect & VK_IMAGE_ASPECT_COLOR_BIT)
		{
			memcpy(clampedPixel.color.float32, pixel, sizeof(VkClearColorValue));
			clampedPixel.color.float32[0] = sw::clamp(clampedPixel.color.float32[0], minValue, 1.0f);
			clampedPixel.color.float32[1] = sw::clamp(clampedPixel.color.float32[1], minValue, 1.0f);
			clampedPixel.color.float32[2] = sw::clamp(clampedPixel.color.float32[2], minValue, 1.0f);
			clampedPixel.color.float32[3] = sw::clamp(clampedPixel.color.float32[3], minValue, 1.0f);
			pixel = clampedPixel.color.float32;
		}

		// Stencil never requires clamping, so we can check for Depth only
		if(aspect & VK_IMAGE_ASPECT_DEPTH_BIT)
		{
			memcpy(&(clampedPixel.depthStencil), pixel, sizeof(VkClearDepthStencilValue));
			clampedPixel.depthStencil.depth = sw::clamp(clampedPixel.depthStencil.depth, minValue, 1.0f);
			pixel = &(clampedPixel.depthStencil);
		}
	}

	// Try the memory-fill fast path first; fall back to a generated blit
	// routine when the format combination isn't supported by it.
	if(fastClear(pixel, format, dest, dstFormat, subresourceRange, renderArea))
	{
		return;
	}

	State state(format, dstFormat, 1, dest->getSampleCount(), Options{ 0xF });
	auto blitRoutine = getBlitRoutine(state);
	if(!blitRoutine)
	{
		return;
	}

	VkImageSubresource subres = {
		subresourceRange.aspectMask,
		subresourceRange.baseMipLevel,
		subresourceRange.baseArrayLayer
	};

	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);

	VkRect2D area = { { 0, 0 }, { 0, 0 } };
	if(renderArea)
	{
		ASSERT(subresourceRange.levelCount == 1);
		area = *renderArea;
	}

	for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
	{
		VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
		if(!renderArea)
		{
			// No explicit render area: clear the entire mip level.
			area.extent.width = extent.width;
			area.extent.height = extent.height;
		}

		// The blit routine reads a single source texel ('pixel') and writes
		// the destination rectangle; source pitches/extents are mostly unused.
		BlitData data = {
			pixel, nullptr,  // source, dest

			assert_cast<uint32_t>(format.bytes()),                                  // sPitchB
			assert_cast<uint32_t>(dest->rowPitchBytes(aspect, subres.mipLevel)),    // dPitchB
			0,                                                                      // sSliceB (unused in clear operations)
			assert_cast<uint32_t>(dest->slicePitchBytes(aspect, subres.mipLevel)),  // dSliceB

			0.5f, 0.5f, 0.5f, 0.0f, 0.0f, 0.0f,  // x0, y0, z0, w, h, d

			area.offset.x, static_cast<int>(area.offset.x + area.extent.width),   // x0d, x1d
			area.offset.y, static_cast<int>(area.offset.y + area.extent.height),  // y0d, y1d
			0, 1,                                                                 // z0d, z1d

			0, 0, 0,  // sWidth, sHeight, sDepth

			false,  // filter3D
		};

		if(renderArea && dest->is3DSlice())
		{
			// Reinterpret layers as depth slices
			subres.arrayLayer = 0;
			for(uint32_t depth = subresourceRange.baseArrayLayer; depth <= lastLayer; depth++)
			{
				data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);
				blitRoutine(&data);
			}
		}
		else
		{
			// Run the routine once per array layer and per depth slice.
			for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
			{
				for(uint32_t depth = 0; depth < extent.depth; depth++)
				{
					data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);

					blitRoutine(&data);
				}
			}
		}
	}
	dest->contentsChanged(subresourceRange);
}
169 
// Attempts to clear the subresource range with plain memory fills instead of
// a generated blit routine. Returns false when the clear-value format or the
// destination view format isn't handled here, in which case the caller falls
// back to the general clear path.
bool Blitter::fastClear(const void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
{
	// Only the canonical clear-value encodings are supported by this path.
	if(clearFormat != VK_FORMAT_R32G32B32A32_SFLOAT &&
	   clearFormat != VK_FORMAT_D32_SFLOAT &&
	   clearFormat != VK_FORMAT_S8_UINT)
	{
		return false;
	}

	// Overlay of the possible clear-value encodings (color / depth / stencil),
	// so the incoming bytes can be viewed as whichever one applies.
	union ClearValue
	{
		struct
		{
			float r;
			float g;
			float b;
			float a;
		};

		float rgb[3];

		float d;
		uint32_t d_as_u32;

		uint32_t s;
	};

	const ClearValue &c = *reinterpret_cast<const ClearValue *>(clearValue);

	// The clear value packed into a single destination texel (1, 2 or 4 bytes).
	uint32_t packed = 0;

	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
	switch(viewFormat)
	{
	// The "+ 0.5f" in the unorm conversions below rounds to nearest.
	case VK_FORMAT_R5G6B5_UNORM_PACK16:
		packed = ((uint16_t)(31 * c.b + 0.5f) << 0) |
		         ((uint16_t)(63 * c.g + 0.5f) << 5) |
		         ((uint16_t)(31 * c.r + 0.5f) << 11);
		break;
	case VK_FORMAT_B5G6R5_UNORM_PACK16:
		packed = ((uint16_t)(31 * c.r + 0.5f) << 0) |
		         ((uint16_t)(63 * c.g + 0.5f) << 5) |
		         ((uint16_t)(31 * c.b + 0.5f) << 11);
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
	case VK_FORMAT_R8G8B8A8_UNORM:
		packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
		         ((uint32_t)(255 * c.b + 0.5f) << 16) |
		         ((uint32_t)(255 * c.g + 0.5f) << 8) |
		         ((uint32_t)(255 * c.r + 0.5f) << 0);
		break;
	case VK_FORMAT_B8G8R8A8_UNORM:
		packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
		         ((uint32_t)(255 * c.r + 0.5f) << 16) |
		         ((uint32_t)(255 * c.g + 0.5f) << 8) |
		         ((uint32_t)(255 * c.b + 0.5f) << 0);
		break;
	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
		packed = R11G11B10F(c.rgb);
		break;
	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
		packed = RGB9E5(c.rgb);
		break;
	case VK_FORMAT_D32_SFLOAT:
		ASSERT(clearFormat == VK_FORMAT_D32_SFLOAT);
		packed = c.d_as_u32;  // float reinterpreted as uint32
		break;
	case VK_FORMAT_S8_UINT:
		ASSERT(clearFormat == VK_FORMAT_S8_UINT);
		packed = static_cast<uint8_t>(c.s);
		break;
	default:
		// Destination format not supported by the fast path.
		return false;
	}

	VkImageSubresource subres = {
		subresourceRange.aspectMask,
		subresourceRange.baseMipLevel,
		subresourceRange.baseArrayLayer
	};
	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);

	VkRect2D area = { { 0, 0 }, { 0, 0 } };
	if(renderArea)
	{
		// A render area is only valid for a single-mip clear.
		ASSERT(subresourceRange.levelCount == 1);
		area = *renderArea;
	}

	for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
	{
		int rowPitchBytes = dest->rowPitchBytes(aspect, subres.mipLevel);
		int slicePitchBytes = dest->slicePitchBytes(aspect, subres.mipLevel);
		VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
		if(!renderArea)
		{
			// No explicit render area: clear the entire mip level.
			area.extent.width = extent.width;
			area.extent.height = extent.height;
		}
		if(dest->is3DSlice())
		{
			extent.depth = 1;  // The 3D image is instead interpreted as a 2D image with layers
		}

		for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
		{
			for(uint32_t depth = 0; depth < extent.depth; depth++)
			{
				uint8_t *slice = (uint8_t *)dest->getTexelPointer(
				    { area.offset.x, area.offset.y, static_cast<int32_t>(depth) }, subres);

				// Multisampled images store the samples as consecutive
				// slices, so repeat the fill once per sample.
				for(int j = 0; j < dest->getSampleCount(); j++)
				{
					uint8_t *d = slice;

					// Fill row by row, using the store width matching the
					// destination texel size.
					switch(viewFormat.bytes())
					{
					case 4:
						for(uint32_t i = 0; i < area.extent.height; i++)
						{
							ASSERT(d < dest->end());
							sw::clear((uint32_t *)d, packed, area.extent.width);
							d += rowPitchBytes;
						}
						break;
					case 2:
						for(uint32_t i = 0; i < area.extent.height; i++)
						{
							ASSERT(d < dest->end());
							sw::clear((uint16_t *)d, static_cast<uint16_t>(packed), area.extent.width);
							d += rowPitchBytes;
						}
						break;
					case 1:
						for(uint32_t i = 0; i < area.extent.height; i++)
						{
							ASSERT(d < dest->end());
							memset(d, packed, area.extent.width);
							d += rowPitchBytes;
						}
						break;
					default:
						assert(false);
					}

					slice += slicePitchBytes;
				}
			}
		}
	}
	dest->contentsChanged(subresourceRange);

	return true;
}
326 
// Emits Reactor code that reads one texel at 'element' in state.sourceFormat
// and returns it as four floats. Integer and fixed-point channels are
// converted straight to float without normalization (scaling to [0,1] or
// [-1,1] happens elsewhere). Missing channels keep the (0, 0, 0, 1) default;
// for several integer formats 'w' is instead set to the format's maximum
// channel value — presumably so a later normalization step maps alpha to 1.0
// (TODO: confirm against the scaling code).
Float4 Blitter::readFloat4(Pointer<Byte> element, const State &state)
{
	Float4 c(0.0f, 0.0f, 0.0f, 1.0f);

	switch(state.sourceFormat)
	{
	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
		// Nibbles, low byte first: A in bits 0-3, R 4-7, G 8-11, B 12-15.
		c.w = Float(Int(*Pointer<Byte>(element)) & Int(0xF));
		c.x = Float((Int(*Pointer<Byte>(element)) >> 4) & Int(0xF));
		c.y = Float(Int(*Pointer<Byte>(element + 1)) & Int(0xF));
		c.z = Float((Int(*Pointer<Byte>(element + 1)) >> 4) & Int(0xF));
		break;
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8_SNORM:
		c.x = Float(Int(*Pointer<SByte>(element)));
		c.w = float(0x7F);
		break;
	case VK_FORMAT_R8_UNORM:
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8_SRGB:
		c.x = Float(Int(*Pointer<Byte>(element)));
		c.w = float(0xFF);
		break;
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16_SNORM:
		c.x = Float(Int(*Pointer<Short>(element)));
		c.w = float(0x7FFF);
		break;
	case VK_FORMAT_R16_UNORM:
	case VK_FORMAT_R16_UINT:
		c.x = Float(Int(*Pointer<UShort>(element)));
		c.w = float(0xFFFF);
		break;
	case VK_FORMAT_R32_SINT:
		c.x = Float(*Pointer<Int>(element));
		c.w = float(0x7FFFFFFF);
		break;
	case VK_FORMAT_R32_UINT:
		c.x = Float(*Pointer<UInt>(element));
		c.w = float(0xFFFFFFFF);
		break;
	case VK_FORMAT_B8G8R8A8_SRGB:
	case VK_FORMAT_B8G8R8A8_UNORM:
		// Swizzle BGRA storage order back to RGBA.
		c = Float4(*Pointer<Byte4>(element)).zyxw;
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
	case VK_FORMAT_R8G8B8A8_SNORM:
		c = Float4(*Pointer<SByte4>(element));
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
	case VK_FORMAT_R8G8B8A8_UNORM:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
	case VK_FORMAT_R8G8B8A8_SRGB:
		c = Float4(*Pointer<Byte4>(element));
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16A16_SNORM:
		c = Float4(*Pointer<Short4>(element));
		break;
	case VK_FORMAT_R16G16B16A16_UNORM:
	case VK_FORMAT_R16G16B16A16_UINT:
		c = Float4(*Pointer<UShort4>(element));
		break;
	case VK_FORMAT_R32G32B32A32_SINT:
		c = Float4(*Pointer<Int4>(element));
		break;
	case VK_FORMAT_R32G32B32A32_UINT:
		c = Float4(*Pointer<UInt4>(element));
		break;
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8_SNORM:
		c.x = Float(Int(*Pointer<SByte>(element + 0)));
		c.y = Float(Int(*Pointer<SByte>(element + 1)));
		c.w = float(0x7F);
		break;
	case VK_FORMAT_R8G8_UNORM:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8_SRGB:
		c.x = Float(Int(*Pointer<Byte>(element + 0)));
		c.y = Float(Int(*Pointer<Byte>(element + 1)));
		c.w = float(0xFF);
		break;
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16_SNORM:
		c.x = Float(Int(*Pointer<Short>(element + 0)));
		c.y = Float(Int(*Pointer<Short>(element + 2)));
		c.w = float(0x7FFF);
		break;
	case VK_FORMAT_R16G16_UNORM:
	case VK_FORMAT_R16G16_UINT:
		c.x = Float(Int(*Pointer<UShort>(element + 0)));
		c.y = Float(Int(*Pointer<UShort>(element + 2)));
		c.w = float(0xFFFF);
		break;
	case VK_FORMAT_R32G32_SINT:
		c.x = Float(*Pointer<Int>(element + 0));
		c.y = Float(*Pointer<Int>(element + 4));
		c.w = float(0x7FFFFFFF);
		break;
	case VK_FORMAT_R32G32_UINT:
		c.x = Float(*Pointer<UInt>(element + 0));
		c.y = Float(*Pointer<UInt>(element + 4));
		c.w = float(0xFFFFFFFF);
		break;
	case VK_FORMAT_R32G32B32A32_SFLOAT:
		c = *Pointer<Float4>(element);
		break;
	case VK_FORMAT_R32G32_SFLOAT:
		c.x = *Pointer<Float>(element + 0);
		c.y = *Pointer<Float>(element + 4);
		break;
	case VK_FORMAT_R32_SFLOAT:
		c.x = *Pointer<Float>(element);
		break;
	// The half-float cases cascade: each reads its highest component and
	// falls through to read the remaining lower ones.
	case VK_FORMAT_R16G16B16A16_SFLOAT:
		c.w = Float(*Pointer<Half>(element + 6));
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_SFLOAT:
		c.z = Float(*Pointer<Half>(element + 4));
		// [[fallthrough]]
	case VK_FORMAT_R16G16_SFLOAT:
		c.y = Float(*Pointer<Half>(element + 2));
		// [[fallthrough]]
	case VK_FORMAT_R16_SFLOAT:
		c.x = Float(*Pointer<Half>(element));
		break;
	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
		c = r11g11b10Unpack(*Pointer<UInt>(element));
		break;
	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
		// This type contains a common 5 bit exponent (E) and a 9 bit the mantissa for R, G and B.
		c.x = Float(*Pointer<UInt>(element) & UInt(0x000001FF));          // R's mantissa (bits 0-8)
		c.y = Float((*Pointer<UInt>(element) & UInt(0x0003FE00)) >> 9);   // G's mantissa (bits 9-17)
		c.z = Float((*Pointer<UInt>(element) & UInt(0x07FC0000)) >> 18);  // B's mantissa (bits 18-26)
		c *= Float4(
		    // 2^E, using the exponent (bits 27-31) and treating it as an unsigned integer value
		    Float(UInt(1) << ((*Pointer<UInt>(element) & UInt(0xF8000000)) >> 27)) *
		    // Since the 9 bit mantissa values currently stored in RGB were converted straight
		    // from int to float (in the [0, 1<<9] range instead of the [0, 1] range), they
		    // are (1 << 9) times too high.
		    // Also, the exponent has 5 bits and we compute the exponent bias of floating point
		    // formats using "2^(k-1) - 1", so, in this case, the exponent bias is 2^(5-1)-1 = 15
		    // Exponent bias (15) + number of mantissa bits per component (9) = 24
		    Float(1.0f / (1 << 24)));
		c.w = 1.0f;
		break;
	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
		break;
	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
		break;
	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
		break;
	case VK_FORMAT_R5G6B5_UNORM_PACK16:
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
		break;
	case VK_FORMAT_B5G6R5_UNORM_PACK16:
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
		break;
	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
		break;
	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
		break;
	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x03E0)) >> UShort(5)));
		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
		break;
	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
		c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
		c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
		c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
		break;
	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
		c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
		c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
		break;
	case VK_FORMAT_D16_UNORM:
		c.x = Float(Int((*Pointer<UShort>(element))));
		break;
	case VK_FORMAT_X8_D24_UNORM_PACK32:
		// Depth occupies the upper 24 bits; the low 8 bits are unused (X8).
		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));
		break;
	case VK_FORMAT_D32_SFLOAT:
		c.x = *Pointer<Float>(element);
		break;
	case VK_FORMAT_S8_UINT:
		c.x = Float(Int(*Pointer<Byte>(element)));
		break;
	default:
		UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
	}

	return c;
}
552 
write(Float4 & c,Pointer<Byte> element,const State & state)553 void Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
554 {
555 	bool writeR = state.writeRed;
556 	bool writeG = state.writeGreen;
557 	bool writeB = state.writeBlue;
558 	bool writeA = state.writeAlpha;
559 	bool writeRGBA = writeR && writeG && writeB && writeA;
560 
561 	switch(state.destFormat)
562 	{
563 	case VK_FORMAT_R4G4_UNORM_PACK8:
564 		if(writeR | writeG)
565 		{
566 			if(!writeR)
567 			{
568 				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
569 				                          (*Pointer<Byte>(element) & Byte(0xF0));
570 			}
571 			else if(!writeG)
572 			{
573 				*Pointer<Byte>(element) = (*Pointer<Byte>(element) & Byte(0xF)) |
574 				                          (Byte(RoundInt(Float(c.x))) << Byte(4));
575 			}
576 			else
577 			{
578 				*Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
579 				                          (Byte(RoundInt(Float(c.x))) << Byte(4));
580 			}
581 		}
582 		break;
583 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
584 		if(writeRGBA)
585 		{
586 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 }));
587 		}
588 		else
589 		{
590 			unsigned short mask = (writeA ? 0x000F : 0x0000) |
591 			                      (writeB ? 0x00F0 : 0x0000) |
592 			                      (writeG ? 0x0F00 : 0x0000) |
593 			                      (writeR ? 0xF000 : 0x0000);
594 			unsigned short unmask = ~mask;
595 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
596 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 })) & UShort(mask));
597 		}
598 		break;
599 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
600 		if(writeRGBA)
601 		{
602 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 }));
603 		}
604 		else
605 		{
606 			unsigned short mask = (writeA ? 0x000F : 0x0000) |
607 			                      (writeR ? 0x00F0 : 0x0000) |
608 			                      (writeG ? 0x0F00 : 0x0000) |
609 			                      (writeB ? 0xF000 : 0x0000);
610 			unsigned short unmask = ~mask;
611 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
612 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 })) & UShort(mask));
613 		}
614 		break;
615 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
616 		if(writeRGBA)
617 		{
618 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 }));
619 		}
620 		else
621 		{
622 			unsigned short mask = (writeB ? 0x000F : 0x0000) |
623 			                      (writeG ? 0x00F0 : 0x0000) |
624 			                      (writeR ? 0x0F00 : 0x0000) |
625 			                      (writeA ? 0xF000 : 0x0000);
626 			unsigned short unmask = ~mask;
627 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
628 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 })) & UShort(mask));
629 		}
630 		break;
631 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
632 		if(writeRGBA)
633 		{
634 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 }));
635 		}
636 		else
637 		{
638 			unsigned short mask = (writeR ? 0x000F : 0x0000) |
639 			                      (writeG ? 0x00F0 : 0x0000) |
640 			                      (writeB ? 0x0F00 : 0x0000) |
641 			                      (writeA ? 0xF000 : 0x0000);
642 			unsigned short unmask = ~mask;
643 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
644 			                            (UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 })) & UShort(mask));
645 		}
646 		break;
647 	case VK_FORMAT_B8G8R8A8_SRGB:
648 	case VK_FORMAT_B8G8R8A8_UNORM:
649 		if(writeRGBA)
650 		{
651 			Short4 c0 = RoundShort4(c.zyxw);
652 			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
653 		}
654 		else
655 		{
656 			if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
657 			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
658 			if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
659 			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
660 		}
661 		break;
662 	case VK_FORMAT_B8G8R8_SNORM:
663 		if(writeB) { *Pointer<SByte>(element + 0) = SByte(RoundInt(Float(c.z))); }
664 		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
665 		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
666 		break;
667 	case VK_FORMAT_B8G8R8_UNORM:
668 	case VK_FORMAT_B8G8R8_SRGB:
669 		if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
670 		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
671 		if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
672 		break;
673 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
674 	case VK_FORMAT_R8G8B8A8_UNORM:
675 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
676 	case VK_FORMAT_R8G8B8A8_SRGB:
677 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
678 	case VK_FORMAT_R8G8B8A8_UINT:
679 	case VK_FORMAT_R8G8B8A8_USCALED:
680 	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
681 		if(writeRGBA)
682 		{
683 			Short4 c0 = RoundShort4(c);
684 			*Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
685 		}
686 		else
687 		{
688 			if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
689 			if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
690 			if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
691 			if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
692 		}
693 		break;
694 	case VK_FORMAT_R32G32B32A32_SFLOAT:
695 		if(writeRGBA)
696 		{
697 			*Pointer<Float4>(element) = c;
698 		}
699 		else
700 		{
701 			if(writeR) { *Pointer<Float>(element) = c.x; }
702 			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
703 			if(writeB) { *Pointer<Float>(element + 8) = c.z; }
704 			if(writeA) { *Pointer<Float>(element + 12) = c.w; }
705 		}
706 		break;
707 	case VK_FORMAT_R32G32B32_SFLOAT:
708 		if(writeR) { *Pointer<Float>(element) = c.x; }
709 		if(writeG) { *Pointer<Float>(element + 4) = c.y; }
710 		if(writeB) { *Pointer<Float>(element + 8) = c.z; }
711 		break;
712 	case VK_FORMAT_R32G32_SFLOAT:
713 		if(writeR && writeG)
714 		{
715 			*Pointer<Float2>(element) = Float2(c);
716 		}
717 		else
718 		{
719 			if(writeR) { *Pointer<Float>(element) = c.x; }
720 			if(writeG) { *Pointer<Float>(element + 4) = c.y; }
721 		}
722 		break;
723 	case VK_FORMAT_R32_SFLOAT:
724 		if(writeR) { *Pointer<Float>(element) = c.x; }
725 		break;
726 	case VK_FORMAT_R16G16B16A16_SFLOAT:
727 		if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
728 		// [[fallthrough]]
729 	case VK_FORMAT_R16G16B16_SFLOAT:
730 		if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
731 		// [[fallthrough]]
732 	case VK_FORMAT_R16G16_SFLOAT:
733 		if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
734 		// [[fallthrough]]
735 	case VK_FORMAT_R16_SFLOAT:
736 		if(writeR) { *Pointer<Half>(element) = Half(c.x); }
737 		break;
738 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
739 		{
740 			UInt rgb = r11g11b10Pack(c);
741 
742 			UInt old = *Pointer<UInt>(element);
743 
744 			unsigned int mask = (writeR ? 0x000007FF : 0) |
745 			                    (writeG ? 0x003FF800 : 0) |
746 			                    (writeB ? 0xFFC00000 : 0);
747 
748 			*Pointer<UInt>(element) = (rgb & mask) | (old & ~mask);
749 		}
750 		break;
751 	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
752 		{
753 			ASSERT(writeRGBA);  // Can't sensibly write just part of this format.
754 
755 			// Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
756 
757 			constexpr int N = 9;       // number of mantissa bits per component
758 			constexpr int B = 15;      // exponent bias
759 			constexpr int E_max = 31;  // maximum possible biased exponent value
760 
761 			// Maximum representable value.
762 			constexpr float sharedexp_max = ((static_cast<float>(1 << N) - 1) / static_cast<float>(1 << N)) * static_cast<float>(1 << (E_max - B));
763 
764 			// Clamp components to valid range. NaN becomes 0.
765 			Float red_c = Min(IfThenElse(!(c.x > 0), Float(0), Float(c.x)), sharedexp_max);
766 			Float green_c = Min(IfThenElse(!(c.y > 0), Float(0), Float(c.y)), sharedexp_max);
767 			Float blue_c = Min(IfThenElse(!(c.z > 0), Float(0), Float(c.z)), sharedexp_max);
768 
769 			// We're reducing the mantissa to 9 bits, so we must round up if the next
770 			// bit is 1. In other words add 0.5 to the new mantissa's position and
771 			// allow overflow into the exponent so we can scale correctly.
772 			constexpr int half = 1 << (23 - N);
773 			Float red_r = As<Float>(As<Int>(red_c) + half);
774 			Float green_r = As<Float>(As<Int>(green_c) + half);
775 			Float blue_r = As<Float>(As<Int>(blue_c) + half);
776 
777 			// The largest component determines the shared exponent. It can't be lower
778 			// than 0 (after bias subtraction) so also limit to the mimimum representable.
779 			constexpr float min_s = 0.5f / (1 << B);
780 			Float max_s = Max(Max(red_r, green_r), Max(blue_r, min_s));
781 
782 			// Obtain the reciprocal of the shared exponent by inverting the bits,
783 			// and scale by the new mantissa's size. Note that the IEEE-754 single-precision
784 			// format has an implicit leading 1, but this shared component format does not.
785 			Float scale = As<Float>((As<Int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (N - 2));
786 
787 			UInt R9 = RoundInt(red_c * scale);
788 			UInt G9 = UInt(RoundInt(green_c * scale));
789 			UInt B9 = UInt(RoundInt(blue_c * scale));
790 			UInt E5 = (As<UInt>(max_s) >> 23) - 127 + 15 + 1;
791 
792 			UInt E5B9G9R9 = (E5 << 27) | (B9 << 18) | (G9 << 9) | R9;
793 
794 			*Pointer<UInt>(element) = E5B9G9R9;
795 		}
796 		break;
797 	case VK_FORMAT_B8G8R8A8_SNORM:
798 		if(writeB) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.z))); }
799 		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
800 		if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
801 		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
802 		break;
803 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
804 	case VK_FORMAT_R8G8B8A8_SINT:
805 	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
806 	case VK_FORMAT_R8G8B8A8_SNORM:
807 	case VK_FORMAT_R8G8B8A8_SSCALED:
808 	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
809 		if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
810 		// [[fallthrough]]
811 	case VK_FORMAT_R8G8B8_SINT:
812 	case VK_FORMAT_R8G8B8_SNORM:
813 	case VK_FORMAT_R8G8B8_SSCALED:
814 		if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
815 		// [[fallthrough]]
816 	case VK_FORMAT_R8G8_SINT:
817 	case VK_FORMAT_R8G8_SNORM:
818 	case VK_FORMAT_R8G8_SSCALED:
819 		if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
820 		// [[fallthrough]]
821 	case VK_FORMAT_R8_SINT:
822 	case VK_FORMAT_R8_SNORM:
823 	case VK_FORMAT_R8_SSCALED:
824 		if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
825 		break;
826 	case VK_FORMAT_R8G8B8_UINT:
827 	case VK_FORMAT_R8G8B8_UNORM:
828 	case VK_FORMAT_R8G8B8_USCALED:
829 	case VK_FORMAT_R8G8B8_SRGB:
830 		if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
831 		// [[fallthrough]]
832 	case VK_FORMAT_R8G8_UINT:
833 	case VK_FORMAT_R8G8_UNORM:
834 	case VK_FORMAT_R8G8_USCALED:
835 	case VK_FORMAT_R8G8_SRGB:
836 		if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
837 		// [[fallthrough]]
838 	case VK_FORMAT_R8_UINT:
839 	case VK_FORMAT_R8_UNORM:
840 	case VK_FORMAT_R8_USCALED:
841 	case VK_FORMAT_R8_SRGB:
842 		if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
843 		break;
844 	case VK_FORMAT_R16G16B16A16_SINT:
845 	case VK_FORMAT_R16G16B16A16_SNORM:
846 	case VK_FORMAT_R16G16B16A16_SSCALED:
847 		if(writeRGBA)
848 		{
849 			*Pointer<Short4>(element) = Short4(RoundInt(c));
850 		}
851 		else
852 		{
853 			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
854 			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
855 			if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
856 			if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
857 		}
858 		break;
859 	case VK_FORMAT_R16G16B16_SINT:
860 	case VK_FORMAT_R16G16B16_SNORM:
861 	case VK_FORMAT_R16G16B16_SSCALED:
862 		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
863 		if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
864 		if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
865 		break;
866 	case VK_FORMAT_R16G16_SINT:
867 	case VK_FORMAT_R16G16_SNORM:
868 	case VK_FORMAT_R16G16_SSCALED:
869 		if(writeR && writeG)
870 		{
871 			*Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
872 		}
873 		else
874 		{
875 			if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
876 			if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
877 		}
878 		break;
879 	case VK_FORMAT_R16_SINT:
880 	case VK_FORMAT_R16_SNORM:
881 	case VK_FORMAT_R16_SSCALED:
882 		if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
883 		break;
884 	case VK_FORMAT_R16G16B16A16_UINT:
885 	case VK_FORMAT_R16G16B16A16_UNORM:
886 	case VK_FORMAT_R16G16B16A16_USCALED:
887 		if(writeRGBA)
888 		{
889 			*Pointer<UShort4>(element) = UShort4(RoundInt(c));
890 		}
891 		else
892 		{
893 			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
894 			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
895 			if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
896 			if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
897 		}
898 		break;
899 	case VK_FORMAT_R16G16B16_UINT:
900 	case VK_FORMAT_R16G16B16_UNORM:
901 	case VK_FORMAT_R16G16B16_USCALED:
902 		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
903 		if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
904 		if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
905 		break;
906 	case VK_FORMAT_R16G16_UINT:
907 	case VK_FORMAT_R16G16_UNORM:
908 	case VK_FORMAT_R16G16_USCALED:
909 		if(writeR && writeG)
910 		{
911 			*Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
912 		}
913 		else
914 		{
915 			if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
916 			if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
917 		}
918 		break;
919 	case VK_FORMAT_R16_UINT:
920 	case VK_FORMAT_R16_UNORM:
921 	case VK_FORMAT_R16_USCALED:
922 		if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
923 		break;
924 	case VK_FORMAT_R32G32B32A32_SINT:
925 		if(writeRGBA)
926 		{
927 			*Pointer<Int4>(element) = RoundInt(c);
928 		}
929 		else
930 		{
931 			if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
932 			if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
933 			if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
934 			if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
935 		}
936 		break;
937 	case VK_FORMAT_R32G32B32_SINT:
938 		if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
939 		// [[fallthrough]]
940 	case VK_FORMAT_R32G32_SINT:
941 		if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
942 		// [[fallthrough]]
943 	case VK_FORMAT_R32_SINT:
944 		if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
945 		break;
946 	case VK_FORMAT_R32G32B32A32_UINT:
947 		if(writeRGBA)
948 		{
949 			*Pointer<UInt4>(element) = UInt4(RoundInt(c));
950 		}
951 		else
952 		{
953 			if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
954 			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
955 			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
956 			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
957 		}
958 		break;
959 	case VK_FORMAT_R32G32B32_UINT:
960 		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
961 		// [[fallthrough]]
962 	case VK_FORMAT_R32G32_UINT:
963 		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
964 		// [[fallthrough]]
965 	case VK_FORMAT_R32_UINT:
966 		if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
967 		break;
968 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
969 		if(writeR && writeG && writeB)
970 		{
971 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 }));
972 		}
973 		else
974 		{
975 			unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
976 			unsigned short unmask = ~mask;
977 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
978 			                            (UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 })) &
979 			                             UShort(mask));
980 		}
981 		break;
982 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
983 		if(writeR && writeG && writeB)
984 		{
985 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 }));
986 		}
987 		else
988 		{
989 			unsigned short mask = (writeR ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeB ? 0xF800 : 0x0000);
990 			unsigned short unmask = ~mask;
991 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
992 			                            (UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 })) &
993 			                             UShort(mask));
994 		}
995 		break;
996 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
997 		if(writeRGBA)
998 		{
999 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 }));
1000 		}
1001 		else
1002 		{
1003 			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1004 			                      (writeR ? 0x7C00 : 0x0000) |
1005 			                      (writeG ? 0x03E0 : 0x0000) |
1006 			                      (writeB ? 0x001F : 0x0000);
1007 			unsigned short unmask = ~mask;
1008 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1009 			                            (UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 })) &
1010 			                             UShort(mask));
1011 		}
1012 		break;
1013 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1014 		if(writeRGBA)
1015 		{
1016 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 }));
1017 		}
1018 		else
1019 		{
1020 			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1021 			                      (writeR ? 0x7C00 : 0x0000) |
1022 			                      (writeG ? 0x03E0 : 0x0000) |
1023 			                      (writeB ? 0x001F : 0x0000);
1024 			unsigned short unmask = ~mask;
1025 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1026 			                            (UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 })) &
1027 			                             UShort(mask));
1028 		}
1029 		break;
1030 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1031 		if(writeRGBA)
1032 		{
1033 			*Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 }));
1034 		}
1035 		else
1036 		{
1037 			unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1038 			                      (writeR ? 0x7C00 : 0x0000) |
1039 			                      (writeG ? 0x03E0 : 0x0000) |
1040 			                      (writeB ? 0x001F : 0x0000);
1041 			unsigned short unmask = ~mask;
1042 			*Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1043 			                            (UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 })) &
1044 			                             UShort(mask));
1045 		}
1046 		break;
1047 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1048 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
1049 	case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
1050 		if(writeRGBA)
1051 		{
1052 			*Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 }));
1053 		}
1054 		else
1055 		{
1056 			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
1057 			                    (writeB ? 0x3FF00000 : 0x0000) |
1058 			                    (writeG ? 0x000FFC00 : 0x0000) |
1059 			                    (writeR ? 0x000003FF : 0x0000);
1060 			unsigned int unmask = ~mask;
1061 			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
1062 			                          (As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 })) &
1063 			                           UInt(mask));
1064 		}
1065 		break;
1066 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1067 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
1068 	case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
1069 		if(writeRGBA)
1070 		{
1071 			*Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 }));
1072 		}
1073 		else
1074 		{
1075 			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
1076 			                    (writeR ? 0x3FF00000 : 0x0000) |
1077 			                    (writeG ? 0x000FFC00 : 0x0000) |
1078 			                    (writeB ? 0x000003FF : 0x0000);
1079 			unsigned int unmask = ~mask;
1080 			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
1081 			                          (As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 })) &
1082 			                           UInt(mask));
1083 		}
1084 		break;
1085 	case VK_FORMAT_D16_UNORM:
1086 		*Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
1087 		break;
1088 	case VK_FORMAT_X8_D24_UNORM_PACK32:
1089 		*Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);
1090 		break;
1091 	case VK_FORMAT_D32_SFLOAT:
1092 		*Pointer<Float>(element) = c.x;
1093 		break;
1094 	case VK_FORMAT_S8_UINT:
1095 		*Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
1096 		break;
1097 	default:
1098 		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
1099 		break;
1100 	}
1101 }
1102 
// Reads an integer texel from 'element' according to state.sourceFormat and
// returns it as a (R, G, B, A) Int4. Components absent from the format keep
// their defaults: 0 for missing R/G/B and 1 for missing alpha.
Int4 Blitter::readInt4(Pointer<Byte> element, const State &state)
{
	// Default component values; only channels present in the format are overwritten.
	Int4 c(0, 0, 0, 1);

	switch(state.sourceFormat)
	{
	// The 8-, 16- and 32-bit cases form fall-through ladders: wider formats
	// read their high components first, then fall through to narrower cases
	// which read the remaining low components.
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
		c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
		c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
		// [[fallthrough]]
	case VK_FORMAT_R8G8_SINT:
		c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
		// [[fallthrough]]
	case VK_FORMAT_R8_SINT:
		c = Insert(c, Int(*Pointer<SByte>(element)), 0);
		break;
	// 2-10-10-10 packed formats: extract each field with a mask and shift.
	// R lives in the low bits for A2B10G10R10, B in the low bits for A2R10G10B10.
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 0);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 2);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
		break;
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 2);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 0);
		c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
		c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
		c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
		// [[fallthrough]]
	case VK_FORMAT_R8G8_UINT:
		c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
		// [[fallthrough]]
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_S8_UINT:
		c = Insert(c, Int(*Pointer<Byte>(element)), 0);
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
		c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
		c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
		// [[fallthrough]]
	case VK_FORMAT_R16G16_SINT:
		c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
		// [[fallthrough]]
	case VK_FORMAT_R16_SINT:
		c = Insert(c, Int(*Pointer<Short>(element)), 0);
		break;
	case VK_FORMAT_R16G16B16A16_UINT:
		c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
		c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
		// [[fallthrough]]
	case VK_FORMAT_R16G16_UINT:
		c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
		// [[fallthrough]]
	case VK_FORMAT_R16_UINT:
		c = Insert(c, Int(*Pointer<UShort>(element)), 0);
		break;
	// 32-bit components are loaded as raw bits; signed vs. unsigned
	// interpretation is up to the consumer.
	case VK_FORMAT_R32G32B32A32_SINT:
	case VK_FORMAT_R32G32B32A32_UINT:
		c = *Pointer<Int4>(element);
		break;
	case VK_FORMAT_R32G32_SINT:
	case VK_FORMAT_R32G32_UINT:
		c = Insert(c, *Pointer<Int>(element + 4), 1);
		// [[fallthrough]]
	case VK_FORMAT_R32_SINT:
	case VK_FORMAT_R32_UINT:
		c = Insert(c, *Pointer<Int>(element), 0);
		break;
	default:
		UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
	}

	return c;
}
1182 
// Writes the integer color 'c' to 'element' according to state.destFormat,
// honoring the per-channel write masks in 'state'. The first switch clamps
// 'c' to the destination's representable range; the second stores the
// enabled components. 'c' is modified in place by the clamping step.
void Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
{
	bool writeR = state.writeRed;
	bool writeG = state.writeGreen;
	bool writeB = state.writeBlue;
	bool writeA = state.writeAlpha;
	bool writeRGBA = writeR && writeG && writeB && writeA;

	// Integer blits don't convert between signednesses; source and
	// destination must agree so the clamping below is meaningful.
	ASSERT(state.sourceFormat.isUnsigned() == state.destFormat.isUnsigned());

	// Clamp the components to the destination range. Unsigned destinations
	// use an unsigned Min (reinterpreting 'c'), signed ones use Min/Max.
	switch(state.destFormat)
	{
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		c = Min(As<UInt4>(c), UInt4(0x03FF, 0x03FF, 0x03FF, 0x0003));
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_R8G8B8_UINT:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_R8G8B8_USCALED:
	case VK_FORMAT_R8G8_USCALED:
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_S8_UINT:
		c = Min(As<UInt4>(c), UInt4(0xFF));
		break;
	case VK_FORMAT_R16G16B16A16_UINT:
	case VK_FORMAT_R16G16B16_UINT:
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16G16B16A16_USCALED:
	case VK_FORMAT_R16G16B16_USCALED:
	case VK_FORMAT_R16G16_USCALED:
	case VK_FORMAT_R16_USCALED:
		c = Min(As<UInt4>(c), UInt4(0xFFFF));
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_R8G8B8_SSCALED:
	case VK_FORMAT_R8G8_SSCALED:
	case VK_FORMAT_R8_SSCALED:
		c = Min(Max(c, Int4(-0x80)), Int4(0x7F));
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16_SINT:
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16G16B16A16_SSCALED:
	case VK_FORMAT_R16G16B16_SSCALED:
	case VK_FORMAT_R16G16_SSCALED:
	case VK_FORMAT_R16_SSCALED:
		c = Min(Max(c, Int4(-0x8000)), Int4(0x7FFF));
		break;
	default:
		// 32-bit destinations need no clamping; the value range matches.
		break;
	}

	// Store the enabled components. Multi-component cases fall through to
	// narrower ones so each ladder writes the remaining low components.
	switch(state.destFormat)
	{
	case VK_FORMAT_B8G8R8A8_SINT:
	case VK_FORMAT_B8G8R8A8_SSCALED:
		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_B8G8R8_SINT:
	case VK_FORMAT_B8G8R8_SSCALED:
		// BGR ordering in memory: blue at offset 0, red at offset 2.
		if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
		if(writeR) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 0)); }
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8B8_SINT:
	case VK_FORMAT_R8G8B8_SSCALED:
		if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8_SSCALED:
		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8_SSCALED:
		if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
		break;
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
	case VK_FORMAT_A2B10G10R10_USCALED_PACK32:
	case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
		if(writeRGBA)
		{
			*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 0, 10, 20, 30 }));
		}
		else
		{
			// Partial write mask: read-modify-write only the enabled fields.
			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
			                    (writeB ? 0x3FF00000 : 0x0000) |
			                    (writeG ? 0x000FFC00 : 0x0000) |
			                    (writeR ? 0x000003FF : 0x0000);
			unsigned int unmask = ~mask;
			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
			                          (As<UInt>(PackFields(c, { 0, 10, 20, 30 })) & UInt(mask));
		}
		break;
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
	case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
	case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
		if(writeRGBA)
		{
			*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 20, 10, 0, 30 }));
		}
		else
		{
			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
			                    (writeR ? 0x3FF00000 : 0x0000) |
			                    (writeG ? 0x000FFC00 : 0x0000) |
			                    (writeB ? 0x000003FF : 0x0000);
			unsigned int unmask = ~mask;
			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
			                          (As<UInt>(PackFields(c, { 20, 10, 0, 30 })) & UInt(mask));
		}
		break;
	case VK_FORMAT_B8G8R8A8_UINT:
	case VK_FORMAT_B8G8R8A8_USCALED:
		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_B8G8R8_UINT:
	case VK_FORMAT_B8G8R8_USCALED:
	case VK_FORMAT_B8G8R8_SRGB:
		if(writeB) { *Pointer<Byte>(element) = Byte(Extract(c, 2)); }
		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
		if(writeR) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 0)); }
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8B8_UINT:
	case VK_FORMAT_R8G8B8_USCALED:
		if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8_USCALED:
		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_S8_UINT:
		if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16A16_SSCALED:
		if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_SINT:
	case VK_FORMAT_R16G16B16_SSCALED:
		if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16_SSCALED:
		if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16_SSCALED:
		if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
		break;
	case VK_FORMAT_R16G16B16A16_UINT:
	case VK_FORMAT_R16G16B16A16_USCALED:
		if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_UINT:
	case VK_FORMAT_R16G16B16_USCALED:
		if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16G16_USCALED:
		if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16_USCALED:
		if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
		break;
	case VK_FORMAT_R32G32B32A32_SINT:
		if(writeRGBA)
		{
			// All four channels enabled: store as one vector.
			*Pointer<Int4>(element) = c;
		}
		else
		{
			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
			if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
			if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
		}
		break;
	case VK_FORMAT_R32G32B32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
		if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
		break;
	case VK_FORMAT_R32G32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
		break;
	case VK_FORMAT_R32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		break;
	case VK_FORMAT_R32G32B32A32_UINT:
		if(writeRGBA)
		{
			*Pointer<UInt4>(element) = As<UInt4>(c);
		}
		else
		{
			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
		}
		break;
	case VK_FORMAT_R32G32B32_UINT:
		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R32G32_UINT:
		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R32_UINT:
		if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
		break;
	default:
		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
	}
}
1426 
// Converts 'value' from the source format's numeric range to the destination
// format's range, applying sRGB encode/decode when enabled, and clamps float
// sources being written to non-float destinations.
//
// 'preScaled' indicates that 'value' was already scaled to the destination
// range by an earlier call (used when resolving/filtering sRGB sources), so
// only the destination scale must be undone before the gamma conversion.
void Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
{
	float4 scale{}, unscale{};

	if(state.clearOperation &&
	   state.sourceFormat.isUnnormalizedInteger() &&
	   !state.destFormat.isUnnormalizedInteger())
	{
		// If we're clearing a buffer from an int or uint color into a normalized color,
		// then the whole range of the int or uint color must be scaled between 0 and 1.
		switch(state.sourceFormat)
		{
		case VK_FORMAT_R32G32B32A32_SINT:
			unscale = float4(static_cast<float>(0x7FFFFFFF));
			break;
		case VK_FORMAT_R32G32B32A32_UINT:
			unscale = float4(static_cast<float>(0xFFFFFFFF));
			break;
		default:
			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
		}
	}
	else
	{
		unscale = state.sourceFormat.getScale();
	}

	scale = state.destFormat.getScale();

	bool srcSRGB = state.sourceFormat.isSRGBformat();
	bool dstSRGB = state.destFormat.isSRGBformat();

	if(state.allowSRGBConversion && ((srcSRGB && !preScaled) || dstSRGB))  // One of the formats is sRGB encoded.
	{
		// Gamma conversion operates on values normalized to [0, 1], so first
		// divide out whichever scale is currently applied to 'value'.
		value *= preScaled ? Float4(1.0f / scale.x, 1.0f / scale.y, 1.0f / scale.z, 1.0f / scale.w) :  // Unapply scale
		             Float4(1.0f / unscale.x, 1.0f / unscale.y, 1.0f / unscale.z, 1.0f / unscale.w);   // Apply unscale
		// Only RGB is gamma-encoded; alpha remains linear.
		value.xyz = (srcSRGB && !preScaled) ? sRGBtoLinear(value) : linearToSRGB(value);
		value *= Float4(scale.x, scale.y, scale.z, scale.w);  // Apply scale
	}
	else if(unscale != scale)
	{
		// No gamma involved: a single multiply converts between the ranges.
		value *= Float4(scale.x / unscale.x, scale.y / unscale.y, scale.z / unscale.z, scale.w / unscale.w);
	}

	if(state.sourceFormat.isFloatFormat() && !state.destFormat.isFloatFormat())
	{
		// Clamp float sources to the destination's representable range, with
		// a lower bound of 0 for unsigned destination components.
		value = Min(value, Float4(scale.x, scale.y, scale.z, scale.w));

		value = Max(value, Float4(state.destFormat.isUnsignedComponent(0) ? 0.0f : -scale.x,
		                          state.destFormat.isUnsignedComponent(1) ? 0.0f : -scale.y,
		                          state.destFormat.isUnsignedComponent(2) ? 0.0f : -scale.z,
		                          state.destFormat.isUnsignedComponent(3) ? 0.0f : -scale.w));
	}

	if(!state.sourceFormat.isUnsigned() && state.destFormat.isUnsigned())
	{
		// Signed-to-unsigned conversion clamps negative values to zero.
		value = Max(value, Float4(0.0f));
	}
}
1486 
ComputeOffset(Int & x,Int & y,Int & pitchB,int bytes)1487 Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes)
1488 {
1489 	return y * pitchB + x * bytes;
1490 }
1491 
ComputeOffset(Int & x,Int & y,Int & z,Int & sliceB,Int & pitchB,int bytes)1492 Int Blitter::ComputeOffset(Int &x, Int &y, Int &z, Int &sliceB, Int &pitchB, int bytes)
1493 {
1494 	return z * sliceB + y * pitchB + x * bytes;
1495 }
1496 
sample(Pointer<Byte> & source,Float & x,Float & y,Float & z,Int & sWidth,Int & sHeight,Int & sDepth,Int & sSliceB,Int & sPitchB,const State & state)1497 Float4 Blitter::sample(Pointer<Byte> &source, Float &x, Float &y, Float &z,
1498                        Int &sWidth, Int &sHeight, Int &sDepth,
1499                        Int &sSliceB, Int &sPitchB, const State &state)
1500 {
1501 	bool intSrc = state.sourceFormat.isUnnormalizedInteger();
1502 	int srcBytes = state.sourceFormat.bytes();
1503 
1504 	Float4 color;
1505 
1506 	bool preScaled = false;
1507 	if(!state.filter || intSrc)
1508 	{
1509 		Int X = Int(x);
1510 		Int Y = Int(y);
1511 		Int Z = Int(z);
1512 
1513 		if(state.clampToEdge)
1514 		{
1515 			X = Clamp(X, 0, sWidth - 1);
1516 			Y = Clamp(Y, 0, sHeight - 1);
1517 			Z = Clamp(Z, 0, sDepth - 1);
1518 		}
1519 
1520 		Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);
1521 
1522 		color = readFloat4(s, state);
1523 
1524 		if(state.srcSamples > 1)  // Resolve multisampled source
1525 		{
1526 			if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1527 			{
1528 				ApplyScaleAndClamp(color, state);
1529 				preScaled = true;
1530 			}
1531 			Float4 accum = color;
1532 			for(int sample = 1; sample < state.srcSamples; sample++)
1533 			{
1534 				s += sSliceB;
1535 				color = readFloat4(s, state);
1536 
1537 				if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1538 				{
1539 					ApplyScaleAndClamp(color, state);
1540 					preScaled = true;
1541 				}
1542 				accum += color;
1543 			}
1544 			color = accum * Float4(1.0f / static_cast<float>(state.srcSamples));
1545 		}
1546 	}
1547 	else  // Bilinear filtering
1548 	{
1549 		Float X = x;
1550 		Float Y = y;
1551 		Float Z = z;
1552 
1553 		if(state.clampToEdge)
1554 		{
1555 			X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
1556 			Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
1557 			Z = Min(Max(z, 0.5f), Float(sDepth) - 0.5f);
1558 		}
1559 
1560 		Float x0 = X - 0.5f;
1561 		Float y0 = Y - 0.5f;
1562 		Float z0 = Z - 0.5f;
1563 
1564 		Int X0 = Max(Int(x0), 0);
1565 		Int Y0 = Max(Int(y0), 0);
1566 		Int Z0 = Max(Int(z0), 0);
1567 
1568 		Int X1 = X0 + 1;
1569 		Int Y1 = Y0 + 1;
1570 		X1 = IfThenElse(X1 >= sWidth, X0, X1);
1571 		Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);
1572 
1573 		if(state.filter3D)
1574 		{
1575 			Int Z1 = Z0 + 1;
1576 			Z1 = IfThenElse(Z1 >= sHeight, Z0, Z1);
1577 
1578 			Pointer<Byte> s000 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
1579 			Pointer<Byte> s010 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
1580 			Pointer<Byte> s100 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
1581 			Pointer<Byte> s110 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);
1582 			Pointer<Byte> s001 = source + ComputeOffset(X0, Y0, Z1, sSliceB, sPitchB, srcBytes);
1583 			Pointer<Byte> s011 = source + ComputeOffset(X1, Y0, Z1, sSliceB, sPitchB, srcBytes);
1584 			Pointer<Byte> s101 = source + ComputeOffset(X0, Y1, Z1, sSliceB, sPitchB, srcBytes);
1585 			Pointer<Byte> s111 = source + ComputeOffset(X1, Y1, Z1, sSliceB, sPitchB, srcBytes);
1586 
1587 			Float4 c000 = readFloat4(s000, state);
1588 			Float4 c010 = readFloat4(s010, state);
1589 			Float4 c100 = readFloat4(s100, state);
1590 			Float4 c110 = readFloat4(s110, state);
1591 			Float4 c001 = readFloat4(s001, state);
1592 			Float4 c011 = readFloat4(s011, state);
1593 			Float4 c101 = readFloat4(s101, state);
1594 			Float4 c111 = readFloat4(s111, state);
1595 
1596 			if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1597 			{
1598 				ApplyScaleAndClamp(c000, state);
1599 				ApplyScaleAndClamp(c010, state);
1600 				ApplyScaleAndClamp(c100, state);
1601 				ApplyScaleAndClamp(c110, state);
1602 				ApplyScaleAndClamp(c001, state);
1603 				ApplyScaleAndClamp(c011, state);
1604 				ApplyScaleAndClamp(c101, state);
1605 				ApplyScaleAndClamp(c111, state);
1606 				preScaled = true;
1607 			}
1608 
1609 			Float4 fx = Float4(x0 - Float(X0));
1610 			Float4 fy = Float4(y0 - Float(Y0));
1611 			Float4 fz = Float4(z0 - Float(Z0));
1612 			Float4 ix = Float4(1.0f) - fx;
1613 			Float4 iy = Float4(1.0f) - fy;
1614 			Float4 iz = Float4(1.0f) - fz;
1615 
1616 			color = ((c000 * ix + c010 * fx) * iy +
1617 			         (c100 * ix + c110 * fx) * fy) *
1618 			            iz +
1619 			        ((c001 * ix + c011 * fx) * iy +
1620 			         (c101 * ix + c111 * fx) * fy) *
1621 			            fz;
1622 		}
1623 		else
1624 		{
1625 			Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
1626 			Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
1627 			Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
1628 			Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);
1629 
1630 			Float4 c00 = readFloat4(s00, state);
1631 			Float4 c01 = readFloat4(s01, state);
1632 			Float4 c10 = readFloat4(s10, state);
1633 			Float4 c11 = readFloat4(s11, state);
1634 
1635 			if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat())  // sRGB -> RGB
1636 			{
1637 				ApplyScaleAndClamp(c00, state);
1638 				ApplyScaleAndClamp(c01, state);
1639 				ApplyScaleAndClamp(c10, state);
1640 				ApplyScaleAndClamp(c11, state);
1641 				preScaled = true;
1642 			}
1643 
1644 			Float4 fx = Float4(x0 - Float(X0));
1645 			Float4 fy = Float4(y0 - Float(Y0));
1646 			Float4 ix = Float4(1.0f) - fx;
1647 			Float4 iy = Float4(1.0f) - fy;
1648 
1649 			color = (c00 * ix + c01 * fx) * iy +
1650 			        (c10 * ix + c11 * fx) * fy;
1651 		}
1652 	}
1653 
1654 	ApplyScaleAndClamp(color, state, preScaled);
1655 
1656 	return color;
1657 }
1658 
generate(const State & state)1659 Blitter::BlitRoutineType Blitter::generate(const State &state)
1660 {
1661 	BlitFunction function;
1662 	{
1663 		Pointer<Byte> blit(function.Arg<0>());
1664 
1665 		Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, source));
1666 		Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, dest));
1667 		Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData, sPitchB));
1668 		Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData, dPitchB));
1669 		Int sSliceB = *Pointer<Int>(blit + OFFSET(BlitData, sSliceB));
1670 		Int dSliceB = *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
1671 
1672 		Float x0 = *Pointer<Float>(blit + OFFSET(BlitData, x0));
1673 		Float y0 = *Pointer<Float>(blit + OFFSET(BlitData, y0));
1674 		Float z0 = *Pointer<Float>(blit + OFFSET(BlitData, z0));
1675 		Float w = *Pointer<Float>(blit + OFFSET(BlitData, w));
1676 		Float h = *Pointer<Float>(blit + OFFSET(BlitData, h));
1677 		Float d = *Pointer<Float>(blit + OFFSET(BlitData, d));
1678 
1679 		Int x0d = *Pointer<Int>(blit + OFFSET(BlitData, x0d));
1680 		Int x1d = *Pointer<Int>(blit + OFFSET(BlitData, x1d));
1681 		Int y0d = *Pointer<Int>(blit + OFFSET(BlitData, y0d));
1682 		Int y1d = *Pointer<Int>(blit + OFFSET(BlitData, y1d));
1683 		Int z0d = *Pointer<Int>(blit + OFFSET(BlitData, z0d));
1684 		Int z1d = *Pointer<Int>(blit + OFFSET(BlitData, z1d));
1685 
1686 		Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData, sWidth));
1687 		Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData, sHeight));
1688 		Int sDepth = *Pointer<Int>(blit + OFFSET(BlitData, sDepth));
1689 
1690 		bool intSrc = state.sourceFormat.isUnnormalizedInteger();
1691 		bool intDst = state.destFormat.isUnnormalizedInteger();
1692 		bool intBoth = intSrc && intDst;
1693 		int srcBytes = state.sourceFormat.bytes();
1694 		int dstBytes = state.destFormat.bytes();
1695 
1696 		bool hasConstantColorI = false;
1697 		Int4 constantColorI;
1698 		bool hasConstantColorF = false;
1699 		Float4 constantColorF;
1700 		if(state.clearOperation)
1701 		{
1702 			if(intBoth)  // Integer types
1703 			{
1704 				constantColorI = readInt4(source, state);
1705 				hasConstantColorI = true;
1706 			}
1707 			else
1708 			{
1709 				constantColorF = readFloat4(source, state);
1710 				hasConstantColorF = true;
1711 
1712 				ApplyScaleAndClamp(constantColorF, state);
1713 			}
1714 		}
1715 
1716 		For(Int k = z0d, k < z1d, k++)
1717 		{
1718 			Float z = state.clearOperation ? RValue<Float>(z0) : z0 + Float(k) * d;
1719 			Pointer<Byte> destSlice = dest + k * dSliceB;
1720 
1721 			For(Int j = y0d, j < y1d, j++)
1722 			{
1723 				Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
1724 				Pointer<Byte> destLine = destSlice + j * dPitchB;
1725 
1726 				For(Int i = x0d, i < x1d, i++)
1727 				{
1728 					Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
1729 					Pointer<Byte> d = destLine + i * dstBytes;
1730 
1731 					if(hasConstantColorI)
1732 					{
1733 						for(int s = 0; s < state.destSamples; s++)
1734 						{
1735 							write(constantColorI, d, state);
1736 
1737 							d += dSliceB;
1738 						}
1739 					}
1740 					else if(hasConstantColorF)
1741 					{
1742 						for(int s = 0; s < state.destSamples; s++)
1743 						{
1744 							write(constantColorF, d, state);
1745 
1746 							d += dSliceB;
1747 						}
1748 					}
1749 					else if(intBoth)  // Integer types do not support filtering
1750 					{
1751 						Int X = Int(x);
1752 						Int Y = Int(y);
1753 						Int Z = Int(z);
1754 
1755 						if(state.clampToEdge)
1756 						{
1757 							X = Clamp(X, 0, sWidth - 1);
1758 							Y = Clamp(Y, 0, sHeight - 1);
1759 							Z = Clamp(Z, 0, sDepth - 1);
1760 						}
1761 
1762 						Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);
1763 
1764 						// When both formats are true integer types, we don't go to float to avoid losing precision
1765 						Int4 color = readInt4(s, state);
1766 						for(int s = 0; s < state.destSamples; s++)
1767 						{
1768 							write(color, d, state);
1769 
1770 							d += dSliceB;
1771 						}
1772 					}
1773 					else
1774 					{
1775 						Float4 color = sample(source, x, y, z, sWidth, sHeight, sDepth, sSliceB, sPitchB, state);
1776 
1777 						for(int s = 0; s < state.destSamples; s++)
1778 						{
1779 							write(color, d, state);
1780 
1781 							d += dSliceB;
1782 						}
1783 					}
1784 				}
1785 			}
1786 		}
1787 	}
1788 
1789 	return function("BlitRoutine");
1790 }
1791 
getBlitRoutine(const State & state)1792 Blitter::BlitRoutineType Blitter::getBlitRoutine(const State &state)
1793 {
1794 	marl::lock lock(blitMutex);
1795 	auto blitRoutine = blitCache.lookup(state);
1796 
1797 	if(!blitRoutine)
1798 	{
1799 		blitRoutine = generate(state);
1800 		blitCache.add(state, blitRoutine);
1801 	}
1802 
1803 	return blitRoutine;
1804 }
1805 
getCornerUpdateRoutine(const State & state)1806 Blitter::CornerUpdateRoutineType Blitter::getCornerUpdateRoutine(const State &state)
1807 {
1808 	marl::lock lock(cornerUpdateMutex);
1809 	auto cornerUpdateRoutine = cornerUpdateCache.lookup(state);
1810 
1811 	if(!cornerUpdateRoutine)
1812 	{
1813 		cornerUpdateRoutine = generateCornerUpdate(state);
1814 		cornerUpdateCache.add(state, cornerUpdateRoutine);
1815 	}
1816 
1817 	return cornerUpdateRoutine;
1818 }
1819 
// Performs a (possibly scaled, mirrored, and filtered) copy of 'region' from
// 'src' to 'dst' by obtaining a blit routine specialized for the
// format/sample-count/filter combination and running it once per array layer.
void Blitter::blit(const vk::Image *src, vk::Image *dst, VkImageBlit2KHR region, VkFilter filter)
{
	ASSERT(src->getFormat() != VK_FORMAT_UNDEFINED);
	ASSERT(dst->getFormat() != VK_FORMAT_UNDEFINED);

	// Vulkan 1.2 section 18.5. Image Copies with Scaling:
	// "The layerCount member of srcSubresource and dstSubresource must match"
	// "The aspectMask member of srcSubresource and dstSubresource must match"
	ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
	ASSERT(region.srcSubresource.aspectMask == region.dstSubresource.aspectMask);

	// Normalize the destination rectangle so it ascends on each axis, flipping
	// the source rectangle alongside it so any mirroring is preserved.
	if(region.dstOffsets[0].x > region.dstOffsets[1].x)
	{
		std::swap(region.srcOffsets[0].x, region.srcOffsets[1].x);
		std::swap(region.dstOffsets[0].x, region.dstOffsets[1].x);
	}

	if(region.dstOffsets[0].y > region.dstOffsets[1].y)
	{
		std::swap(region.srcOffsets[0].y, region.srcOffsets[1].y);
		std::swap(region.dstOffsets[0].y, region.dstOffsets[1].y);
	}

	if(region.dstOffsets[0].z > region.dstOffsets[1].z)
	{
		std::swap(region.srcOffsets[0].z, region.srcOffsets[1].z);
		std::swap(region.dstOffsets[0].z, region.dstOffsets[1].z);
	}

	VkImageAspectFlagBits srcAspect = static_cast<VkImageAspectFlagBits>(region.srcSubresource.aspectMask);
	VkImageAspectFlagBits dstAspect = static_cast<VkImageAspectFlagBits>(region.dstSubresource.aspectMask);
	VkExtent3D srcExtent = src->getMipLevelExtent(srcAspect, region.srcSubresource.mipLevel);

	// Source texels consumed per destination texel, along each axis
	// (negative when the blit mirrors that axis).
	float widthRatio = static_cast<float>(region.srcOffsets[1].x - region.srcOffsets[0].x) /
	                   static_cast<float>(region.dstOffsets[1].x - region.dstOffsets[0].x);
	float heightRatio = static_cast<float>(region.srcOffsets[1].y - region.srcOffsets[0].y) /
	                    static_cast<float>(region.dstOffsets[1].y - region.dstOffsets[0].y);
	float depthRatio = static_cast<float>(region.srcOffsets[1].z - region.srcOffsets[0].z) /
	                   static_cast<float>(region.dstOffsets[1].z - region.dstOffsets[0].z);
	// Source coordinate corresponding to the *center* of destination texel 0
	// (hence the 0.5 texel-center offset).
	float x0 = region.srcOffsets[0].x + (0.5f - region.dstOffsets[0].x) * widthRatio;
	float y0 = region.srcOffsets[0].y + (0.5f - region.dstOffsets[0].y) * heightRatio;
	float z0 = region.srcOffsets[0].z + (0.5f - region.dstOffsets[0].z) * depthRatio;

	auto srcFormat = src->getFormat(srcAspect);
	auto dstFormat = dst->getFormat(dstAspect);

	bool doFilter = (filter != VK_FILTER_NEAREST);
	// sRGB encode/decode is needed whenever filtering or multisampled reads
	// mix texel values, or when the sRGB-ness of source and dest differ.
	bool allowSRGBConversion =
	    doFilter ||
	    (src->getSampleCount() > 1) ||
	    (srcFormat.isSRGBformat() != dstFormat.isSRGBformat());

	State state(srcFormat, dstFormat, src->getSampleCount(), dst->getSampleCount(),
	            Options{ doFilter, allowSRGBConversion });
	// Clamp reads when the source rectangle reaches outside the image, or when
	// filtering would sample past the left/top edge of the first texel.
	state.clampToEdge = (region.srcOffsets[0].x < 0) ||
	                    (region.srcOffsets[0].y < 0) ||
	                    (static_cast<uint32_t>(region.srcOffsets[1].x) > srcExtent.width) ||
	                    (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
	                    (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
	// 3D filtering is only needed when the blit rescales along Z.
	state.filter3D = (region.srcOffsets[1].z - region.srcOffsets[0].z) !=
	                 (region.dstOffsets[1].z - region.dstOffsets[0].z);

	auto blitRoutine = getBlitRoutine(state);
	if(!blitRoutine)
	{
		return;
	}

	BlitData data = {
		nullptr,                                                                                 // source
		nullptr,                                                                                 // dest
		assert_cast<uint32_t>(src->rowPitchBytes(srcAspect, region.srcSubresource.mipLevel)),    // sPitchB
		assert_cast<uint32_t>(dst->rowPitchBytes(dstAspect, region.dstSubresource.mipLevel)),    // dPitchB
		assert_cast<uint32_t>(src->slicePitchBytes(srcAspect, region.srcSubresource.mipLevel)),  // sSliceB
		assert_cast<uint32_t>(dst->slicePitchBytes(dstAspect, region.dstSubresource.mipLevel)),  // dSliceB

		x0,
		y0,
		z0,
		widthRatio,
		heightRatio,
		depthRatio,

		region.dstOffsets[0].x,  // x0d
		region.dstOffsets[1].x,  // x1d
		region.dstOffsets[0].y,  // y0d
		region.dstOffsets[1].y,  // y1d
		region.dstOffsets[0].z,  // z0d
		region.dstOffsets[1].z,  // z1d

		static_cast<int>(srcExtent.width),   // sWidth
		static_cast<int>(srcExtent.height),  // sHeight
		static_cast<int>(srcExtent.depth),   // sDepth

		// NOTE(review): state.filter3D (a compile-time routine parameter above)
		// carries the actual flag; this runtime field is left false — confirm
		// it is unused by the generated routine.
		false,  // filter3D
	};

	VkImageSubresource srcSubres = {
		region.srcSubresource.aspectMask,
		region.srcSubresource.mipLevel,
		region.srcSubresource.baseArrayLayer
	};

	VkImageSubresource dstSubres = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		1,  // levelCount
		region.dstSubresource.baseArrayLayer,
		region.dstSubresource.layerCount
	};

	VkImageSubresourceRange dstSubresRange = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		1,  // levelCount
		region.dstSubresource.baseArrayLayer,
		region.dstSubresource.layerCount
	};

	// NOTE(review): queries 'src' with the *dst* subresource range; the layer
	// counts are asserted equal above so the result should match — confirm intent.
	uint32_t lastLayer = src->getLastLayerIndex(dstSubresRange);

	// Run the routine once per array layer.
	for(; dstSubres.arrayLayer <= lastLayer; srcSubres.arrayLayer++, dstSubres.arrayLayer++)
	{
		data.source = src->getTexelPointer({ 0, 0, 0 }, srcSubres);
		data.dest = dst->getTexelPointer({ 0, 0, 0 }, dstSubres);

		ASSERT(data.source < src->end());
		ASSERT(data.dest < dst->end());

		blitRoutine(&data);
	}

	dst->contentsChanged(dstSubresRange);
}
1952 
// Resolves the depth aspect of multisampled attachment 'src' into 'dst'.
// Only VK_RESOLVE_MODE_SAMPLE_ZERO_BIT is implemented: the first sample plane
// is copied row by row. VK_RESOLVE_MODE_NONE is a no-op.
static void resolveDepth(const vk::ImageView *src, vk::ImageView *dst, const VkResolveModeFlagBits depthResolveMode)
{
	if(depthResolveMode == VK_RESOLVE_MODE_NONE)
	{
		return;
	}

	vk::Format format = src->getFormat(VK_IMAGE_ASPECT_DEPTH_BIT);
	VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_DEPTH_BIT);
	int width = extent.width;
	int height = extent.height;
	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);

	// To support other resolve modes, get the slice bytes and get a pointer to each sample plane.
	// Then modify the loop below to include logic for handling each new mode.
	uint8_t *source = (uint8_t *)src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);
	uint8_t *dest = (uint8_t *)dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);

	size_t formatSize = format.bytes();
	// TODO(b/167558951) support other resolve modes.
	ASSERT(depthResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
	for(int y = 0; y < height; y++)
	{
		memcpy(dest, source, formatSize * width);

		source += pitch;
		// NOTE(review): dest is advanced by the *source* row pitch; this
		// assumes src and dst depth row pitches match — confirm.
		dest += pitch;
	}

	dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
}
1984 
resolveStencil(const vk::ImageView * src,vk::ImageView * dst,const VkResolveModeFlagBits stencilResolveMode)1985 static void resolveStencil(const vk::ImageView *src, vk::ImageView *dst, const VkResolveModeFlagBits stencilResolveMode)
1986 {
1987 	if(stencilResolveMode == VK_RESOLVE_MODE_NONE)
1988 	{
1989 		return;
1990 	}
1991 
1992 	VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_STENCIL_BIT);
1993 	int width = extent.width;
1994 	int height = extent.height;
1995 	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
1996 
1997 	// To support other resolve modes, use src->slicePitchBytes() and get a pointer to each sample's slice.
1998 	// Then modify the loop below to include logic for handling each new mode.
1999 	uint8_t *source = reinterpret_cast<uint8_t *>(src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));
2000 	uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));
2001 
2002 	// TODO(b/167558951) support other resolve modes.
2003 	ASSERT(stencilResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
2004 	for(int y = 0; y < height; y++)
2005 	{
2006 		// Stencil is always 8 bits, so the width of the resource we're resolving is
2007 		// the number of bytes in each row we need to copy during for SAMPLE_ZERO
2008 		memcpy(dest, source, width);
2009 
2010 		source += pitch;
2011 		dest += pitch;
2012 	}
2013 
2014 	dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
2015 }
2016 
resolveDepthStencil(const vk::ImageView * src,vk::ImageView * dst,VkResolveModeFlagBits depthResolveMode,VkResolveModeFlagBits stencilResolveMode)2017 void Blitter::resolveDepthStencil(const vk::ImageView *src, vk::ImageView *dst, VkResolveModeFlagBits depthResolveMode, VkResolveModeFlagBits stencilResolveMode)
2018 {
2019 	VkImageSubresourceRange srcRange = src->getSubresourceRange();
2020 	VkImageSubresourceRange dstRange = src->getSubresourceRange();
2021 	ASSERT(src->getFormat() == dst->getFormat());
2022 	ASSERT(srcRange.layerCount == 1 && dstRange.layerCount == 1);
2023 	ASSERT(srcRange.aspectMask == dstRange.aspectMask);
2024 
2025 	if(srcRange.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
2026 	{
2027 		resolveDepth(src, dst, depthResolveMode);
2028 	}
2029 	if(srcRange.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
2030 	{
2031 		resolveStencil(src, dst, stencilResolveMode);
2032 	}
2033 }
2034 
resolve(const vk::Image * src,vk::Image * dst,VkImageResolve2KHR region)2035 void Blitter::resolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
2036 {
2037 	// "The aspectMask member of srcSubresource and dstSubresource must only contain VK_IMAGE_ASPECT_COLOR_BIT"
2038 	ASSERT(region.srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
2039 	ASSERT(region.dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
2040 	// "The layerCount member of srcSubresource and dstSubresource must match"
2041 	ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
2042 
2043 	// We use this method both for explicit resolves from vkCmdResolveImage, and implicit ones for resolve attachments.
2044 	// - vkCmdResolveImage: "srcImage and dstImage must have been created with the same image format."
2045 	// - VkSubpassDescription: "each resolve attachment that is not VK_ATTACHMENT_UNUSED must have the same VkFormat as its corresponding color attachment."
2046 	ASSERT(src->getFormat() == dst->getFormat());
2047 
2048 	if(fastResolve(src, dst, region))
2049 	{
2050 		return;
2051 	}
2052 
2053 	// Fall back to a generic blit which performs the resolve.
2054 	VkImageBlit2KHR blitRegion;
2055 	blitRegion.sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR;
2056 	blitRegion.pNext = nullptr;
2057 
2058 	blitRegion.srcOffsets[0] = blitRegion.srcOffsets[1] = region.srcOffset;
2059 	blitRegion.srcOffsets[1].x += region.extent.width;
2060 	blitRegion.srcOffsets[1].y += region.extent.height;
2061 	blitRegion.srcOffsets[1].z += region.extent.depth;
2062 
2063 	blitRegion.dstOffsets[0] = blitRegion.dstOffsets[1] = region.dstOffset;
2064 	blitRegion.dstOffsets[1].x += region.extent.width;
2065 	blitRegion.dstOffsets[1].y += region.extent.height;
2066 	blitRegion.dstOffsets[1].z += region.extent.depth;
2067 
2068 	blitRegion.srcSubresource = region.srcSubresource;
2069 	blitRegion.dstSubresource = region.dstSubresource;
2070 
2071 	blit(src, dst, blitRegion, VK_FILTER_NEAREST);
2072 }
2073 
// Per-byte rounding-up average of two packed 8-bit quads: each byte lane
// becomes (a + b + 1) / 2 with no carry between lanes (matching the rounding
// of SSE2's _mm_avg_epu8).
static inline uint32_t averageByte4(uint32_t x, uint32_t y)
{
	const uint32_t common = x & y;                       // bits present in both inputs
	const uint32_t diff = x ^ y;                         // bits present in exactly one
	const uint32_t halfDiff = (diff >> 1) & 0x7F7F7F7F;  // per-lane diff / 2, masking cross-lane borrow
	const uint32_t roundUp = diff & 0x01010101;          // +1 where the lane sum is odd

	return common + halfDiff + roundUp;
}
2078 
// Attempts a specialized CPU resolve for the common case: a whole-image,
// single-layer, 4-sample resolve of an 8-bit-per-channel RGBA/BGRA format.
// Returns false without touching the images when any precondition fails, so
// the caller can fall back to the generic blit path.
bool Blitter::fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
{
	// Only full-image resolves starting at the origin are handled here.
	if(region.dstOffset != VkOffset3D{ 0, 0, 0 })
	{
		return false;
	}

	if(region.srcOffset != VkOffset3D{ 0, 0, 0 })
	{
		return false;
	}

	if(region.srcSubresource.layerCount != 1)
	{
		return false;
	}

	if(region.extent != src->getExtent() ||
	   region.extent != dst->getExtent() ||
	   region.extent.depth != 1)
	{
		return false;
	}

	VkImageSubresource srcSubresource = {
		region.srcSubresource.aspectMask,
		region.srcSubresource.mipLevel,
		region.srcSubresource.baseArrayLayer
	};

	VkImageSubresource dstSubresource = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		region.dstSubresource.baseArrayLayer
	};

	// Range used to notify the destination of modified contents at the end.
	VkImageSubresourceRange dstSubresourceRange = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		1,  // levelCount
		region.dstSubresource.baseArrayLayer,
		region.dstSubresource.layerCount
	};

	void *source = src->getTexelPointer({ 0, 0, 0 }, srcSubresource);
	uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getTexelPointer({ 0, 0, 0 }, dstSubresource));

	auto format = src->getFormat();
	auto samples = src->getSampleCount();
	auto extent = src->getExtent();

	int width = extent.width;
	int height = extent.height;
	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);
	int slice = src->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);

	// Samples are stored one slice pitch apart; take a pointer to each of the
	// four sample planes.
	uint8_t *source0 = (uint8_t *)source;
	uint8_t *source1 = source0 + slice;
	uint8_t *source2 = source1 + slice;
	uint8_t *source3 = source2 + slice;

	// Unused on non-x86 builds, where the SSE2 path below is compiled out.
	[[maybe_unused]] const bool SSE2 = CPUID::supportsSSE2();

	if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
	{
		if(samples == 4)
		{
			for(int y = 0; y < height; y++)
			{
				int x = 0;

#if defined(__i386__) || defined(__x86_64__)
				if(SSE2)
				{
					// Four pixels per iteration. Averaging pairwise
					// ((s0,s1), (s2,s3), then the two results) keeps all
					// intermediates within 8 bits; _mm_avg_epu8 rounds up.
					for(; (x + 3) < width; x += 4)
					{
						__m128i c0 = _mm_loadu_si128((__m128i *)(source0 + 4 * x));
						__m128i c1 = _mm_loadu_si128((__m128i *)(source1 + 4 * x));
						__m128i c2 = _mm_loadu_si128((__m128i *)(source2 + 4 * x));
						__m128i c3 = _mm_loadu_si128((__m128i *)(source3 + 4 * x));

						c0 = _mm_avg_epu8(c0, c1);
						c2 = _mm_avg_epu8(c2, c3);
						c0 = _mm_avg_epu8(c0, c2);

						_mm_storeu_si128((__m128i *)(dest + 4 * x), c0);
					}
				}
#endif

				// Scalar tail (and full path without SSE2): averageByte4 is a
				// branch-free per-byte rounding-up average, mirroring the SIMD
				// pairwise reduction above.
				for(; x < width; x++)
				{
					uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
					uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
					uint32_t c2 = *(uint32_t *)(source2 + 4 * x);
					uint32_t c3 = *(uint32_t *)(source3 + 4 * x);

					uint32_t c01 = averageByte4(c0, c1);
					uint32_t c23 = averageByte4(c2, c3);
					uint32_t c03 = averageByte4(c01, c23);

					*(uint32_t *)(dest + 4 * x) = c03;
				}

				source0 += pitch;
				source1 += pitch;
				source2 += pitch;
				source3 += pitch;
				dest += pitch;

				ASSERT(source0 < src->end());
				ASSERT(source3 < src->end());
				ASSERT(dest < dst->end());
			}
		}
		else
			UNSUPPORTED("Samples: %d", samples);
	}
	else
	{
		return false;
	}

	dst->contentsChanged(dstSubresourceRange);

	return true;
}
2206 
copy(const vk::Image * src,uint8_t * dst,unsigned int dstPitch)2207 void Blitter::copy(const vk::Image *src, uint8_t *dst, unsigned int dstPitch)
2208 {
2209 	VkExtent3D extent = src->getExtent();
2210 	size_t rowBytes = src->getFormat(VK_IMAGE_ASPECT_COLOR_BIT).bytes() * extent.width;
2211 	unsigned int srcPitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
2212 	ASSERT(dstPitch >= rowBytes && srcPitch >= rowBytes && src->getMipLevelExtent(VK_IMAGE_ASPECT_COLOR_BIT, 0).height >= extent.height);
2213 
2214 	const uint8_t *s = (uint8_t *)src->getTexelPointer({ 0, 0, 0 }, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0 });
2215 	uint8_t *d = dst;
2216 
2217 	for(uint32_t y = 0; y < extent.height; y++)
2218 	{
2219 		memcpy(d, s, rowBytes);
2220 
2221 		s += srcPitch;
2222 		d += dstPitch;
2223 	}
2224 }
2225 
computeCubeCorner(Pointer<Byte> & layer,Int & x0,Int & x1,Int & y0,Int & y1,Int & pitchB,const State & state)2226 void Blitter::computeCubeCorner(Pointer<Byte> &layer, Int &x0, Int &x1, Int &y0, Int &y1, Int &pitchB, const State &state)
2227 {
2228 	int bytes = state.sourceFormat.bytes();
2229 
2230 	Float4 c = readFloat4(layer + ComputeOffset(x0, y1, pitchB, bytes), state) +
2231 	           readFloat4(layer + ComputeOffset(x1, y0, pitchB, bytes), state) +
2232 	           readFloat4(layer + ComputeOffset(x1, y1, pitchB, bytes), state);
2233 
2234 	c *= Float4(1.0f / 3.0f);
2235 
2236 	write(c, layer + ComputeOffset(x0, y0, pitchB, bytes), state);
2237 }
2238 
// JIT-compiles a routine that recomputes the four border-corner texels of all
// six faces of a cube map, given a CubeBorderData* argument.
Blitter::CornerUpdateRoutineType Blitter::generateCornerUpdate(const State &state)
{
	// Reading and writing from/to the same image
	ASSERT(state.sourceFormat == state.destFormat);
	ASSERT(state.srcSamples == state.destSamples);

	// Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
	// VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
	ASSERT(state.srcSamples == 1);

	CornerUpdateFunction function;
	{
		Pointer<Byte> blit(function.Arg<0>());

		// Unpack the CubeBorderData parameter block.
		Pointer<Byte> layers = *Pointer<Pointer<Byte>>(blit + OFFSET(CubeBorderData, layers));
		Int pitchB = *Pointer<Int>(blit + OFFSET(CubeBorderData, pitchB));
		UInt layerSize = *Pointer<Int>(blit + OFFSET(CubeBorderData, layerSize));
		UInt dim = *Pointer<Int>(blit + OFFSET(CubeBorderData, dim));

		// Low Border, Low Pixel, High Border, High Pixel
		Int LB(-1), LP(0), HB(dim), HP(dim - 1);

		// Update all four corners of each of the six cube faces, stepping one
		// layer at a time.
		for(int face = 0; face < 6; face++)
		{
			computeCubeCorner(layers, LB, LP, LB, LP, pitchB, state);
			computeCubeCorner(layers, LB, LP, HB, HP, pitchB, state);
			computeCubeCorner(layers, HB, HP, LB, LP, pitchB, state);
			computeCubeCorner(layers, HB, HP, HB, HP, pitchB, state);
			layers = layers + layerSize;
		}
	}

	// NOTE(review): the routine label "BlitRoutine" is shared with the blit
	// path — presumably just a debug name; confirm before renaming.
	return function("BlitRoutine");
}
2273 
// Refreshes the one-texel border of all six faces of a cube map (starting at
// subresource.arrayLayer) so that seamless filtering across face edges works:
// each face's border rows/columns are copied from the adjacent face, then the
// four corner texels of every face are recomputed as neighbor averages.
void Blitter::updateBorders(const vk::Image *image, const VkImageSubresource &subresource)
{
	ASSERT(image->getArrayLayers() >= (subresource.arrayLayer + 6));

	// From Vulkan 1.1 spec, section 11.5. Image Views:
	// "For cube and cube array image views, the layers of the image view starting
	//  at baseArrayLayer correspond to faces in the order +X, -X, +Y, -Y, +Z, -Z."
	VkImageSubresource posX = subresource;
	VkImageSubresource negX = posX;
	negX.arrayLayer++;
	VkImageSubresource posY = negX;
	posY.arrayLayer++;
	VkImageSubresource negY = posY;
	negY.arrayLayer++;
	VkImageSubresource posZ = negY;
	posZ.arrayLayer++;
	VkImageSubresource negZ = posZ;
	negZ.arrayLayer++;

	// Copy top / bottom
	copyCubeEdge(image, posX, BOTTOM, negY, RIGHT);
	copyCubeEdge(image, posY, BOTTOM, posZ, TOP);
	copyCubeEdge(image, posZ, BOTTOM, negY, TOP);
	copyCubeEdge(image, negX, BOTTOM, negY, LEFT);
	copyCubeEdge(image, negY, BOTTOM, negZ, BOTTOM);
	copyCubeEdge(image, negZ, BOTTOM, negY, BOTTOM);

	copyCubeEdge(image, posX, TOP, posY, RIGHT);
	copyCubeEdge(image, posY, TOP, negZ, TOP);
	copyCubeEdge(image, posZ, TOP, posY, BOTTOM);
	copyCubeEdge(image, negX, TOP, posY, LEFT);
	copyCubeEdge(image, negY, TOP, posZ, BOTTOM);
	copyCubeEdge(image, negZ, TOP, posY, TOP);

	// Copy left / right
	copyCubeEdge(image, posX, RIGHT, negZ, LEFT);
	copyCubeEdge(image, posY, RIGHT, posX, TOP);
	copyCubeEdge(image, posZ, RIGHT, posX, LEFT);
	copyCubeEdge(image, negX, RIGHT, posZ, LEFT);
	copyCubeEdge(image, negY, RIGHT, posX, BOTTOM);
	copyCubeEdge(image, negZ, RIGHT, negX, LEFT);

	copyCubeEdge(image, posX, LEFT, posZ, RIGHT);
	copyCubeEdge(image, posY, LEFT, negX, TOP);
	copyCubeEdge(image, posZ, LEFT, negX, RIGHT);
	copyCubeEdge(image, negX, LEFT, negZ, RIGHT);
	copyCubeEdge(image, negY, LEFT, negX, BOTTOM);
	copyCubeEdge(image, negZ, LEFT, posX, RIGHT);

	// Compute corner colors
	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
	vk::Format format = image->getFormat(aspect);
	VkSampleCountFlagBits samples = image->getSampleCount();
	// NOTE(review): Options constructed from raw mask 0xF — presumably a full
	// RGBA write mask; confirm against the Options constructor.
	State state(format, format, samples, samples, Options{ 0xF });

	// Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
	// VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
	ASSERT(samples == VK_SAMPLE_COUNT_1_BIT);

	auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
	if(!cornerUpdateRoutine)
	{
		return;
	}

	// Run the corner-update routine over all six faces, starting at the +X layer.
	VkExtent3D extent = image->getMipLevelExtent(aspect, subresource.mipLevel);
	CubeBorderData data = {
		image->getTexelPointer({ 0, 0, 0 }, posX),
		assert_cast<uint32_t>(image->rowPitchBytes(aspect, subresource.mipLevel)),
		assert_cast<uint32_t>(image->getLayerSize(aspect)),
		extent.width
	};
	cornerUpdateRoutine(&data);
}
2348 
// Copies one edge (row or column) of texels from the face identified by
// 'srcSubresource' into the border texels just outside 'dstEdge' of the face
// identified by 'dstSubresource', reversing the traversal direction when the
// cube-face layout requires it. Corner texels are deliberately left untouched
// (they are recomputed separately by computeCubeCorner).
void Blitter::copyCubeEdge(const vk::Image *image,
                           const VkImageSubresource &dstSubresource, Edge dstEdge,
                           const VkImageSubresource &srcSubresource, Edge srcEdge)
{
	ASSERT(srcSubresource.aspectMask == dstSubresource.aspectMask);
	ASSERT(srcSubresource.mipLevel == dstSubresource.mipLevel);
	ASSERT(srcSubresource.arrayLayer != dstSubresource.arrayLayer);

	// Figure out if the edges to be copied in reverse order respectively from one another
	// The copy should be reversed whenever the same edges are contiguous or if we're
	// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
	//
	//      | +y |
	// | -x | +z | +x | -z |
	//      | -y |

	bool reverse = (srcEdge == dstEdge) ||
	               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
	               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
	               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
	               ((srcEdge == LEFT) && (dstEdge == BOTTOM));

	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(srcSubresource.aspectMask);
	int bytes = image->getFormat(aspect).bytes();
	int pitchB = image->rowPitchBytes(aspect, srcSubresource.mipLevel);

	VkExtent3D extent = image->getMipLevelExtent(aspect, srcSubresource.mipLevel);
	int w = extent.width;
	int h = extent.height;
	if(w != h)
	{
		UNSUPPORTED("Cube doesn't have square faces : (%d, %d)", w, h);
	}

	// Src is expressed in the regular [0, width-1], [0, height-1] space
	bool srcHorizontal = ((srcEdge == TOP) || (srcEdge == BOTTOM));
	// Step one texel (horizontal edge) or one row (vertical edge) per iteration.
	int srcDelta = srcHorizontal ? bytes : pitchB;
	VkOffset3D srcOffset = { (srcEdge == RIGHT) ? (w - 1) : 0, (srcEdge == BOTTOM) ? (h - 1) : 0, 0 };

	// Dst contains borders, so it is expressed in the [-1, width], [-1, height] space
	bool dstHorizontal = ((dstEdge == TOP) || (dstEdge == BOTTOM));
	// Negative delta walks the destination edge backwards for reversed copies.
	int dstDelta = (dstHorizontal ? bytes : pitchB) * (reverse ? -1 : 1);
	VkOffset3D dstOffset = { (dstEdge == RIGHT) ? w : -1, (dstEdge == BOTTOM) ? h : -1, 0 };

	// Don't write in the corners
	if(dstHorizontal)
	{
		dstOffset.x += reverse ? w : 1;
	}
	else
	{
		dstOffset.y += reverse ? h : 1;
	}

	const uint8_t *src = static_cast<const uint8_t *>(image->getTexelPointer(srcOffset, srcSubresource));
	uint8_t *dst = static_cast<uint8_t *>(image->getTexelPointer(dstOffset, dstSubresource));
	ASSERT((src < image->end()) && ((src + (w * srcDelta)) < image->end()));
	ASSERT((dst < image->end()) && ((dst + (w * dstDelta)) < image->end()));

	// Copy w texels, one at a time (src and dst strides generally differ).
	for(int i = 0; i < w; ++i, dst += dstDelta, src += srcDelta)
	{
		memcpy(dst, src, bytes);
	}
}
2413 
2414 }  // namespace sw
2415