1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "Blitter.hpp"
16
17 #include "Pipeline/ShaderCore.hpp"
18 #include "Reactor/Reactor.hpp"
19 #include "System/CPUID.hpp"
20 #include "System/Debug.hpp"
21 #include "System/Half.hpp"
22 #include "System/Memory.hpp"
23 #include "Vulkan/VkImage.hpp"
24 #include "Vulkan/VkImageView.hpp"
25
26 #include <utility>
27
28 #if defined(__i386__) || defined(__x86_64__)
29 # include <xmmintrin.h>
30 # include <emmintrin.h>
31 #endif
32
33 namespace sw {
34
// Packs the four lanes of |ints| into a single 32-bit integer by shifting
// each lane left by its corresponding entry in |shifts| and OR-ing the
// shifted lanes together. |shifts| gives each component's bit offset.
static rr::RValue<rr::Int> PackFields(const rr::Int4 &ints, const sw::int4 shifts)
{
	rr::Int packed = rr::Int(ints.x) << shifts[0];
	packed = packed | (rr::Int(ints.y) << shifts[1]);
	packed = packed | (rr::Int(ints.z) << shifts[2]);
	packed = packed | (rr::Int(ints.w) << shifts[3]);

	return packed;
}
42
// Constructs the Blitter with empty, size-bounded routine caches.
// Blit routines are keyed on the full blit State, so that cache is large;
// corner-update routines are keyed on format alone, so a small cache suffices.
Blitter::Blitter()
    : blitMutex()
    , blitCache(1024)
    , cornerUpdateMutex()
    , cornerUpdateCache(64)  // We only need one of these per format
{
}
50
// Default destruction; the routine caches and mutexes clean themselves up.
Blitter::~Blitter()
{
}
54
// Clears the specified subresource range of |dest| to the value pointed to by
// |pixel|, which is interpreted as |format|. |viewFormat| is the format of the
// image view being cleared (its aspect-specific format selects the destination
// format). |renderArea|, when non-null, restricts the clear to a 2D rectangle
// and requires the range to cover a single mip level.
void Blitter::clear(const void *pixel, vk::Format format, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
{
	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
	vk::Format dstFormat = viewFormat.getAspectFormat(aspect);
	if(dstFormat == VK_FORMAT_UNDEFINED)
	{
		return;
	}

	// Normalized formats can only represent [0,1] (unsigned) or [-1,1] (signed),
	// so clamp the clear value into the representable range up front.
	VkClearValue clampedPixel;
	if(viewFormat.isSignedNormalized() || viewFormat.isUnsignedNormalized())
	{
		const float minValue = viewFormat.isSignedNormalized() ? -1.0f : 0.0f;

		if(aspect & VK_IMAGE_ASPECT_COLOR_BIT)
		{
			memcpy(clampedPixel.color.float32, pixel, sizeof(VkClearColorValue));
			clampedPixel.color.float32[0] = sw::clamp(clampedPixel.color.float32[0], minValue, 1.0f);
			clampedPixel.color.float32[1] = sw::clamp(clampedPixel.color.float32[1], minValue, 1.0f);
			clampedPixel.color.float32[2] = sw::clamp(clampedPixel.color.float32[2], minValue, 1.0f);
			clampedPixel.color.float32[3] = sw::clamp(clampedPixel.color.float32[3], minValue, 1.0f);
			pixel = clampedPixel.color.float32;
		}

		// Stencil never requires clamping, so we can check for Depth only
		if(aspect & VK_IMAGE_ASPECT_DEPTH_BIT)
		{
			memcpy(&(clampedPixel.depthStencil), pixel, sizeof(VkClearDepthStencilValue));
			clampedPixel.depthStencil.depth = sw::clamp(clampedPixel.depthStencil.depth, minValue, 1.0f);
			pixel = &(clampedPixel.depthStencil);
		}
	}

	// Try the memset/fill fast path first; fall back to a generated blit routine.
	if(fastClear(pixel, format, dest, dstFormat, subresourceRange, renderArea))
	{
		return;
	}

	State state(format, dstFormat, 1, dest->getSampleCount(), Options{ 0xF });
	auto blitRoutine = getBlitRoutine(state);
	if(!blitRoutine)
	{
		return;
	}

	VkImageSubresource subres = {
		subresourceRange.aspectMask,
		subresourceRange.baseMipLevel,
		subresourceRange.baseArrayLayer
	};

	uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
	uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);

	VkRect2D area = { { 0, 0 }, { 0, 0 } };
	if(renderArea)
	{
		ASSERT(subresourceRange.levelCount == 1);
		area = *renderArea;
	}

	for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
	{
		VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
		if(!renderArea)
		{
			// No render area: clear the whole mip level.
			area.extent.width = extent.width;
			area.extent.height = extent.height;
		}

		BlitData data = {
			pixel, nullptr,  // source, dest

			assert_cast<uint32_t>(format.bytes()),                                 // sPitchB
			assert_cast<uint32_t>(dest->rowPitchBytes(aspect, subres.mipLevel)),   // dPitchB
			0,                                                                     // sSliceB (unused in clear operations)
			assert_cast<uint32_t>(dest->slicePitchBytes(aspect, subres.mipLevel)), // dSliceB

			0.5f, 0.5f, 0.5f, 0.0f, 0.0f, 0.0f,  // x0, y0, z0, w, h, d

			area.offset.x, static_cast<int>(area.offset.x + area.extent.width),   // x0d, x1d
			area.offset.y, static_cast<int>(area.offset.y + area.extent.height),  // y0d, y1d
			0, 1,                                                                 // z0d, z1d

			0, 0, 0,  // sWidth, sHeight, sDepth

			false,  // filter3D
		};

		if(renderArea && dest->is3DSlice())
		{
			// Reinterpret layers as depth slices
			subres.arrayLayer = 0;
			for(uint32_t depth = subresourceRange.baseArrayLayer; depth <= lastLayer; depth++)
			{
				data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);
				blitRoutine(&data);
			}
		}
		else
		{
			// Clear every layer in the range, and every depth slice of each layer.
			for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
			{
				for(uint32_t depth = 0; depth < extent.depth; depth++)
				{
					data.dest = dest->getTexelPointer({ 0, 0, static_cast<int32_t>(depth) }, subres);

					blitRoutine(&data);
				}
			}
		}
	}
	dest->contentsChanged(subresourceRange);
}
169
fastClear(const void * clearValue,vk::Format clearFormat,vk::Image * dest,const vk::Format & viewFormat,const VkImageSubresourceRange & subresourceRange,const VkRect2D * renderArea)170 bool Blitter::fastClear(const void *clearValue, vk::Format clearFormat, vk::Image *dest, const vk::Format &viewFormat, const VkImageSubresourceRange &subresourceRange, const VkRect2D *renderArea)
171 {
172 if(clearFormat != VK_FORMAT_R32G32B32A32_SFLOAT &&
173 clearFormat != VK_FORMAT_D32_SFLOAT &&
174 clearFormat != VK_FORMAT_S8_UINT)
175 {
176 return false;
177 }
178
179 union ClearValue
180 {
181 struct
182 {
183 float r;
184 float g;
185 float b;
186 float a;
187 };
188
189 float rgb[3];
190
191 float d;
192 uint32_t d_as_u32;
193
194 uint32_t s;
195 };
196
197 const ClearValue &c = *reinterpret_cast<const ClearValue *>(clearValue);
198
199 uint32_t packed = 0;
200
201 VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresourceRange.aspectMask);
202 switch(viewFormat)
203 {
204 case VK_FORMAT_R5G6B5_UNORM_PACK16:
205 packed = ((uint16_t)(31 * c.b + 0.5f) << 0) |
206 ((uint16_t)(63 * c.g + 0.5f) << 5) |
207 ((uint16_t)(31 * c.r + 0.5f) << 11);
208 break;
209 case VK_FORMAT_B5G6R5_UNORM_PACK16:
210 packed = ((uint16_t)(31 * c.r + 0.5f) << 0) |
211 ((uint16_t)(63 * c.g + 0.5f) << 5) |
212 ((uint16_t)(31 * c.b + 0.5f) << 11);
213 break;
214 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
215 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
216 case VK_FORMAT_R8G8B8A8_UNORM:
217 packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
218 ((uint32_t)(255 * c.b + 0.5f) << 16) |
219 ((uint32_t)(255 * c.g + 0.5f) << 8) |
220 ((uint32_t)(255 * c.r + 0.5f) << 0);
221 break;
222 case VK_FORMAT_B8G8R8A8_UNORM:
223 packed = ((uint32_t)(255 * c.a + 0.5f) << 24) |
224 ((uint32_t)(255 * c.r + 0.5f) << 16) |
225 ((uint32_t)(255 * c.g + 0.5f) << 8) |
226 ((uint32_t)(255 * c.b + 0.5f) << 0);
227 break;
228 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
229 packed = R11G11B10F(c.rgb);
230 break;
231 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
232 packed = RGB9E5(c.rgb);
233 break;
234 case VK_FORMAT_D32_SFLOAT:
235 ASSERT(clearFormat == VK_FORMAT_D32_SFLOAT);
236 packed = c.d_as_u32; // float reinterpreted as uint32
237 break;
238 case VK_FORMAT_S8_UINT:
239 ASSERT(clearFormat == VK_FORMAT_S8_UINT);
240 packed = static_cast<uint8_t>(c.s);
241 break;
242 default:
243 return false;
244 }
245
246 VkImageSubresource subres = {
247 subresourceRange.aspectMask,
248 subresourceRange.baseMipLevel,
249 subresourceRange.baseArrayLayer
250 };
251 uint32_t lastMipLevel = dest->getLastMipLevel(subresourceRange);
252 uint32_t lastLayer = dest->getLastLayerIndex(subresourceRange);
253
254 VkRect2D area = { { 0, 0 }, { 0, 0 } };
255 if(renderArea)
256 {
257 ASSERT(subresourceRange.levelCount == 1);
258 area = *renderArea;
259 }
260
261 for(; subres.mipLevel <= lastMipLevel; subres.mipLevel++)
262 {
263 int rowPitchBytes = dest->rowPitchBytes(aspect, subres.mipLevel);
264 int slicePitchBytes = dest->slicePitchBytes(aspect, subres.mipLevel);
265 VkExtent3D extent = dest->getMipLevelExtent(aspect, subres.mipLevel);
266 if(!renderArea)
267 {
268 area.extent.width = extent.width;
269 area.extent.height = extent.height;
270 }
271 if(dest->is3DSlice())
272 {
273 extent.depth = 1; // The 3D image is instead interpreted as a 2D image with layers
274 }
275
276 for(subres.arrayLayer = subresourceRange.baseArrayLayer; subres.arrayLayer <= lastLayer; subres.arrayLayer++)
277 {
278 for(uint32_t depth = 0; depth < extent.depth; depth++)
279 {
280 uint8_t *slice = (uint8_t *)dest->getTexelPointer(
281 { area.offset.x, area.offset.y, static_cast<int32_t>(depth) }, subres);
282
283 for(int j = 0; j < dest->getSampleCount(); j++)
284 {
285 uint8_t *d = slice;
286
287 switch(viewFormat.bytes())
288 {
289 case 4:
290 for(uint32_t i = 0; i < area.extent.height; i++)
291 {
292 ASSERT(d < dest->end());
293 sw::clear((uint32_t *)d, packed, area.extent.width);
294 d += rowPitchBytes;
295 }
296 break;
297 case 2:
298 for(uint32_t i = 0; i < area.extent.height; i++)
299 {
300 ASSERT(d < dest->end());
301 sw::clear((uint16_t *)d, static_cast<uint16_t>(packed), area.extent.width);
302 d += rowPitchBytes;
303 }
304 break;
305 case 1:
306 for(uint32_t i = 0; i < area.extent.height; i++)
307 {
308 ASSERT(d < dest->end());
309 memset(d, packed, area.extent.width);
310 d += rowPitchBytes;
311 }
312 break;
313 default:
314 assert(false);
315 }
316
317 slice += slicePitchBytes;
318 }
319 }
320 }
321 }
322 dest->contentsChanged(subresourceRange);
323
324 return true;
325 }
326
// Emits Reactor code that reads one texel of |state.sourceFormat| from
// |element| and widens it to a Float4 in RGBA order. Values are returned
// unnormalized (raw component magnitudes); for formats without an alpha
// channel, c.w is either left at 1.0f or set to the component type's maximum
// value, which downstream normalization divides out.
Float4 Blitter::readFloat4(Pointer<Byte> element, const State &state)
{
	// Default: black with opaque alpha for components the format lacks.
	Float4 c(0.0f, 0.0f, 0.0f, 1.0f);

	switch(state.sourceFormat)
	{
	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
		c.w = Float(Int(*Pointer<Byte>(element)) & Int(0xF));
		c.x = Float((Int(*Pointer<Byte>(element)) >> 4) & Int(0xF));
		c.y = Float(Int(*Pointer<Byte>(element + 1)) & Int(0xF));
		c.z = Float((Int(*Pointer<Byte>(element + 1)) >> 4) & Int(0xF));
		break;
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8_SNORM:
		c.x = Float(Int(*Pointer<SByte>(element)));
		c.w = float(0x7F);  // alpha = max representable, for later normalization
		break;
	case VK_FORMAT_R8_UNORM:
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8_SRGB:
		c.x = Float(Int(*Pointer<Byte>(element)));
		c.w = float(0xFF);
		break;
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16_SNORM:
		c.x = Float(Int(*Pointer<Short>(element)));
		c.w = float(0x7FFF);
		break;
	case VK_FORMAT_R16_UNORM:
	case VK_FORMAT_R16_UINT:
		c.x = Float(Int(*Pointer<UShort>(element)));
		c.w = float(0xFFFF);
		break;
	case VK_FORMAT_R32_SINT:
		c.x = Float(*Pointer<Int>(element));
		c.w = float(0x7FFFFFFF);
		break;
	case VK_FORMAT_R32_UINT:
		c.x = Float(*Pointer<UInt>(element));
		c.w = float(0xFFFFFFFF);
		break;
	case VK_FORMAT_B8G8R8A8_SRGB:
	case VK_FORMAT_B8G8R8A8_UNORM:
		c = Float4(*Pointer<Byte4>(element)).zyxw;  // swizzle BGRA -> RGBA
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
	case VK_FORMAT_R8G8B8A8_SNORM:
		c = Float4(*Pointer<SByte4>(element));
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
	case VK_FORMAT_R8G8B8A8_UNORM:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
	case VK_FORMAT_R8G8B8A8_SRGB:
		c = Float4(*Pointer<Byte4>(element));
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16A16_SNORM:
		c = Float4(*Pointer<Short4>(element));
		break;
	case VK_FORMAT_R16G16B16A16_UNORM:
	case VK_FORMAT_R16G16B16A16_UINT:
		c = Float4(*Pointer<UShort4>(element));
		break;
	case VK_FORMAT_R32G32B32A32_SINT:
		c = Float4(*Pointer<Int4>(element));
		break;
	case VK_FORMAT_R32G32B32A32_UINT:
		c = Float4(*Pointer<UInt4>(element));
		break;
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8_SNORM:
		c.x = Float(Int(*Pointer<SByte>(element + 0)));
		c.y = Float(Int(*Pointer<SByte>(element + 1)));
		c.w = float(0x7F);
		break;
	case VK_FORMAT_R8G8_UNORM:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8_SRGB:
		c.x = Float(Int(*Pointer<Byte>(element + 0)));
		c.y = Float(Int(*Pointer<Byte>(element + 1)));
		c.w = float(0xFF);
		break;
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16_SNORM:
		c.x = Float(Int(*Pointer<Short>(element + 0)));
		c.y = Float(Int(*Pointer<Short>(element + 2)));
		c.w = float(0x7FFF);
		break;
	case VK_FORMAT_R16G16_UNORM:
	case VK_FORMAT_R16G16_UINT:
		c.x = Float(Int(*Pointer<UShort>(element + 0)));
		c.y = Float(Int(*Pointer<UShort>(element + 2)));
		c.w = float(0xFFFF);
		break;
	case VK_FORMAT_R32G32_SINT:
		c.x = Float(*Pointer<Int>(element + 0));
		c.y = Float(*Pointer<Int>(element + 4));
		c.w = float(0x7FFFFFFF);
		break;
	case VK_FORMAT_R32G32_UINT:
		c.x = Float(*Pointer<UInt>(element + 0));
		c.y = Float(*Pointer<UInt>(element + 4));
		c.w = float(0xFFFFFFFF);
		break;
	case VK_FORMAT_R32G32B32A32_SFLOAT:
		c = *Pointer<Float4>(element);
		break;
	case VK_FORMAT_R32G32_SFLOAT:
		c.x = *Pointer<Float>(element + 0);
		c.y = *Pointer<Float>(element + 4);
		break;
	case VK_FORMAT_R32_SFLOAT:
		c.x = *Pointer<Float>(element);
		break;
	// Deliberate fallthrough cascade: each case reads its highest component
	// and falls into the narrower format's cases for the remaining ones.
	case VK_FORMAT_R16G16B16A16_SFLOAT:
		c.w = Float(*Pointer<Half>(element + 6));
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_SFLOAT:
		c.z = Float(*Pointer<Half>(element + 4));
		// [[fallthrough]]
	case VK_FORMAT_R16G16_SFLOAT:
		c.y = Float(*Pointer<Half>(element + 2));
		// [[fallthrough]]
	case VK_FORMAT_R16_SFLOAT:
		c.x = Float(*Pointer<Half>(element));
		break;
	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
		c = r11g11b10Unpack(*Pointer<UInt>(element));
		break;
	case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
		// This type contains a common 5 bit exponent (E) and a 9 bit the mantissa for R, G and B.
		c.x = Float(*Pointer<UInt>(element) & UInt(0x000001FF));         // R's mantissa (bits 0-8)
		c.y = Float((*Pointer<UInt>(element) & UInt(0x0003FE00)) >> 9);  // G's mantissa (bits 9-17)
		c.z = Float((*Pointer<UInt>(element) & UInt(0x07FC0000)) >> 18); // B's mantissa (bits 18-26)
		c *= Float4(
		    // 2^E, using the exponent (bits 27-31) and treating it as an unsigned integer value
		    Float(UInt(1) << ((*Pointer<UInt>(element) & UInt(0xF8000000)) >> 27)) *
		    // Since the 9 bit mantissa values currently stored in RGB were converted straight
		    // from int to float (in the [0, 1<<9] range instead of the [0, 1] range), they
		    // are (1 << 9) times too high.
		    // Also, the exponent has 5 bits and we compute the exponent bias of floating point
		    // formats using "2^(k-1) - 1", so, in this case, the exponent bias is 2^(5-1)-1 = 15
		    // Exponent bias (15) + number of mantissa bits per component (9) = 24
		    Float(1.0f / (1 << 24)));
		c.w = 1.0f;
		break;
	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
		break;
	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
		break;
	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0xF000)) >> UShort(12)));
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x0F00)) >> UShort(8)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x00F0)) >> UShort(4)));
		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x000F)));
		break;
	case VK_FORMAT_R5G6B5_UNORM_PACK16:
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
		break;
	case VK_FORMAT_B5G6R5_UNORM_PACK16:
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07E0)) >> UShort(5)));
		c.x = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
		break;
	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
		break;
	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
		c.z = Float(Int((*Pointer<UShort>(element) & UShort(0xF800)) >> UShort(11)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x07C0)) >> UShort(6)));
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x003E)) >> UShort(1)));
		c.w = Float(Int(*Pointer<UShort>(element) & UShort(0x0001)));
		break;
	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
		c.w = Float(Int((*Pointer<UShort>(element) & UShort(0x8000)) >> UShort(15)));
		c.x = Float(Int((*Pointer<UShort>(element) & UShort(0x7C00)) >> UShort(10)));
		c.y = Float(Int((*Pointer<UShort>(element) & UShort(0x03E0)) >> UShort(5)));
		c.z = Float(Int(*Pointer<UShort>(element) & UShort(0x001F)));
		break;
	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
		c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
		c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
		c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
		break;
	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		c.z = Float(Int((*Pointer<UInt>(element) & UInt(0x000003FF))));
		c.y = Float(Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10));
		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20));
		c.w = Float(Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30));
		break;
	case VK_FORMAT_D16_UNORM:
		c.x = Float(Int((*Pointer<UShort>(element))));
		break;
	case VK_FORMAT_X8_D24_UNORM_PACK32:
		c.x = Float(Int((*Pointer<UInt>(element) & UInt(0xFFFFFF00)) >> 8));
		break;
	case VK_FORMAT_D32_SFLOAT:
		c.x = *Pointer<Float>(element);
		break;
	case VK_FORMAT_S8_UINT:
		c.x = Float(Int(*Pointer<Byte>(element)));
		break;
	default:
		UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
	}

	return c;
}
552
write(Float4 & c,Pointer<Byte> element,const State & state)553 void Blitter::write(Float4 &c, Pointer<Byte> element, const State &state)
554 {
555 bool writeR = state.writeRed;
556 bool writeG = state.writeGreen;
557 bool writeB = state.writeBlue;
558 bool writeA = state.writeAlpha;
559 bool writeRGBA = writeR && writeG && writeB && writeA;
560
561 switch(state.destFormat)
562 {
563 case VK_FORMAT_R4G4_UNORM_PACK8:
564 if(writeR | writeG)
565 {
566 if(!writeR)
567 {
568 *Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
569 (*Pointer<Byte>(element) & Byte(0xF0));
570 }
571 else if(!writeG)
572 {
573 *Pointer<Byte>(element) = (*Pointer<Byte>(element) & Byte(0xF)) |
574 (Byte(RoundInt(Float(c.x))) << Byte(4));
575 }
576 else
577 {
578 *Pointer<Byte>(element) = (Byte(RoundInt(Float(c.y))) & Byte(0xF)) |
579 (Byte(RoundInt(Float(c.x))) << Byte(4));
580 }
581 }
582 break;
583 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
584 if(writeRGBA)
585 {
586 *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 }));
587 }
588 else
589 {
590 unsigned short mask = (writeA ? 0x000F : 0x0000) |
591 (writeB ? 0x00F0 : 0x0000) |
592 (writeG ? 0x0F00 : 0x0000) |
593 (writeR ? 0xF000 : 0x0000);
594 unsigned short unmask = ~mask;
595 *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
596 (UShort(PackFields(RoundInt(c) & Int4(0xF), { 12, 8, 4, 0 })) & UShort(mask));
597 }
598 break;
599 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
600 if(writeRGBA)
601 {
602 *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 }));
603 }
604 else
605 {
606 unsigned short mask = (writeA ? 0x000F : 0x0000) |
607 (writeR ? 0x00F0 : 0x0000) |
608 (writeG ? 0x0F00 : 0x0000) |
609 (writeB ? 0xF000 : 0x0000);
610 unsigned short unmask = ~mask;
611 *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
612 (UShort(PackFields(RoundInt(c) & Int4(0xF), { 4, 8, 12, 0 })) & UShort(mask));
613 }
614 break;
615 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
616 if(writeRGBA)
617 {
618 *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 }));
619 }
620 else
621 {
622 unsigned short mask = (writeB ? 0x000F : 0x0000) |
623 (writeG ? 0x00F0 : 0x0000) |
624 (writeR ? 0x0F00 : 0x0000) |
625 (writeA ? 0xF000 : 0x0000);
626 unsigned short unmask = ~mask;
627 *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
628 (UShort(PackFields(RoundInt(c) & Int4(0xF), { 8, 4, 0, 12 })) & UShort(mask));
629 }
630 break;
631 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
632 if(writeRGBA)
633 {
634 *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 }));
635 }
636 else
637 {
638 unsigned short mask = (writeR ? 0x000F : 0x0000) |
639 (writeG ? 0x00F0 : 0x0000) |
640 (writeB ? 0x0F00 : 0x0000) |
641 (writeA ? 0xF000 : 0x0000);
642 unsigned short unmask = ~mask;
643 *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
644 (UShort(PackFields(RoundInt(c) & Int4(0xF), { 0, 4, 8, 12 })) & UShort(mask));
645 }
646 break;
647 case VK_FORMAT_B8G8R8A8_SRGB:
648 case VK_FORMAT_B8G8R8A8_UNORM:
649 if(writeRGBA)
650 {
651 Short4 c0 = RoundShort4(c.zyxw);
652 *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
653 }
654 else
655 {
656 if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
657 if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
658 if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
659 if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
660 }
661 break;
662 case VK_FORMAT_B8G8R8_SNORM:
663 if(writeB) { *Pointer<SByte>(element + 0) = SByte(RoundInt(Float(c.z))); }
664 if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
665 if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
666 break;
667 case VK_FORMAT_B8G8R8_UNORM:
668 case VK_FORMAT_B8G8R8_SRGB:
669 if(writeB) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.z))); }
670 if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
671 if(writeR) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.x))); }
672 break;
673 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
674 case VK_FORMAT_R8G8B8A8_UNORM:
675 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
676 case VK_FORMAT_R8G8B8A8_SRGB:
677 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
678 case VK_FORMAT_R8G8B8A8_UINT:
679 case VK_FORMAT_R8G8B8A8_USCALED:
680 case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
681 if(writeRGBA)
682 {
683 Short4 c0 = RoundShort4(c);
684 *Pointer<Byte4>(element) = Byte4(PackUnsigned(c0, c0));
685 }
686 else
687 {
688 if(writeR) { *Pointer<Byte>(element + 0) = Byte(RoundInt(Float(c.x))); }
689 if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
690 if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
691 if(writeA) { *Pointer<Byte>(element + 3) = Byte(RoundInt(Float(c.w))); }
692 }
693 break;
694 case VK_FORMAT_R32G32B32A32_SFLOAT:
695 if(writeRGBA)
696 {
697 *Pointer<Float4>(element) = c;
698 }
699 else
700 {
701 if(writeR) { *Pointer<Float>(element) = c.x; }
702 if(writeG) { *Pointer<Float>(element + 4) = c.y; }
703 if(writeB) { *Pointer<Float>(element + 8) = c.z; }
704 if(writeA) { *Pointer<Float>(element + 12) = c.w; }
705 }
706 break;
707 case VK_FORMAT_R32G32B32_SFLOAT:
708 if(writeR) { *Pointer<Float>(element) = c.x; }
709 if(writeG) { *Pointer<Float>(element + 4) = c.y; }
710 if(writeB) { *Pointer<Float>(element + 8) = c.z; }
711 break;
712 case VK_FORMAT_R32G32_SFLOAT:
713 if(writeR && writeG)
714 {
715 *Pointer<Float2>(element) = Float2(c);
716 }
717 else
718 {
719 if(writeR) { *Pointer<Float>(element) = c.x; }
720 if(writeG) { *Pointer<Float>(element + 4) = c.y; }
721 }
722 break;
723 case VK_FORMAT_R32_SFLOAT:
724 if(writeR) { *Pointer<Float>(element) = c.x; }
725 break;
726 case VK_FORMAT_R16G16B16A16_SFLOAT:
727 if(writeA) { *Pointer<Half>(element + 6) = Half(c.w); }
728 // [[fallthrough]]
729 case VK_FORMAT_R16G16B16_SFLOAT:
730 if(writeB) { *Pointer<Half>(element + 4) = Half(c.z); }
731 // [[fallthrough]]
732 case VK_FORMAT_R16G16_SFLOAT:
733 if(writeG) { *Pointer<Half>(element + 2) = Half(c.y); }
734 // [[fallthrough]]
735 case VK_FORMAT_R16_SFLOAT:
736 if(writeR) { *Pointer<Half>(element) = Half(c.x); }
737 break;
738 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
739 {
740 UInt rgb = r11g11b10Pack(c);
741
742 UInt old = *Pointer<UInt>(element);
743
744 unsigned int mask = (writeR ? 0x000007FF : 0) |
745 (writeG ? 0x003FF800 : 0) |
746 (writeB ? 0xFFC00000 : 0);
747
748 *Pointer<UInt>(element) = (rgb & mask) | (old & ~mask);
749 }
750 break;
751 case VK_FORMAT_E5B9G9R9_UFLOAT_PACK32:
752 {
753 ASSERT(writeRGBA); // Can't sensibly write just part of this format.
754
755 // Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
756
757 constexpr int N = 9; // number of mantissa bits per component
758 constexpr int B = 15; // exponent bias
759 constexpr int E_max = 31; // maximum possible biased exponent value
760
761 // Maximum representable value.
762 constexpr float sharedexp_max = ((static_cast<float>(1 << N) - 1) / static_cast<float>(1 << N)) * static_cast<float>(1 << (E_max - B));
763
764 // Clamp components to valid range. NaN becomes 0.
765 Float red_c = Min(IfThenElse(!(c.x > 0), Float(0), Float(c.x)), sharedexp_max);
766 Float green_c = Min(IfThenElse(!(c.y > 0), Float(0), Float(c.y)), sharedexp_max);
767 Float blue_c = Min(IfThenElse(!(c.z > 0), Float(0), Float(c.z)), sharedexp_max);
768
769 // We're reducing the mantissa to 9 bits, so we must round up if the next
770 // bit is 1. In other words add 0.5 to the new mantissa's position and
771 // allow overflow into the exponent so we can scale correctly.
772 constexpr int half = 1 << (23 - N);
773 Float red_r = As<Float>(As<Int>(red_c) + half);
774 Float green_r = As<Float>(As<Int>(green_c) + half);
775 Float blue_r = As<Float>(As<Int>(blue_c) + half);
776
777 // The largest component determines the shared exponent. It can't be lower
778 // than 0 (after bias subtraction) so also limit to the mimimum representable.
779 constexpr float min_s = 0.5f / (1 << B);
780 Float max_s = Max(Max(red_r, green_r), Max(blue_r, min_s));
781
782 // Obtain the reciprocal of the shared exponent by inverting the bits,
783 // and scale by the new mantissa's size. Note that the IEEE-754 single-precision
784 // format has an implicit leading 1, but this shared component format does not.
785 Float scale = As<Float>((As<Int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (N - 2));
786
787 UInt R9 = RoundInt(red_c * scale);
788 UInt G9 = UInt(RoundInt(green_c * scale));
789 UInt B9 = UInt(RoundInt(blue_c * scale));
790 UInt E5 = (As<UInt>(max_s) >> 23) - 127 + 15 + 1;
791
792 UInt E5B9G9R9 = (E5 << 27) | (B9 << 18) | (G9 << 9) | R9;
793
794 *Pointer<UInt>(element) = E5B9G9R9;
795 }
796 break;
797 case VK_FORMAT_B8G8R8A8_SNORM:
798 if(writeB) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.z))); }
799 if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
800 if(writeR) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.x))); }
801 if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
802 break;
803 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
804 case VK_FORMAT_R8G8B8A8_SINT:
805 case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
806 case VK_FORMAT_R8G8B8A8_SNORM:
807 case VK_FORMAT_R8G8B8A8_SSCALED:
808 case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
809 if(writeA) { *Pointer<SByte>(element + 3) = SByte(RoundInt(Float(c.w))); }
810 // [[fallthrough]]
811 case VK_FORMAT_R8G8B8_SINT:
812 case VK_FORMAT_R8G8B8_SNORM:
813 case VK_FORMAT_R8G8B8_SSCALED:
814 if(writeB) { *Pointer<SByte>(element + 2) = SByte(RoundInt(Float(c.z))); }
815 // [[fallthrough]]
816 case VK_FORMAT_R8G8_SINT:
817 case VK_FORMAT_R8G8_SNORM:
818 case VK_FORMAT_R8G8_SSCALED:
819 if(writeG) { *Pointer<SByte>(element + 1) = SByte(RoundInt(Float(c.y))); }
820 // [[fallthrough]]
821 case VK_FORMAT_R8_SINT:
822 case VK_FORMAT_R8_SNORM:
823 case VK_FORMAT_R8_SSCALED:
824 if(writeR) { *Pointer<SByte>(element) = SByte(RoundInt(Float(c.x))); }
825 break;
826 case VK_FORMAT_R8G8B8_UINT:
827 case VK_FORMAT_R8G8B8_UNORM:
828 case VK_FORMAT_R8G8B8_USCALED:
829 case VK_FORMAT_R8G8B8_SRGB:
830 if(writeB) { *Pointer<Byte>(element + 2) = Byte(RoundInt(Float(c.z))); }
831 // [[fallthrough]]
832 case VK_FORMAT_R8G8_UINT:
833 case VK_FORMAT_R8G8_UNORM:
834 case VK_FORMAT_R8G8_USCALED:
835 case VK_FORMAT_R8G8_SRGB:
836 if(writeG) { *Pointer<Byte>(element + 1) = Byte(RoundInt(Float(c.y))); }
837 // [[fallthrough]]
838 case VK_FORMAT_R8_UINT:
839 case VK_FORMAT_R8_UNORM:
840 case VK_FORMAT_R8_USCALED:
841 case VK_FORMAT_R8_SRGB:
842 if(writeR) { *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x))); }
843 break;
844 case VK_FORMAT_R16G16B16A16_SINT:
845 case VK_FORMAT_R16G16B16A16_SNORM:
846 case VK_FORMAT_R16G16B16A16_SSCALED:
847 if(writeRGBA)
848 {
849 *Pointer<Short4>(element) = Short4(RoundInt(c));
850 }
851 else
852 {
853 if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
854 if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
855 if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
856 if(writeA) { *Pointer<Short>(element + 6) = Short(RoundInt(Float(c.w))); }
857 }
858 break;
859 case VK_FORMAT_R16G16B16_SINT:
860 case VK_FORMAT_R16G16B16_SNORM:
861 case VK_FORMAT_R16G16B16_SSCALED:
862 if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
863 if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
864 if(writeB) { *Pointer<Short>(element + 4) = Short(RoundInt(Float(c.z))); }
865 break;
866 case VK_FORMAT_R16G16_SINT:
867 case VK_FORMAT_R16G16_SNORM:
868 case VK_FORMAT_R16G16_SSCALED:
869 if(writeR && writeG)
870 {
871 *Pointer<Short2>(element) = Short2(Short4(RoundInt(c)));
872 }
873 else
874 {
875 if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
876 if(writeG) { *Pointer<Short>(element + 2) = Short(RoundInt(Float(c.y))); }
877 }
878 break;
879 case VK_FORMAT_R16_SINT:
880 case VK_FORMAT_R16_SNORM:
881 case VK_FORMAT_R16_SSCALED:
882 if(writeR) { *Pointer<Short>(element) = Short(RoundInt(Float(c.x))); }
883 break;
884 case VK_FORMAT_R16G16B16A16_UINT:
885 case VK_FORMAT_R16G16B16A16_UNORM:
886 case VK_FORMAT_R16G16B16A16_USCALED:
887 if(writeRGBA)
888 {
889 *Pointer<UShort4>(element) = UShort4(RoundInt(c));
890 }
891 else
892 {
893 if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
894 if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
895 if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
896 if(writeA) { *Pointer<UShort>(element + 6) = UShort(RoundInt(Float(c.w))); }
897 }
898 break;
899 case VK_FORMAT_R16G16B16_UINT:
900 case VK_FORMAT_R16G16B16_UNORM:
901 case VK_FORMAT_R16G16B16_USCALED:
902 if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
903 if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
904 if(writeB) { *Pointer<UShort>(element + 4) = UShort(RoundInt(Float(c.z))); }
905 break;
906 case VK_FORMAT_R16G16_UINT:
907 case VK_FORMAT_R16G16_UNORM:
908 case VK_FORMAT_R16G16_USCALED:
909 if(writeR && writeG)
910 {
911 *Pointer<UShort2>(element) = UShort2(UShort4(RoundInt(c)));
912 }
913 else
914 {
915 if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
916 if(writeG) { *Pointer<UShort>(element + 2) = UShort(RoundInt(Float(c.y))); }
917 }
918 break;
919 case VK_FORMAT_R16_UINT:
920 case VK_FORMAT_R16_UNORM:
921 case VK_FORMAT_R16_USCALED:
922 if(writeR) { *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x))); }
923 break;
924 case VK_FORMAT_R32G32B32A32_SINT:
925 if(writeRGBA)
926 {
927 *Pointer<Int4>(element) = RoundInt(c);
928 }
929 else
930 {
931 if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
932 if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
933 if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
934 if(writeA) { *Pointer<Int>(element + 12) = RoundInt(Float(c.w)); }
935 }
936 break;
937 case VK_FORMAT_R32G32B32_SINT:
938 if(writeB) { *Pointer<Int>(element + 8) = RoundInt(Float(c.z)); }
939 // [[fallthrough]]
940 case VK_FORMAT_R32G32_SINT:
941 if(writeG) { *Pointer<Int>(element + 4) = RoundInt(Float(c.y)); }
942 // [[fallthrough]]
943 case VK_FORMAT_R32_SINT:
944 if(writeR) { *Pointer<Int>(element) = RoundInt(Float(c.x)); }
945 break;
946 case VK_FORMAT_R32G32B32A32_UINT:
947 if(writeRGBA)
948 {
949 *Pointer<UInt4>(element) = UInt4(RoundInt(c));
950 }
951 else
952 {
953 if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
954 if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
955 if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
956 if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(RoundInt(Float(c.w))); }
957 }
958 break;
959 case VK_FORMAT_R32G32B32_UINT:
960 if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(RoundInt(Float(c.z))); }
961 // [[fallthrough]]
962 case VK_FORMAT_R32G32_UINT:
963 if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(RoundInt(Float(c.y))); }
964 // [[fallthrough]]
965 case VK_FORMAT_R32_UINT:
966 if(writeR) { *Pointer<UInt>(element) = As<UInt>(RoundInt(Float(c.x))); }
967 break;
968 case VK_FORMAT_R5G6B5_UNORM_PACK16:
969 if(writeR && writeG && writeB)
970 {
971 *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 }));
972 }
973 else
974 {
975 unsigned short mask = (writeB ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeR ? 0xF800 : 0x0000);
976 unsigned short unmask = ~mask;
977 *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
978 (UShort(PackFields(RoundInt(c.xyzz), { 11, 5, 0, 0 })) &
979 UShort(mask));
980 }
981 break;
982 case VK_FORMAT_B5G6R5_UNORM_PACK16:
983 if(writeR && writeG && writeB)
984 {
985 *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 }));
986 }
987 else
988 {
989 unsigned short mask = (writeR ? 0x001F : 0x0000) | (writeG ? 0x07E0 : 0x0000) | (writeB ? 0xF800 : 0x0000);
990 unsigned short unmask = ~mask;
991 *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
992 (UShort(PackFields(RoundInt(c.zyxx), { 11, 5, 0, 0 })) &
993 UShort(mask));
994 }
995 break;
996 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
997 if(writeRGBA)
998 {
999 *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 }));
1000 }
1001 else
1002 {
1003 unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1004 (writeR ? 0x7C00 : 0x0000) |
1005 (writeG ? 0x03E0 : 0x0000) |
1006 (writeB ? 0x001F : 0x0000);
1007 unsigned short unmask = ~mask;
1008 *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1009 (UShort(PackFields(RoundInt(c), { 11, 6, 1, 0 })) &
1010 UShort(mask));
1011 }
1012 break;
1013 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1014 if(writeRGBA)
1015 {
1016 *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 }));
1017 }
1018 else
1019 {
1020 unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1021 (writeR ? 0x7C00 : 0x0000) |
1022 (writeG ? 0x03E0 : 0x0000) |
1023 (writeB ? 0x001F : 0x0000);
1024 unsigned short unmask = ~mask;
1025 *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1026 (UShort(PackFields(RoundInt(c), { 1, 6, 11, 0 })) &
1027 UShort(mask));
1028 }
1029 break;
1030 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1031 if(writeRGBA)
1032 {
1033 *Pointer<UShort>(element) = UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 }));
1034 }
1035 else
1036 {
1037 unsigned short mask = (writeA ? 0x8000 : 0x0000) |
1038 (writeR ? 0x7C00 : 0x0000) |
1039 (writeG ? 0x03E0 : 0x0000) |
1040 (writeB ? 0x001F : 0x0000);
1041 unsigned short unmask = ~mask;
1042 *Pointer<UShort>(element) = (*Pointer<UShort>(element) & UShort(unmask)) |
1043 (UShort(PackFields(RoundInt(c), { 10, 5, 0, 15 })) &
1044 UShort(mask));
1045 }
1046 break;
1047 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1048 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
1049 case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
1050 if(writeRGBA)
1051 {
1052 *Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 }));
1053 }
1054 else
1055 {
1056 unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
1057 (writeB ? 0x3FF00000 : 0x0000) |
1058 (writeG ? 0x000FFC00 : 0x0000) |
1059 (writeR ? 0x000003FF : 0x0000);
1060 unsigned int unmask = ~mask;
1061 *Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
1062 (As<UInt>(PackFields(RoundInt(c), { 0, 10, 20, 30 })) &
1063 UInt(mask));
1064 }
1065 break;
1066 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1067 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
1068 case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
1069 if(writeRGBA)
1070 {
1071 *Pointer<UInt>(element) = As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 }));
1072 }
1073 else
1074 {
1075 unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
1076 (writeR ? 0x3FF00000 : 0x0000) |
1077 (writeG ? 0x000FFC00 : 0x0000) |
1078 (writeB ? 0x000003FF : 0x0000);
1079 unsigned int unmask = ~mask;
1080 *Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
1081 (As<UInt>(PackFields(RoundInt(c), { 20, 10, 0, 30 })) &
1082 UInt(mask));
1083 }
1084 break;
1085 case VK_FORMAT_D16_UNORM:
1086 *Pointer<UShort>(element) = UShort(RoundInt(Float(c.x)));
1087 break;
1088 case VK_FORMAT_X8_D24_UNORM_PACK32:
1089 *Pointer<UInt>(element) = UInt(RoundInt(Float(c.x)) << 8);
1090 break;
1091 case VK_FORMAT_D32_SFLOAT:
1092 *Pointer<Float>(element) = c.x;
1093 break;
1094 case VK_FORMAT_S8_UINT:
1095 *Pointer<Byte>(element) = Byte(RoundInt(Float(c.x)));
1096 break;
1097 default:
1098 UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
1099 break;
1100 }
1101 }
1102
readInt4(Pointer<Byte> element,const State & state)1103 Int4 Blitter::readInt4(Pointer<Byte> element, const State &state)
1104 {
1105 Int4 c(0, 0, 0, 1);
1106
1107 switch(state.sourceFormat)
1108 {
1109 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
1110 case VK_FORMAT_R8G8B8A8_SINT:
1111 c = Insert(c, Int(*Pointer<SByte>(element + 3)), 3);
1112 c = Insert(c, Int(*Pointer<SByte>(element + 2)), 2);
1113 // [[fallthrough]]
1114 case VK_FORMAT_R8G8_SINT:
1115 c = Insert(c, Int(*Pointer<SByte>(element + 1)), 1);
1116 // [[fallthrough]]
1117 case VK_FORMAT_R8_SINT:
1118 c = Insert(c, Int(*Pointer<SByte>(element)), 0);
1119 break;
1120 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
1121 c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 0);
1122 c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
1123 c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 2);
1124 c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
1125 break;
1126 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
1127 c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000003FF))), 2);
1128 c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x000FFC00)) >> 10), 1);
1129 c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0x3FF00000)) >> 20), 0);
1130 c = Insert(c, Int((*Pointer<UInt>(element) & UInt(0xC0000000)) >> 30), 3);
1131 break;
1132 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
1133 case VK_FORMAT_R8G8B8A8_UINT:
1134 c = Insert(c, Int(*Pointer<Byte>(element + 3)), 3);
1135 c = Insert(c, Int(*Pointer<Byte>(element + 2)), 2);
1136 // [[fallthrough]]
1137 case VK_FORMAT_R8G8_UINT:
1138 c = Insert(c, Int(*Pointer<Byte>(element + 1)), 1);
1139 // [[fallthrough]]
1140 case VK_FORMAT_R8_UINT:
1141 case VK_FORMAT_S8_UINT:
1142 c = Insert(c, Int(*Pointer<Byte>(element)), 0);
1143 break;
1144 case VK_FORMAT_R16G16B16A16_SINT:
1145 c = Insert(c, Int(*Pointer<Short>(element + 6)), 3);
1146 c = Insert(c, Int(*Pointer<Short>(element + 4)), 2);
1147 // [[fallthrough]]
1148 case VK_FORMAT_R16G16_SINT:
1149 c = Insert(c, Int(*Pointer<Short>(element + 2)), 1);
1150 // [[fallthrough]]
1151 case VK_FORMAT_R16_SINT:
1152 c = Insert(c, Int(*Pointer<Short>(element)), 0);
1153 break;
1154 case VK_FORMAT_R16G16B16A16_UINT:
1155 c = Insert(c, Int(*Pointer<UShort>(element + 6)), 3);
1156 c = Insert(c, Int(*Pointer<UShort>(element + 4)), 2);
1157 // [[fallthrough]]
1158 case VK_FORMAT_R16G16_UINT:
1159 c = Insert(c, Int(*Pointer<UShort>(element + 2)), 1);
1160 // [[fallthrough]]
1161 case VK_FORMAT_R16_UINT:
1162 c = Insert(c, Int(*Pointer<UShort>(element)), 0);
1163 break;
1164 case VK_FORMAT_R32G32B32A32_SINT:
1165 case VK_FORMAT_R32G32B32A32_UINT:
1166 c = *Pointer<Int4>(element);
1167 break;
1168 case VK_FORMAT_R32G32_SINT:
1169 case VK_FORMAT_R32G32_UINT:
1170 c = Insert(c, *Pointer<Int>(element + 4), 1);
1171 // [[fallthrough]]
1172 case VK_FORMAT_R32_SINT:
1173 case VK_FORMAT_R32_UINT:
1174 c = Insert(c, *Pointer<Int>(element), 0);
1175 break;
1176 default:
1177 UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
1178 }
1179
1180 return c;
1181 }
1182
// Stores an integer color into one texel of an unnormalized-integer destination
// format. This path avoids a round-trip through float so integer-to-integer
// blits lose no precision. Only channels enabled in the state's write mask are
// stored; packed formats with a partial mask use a read-modify-write merge.
void Blitter::write(Int4 &c, Pointer<Byte> element, const State &state)
{
	bool writeR = state.writeRed;
	bool writeG = state.writeGreen;
	bool writeB = state.writeBlue;
	bool writeA = state.writeAlpha;
	bool writeRGBA = writeR && writeG && writeB && writeA;

	// Both formats must agree on signedness; mixed signedness would need a clamp
	// to the common representable range, which this path does not perform.
	ASSERT(state.sourceFormat.isUnsigned() == state.destFormat.isUnsigned());

	// First pass: clamp the value to the destination format's representable range.
	switch(state.destFormat)
	{
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		// 10 bits per color channel, 2 bits of alpha.
		c = Min(As<UInt4>(c), UInt4(0x03FF, 0x03FF, 0x03FF, 0x0003));
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_R8G8B8_UINT:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_R8G8B8_USCALED:
	case VK_FORMAT_R8G8_USCALED:
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_S8_UINT:
		c = Min(As<UInt4>(c), UInt4(0xFF));
		break;
	case VK_FORMAT_R16G16B16A16_UINT:
	case VK_FORMAT_R16G16B16_UINT:
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16G16B16A16_USCALED:
	case VK_FORMAT_R16G16B16_USCALED:
	case VK_FORMAT_R16G16_USCALED:
	case VK_FORMAT_R16_USCALED:
		c = Min(As<UInt4>(c), UInt4(0xFFFF));
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_R8G8B8_SSCALED:
	case VK_FORMAT_R8G8_SSCALED:
	case VK_FORMAT_R8_SSCALED:
		c = Min(Max(c, Int4(-0x80)), Int4(0x7F));
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16_SINT:
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16G16B16A16_SSCALED:
	case VK_FORMAT_R16G16B16_SSCALED:
	case VK_FORMAT_R16G16_SSCALED:
	case VK_FORMAT_R16_SSCALED:
		c = Min(Max(c, Int4(-0x8000)), Int4(0x7FFF));
		break;
	default:
		// 32-bit destinations need no clamping.
		break;
	}

	// Second pass: store the (clamped) channels. Fallthrough chains let wider
	// formats reuse the stores of the narrower ones.
	switch(state.destFormat)
	{
	case VK_FORMAT_B8G8R8A8_SINT:
	case VK_FORMAT_B8G8R8A8_SSCALED:
		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_B8G8R8_SINT:
	case VK_FORMAT_B8G8R8_SSCALED:
		// BGR memory order: blue at offset 0, red at offset 2.
		if(writeB) { *Pointer<SByte>(element) = SByte(Extract(c, 2)); }
		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
		if(writeR) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 0)); }
		break;
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
		if(writeA) { *Pointer<SByte>(element + 3) = SByte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8B8_SINT:
	case VK_FORMAT_R8G8B8_SSCALED:
		if(writeB) { *Pointer<SByte>(element + 2) = SByte(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8_SSCALED:
		if(writeG) { *Pointer<SByte>(element + 1) = SByte(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8_SSCALED:
		if(writeR) { *Pointer<SByte>(element) = SByte(Extract(c, 0)); }
		break;
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
	case VK_FORMAT_A2B10G10R10_USCALED_PACK32:
	case VK_FORMAT_A2B10G10R10_SSCALED_PACK32:
		if(writeRGBA)
		{
			*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 0, 10, 20, 30 }));
		}
		else
		{
			// Partial write mask: merge written fields with the existing texel.
			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
			                    (writeB ? 0x3FF00000 : 0x0000) |
			                    (writeG ? 0x000FFC00 : 0x0000) |
			                    (writeR ? 0x000003FF : 0x0000);
			unsigned int unmask = ~mask;
			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
			                          (As<UInt>(PackFields(c, { 0, 10, 20, 30 })) & UInt(mask));
		}
		break;
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
	case VK_FORMAT_A2R10G10B10_USCALED_PACK32:
	case VK_FORMAT_A2R10G10B10_SSCALED_PACK32:
		if(writeRGBA)
		{
			*Pointer<UInt>(element) = As<UInt>(PackFields(c, { 20, 10, 0, 30 }));
		}
		else
		{
			unsigned int mask = (writeA ? 0xC0000000 : 0x0000) |
			                    (writeR ? 0x3FF00000 : 0x0000) |
			                    (writeG ? 0x000FFC00 : 0x0000) |
			                    (writeB ? 0x000003FF : 0x0000);
			unsigned int unmask = ~mask;
			*Pointer<UInt>(element) = (*Pointer<UInt>(element) & UInt(unmask)) |
			                          (As<UInt>(PackFields(c, { 20, 10, 0, 30 })) & UInt(mask));
		}
		break;
	case VK_FORMAT_B8G8R8A8_UINT:
	case VK_FORMAT_B8G8R8A8_USCALED:
		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_B8G8R8_UINT:
	case VK_FORMAT_B8G8R8_USCALED:
	case VK_FORMAT_B8G8R8_SRGB:
		if(writeB) { *Pointer<Byte>(element) = Byte(Extract(c, 2)); }
		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
		if(writeR) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 0)); }
		break;
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
		if(writeA) { *Pointer<Byte>(element + 3) = Byte(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8B8_UINT:
	case VK_FORMAT_R8G8B8_USCALED:
		if(writeB) { *Pointer<Byte>(element + 2) = Byte(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8_USCALED:
		if(writeG) { *Pointer<Byte>(element + 1) = Byte(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_S8_UINT:
		if(writeR) { *Pointer<Byte>(element) = Byte(Extract(c, 0)); }
		break;
	case VK_FORMAT_R16G16B16A16_SINT:
	case VK_FORMAT_R16G16B16A16_SSCALED:
		if(writeA) { *Pointer<Short>(element + 6) = Short(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_SINT:
	case VK_FORMAT_R16G16B16_SSCALED:
		if(writeB) { *Pointer<Short>(element + 4) = Short(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16_SSCALED:
		if(writeG) { *Pointer<Short>(element + 2) = Short(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16_SSCALED:
		if(writeR) { *Pointer<Short>(element) = Short(Extract(c, 0)); }
		break;
	case VK_FORMAT_R16G16B16A16_UINT:
	case VK_FORMAT_R16G16B16A16_USCALED:
		if(writeA) { *Pointer<UShort>(element + 6) = UShort(Extract(c, 3)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16B16_UINT:
	case VK_FORMAT_R16G16B16_USCALED:
		if(writeB) { *Pointer<UShort>(element + 4) = UShort(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16G16_USCALED:
		if(writeG) { *Pointer<UShort>(element + 2) = UShort(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16_USCALED:
		if(writeR) { *Pointer<UShort>(element) = UShort(Extract(c, 0)); }
		break;
	case VK_FORMAT_R32G32B32A32_SINT:
		if(writeRGBA)
		{
			// Full mask: store all four channels with a single vector write.
			*Pointer<Int4>(element) = c;
		}
		else
		{
			if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
			if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
			if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
			if(writeA) { *Pointer<Int>(element + 12) = Extract(c, 3); }
		}
		break;
	case VK_FORMAT_R32G32B32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
		if(writeB) { *Pointer<Int>(element + 8) = Extract(c, 2); }
		break;
	case VK_FORMAT_R32G32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		if(writeG) { *Pointer<Int>(element + 4) = Extract(c, 1); }
		break;
	case VK_FORMAT_R32_SINT:
		if(writeR) { *Pointer<Int>(element) = Extract(c, 0); }
		break;
	case VK_FORMAT_R32G32B32A32_UINT:
		if(writeRGBA)
		{
			*Pointer<UInt4>(element) = As<UInt4>(c);
		}
		else
		{
			if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
			if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
			if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
			if(writeA) { *Pointer<UInt>(element + 12) = As<UInt>(Extract(c, 3)); }
		}
		break;
	case VK_FORMAT_R32G32B32_UINT:
		if(writeB) { *Pointer<UInt>(element + 8) = As<UInt>(Extract(c, 2)); }
		// [[fallthrough]]
	case VK_FORMAT_R32G32_UINT:
		if(writeG) { *Pointer<UInt>(element + 4) = As<UInt>(Extract(c, 1)); }
		// [[fallthrough]]
	case VK_FORMAT_R32_UINT:
		if(writeR) { *Pointer<UInt>(element) = As<UInt>(Extract(c, 0)); }
		break;
	default:
		UNSUPPORTED("Blitter destination format %d", (int)state.destFormat);
	}
}
1426
// Rescales a color read from the source format into the destination format's
// value range, applying sRGB conversion and range clamping where required.
// 'preScaled' indicates the value was already brought into the destination
// scale by an earlier call (e.g. per-sample sRGB resolve), so only the sRGB
// transfer function still needs to be (un)applied.
void Blitter::ApplyScaleAndClamp(Float4 &value, const State &state, bool preScaled)
{
	float4 scale{}, unscale{};

	if(state.clearOperation &&
	   state.sourceFormat.isUnnormalizedInteger() &&
	   !state.destFormat.isUnnormalizedInteger())
	{
		// If we're clearing a buffer from an int or uint color into a normalized color,
		// then the whole range of the int or uint color must be scaled between 0 and 1.
		switch(state.sourceFormat)
		{
		case VK_FORMAT_R32G32B32A32_SINT:
			unscale = float4(static_cast<float>(0x7FFFFFFF));
			break;
		case VK_FORMAT_R32G32B32A32_UINT:
			unscale = float4(static_cast<float>(0xFFFFFFFF));
			break;
		default:
			UNSUPPORTED("Blitter source format %d", (int)state.sourceFormat);
		}
	}
	else
	{
		unscale = state.sourceFormat.getScale();
	}

	scale = state.destFormat.getScale();

	bool srcSRGB = state.sourceFormat.isSRGBformat();
	bool dstSRGB = state.destFormat.isSRGBformat();

	if(state.allowSRGBConversion && ((srcSRGB && !preScaled) || dstSRGB))  // One of the formats is sRGB encoded.
	{
		// The sRGB transfer functions operate on [0, 1] values, so normalize first.
		value *= preScaled ? Float4(1.0f / scale.x, 1.0f / scale.y, 1.0f / scale.z, 1.0f / scale.w) :  // Unapply scale
		                     Float4(1.0f / unscale.x, 1.0f / unscale.y, 1.0f / unscale.z, 1.0f / unscale.w);  // Apply unscale
		// Only RGB is sRGB-encoded; alpha stays linear.
		value.xyz = (srcSRGB && !preScaled) ? sRGBtoLinear(value) : linearToSRGB(value);
		value *= Float4(scale.x, scale.y, scale.z, scale.w);  // Apply scale
	}
	else if(unscale != scale)
	{
		// Linear-to-linear: a single combined rescale factor suffices.
		value *= Float4(scale.x / unscale.x, scale.y / unscale.y, scale.z / unscale.z, scale.w / unscale.w);
	}

	if(state.sourceFormat.isFloatFormat() && !state.destFormat.isFloatFormat())
	{
		// Clamp float sources into the destination's representable range,
		// respecting per-component signedness (e.g. the 2-bit alpha of A2B10G10R10).
		value = Min(value, Float4(scale.x, scale.y, scale.z, scale.w));

		value = Max(value, Float4(state.destFormat.isUnsignedComponent(0) ? 0.0f : -scale.x,
		                          state.destFormat.isUnsignedComponent(1) ? 0.0f : -scale.y,
		                          state.destFormat.isUnsignedComponent(2) ? 0.0f : -scale.z,
		                          state.destFormat.isUnsignedComponent(3) ? 0.0f : -scale.w));
	}

	if(!state.sourceFormat.isUnsigned() && state.destFormat.isUnsigned())
	{
		// Signed-to-unsigned: negative values clamp to zero.
		value = Max(value, Float4(0.0f));
	}
}
1486
ComputeOffset(Int & x,Int & y,Int & pitchB,int bytes)1487 Int Blitter::ComputeOffset(Int &x, Int &y, Int &pitchB, int bytes)
1488 {
1489 return y * pitchB + x * bytes;
1490 }
1491
ComputeOffset(Int & x,Int & y,Int & z,Int & sliceB,Int & pitchB,int bytes)1492 Int Blitter::ComputeOffset(Int &x, Int &y, Int &z, Int &sliceB, Int &pitchB, int bytes)
1493 {
1494 return z * sliceB + y * pitchB + x * bytes;
1495 }
1496
sample(Pointer<Byte> & source,Float & x,Float & y,Float & z,Int & sWidth,Int & sHeight,Int & sDepth,Int & sSliceB,Int & sPitchB,const State & state)1497 Float4 Blitter::sample(Pointer<Byte> &source, Float &x, Float &y, Float &z,
1498 Int &sWidth, Int &sHeight, Int &sDepth,
1499 Int &sSliceB, Int &sPitchB, const State &state)
1500 {
1501 bool intSrc = state.sourceFormat.isUnnormalizedInteger();
1502 int srcBytes = state.sourceFormat.bytes();
1503
1504 Float4 color;
1505
1506 bool preScaled = false;
1507 if(!state.filter || intSrc)
1508 {
1509 Int X = Int(x);
1510 Int Y = Int(y);
1511 Int Z = Int(z);
1512
1513 if(state.clampToEdge)
1514 {
1515 X = Clamp(X, 0, sWidth - 1);
1516 Y = Clamp(Y, 0, sHeight - 1);
1517 Z = Clamp(Z, 0, sDepth - 1);
1518 }
1519
1520 Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);
1521
1522 color = readFloat4(s, state);
1523
1524 if(state.srcSamples > 1) // Resolve multisampled source
1525 {
1526 if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
1527 {
1528 ApplyScaleAndClamp(color, state);
1529 preScaled = true;
1530 }
1531 Float4 accum = color;
1532 for(int sample = 1; sample < state.srcSamples; sample++)
1533 {
1534 s += sSliceB;
1535 color = readFloat4(s, state);
1536
1537 if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
1538 {
1539 ApplyScaleAndClamp(color, state);
1540 preScaled = true;
1541 }
1542 accum += color;
1543 }
1544 color = accum * Float4(1.0f / static_cast<float>(state.srcSamples));
1545 }
1546 }
1547 else // Bilinear filtering
1548 {
1549 Float X = x;
1550 Float Y = y;
1551 Float Z = z;
1552
1553 if(state.clampToEdge)
1554 {
1555 X = Min(Max(x, 0.5f), Float(sWidth) - 0.5f);
1556 Y = Min(Max(y, 0.5f), Float(sHeight) - 0.5f);
1557 Z = Min(Max(z, 0.5f), Float(sDepth) - 0.5f);
1558 }
1559
1560 Float x0 = X - 0.5f;
1561 Float y0 = Y - 0.5f;
1562 Float z0 = Z - 0.5f;
1563
1564 Int X0 = Max(Int(x0), 0);
1565 Int Y0 = Max(Int(y0), 0);
1566 Int Z0 = Max(Int(z0), 0);
1567
1568 Int X1 = X0 + 1;
1569 Int Y1 = Y0 + 1;
1570 X1 = IfThenElse(X1 >= sWidth, X0, X1);
1571 Y1 = IfThenElse(Y1 >= sHeight, Y0, Y1);
1572
1573 if(state.filter3D)
1574 {
1575 Int Z1 = Z0 + 1;
1576 Z1 = IfThenElse(Z1 >= sHeight, Z0, Z1);
1577
1578 Pointer<Byte> s000 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
1579 Pointer<Byte> s010 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
1580 Pointer<Byte> s100 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
1581 Pointer<Byte> s110 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);
1582 Pointer<Byte> s001 = source + ComputeOffset(X0, Y0, Z1, sSliceB, sPitchB, srcBytes);
1583 Pointer<Byte> s011 = source + ComputeOffset(X1, Y0, Z1, sSliceB, sPitchB, srcBytes);
1584 Pointer<Byte> s101 = source + ComputeOffset(X0, Y1, Z1, sSliceB, sPitchB, srcBytes);
1585 Pointer<Byte> s111 = source + ComputeOffset(X1, Y1, Z1, sSliceB, sPitchB, srcBytes);
1586
1587 Float4 c000 = readFloat4(s000, state);
1588 Float4 c010 = readFloat4(s010, state);
1589 Float4 c100 = readFloat4(s100, state);
1590 Float4 c110 = readFloat4(s110, state);
1591 Float4 c001 = readFloat4(s001, state);
1592 Float4 c011 = readFloat4(s011, state);
1593 Float4 c101 = readFloat4(s101, state);
1594 Float4 c111 = readFloat4(s111, state);
1595
1596 if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
1597 {
1598 ApplyScaleAndClamp(c000, state);
1599 ApplyScaleAndClamp(c010, state);
1600 ApplyScaleAndClamp(c100, state);
1601 ApplyScaleAndClamp(c110, state);
1602 ApplyScaleAndClamp(c001, state);
1603 ApplyScaleAndClamp(c011, state);
1604 ApplyScaleAndClamp(c101, state);
1605 ApplyScaleAndClamp(c111, state);
1606 preScaled = true;
1607 }
1608
1609 Float4 fx = Float4(x0 - Float(X0));
1610 Float4 fy = Float4(y0 - Float(Y0));
1611 Float4 fz = Float4(z0 - Float(Z0));
1612 Float4 ix = Float4(1.0f) - fx;
1613 Float4 iy = Float4(1.0f) - fy;
1614 Float4 iz = Float4(1.0f) - fz;
1615
1616 color = ((c000 * ix + c010 * fx) * iy +
1617 (c100 * ix + c110 * fx) * fy) *
1618 iz +
1619 ((c001 * ix + c011 * fx) * iy +
1620 (c101 * ix + c111 * fx) * fy) *
1621 fz;
1622 }
1623 else
1624 {
1625 Pointer<Byte> s00 = source + ComputeOffset(X0, Y0, Z0, sSliceB, sPitchB, srcBytes);
1626 Pointer<Byte> s01 = source + ComputeOffset(X1, Y0, Z0, sSliceB, sPitchB, srcBytes);
1627 Pointer<Byte> s10 = source + ComputeOffset(X0, Y1, Z0, sSliceB, sPitchB, srcBytes);
1628 Pointer<Byte> s11 = source + ComputeOffset(X1, Y1, Z0, sSliceB, sPitchB, srcBytes);
1629
1630 Float4 c00 = readFloat4(s00, state);
1631 Float4 c01 = readFloat4(s01, state);
1632 Float4 c10 = readFloat4(s10, state);
1633 Float4 c11 = readFloat4(s11, state);
1634
1635 if(state.allowSRGBConversion && state.sourceFormat.isSRGBformat()) // sRGB -> RGB
1636 {
1637 ApplyScaleAndClamp(c00, state);
1638 ApplyScaleAndClamp(c01, state);
1639 ApplyScaleAndClamp(c10, state);
1640 ApplyScaleAndClamp(c11, state);
1641 preScaled = true;
1642 }
1643
1644 Float4 fx = Float4(x0 - Float(X0));
1645 Float4 fy = Float4(y0 - Float(Y0));
1646 Float4 ix = Float4(1.0f) - fx;
1647 Float4 iy = Float4(1.0f) - fy;
1648
1649 color = (c00 * ix + c01 * fx) * iy +
1650 (c10 * ix + c11 * fx) * fy;
1651 }
1652 }
1653
1654 ApplyScaleAndClamp(color, state, preScaled);
1655
1656 return color;
1657 }
1658
generate(const State & state)1659 Blitter::BlitRoutineType Blitter::generate(const State &state)
1660 {
1661 BlitFunction function;
1662 {
1663 Pointer<Byte> blit(function.Arg<0>());
1664
1665 Pointer<Byte> source = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, source));
1666 Pointer<Byte> dest = *Pointer<Pointer<Byte>>(blit + OFFSET(BlitData, dest));
1667 Int sPitchB = *Pointer<Int>(blit + OFFSET(BlitData, sPitchB));
1668 Int dPitchB = *Pointer<Int>(blit + OFFSET(BlitData, dPitchB));
1669 Int sSliceB = *Pointer<Int>(blit + OFFSET(BlitData, sSliceB));
1670 Int dSliceB = *Pointer<Int>(blit + OFFSET(BlitData, dSliceB));
1671
1672 Float x0 = *Pointer<Float>(blit + OFFSET(BlitData, x0));
1673 Float y0 = *Pointer<Float>(blit + OFFSET(BlitData, y0));
1674 Float z0 = *Pointer<Float>(blit + OFFSET(BlitData, z0));
1675 Float w = *Pointer<Float>(blit + OFFSET(BlitData, w));
1676 Float h = *Pointer<Float>(blit + OFFSET(BlitData, h));
1677 Float d = *Pointer<Float>(blit + OFFSET(BlitData, d));
1678
1679 Int x0d = *Pointer<Int>(blit + OFFSET(BlitData, x0d));
1680 Int x1d = *Pointer<Int>(blit + OFFSET(BlitData, x1d));
1681 Int y0d = *Pointer<Int>(blit + OFFSET(BlitData, y0d));
1682 Int y1d = *Pointer<Int>(blit + OFFSET(BlitData, y1d));
1683 Int z0d = *Pointer<Int>(blit + OFFSET(BlitData, z0d));
1684 Int z1d = *Pointer<Int>(blit + OFFSET(BlitData, z1d));
1685
1686 Int sWidth = *Pointer<Int>(blit + OFFSET(BlitData, sWidth));
1687 Int sHeight = *Pointer<Int>(blit + OFFSET(BlitData, sHeight));
1688 Int sDepth = *Pointer<Int>(blit + OFFSET(BlitData, sDepth));
1689
1690 bool intSrc = state.sourceFormat.isUnnormalizedInteger();
1691 bool intDst = state.destFormat.isUnnormalizedInteger();
1692 bool intBoth = intSrc && intDst;
1693 int srcBytes = state.sourceFormat.bytes();
1694 int dstBytes = state.destFormat.bytes();
1695
1696 bool hasConstantColorI = false;
1697 Int4 constantColorI;
1698 bool hasConstantColorF = false;
1699 Float4 constantColorF;
1700 if(state.clearOperation)
1701 {
1702 if(intBoth) // Integer types
1703 {
1704 constantColorI = readInt4(source, state);
1705 hasConstantColorI = true;
1706 }
1707 else
1708 {
1709 constantColorF = readFloat4(source, state);
1710 hasConstantColorF = true;
1711
1712 ApplyScaleAndClamp(constantColorF, state);
1713 }
1714 }
1715
1716 For(Int k = z0d, k < z1d, k++)
1717 {
1718 Float z = state.clearOperation ? RValue<Float>(z0) : z0 + Float(k) * d;
1719 Pointer<Byte> destSlice = dest + k * dSliceB;
1720
1721 For(Int j = y0d, j < y1d, j++)
1722 {
1723 Float y = state.clearOperation ? RValue<Float>(y0) : y0 + Float(j) * h;
1724 Pointer<Byte> destLine = destSlice + j * dPitchB;
1725
1726 For(Int i = x0d, i < x1d, i++)
1727 {
1728 Float x = state.clearOperation ? RValue<Float>(x0) : x0 + Float(i) * w;
1729 Pointer<Byte> d = destLine + i * dstBytes;
1730
1731 if(hasConstantColorI)
1732 {
1733 for(int s = 0; s < state.destSamples; s++)
1734 {
1735 write(constantColorI, d, state);
1736
1737 d += dSliceB;
1738 }
1739 }
1740 else if(hasConstantColorF)
1741 {
1742 for(int s = 0; s < state.destSamples; s++)
1743 {
1744 write(constantColorF, d, state);
1745
1746 d += dSliceB;
1747 }
1748 }
1749 else if(intBoth) // Integer types do not support filtering
1750 {
1751 Int X = Int(x);
1752 Int Y = Int(y);
1753 Int Z = Int(z);
1754
1755 if(state.clampToEdge)
1756 {
1757 X = Clamp(X, 0, sWidth - 1);
1758 Y = Clamp(Y, 0, sHeight - 1);
1759 Z = Clamp(Z, 0, sDepth - 1);
1760 }
1761
1762 Pointer<Byte> s = source + ComputeOffset(X, Y, Z, sSliceB, sPitchB, srcBytes);
1763
1764 // When both formats are true integer types, we don't go to float to avoid losing precision
1765 Int4 color = readInt4(s, state);
1766 for(int s = 0; s < state.destSamples; s++)
1767 {
1768 write(color, d, state);
1769
1770 d += dSliceB;
1771 }
1772 }
1773 else
1774 {
1775 Float4 color = sample(source, x, y, z, sWidth, sHeight, sDepth, sSliceB, sPitchB, state);
1776
1777 for(int s = 0; s < state.destSamples; s++)
1778 {
1779 write(color, d, state);
1780
1781 d += dSliceB;
1782 }
1783 }
1784 }
1785 }
1786 }
1787 }
1788
1789 return function("BlitRoutine");
1790 }
1791
getBlitRoutine(const State & state)1792 Blitter::BlitRoutineType Blitter::getBlitRoutine(const State &state)
1793 {
1794 marl::lock lock(blitMutex);
1795 auto blitRoutine = blitCache.lookup(state);
1796
1797 if(!blitRoutine)
1798 {
1799 blitRoutine = generate(state);
1800 blitCache.add(state, blitRoutine);
1801 }
1802
1803 return blitRoutine;
1804 }
1805
getCornerUpdateRoutine(const State & state)1806 Blitter::CornerUpdateRoutineType Blitter::getCornerUpdateRoutine(const State &state)
1807 {
1808 marl::lock lock(cornerUpdateMutex);
1809 auto cornerUpdateRoutine = cornerUpdateCache.lookup(state);
1810
1811 if(!cornerUpdateRoutine)
1812 {
1813 cornerUpdateRoutine = generateCornerUpdate(state);
1814 cornerUpdateCache.add(state, cornerUpdateRoutine);
1815 }
1816
1817 return cornerUpdateRoutine;
1818 }
1819
// Performs a scaled/filtered blit of `region` from `src` to `dst`, per
// Vulkan's vkCmdBlitImage semantics, by generating (or fetching from cache)
// a specialized Reactor routine and invoking it once per array layer.
void Blitter::blit(const vk::Image *src, vk::Image *dst, VkImageBlit2KHR region, VkFilter filter)
{
	ASSERT(src->getFormat() != VK_FORMAT_UNDEFINED);
	ASSERT(dst->getFormat() != VK_FORMAT_UNDEFINED);

	// Vulkan 1.2 section 18.5. Image Copies with Scaling:
	// "The layerCount member of srcSubresource and dstSubresource must match"
	// "The aspectMask member of srcSubresource and dstSubresource must match"
	ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
	ASSERT(region.srcSubresource.aspectMask == region.dstSubresource.aspectMask);

	// Normalize mirrored blits: if the destination rectangle is flipped on an
	// axis, flip both source and destination on that axis so the generated
	// routine can always iterate destination coordinates in increasing order.
	if(region.dstOffsets[0].x > region.dstOffsets[1].x)
	{
		std::swap(region.srcOffsets[0].x, region.srcOffsets[1].x);
		std::swap(region.dstOffsets[0].x, region.dstOffsets[1].x);
	}

	if(region.dstOffsets[0].y > region.dstOffsets[1].y)
	{
		std::swap(region.srcOffsets[0].y, region.srcOffsets[1].y);
		std::swap(region.dstOffsets[0].y, region.dstOffsets[1].y);
	}

	if(region.dstOffsets[0].z > region.dstOffsets[1].z)
	{
		std::swap(region.srcOffsets[0].z, region.srcOffsets[1].z);
		std::swap(region.dstOffsets[0].z, region.dstOffsets[1].z);
	}

	VkImageAspectFlagBits srcAspect = static_cast<VkImageAspectFlagBits>(region.srcSubresource.aspectMask);
	VkImageAspectFlagBits dstAspect = static_cast<VkImageAspectFlagBits>(region.dstSubresource.aspectMask);
	VkExtent3D srcExtent = src->getMipLevelExtent(srcAspect, region.srcSubresource.mipLevel);

	// Source texels advanced per destination texel, on each axis.
	float widthRatio = static_cast<float>(region.srcOffsets[1].x - region.srcOffsets[0].x) /
	                   static_cast<float>(region.dstOffsets[1].x - region.dstOffsets[0].x);
	float heightRatio = static_cast<float>(region.srcOffsets[1].y - region.srcOffsets[0].y) /
	                    static_cast<float>(region.dstOffsets[1].y - region.dstOffsets[0].y);
	float depthRatio = static_cast<float>(region.srcOffsets[1].z - region.srcOffsets[0].z) /
	                   static_cast<float>(region.dstOffsets[1].z - region.dstOffsets[0].z);
	// Source coordinate corresponding to destination texel (x0d, y0d, z0d),
	// sampled at the texel center (hence the 0.5 offset).
	float x0 = region.srcOffsets[0].x + (0.5f - region.dstOffsets[0].x) * widthRatio;
	float y0 = region.srcOffsets[0].y + (0.5f - region.dstOffsets[0].y) * heightRatio;
	float z0 = region.srcOffsets[0].z + (0.5f - region.dstOffsets[0].z) * depthRatio;

	auto srcFormat = src->getFormat(srcAspect);
	auto dstFormat = dst->getFormat(dstAspect);

	bool doFilter = (filter != VK_FILTER_NEAREST);
	// sRGB conversion is required when filtering/resolving (which must happen
	// in linear space), or when the source and destination color spaces differ.
	bool allowSRGBConversion =
	    doFilter ||
	    (src->getSampleCount() > 1) ||
	    (srcFormat.isSRGBformat() != dstFormat.isSRGBformat());

	State state(srcFormat, dstFormat, src->getSampleCount(), dst->getSampleCount(),
	            Options{ doFilter, allowSRGBConversion });
	// Clamp-to-edge is needed whenever sampling can reach outside the source
	// mip level, either because the region itself exceeds the level bounds or
	// because filtering reads neighbors of the first texel center.
	state.clampToEdge = (region.srcOffsets[0].x < 0) ||
	                    (region.srcOffsets[0].y < 0) ||
	                    (static_cast<uint32_t>(region.srcOffsets[1].x) > srcExtent.width) ||
	                    (static_cast<uint32_t>(region.srcOffsets[1].y) > srcExtent.height) ||
	                    (doFilter && ((x0 < 0.5f) || (y0 < 0.5f)));
	// 3D filtering is only required when the blit scales along Z.
	state.filter3D = (region.srcOffsets[1].z - region.srcOffsets[0].z) !=
	                 (region.dstOffsets[1].z - region.dstOffsets[0].z);

	auto blitRoutine = getBlitRoutine(state);
	if(!blitRoutine)
	{
		return;
	}

	BlitData data = {
		nullptr,                                                                                  // source
		nullptr,                                                                                  // dest
		assert_cast<uint32_t>(src->rowPitchBytes(srcAspect, region.srcSubresource.mipLevel)),     // sPitchB
		assert_cast<uint32_t>(dst->rowPitchBytes(dstAspect, region.dstSubresource.mipLevel)),     // dPitchB
		assert_cast<uint32_t>(src->slicePitchBytes(srcAspect, region.srcSubresource.mipLevel)),   // sSliceB
		assert_cast<uint32_t>(dst->slicePitchBytes(dstAspect, region.dstSubresource.mipLevel)),   // dSliceB

		x0,
		y0,
		z0,
		widthRatio,
		heightRatio,
		depthRatio,

		region.dstOffsets[0].x,  // x0d
		region.dstOffsets[1].x,  // x1d
		region.dstOffsets[0].y,  // y0d
		region.dstOffsets[1].y,  // y1d
		region.dstOffsets[0].z,  // z0d
		region.dstOffsets[1].z,  // z1d

		static_cast<int>(srcExtent.width),   // sWidth
		static_cast<int>(srcExtent.height),  // sHeight
		static_cast<int>(srcExtent.depth),   // sDepth

		// NOTE(review): left false while state.filter3D was computed above;
		// the generated routine appears to take filtering from `state` — confirm
		// this BlitData field is intentionally unused here.
		false,  // filter3D
	};

	VkImageSubresource srcSubres = {
		region.srcSubresource.aspectMask,
		region.srcSubresource.mipLevel,
		region.srcSubresource.baseArrayLayer
	};

	VkImageSubresource dstSubres = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		region.dstSubresource.baseArrayLayer
	};

	VkImageSubresourceRange dstSubresRange = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		1,  // levelCount
		region.dstSubresource.baseArrayLayer,
		region.dstSubresource.layerCount
	};

	// NOTE(review): queries `src` with the dst-derived range; the layerCount
	// members were asserted equal above — confirm querying src here (rather
	// than dst) is intentional.
	uint32_t lastLayer = src->getLastLayerIndex(dstSubresRange);

	// Blit layer by layer; both subresources advance in lockstep since the
	// layer counts match.
	for(; dstSubres.arrayLayer <= lastLayer; srcSubres.arrayLayer++, dstSubres.arrayLayer++)
	{
		data.source = src->getTexelPointer({ 0, 0, 0 }, srcSubres);
		data.dest = dst->getTexelPointer({ 0, 0, 0 }, dstSubres);

		ASSERT(data.source < src->end());
		ASSERT(data.dest < dst->end());

		blitRoutine(&data);
	}

	dst->contentsChanged(dstSubresRange);
}
1952
resolveDepth(const vk::ImageView * src,vk::ImageView * dst,const VkResolveModeFlagBits depthResolveMode)1953 static void resolveDepth(const vk::ImageView *src, vk::ImageView *dst, const VkResolveModeFlagBits depthResolveMode)
1954 {
1955 if(depthResolveMode == VK_RESOLVE_MODE_NONE)
1956 {
1957 return;
1958 }
1959
1960 vk::Format format = src->getFormat(VK_IMAGE_ASPECT_DEPTH_BIT);
1961 VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_DEPTH_BIT);
1962 int width = extent.width;
1963 int height = extent.height;
1964 int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_DEPTH_BIT, 0);
1965
1966 // To support other resolve modes, get the slice bytes and get a pointer to each sample plane.
1967 // Then modify the loop below to include logic for handling each new mode.
1968 uint8_t *source = (uint8_t *)src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);
1969 uint8_t *dest = (uint8_t *)dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_DEPTH_BIT, 0, 0);
1970
1971 size_t formatSize = format.bytes();
1972 // TODO(b/167558951) support other resolve modes.
1973 ASSERT(depthResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
1974 for(int y = 0; y < height; y++)
1975 {
1976 memcpy(dest, source, formatSize * width);
1977
1978 source += pitch;
1979 dest += pitch;
1980 }
1981
1982 dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
1983 }
1984
resolveStencil(const vk::ImageView * src,vk::ImageView * dst,const VkResolveModeFlagBits stencilResolveMode)1985 static void resolveStencil(const vk::ImageView *src, vk::ImageView *dst, const VkResolveModeFlagBits stencilResolveMode)
1986 {
1987 if(stencilResolveMode == VK_RESOLVE_MODE_NONE)
1988 {
1989 return;
1990 }
1991
1992 VkExtent2D extent = src->getMipLevelExtent(0, VK_IMAGE_ASPECT_STENCIL_BIT);
1993 int width = extent.width;
1994 int height = extent.height;
1995 int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_STENCIL_BIT, 0);
1996
1997 // To support other resolve modes, use src->slicePitchBytes() and get a pointer to each sample's slice.
1998 // Then modify the loop below to include logic for handling each new mode.
1999 uint8_t *source = reinterpret_cast<uint8_t *>(src->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));
2000 uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getOffsetPointer({ 0, 0, 0 }, VK_IMAGE_ASPECT_STENCIL_BIT, 0, 0));
2001
2002 // TODO(b/167558951) support other resolve modes.
2003 ASSERT(stencilResolveMode == VK_RESOLVE_MODE_SAMPLE_ZERO_BIT);
2004 for(int y = 0; y < height; y++)
2005 {
2006 // Stencil is always 8 bits, so the width of the resource we're resolving is
2007 // the number of bytes in each row we need to copy during for SAMPLE_ZERO
2008 memcpy(dest, source, width);
2009
2010 source += pitch;
2011 dest += pitch;
2012 }
2013
2014 dst->contentsChanged(vk::Image::DIRECT_MEMORY_ACCESS);
2015 }
2016
resolveDepthStencil(const vk::ImageView * src,vk::ImageView * dst,VkResolveModeFlagBits depthResolveMode,VkResolveModeFlagBits stencilResolveMode)2017 void Blitter::resolveDepthStencil(const vk::ImageView *src, vk::ImageView *dst, VkResolveModeFlagBits depthResolveMode, VkResolveModeFlagBits stencilResolveMode)
2018 {
2019 VkImageSubresourceRange srcRange = src->getSubresourceRange();
2020 VkImageSubresourceRange dstRange = src->getSubresourceRange();
2021 ASSERT(src->getFormat() == dst->getFormat());
2022 ASSERT(srcRange.layerCount == 1 && dstRange.layerCount == 1);
2023 ASSERT(srcRange.aspectMask == dstRange.aspectMask);
2024
2025 if(srcRange.aspectMask & VK_IMAGE_ASPECT_DEPTH_BIT)
2026 {
2027 resolveDepth(src, dst, depthResolveMode);
2028 }
2029 if(srcRange.aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT)
2030 {
2031 resolveStencil(src, dst, stencilResolveMode);
2032 }
2033 }
2034
resolve(const vk::Image * src,vk::Image * dst,VkImageResolve2KHR region)2035 void Blitter::resolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
2036 {
2037 // "The aspectMask member of srcSubresource and dstSubresource must only contain VK_IMAGE_ASPECT_COLOR_BIT"
2038 ASSERT(region.srcSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
2039 ASSERT(region.dstSubresource.aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
2040 // "The layerCount member of srcSubresource and dstSubresource must match"
2041 ASSERT(region.srcSubresource.layerCount == region.dstSubresource.layerCount);
2042
2043 // We use this method both for explicit resolves from vkCmdResolveImage, and implicit ones for resolve attachments.
2044 // - vkCmdResolveImage: "srcImage and dstImage must have been created with the same image format."
2045 // - VkSubpassDescription: "each resolve attachment that is not VK_ATTACHMENT_UNUSED must have the same VkFormat as its corresponding color attachment."
2046 ASSERT(src->getFormat() == dst->getFormat());
2047
2048 if(fastResolve(src, dst, region))
2049 {
2050 return;
2051 }
2052
2053 // Fall back to a generic blit which performs the resolve.
2054 VkImageBlit2KHR blitRegion;
2055 blitRegion.sType = VK_STRUCTURE_TYPE_IMAGE_BLIT_2_KHR;
2056 blitRegion.pNext = nullptr;
2057
2058 blitRegion.srcOffsets[0] = blitRegion.srcOffsets[1] = region.srcOffset;
2059 blitRegion.srcOffsets[1].x += region.extent.width;
2060 blitRegion.srcOffsets[1].y += region.extent.height;
2061 blitRegion.srcOffsets[1].z += region.extent.depth;
2062
2063 blitRegion.dstOffsets[0] = blitRegion.dstOffsets[1] = region.dstOffset;
2064 blitRegion.dstOffsets[1].x += region.extent.width;
2065 blitRegion.dstOffsets[1].y += region.extent.height;
2066 blitRegion.dstOffsets[1].z += region.extent.depth;
2067
2068 blitRegion.srcSubresource = region.srcSubresource;
2069 blitRegion.dstSubresource = region.dstSubresource;
2070
2071 blit(src, dst, blitRegion, VK_FILTER_NEAREST);
2072 }
2073
// Computes the per-byte average of two packed 4x8-bit values, rounding up,
// without unpacking the lanes — i.e. for each byte: (a + b + 1) >> 1, the
// same semantics as SSE2's _mm_avg_epu8.
// (x & y) contributes the bits common to both operands, ((x ^ y) >> 1)
// contributes half of the differing bits (masked so no bit crosses into a
// neighboring byte lane), and ((x ^ y) & 0x01010101) rounds each lane up
// when the discarded low bit was set.
static inline uint32_t averageByte4(uint32_t x, uint32_t y)
{
	const uint32_t common = x & y;
	const uint32_t diff = x ^ y;

	return common + ((diff >> 1) & 0x7F7F7F7F) + (diff & 0x01010101);
}
2078
// Attempts an optimized CPU resolve of `region` from `src` to `dst`.
// Returns true when handled, false when the caller must fall back to the
// generic blit path. Only whole-image, single-layer, 2D, 4-sample resolves of
// 8-bit-per-channel UNORM color formats are handled here.
bool Blitter::fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve2KHR region)
{
	// The fast path only supports resolving the full image, starting at the origin.
	if(region.dstOffset != VkOffset3D{ 0, 0, 0 })
	{
		return false;
	}

	if(region.srcOffset != VkOffset3D{ 0, 0, 0 })
	{
		return false;
	}

	if(region.srcSubresource.layerCount != 1)
	{
		return false;
	}

	if(region.extent != src->getExtent() ||
	   region.extent != dst->getExtent() ||
	   region.extent.depth != 1)
	{
		return false;
	}

	VkImageSubresource srcSubresource = {
		region.srcSubresource.aspectMask,
		region.srcSubresource.mipLevel,
		region.srcSubresource.baseArrayLayer
	};

	VkImageSubresource dstSubresource = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		region.dstSubresource.baseArrayLayer
	};

	// Range used to notify the destination image of modified contents below.
	VkImageSubresourceRange dstSubresourceRange = {
		region.dstSubresource.aspectMask,
		region.dstSubresource.mipLevel,
		1,  // levelCount
		region.dstSubresource.baseArrayLayer,
		region.dstSubresource.layerCount
	};

	void *source = src->getTexelPointer({ 0, 0, 0 }, srcSubresource);
	uint8_t *dest = reinterpret_cast<uint8_t *>(dst->getTexelPointer({ 0, 0, 0 }, dstSubresource));

	auto format = src->getFormat();
	auto samples = src->getSampleCount();
	auto extent = src->getExtent();

	int width = extent.width;
	int height = extent.height;
	int pitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);
	int slice = src->slicePitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, region.srcSubresource.mipLevel);

	// Each sample occupies its own slice; point at the four sample planes.
	uint8_t *source0 = (uint8_t *)source;
	uint8_t *source1 = source0 + slice;
	uint8_t *source2 = source1 + slice;
	uint8_t *source3 = source2 + slice;

	// Unused on non-x86 builds, where the SSE2 path is compiled out.
	[[maybe_unused]] const bool SSE2 = CPUID::supportsSSE2();

	if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
	{
		if(samples == 4)
		{
			for(int y = 0; y < height; y++)
			{
				int x = 0;

#if defined(__i386__) || defined(__x86_64__)
				if(SSE2)
				{
					// Vectorized path: average 4 pixels (16 bytes) per iteration.
					// NOTE: the pairwise rounded-up averaging tree (_mm_avg_epu8
					// rounds up) can deviate from a true 4-sample mean by 1 LSB.
					for(; (x + 3) < width; x += 4)
					{
						__m128i c0 = _mm_loadu_si128((__m128i *)(source0 + 4 * x));
						__m128i c1 = _mm_loadu_si128((__m128i *)(source1 + 4 * x));
						__m128i c2 = _mm_loadu_si128((__m128i *)(source2 + 4 * x));
						__m128i c3 = _mm_loadu_si128((__m128i *)(source3 + 4 * x));

						c0 = _mm_avg_epu8(c0, c1);
						c2 = _mm_avg_epu8(c2, c3);
						c0 = _mm_avg_epu8(c0, c2);

						_mm_storeu_si128((__m128i *)(dest + 4 * x), c0);
					}
				}
#endif

				// Scalar tail (and full path on non-x86): averageByte4 mirrors
				// _mm_avg_epu8's round-up semantics, so results match the
				// vector path.
				for(; x < width; x++)
				{
					uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
					uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
					uint32_t c2 = *(uint32_t *)(source2 + 4 * x);
					uint32_t c3 = *(uint32_t *)(source3 + 4 * x);

					uint32_t c01 = averageByte4(c0, c1);
					uint32_t c23 = averageByte4(c2, c3);
					uint32_t c03 = averageByte4(c01, c23);

					*(uint32_t *)(dest + 4 * x) = c03;
				}

				source0 += pitch;
				source1 += pitch;
				source2 += pitch;
				source3 += pitch;
				dest += pitch;

				ASSERT(source0 < src->end());
				ASSERT(source3 < src->end());
				ASSERT(dest < dst->end());
			}
		}
		else
			UNSUPPORTED("Samples: %d", samples);
	}
	else
	{
		// Unhandled format; let the caller fall back to the generic blit.
		return false;
	}

	dst->contentsChanged(dstSubresourceRange);

	return true;
}
2206
copy(const vk::Image * src,uint8_t * dst,unsigned int dstPitch)2207 void Blitter::copy(const vk::Image *src, uint8_t *dst, unsigned int dstPitch)
2208 {
2209 VkExtent3D extent = src->getExtent();
2210 size_t rowBytes = src->getFormat(VK_IMAGE_ASPECT_COLOR_BIT).bytes() * extent.width;
2211 unsigned int srcPitch = src->rowPitchBytes(VK_IMAGE_ASPECT_COLOR_BIT, 0);
2212 ASSERT(dstPitch >= rowBytes && srcPitch >= rowBytes && src->getMipLevelExtent(VK_IMAGE_ASPECT_COLOR_BIT, 0).height >= extent.height);
2213
2214 const uint8_t *s = (uint8_t *)src->getTexelPointer({ 0, 0, 0 }, { VK_IMAGE_ASPECT_COLOR_BIT, 0, 0 });
2215 uint8_t *d = dst;
2216
2217 for(uint32_t y = 0; y < extent.height; y++)
2218 {
2219 memcpy(d, s, rowBytes);
2220
2221 s += srcPitch;
2222 d += dstPitch;
2223 }
2224 }
2225
// Emits Reactor code that fills the corner border texel at (x0, y0) of a cube
// face with the average of its three adjacent texels: (x0, y1), (x1, y0) and
// (x1, y1). Cube-map corner texels have no defined sample of their own, so
// they are synthesized from their neighbors to make seamless filtering work.
void Blitter::computeCubeCorner(Pointer<Byte> &layer, Int &x0, Int &x1, Int &y0, Int &y1, Int &pitchB, const State &state)
{
	int bytes = state.sourceFormat.bytes();

	// Sum the three neighboring texels...
	Float4 c = readFloat4(layer + ComputeOffset(x0, y1, pitchB, bytes), state) +
	           readFloat4(layer + ComputeOffset(x1, y0, pitchB, bytes), state) +
	           readFloat4(layer + ComputeOffset(x1, y1, pitchB, bytes), state);

	// ...and average them.
	c *= Float4(1.0f / 3.0f);

	write(c, layer + ComputeOffset(x0, y0, pitchB, bytes), state);
}
2238
// Generates a Reactor routine that computes all four corner border texels for
// each of the six faces of a cube map (see computeCubeCorner). The routine
// takes a pointer to a CubeBorderData describing the faces' layout.
Blitter::CornerUpdateRoutineType Blitter::generateCornerUpdate(const State &state)
{
	// Reading and writing from/to the same image
	ASSERT(state.sourceFormat == state.destFormat);
	ASSERT(state.srcSamples == state.destSamples);

	// Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
	// VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
	ASSERT(state.srcSamples == 1);

	CornerUpdateFunction function;
	{
		Pointer<Byte> blit(function.Arg<0>());

		// Unpack the CubeBorderData argument.
		Pointer<Byte> layers = *Pointer<Pointer<Byte>>(blit + OFFSET(CubeBorderData, layers));
		Int pitchB = *Pointer<Int>(blit + OFFSET(CubeBorderData, pitchB));
		UInt layerSize = *Pointer<Int>(blit + OFFSET(CubeBorderData, layerSize));
		UInt dim = *Pointer<Int>(blit + OFFSET(CubeBorderData, dim));

		// Low Border, Low Pixel, High Border, High Pixel
		Int LB(-1), LP(0), HB(dim), HP(dim - 1);

		// Fill all four corners of each of the 6 cube faces, advancing one
		// layer per face.
		for(int face = 0; face < 6; face++)
		{
			computeCubeCorner(layers, LB, LP, LB, LP, pitchB, state);
			computeCubeCorner(layers, LB, LP, HB, HP, pitchB, state);
			computeCubeCorner(layers, HB, HP, LB, LP, pitchB, state);
			computeCubeCorner(layers, HB, HP, HB, HP, pitchB, state);
			layers = layers + layerSize;
		}
	}

	// NOTE(review): the debug name says "BlitRoutine" although this is the
	// corner-update routine — presumably copied from generate(); harmless, but
	// confirm before renaming since the name may appear in traces.
	return function("BlitRoutine");
}
2273
// Updates the one-texel border of all six faces of a cube map so that
// seamless edge filtering reads correct neighbor data: each face's border
// rows/columns are copied from the adjoining edge of the neighboring face,
// then the four corner texels of each face are synthesized by averaging.
// `subresource` identifies the +X face; the other five faces follow in
// consecutive array layers.
void Blitter::updateBorders(const vk::Image *image, const VkImageSubresource &subresource)
{
	ASSERT(image->getArrayLayers() >= (subresource.arrayLayer + 6));

	// From Vulkan 1.1 spec, section 11.5. Image Views:
	// "For cube and cube array image views, the layers of the image view starting
	// at baseArrayLayer correspond to faces in the order +X, -X, +Y, -Y, +Z, -Z."
	VkImageSubresource posX = subresource;
	VkImageSubresource negX = posX;
	negX.arrayLayer++;
	VkImageSubresource posY = negX;
	posY.arrayLayer++;
	VkImageSubresource negY = posY;
	negY.arrayLayer++;
	VkImageSubresource posZ = negY;
	posZ.arrayLayer++;
	VkImageSubresource negZ = posZ;
	negZ.arrayLayer++;

	// Copy top / bottom
	copyCubeEdge(image, posX, BOTTOM, negY, RIGHT);
	copyCubeEdge(image, posY, BOTTOM, posZ, TOP);
	copyCubeEdge(image, posZ, BOTTOM, negY, TOP);
	copyCubeEdge(image, negX, BOTTOM, negY, LEFT);
	copyCubeEdge(image, negY, BOTTOM, negZ, BOTTOM);
	copyCubeEdge(image, negZ, BOTTOM, negY, BOTTOM);

	copyCubeEdge(image, posX, TOP, posY, RIGHT);
	copyCubeEdge(image, posY, TOP, negZ, TOP);
	copyCubeEdge(image, posZ, TOP, posY, BOTTOM);
	copyCubeEdge(image, negX, TOP, posY, LEFT);
	copyCubeEdge(image, negY, TOP, posZ, BOTTOM);
	copyCubeEdge(image, negZ, TOP, posY, TOP);

	// Copy left / right
	copyCubeEdge(image, posX, RIGHT, negZ, LEFT);
	copyCubeEdge(image, posY, RIGHT, posX, TOP);
	copyCubeEdge(image, posZ, RIGHT, posX, LEFT);
	copyCubeEdge(image, negX, RIGHT, posZ, LEFT);
	copyCubeEdge(image, negY, RIGHT, posX, BOTTOM);
	copyCubeEdge(image, negZ, RIGHT, negX, LEFT);

	copyCubeEdge(image, posX, LEFT, posZ, RIGHT);
	copyCubeEdge(image, posY, LEFT, negX, TOP);
	copyCubeEdge(image, posZ, LEFT, negX, RIGHT);
	copyCubeEdge(image, negX, LEFT, negZ, RIGHT);
	copyCubeEdge(image, negY, LEFT, negX, BOTTOM);
	copyCubeEdge(image, negZ, LEFT, posX, RIGHT);

	// Compute corner colors
	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(subresource.aspectMask);
	vk::Format format = image->getFormat(aspect);
	VkSampleCountFlagBits samples = image->getSampleCount();
	State state(format, format, samples, samples, Options{ 0xF });

	// Vulkan 1.2: "If samples is not VK_SAMPLE_COUNT_1_BIT, then imageType must be
	// VK_IMAGE_TYPE_2D, flags must not contain VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT"
	ASSERT(samples == VK_SAMPLE_COUNT_1_BIT);

	auto cornerUpdateRoutine = getCornerUpdateRoutine(state);
	if(!cornerUpdateRoutine)
	{
		return;
	}

	VkExtent3D extent = image->getMipLevelExtent(aspect, subresource.mipLevel);
	CubeBorderData data = {
		image->getTexelPointer({ 0, 0, 0 }, posX),
		assert_cast<uint32_t>(image->rowPitchBytes(aspect, subresource.mipLevel)),
		assert_cast<uint32_t>(image->getLayerSize(aspect)),
		extent.width
	};
	cornerUpdateRoutine(&data);
}
2348
// Copies one edge of the source cube face into the border (out-of-bounds)
// texels along the given edge of the destination face, texel by texel.
// Corners are deliberately skipped; they are filled later by the corner
// update routine (see computeCubeCorner).
void Blitter::copyCubeEdge(const vk::Image *image,
                           const VkImageSubresource &dstSubresource, Edge dstEdge,
                           const VkImageSubresource &srcSubresource, Edge srcEdge)
{
	ASSERT(srcSubresource.aspectMask == dstSubresource.aspectMask);
	ASSERT(srcSubresource.mipLevel == dstSubresource.mipLevel);
	ASSERT(srcSubresource.arrayLayer != dstSubresource.arrayLayer);

	// Figure out if the edges to be copied in reverse order respectively from one another
	// The copy should be reversed whenever the same edges are contiguous or if we're
	// copying top <-> right or bottom <-> left. This is explained by the layout, which is:
	//
	//  | +y |
	//  | -x | +z | +x | -z |
	//  | -y |

	bool reverse = (srcEdge == dstEdge) ||
	               ((srcEdge == TOP) && (dstEdge == RIGHT)) ||
	               ((srcEdge == RIGHT) && (dstEdge == TOP)) ||
	               ((srcEdge == BOTTOM) && (dstEdge == LEFT)) ||
	               ((srcEdge == LEFT) && (dstEdge == BOTTOM));

	VkImageAspectFlagBits aspect = static_cast<VkImageAspectFlagBits>(srcSubresource.aspectMask);
	int bytes = image->getFormat(aspect).bytes();
	int pitchB = image->rowPitchBytes(aspect, srcSubresource.mipLevel);

	VkExtent3D extent = image->getMipLevelExtent(aspect, srcSubresource.mipLevel);
	int w = extent.width;
	int h = extent.height;
	if(w != h)
	{
		UNSUPPORTED("Cube doesn't have square faces : (%d, %d)", w, h);
	}

	// Src is expressed in the regular [0, width-1], [0, height-1] space
	bool srcHorizontal = ((srcEdge == TOP) || (srcEdge == BOTTOM));
	// Step between consecutive source texels: one texel along a row for
	// horizontal edges, one row down for vertical edges.
	int srcDelta = srcHorizontal ? bytes : pitchB;
	VkOffset3D srcOffset = { (srcEdge == RIGHT) ? (w - 1) : 0, (srcEdge == BOTTOM) ? (h - 1) : 0, 0 };

	// Dst contains borders, so it is expressed in the [-1, width], [-1, height] space
	bool dstHorizontal = ((dstEdge == TOP) || (dstEdge == BOTTOM));
	// Negative delta walks the destination edge backwards for reversed copies.
	int dstDelta = (dstHorizontal ? bytes : pitchB) * (reverse ? -1 : 1);
	VkOffset3D dstOffset = { (dstEdge == RIGHT) ? w : -1, (dstEdge == BOTTOM) ? h : -1, 0 };

	// Don't write in the corners
	// (start one texel in from the corner; for reversed copies the start is
	// the far end of the edge).
	if(dstHorizontal)
	{
		dstOffset.x += reverse ? w : 1;
	}
	else
	{
		dstOffset.y += reverse ? h : 1;
	}

	const uint8_t *src = static_cast<const uint8_t *>(image->getTexelPointer(srcOffset, srcSubresource));
	uint8_t *dst = static_cast<uint8_t *>(image->getTexelPointer(dstOffset, dstSubresource));
	ASSERT((src < image->end()) && ((src + (w * srcDelta)) < image->end()));
	ASSERT((dst < image->end()) && ((dst + (w * dstDelta)) < image->end()));

	// Copy one edge's worth of texels (w == h, so w covers either direction).
	for(int i = 0; i < w; ++i, dst += dstDelta, src += srcDelta)
	{
		memcpy(dst, src, bytes);
	}
}
2413
2414 } // namespace sw
2415