xref: /aosp_15_r20/external/swiftshader/src/Pipeline/PixelRoutine.cpp (revision 03ce13f70fcc45d86ee91b7ee4cab1936a95046e)
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "PixelRoutine.hpp"
16 
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "System/Math.hpp"
24 #include "Vulkan/VkPipelineLayout.hpp"
25 #include "Vulkan/VkStringify.hpp"
26 
27 namespace sw {
28 namespace {
29 
shouldUsePerSampleShading(const PixelProcessor::State & state,const SpirvShader * spirvShader)30 bool shouldUsePerSampleShading(const PixelProcessor::State &state, const SpirvShader *spirvShader)
31 {
32 	if(state.sampleShadingEnabled && (state.minSampleShading * state.multiSampleCount > 1.0f))
33 	{
34 		return true;
35 	}
36 
37 	if(spirvShader)
38 	{
39 		if(spirvShader->getUsedCapabilities().InterpolationFunction)  // TODO(b/194714095)
40 		{
41 			return true;
42 		}
43 
44 		if(spirvShader->getUsedCapabilities().SampleRateShading)
45 		{
46 			return true;
47 		}
48 	}
49 
50 	return false;
51 }
52 
53 }  // namespace
54 
PixelRoutine(const PixelProcessor::State & state,const vk::PipelineLayout * pipelineLayout,const SpirvShader * spirvShader,const vk::Attachments & attachments,const vk::DescriptorSet::Bindings & descriptorSets)55 PixelRoutine::PixelRoutine(
56     const PixelProcessor::State &state,
57     const vk::PipelineLayout *pipelineLayout,
58     const SpirvShader *spirvShader,
59     const vk::Attachments &attachments,
60     const vk::DescriptorSet::Bindings &descriptorSets)
61     : QuadRasterizer(state, spirvShader)
62     , routine(pipelineLayout)
63     , attachments(attachments)
64     , descriptorSets(descriptorSets)
65     , shaderContainsInterpolation(spirvShader && spirvShader->getUsedCapabilities().InterpolationFunction)
66     , perSampleShading(shouldUsePerSampleShading(state, spirvShader))
67     , invocationCount(perSampleShading ? state.multiSampleCount : 1)
68 {
69 	if(spirvShader)
70 	{
71 		spirvShader->emitProlog(&routine);
72 	}
73 }
74 
~PixelRoutine()75 PixelRoutine::~PixelRoutine()
76 {
77 }
78 
getSampleSet(int invocation) const79 PixelRoutine::SampleSet PixelRoutine::getSampleSet(int invocation) const
80 {
81 	unsigned int sampleBegin = perSampleShading ? invocation : 0;
82 	unsigned int sampleEnd = perSampleShading ? (invocation + 1) : state.multiSampleCount;
83 
84 	SampleSet samples;
85 
86 	for(unsigned int q = sampleBegin; q < sampleEnd; q++)
87 	{
88 		if(state.multiSampleMask & (1 << q))
89 		{
90 			samples.push_back(q);
91 		}
92 	}
93 
94 	return samples;
95 }
96 
// Emits the pixel pipeline for one quad. For each shader invocation it:
//   1. runs the stencil test,
//   2. interpolates depth (with optional depth bias),
//   3. runs the depth and depth-bounds tests and writes stencil/depth, before
//      the shader when early fragment tests apply,
//   4. computes centroid positions, interpolates shader inputs, sets builtins,
//      applies clip distances and executes the shader,
//   5. applies the alpha test, runs the late depth tests if they were not done
//      early, and blends color.
//
// cBuffer/zBuffer/sBuffer point at the color, depth and stencil buffers,
// cMask holds one coverage mask per sample, and x/y locate the quad.
// cMask is modified (clip distances; passed on to the shader and alpha test).
//
// Lower-case `if` statements test pipeline state while the routine is being
// built; `If(...)` is a Reactor construct that branches on values in the
// generated routine.
quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)97 void PixelRoutine::quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
98 {
	// With no shader, or a shader that declares EarlyFragmentTests, depth and
	// stencil are resolved before shading; otherwise they are resolved after.
99 	const bool earlyFragmentTests = !spirvShader || spirvShader->getExecutionModes().EarlyFragmentTests;
100 
101 	Int zMask[4];  // Depth mask
102 	Int sMask[4];  // Stencil mask
	// Depth before clampDepth() is applied; handed to setBuiltins() below.
103 	SIMD::Float unclampedZ[4];
104 
105 	for(int invocation = 0; invocation < invocationCount; invocation++)
106 	{
107 		SampleSet samples = getSampleSet(invocation);
108 
		// Every sample of this invocation is disabled in the multisample mask.
109 		if(samples.empty())
110 		{
111 			continue;
112 		}
113 
		// Depth and stencil masks start out equal to the coverage mask.
114 		for(unsigned int q : samples)
115 		{
116 			zMask[q] = cMask[q];
117 			sMask[q] = cMask[q];
118 		}
119 
120 		stencilTest(sBuffer, x, sMask, samples);
121 
122 		SIMD::Float rhwCentroid;
123 
124 		// Compute the x coordinate of each fragment in the SIMD group.
125 		const auto xMorton = SIMD::Float([](int i) { return float(compactEvenBits(i)); });  // 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, ...
126 		xFragment = SIMD::Float(Float(x)) + xMorton - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, x0)));
127 
128 		if(interpolateZ())
129 		{
130 			for(unsigned int q : samples)
131 			{
				// Local x shadows the parameter: fragment x used for this sample's depth.
132 				SIMD::Float x = xFragment;
133 
134 				if(state.enableMultiSampling)
135 				{
					// Adjust by this sample's x location from the constants table.
136 					x -= SIMD::Float(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsX) + q * sizeof(float)));
137 				}
138 
139 				z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);
140 
141 				if(state.depthBias)
142 				{
143 					z[q] += SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, zBias)));
144 				}
145 
146 				unclampedZ[q] = z[q];
147 			}
148 		}
149 
		// Set when at least one sample has pixels passing the depth test.
150 		Bool depthPass = false;
151 
152 		if(earlyFragmentTests)
153 		{
154 			for(unsigned int q : samples)
155 			{
156 				z[q] = clampDepth(z[q]);
157 				depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
158 				depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
159 			}
160 
161 			writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
162 		}
163 
		// With late tests the shader always runs; the depth test follows it.
164 		If(depthPass || !earlyFragmentTests)
165 		{
166 			if(earlyFragmentTests)
167 			{
168 				writeDepth(zBuffer, x, zMask, samples);
169 				occlusionSampleCount(zMask, sMask, samples);
170 			}
171 
172 			// TODO(b/236162233): Use SIMD::Float2
173 			SIMD::Float xCentroid = 0.0f;
174 			SIMD::Float yCentroid = 0.0f;
175 
176 			if(state.centroid || shaderContainsInterpolation)  // TODO(b/194714095)
177 			{
				// Small non-zero start value; presumably keeps the reciprocal
				// below finite when no sample contributes any weight.
178 				SIMD::Float weight = 1.0e-9f;
179 
180 				for(unsigned int q : samples)
181 				{
182 					ASSERT(SIMD::Width == 4);
					// Tables are indexed by the coverage mask; each entry is
					// one Float4 (16 bytes).
183 					xCentroid += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]));
184 					yCentroid += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]));
185 					weight += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]));
186 				}
187 
				// Turn the accumulated offsets into a weighted average.
188 				weight = Rcp(weight, true /* relaxedPrecision */);
189 				xCentroid *= weight;
190 				yCentroid *= weight;
191 
				// Offsets are relative to the fragment position.
192 				xCentroid += xFragment;
193 				yCentroid += yFragment;
194 			}
195 
196 			if(interpolateW())
197 			{
198 				w = interpolate(xFragment, Dw, rhw, primitive + OFFSET(Primitive, w), false, false);
199 				rhw = reciprocal(w, false, true);
200 
201 				if(state.centroid || shaderContainsInterpolation)  // TODO(b/194714095)
202 				{
203 					rhwCentroid = reciprocal(SpirvRoutine::interpolateAtXY(xCentroid, yCentroid, rhwCentroid, primitive + OFFSET(Primitive, w), SpirvRoutine::Linear));
204 				}
205 			}
206 
207 			if(spirvShader)
208 			{
209 				if(shaderContainsInterpolation)  // TODO(b/194714095)
210 				{
					// Make the positions and 1/w values available to the shader routine.
211 					routine.interpolationData.primitive = primitive;
212 
213 					routine.interpolationData.x = xFragment;
214 					routine.interpolationData.y = yFragment;
215 					routine.interpolationData.rhw = rhw;
216 
217 					routine.interpolationData.xCentroid = xCentroid;
218 					routine.interpolationData.yCentroid = yCentroid;
219 					routine.interpolationData.rhwCentroid = rhwCentroid;
220 				}
221 
222 				SIMD::Float xSample = xFragment;
223 				SIMD::Float ySample = yFragment;
224 
225 				if(perSampleShading && (state.multiSampleCount > 1))
226 				{
					// Offset by the location of the first sample in this invocation's set.
227 					xSample += SampleLocationsX[samples[0]];
228 					ySample += SampleLocationsY[samples[0]];
229 				}
230 
				// packedInterpolant counts only the used inputs and indexes
				// Primitive::V; routine.inputs keeps the interface index.
231 				int packedInterpolant = 0;
232 				for(int interfaceInterpolant = 0; interfaceInterpolant < MAX_INTERFACE_COMPONENTS; interfaceInterpolant++)
233 				{
234 					const auto &input = spirvShader->inputs[interfaceInterpolant];
235 					if(input.Type != Spirv::ATTRIBTYPE_UNUSED)
236 					{
237 						routine.inputsInterpolation[packedInterpolant] = input.Flat ? SpirvRoutine::Flat : (input.NoPerspective ? SpirvRoutine::Linear : SpirvRoutine::Perspective);
						// Interpolation position: centroid, sample, or fragment.
238 						if(input.Centroid && state.enableMultiSampling)
239 						{
240 							routine.inputs[interfaceInterpolant] =
241 							    SpirvRoutine::interpolateAtXY(xCentroid, yCentroid, rhwCentroid,
242 							                                  primitive + OFFSET(Primitive, V[packedInterpolant]),
243 							                                  routine.inputsInterpolation[packedInterpolant]);
244 						}
245 						else if(perSampleShading)
246 						{
247 							routine.inputs[interfaceInterpolant] =
248 							    SpirvRoutine::interpolateAtXY(xSample, ySample, rhw,
249 							                                  primitive + OFFSET(Primitive, V[packedInterpolant]),
250 							                                  routine.inputsInterpolation[packedInterpolant]);
251 						}
252 						else
253 						{
254 							routine.inputs[interfaceInterpolant] =
255 							    interpolate(xFragment, Dv[interfaceInterpolant], rhw,
256 							                primitive + OFFSET(Primitive, V[packedInterpolant]),
257 							                input.Flat, !input.NoPerspective);
258 						}
259 						packedInterpolant++;
260 					}
261 				}
262 
263 				setBuiltins(x, y, unclampedZ, w, cMask, samples);
264 
265 				for(uint32_t i = 0; i < state.numClipDistances; i++)
266 				{
267 					auto distance = interpolate(xFragment, DclipDistance[i], rhw,
268 					                            primitive + OFFSET(Primitive, clipDistance[i]),
269 					                            false, true);
270 
					// Coverage is kept only where the distance is >= 0.
271 					auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
272 					for(unsigned int q : samples)
273 					{
274 						// FIXME(b/148105887): Fragments discarded by clipping do not exist at
275 						// all -- they should not be counted in queries or have their Z/S effects
276 						// performed when early fragment tests are enabled.
277 						cMask[q] &= clipMask;
278 					}
279 
					// Also expose the distance to the shader's ClipDistance input, if it has one.
280 					if(spirvShader->getUsedCapabilities().ClipDistance)
281 					{
282 						auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
283 						if(it != spirvShader->inputBuiltins.end())
284 						{
285 							if(i < it->second.SizeInComponents)
286 							{
287 								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
288 							}
289 						}
290 					}
291 				}
292 
				// Cull distances are only passed to the shader here; they do not change coverage.
293 				if(spirvShader->getUsedCapabilities().CullDistance)
294 				{
295 					auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
296 					if(it != spirvShader->inputBuiltins.end())
297 					{
298 						for(uint32_t i = 0; i < state.numCullDistances; i++)
299 						{
300 							if(i < it->second.SizeInComponents)
301 							{
302 								routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
303 								    interpolate(xFragment, DcullDistance[i], rhw,
304 								                primitive + OFFSET(Primitive, cullDistance[i]),
305 								                false, true);
306 							}
307 						}
308 					}
309 				}
310 			}
311 
312 			if(spirvShader)
313 			{
				// With late tests the stencil and depth masks are not resolved
				// yet, so the coverage mask is passed in their place.
314 				executeShader(cMask, earlyFragmentTests ? sMask : cMask, earlyFragmentTests ? zMask : cMask, samples);
315 			}
316 
317 			Bool alphaPass = alphaTest(cMask, samples);
318 
			// Coverage may have changed; restrict the depth and stencil masks to it.
319 			if((spirvShader && spirvShader->coverageModified()) || state.alphaToCoverage)
320 			{
321 				for(unsigned int q : samples)
322 				{
323 					zMask[q] &= cMask[q];
324 					sMask[q] &= cMask[q];
325 				}
326 			}
327 
328 			If(alphaPass)
329 			{
				// Late depth and depth-bounds tests, run after the shader.
330 				if(!earlyFragmentTests)
331 				{
332 					for(unsigned int q : samples)
333 					{
334 						z[q] = clampDepth(z[q]);
335 						depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
336 						depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
337 					}
338 				}
339 
340 				If(depthPass)
341 				{
342 					if(!earlyFragmentTests)
343 					{
344 						writeDepth(zBuffer, x, zMask, samples);
345 						occlusionSampleCount(zMask, sMask, samples);
346 					}
347 
348 					blendColor(cBuffer, x, sMask, zMask, cMask, samples);
349 				}
350 			}
351 		}
352 
		// Late stencil write; sits outside the If() above, so it is emitted
		// regardless of the depth and alpha results.
353 		if(!earlyFragmentTests)
354 		{
355 			writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
356 		}
357 	}
358 }
359 
359 
stencilTest(const Pointer<Byte> & sBuffer,const Int & x,Int sMask[4],const SampleSet & samples)360 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, const Int &x, Int sMask[4], const SampleSet &samples)
361 {
362 	if(!state.stencilActive)
363 	{
364 		return;
365 	}
366 
367 	for(unsigned int q : samples)
368 	{
369 		// (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
370 
371 		Pointer<Byte> buffer = sBuffer + x;
372 
373 		if(q > 0)
374 		{
375 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
376 		}
377 
378 		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
379 		Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
380 		value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
381 		Byte8 valueBack = value;
382 
383 		if(state.frontStencil.useCompareMask)
384 		{
385 			value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
386 		}
387 
388 		stencilTest(value, state.frontStencil.compareOp, false);
389 
390 		if(state.backStencil.useCompareMask)
391 		{
392 			valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
393 		}
394 
395 		stencilTest(valueBack, state.backStencil.compareOp, true);
396 
397 		value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
398 		valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
399 		value |= valueBack;
400 
401 		sMask[q] &= SignMask(value);
402 	}
403 }
404 
// Applies one stencil compare op to `value` in place. On entry `value` holds
// the stencil buffer values; on return each byte holds the comparison result
// (all ones for ALWAYS, all zeros for NEVER). The caller reduces the result
// with SignMask().
//
// isBack selects which entry of DrawData::stencil[] supplies the reference.
// In the comments below, `a` is the reference and `b` the buffer value.
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)405 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
406 {
407 	Byte8 equal;
408 
409 	switch(stencilCompareMode)
410 	{
411 	case VK_COMPARE_OP_ALWAYS:
412 		value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
413 		break;
414 	case VK_COMPARE_OP_NEVER:
415 		value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
416 		break;
417 	case VK_COMPARE_OP_LESS:  // a < b ~ b > a
		// Adding 0x80 flips the top bit so the signed CmpGT orders the bytes
		// as unsigned values; referenceMaskedSignedQ is presumably biased
		// the same way -- TODO confirm where DrawData is filled in.
418 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
419 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
420 		break;
421 	case VK_COMPARE_OP_EQUAL:
422 		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
423 		break;
424 	case VK_COMPARE_OP_NOT_EQUAL:  // a != b ~ !(a == b)
425 		value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
		// Invert the equality result.
426 		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
427 		break;
428 	case VK_COMPARE_OP_LESS_OR_EQUAL:  // a <= b ~ (b > a) || (a == b)
		// The equality part uses the unbiased value, so compute it first.
429 		equal = value;
430 		equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
431 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
432 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
433 		value |= equal;
434 		break;
435 	case VK_COMPARE_OP_GREATER:  // a > b
		// `equal` is reused as scratch here: it holds the reference, then the result.
436 		equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
437 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
438 		equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
439 		value = equal;
440 		break;
441 	case VK_COMPARE_OP_GREATER_OR_EQUAL:  // a >= b ~ !(a < b) ~ !(b > a)
442 		value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
443 		value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
		// Invert the "b > a" result.
444 		value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
445 		break;
446 	default:
		// `value` is left unmodified for an unknown compare op.
447 		UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
448 	}
449 }
450 
450 
readDepth32F(const Pointer<Byte> & zBuffer,int q,const Int & x) const451 SIMD::Float PixelRoutine::readDepth32F(const Pointer<Byte> &zBuffer, int q, const Int &x) const
452 {
453 	ASSERT(SIMD::Width == 4);
454 	Pointer<Byte> buffer = zBuffer + 4 * x;
455 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
456 
457 	if(q > 0)
458 	{
459 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
460 	}
461 
462 	Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
463 	return SIMD::Float(zValue);
464 }
465 
readDepth16(const Pointer<Byte> & zBuffer,int q,const Int & x) const466 SIMD::Float PixelRoutine::readDepth16(const Pointer<Byte> &zBuffer, int q, const Int &x) const
467 {
468 	ASSERT(SIMD::Width == 4);
469 	Pointer<Byte> buffer = zBuffer + 2 * x;
470 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
471 
472 	if(q > 0)
473 	{
474 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
475 	}
476 
477 	UShort4 zValue16;
478 	zValue16 = As<UShort4>(Insert(As<Int2>(zValue16), *Pointer<Int>(buffer), 0));
479 	zValue16 = As<UShort4>(Insert(As<Int2>(zValue16), *Pointer<Int>(buffer + pitch), 1));
480 	Float4 zValue = Float4(zValue16);
481 	return SIMD::Float(zValue);
482 }
483 
// Clamps `z` to [minDepthClamp, maxDepthClamp] when depth clamping is
// enabled; otherwise returns it unchanged.
SIMD::Float PixelRoutine::clampDepth(const SIMD::Float &z)
{
	if(state.depthClamp)
	{
		return Min(Max(z, state.minDepthClamp), state.maxDepthClamp);
	}

	return z;
}
493 
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const SIMD::Float & z,const Int & sMask,Int & zMask,const Int & cMask)494 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask)
495 {
496 	if(!state.depthTestActive)
497 	{
498 		return true;
499 	}
500 
501 	SIMD::Float Z;
502 	SIMD::Float zValue;
503 
504 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
505 	{
506 		switch(state.depthFormat)
507 		{
508 		case VK_FORMAT_D16_UNORM:
509 			Z = Min(Max(Round(z * 0xFFFF), 0.0f), 0xFFFF);
510 			zValue = readDepth16(zBuffer, q, x);
511 			break;
512 		case VK_FORMAT_D32_SFLOAT:
513 		case VK_FORMAT_D32_SFLOAT_S8_UINT:
514 			Z = z;
515 			zValue = readDepth32F(zBuffer, q, x);
516 			break;
517 		default:
518 			UNSUPPORTED("Depth format: %d", int(state.depthFormat));
519 			return false;
520 		}
521 	}
522 
523 	SIMD::Int zTest;
524 
525 	switch(state.depthCompareMode)
526 	{
527 	case VK_COMPARE_OP_ALWAYS:
528 		// Optimized
529 		break;
530 	case VK_COMPARE_OP_NEVER:
531 		// Optimized
532 		break;
533 	case VK_COMPARE_OP_EQUAL:
534 		zTest = CmpEQ(zValue, Z);
535 		break;
536 	case VK_COMPARE_OP_NOT_EQUAL:
537 		zTest = CmpNEQ(zValue, Z);
538 		break;
539 	case VK_COMPARE_OP_LESS:
540 		zTest = CmpNLE(zValue, Z);
541 		break;
542 	case VK_COMPARE_OP_GREATER_OR_EQUAL:
543 		zTest = CmpLE(zValue, Z);
544 		break;
545 	case VK_COMPARE_OP_LESS_OR_EQUAL:
546 		zTest = CmpNLT(zValue, Z);
547 		break;
548 	case VK_COMPARE_OP_GREATER:
549 		zTest = CmpLT(zValue, Z);
550 		break;
551 	default:
552 		UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
553 	}
554 
555 	switch(state.depthCompareMode)
556 	{
557 	case VK_COMPARE_OP_ALWAYS:
558 		zMask = cMask;
559 		break;
560 	case VK_COMPARE_OP_NEVER:
561 		zMask = 0x0;
562 		break;
563 	default:
564 		zMask = SignMask(zTest) & cMask;
565 		break;
566 	}
567 
568 	if(state.stencilActive)
569 	{
570 		zMask &= sMask;
571 	}
572 
573 	return zMask != 0;
574 }
575 
depthBoundsTest16(const Pointer<Byte> & zBuffer,int q,const Int & x)576 Int4 PixelRoutine::depthBoundsTest16(const Pointer<Byte> &zBuffer, int q, const Int &x)
577 {
578 	Pointer<Byte> buffer = zBuffer + 2 * x;
579 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
580 
581 	if(q > 0)
582 	{
583 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
584 	}
585 
586 	Float4 minDepthBound(state.minDepthBounds);
587 	Float4 maxDepthBound(state.maxDepthBounds);
588 
589 	Int2 z;
590 	z = Insert(z, *Pointer<Int>(buffer), 0);
591 	z = Insert(z, *Pointer<Int>(buffer + pitch), 1);
592 
593 	Float4 zValue = Float4(As<UShort4>(z)) * (1.0f / 0xFFFF);
594 	return Int4(CmpLE(minDepthBound, zValue) & CmpLE(zValue, maxDepthBound));
595 }
596 
depthBoundsTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x)597 Int4 PixelRoutine::depthBoundsTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x)
598 {
599 	Pointer<Byte> buffer = zBuffer + 4 * x;
600 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
601 
602 	if(q > 0)
603 	{
604 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
605 	}
606 
607 	Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
608 	return Int4(CmpLE(state.minDepthBounds, zValue) & CmpLE(zValue, state.maxDepthBounds));
609 }
610 
depthBoundsTest(const Pointer<Byte> & zBuffer,int q,const Int & x,Int & zMask,Int & cMask)611 void PixelRoutine::depthBoundsTest(const Pointer<Byte> &zBuffer, int q, const Int &x, Int &zMask, Int &cMask)
612 {
613 	if(!state.depthBoundsTestActive)
614 	{
615 		return;
616 	}
617 
618 	Int4 zTest;
619 	switch(state.depthFormat)
620 	{
621 	case VK_FORMAT_D16_UNORM:
622 		zTest = depthBoundsTest16(zBuffer, q, x);
623 		break;
624 	case VK_FORMAT_D32_SFLOAT:
625 	case VK_FORMAT_D32_SFLOAT_S8_UINT:
626 		zTest = depthBoundsTest32F(zBuffer, q, x);
627 		break;
628 	default:
629 		UNSUPPORTED("Depth format: %d", int(state.depthFormat));
630 		break;
631 	}
632 
633 	if(!state.depthTestActive)
634 	{
635 		cMask &= zMask & SignMask(zTest);
636 	}
637 	else
638 	{
639 		zMask &= cMask & SignMask(zTest);
640 	}
641 }
642 
alphaToCoverage(Int cMask[4],const SIMD::Float & alpha,const SampleSet & samples)643 void PixelRoutine::alphaToCoverage(Int cMask[4], const SIMD::Float &alpha, const SampleSet &samples)
644 {
645 	static const int a2c[4] = {
646 		OFFSET(DrawData, a2c0),
647 		OFFSET(DrawData, a2c1),
648 		OFFSET(DrawData, a2c2),
649 		OFFSET(DrawData, a2c3),
650 	};
651 
652 	for(unsigned int q : samples)
653 	{
654 		SIMD::Int coverage = CmpNLT(alpha, SIMD::Float(*Pointer<Float>(data + a2c[q])));
655 		Int aMask = SignMask(coverage);
656 		cMask[q] &= aMask;
657 	}
658 }
659 
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)660 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
661 {
662 	Float4 Z = z;
663 
664 	Pointer<Byte> buffer = zBuffer + 4 * x;
665 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
666 
667 	if(q > 0)
668 	{
669 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
670 	}
671 
672 	Float4 zValue;
673 
674 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
675 	{
676 		zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
677 	}
678 
679 	Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
680 	zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
681 	Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
682 
683 	*Pointer<Float2>(buffer) = Float2(Z.xy);
684 	*Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
685 }
686 
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)687 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
688 {
689 	Short4 Z = UShort4(Round(z * 0xFFFF), true);
690 
691 	Pointer<Byte> buffer = zBuffer + 2 * x;
692 	Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
693 
694 	if(q > 0)
695 	{
696 		buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
697 	}
698 
699 	Short4 zValue;
700 
701 	if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
702 	{
703 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
704 		zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
705 	}
706 
707 	Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
708 	zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
709 	Z = Z | zValue;
710 
711 	*Pointer<Int>(buffer) = Extract(As<Int2>(Z), 0);
712 	*Pointer<Int>(buffer + pitch) = Extract(As<Int2>(Z), 1);
713 }
714 
writeDepth(Pointer<Byte> & zBuffer,const Int & x,const Int zMask[4],const SampleSet & samples)715 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples)
716 {
717 	if(!state.depthWriteEnable)
718 	{
719 		return;
720 	}
721 
722 	for(unsigned int q : samples)
723 	{
724 		ASSERT(SIMD::Width == 4);
725 		switch(state.depthFormat)
726 		{
727 		case VK_FORMAT_D16_UNORM:
728 			writeDepth16(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
729 			break;
730 		case VK_FORMAT_D32_SFLOAT:
731 		case VK_FORMAT_D32_SFLOAT_S8_UINT:
732 			writeDepth32F(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
733 			break;
734 		default:
735 			UNSUPPORTED("Depth format: %d", int(state.depthFormat));
736 			break;
737 		}
738 	}
739 }
740 
occlusionSampleCount(const Int zMask[4],const Int sMask[4],const SampleSet & samples)741 void PixelRoutine::occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples)
742 {
743 	if(!state.occlusionEnabled)
744 	{
745 		return;
746 	}
747 
748 	for(unsigned int q : samples)
749 	{
750 		occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
751 	}
752 }
753 
writeStencil(Pointer<Byte> & sBuffer,const Int & x,const Int sMask[4],const Int zMask[4],const Int cMask[4],const SampleSet & samples)754 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples)
755 {
756 	if(!state.stencilActive)
757 	{
758 		return;
759 	}
760 
761 	if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
762 	{
763 		if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
764 		{
765 			return;
766 		}
767 	}
768 
769 	if(!state.frontStencil.writeEnabled && !state.backStencil.writeEnabled)
770 	{
771 		return;
772 	}
773 
774 	for(unsigned int q : samples)
775 	{
776 		Pointer<Byte> buffer = sBuffer + x;
777 
778 		if(q > 0)
779 		{
780 			buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
781 		}
782 
783 		Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
784 		Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
785 		bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
786 		Byte8 newValue = stencilOperation(bufferValue, state.frontStencil, false, zMask[q], sMask[q]);
787 
788 		if(state.frontStencil.useWriteMask)  // Assume 8-bit stencil buffer
789 		{
790 			Byte8 maskedValue = bufferValue;
791 			newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
792 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
793 			newValue |= maskedValue;
794 		}
795 
796 		Byte8 newValueBack = stencilOperation(bufferValue, state.backStencil, true, zMask[q], sMask[q]);
797 
798 		if(state.backStencil.useWriteMask)  // Assume 8-bit stencil buffer
799 		{
800 			Byte8 maskedValue = bufferValue;
801 			newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
802 			maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
803 			newValueBack |= maskedValue;
804 		}
805 
806 		newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
807 		newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
808 		newValue |= newValueBack;
809 
810 		newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask[q]);
811 		bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask[q]);
812 		newValue |= bufferValue;
813 
814 		*Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
815 		*Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
816 	}
817 }
818 
// Computes the new stencil values for one face by combining the pass,
// depth-fail and stencil-fail operations of `ops`. Pixels are routed to the
// matching operation through zMask (depth test result) and sMask (stencil
// test result). An operation is only evaluated separately when it differs
// from the one it would otherwise share a result with.
Byte8 PixelRoutine::stencilOperation(const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
{
	Byte8 result = stencilOperation(bufferValue, ops.passOp, isBack);

	// zMask valid and values not the same
	const bool depthFailDiffers = state.depthTestActive && ops.depthFailOp != ops.passOp;

	if(depthFailDiffers)
	{
		Byte8 depthFailed = stencilOperation(bufferValue, ops.depthFailOp, isBack);

		result &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * zMask);
		depthFailed &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * zMask);
		result |= depthFailed;
	}

	const bool stencilFailDiffers = ops.failOp != ops.passOp ||
	                                (state.depthTestActive && ops.failOp != ops.depthFailOp);

	if(stencilFailDiffers)
	{
		Byte8 stencilFailed = stencilOperation(bufferValue, ops.failOp, isBack);

		result &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * sMask);
		stencilFailed &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * sMask);
		result |= stencilFailed;
	}

	return result;
}
843 
hasStencilReplaceRef() const844 bool PixelRoutine::hasStencilReplaceRef() const
845 {
846 	return spirvShader &&
847 	       (spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT) !=
848 	        spirvShader->outputBuiltins.end());
849 }
850 
stencilReplaceRef()851 Byte8 PixelRoutine::stencilReplaceRef()
852 {
853 	ASSERT(spirvShader);
854 
855 	auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
856 	ASSERT(it != spirvShader->outputBuiltins.end());
857 
858 	UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
859 	// TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
860 	//                     following line by either adding a rr::Shuffle() variant to do
861 	//                     it explicitly or adding a Byte4(Int4) constructor would work.
862 	sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
863 
864 	UInt2 sRefDuplicated;
865 	sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
866 	sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
867 	return As<Byte8>(sRefDuplicated);
868 }
869 
// Applies a single stencil operation to `bufferValue` and returns the result.
// If the shader exports FragStencilRefEXT, that exported value is returned
// whatever `operation` is. An unsupported operation yields zero.
Byte8 PixelRoutine::stencilOperation(const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
{
	if(hasStencilReplaceRef())
	{
		return stencilReplaceRef();
	}

	switch(operation)
	{
	case VK_STENCIL_OP_KEEP:
		return bufferValue;
	case VK_STENCIL_OP_ZERO:
		return Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
	case VK_STENCIL_OP_REPLACE:
		return *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
	case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
		return AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
	case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
		return SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
	case VK_STENCIL_OP_INVERT:
		return bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
	case VK_STENCIL_OP_INCREMENT_AND_WRAP:
		return bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
	case VK_STENCIL_OP_DECREMENT_AND_WRAP:
		return bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
	default:
		UNSUPPORTED("VkStencilOp: %d", int(operation));
	}

	// Only reached for an unsupported operation.
	return Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
}
903 
isSRGB(int index) const904 bool PixelRoutine::isSRGB(int index) const
905 {
906 	return vk::Format(state.colorFormat[index]).isSRGBformat();
907 }
908 
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)909 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
910 {
911 	Short4 c01;
912 	Short4 c23;
913 	Pointer<Byte> buffer = cBuffer;
914 	Pointer<Byte> buffer2;
915 
916 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
917 
918 	vk::Format format = state.colorFormat[index];
919 	switch(format)
920 	{
921 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
922 		buffer += 2 * x;
923 		buffer2 = buffer + pitchB;
924 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
925 
926 		pixel.x = (c01 & Short4(0xF000u));
927 		pixel.y = (c01 & Short4(0x0F00u)) << 4;
928 		pixel.z = (c01 & Short4(0x00F0u)) << 8;
929 		pixel.w = (c01 & Short4(0x000Fu)) << 12;
930 
931 		// Expand to 16 bit range
932 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
933 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
934 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
935 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
936 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
937 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
938 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
939 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
940 		break;
941 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
942 		buffer += 2 * x;
943 		buffer2 = buffer + pitchB;
944 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
945 
946 		pixel.z = (c01 & Short4(0xF000u));
947 		pixel.y = (c01 & Short4(0x0F00u)) << 4;
948 		pixel.x = (c01 & Short4(0x00F0u)) << 8;
949 		pixel.w = (c01 & Short4(0x000Fu)) << 12;
950 
951 		// Expand to 16 bit range
952 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
953 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
954 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
955 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
956 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
957 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
958 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
959 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
960 		break;
961 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
962 		buffer += 2 * x;
963 		buffer2 = buffer + pitchB;
964 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
965 
966 		pixel.w = (c01 & Short4(0xF000u));
967 		pixel.z = (c01 & Short4(0x0F00u)) << 4;
968 		pixel.y = (c01 & Short4(0x00F0u)) << 8;
969 		pixel.x = (c01 & Short4(0x000Fu)) << 12;
970 
971 		// Expand to 16 bit range
972 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
973 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
974 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
975 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
976 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
977 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
978 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
979 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
980 		break;
981 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
982 		buffer += 2 * x;
983 		buffer2 = buffer + pitchB;
984 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
985 
986 		pixel.w = (c01 & Short4(0xF000u));
987 		pixel.x = (c01 & Short4(0x0F00u)) << 4;
988 		pixel.y = (c01 & Short4(0x00F0u)) << 8;
989 		pixel.z = (c01 & Short4(0x000Fu)) << 12;
990 
991 		// Expand to 16 bit range
992 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
993 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
994 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
995 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
996 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
997 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
998 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
999 		pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1000 		break;
1001 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1002 		buffer += 2 * x;
1003 		buffer2 = buffer + pitchB;
1004 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1005 
1006 		pixel.x = (c01 & Short4(0xF800u));
1007 		pixel.y = (c01 & Short4(0x07C0u)) << 5;
1008 		pixel.z = (c01 & Short4(0x003Eu)) << 10;
1009 		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1010 
1011 		// Expand to 16 bit range
1012 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1013 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1014 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1015 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1016 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1017 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1018 		break;
1019 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1020 		buffer += 2 * x;
1021 		buffer2 = buffer + pitchB;
1022 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1023 
1024 		pixel.z = (c01 & Short4(0xF800u));
1025 		pixel.y = (c01 & Short4(0x07C0u)) << 5;
1026 		pixel.x = (c01 & Short4(0x003Eu)) << 10;
1027 		pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1028 
1029 		// Expand to 16 bit range
1030 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1031 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1032 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1033 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1034 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1035 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1036 		break;
1037 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1038 		buffer += 2 * x;
1039 		buffer2 = buffer + pitchB;
1040 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1041 
1042 		pixel.x = (c01 & Short4(0x7C00u)) << 1;
1043 		pixel.y = (c01 & Short4(0x03E0u)) << 6;
1044 		pixel.z = (c01 & Short4(0x001Fu)) << 11;
1045 		pixel.w = (c01 & Short4(0x8000u)) >> 15;
1046 
1047 		// Expand to 16 bit range
1048 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1049 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1050 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1051 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1052 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1053 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1054 		break;
1055 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
1056 		buffer += 2 * x;
1057 		buffer2 = buffer + pitchB;
1058 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1059 
1060 		pixel.x = c01 & Short4(0xF800u);
1061 		pixel.y = (c01 & Short4(0x07E0u)) << 5;
1062 		pixel.z = (c01 & Short4(0x001Fu)) << 11;
1063 		pixel.w = Short4(0xFFFFu);
1064 
1065 		// Expand to 16 bit range
1066 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1067 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1068 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1069 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1070 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1071 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1072 		break;
1073 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
1074 		buffer += 2 * x;
1075 		buffer2 = buffer + pitchB;
1076 		c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1077 
1078 		pixel.z = c01 & Short4(0xF800u);
1079 		pixel.y = (c01 & Short4(0x07E0u)) << 5;
1080 		pixel.x = (c01 & Short4(0x001Fu)) << 11;
1081 		pixel.w = Short4(0xFFFFu);
1082 
1083 		// Expand to 16 bit range
1084 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1085 		pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1086 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1087 		pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1088 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1089 		pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1090 		break;
1091 	case VK_FORMAT_B8G8R8A8_UNORM:
1092 	case VK_FORMAT_B8G8R8A8_SRGB:
1093 		buffer += 4 * x;
1094 		c01 = *Pointer<Short4>(buffer);
1095 		buffer += pitchB;
1096 		c23 = *Pointer<Short4>(buffer);
1097 		pixel.z = c01;
1098 		pixel.y = c01;
1099 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1100 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1101 		pixel.x = pixel.z;
1102 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1103 		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1104 		pixel.y = pixel.z;
1105 		pixel.w = pixel.x;
1106 		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1107 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1108 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1109 		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1110 		break;
1111 	case VK_FORMAT_R8G8B8A8_UNORM:
1112 	case VK_FORMAT_R8G8B8A8_SRGB:
1113 		buffer += 4 * x;
1114 		c01 = *Pointer<Short4>(buffer);
1115 		buffer += pitchB;
1116 		c23 = *Pointer<Short4>(buffer);
1117 		pixel.z = c01;
1118 		pixel.y = c01;
1119 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1120 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1121 		pixel.x = pixel.z;
1122 		pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1123 		pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1124 		pixel.y = pixel.z;
1125 		pixel.w = pixel.x;
1126 		pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1127 		pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1128 		pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1129 		pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1130 		break;
1131 	case VK_FORMAT_R8_UNORM:
1132 		buffer += 1 * x;
1133 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1134 		buffer += pitchB;
1135 		pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1136 		pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1137 		pixel.y = Short4(0x0000);
1138 		pixel.z = Short4(0x0000);
1139 		pixel.w = Short4(0xFFFFu);
1140 		break;
1141 	case VK_FORMAT_R8G8_UNORM:
1142 		buffer += 2 * x;
1143 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1144 		buffer += pitchB;
1145 		c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1146 		pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1147 		pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1148 		pixel.z = Short4(0x0000u);
1149 		pixel.w = Short4(0xFFFFu);
1150 		break;
1151 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1152 		{
1153 			Int4 v = Int4(0);
1154 			buffer += 4 * x;
1155 			v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1156 			v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1157 			buffer += pitchB;
1158 			v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1159 			v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1160 
1161 			pixel.x = Short4(v << 6) & Short4(0xFFC0u);
1162 			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1163 			pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
1164 			pixel.w = Short4(v >> 16) & Short4(0xC000u);
1165 
1166 			// Expand to 16 bit range
1167 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1168 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1169 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1170 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1171 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1172 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1173 		}
1174 		break;
1175 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1176 		{
1177 			Int4 v = Int4(0);
1178 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1179 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1180 			buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1181 			v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1182 			v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1183 
1184 			pixel.x = Short4(v >> 14) & Short4(0xFFC0u);
1185 			pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1186 			pixel.z = Short4(v << 6) & Short4(0xFFC0u);
1187 			pixel.w = Short4(v >> 16) & Short4(0xC000u);
1188 
1189 			// Expand to 16 bit range
1190 			pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1191 			pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1192 			pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1193 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1194 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1195 			pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1196 		}
1197 		break;
1198 	default:
1199 		UNSUPPORTED("VkFormat %d", int(format));
1200 	}
1201 }
1202 
blendConstant(vk::Format format,int component,BlendFactorModifier modifier)1203 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
1204 {
1205 	bool inverse = (modifier == OneMinus);
1206 
1207 	if(format.isUnsignedNormalized())
1208 	{
1209 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantU.v[component]))
1210 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantU.v[component]));
1211 	}
1212 	else if(format.isSignedNormalized())
1213 	{
1214 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantS.v[component]))
1215 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantS.v[component]));
1216 	}
1217 	else  // Floating-point format
1218 	{
1219 		ASSERT(format.isFloatFormat());
1220 		return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantF.v[component]))
1221 		               : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantF.v[component]));
1222 	}
1223 }
1224 
blendFactorRGB(SIMD::Float4 & blendFactor,const SIMD::Float4 & sourceColor,const SIMD::Float4 & destColor,VkBlendFactor colorBlendFactor,vk::Format format)1225 void PixelRoutine::blendFactorRGB(SIMD::Float4 &blendFactor, const SIMD::Float4 &sourceColor, const SIMD::Float4 &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
1226 {
1227 	switch(colorBlendFactor)
1228 	{
1229 	case VK_BLEND_FACTOR_ZERO:
1230 		blendFactor.x = 0.0f;
1231 		blendFactor.y = 0.0f;
1232 		blendFactor.z = 0.0f;
1233 		break;
1234 	case VK_BLEND_FACTOR_ONE:
1235 		blendFactor.x = 1.0f;
1236 		blendFactor.y = 1.0f;
1237 		blendFactor.z = 1.0f;
1238 		break;
1239 	case VK_BLEND_FACTOR_SRC_COLOR:
1240 		blendFactor.x = sourceColor.x;
1241 		blendFactor.y = sourceColor.y;
1242 		blendFactor.z = sourceColor.z;
1243 		break;
1244 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1245 		blendFactor.x = 1.0f - sourceColor.x;
1246 		blendFactor.y = 1.0f - sourceColor.y;
1247 		blendFactor.z = 1.0f - sourceColor.z;
1248 		break;
1249 	case VK_BLEND_FACTOR_DST_COLOR:
1250 		blendFactor.x = destColor.x;
1251 		blendFactor.y = destColor.y;
1252 		blendFactor.z = destColor.z;
1253 		break;
1254 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1255 		blendFactor.x = 1.0f - destColor.x;
1256 		blendFactor.y = 1.0f - destColor.y;
1257 		blendFactor.z = 1.0f - destColor.z;
1258 		break;
1259 	case VK_BLEND_FACTOR_SRC_ALPHA:
1260 		blendFactor.x = sourceColor.w;
1261 		blendFactor.y = sourceColor.w;
1262 		blendFactor.z = sourceColor.w;
1263 		break;
1264 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1265 		blendFactor.x = 1.0f - sourceColor.w;
1266 		blendFactor.y = 1.0f - sourceColor.w;
1267 		blendFactor.z = 1.0f - sourceColor.w;
1268 		break;
1269 	case VK_BLEND_FACTOR_DST_ALPHA:
1270 		blendFactor.x = destColor.w;
1271 		blendFactor.y = destColor.w;
1272 		blendFactor.z = destColor.w;
1273 		break;
1274 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1275 		blendFactor.x = 1.0f - destColor.w;
1276 		blendFactor.y = 1.0f - destColor.w;
1277 		blendFactor.z = 1.0f - destColor.w;
1278 		break;
1279 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1280 		blendFactor.x = 1.0f - destColor.w;
1281 		blendFactor.x = Min(blendFactor.x, sourceColor.w);
1282 		blendFactor.y = blendFactor.x;
1283 		blendFactor.z = blendFactor.x;
1284 		break;
1285 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1286 		blendFactor.x = blendConstant(format, 0);
1287 		blendFactor.y = blendConstant(format, 1);
1288 		blendFactor.z = blendConstant(format, 2);
1289 		break;
1290 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1291 		blendFactor.x = blendConstant(format, 3);
1292 		blendFactor.y = blendConstant(format, 3);
1293 		blendFactor.z = blendConstant(format, 3);
1294 		break;
1295 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1296 		blendFactor.x = blendConstant(format, 0, OneMinus);
1297 		blendFactor.y = blendConstant(format, 1, OneMinus);
1298 		blendFactor.z = blendConstant(format, 2, OneMinus);
1299 		break;
1300 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1301 		blendFactor.x = blendConstant(format, 3, OneMinus);
1302 		blendFactor.y = blendConstant(format, 3, OneMinus);
1303 		blendFactor.z = blendConstant(format, 3, OneMinus);
1304 		break;
1305 
1306 	default:
1307 		UNSUPPORTED("VkBlendFactor: %d", int(colorBlendFactor));
1308 	}
1309 
1310 	// "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1311 	//  to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1312 	//  operations. If the color attachment is floating-point, no clamping occurs."
1313 	if(blendFactorCanExceedFormatRange(colorBlendFactor, format))
1314 	{
1315 		if(format.isUnsignedNormalized())
1316 		{
1317 			blendFactor.x = Min(Max(blendFactor.x, 0.0f), 1.0f);
1318 			blendFactor.y = Min(Max(blendFactor.y, 0.0f), 1.0f);
1319 			blendFactor.z = Min(Max(blendFactor.z, 0.0f), 1.0f);
1320 		}
1321 		else if(format.isSignedNormalized())
1322 		{
1323 			blendFactor.x = Min(Max(blendFactor.x, -1.0f), 1.0f);
1324 			blendFactor.y = Min(Max(blendFactor.y, -1.0f), 1.0f);
1325 			blendFactor.z = Min(Max(blendFactor.z, -1.0f), 1.0f);
1326 		}
1327 	}
1328 }
1329 
blendFactorAlpha(SIMD::Float & blendFactorAlpha,const SIMD::Float & sourceAlpha,const SIMD::Float & destAlpha,VkBlendFactor alphaBlendFactor,vk::Format format)1330 void PixelRoutine::blendFactorAlpha(SIMD::Float &blendFactorAlpha, const SIMD::Float &sourceAlpha, const SIMD::Float &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
1331 {
1332 	switch(alphaBlendFactor)
1333 	{
1334 	case VK_BLEND_FACTOR_ZERO:
1335 		blendFactorAlpha = 0.0f;
1336 		break;
1337 	case VK_BLEND_FACTOR_ONE:
1338 		blendFactorAlpha = 1.0f;
1339 		break;
1340 	case VK_BLEND_FACTOR_SRC_COLOR:
1341 		blendFactorAlpha = sourceAlpha;
1342 		break;
1343 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1344 		blendFactorAlpha = 1.0f - sourceAlpha;
1345 		break;
1346 	case VK_BLEND_FACTOR_DST_COLOR:
1347 		blendFactorAlpha = destAlpha;
1348 		break;
1349 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1350 		blendFactorAlpha = 1.0f - destAlpha;
1351 		break;
1352 	case VK_BLEND_FACTOR_SRC_ALPHA:
1353 		blendFactorAlpha = sourceAlpha;
1354 		break;
1355 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1356 		blendFactorAlpha = 1.0f - sourceAlpha;
1357 		break;
1358 	case VK_BLEND_FACTOR_DST_ALPHA:
1359 		blendFactorAlpha = destAlpha;
1360 		break;
1361 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1362 		blendFactorAlpha = 1.0f - destAlpha;
1363 		break;
1364 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1365 		blendFactorAlpha = 1.0f;
1366 		break;
1367 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1368 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1369 		blendFactorAlpha = blendConstant(format, 3);
1370 		break;
1371 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1372 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1373 		blendFactorAlpha = blendConstant(format, 3, OneMinus);
1374 		break;
1375 	default:
1376 		UNSUPPORTED("VkBlendFactor: %d", int(alphaBlendFactor));
1377 	}
1378 
1379 	// "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1380 	//  to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1381 	//  operations. If the color attachment is floating-point, no clamping occurs."
1382 	if(blendFactorCanExceedFormatRange(alphaBlendFactor, format))
1383 	{
1384 		if(format.isUnsignedNormalized())
1385 		{
1386 			blendFactorAlpha = Min(Max(blendFactorAlpha, 0.0f), 1.0f);
1387 		}
1388 		else if(format.isSignedNormalized())
1389 		{
1390 			blendFactorAlpha = Min(Max(blendFactorAlpha, -1.0f), 1.0f);
1391 		}
1392 	}
1393 }
1394 
blendOpOverlay(SIMD::Float & src,SIMD::Float & dst)1395 SIMD::Float PixelRoutine::blendOpOverlay(SIMD::Float &src, SIMD::Float &dst)
1396 {
1397 	SIMD::Int largeDst = CmpGT(dst, 0.5f);
1398 	return As<SIMD::Float>(
1399 	    (~largeDst & As<SIMD::Int>(2.0f * src * dst)) |
1400 	    (largeDst & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
1401 }
1402 
blendOpColorDodge(SIMD::Float & src,SIMD::Float & dst)1403 SIMD::Float PixelRoutine::blendOpColorDodge(SIMD::Float &src, SIMD::Float &dst)
1404 {
1405 	SIMD::Int srcBelowOne = CmpLT(src, 1.0f);
1406 	SIMD::Int positiveDst = CmpGT(dst, 0.0f);
1407 	return As<SIMD::Float>(positiveDst & ((~srcBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
1408 	                                      (srcBelowOne & As<SIMD::Int>(Min(1.0f, (dst / (1.0f - src)))))));
1409 }
1410 
blendOpColorBurn(SIMD::Float & src,SIMD::Float & dst)1411 SIMD::Float PixelRoutine::blendOpColorBurn(SIMD::Float &src, SIMD::Float &dst)
1412 {
1413 	SIMD::Int dstBelowOne = CmpLT(dst, 1.0f);
1414 	SIMD::Int positiveSrc = CmpGT(src, 0.0f);
1415 	return As<SIMD::Float>(
1416 	    (~dstBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
1417 	    (dstBelowOne & positiveSrc & As<SIMD::Int>(1.0f - Min(1.0f, (1.0f - dst) / src))));
1418 }
1419 
blendOpHardlight(SIMD::Float & src,SIMD::Float & dst)1420 SIMD::Float PixelRoutine::blendOpHardlight(SIMD::Float &src, SIMD::Float &dst)
1421 {
1422 	SIMD::Int largeSrc = CmpGT(src, 0.5f);
1423 	return As<SIMD::Float>(
1424 	    (~largeSrc & As<SIMD::Int>(2.0f * src * dst)) |
1425 	    (largeSrc & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
1426 }
1427 
blendOpSoftlight(SIMD::Float & src,SIMD::Float & dst)1428 SIMD::Float PixelRoutine::blendOpSoftlight(SIMD::Float &src, SIMD::Float &dst)
1429 {
1430 	SIMD::Int largeSrc = CmpGT(src, 0.5f);
1431 	SIMD::Int largeDst = CmpGT(dst, 0.25f);
1432 
1433 	return As<SIMD::Float>(
1434 	    (~largeSrc & As<SIMD::Int>(dst - ((1.0f - (2.0f * src)) * dst * (1.0f - dst)))) |
1435 	    (largeSrc & ((~largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * dst * ((((16.0f * dst) - 12.0f) * dst) + 3.0f)))) |
1436 	                 (largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
1437 }
1438 
maxRGB(SIMD::Float4 & c)1439 SIMD::Float PixelRoutine::maxRGB(SIMD::Float4 &c)
1440 {
1441 	return Max(Max(c.x, c.y), c.z);
1442 }
1443 
minRGB(SIMD::Float4 & c)1444 SIMD::Float PixelRoutine::minRGB(SIMD::Float4 &c)
1445 {
1446 	return Min(Min(c.x, c.y), c.z);
1447 }
1448 
setLumSat(SIMD::Float4 & cbase,SIMD::Float4 & csat,SIMD::Float4 & clum,SIMD::Float & x,SIMD::Float & y,SIMD::Float & z)1449 void PixelRoutine::setLumSat(SIMD::Float4 &cbase, SIMD::Float4 &csat, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
1450 {
1451 	SIMD::Float minbase = minRGB(cbase);
1452 	SIMD::Float sbase = maxRGB(cbase) - minbase;
1453 	SIMD::Float ssat = maxRGB(csat) - minRGB(csat);
1454 	SIMD::Int isNonZero = CmpGT(sbase, 0.0f);
1455 	SIMD::Float4 color;
1456 	color.x = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.x - minbase) * ssat / sbase));
1457 	color.y = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.y - minbase) * ssat / sbase));
1458 	color.z = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.z - minbase) * ssat / sbase));
1459 	setLum(color, clum, x, y, z);
1460 }
1461 
lumRGB(SIMD::Float4 & c)1462 SIMD::Float PixelRoutine::lumRGB(SIMD::Float4 &c)
1463 {
1464 	return c.x * 0.3f + c.y * 0.59f + c.z * 0.11f;
1465 }
1466 
computeLum(SIMD::Float & color,SIMD::Float & lum,SIMD::Float & mincol,SIMD::Float & maxcol,SIMD::Int & negative,SIMD::Int & aboveOne)1467 SIMD::Float PixelRoutine::computeLum(SIMD::Float &color, SIMD::Float &lum, SIMD::Float &mincol, SIMD::Float &maxcol, SIMD::Int &negative, SIMD::Int &aboveOne)
1468 {
1469 	return As<SIMD::Float>(
1470 	    (negative & As<SIMD::Int>(lum + ((color - lum) * lum) / (lum - mincol))) |
1471 	    (~negative & ((aboveOne & As<SIMD::Int>(lum + ((color - lum) * (1.0f - lum)) / (maxcol - lum))) |
1472 	                  (~aboveOne & As<SIMD::Int>(color)))));
1473 }
1474 
setLum(SIMD::Float4 & cbase,SIMD::Float4 & clum,SIMD::Float & x,SIMD::Float & y,SIMD::Float & z)1475 void PixelRoutine::setLum(SIMD::Float4 &cbase, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
1476 {
1477 	SIMD::Float lbase = lumRGB(cbase);
1478 	SIMD::Float llum = lumRGB(clum);
1479 	SIMD::Float ldiff = llum - lbase;
1480 
1481 	SIMD::Float4 color;
1482 	color.x = cbase.x + ldiff;
1483 	color.y = cbase.y + ldiff;
1484 	color.z = cbase.z + ldiff;
1485 
1486 	SIMD::Float lum = lumRGB(color);
1487 	SIMD::Float mincol = minRGB(color);
1488 	SIMD::Float maxcol = maxRGB(color);
1489 
1490 	SIMD::Int negative = CmpLT(mincol, 0.0f);
1491 	SIMD::Int aboveOne = CmpGT(maxcol, 1.0f);
1492 
1493 	x = computeLum(color.x, lum, mincol, maxcol, negative, aboveOne);
1494 	y = computeLum(color.y, lum, mincol, maxcol, negative, aboveOne);
1495 	z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
1496 }
1497 
premultiply(SIMD::Float4 & c)1498 void PixelRoutine::premultiply(SIMD::Float4 &c)
1499 {
1500 	SIMD::Int nonZeroAlpha = CmpNEQ(c.w, 0.0f);
1501 	c.x = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.x / c.w));
1502 	c.y = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.y / c.w));
1503 	c.z = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.z / c.w));
1504 }
1505 
computeAdvancedBlendMode(int index,const SIMD::Float4 & src,const SIMD::Float4 & dst,const SIMD::Float4 & srcFactor,const SIMD::Float4 & dstFactor)1506 SIMD::Float4 PixelRoutine::computeAdvancedBlendMode(int index, const SIMD::Float4 &src, const SIMD::Float4 &dst, const SIMD::Float4 &srcFactor, const SIMD::Float4 &dstFactor)
1507 {
1508 	SIMD::Float4 srcColor = src;
1509 	srcColor.x *= srcFactor.x;
1510 	srcColor.y *= srcFactor.y;
1511 	srcColor.z *= srcFactor.z;
1512 	srcColor.w *= srcFactor.w;
1513 
1514 	SIMD::Float4 dstColor = dst;
1515 	dstColor.x *= dstFactor.x;
1516 	dstColor.y *= dstFactor.y;
1517 	dstColor.z *= dstFactor.z;
1518 	dstColor.w *= dstFactor.w;
1519 
1520 	premultiply(srcColor);
1521 	premultiply(dstColor);
1522 
1523 	SIMD::Float4 blendedColor;
1524 
1525 	switch(state.blendState[index].blendOperation)
1526 	{
1527 	case VK_BLEND_OP_MULTIPLY_EXT:
1528 		blendedColor.x = (srcColor.x * dstColor.x);
1529 		blendedColor.y = (srcColor.y * dstColor.y);
1530 		blendedColor.z = (srcColor.z * dstColor.z);
1531 		break;
1532 	case VK_BLEND_OP_SCREEN_EXT:
1533 		blendedColor.x = srcColor.x + dstColor.x - (srcColor.x * dstColor.x);
1534 		blendedColor.y = srcColor.y + dstColor.y - (srcColor.y * dstColor.y);
1535 		blendedColor.z = srcColor.z + dstColor.z - (srcColor.z * dstColor.z);
1536 		break;
1537 	case VK_BLEND_OP_OVERLAY_EXT:
1538 		blendedColor.x = blendOpOverlay(srcColor.x, dstColor.x);
1539 		blendedColor.y = blendOpOverlay(srcColor.y, dstColor.y);
1540 		blendedColor.z = blendOpOverlay(srcColor.z, dstColor.z);
1541 		break;
1542 	case VK_BLEND_OP_DARKEN_EXT:
1543 		blendedColor.x = Min(srcColor.x, dstColor.x);
1544 		blendedColor.y = Min(srcColor.y, dstColor.y);
1545 		blendedColor.z = Min(srcColor.z, dstColor.z);
1546 		break;
1547 	case VK_BLEND_OP_LIGHTEN_EXT:
1548 		blendedColor.x = Max(srcColor.x, dstColor.x);
1549 		blendedColor.y = Max(srcColor.y, dstColor.y);
1550 		blendedColor.z = Max(srcColor.z, dstColor.z);
1551 		break;
1552 	case VK_BLEND_OP_COLORDODGE_EXT:
1553 		blendedColor.x = blendOpColorDodge(srcColor.x, dstColor.x);
1554 		blendedColor.y = blendOpColorDodge(srcColor.y, dstColor.y);
1555 		blendedColor.z = blendOpColorDodge(srcColor.z, dstColor.z);
1556 		break;
1557 	case VK_BLEND_OP_COLORBURN_EXT:
1558 		blendedColor.x = blendOpColorBurn(srcColor.x, dstColor.x);
1559 		blendedColor.y = blendOpColorBurn(srcColor.y, dstColor.y);
1560 		blendedColor.z = blendOpColorBurn(srcColor.z, dstColor.z);
1561 		break;
1562 	case VK_BLEND_OP_HARDLIGHT_EXT:
1563 		blendedColor.x = blendOpHardlight(srcColor.x, dstColor.x);
1564 		blendedColor.y = blendOpHardlight(srcColor.y, dstColor.y);
1565 		blendedColor.z = blendOpHardlight(srcColor.z, dstColor.z);
1566 		break;
1567 	case VK_BLEND_OP_SOFTLIGHT_EXT:
1568 		blendedColor.x = blendOpSoftlight(srcColor.x, dstColor.x);
1569 		blendedColor.y = blendOpSoftlight(srcColor.y, dstColor.y);
1570 		blendedColor.z = blendOpSoftlight(srcColor.z, dstColor.z);
1571 		break;
1572 	case VK_BLEND_OP_DIFFERENCE_EXT:
1573 		blendedColor.x = Abs(srcColor.x - dstColor.x);
1574 		blendedColor.y = Abs(srcColor.y - dstColor.y);
1575 		blendedColor.z = Abs(srcColor.z - dstColor.z);
1576 		break;
1577 	case VK_BLEND_OP_EXCLUSION_EXT:
1578 		blendedColor.x = srcColor.x + dstColor.x - (srcColor.x * dstColor.x * 2.0f);
1579 		blendedColor.y = srcColor.y + dstColor.y - (srcColor.y * dstColor.y * 2.0f);
1580 		blendedColor.z = srcColor.z + dstColor.z - (srcColor.z * dstColor.z * 2.0f);
1581 		break;
1582 	case VK_BLEND_OP_HSL_HUE_EXT:
1583 		setLumSat(srcColor, dstColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
1584 		break;
1585 	case VK_BLEND_OP_HSL_SATURATION_EXT:
1586 		setLumSat(dstColor, srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
1587 		break;
1588 	case VK_BLEND_OP_HSL_COLOR_EXT:
1589 		setLum(srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
1590 		break;
1591 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1592 		setLum(dstColor, srcColor, blendedColor.x, blendedColor.y, blendedColor.z);
1593 		break;
1594 	default:
1595 		UNSUPPORTED("Unsupported advanced VkBlendOp: %d", int(state.blendState[index].blendOperation));
1596 		break;
1597 	}
1598 
1599 	SIMD::Float p = srcColor.w * dstColor.w;
1600 	blendedColor.x *= p;
1601 	blendedColor.y *= p;
1602 	blendedColor.z *= p;
1603 
1604 	p = srcColor.w * (1.0f - dstColor.w);
1605 	blendedColor.x += srcColor.x * p;
1606 	blendedColor.y += srcColor.y * p;
1607 	blendedColor.z += srcColor.z * p;
1608 
1609 	p = dstColor.w * (1.0f - srcColor.w);
1610 	blendedColor.x += dstColor.x * p;
1611 	blendedColor.y += dstColor.y * p;
1612 	blendedColor.z += dstColor.z * p;
1613 
1614 	return blendedColor;
1615 }
1616 
blendFactorCanExceedFormatRange(VkBlendFactor blendFactor,vk::Format format)1617 bool PixelRoutine::blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format)
1618 {
1619 	switch(blendFactor)
1620 	{
1621 	case VK_BLEND_FACTOR_ZERO:
1622 	case VK_BLEND_FACTOR_ONE:
1623 		return false;
1624 	case VK_BLEND_FACTOR_SRC_COLOR:
1625 	case VK_BLEND_FACTOR_SRC_ALPHA:
1626 		// Source values have been clamped after fragment shader execution if the attachment format is normalized.
1627 		return false;
1628 	case VK_BLEND_FACTOR_DST_COLOR:
1629 	case VK_BLEND_FACTOR_DST_ALPHA:
1630 		// Dest values have a valid range due to being read from the attachment.
1631 		return false;
1632 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1633 	case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1634 	case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1635 	case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1636 		// For signed formats, negative values cause the result to exceed 1.0.
1637 		return format.isSignedNormalized();
1638 	case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1639 		// min(As, 1 - Ad)
1640 		return false;
1641 	case VK_BLEND_FACTOR_CONSTANT_COLOR:
1642 	case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1643 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1644 	case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1645 		return false;
1646 
1647 	default:
1648 		UNSUPPORTED("VkBlendFactor: %d", int(blendFactor));
1649 		return false;
1650 	}
1651 }
1652 
alphaBlend(int index,const Pointer<Byte> & cBuffer,const SIMD::Float4 & sourceColor,const Int & x)1653 SIMD::Float4 PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x)
1654 {
1655 	if(!state.blendState[index].alphaBlendEnable)
1656 	{
1657 		return sourceColor;
1658 	}
1659 
1660 	vk::Format format = state.colorFormat[index];
1661 	ASSERT(format.supportsColorAttachmentBlend());
1662 
1663 	Pointer<Byte> buffer = cBuffer;
1664 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1665 
1666 	// texelColor holds four texel color values.
1667 	// Note: Despite the type being Vector4f, the colors may be stored as
1668 	// integers. Half-floats are stored as full 32-bit floats.
1669 	// Non-float and non-fixed point formats are not alpha blended.
1670 	Vector4f texelColor;
1671 
1672 	switch(format)
1673 	{
1674 	case VK_FORMAT_R32_SINT:
1675 	case VK_FORMAT_R32_UINT:
1676 	case VK_FORMAT_R32_SFLOAT:
1677 		// FIXME: movlps
1678 		buffer += 4 * x;
1679 		texelColor.x.x = *Pointer<Float>(buffer + 0);
1680 		texelColor.x.y = *Pointer<Float>(buffer + 4);
1681 		buffer += pitchB;
1682 		// FIXME: movhps
1683 		texelColor.x.z = *Pointer<Float>(buffer + 0);
1684 		texelColor.x.w = *Pointer<Float>(buffer + 4);
1685 		texelColor.y = texelColor.z = texelColor.w = 1.0f;
1686 		break;
1687 	case VK_FORMAT_R32G32_SINT:
1688 	case VK_FORMAT_R32G32_UINT:
1689 	case VK_FORMAT_R32G32_SFLOAT:
1690 		buffer += 8 * x;
1691 		texelColor.x = *Pointer<Float4>(buffer, 16);
1692 		buffer += pitchB;
1693 		texelColor.y = *Pointer<Float4>(buffer, 16);
1694 		texelColor.z = texelColor.x;
1695 		texelColor.x = ShuffleLowHigh(texelColor.x, texelColor.y, 0x0202);
1696 		texelColor.z = ShuffleLowHigh(texelColor.z, texelColor.y, 0x1313);
1697 		texelColor.y = texelColor.z;
1698 		texelColor.z = texelColor.w = 1.0f;
1699 		break;
1700 	case VK_FORMAT_R32G32B32A32_SFLOAT:
1701 	case VK_FORMAT_R32G32B32A32_SINT:
1702 	case VK_FORMAT_R32G32B32A32_UINT:
1703 		buffer += 16 * x;
1704 		texelColor.x = *Pointer<Float4>(buffer + 0, 16);
1705 		texelColor.y = *Pointer<Float4>(buffer + 16, 16);
1706 		buffer += pitchB;
1707 		texelColor.z = *Pointer<Float4>(buffer + 0, 16);
1708 		texelColor.w = *Pointer<Float4>(buffer + 16, 16);
1709 		transpose4x4(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
1710 		break;
1711 	case VK_FORMAT_R16_UNORM:
1712 		buffer += 2 * x;
1713 		texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
1714 		texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
1715 		buffer += pitchB;
1716 		texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
1717 		texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
1718 		texelColor.x *= (1.0f / 0xFFFF);
1719 		texelColor.y = texelColor.z = texelColor.w = 1.0f;
1720 		break;
1721 	case VK_FORMAT_R16_SFLOAT:
1722 		buffer += 2 * x;
1723 		texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
1724 		texelColor.x.y = Float(*Pointer<Half>(buffer + 2));
1725 		buffer += pitchB;
1726 		texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
1727 		texelColor.x.w = Float(*Pointer<Half>(buffer + 2));
1728 		texelColor.y = texelColor.z = texelColor.w = 1.0f;
1729 		break;
1730 	case VK_FORMAT_R16G16_UNORM:
1731 		buffer += 4 * x;
1732 		texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
1733 		texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
1734 		texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
1735 		texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
1736 		buffer += pitchB;
1737 		texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
1738 		texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
1739 		texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
1740 		texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
1741 		texelColor.x *= (1.0f / 0xFFFF);
1742 		texelColor.y *= (1.0f / 0xFFFF);
1743 		texelColor.z = texelColor.w = 1.0f;
1744 		break;
1745 	case VK_FORMAT_R16G16_SFLOAT:
1746 		buffer += 4 * x;
1747 		texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
1748 		texelColor.y.x = Float(*Pointer<Half>(buffer + 2));
1749 		texelColor.x.y = Float(*Pointer<Half>(buffer + 4));
1750 		texelColor.y.y = Float(*Pointer<Half>(buffer + 6));
1751 		buffer += pitchB;
1752 		texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
1753 		texelColor.y.z = Float(*Pointer<Half>(buffer + 2));
1754 		texelColor.x.w = Float(*Pointer<Half>(buffer + 4));
1755 		texelColor.y.w = Float(*Pointer<Half>(buffer + 6));
1756 		texelColor.z = texelColor.w = 1.0f;
1757 		break;
1758 	case VK_FORMAT_R16G16B16A16_UNORM:
1759 		buffer += 8 * x;
1760 		texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
1761 		texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
1762 		texelColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
1763 		texelColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
1764 		texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
1765 		texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
1766 		texelColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
1767 		texelColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
1768 		buffer += pitchB;
1769 		texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
1770 		texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
1771 		texelColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
1772 		texelColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
1773 		texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
1774 		texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
1775 		texelColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
1776 		texelColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
1777 		texelColor.x *= (1.0f / 0xFFFF);
1778 		texelColor.y *= (1.0f / 0xFFFF);
1779 		texelColor.z *= (1.0f / 0xFFFF);
1780 		texelColor.w *= (1.0f / 0xFFFF);
1781 		break;
1782 	case VK_FORMAT_R16G16B16A16_SFLOAT:
1783 		buffer += 8 * x;
1784 		texelColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
1785 		texelColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
1786 		texelColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
1787 		texelColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
1788 		texelColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
1789 		texelColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
1790 		texelColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
1791 		texelColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
1792 		buffer += pitchB;
1793 		texelColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
1794 		texelColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
1795 		texelColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
1796 		texelColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
1797 		texelColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
1798 		texelColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
1799 		texelColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
1800 		texelColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
1801 		break;
1802 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
1803 		buffer += 4 * x;
1804 		texelColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1805 		texelColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1806 		buffer += pitchB;
1807 		texelColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1808 		texelColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1809 		transpose4x3(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
1810 		texelColor.w = 1.0f;
1811 		break;
1812 	default:
1813 		{
1814 			// Attempt to read an integer based format and convert it to float
1815 			Vector4s color;
1816 			readPixel(index, cBuffer, x, color);
1817 			texelColor.x = Float4(As<UShort4>(color.x)) * (1.0f / 0xFFFF);
1818 			texelColor.y = Float4(As<UShort4>(color.y)) * (1.0f / 0xFFFF);
1819 			texelColor.z = Float4(As<UShort4>(color.z)) * (1.0f / 0xFFFF);
1820 			texelColor.w = Float4(As<UShort4>(color.w)) * (1.0f / 0xFFFF);
1821 
1822 			if(isSRGB(index))
1823 			{
1824 				texelColor.x = sRGBtoLinear(texelColor.x);
1825 				texelColor.y = sRGBtoLinear(texelColor.y);
1826 				texelColor.z = sRGBtoLinear(texelColor.z);
1827 			}
1828 		}
1829 		break;
1830 	}
1831 
1832 	ASSERT(SIMD::Width == 4);
1833 	SIMD::Float4 destColor;
1834 	destColor.x = texelColor.x;
1835 	destColor.y = texelColor.y;
1836 	destColor.z = texelColor.z;
1837 	destColor.w = texelColor.w;
1838 
1839 	SIMD::Float4 sourceFactor;
1840 	SIMD::Float4 destFactor;
1841 
1842 	blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
1843 	blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
1844 	blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
1845 	blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
1846 
1847 	SIMD::Float4 blendedColor;
1848 
1849 	switch(state.blendState[index].blendOperation)
1850 	{
1851 	case VK_BLEND_OP_ADD:
1852 		blendedColor.x = sourceColor.x * sourceFactor.x + destColor.x * destFactor.x;
1853 		blendedColor.y = sourceColor.y * sourceFactor.y + destColor.y * destFactor.y;
1854 		blendedColor.z = sourceColor.z * sourceFactor.z + destColor.z * destFactor.z;
1855 		break;
1856 	case VK_BLEND_OP_SUBTRACT:
1857 		blendedColor.x = sourceColor.x * sourceFactor.x - destColor.x * destFactor.x;
1858 		blendedColor.y = sourceColor.y * sourceFactor.y - destColor.y * destFactor.y;
1859 		blendedColor.z = sourceColor.z * sourceFactor.z - destColor.z * destFactor.z;
1860 		break;
1861 	case VK_BLEND_OP_REVERSE_SUBTRACT:
1862 		blendedColor.x = destColor.x * destFactor.x - sourceColor.x * sourceFactor.x;
1863 		blendedColor.y = destColor.y * destFactor.y - sourceColor.y * sourceFactor.y;
1864 		blendedColor.z = destColor.z * destFactor.z - sourceColor.z * sourceFactor.z;
1865 		break;
1866 	case VK_BLEND_OP_MIN:
1867 		blendedColor.x = Min(sourceColor.x, destColor.x);
1868 		blendedColor.y = Min(sourceColor.y, destColor.y);
1869 		blendedColor.z = Min(sourceColor.z, destColor.z);
1870 		break;
1871 	case VK_BLEND_OP_MAX:
1872 		blendedColor.x = Max(sourceColor.x, destColor.x);
1873 		blendedColor.y = Max(sourceColor.y, destColor.y);
1874 		blendedColor.z = Max(sourceColor.z, destColor.z);
1875 		break;
1876 	case VK_BLEND_OP_SRC_EXT:
1877 		blendedColor.x = sourceColor.x;
1878 		blendedColor.y = sourceColor.y;
1879 		blendedColor.z = sourceColor.z;
1880 		break;
1881 	case VK_BLEND_OP_DST_EXT:
1882 		blendedColor.x = destColor.x;
1883 		blendedColor.y = destColor.y;
1884 		blendedColor.z = destColor.z;
1885 		break;
1886 	case VK_BLEND_OP_ZERO_EXT:
1887 		blendedColor.x = 0.0f;
1888 		blendedColor.y = 0.0f;
1889 		blendedColor.z = 0.0f;
1890 		break;
1891 	case VK_BLEND_OP_MULTIPLY_EXT:
1892 	case VK_BLEND_OP_SCREEN_EXT:
1893 	case VK_BLEND_OP_OVERLAY_EXT:
1894 	case VK_BLEND_OP_DARKEN_EXT:
1895 	case VK_BLEND_OP_LIGHTEN_EXT:
1896 	case VK_BLEND_OP_COLORDODGE_EXT:
1897 	case VK_BLEND_OP_COLORBURN_EXT:
1898 	case VK_BLEND_OP_HARDLIGHT_EXT:
1899 	case VK_BLEND_OP_SOFTLIGHT_EXT:
1900 	case VK_BLEND_OP_DIFFERENCE_EXT:
1901 	case VK_BLEND_OP_EXCLUSION_EXT:
1902 	case VK_BLEND_OP_HSL_HUE_EXT:
1903 	case VK_BLEND_OP_HSL_SATURATION_EXT:
1904 	case VK_BLEND_OP_HSL_COLOR_EXT:
1905 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1906 		blendedColor = computeAdvancedBlendMode(index, sourceColor, destColor, sourceFactor, destFactor);
1907 		break;
1908 	default:
1909 		UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
1910 	}
1911 
1912 	switch(state.blendState[index].blendOperationAlpha)
1913 	{
1914 	case VK_BLEND_OP_ADD:
1915 		blendedColor.w = sourceColor.w * sourceFactor.w + destColor.w * destFactor.w;
1916 		break;
1917 	case VK_BLEND_OP_SUBTRACT:
1918 		blendedColor.w = sourceColor.w * sourceFactor.w - destColor.w * destFactor.w;
1919 		break;
1920 	case VK_BLEND_OP_REVERSE_SUBTRACT:
1921 		blendedColor.w = destColor.w * destFactor.w - sourceColor.w * sourceFactor.w;
1922 		break;
1923 	case VK_BLEND_OP_MIN:
1924 		blendedColor.w = Min(sourceColor.w, destColor.w);
1925 		break;
1926 	case VK_BLEND_OP_MAX:
1927 		blendedColor.w = Max(sourceColor.w, destColor.w);
1928 		break;
1929 	case VK_BLEND_OP_SRC_EXT:
1930 		blendedColor.w = sourceColor.w;
1931 		break;
1932 	case VK_BLEND_OP_DST_EXT:
1933 		blendedColor.w = destColor.w;
1934 		break;
1935 	case VK_BLEND_OP_ZERO_EXT:
1936 		blendedColor.w = 0.0f;
1937 		break;
1938 	case VK_BLEND_OP_MULTIPLY_EXT:
1939 	case VK_BLEND_OP_SCREEN_EXT:
1940 	case VK_BLEND_OP_OVERLAY_EXT:
1941 	case VK_BLEND_OP_DARKEN_EXT:
1942 	case VK_BLEND_OP_LIGHTEN_EXT:
1943 	case VK_BLEND_OP_COLORDODGE_EXT:
1944 	case VK_BLEND_OP_COLORBURN_EXT:
1945 	case VK_BLEND_OP_HARDLIGHT_EXT:
1946 	case VK_BLEND_OP_SOFTLIGHT_EXT:
1947 	case VK_BLEND_OP_DIFFERENCE_EXT:
1948 	case VK_BLEND_OP_EXCLUSION_EXT:
1949 	case VK_BLEND_OP_HSL_HUE_EXT:
1950 	case VK_BLEND_OP_HSL_SATURATION_EXT:
1951 	case VK_BLEND_OP_HSL_COLOR_EXT:
1952 	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1953 		// All of the currently supported 'advanced blend modes' compute the alpha the same way.
1954 		blendedColor.w = sourceColor.w + destColor.w - (sourceColor.w * destColor.w);
1955 		break;
1956 	default:
1957 		UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
1958 	}
1959 
1960 	return blendedColor;
1961 }
1962 
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & color,const Int & sMask,const Int & zMask,const Int & cMask)1963 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask)
1964 {
1965 	if(isSRGB(index))
1966 	{
1967 		color.x = linearToSRGB(color.x);
1968 		color.y = linearToSRGB(color.y);
1969 		color.z = linearToSRGB(color.z);
1970 	}
1971 
1972 	vk::Format format = state.colorFormat[index];
1973 	switch(format)
1974 	{
1975 	case VK_FORMAT_B8G8R8A8_UNORM:
1976 	case VK_FORMAT_B8G8R8A8_SRGB:
1977 	case VK_FORMAT_R8G8B8A8_UNORM:
1978 	case VK_FORMAT_R8G8B8A8_SRGB:
1979 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1980 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1981 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1982 		color.w = As<Float4>(RoundInt(color.w * 0xFF));
1983 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1984 		color.z = As<Float4>(RoundInt(color.z * 0xFF));
1985 		// [[fallthrough]]
1986 	case VK_FORMAT_R8G8_UNORM:
1987 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1988 		color.y = As<Float4>(RoundInt(color.y * 0xFF));
1989 		//[[fallthrough]]
1990 	case VK_FORMAT_R8_UNORM:
1991 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1992 		color.x = As<Float4>(RoundInt(color.x * 0xFF));
1993 		break;
1994 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1995 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1996 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1997 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1998 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
1999 		color.w = As<Float4>(RoundInt(color.w * 0xF));
2000 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2001 		color.z = As<Float4>(RoundInt(color.z * 0xF));
2002 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2003 		color.y = As<Float4>(RoundInt(color.y * 0xF));
2004 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2005 		color.x = As<Float4>(RoundInt(color.x * 0xF));
2006 		break;
2007 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
2008 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
2009 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2010 		color.z = As<Float4>(RoundInt(color.z * 0x1F));
2011 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2012 		color.y = As<Float4>(RoundInt(color.y * 0x3F));
2013 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2014 		color.x = As<Float4>(RoundInt(color.x * 0x1F));
2015 		break;
2016 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2017 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2018 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2019 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2020 		color.w = As<Float4>(RoundInt(color.w));
2021 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2022 		color.z = As<Float4>(RoundInt(color.z * 0x1F));
2023 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2024 		color.y = As<Float4>(RoundInt(color.y * 0x1F));
2025 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2026 		color.x = As<Float4>(RoundInt(color.x * 0x1F));
2027 		break;
2028 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2029 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2030 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2031 		color.w = As<Float4>(RoundInt(color.w * 0x3));
2032 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2033 		color.z = As<Float4>(RoundInt(color.z * 0x3FF));
2034 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2035 		color.y = As<Float4>(RoundInt(color.y * 0x3FF));
2036 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2037 		color.x = As<Float4>(RoundInt(color.x * 0x3FF));
2038 		break;
2039 	case VK_FORMAT_R16G16B16A16_UNORM:
2040 		color.w = Min(Max(color.w, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2041 		color.w = As<Float4>(RoundInt(color.w * 0xFFFF));
2042 		color.z = Min(Max(color.z, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2043 		color.z = As<Float4>(RoundInt(color.z * 0xFFFF));
2044 		// [[fallthrough]]
2045 	case VK_FORMAT_R16G16_UNORM:
2046 		color.y = Min(Max(color.y, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2047 		color.y = As<Float4>(RoundInt(color.y * 0xFFFF));
2048 		//[[fallthrough]]
2049 	case VK_FORMAT_R16_UNORM:
2050 		color.x = Min(Max(color.x, 0.0f), 1.0f);  // TODO(b/204560089): Omit clamp if redundant
2051 		color.x = As<Float4>(RoundInt(color.x * 0xFFFF));
2052 		break;
2053 	default:
2054 		// TODO(b/204560089): Omit clamp if redundant
2055 		if(format.isUnsignedNormalized())
2056 		{
2057 			color.x = Min(Max(color.x, 0.0f), 1.0f);
2058 			color.y = Min(Max(color.y, 0.0f), 1.0f);
2059 			color.z = Min(Max(color.z, 0.0f), 1.0f);
2060 			color.w = Min(Max(color.w, 0.0f), 1.0f);
2061 		}
2062 		else if(format.isSignedNormalized())
2063 		{
2064 			color.x = Min(Max(color.x, -1.0f), 1.0f);
2065 			color.y = Min(Max(color.y, -1.0f), 1.0f);
2066 			color.z = Min(Max(color.z, -1.0f), 1.0f);
2067 			color.w = Min(Max(color.w, -1.0f), 1.0f);
2068 		}
2069 	}
2070 
2071 	switch(format)
2072 	{
2073 	case VK_FORMAT_R16_SFLOAT:
2074 	case VK_FORMAT_R32_SFLOAT:
2075 	case VK_FORMAT_R32_SINT:
2076 	case VK_FORMAT_R32_UINT:
2077 	case VK_FORMAT_R16_UNORM:
2078 	case VK_FORMAT_R16_SINT:
2079 	case VK_FORMAT_R16_UINT:
2080 	case VK_FORMAT_R8_SINT:
2081 	case VK_FORMAT_R8_UINT:
2082 	case VK_FORMAT_R8_UNORM:
2083 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2084 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2085 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2086 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2087 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2088 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2089 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2090 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2091 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
2092 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2093 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2094 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2095 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
2096 		break;
2097 	case VK_FORMAT_R16G16_SFLOAT:
2098 	case VK_FORMAT_R32G32_SFLOAT:
2099 	case VK_FORMAT_R32G32_SINT:
2100 	case VK_FORMAT_R32G32_UINT:
2101 	case VK_FORMAT_R16G16_UNORM:
2102 	case VK_FORMAT_R16G16_SINT:
2103 	case VK_FORMAT_R16G16_UINT:
2104 	case VK_FORMAT_R8G8_SINT:
2105 	case VK_FORMAT_R8G8_UINT:
2106 	case VK_FORMAT_R8G8_UNORM:
2107 		color.z = color.x;
2108 		color.x = UnpackLow(color.x, color.y);
2109 		color.z = UnpackHigh(color.z, color.y);
2110 		color.y = color.z;
2111 		break;
2112 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2113 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2114 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2115 	case VK_FORMAT_R32G32B32A32_SINT:
2116 	case VK_FORMAT_R32G32B32A32_UINT:
2117 	case VK_FORMAT_R16G16B16A16_UNORM:
2118 	case VK_FORMAT_R16G16B16A16_SINT:
2119 	case VK_FORMAT_R16G16B16A16_UINT:
2120 	case VK_FORMAT_R8G8B8A8_SINT:
2121 	case VK_FORMAT_R8G8B8A8_UINT:
2122 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2123 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2124 	case VK_FORMAT_R8G8B8A8_UNORM:
2125 	case VK_FORMAT_R8G8B8A8_SRGB:
2126 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
2127 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
2128 		transpose4x4(color.x, color.y, color.z, color.w);
2129 		break;
2130 	case VK_FORMAT_B8G8R8A8_UNORM:
2131 	case VK_FORMAT_B8G8R8A8_SRGB:
2132 		transpose4x4zyxw(color.z, color.y, color.x, color.w);
2133 		break;
2134 	default:
2135 		UNSUPPORTED("VkFormat: %d", int(format));
2136 	}
2137 
2138 	int writeMask = state.colorWriteActive(index);
2139 	if(format.isBGRformat())
2140 	{
2141 		// For BGR formats, flip R and B channels in the channels mask
2142 		writeMask = (writeMask & 0x0000000A) | (writeMask & 0x00000001) << 2 | (writeMask & 0x00000004) >> 2;
2143 	}
2144 
2145 	Int xMask;  // Combination of all masks
2146 
2147 	if(state.depthTestActive)
2148 	{
2149 		xMask = zMask;
2150 	}
2151 	else
2152 	{
2153 		xMask = cMask;
2154 	}
2155 
2156 	if(state.stencilActive)
2157 	{
2158 		xMask &= sMask;
2159 	}
2160 
2161 	Pointer<Byte> buffer = cBuffer;
2162 	Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2163 	Float4 value;
2164 
2165 	switch(format)
2166 	{
2167 	case VK_FORMAT_R32_SFLOAT:
2168 	case VK_FORMAT_R32_SINT:
2169 	case VK_FORMAT_R32_UINT:
2170 		if(writeMask & 0x00000001)
2171 		{
2172 			buffer += 4 * x;
2173 
2174 			// FIXME: movlps
2175 			value.x = *Pointer<Float>(buffer + 0);
2176 			value.y = *Pointer<Float>(buffer + 4);
2177 
2178 			buffer += pitchB;
2179 
2180 			// FIXME: movhps
2181 			value.z = *Pointer<Float>(buffer + 0);
2182 			value.w = *Pointer<Float>(buffer + 4);
2183 
2184 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2185 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2186 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2187 
2188 			// FIXME: movhps
2189 			*Pointer<Float>(buffer + 0) = color.x.z;
2190 			*Pointer<Float>(buffer + 4) = color.x.w;
2191 
2192 			buffer -= pitchB;
2193 
2194 			// FIXME: movlps
2195 			*Pointer<Float>(buffer + 0) = color.x.x;
2196 			*Pointer<Float>(buffer + 4) = color.x.y;
2197 		}
2198 		break;
2199 	case VK_FORMAT_R16_SFLOAT:
2200 		if(writeMask & 0x00000001)
2201 		{
2202 			buffer += 2 * x;
2203 
2204 			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2205 			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2206 
2207 			buffer += pitchB;
2208 
2209 			value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2210 			value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2211 
2212 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2213 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2214 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2215 
2216 			*Pointer<Half>(buffer + 0) = Half(color.x.z);
2217 			*Pointer<Half>(buffer + 2) = Half(color.x.w);
2218 
2219 			buffer -= pitchB;
2220 
2221 			*Pointer<Half>(buffer + 0) = Half(color.x.x);
2222 			*Pointer<Half>(buffer + 2) = Half(color.x.y);
2223 		}
2224 		break;
2225 	case VK_FORMAT_R16_UNORM:
2226 	case VK_FORMAT_R16_SINT:
2227 	case VK_FORMAT_R16_UINT:
2228 		if(writeMask & 0x00000001)
2229 		{
2230 			buffer += 2 * x;
2231 
2232 			UShort4 xyzw;
2233 			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2234 
2235 			buffer += pitchB;
2236 
2237 			xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2238 			value = As<Float4>(Int4(xyzw));
2239 
2240 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2241 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2242 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2243 
2244 			Float component = color.x.z;
2245 			*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2246 			component = color.x.w;
2247 			*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2248 
2249 			buffer -= pitchB;
2250 
2251 			component = color.x.x;
2252 			*Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2253 			component = color.x.y;
2254 			*Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2255 		}
2256 		break;
2257 	case VK_FORMAT_R8_SINT:
2258 	case VK_FORMAT_R8_UINT:
2259 	case VK_FORMAT_R8_UNORM:
2260 		if(writeMask & 0x00000001)
2261 		{
2262 			buffer += x;
2263 
2264 			UInt xyzw, packedCol;
2265 
2266 			xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFFu;
2267 			buffer += pitchB;
2268 			xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2269 
2270 			Short4 tmpCol = Short4(As<Int4>(color.x));
2271 			if(format == VK_FORMAT_R8_SINT)
2272 			{
2273 				tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2274 			}
2275 			else
2276 			{
2277 				tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2278 			}
2279 			packedCol = Extract(As<Int2>(tmpCol), 0);
2280 
2281 			packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2282 			            (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2283 
2284 			*Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2285 			buffer -= pitchB;
2286 			*Pointer<UShort>(buffer) = UShort(packedCol);
2287 		}
2288 		break;
2289 	case VK_FORMAT_R32G32_SFLOAT:
2290 	case VK_FORMAT_R32G32_SINT:
2291 	case VK_FORMAT_R32G32_UINT:
2292 		buffer += 8 * x;
2293 
2294 		value = *Pointer<Float4>(buffer);
2295 
2296 		if((writeMask & 0x00000003) != 0x00000003)
2297 		{
2298 			Float4 masked = value;
2299 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[writeMask & 0x3][0])));
2300 			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~writeMask & 0x3][0])));
2301 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2302 		}
2303 
2304 		color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2305 		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2306 		color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2307 		*Pointer<Float4>(buffer) = color.x;
2308 
2309 		buffer += pitchB;
2310 
2311 		value = *Pointer<Float4>(buffer);
2312 
2313 		if((writeMask & 0x00000003) != 0x00000003)
2314 		{
2315 			Float4 masked;
2316 
2317 			masked = value;
2318 			color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[writeMask & 0x3][0])));
2319 			masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~writeMask & 0x3][0])));
2320 			color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2321 		}
2322 
2323 		color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2324 		value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2325 		color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2326 		*Pointer<Float4>(buffer) = color.y;
2327 		break;
2328 	case VK_FORMAT_R16G16_SFLOAT:
2329 		if((writeMask & 0x00000003) != 0x0)
2330 		{
2331 			buffer += 4 * x;
2332 
2333 			UInt2 rgbaMask;
2334 			UInt2 packedCol;
2335 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2336 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2337 
2338 			UShort4 value = *Pointer<UShort4>(buffer);
2339 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2340 			if((writeMask & 0x3) != 0x3)
2341 			{
2342 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[writeMask & 0x3]));
2343 				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2344 				mergedMask &= rgbaMask;
2345 			}
2346 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2347 
2348 			buffer += pitchB;
2349 
2350 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 0);
2351 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 1);
2352 			value = *Pointer<UShort4>(buffer);
2353 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2354 			if((writeMask & 0x3) != 0x3)
2355 			{
2356 				mergedMask &= rgbaMask;
2357 			}
2358 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2359 		}
2360 		break;
2361 	case VK_FORMAT_R16G16_UNORM:
2362 	case VK_FORMAT_R16G16_SINT:
2363 	case VK_FORMAT_R16G16_UINT:
2364 		if((writeMask & 0x00000003) != 0x0)
2365 		{
2366 			buffer += 4 * x;
2367 
2368 			UInt2 rgbaMask;
2369 			UShort4 packedCol = UShort4(As<Int4>(color.x));
2370 			UShort4 value = *Pointer<UShort4>(buffer);
2371 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2372 			if((writeMask & 0x3) != 0x3)
2373 			{
2374 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[writeMask & 0x3]));
2375 				rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2376 				mergedMask &= rgbaMask;
2377 			}
2378 			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2379 
2380 			buffer += pitchB;
2381 
2382 			packedCol = UShort4(As<Int4>(color.y));
2383 			value = *Pointer<UShort4>(buffer);
2384 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2385 			if((writeMask & 0x3) != 0x3)
2386 			{
2387 				mergedMask &= rgbaMask;
2388 			}
2389 			*Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2390 		}
2391 		break;
2392 	case VK_FORMAT_R8G8_SINT:
2393 	case VK_FORMAT_R8G8_UINT:
2394 	case VK_FORMAT_R8G8_UNORM:
2395 		if((writeMask & 0x00000003) != 0x0)
2396 		{
2397 			buffer += 2 * x;
2398 
2399 			Int2 xyzw, packedCol;
2400 
2401 			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2402 			buffer += pitchB;
2403 			xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2404 
2405 			if(format == VK_FORMAT_R8G8_SINT)
2406 			{
2407 				packedCol = As<Int2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2408 			}
2409 			else
2410 			{
2411 				packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2412 			}
2413 
2414 			UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2415 			if((writeMask & 0x3) != 0x3)
2416 			{
2417 				Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (writeMask & 0x3)]));
2418 				UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2419 				mergedMask &= rgbaMask;
2420 			}
2421 
2422 			packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2423 
2424 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2425 			buffer -= pitchB;
2426 			*Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2427 		}
2428 		break;
2429 	case VK_FORMAT_R32G32B32A32_SFLOAT:
2430 	case VK_FORMAT_R32G32B32A32_SINT:
2431 	case VK_FORMAT_R32G32B32A32_UINT:
2432 		buffer += 16 * x;
2433 
2434 		{
2435 			value = *Pointer<Float4>(buffer, 16);
2436 
2437 			if(writeMask != 0x0000000F)
2438 			{
2439 				Float4 masked = value;
2440 				color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2441 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2442 				color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2443 			}
2444 
2445 			color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2446 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2447 			color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2448 			*Pointer<Float4>(buffer, 16) = color.x;
2449 		}
2450 
2451 		{
2452 			value = *Pointer<Float4>(buffer + 16, 16);
2453 
2454 			if(writeMask != 0x0000000F)
2455 			{
2456 				Float4 masked = value;
2457 				color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2458 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2459 				color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2460 			}
2461 
2462 			color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2463 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2464 			color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2465 			*Pointer<Float4>(buffer + 16, 16) = color.y;
2466 		}
2467 
2468 		buffer += pitchB;
2469 
2470 		{
2471 			value = *Pointer<Float4>(buffer, 16);
2472 
2473 			if(writeMask != 0x0000000F)
2474 			{
2475 				Float4 masked = value;
2476 				color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2477 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2478 				color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(masked));
2479 			}
2480 
2481 			color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2482 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2483 			color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(value));
2484 			*Pointer<Float4>(buffer, 16) = color.z;
2485 		}
2486 
2487 		{
2488 			value = *Pointer<Float4>(buffer + 16, 16);
2489 
2490 			if(writeMask != 0x0000000F)
2491 			{
2492 				Float4 masked = value;
2493 				color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2494 				masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2495 				color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(masked));
2496 			}
2497 
2498 			color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2499 			value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2500 			color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(value));
2501 			*Pointer<Float4>(buffer + 16, 16) = color.w;
2502 		}
2503 		break;
2504 	case VK_FORMAT_R16G16B16A16_SFLOAT:
2505 		if((writeMask & 0x0000000F) != 0x0)
2506 		{
2507 			buffer += 8 * x;
2508 
2509 			UInt4 rgbaMask;
2510 			UInt4 value = *Pointer<UInt4>(buffer);
2511 			UInt4 packedCol;
2512 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2513 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2514 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 2);
2515 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 3);
2516 			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2517 			if((writeMask & 0xF) != 0xF)
2518 			{
2519 				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[writeMask]));
2520 				rgbaMask = UInt4(tmpMask, tmpMask);
2521 				mergedMask &= rgbaMask;
2522 			}
2523 			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2524 
2525 			buffer += pitchB;
2526 
2527 			value = *Pointer<UInt4>(buffer);
2528 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.y))) << 16) | UInt(As<UShort>(Half(color.z.x))), 0);
2529 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.w))) << 16) | UInt(As<UShort>(Half(color.z.z))), 1);
2530 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.y))) << 16) | UInt(As<UShort>(Half(color.w.x))), 2);
2531 			packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.w))) << 16) | UInt(As<UShort>(Half(color.w.z))), 3);
2532 			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2533 			if((writeMask & 0xF) != 0xF)
2534 			{
2535 				mergedMask &= rgbaMask;
2536 			}
2537 			*Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2538 		}
2539 		break;
2540 	case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2541 		if((writeMask & 0x7) != 0x0)
2542 		{
2543 			buffer += 4 * x;
2544 
2545 			UInt4 packedCol;
2546 			packedCol = Insert(packedCol, r11g11b10Pack(color.x), 0);
2547 			packedCol = Insert(packedCol, r11g11b10Pack(color.y), 1);
2548 			packedCol = Insert(packedCol, r11g11b10Pack(color.z), 2);
2549 			packedCol = Insert(packedCol, r11g11b10Pack(color.w), 3);
2550 
2551 			UInt4 value;
2552 			value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
2553 			value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
2554 			buffer += pitchB;
2555 			value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
2556 			value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
2557 
2558 			UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0]) + xMask * 16, 16);
2559 			if((writeMask & 0x7) != 0x7)
2560 			{
2561 				mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[writeMask & 0x7]), 16);
2562 			}
2563 			value = (packedCol & mask) | (value & ~mask);
2564 
2565 			*Pointer<UInt>(buffer + 0) = value.z;
2566 			*Pointer<UInt>(buffer + 4) = value.w;
2567 			buffer -= pitchB;
2568 			*Pointer<UInt>(buffer + 0) = value.x;
2569 			*Pointer<UInt>(buffer + 4) = value.y;
2570 		}
2571 		break;
2572 	case VK_FORMAT_R16G16B16A16_UNORM:
2573 	case VK_FORMAT_R16G16B16A16_SINT:
2574 	case VK_FORMAT_R16G16B16A16_UINT:
2575 		if((writeMask & 0x0000000F) != 0x0)
2576 		{
2577 			buffer += 8 * x;
2578 
2579 			UInt4 rgbaMask;
2580 			UShort8 value = *Pointer<UShort8>(buffer);
2581 			UShort8 packedCol = UShort8(UShort4(As<Int4>(color.x)), UShort4(As<Int4>(color.y)));
2582 			UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2583 			if((writeMask & 0xF) != 0xF)
2584 			{
2585 				UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[writeMask]));
2586 				rgbaMask = UInt4(tmpMask, tmpMask);
2587 				mergedMask &= rgbaMask;
2588 			}
2589 			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2590 
2591 			buffer += pitchB;
2592 
2593 			value = *Pointer<UShort8>(buffer);
2594 			packedCol = UShort8(UShort4(As<Int4>(color.z)), UShort4(As<Int4>(color.w)));
2595 			mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2596 			if((writeMask & 0xF) != 0xF)
2597 			{
2598 				mergedMask &= rgbaMask;
2599 			}
2600 			*Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2601 		}
2602 		break;
2603 	case VK_FORMAT_B8G8R8A8_UNORM:
2604 	case VK_FORMAT_B8G8R8A8_SRGB:
2605 	case VK_FORMAT_R8G8B8A8_SINT:
2606 	case VK_FORMAT_R8G8B8A8_UINT:
2607 	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2608 	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2609 	case VK_FORMAT_R8G8B8A8_UNORM:
2610 	case VK_FORMAT_R8G8B8A8_SRGB:
2611 	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
2612 	case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
2613 		if((writeMask & 0x0000000F) != 0x0)
2614 		{
2615 			UInt2 value, packedCol, mergedMask;
2616 
2617 			buffer += 4 * x;
2618 
2619 			bool isSigned = !format.isUnsigned();
2620 
2621 			if(isSigned)
2622 			{
2623 				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2624 			}
2625 			else
2626 			{
2627 				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2628 			}
2629 			value = *Pointer<UInt2>(buffer, 16);
2630 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2631 			if(writeMask != 0xF)
2632 			{
2633 				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[writeMask]));
2634 			}
2635 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2636 
2637 			buffer += pitchB;
2638 
2639 			if(isSigned)
2640 			{
2641 				packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
2642 			}
2643 			else
2644 			{
2645 				packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
2646 			}
2647 			value = *Pointer<UInt2>(buffer, 16);
2648 			mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2649 			if(writeMask != 0xF)
2650 			{
2651 				mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[writeMask]));
2652 			}
2653 			*Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2654 		}
2655 		break;
2656 	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2657 	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2658 		if((writeMask & 0x0000000F) != 0x0)
2659 		{
2660 			Int2 mergedMask, packedCol, value;
2661 			Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
2662 			              ((As<Int4>(color.z) & Int4(0x3ff)) << 20) |
2663 			              ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
2664 			              ((As<Int4>(color.x) & Int4(0x3ff)));
2665 
2666 			buffer += 4 * x;
2667 			value = *Pointer<Int2>(buffer, 16);
2668 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2669 			if(writeMask != 0xF)
2670 			{
2671 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2672 			}
2673 			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2674 
2675 			buffer += pitchB;
2676 
2677 			value = *Pointer<Int2>(buffer, 16);
2678 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2679 			if(writeMask != 0xF)
2680 			{
2681 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2682 			}
2683 			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2684 		}
2685 		break;
2686 	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2687 	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2688 		if((writeMask & 0x0000000F) != 0x0)
2689 		{
2690 			Int2 mergedMask, packedCol, value;
2691 			Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
2692 			              ((As<Int4>(color.x) & Int4(0x3ff)) << 20) |
2693 			              ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
2694 			              ((As<Int4>(color.z) & Int4(0x3ff)));
2695 
2696 			buffer += 4 * x;
2697 			value = *Pointer<Int2>(buffer, 16);
2698 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2699 			if(writeMask != 0xF)
2700 			{
2701 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2702 			}
2703 			*Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2704 
2705 			buffer += pitchB;
2706 
2707 			value = *Pointer<Int2>(buffer, 16);
2708 			mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2709 			if(writeMask != 0xF)
2710 			{
2711 				mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2712 			}
2713 			*Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2714 		}
2715 		break;
2716 	case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2717 	case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2718 	case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2719 	case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2720 		{
2721 			buffer += 2 * x;
2722 			Int value = *Pointer<Int>(buffer);
2723 
2724 			Int channelMask;
2725 			Short4 current;
2726 			switch(format)
2727 			{
2728 			case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2729 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
2730 				current = (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 12 |
2731 				          (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 8 |
2732 				          (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 4 |
2733 				          (UShort4(As<Int4>(color.w)) & UShort4(0xF));
2734 				break;
2735 			case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2736 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
2737 				current = (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 12 |
2738 				          (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 8 |
2739 				          (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 4 |
2740 				          (UShort4(As<Int4>(color.w)) & UShort4(0xF));
2741 				break;
2742 			case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2743 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
2744 				current = (UShort4(As<Int4>(color.w)) & UShort4(0xF)) << 12 |
2745 				          (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 8 |
2746 				          (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 4 |
2747 				          (UShort4(As<Int4>(color.z)) & UShort4(0xF));
2748 				break;
2749 			case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2750 				channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
2751 				current = (UShort4(As<Int4>(color.w)) & UShort4(0xF)) << 12 |
2752 				          (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 8 |
2753 				          (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 4 |
2754 				          (UShort4(As<Int4>(color.x)) & UShort4(0xF));
2755 				break;
2756 			default:
2757 				UNREACHABLE("Format: %s", vk::Stringify(format).c_str());
2758 			}
2759 
2760 			Int c01 = Extract(As<Int2>(current), 0);
2761 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2762 			if(writeMask != 0x0000000F)
2763 			{
2764 				mask01 &= channelMask;
2765 			}
2766 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2767 
2768 			buffer += pitchB;
2769 			value = *Pointer<Int>(buffer);
2770 
2771 			Int c23 = Extract(As<Int2>(current), 1);
2772 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2773 			if(writeMask != 0x0000000F)
2774 			{
2775 				mask23 &= channelMask;
2776 			}
2777 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2778 		}
2779 		break;
2780 	case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2781 		{
2782 			buffer += 2 * x;
2783 			Int value = *Pointer<Int>(buffer);
2784 
2785 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[writeMask][0]));
2786 			Short4 current = (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 11 |
2787 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 6 |
2788 			                 (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 1 |
2789 			                 (UShort4(As<Int4>(color.w)) & UShort4(0x1));
2790 
2791 			Int c01 = Extract(As<Int2>(current), 0);
2792 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2793 			if(writeMask != 0x0000000F)
2794 			{
2795 				mask01 &= channelMask;
2796 			}
2797 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2798 
2799 			buffer += pitchB;
2800 			value = *Pointer<Int>(buffer);
2801 
2802 			Int c23 = Extract(As<Int2>(current), 1);
2803 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2804 			if(writeMask != 0x0000000F)
2805 			{
2806 				mask23 &= channelMask;
2807 			}
2808 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2809 		}
2810 		break;
2811 	case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2812 		{
2813 			buffer += 2 * x;
2814 			Int value = *Pointer<Int>(buffer);
2815 
2816 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[writeMask][0]));
2817 			Short4 current = (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 11 |
2818 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 6 |
2819 			                 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 1 |
2820 			                 (UShort4(As<Int4>(color.w)) & UShort4(0x1));
2821 
2822 			Int c01 = Extract(As<Int2>(current), 0);
2823 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2824 			if(writeMask != 0x0000000F)
2825 			{
2826 				mask01 &= channelMask;
2827 			}
2828 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2829 
2830 			buffer += pitchB;
2831 			value = *Pointer<Int>(buffer);
2832 
2833 			Int c23 = Extract(As<Int2>(current), 1);
2834 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2835 			if(writeMask != 0x0000000F)
2836 			{
2837 				mask23 &= channelMask;
2838 			}
2839 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2840 		}
2841 		break;
2842 	case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2843 		{
2844 			buffer += 2 * x;
2845 			Int value = *Pointer<Int>(buffer);
2846 
2847 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[writeMask][0]));
2848 			Short4 current = (UShort4(As<Int4>(color.w)) & UShort4(0x1)) << 15 |
2849 			                 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 10 |
2850 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 5 |
2851 			                 (UShort4(As<Int4>(color.z)) & UShort4(0x1F));
2852 
2853 			Int c01 = Extract(As<Int2>(current), 0);
2854 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2855 			if(writeMask != 0x0000000F)
2856 			{
2857 				mask01 &= channelMask;
2858 			}
2859 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2860 
2861 			buffer += pitchB;
2862 			value = *Pointer<Int>(buffer);
2863 
2864 			Int c23 = Extract(As<Int2>(current), 1);
2865 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2866 			if(writeMask != 0x0000000F)
2867 			{
2868 				mask23 &= channelMask;
2869 			}
2870 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2871 		}
2872 		break;
2873 	case VK_FORMAT_R5G6B5_UNORM_PACK16:
2874 		{
2875 			buffer += 2 * x;
2876 			Int value = *Pointer<Int>(buffer);
2877 
2878 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));
2879 			Short4 current = (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) |
2880 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x3F)) << 5 |
2881 			                 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 11;
2882 
2883 			Int c01 = Extract(As<Int2>(current), 0);
2884 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2885 			if((writeMask & 0x00000007) != 0x00000007)
2886 			{
2887 				mask01 &= channelMask;
2888 			}
2889 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2890 
2891 			buffer += pitchB;
2892 			value = *Pointer<Int>(buffer);
2893 
2894 			Int c23 = Extract(As<Int2>(current), 1);
2895 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2896 			if((writeMask & 0x00000007) != 0x00000007)
2897 			{
2898 				mask23 &= channelMask;
2899 			}
2900 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2901 		}
2902 		break;
2903 	case VK_FORMAT_B5G6R5_UNORM_PACK16:
2904 		{
2905 			buffer += 2 * x;
2906 			Int value = *Pointer<Int>(buffer);
2907 
2908 			Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));
2909 			Short4 current = (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) |
2910 			                 (UShort4(As<Int4>(color.y)) & UShort4(0x3F)) << 5 |
2911 			                 (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 11;
2912 
2913 			Int c01 = Extract(As<Int2>(current), 0);
2914 			Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2915 			if((writeMask & 0x00000007) != 0x00000007)
2916 			{
2917 				mask01 &= channelMask;
2918 			}
2919 			*Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2920 
2921 			buffer += pitchB;
2922 			value = *Pointer<Int>(buffer);
2923 
2924 			Int c23 = Extract(As<Int2>(current), 1);
2925 			Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2926 			if((writeMask & 0x00000007) != 0x00000007)
2927 			{
2928 				mask23 &= channelMask;
2929 			}
2930 			*Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2931 		}
2932 		break;
2933 	default:
2934 		UNSUPPORTED("VkFormat: %d", int(format));
2935 	}
2936 }
2937 
2938 }  // namespace sw
2939