1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "PixelRoutine.hpp"
16
17 #include "Constants.hpp"
18 #include "SamplerCore.hpp"
19 #include "Device/Primitive.hpp"
20 #include "Device/QuadRasterizer.hpp"
21 #include "Device/Renderer.hpp"
22 #include "System/Debug.hpp"
23 #include "System/Math.hpp"
24 #include "Vulkan/VkPipelineLayout.hpp"
25 #include "Vulkan/VkStringify.hpp"
26
27 namespace sw {
28 namespace {
29
shouldUsePerSampleShading(const PixelProcessor::State & state,const SpirvShader * spirvShader)30 bool shouldUsePerSampleShading(const PixelProcessor::State &state, const SpirvShader *spirvShader)
31 {
32 if(state.sampleShadingEnabled && (state.minSampleShading * state.multiSampleCount > 1.0f))
33 {
34 return true;
35 }
36
37 if(spirvShader)
38 {
39 if(spirvShader->getUsedCapabilities().InterpolationFunction) // TODO(b/194714095)
40 {
41 return true;
42 }
43
44 if(spirvShader->getUsedCapabilities().SampleRateShading)
45 {
46 return true;
47 }
48 }
49
50 return false;
51 }
52
53 } // namespace
54
PixelRoutine(const PixelProcessor::State & state,const vk::PipelineLayout * pipelineLayout,const SpirvShader * spirvShader,const vk::Attachments & attachments,const vk::DescriptorSet::Bindings & descriptorSets)55 PixelRoutine::PixelRoutine(
56 const PixelProcessor::State &state,
57 const vk::PipelineLayout *pipelineLayout,
58 const SpirvShader *spirvShader,
59 const vk::Attachments &attachments,
60 const vk::DescriptorSet::Bindings &descriptorSets)
61 : QuadRasterizer(state, spirvShader)
62 , routine(pipelineLayout)
63 , attachments(attachments)
64 , descriptorSets(descriptorSets)
65 , shaderContainsInterpolation(spirvShader && spirvShader->getUsedCapabilities().InterpolationFunction)
66 , perSampleShading(shouldUsePerSampleShading(state, spirvShader))
67 , invocationCount(perSampleShading ? state.multiSampleCount : 1)
68 {
69 if(spirvShader)
70 {
71 spirvShader->emitProlog(&routine);
72 }
73 }
74
~PixelRoutine()75 PixelRoutine::~PixelRoutine()
76 {
77 }
78
getSampleSet(int invocation) const79 PixelRoutine::SampleSet PixelRoutine::getSampleSet(int invocation) const
80 {
81 unsigned int sampleBegin = perSampleShading ? invocation : 0;
82 unsigned int sampleEnd = perSampleShading ? (invocation + 1) : state.multiSampleCount;
83
84 SampleSet samples;
85
86 for(unsigned int q = sampleBegin; q < sampleEnd; q++)
87 {
88 if(state.multiSampleMask & (1 << q))
89 {
90 samples.push_back(q);
91 }
92 }
93
94 return samples;
95 }
96
// Emits the per-quad pixel pipeline. For every shader invocation it runs the
// stencil test, interpolates depth, performs the depth and depth bounds tests
// (before or after the shader, depending on earlyFragmentTests), interpolates
// the shader inputs, executes the shader, applies the alpha test and finally
// blends the color output.
//
// cBuffer/zBuffer/sBuffer: color, depth and stencil buffer pointers.
// cMask: per-sample coverage masks; updated in place (e.g. by clipping).
// x, y: quad position, forwarded to the tests and to setBuiltins().
quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS],Pointer<Byte> & zBuffer,Pointer<Byte> & sBuffer,Int cMask[4],Int & x,Int & y)97 void PixelRoutine::quad(Pointer<Byte> cBuffer[MAX_COLOR_BUFFERS], Pointer<Byte> &zBuffer, Pointer<Byte> &sBuffer, Int cMask[4], Int &x, Int &y)
98 {
// Depth/stencil tests run before the shader when there is no shader, or when
// the shader declares the EarlyFragmentTests execution mode.
99 const bool earlyFragmentTests = !spirvShader || spirvShader->getExecutionModes().EarlyFragmentTests;
100
101 Int zMask[4]; // Depth mask
102 Int sMask[4]; // Stencil mask
// Depth before clampDepth() is applied; passed to setBuiltins() below.
103 SIMD::Float unclampedZ[4];
104
// One iteration per shader invocation, each handling the samples returned by
// getSampleSet().
105 for(int invocation = 0; invocation < invocationCount; invocation++)
106 {
107 SampleSet samples = getSampleSet(invocation);
108
// No enabled samples for this invocation: nothing to generate.
109 if(samples.empty())
110 {
111 continue;
112 }
113
// The depth and stencil masks start out equal to the coverage mask.
114 for(unsigned int q : samples)
115 {
116 zMask[q] = cMask[q];
117 sMask[q] = cMask[q];
118 }
119
120 stencilTest(sBuffer, x, sMask, samples);
121
122 SIMD::Float rhwCentroid;
123
124 // Compute the x coordinate of each fragment in the SIMD group.
125 const auto xMorton = SIMD::Float([](int i) { return float(compactEvenBits(i)); }); // 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, ...
126 xFragment = SIMD::Float(Float(x)) + xMorton - SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, x0)));
127
// Interpolate depth per sample. With multisampling the x coordinate is offset
// by the sample's SampleLocationsX entry, and the depth bias is added when
// enabled. The value before clamping is remembered in unclampedZ.
128 if(interpolateZ())
129 {
130 for(unsigned int q : samples)
131 {
132 SIMD::Float x = xFragment;
133
134 if(state.enableMultiSampling)
135 {
136 x -= SIMD::Float(*Pointer<Float>(constants + OFFSET(Constants, SampleLocationsX) + q * sizeof(float)));
137 }
138
139 z[q] = interpolate(x, Dz[q], z[q], primitive + OFFSET(Primitive, z), false, false);
140
141 if(state.depthBias)
142 {
143 z[q] += SIMD::Float(*Pointer<Float>(primitive + OFFSET(Primitive, zBias)));
144 }
145
146 unclampedZ[q] = z[q];
147 }
148 }
149
150 Bool depthPass = false;
151
// Early path: clamp, depth test and depth bounds test each sample, then write
// stencil, all before the shader runs.
152 if(earlyFragmentTests)
153 {
154 for(unsigned int q : samples)
155 {
156 z[q] = clampDepth(z[q]);
157 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
158 depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
159 }
160
161 writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
162 }
163
// NOTE(review): the capitalized If takes the Reactor Bool depthPass, so it is
// presumably a branch in the generated routine, whereas the lowercase if
// statements are evaluated while generating it — confirm against Reactor.
164 If(depthPass || !earlyFragmentTests)
165 {
166 if(earlyFragmentTests)
167 {
168 writeDepth(zBuffer, x, zMask, samples);
169 occlusionSampleCount(zMask, sMask, samples);
170 }
171
172 // TODO(b/236162233): Use SIMD::Float2
173 SIMD::Float xCentroid = 0.0f;
174 SIMD::Float yCentroid = 0.0f;
175
// Centroid position: sum the per-sample offsets and weights looked up by the
// coverage mask, normalize by the total weight, then add the fragment
// position.
176 if(state.centroid || shaderContainsInterpolation) // TODO(b/194714095)
177 {
// Presumably the tiny initial weight avoids a division by zero when no sample
// contributes — TODO confirm.
178 SIMD::Float weight = 1.0e-9f;
179
180 for(unsigned int q : samples)
181 {
182 ASSERT(SIMD::Width == 4);
183 xCentroid += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleX[q]) + 16 * cMask[q]));
184 yCentroid += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, sampleY[q]) + 16 * cMask[q]));
185 weight += SIMD::Float(*Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]));
186 }
187
188 weight = Rcp(weight, true /* relaxedPrecision */);
189 xCentroid *= weight;
190 yCentroid *= weight;
191
192 xCentroid += xFragment;
193 yCentroid += yFragment;
194 }
195
// Interpolate w at the fragment position and keep its reciprocal in rhw; the
// centroid variant is evaluated at (xCentroid, yCentroid).
196 if(interpolateW())
197 {
198 w = interpolate(xFragment, Dw, rhw, primitive + OFFSET(Primitive, w), false, false);
199 rhw = reciprocal(w, false, true);
200
201 if(state.centroid || shaderContainsInterpolation) // TODO(b/194714095)
202 {
203 rhwCentroid = reciprocal(SpirvRoutine::interpolateAtXY(xCentroid, yCentroid, rhwCentroid, primitive + OFFSET(Primitive, w), SpirvRoutine::Linear));
204 }
205 }
206
// Shader input setup: interpolation data, interface inputs, builtins and
// clip/cull distances.
207 if(spirvShader)
208 {
// Hand the positions and reciprocal w values to the routine for shaders that
// use interpolation functions.
209 if(shaderContainsInterpolation) // TODO(b/194714095)
210 {
211 routine.interpolationData.primitive = primitive;
212
213 routine.interpolationData.x = xFragment;
214 routine.interpolationData.y = yFragment;
215 routine.interpolationData.rhw = rhw;
216
217 routine.interpolationData.xCentroid = xCentroid;
218 routine.interpolationData.yCentroid = yCentroid;
219 routine.interpolationData.rhwCentroid = rhwCentroid;
220 }
221
222 SIMD::Float xSample = xFragment;
223 SIMD::Float ySample = yFragment;
224
// With per-sample shading and more than one sample, offset the interpolation
// position by the location of this invocation's first sample.
225 if(perSampleShading && (state.multiSampleCount > 1))
226 {
227 xSample += SampleLocationsX[samples[0]];
228 ySample += SampleLocationsY[samples[0]];
229 }
230
// packedInterpolant only advances for used inputs and indexes Primitive::V
// and routine.inputsInterpolation; interfaceInterpolant indexes the shader
// inputs, routine.inputs and Dv.
231 int packedInterpolant = 0;
232 for(int interfaceInterpolant = 0; interfaceInterpolant < MAX_INTERFACE_COMPONENTS; interfaceInterpolant++)
233 {
234 const auto &input = spirvShader->inputs[interfaceInterpolant];
235 if(input.Type != Spirv::ATTRIBTYPE_UNUSED)
236 {
237 routine.inputsInterpolation[packedInterpolant] = input.Flat ? SpirvRoutine::Flat : (input.NoPerspective ? SpirvRoutine::Linear : SpirvRoutine::Perspective);
// Centroid inputs (with multisampling) are evaluated at the centroid,
// per-sample shaded inputs at the sample position, everything else at the
// fragment position.
238 if(input.Centroid && state.enableMultiSampling)
239 {
240 routine.inputs[interfaceInterpolant] =
241 SpirvRoutine::interpolateAtXY(xCentroid, yCentroid, rhwCentroid,
242 primitive + OFFSET(Primitive, V[packedInterpolant]),
243 routine.inputsInterpolation[packedInterpolant]);
244 }
245 else if(perSampleShading)
246 {
247 routine.inputs[interfaceInterpolant] =
248 SpirvRoutine::interpolateAtXY(xSample, ySample, rhw,
249 primitive + OFFSET(Primitive, V[packedInterpolant]),
250 routine.inputsInterpolation[packedInterpolant]);
251 }
252 else
253 {
254 routine.inputs[interfaceInterpolant] =
255 interpolate(xFragment, Dv[interfaceInterpolant], rhw,
256 primitive + OFFSET(Primitive, V[packedInterpolant]),
257 input.Flat, !input.NoPerspective);
258 }
259 packedInterpolant++;
260 }
261 }
262
263 setBuiltins(x, y, unclampedZ, w, cMask, samples);
264
// Clip distances: fragments whose interpolated distance is negative are
// removed from the coverage mask of every sample. The distance is also
// stored in the shader's ClipDistance input when it declares one.
265 for(uint32_t i = 0; i < state.numClipDistances; i++)
266 {
267 auto distance = interpolate(xFragment, DclipDistance[i], rhw,
268 primitive + OFFSET(Primitive, clipDistance[i]),
269 false, true);
270
271 auto clipMask = SignMask(CmpGE(distance, SIMD::Float(0)));
272 for(unsigned int q : samples)
273 {
274 // FIXME(b/148105887): Fragments discarded by clipping do not exist at
275 // all -- they should not be counted in queries or have their Z/S effects
276 // performed when early fragment tests are enabled.
277 cMask[q] &= clipMask;
278 }
279
280 if(spirvShader->getUsedCapabilities().ClipDistance)
281 {
282 auto it = spirvShader->inputBuiltins.find(spv::BuiltInClipDistance);
283 if(it != spirvShader->inputBuiltins.end())
284 {
285 if(i < it->second.SizeInComponents)
286 {
287 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] = distance;
288 }
289 }
290 }
291 }
292
// Cull distances are only interpolated into the shader's CullDistance input;
// no masking is done here.
293 if(spirvShader->getUsedCapabilities().CullDistance)
294 {
295 auto it = spirvShader->inputBuiltins.find(spv::BuiltInCullDistance);
296 if(it != spirvShader->inputBuiltins.end())
297 {
298 for(uint32_t i = 0; i < state.numCullDistances; i++)
299 {
300 if(i < it->second.SizeInComponents)
301 {
302 routine.getVariable(it->second.Id)[it->second.FirstComponent + i] =
303 interpolate(xFragment, DcullDistance[i], rhw,
304 primitive + OFFSET(Primitive, cullDistance[i]),
305 false, true);
306 }
307 }
308 }
309 }
310 }
311
// When the tests run late, cMask is passed in place of the stencil and depth
// masks.
312 if(spirvShader)
313 {
314 executeShader(cMask, earlyFragmentTests ? sMask : cMask, earlyFragmentTests ? zMask : cMask, samples);
315 }
316
317 Bool alphaPass = alphaTest(cMask, samples);
318
// If the shader may have modified coverage, or alpha-to-coverage is enabled,
// propagate the updated coverage into the depth and stencil masks.
319 if((spirvShader && spirvShader->coverageModified()) || state.alphaToCoverage)
320 {
321 for(unsigned int q : samples)
322 {
323 zMask[q] &= cMask[q];
324 sMask[q] &= cMask[q];
325 }
326 }
327
328 If(alphaPass)
329 {
// Late path: the depth tests skipped before the shader are performed now.
330 if(!earlyFragmentTests)
331 {
332 for(unsigned int q : samples)
333 {
334 z[q] = clampDepth(z[q]);
335 depthPass = depthPass || depthTest(zBuffer, q, x, z[q], sMask[q], zMask[q], cMask[q]);
336 depthBoundsTest(zBuffer, q, x, zMask[q], cMask[q]);
337 }
338 }
339
340 If(depthPass)
341 {
342 if(!earlyFragmentTests)
343 {
344 writeDepth(zBuffer, x, zMask, samples);
345 occlusionSampleCount(zMask, sMask, samples);
346 }
347
348 blendColor(cBuffer, x, sMask, zMask, cMask, samples);
349 }
350 }
351 }
352
// Late path: stencil is written after the shader. This sits outside the If
// above, so it is emitted regardless of the depth/alpha results.
353 if(!earlyFragmentTests)
354 {
355 writeStencil(sBuffer, x, sMask, zMask, cMask, samples);
356 }
357 }
358 }
359
stencilTest(const Pointer<Byte> & sBuffer,const Int & x,Int sMask[4],const SampleSet & samples)360 void PixelRoutine::stencilTest(const Pointer<Byte> &sBuffer, const Int &x, Int sMask[4], const SampleSet &samples)
361 {
362 if(!state.stencilActive)
363 {
364 return;
365 }
366
367 for(unsigned int q : samples)
368 {
369 // (StencilRef & StencilMask) CompFunc (StencilBufferValue & StencilMask)
370
371 Pointer<Byte> buffer = sBuffer + x;
372
373 if(q > 0)
374 {
375 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
376 }
377
378 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
379 Byte8 value = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
380 value = value | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
381 Byte8 valueBack = value;
382
383 if(state.frontStencil.useCompareMask)
384 {
385 value &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].testMaskQ));
386 }
387
388 stencilTest(value, state.frontStencil.compareOp, false);
389
390 if(state.backStencil.useCompareMask)
391 {
392 valueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].testMaskQ));
393 }
394
395 stencilTest(valueBack, state.backStencil.compareOp, true);
396
397 value &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
398 valueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
399 value |= valueBack;
400
401 sMask[q] &= SignMask(value);
402 }
403 }
404
stencilTest(Byte8 & value,VkCompareOp stencilCompareMode,bool isBack)405 void PixelRoutine::stencilTest(Byte8 &value, VkCompareOp stencilCompareMode, bool isBack)
406 {
407 Byte8 equal;
408
409 switch(stencilCompareMode)
410 {
411 case VK_COMPARE_OP_ALWAYS:
412 value = Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
413 break;
414 case VK_COMPARE_OP_NEVER:
415 value = Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
416 break;
417 case VK_COMPARE_OP_LESS: // a < b ~ b > a
418 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
419 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
420 break;
421 case VK_COMPARE_OP_EQUAL:
422 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
423 break;
424 case VK_COMPARE_OP_NOT_EQUAL: // a != b ~ !(a == b)
425 value = CmpEQ(value, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
426 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
427 break;
428 case VK_COMPARE_OP_LESS_OR_EQUAL: // a <= b ~ (b > a) || (a == b)
429 equal = value;
430 equal = CmpEQ(equal, *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedQ)));
431 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
432 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
433 value |= equal;
434 break;
435 case VK_COMPARE_OP_GREATER: // a > b
436 equal = *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ));
437 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
438 equal = CmpGT(As<SByte8>(equal), As<SByte8>(value));
439 value = equal;
440 break;
441 case VK_COMPARE_OP_GREATER_OR_EQUAL: // a >= b ~ !(a < b) ~ !(b > a)
442 value += Byte8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
443 value = CmpGT(As<SByte8>(value), *Pointer<SByte8>(data + OFFSET(DrawData, stencil[isBack].referenceMaskedSignedQ)));
444 value ^= Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
445 break;
446 default:
447 UNSUPPORTED("VkCompareOp: %d", int(stencilCompareMode));
448 }
449 }
450
readDepth32F(const Pointer<Byte> & zBuffer,int q,const Int & x) const451 SIMD::Float PixelRoutine::readDepth32F(const Pointer<Byte> &zBuffer, int q, const Int &x) const
452 {
453 ASSERT(SIMD::Width == 4);
454 Pointer<Byte> buffer = zBuffer + 4 * x;
455 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
456
457 if(q > 0)
458 {
459 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
460 }
461
462 Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
463 return SIMD::Float(zValue);
464 }
465
readDepth16(const Pointer<Byte> & zBuffer,int q,const Int & x) const466 SIMD::Float PixelRoutine::readDepth16(const Pointer<Byte> &zBuffer, int q, const Int &x) const
467 {
468 ASSERT(SIMD::Width == 4);
469 Pointer<Byte> buffer = zBuffer + 2 * x;
470 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
471
472 if(q > 0)
473 {
474 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
475 }
476
477 UShort4 zValue16;
478 zValue16 = As<UShort4>(Insert(As<Int2>(zValue16), *Pointer<Int>(buffer), 0));
479 zValue16 = As<UShort4>(Insert(As<Int2>(zValue16), *Pointer<Int>(buffer + pitch), 1));
480 Float4 zValue = Float4(zValue16);
481 return SIMD::Float(zValue);
482 }
483
// Returns z limited to [state.minDepthClamp, state.maxDepthClamp] when depth
// clamping is enabled, and z unchanged otherwise.
SIMD::Float PixelRoutine::clampDepth(const SIMD::Float &z)
{
	if(state.depthClamp)
	{
		return Min(Max(z, state.minDepthClamp), state.maxDepthClamp);
	}

	return z;
}
493
depthTest(const Pointer<Byte> & zBuffer,int q,const Int & x,const SIMD::Float & z,const Int & sMask,Int & zMask,const Int & cMask)494 Bool PixelRoutine::depthTest(const Pointer<Byte> &zBuffer, int q, const Int &x, const SIMD::Float &z, const Int &sMask, Int &zMask, const Int &cMask)
495 {
496 if(!state.depthTestActive)
497 {
498 return true;
499 }
500
501 SIMD::Float Z;
502 SIMD::Float zValue;
503
504 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
505 {
506 switch(state.depthFormat)
507 {
508 case VK_FORMAT_D16_UNORM:
509 Z = Min(Max(Round(z * 0xFFFF), 0.0f), 0xFFFF);
510 zValue = readDepth16(zBuffer, q, x);
511 break;
512 case VK_FORMAT_D32_SFLOAT:
513 case VK_FORMAT_D32_SFLOAT_S8_UINT:
514 Z = z;
515 zValue = readDepth32F(zBuffer, q, x);
516 break;
517 default:
518 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
519 return false;
520 }
521 }
522
523 SIMD::Int zTest;
524
525 switch(state.depthCompareMode)
526 {
527 case VK_COMPARE_OP_ALWAYS:
528 // Optimized
529 break;
530 case VK_COMPARE_OP_NEVER:
531 // Optimized
532 break;
533 case VK_COMPARE_OP_EQUAL:
534 zTest = CmpEQ(zValue, Z);
535 break;
536 case VK_COMPARE_OP_NOT_EQUAL:
537 zTest = CmpNEQ(zValue, Z);
538 break;
539 case VK_COMPARE_OP_LESS:
540 zTest = CmpNLE(zValue, Z);
541 break;
542 case VK_COMPARE_OP_GREATER_OR_EQUAL:
543 zTest = CmpLE(zValue, Z);
544 break;
545 case VK_COMPARE_OP_LESS_OR_EQUAL:
546 zTest = CmpNLT(zValue, Z);
547 break;
548 case VK_COMPARE_OP_GREATER:
549 zTest = CmpLT(zValue, Z);
550 break;
551 default:
552 UNSUPPORTED("VkCompareOp: %d", int(state.depthCompareMode));
553 }
554
555 switch(state.depthCompareMode)
556 {
557 case VK_COMPARE_OP_ALWAYS:
558 zMask = cMask;
559 break;
560 case VK_COMPARE_OP_NEVER:
561 zMask = 0x0;
562 break;
563 default:
564 zMask = SignMask(zTest) & cMask;
565 break;
566 }
567
568 if(state.stencilActive)
569 {
570 zMask &= sMask;
571 }
572
573 return zMask != 0;
574 }
575
depthBoundsTest16(const Pointer<Byte> & zBuffer,int q,const Int & x)576 Int4 PixelRoutine::depthBoundsTest16(const Pointer<Byte> &zBuffer, int q, const Int &x)
577 {
578 Pointer<Byte> buffer = zBuffer + 2 * x;
579 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
580
581 if(q > 0)
582 {
583 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
584 }
585
586 Float4 minDepthBound(state.minDepthBounds);
587 Float4 maxDepthBound(state.maxDepthBounds);
588
589 Int2 z;
590 z = Insert(z, *Pointer<Int>(buffer), 0);
591 z = Insert(z, *Pointer<Int>(buffer + pitch), 1);
592
593 Float4 zValue = Float4(As<UShort4>(z)) * (1.0f / 0xFFFF);
594 return Int4(CmpLE(minDepthBound, zValue) & CmpLE(zValue, maxDepthBound));
595 }
596
depthBoundsTest32F(const Pointer<Byte> & zBuffer,int q,const Int & x)597 Int4 PixelRoutine::depthBoundsTest32F(const Pointer<Byte> &zBuffer, int q, const Int &x)
598 {
599 Pointer<Byte> buffer = zBuffer + 4 * x;
600 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
601
602 if(q > 0)
603 {
604 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
605 }
606
607 Float4 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
608 return Int4(CmpLE(state.minDepthBounds, zValue) & CmpLE(zValue, state.maxDepthBounds));
609 }
610
depthBoundsTest(const Pointer<Byte> & zBuffer,int q,const Int & x,Int & zMask,Int & cMask)611 void PixelRoutine::depthBoundsTest(const Pointer<Byte> &zBuffer, int q, const Int &x, Int &zMask, Int &cMask)
612 {
613 if(!state.depthBoundsTestActive)
614 {
615 return;
616 }
617
618 Int4 zTest;
619 switch(state.depthFormat)
620 {
621 case VK_FORMAT_D16_UNORM:
622 zTest = depthBoundsTest16(zBuffer, q, x);
623 break;
624 case VK_FORMAT_D32_SFLOAT:
625 case VK_FORMAT_D32_SFLOAT_S8_UINT:
626 zTest = depthBoundsTest32F(zBuffer, q, x);
627 break;
628 default:
629 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
630 break;
631 }
632
633 if(!state.depthTestActive)
634 {
635 cMask &= zMask & SignMask(zTest);
636 }
637 else
638 {
639 zMask &= cMask & SignMask(zTest);
640 }
641 }
642
alphaToCoverage(Int cMask[4],const SIMD::Float & alpha,const SampleSet & samples)643 void PixelRoutine::alphaToCoverage(Int cMask[4], const SIMD::Float &alpha, const SampleSet &samples)
644 {
645 static const int a2c[4] = {
646 OFFSET(DrawData, a2c0),
647 OFFSET(DrawData, a2c1),
648 OFFSET(DrawData, a2c2),
649 OFFSET(DrawData, a2c3),
650 };
651
652 for(unsigned int q : samples)
653 {
654 SIMD::Int coverage = CmpNLT(alpha, SIMD::Float(*Pointer<Float>(data + a2c[q])));
655 Int aMask = SignMask(coverage);
656 cMask[q] &= aMask;
657 }
658 }
659
writeDepth32F(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)660 void PixelRoutine::writeDepth32F(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
661 {
662 Float4 Z = z;
663
664 Pointer<Byte> buffer = zBuffer + 4 * x;
665 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
666
667 if(q > 0)
668 {
669 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
670 }
671
672 Float4 zValue;
673
674 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
675 {
676 zValue = Float4(*Pointer<Float2>(buffer), *Pointer<Float2>(buffer + pitch));
677 }
678
679 Z = As<Float4>(As<Int4>(Z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + zMask * 16, 16));
680 zValue = As<Float4>(As<Int4>(zValue) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + zMask * 16, 16));
681 Z = As<Float4>(As<Int4>(Z) | As<Int4>(zValue));
682
683 *Pointer<Float2>(buffer) = Float2(Z.xy);
684 *Pointer<Float2>(buffer + pitch) = Float2(Z.zw);
685 }
686
writeDepth16(Pointer<Byte> & zBuffer,int q,const Int & x,const Float4 & z,const Int & zMask)687 void PixelRoutine::writeDepth16(Pointer<Byte> &zBuffer, int q, const Int &x, const Float4 &z, const Int &zMask)
688 {
689 Short4 Z = UShort4(Round(z * 0xFFFF), true);
690
691 Pointer<Byte> buffer = zBuffer + 2 * x;
692 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, depthPitchB));
693
694 if(q > 0)
695 {
696 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, depthSliceB));
697 }
698
699 Short4 zValue;
700
701 if(state.depthCompareMode != VK_COMPARE_OP_NEVER || (state.depthCompareMode != VK_COMPARE_OP_ALWAYS && !state.depthWriteEnable))
702 {
703 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer), 0));
704 zValue = As<Short4>(Insert(As<Int2>(zValue), *Pointer<Int>(buffer + pitch), 1));
705 }
706
707 Z = Z & *Pointer<Short4>(constants + OFFSET(Constants, maskW4Q) + zMask * 8, 8);
708 zValue = zValue & *Pointer<Short4>(constants + OFFSET(Constants, invMaskW4Q) + zMask * 8, 8);
709 Z = Z | zValue;
710
711 *Pointer<Int>(buffer) = Extract(As<Int2>(Z), 0);
712 *Pointer<Int>(buffer + pitch) = Extract(As<Int2>(Z), 1);
713 }
714
writeDepth(Pointer<Byte> & zBuffer,const Int & x,const Int zMask[4],const SampleSet & samples)715 void PixelRoutine::writeDepth(Pointer<Byte> &zBuffer, const Int &x, const Int zMask[4], const SampleSet &samples)
716 {
717 if(!state.depthWriteEnable)
718 {
719 return;
720 }
721
722 for(unsigned int q : samples)
723 {
724 ASSERT(SIMD::Width == 4);
725 switch(state.depthFormat)
726 {
727 case VK_FORMAT_D16_UNORM:
728 writeDepth16(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
729 break;
730 case VK_FORMAT_D32_SFLOAT:
731 case VK_FORMAT_D32_SFLOAT_S8_UINT:
732 writeDepth32F(zBuffer, q, x, Extract128(z[q], 0), zMask[q]);
733 break;
734 default:
735 UNSUPPORTED("Depth format: %d", int(state.depthFormat));
736 break;
737 }
738 }
739 }
740
occlusionSampleCount(const Int zMask[4],const Int sMask[4],const SampleSet & samples)741 void PixelRoutine::occlusionSampleCount(const Int zMask[4], const Int sMask[4], const SampleSet &samples)
742 {
743 if(!state.occlusionEnabled)
744 {
745 return;
746 }
747
748 for(unsigned int q : samples)
749 {
750 occlusion += *Pointer<UInt>(constants + OFFSET(Constants, occlusionCount) + 4 * (zMask[q] & sMask[q]));
751 }
752 }
753
writeStencil(Pointer<Byte> & sBuffer,const Int & x,const Int sMask[4],const Int zMask[4],const Int cMask[4],const SampleSet & samples)754 void PixelRoutine::writeStencil(Pointer<Byte> &sBuffer, const Int &x, const Int sMask[4], const Int zMask[4], const Int cMask[4], const SampleSet &samples)
755 {
756 if(!state.stencilActive)
757 {
758 return;
759 }
760
761 if(state.frontStencil.passOp == VK_STENCIL_OP_KEEP && state.frontStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.frontStencil.failOp == VK_STENCIL_OP_KEEP)
762 {
763 if(state.backStencil.passOp == VK_STENCIL_OP_KEEP && state.backStencil.depthFailOp == VK_STENCIL_OP_KEEP && state.backStencil.failOp == VK_STENCIL_OP_KEEP)
764 {
765 return;
766 }
767 }
768
769 if(!state.frontStencil.writeEnabled && !state.backStencil.writeEnabled)
770 {
771 return;
772 }
773
774 for(unsigned int q : samples)
775 {
776 Pointer<Byte> buffer = sBuffer + x;
777
778 if(q > 0)
779 {
780 buffer += q * *Pointer<Int>(data + OFFSET(DrawData, stencilSliceB));
781 }
782
783 Int pitch = *Pointer<Int>(data + OFFSET(DrawData, stencilPitchB));
784 Byte8 bufferValue = *Pointer<Byte8>(buffer) & Byte8(-1, -1, 0, 0, 0, 0, 0, 0);
785 bufferValue = bufferValue | (*Pointer<Byte8>(buffer + pitch - 2) & Byte8(0, 0, -1, -1, 0, 0, 0, 0));
786 Byte8 newValue = stencilOperation(bufferValue, state.frontStencil, false, zMask[q], sMask[q]);
787
788 if(state.frontStencil.useWriteMask) // Assume 8-bit stencil buffer
789 {
790 Byte8 maskedValue = bufferValue;
791 newValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].writeMaskQ));
792 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[0].invWriteMaskQ));
793 newValue |= maskedValue;
794 }
795
796 Byte8 newValueBack = stencilOperation(bufferValue, state.backStencil, true, zMask[q], sMask[q]);
797
798 if(state.backStencil.useWriteMask) // Assume 8-bit stencil buffer
799 {
800 Byte8 maskedValue = bufferValue;
801 newValueBack &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].writeMaskQ));
802 maskedValue &= *Pointer<Byte8>(data + OFFSET(DrawData, stencil[1].invWriteMaskQ));
803 newValueBack |= maskedValue;
804 }
805
806 newValue &= *Pointer<Byte8>(primitive + OFFSET(Primitive, clockwiseMask));
807 newValueBack &= *Pointer<Byte8>(primitive + OFFSET(Primitive, invClockwiseMask));
808 newValue |= newValueBack;
809
810 newValue &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * cMask[q]);
811 bufferValue &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * cMask[q]);
812 newValue |= bufferValue;
813
814 *Pointer<Short>(buffer) = Extract(As<Short4>(newValue), 0);
815 *Pointer<Short>(buffer + pitch) = Extract(As<Short4>(newValue), 1);
816 }
817 }
818
// Computes the new stencil values for one face by combining the results of
// the pass, depth-fail and stencil-fail operations. zMask selects between the
// pass and depth-fail results, and sMask between that and the stencil-fail
// result. Operations that would produce the same result as one already
// computed are skipped.
Byte8 PixelRoutine::stencilOperation(const Byte8 &bufferValue, const PixelProcessor::States::StencilOpState &ops, bool isBack, const Int &zMask, const Int &sMask)
{
	Byte8 result = stencilOperation(bufferValue, ops.passOp, isBack);

	// zMask valid and values not the same
	const bool needsDepthFail = state.depthTestActive && (ops.depthFailOp != ops.passOp);

	if(needsDepthFail)
	{
		Byte8 depthFailed = stencilOperation(bufferValue, ops.depthFailOp, isBack);

		result &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * zMask);
		depthFailed &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * zMask);
		result |= depthFailed;
	}

	const bool needsStencilFail = (ops.failOp != ops.passOp) ||
	                              (state.depthTestActive && (ops.failOp != ops.depthFailOp));

	if(needsStencilFail)
	{
		Byte8 stencilFailed = stencilOperation(bufferValue, ops.failOp, isBack);

		result &= *Pointer<Byte8>(constants + OFFSET(Constants, maskB4Q) + 8 * sMask);
		stencilFailed &= *Pointer<Byte8>(constants + OFFSET(Constants, invMaskB4Q) + 8 * sMask);
		result |= stencilFailed;
	}

	return result;
}
843
hasStencilReplaceRef() const844 bool PixelRoutine::hasStencilReplaceRef() const
845 {
846 return spirvShader &&
847 (spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT) !=
848 spirvShader->outputBuiltins.end());
849 }
850
stencilReplaceRef()851 Byte8 PixelRoutine::stencilReplaceRef()
852 {
853 ASSERT(spirvShader);
854
855 auto it = spirvShader->outputBuiltins.find(spv::BuiltInFragStencilRefEXT);
856 ASSERT(it != spirvShader->outputBuiltins.end());
857
858 UInt4 sRef = As<UInt4>(routine.getVariable(it->second.Id)[it->second.FirstComponent]) & UInt4(0xff);
859 // TODO (b/148295813): Could be done with a single pshufb instruction. Optimize the
860 // following line by either adding a rr::Shuffle() variant to do
861 // it explicitly or adding a Byte4(Int4) constructor would work.
862 sRef.x = rr::UInt(sRef.x) | (rr::UInt(sRef.y) << 8) | (rr::UInt(sRef.z) << 16) | (rr::UInt(sRef.w) << 24);
863
864 UInt2 sRefDuplicated;
865 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 0);
866 sRefDuplicated = Insert(sRefDuplicated, sRef.x, 1);
867 return As<Byte8>(sRefDuplicated);
868 }
869
// Applies a single stencil operation to the stencil buffer values and returns
// the result. When the shader exports a stencil reference, that value is
// returned regardless of the requested operation. An unsupported operation is
// reported and yields zero.
Byte8 PixelRoutine::stencilOperation(const Byte8 &bufferValue, VkStencilOp operation, bool isBack)
{
	if(hasStencilReplaceRef())
	{
		return stencilReplaceRef();
	}

	switch(operation)
	{
	case VK_STENCIL_OP_KEEP:
		return bufferValue;
	case VK_STENCIL_OP_ZERO:
		return Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
	case VK_STENCIL_OP_REPLACE:
		return *Pointer<Byte8>(data + OFFSET(DrawData, stencil[isBack].referenceQ));
	case VK_STENCIL_OP_INCREMENT_AND_CLAMP:
		return AddSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
	case VK_STENCIL_OP_DECREMENT_AND_CLAMP:
		return SubSat(bufferValue, Byte8(1, 1, 1, 1, 1, 1, 1, 1));
	case VK_STENCIL_OP_INVERT:
		return bufferValue ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
	case VK_STENCIL_OP_INCREMENT_AND_WRAP:
		return bufferValue + Byte8(1, 1, 1, 1, 1, 1, 1, 1);
	case VK_STENCIL_OP_DECREMENT_AND_WRAP:
		return bufferValue - Byte8(1, 1, 1, 1, 1, 1, 1, 1);
	default:
		UNSUPPORTED("VkStencilOp: %d", int(operation));
	}

	return Byte8(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
}
903
isSRGB(int index) const904 bool PixelRoutine::isSRGB(int index) const
905 {
906 return vk::Format(state.colorFormat[index]).isSRGBformat();
907 }
908
readPixel(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4s & pixel)909 void PixelRoutine::readPixel(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4s &pixel)
910 {
911 Short4 c01;
912 Short4 c23;
913 Pointer<Byte> buffer = cBuffer;
914 Pointer<Byte> buffer2;
915
916 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
917
918 vk::Format format = state.colorFormat[index];
919 switch(format)
920 {
921 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
922 buffer += 2 * x;
923 buffer2 = buffer + pitchB;
924 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
925
926 pixel.x = (c01 & Short4(0xF000u));
927 pixel.y = (c01 & Short4(0x0F00u)) << 4;
928 pixel.z = (c01 & Short4(0x00F0u)) << 8;
929 pixel.w = (c01 & Short4(0x000Fu)) << 12;
930
931 // Expand to 16 bit range
932 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
933 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
934 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
935 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
936 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
937 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
938 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
939 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
940 break;
941 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
942 buffer += 2 * x;
943 buffer2 = buffer + pitchB;
944 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
945
946 pixel.z = (c01 & Short4(0xF000u));
947 pixel.y = (c01 & Short4(0x0F00u)) << 4;
948 pixel.x = (c01 & Short4(0x00F0u)) << 8;
949 pixel.w = (c01 & Short4(0x000Fu)) << 12;
950
951 // Expand to 16 bit range
952 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
953 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
954 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
955 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
956 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
957 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
958 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
959 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
960 break;
961 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
962 buffer += 2 * x;
963 buffer2 = buffer + pitchB;
964 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
965
966 pixel.w = (c01 & Short4(0xF000u));
967 pixel.z = (c01 & Short4(0x0F00u)) << 4;
968 pixel.y = (c01 & Short4(0x00F0u)) << 8;
969 pixel.x = (c01 & Short4(0x000Fu)) << 12;
970
971 // Expand to 16 bit range
972 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
973 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
974 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
975 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
976 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
977 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
978 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
979 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
980 break;
981 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
982 buffer += 2 * x;
983 buffer2 = buffer + pitchB;
984 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
985
986 pixel.w = (c01 & Short4(0xF000u));
987 pixel.x = (c01 & Short4(0x0F00u)) << 4;
988 pixel.y = (c01 & Short4(0x00F0u)) << 8;
989 pixel.z = (c01 & Short4(0x000Fu)) << 12;
990
991 // Expand to 16 bit range
992 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 4);
993 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 8);
994 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 4);
995 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 8);
996 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 4);
997 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 8);
998 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
999 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1000 break;
1001 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
1002 buffer += 2 * x;
1003 buffer2 = buffer + pitchB;
1004 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1005
1006 pixel.x = (c01 & Short4(0xF800u));
1007 pixel.y = (c01 & Short4(0x07C0u)) << 5;
1008 pixel.z = (c01 & Short4(0x003Eu)) << 10;
1009 pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1010
1011 // Expand to 16 bit range
1012 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1013 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1014 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1015 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1016 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1017 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1018 break;
1019 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
1020 buffer += 2 * x;
1021 buffer2 = buffer + pitchB;
1022 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1023
1024 pixel.z = (c01 & Short4(0xF800u));
1025 pixel.y = (c01 & Short4(0x07C0u)) << 5;
1026 pixel.x = (c01 & Short4(0x003Eu)) << 10;
1027 pixel.w = ((c01 & Short4(0x0001u)) << 15) >> 15;
1028
1029 // Expand to 16 bit range
1030 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1031 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1032 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1033 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1034 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1035 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1036 break;
1037 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
1038 buffer += 2 * x;
1039 buffer2 = buffer + pitchB;
1040 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1041
1042 pixel.x = (c01 & Short4(0x7C00u)) << 1;
1043 pixel.y = (c01 & Short4(0x03E0u)) << 6;
1044 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1045 pixel.w = (c01 & Short4(0x8000u)) >> 15;
1046
1047 // Expand to 16 bit range
1048 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1049 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1050 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 5);
1051 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1052 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1053 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1054 break;
1055 case VK_FORMAT_R5G6B5_UNORM_PACK16:
1056 buffer += 2 * x;
1057 buffer2 = buffer + pitchB;
1058 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1059
1060 pixel.x = c01 & Short4(0xF800u);
1061 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1062 pixel.z = (c01 & Short4(0x001Fu)) << 11;
1063 pixel.w = Short4(0xFFFFu);
1064
1065 // Expand to 16 bit range
1066 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1067 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1068 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1069 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1070 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1071 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1072 break;
1073 case VK_FORMAT_B5G6R5_UNORM_PACK16:
1074 buffer += 2 * x;
1075 buffer2 = buffer + pitchB;
1076 c01 = As<Short4>(Int2(*Pointer<Int>(buffer), *Pointer<Int>(buffer2)));
1077
1078 pixel.z = c01 & Short4(0xF800u);
1079 pixel.y = (c01 & Short4(0x07E0u)) << 5;
1080 pixel.x = (c01 & Short4(0x001Fu)) << 11;
1081 pixel.w = Short4(0xFFFFu);
1082
1083 // Expand to 16 bit range
1084 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 5);
1085 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1086 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 6);
1087 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 12);
1088 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 5);
1089 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1090 break;
1091 case VK_FORMAT_B8G8R8A8_UNORM:
1092 case VK_FORMAT_B8G8R8A8_SRGB:
1093 buffer += 4 * x;
1094 c01 = *Pointer<Short4>(buffer);
1095 buffer += pitchB;
1096 c23 = *Pointer<Short4>(buffer);
1097 pixel.z = c01;
1098 pixel.y = c01;
1099 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1100 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1101 pixel.x = pixel.z;
1102 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1103 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1104 pixel.y = pixel.z;
1105 pixel.w = pixel.x;
1106 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1107 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1108 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1109 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1110 break;
1111 case VK_FORMAT_R8G8B8A8_UNORM:
1112 case VK_FORMAT_R8G8B8A8_SRGB:
1113 buffer += 4 * x;
1114 c01 = *Pointer<Short4>(buffer);
1115 buffer += pitchB;
1116 c23 = *Pointer<Short4>(buffer);
1117 pixel.z = c01;
1118 pixel.y = c01;
1119 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(c23));
1120 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(c23));
1121 pixel.x = pixel.z;
1122 pixel.z = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.y));
1123 pixel.x = UnpackHigh(As<Byte8>(pixel.x), As<Byte8>(pixel.y));
1124 pixel.y = pixel.z;
1125 pixel.w = pixel.x;
1126 pixel.x = UnpackLow(As<Byte8>(pixel.z), As<Byte8>(pixel.z));
1127 pixel.y = UnpackHigh(As<Byte8>(pixel.y), As<Byte8>(pixel.y));
1128 pixel.z = UnpackLow(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1129 pixel.w = UnpackHigh(As<Byte8>(pixel.w), As<Byte8>(pixel.w));
1130 break;
1131 case VK_FORMAT_R8_UNORM:
1132 buffer += 1 * x;
1133 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 0);
1134 buffer += pitchB;
1135 pixel.x = Insert(pixel.x, *Pointer<Short>(buffer), 1);
1136 pixel.x = UnpackLow(As<Byte8>(pixel.x), As<Byte8>(pixel.x));
1137 pixel.y = Short4(0x0000);
1138 pixel.z = Short4(0x0000);
1139 pixel.w = Short4(0xFFFFu);
1140 break;
1141 case VK_FORMAT_R8G8_UNORM:
1142 buffer += 2 * x;
1143 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 0));
1144 buffer += pitchB;
1145 c01 = As<Short4>(Insert(As<Int2>(c01), *Pointer<Int>(buffer), 1));
1146 pixel.x = (c01 & Short4(0x00FFu)) | (c01 << 8);
1147 pixel.y = (c01 & Short4(0xFF00u)) | As<Short4>(As<UShort4>(c01) >> 8);
1148 pixel.z = Short4(0x0000u);
1149 pixel.w = Short4(0xFFFFu);
1150 break;
1151 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
1152 {
1153 Int4 v = Int4(0);
1154 buffer += 4 * x;
1155 v = Insert(v, *Pointer<Int>(buffer + 0), 0);
1156 v = Insert(v, *Pointer<Int>(buffer + 4), 1);
1157 buffer += pitchB;
1158 v = Insert(v, *Pointer<Int>(buffer + 0), 2);
1159 v = Insert(v, *Pointer<Int>(buffer + 4), 3);
1160
1161 pixel.x = Short4(v << 6) & Short4(0xFFC0u);
1162 pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1163 pixel.z = Short4(v >> 14) & Short4(0xFFC0u);
1164 pixel.w = Short4(v >> 16) & Short4(0xC000u);
1165
1166 // Expand to 16 bit range
1167 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1168 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1169 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1170 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1171 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1172 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1173 }
1174 break;
1175 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
1176 {
1177 Int4 v = Int4(0);
1178 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 0);
1179 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 1);
1180 buffer += *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1181 v = Insert(v, *Pointer<Int>(buffer + 4 * x), 2);
1182 v = Insert(v, *Pointer<Int>(buffer + 4 * x + 4), 3);
1183
1184 pixel.x = Short4(v >> 14) & Short4(0xFFC0u);
1185 pixel.y = Short4(v >> 4) & Short4(0xFFC0u);
1186 pixel.z = Short4(v << 6) & Short4(0xFFC0u);
1187 pixel.w = Short4(v >> 16) & Short4(0xC000u);
1188
1189 // Expand to 16 bit range
1190 pixel.x |= As<Short4>(As<UShort4>(pixel.x) >> 10);
1191 pixel.y |= As<Short4>(As<UShort4>(pixel.y) >> 10);
1192 pixel.z |= As<Short4>(As<UShort4>(pixel.z) >> 10);
1193 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 2);
1194 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 4);
1195 pixel.w |= As<Short4>(As<UShort4>(pixel.w) >> 8);
1196 }
1197 break;
1198 default:
1199 UNSUPPORTED("VkFormat %d", int(format));
1200 }
1201 }
1202
blendConstant(vk::Format format,int component,BlendFactorModifier modifier)1203 Float PixelRoutine::blendConstant(vk::Format format, int component, BlendFactorModifier modifier)
1204 {
1205 bool inverse = (modifier == OneMinus);
1206
1207 if(format.isUnsignedNormalized())
1208 {
1209 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantU.v[component]))
1210 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantU.v[component]));
1211 }
1212 else if(format.isSignedNormalized())
1213 {
1214 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantS.v[component]))
1215 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantS.v[component]));
1216 }
1217 else // Floating-point format
1218 {
1219 ASSERT(format.isFloatFormat());
1220 return inverse ? *Pointer<Float>(data + OFFSET(DrawData, factor.invBlendConstantF.v[component]))
1221 : *Pointer<Float>(data + OFFSET(DrawData, factor.blendConstantF.v[component]));
1222 }
1223 }
1224
blendFactorRGB(SIMD::Float4 & blendFactor,const SIMD::Float4 & sourceColor,const SIMD::Float4 & destColor,VkBlendFactor colorBlendFactor,vk::Format format)1225 void PixelRoutine::blendFactorRGB(SIMD::Float4 &blendFactor, const SIMD::Float4 &sourceColor, const SIMD::Float4 &destColor, VkBlendFactor colorBlendFactor, vk::Format format)
1226 {
1227 switch(colorBlendFactor)
1228 {
1229 case VK_BLEND_FACTOR_ZERO:
1230 blendFactor.x = 0.0f;
1231 blendFactor.y = 0.0f;
1232 blendFactor.z = 0.0f;
1233 break;
1234 case VK_BLEND_FACTOR_ONE:
1235 blendFactor.x = 1.0f;
1236 blendFactor.y = 1.0f;
1237 blendFactor.z = 1.0f;
1238 break;
1239 case VK_BLEND_FACTOR_SRC_COLOR:
1240 blendFactor.x = sourceColor.x;
1241 blendFactor.y = sourceColor.y;
1242 blendFactor.z = sourceColor.z;
1243 break;
1244 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1245 blendFactor.x = 1.0f - sourceColor.x;
1246 blendFactor.y = 1.0f - sourceColor.y;
1247 blendFactor.z = 1.0f - sourceColor.z;
1248 break;
1249 case VK_BLEND_FACTOR_DST_COLOR:
1250 blendFactor.x = destColor.x;
1251 blendFactor.y = destColor.y;
1252 blendFactor.z = destColor.z;
1253 break;
1254 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1255 blendFactor.x = 1.0f - destColor.x;
1256 blendFactor.y = 1.0f - destColor.y;
1257 blendFactor.z = 1.0f - destColor.z;
1258 break;
1259 case VK_BLEND_FACTOR_SRC_ALPHA:
1260 blendFactor.x = sourceColor.w;
1261 blendFactor.y = sourceColor.w;
1262 blendFactor.z = sourceColor.w;
1263 break;
1264 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1265 blendFactor.x = 1.0f - sourceColor.w;
1266 blendFactor.y = 1.0f - sourceColor.w;
1267 blendFactor.z = 1.0f - sourceColor.w;
1268 break;
1269 case VK_BLEND_FACTOR_DST_ALPHA:
1270 blendFactor.x = destColor.w;
1271 blendFactor.y = destColor.w;
1272 blendFactor.z = destColor.w;
1273 break;
1274 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1275 blendFactor.x = 1.0f - destColor.w;
1276 blendFactor.y = 1.0f - destColor.w;
1277 blendFactor.z = 1.0f - destColor.w;
1278 break;
1279 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1280 blendFactor.x = 1.0f - destColor.w;
1281 blendFactor.x = Min(blendFactor.x, sourceColor.w);
1282 blendFactor.y = blendFactor.x;
1283 blendFactor.z = blendFactor.x;
1284 break;
1285 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1286 blendFactor.x = blendConstant(format, 0);
1287 blendFactor.y = blendConstant(format, 1);
1288 blendFactor.z = blendConstant(format, 2);
1289 break;
1290 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1291 blendFactor.x = blendConstant(format, 3);
1292 blendFactor.y = blendConstant(format, 3);
1293 blendFactor.z = blendConstant(format, 3);
1294 break;
1295 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1296 blendFactor.x = blendConstant(format, 0, OneMinus);
1297 blendFactor.y = blendConstant(format, 1, OneMinus);
1298 blendFactor.z = blendConstant(format, 2, OneMinus);
1299 break;
1300 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1301 blendFactor.x = blendConstant(format, 3, OneMinus);
1302 blendFactor.y = blendConstant(format, 3, OneMinus);
1303 blendFactor.z = blendConstant(format, 3, OneMinus);
1304 break;
1305
1306 default:
1307 UNSUPPORTED("VkBlendFactor: %d", int(colorBlendFactor));
1308 }
1309
1310 // "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1311 // to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1312 // operations. If the color attachment is floating-point, no clamping occurs."
1313 if(blendFactorCanExceedFormatRange(colorBlendFactor, format))
1314 {
1315 if(format.isUnsignedNormalized())
1316 {
1317 blendFactor.x = Min(Max(blendFactor.x, 0.0f), 1.0f);
1318 blendFactor.y = Min(Max(blendFactor.y, 0.0f), 1.0f);
1319 blendFactor.z = Min(Max(blendFactor.z, 0.0f), 1.0f);
1320 }
1321 else if(format.isSignedNormalized())
1322 {
1323 blendFactor.x = Min(Max(blendFactor.x, -1.0f), 1.0f);
1324 blendFactor.y = Min(Max(blendFactor.y, -1.0f), 1.0f);
1325 blendFactor.z = Min(Max(blendFactor.z, -1.0f), 1.0f);
1326 }
1327 }
1328 }
1329
blendFactorAlpha(SIMD::Float & blendFactorAlpha,const SIMD::Float & sourceAlpha,const SIMD::Float & destAlpha,VkBlendFactor alphaBlendFactor,vk::Format format)1330 void PixelRoutine::blendFactorAlpha(SIMD::Float &blendFactorAlpha, const SIMD::Float &sourceAlpha, const SIMD::Float &destAlpha, VkBlendFactor alphaBlendFactor, vk::Format format)
1331 {
1332 switch(alphaBlendFactor)
1333 {
1334 case VK_BLEND_FACTOR_ZERO:
1335 blendFactorAlpha = 0.0f;
1336 break;
1337 case VK_BLEND_FACTOR_ONE:
1338 blendFactorAlpha = 1.0f;
1339 break;
1340 case VK_BLEND_FACTOR_SRC_COLOR:
1341 blendFactorAlpha = sourceAlpha;
1342 break;
1343 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1344 blendFactorAlpha = 1.0f - sourceAlpha;
1345 break;
1346 case VK_BLEND_FACTOR_DST_COLOR:
1347 blendFactorAlpha = destAlpha;
1348 break;
1349 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1350 blendFactorAlpha = 1.0f - destAlpha;
1351 break;
1352 case VK_BLEND_FACTOR_SRC_ALPHA:
1353 blendFactorAlpha = sourceAlpha;
1354 break;
1355 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1356 blendFactorAlpha = 1.0f - sourceAlpha;
1357 break;
1358 case VK_BLEND_FACTOR_DST_ALPHA:
1359 blendFactorAlpha = destAlpha;
1360 break;
1361 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1362 blendFactorAlpha = 1.0f - destAlpha;
1363 break;
1364 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1365 blendFactorAlpha = 1.0f;
1366 break;
1367 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1368 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1369 blendFactorAlpha = blendConstant(format, 3);
1370 break;
1371 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1372 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1373 blendFactorAlpha = blendConstant(format, 3, OneMinus);
1374 break;
1375 default:
1376 UNSUPPORTED("VkBlendFactor: %d", int(alphaBlendFactor));
1377 }
1378
1379 // "If the color attachment is fixed-point, the components of the source and destination values and blend factors are each clamped
1380 // to [0,1] or [-1,1] respectively for an unsigned normalized or signed normalized color attachment prior to evaluating the blend
1381 // operations. If the color attachment is floating-point, no clamping occurs."
1382 if(blendFactorCanExceedFormatRange(alphaBlendFactor, format))
1383 {
1384 if(format.isUnsignedNormalized())
1385 {
1386 blendFactorAlpha = Min(Max(blendFactorAlpha, 0.0f), 1.0f);
1387 }
1388 else if(format.isSignedNormalized())
1389 {
1390 blendFactorAlpha = Min(Max(blendFactorAlpha, -1.0f), 1.0f);
1391 }
1392 }
1393 }
1394
blendOpOverlay(SIMD::Float & src,SIMD::Float & dst)1395 SIMD::Float PixelRoutine::blendOpOverlay(SIMD::Float &src, SIMD::Float &dst)
1396 {
1397 SIMD::Int largeDst = CmpGT(dst, 0.5f);
1398 return As<SIMD::Float>(
1399 (~largeDst & As<SIMD::Int>(2.0f * src * dst)) |
1400 (largeDst & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
1401 }
1402
blendOpColorDodge(SIMD::Float & src,SIMD::Float & dst)1403 SIMD::Float PixelRoutine::blendOpColorDodge(SIMD::Float &src, SIMD::Float &dst)
1404 {
1405 SIMD::Int srcBelowOne = CmpLT(src, 1.0f);
1406 SIMD::Int positiveDst = CmpGT(dst, 0.0f);
1407 return As<SIMD::Float>(positiveDst & ((~srcBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
1408 (srcBelowOne & As<SIMD::Int>(Min(1.0f, (dst / (1.0f - src)))))));
1409 }
1410
blendOpColorBurn(SIMD::Float & src,SIMD::Float & dst)1411 SIMD::Float PixelRoutine::blendOpColorBurn(SIMD::Float &src, SIMD::Float &dst)
1412 {
1413 SIMD::Int dstBelowOne = CmpLT(dst, 1.0f);
1414 SIMD::Int positiveSrc = CmpGT(src, 0.0f);
1415 return As<SIMD::Float>(
1416 (~dstBelowOne & As<SIMD::Int>(SIMD::Float(1.0f))) |
1417 (dstBelowOne & positiveSrc & As<SIMD::Int>(1.0f - Min(1.0f, (1.0f - dst) / src))));
1418 }
1419
blendOpHardlight(SIMD::Float & src,SIMD::Float & dst)1420 SIMD::Float PixelRoutine::blendOpHardlight(SIMD::Float &src, SIMD::Float &dst)
1421 {
1422 SIMD::Int largeSrc = CmpGT(src, 0.5f);
1423 return As<SIMD::Float>(
1424 (~largeSrc & As<SIMD::Int>(2.0f * src * dst)) |
1425 (largeSrc & As<SIMD::Int>(1.0f - (2.0f * (1.0f - src) * (1.0f - dst)))));
1426 }
1427
blendOpSoftlight(SIMD::Float & src,SIMD::Float & dst)1428 SIMD::Float PixelRoutine::blendOpSoftlight(SIMD::Float &src, SIMD::Float &dst)
1429 {
1430 SIMD::Int largeSrc = CmpGT(src, 0.5f);
1431 SIMD::Int largeDst = CmpGT(dst, 0.25f);
1432
1433 return As<SIMD::Float>(
1434 (~largeSrc & As<SIMD::Int>(dst - ((1.0f - (2.0f * src)) * dst * (1.0f - dst)))) |
1435 (largeSrc & ((~largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * dst * ((((16.0f * dst) - 12.0f) * dst) + 3.0f)))) |
1436 (largeDst & As<SIMD::Int>(dst + (((2.0f * src) - 1.0f) * (Sqrt<Mediump>(dst) - dst)))))));
1437 }
1438
maxRGB(SIMD::Float4 & c)1439 SIMD::Float PixelRoutine::maxRGB(SIMD::Float4 &c)
1440 {
1441 return Max(Max(c.x, c.y), c.z);
1442 }
1443
minRGB(SIMD::Float4 & c)1444 SIMD::Float PixelRoutine::minRGB(SIMD::Float4 &c)
1445 {
1446 return Min(Min(c.x, c.y), c.z);
1447 }
1448
setLumSat(SIMD::Float4 & cbase,SIMD::Float4 & csat,SIMD::Float4 & clum,SIMD::Float & x,SIMD::Float & y,SIMD::Float & z)1449 void PixelRoutine::setLumSat(SIMD::Float4 &cbase, SIMD::Float4 &csat, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
1450 {
1451 SIMD::Float minbase = minRGB(cbase);
1452 SIMD::Float sbase = maxRGB(cbase) - minbase;
1453 SIMD::Float ssat = maxRGB(csat) - minRGB(csat);
1454 SIMD::Int isNonZero = CmpGT(sbase, 0.0f);
1455 SIMD::Float4 color;
1456 color.x = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.x - minbase) * ssat / sbase));
1457 color.y = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.y - minbase) * ssat / sbase));
1458 color.z = As<SIMD::Float>(isNonZero & As<SIMD::Int>((cbase.z - minbase) * ssat / sbase));
1459 setLum(color, clum, x, y, z);
1460 }
1461
lumRGB(SIMD::Float4 & c)1462 SIMD::Float PixelRoutine::lumRGB(SIMD::Float4 &c)
1463 {
1464 return c.x * 0.3f + c.y * 0.59f + c.z * 0.11f;
1465 }
1466
computeLum(SIMD::Float & color,SIMD::Float & lum,SIMD::Float & mincol,SIMD::Float & maxcol,SIMD::Int & negative,SIMD::Int & aboveOne)1467 SIMD::Float PixelRoutine::computeLum(SIMD::Float &color, SIMD::Float &lum, SIMD::Float &mincol, SIMD::Float &maxcol, SIMD::Int &negative, SIMD::Int &aboveOne)
1468 {
1469 return As<SIMD::Float>(
1470 (negative & As<SIMD::Int>(lum + ((color - lum) * lum) / (lum - mincol))) |
1471 (~negative & ((aboveOne & As<SIMD::Int>(lum + ((color - lum) * (1.0f - lum)) / (maxcol - lum))) |
1472 (~aboveOne & As<SIMD::Int>(color)))));
1473 }
1474
setLum(SIMD::Float4 & cbase,SIMD::Float4 & clum,SIMD::Float & x,SIMD::Float & y,SIMD::Float & z)1475 void PixelRoutine::setLum(SIMD::Float4 &cbase, SIMD::Float4 &clum, SIMD::Float &x, SIMD::Float &y, SIMD::Float &z)
1476 {
1477 SIMD::Float lbase = lumRGB(cbase);
1478 SIMD::Float llum = lumRGB(clum);
1479 SIMD::Float ldiff = llum - lbase;
1480
1481 SIMD::Float4 color;
1482 color.x = cbase.x + ldiff;
1483 color.y = cbase.y + ldiff;
1484 color.z = cbase.z + ldiff;
1485
1486 SIMD::Float lum = lumRGB(color);
1487 SIMD::Float mincol = minRGB(color);
1488 SIMD::Float maxcol = maxRGB(color);
1489
1490 SIMD::Int negative = CmpLT(mincol, 0.0f);
1491 SIMD::Int aboveOne = CmpGT(maxcol, 1.0f);
1492
1493 x = computeLum(color.x, lum, mincol, maxcol, negative, aboveOne);
1494 y = computeLum(color.y, lum, mincol, maxcol, negative, aboveOne);
1495 z = computeLum(color.z, lum, mincol, maxcol, negative, aboveOne);
1496 }
1497
premultiply(SIMD::Float4 & c)1498 void PixelRoutine::premultiply(SIMD::Float4 &c)
1499 {
1500 SIMD::Int nonZeroAlpha = CmpNEQ(c.w, 0.0f);
1501 c.x = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.x / c.w));
1502 c.y = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.y / c.w));
1503 c.z = As<SIMD::Float>(nonZeroAlpha & As<SIMD::Int>(c.z / c.w));
1504 }
1505
// Evaluates the advanced blend operation configured for color attachment
// `index` (state.blendState[index].blendOperation).
//
// `src` and `dst` are first scaled component-wise by `srcFactor` and
// `dstFactor`, and their color channels are then divided by their own alpha
// (see premultiply()). The selected operation is applied per color channel, or
// to the whole color for the HSL modes. The result is weighted by
// srcAlpha * dstAlpha, and the source and destination colors are added in
// weighted by srcAlpha * (1 - dstAlpha) and dstAlpha * (1 - srcAlpha).
//
// Only the x, y and z components of the returned value are written here.
SIMD::Float4 PixelRoutine::computeAdvancedBlendMode(int index, const SIMD::Float4 &src, const SIMD::Float4 &dst, const SIMD::Float4 &srcFactor, const SIMD::Float4 &dstFactor)
{
	// Multiplies all four components of `color` by those of `factor`.
	auto scaleBy = [](SIMD::Float4 &color, const SIMD::Float4 &factor) {
		color.x *= factor.x;
		color.y *= factor.y;
		color.z *= factor.z;
		color.w *= factor.w;
	};

	SIMD::Float4 srcColor = src;
	scaleBy(srcColor, srcFactor);

	SIMD::Float4 dstColor = dst;
	scaleBy(dstColor, dstFactor);

	premultiply(srcColor);
	premultiply(dstColor);

	SIMD::Float4 blendedColor;

	// Applies a (source, destination) -> result function to x, y and z in turn.
	auto perChannel = [&](auto &&channelOp) {
		blendedColor.x = channelOp(srcColor.x, dstColor.x);
		blendedColor.y = channelOp(srcColor.y, dstColor.y);
		blendedColor.z = channelOp(srcColor.z, dstColor.z);
	};

	switch(state.blendState[index].blendOperation)
	{
	case VK_BLEND_OP_MULTIPLY_EXT:
		perChannel([](SIMD::Float &s, SIMD::Float &d) { return (s * d); });
		break;
	case VK_BLEND_OP_SCREEN_EXT:
		perChannel([](SIMD::Float &s, SIMD::Float &d) { return s + d - (s * d); });
		break;
	case VK_BLEND_OP_OVERLAY_EXT:
		perChannel([this](SIMD::Float &s, SIMD::Float &d) { return blendOpOverlay(s, d); });
		break;
	case VK_BLEND_OP_DARKEN_EXT:
		perChannel([](SIMD::Float &s, SIMD::Float &d) { return Min(s, d); });
		break;
	case VK_BLEND_OP_LIGHTEN_EXT:
		perChannel([](SIMD::Float &s, SIMD::Float &d) { return Max(s, d); });
		break;
	case VK_BLEND_OP_COLORDODGE_EXT:
		perChannel([this](SIMD::Float &s, SIMD::Float &d) { return blendOpColorDodge(s, d); });
		break;
	case VK_BLEND_OP_COLORBURN_EXT:
		perChannel([this](SIMD::Float &s, SIMD::Float &d) { return blendOpColorBurn(s, d); });
		break;
	case VK_BLEND_OP_HARDLIGHT_EXT:
		perChannel([this](SIMD::Float &s, SIMD::Float &d) { return blendOpHardlight(s, d); });
		break;
	case VK_BLEND_OP_SOFTLIGHT_EXT:
		perChannel([this](SIMD::Float &s, SIMD::Float &d) { return blendOpSoftlight(s, d); });
		break;
	case VK_BLEND_OP_DIFFERENCE_EXT:
		perChannel([](SIMD::Float &s, SIMD::Float &d) { return Abs(s - d); });
		break;
	case VK_BLEND_OP_EXCLUSION_EXT:
		perChannel([](SIMD::Float &s, SIMD::Float &d) { return s + d - (s * d * 2.0f); });
		break;
	case VK_BLEND_OP_HSL_HUE_EXT:
		// Hue from source; saturation and luminosity from destination.
		setLumSat(srcColor, dstColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
		break;
	case VK_BLEND_OP_HSL_SATURATION_EXT:
		// Saturation from source; hue and luminosity from destination.
		setLumSat(dstColor, srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
		break;
	case VK_BLEND_OP_HSL_COLOR_EXT:
		// Hue and saturation from source; luminosity from destination.
		setLum(srcColor, dstColor, blendedColor.x, blendedColor.y, blendedColor.z);
		break;
	case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
		// Luminosity from source; hue and saturation from destination.
		setLum(dstColor, srcColor, blendedColor.x, blendedColor.y, blendedColor.z);
		break;
	default:
		UNSUPPORTED("Unsupported advanced VkBlendOp: %d", int(state.blendState[index].blendOperation));
		break;
	}

	// Weights of the three terms of the final sum.
	SIMD::Float bothWeight = srcColor.w * dstColor.w;
	SIMD::Float srcOnlyWeight = srcColor.w * (1.0f - dstColor.w);
	SIMD::Float dstOnlyWeight = dstColor.w * (1.0f - srcColor.w);

	blendedColor.x *= bothWeight;
	blendedColor.y *= bothWeight;
	blendedColor.z *= bothWeight;

	blendedColor.x += srcColor.x * srcOnlyWeight;
	blendedColor.y += srcColor.y * srcOnlyWeight;
	blendedColor.z += srcColor.z * srcOnlyWeight;

	blendedColor.x += dstColor.x * dstOnlyWeight;
	blendedColor.y += dstColor.y * dstOnlyWeight;
	blendedColor.z += dstColor.z * dstOnlyWeight;

	return blendedColor;
}
1616
blendFactorCanExceedFormatRange(VkBlendFactor blendFactor,vk::Format format)1617 bool PixelRoutine::blendFactorCanExceedFormatRange(VkBlendFactor blendFactor, vk::Format format)
1618 {
1619 switch(blendFactor)
1620 {
1621 case VK_BLEND_FACTOR_ZERO:
1622 case VK_BLEND_FACTOR_ONE:
1623 return false;
1624 case VK_BLEND_FACTOR_SRC_COLOR:
1625 case VK_BLEND_FACTOR_SRC_ALPHA:
1626 // Source values have been clamped after fragment shader execution if the attachment format is normalized.
1627 return false;
1628 case VK_BLEND_FACTOR_DST_COLOR:
1629 case VK_BLEND_FACTOR_DST_ALPHA:
1630 // Dest values have a valid range due to being read from the attachment.
1631 return false;
1632 case VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR:
1633 case VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA:
1634 case VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR:
1635 case VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA:
1636 // For signed formats, negative values cause the result to exceed 1.0.
1637 return format.isSignedNormalized();
1638 case VK_BLEND_FACTOR_SRC_ALPHA_SATURATE:
1639 // min(As, 1 - Ad)
1640 return false;
1641 case VK_BLEND_FACTOR_CONSTANT_COLOR:
1642 case VK_BLEND_FACTOR_CONSTANT_ALPHA:
1643 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR:
1644 case VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA:
1645 return false;
1646
1647 default:
1648 UNSUPPORTED("VkBlendFactor: %d", int(blendFactor));
1649 return false;
1650 }
1651 }
1652
alphaBlend(int index,const Pointer<Byte> & cBuffer,const SIMD::Float4 & sourceColor,const Int & x)1653 SIMD::Float4 PixelRoutine::alphaBlend(int index, const Pointer<Byte> &cBuffer, const SIMD::Float4 &sourceColor, const Int &x)
1654 {
1655 if(!state.blendState[index].alphaBlendEnable)
1656 {
1657 return sourceColor;
1658 }
1659
1660 vk::Format format = state.colorFormat[index];
1661 ASSERT(format.supportsColorAttachmentBlend());
1662
1663 Pointer<Byte> buffer = cBuffer;
1664 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
1665
1666 // texelColor holds four texel color values.
1667 // Note: Despite the type being Vector4f, the colors may be stored as
1668 // integers. Half-floats are stored as full 32-bit floats.
1669 // Non-float and non-fixed point formats are not alpha blended.
1670 Vector4f texelColor;
1671
1672 switch(format)
1673 {
1674 case VK_FORMAT_R32_SINT:
1675 case VK_FORMAT_R32_UINT:
1676 case VK_FORMAT_R32_SFLOAT:
1677 // FIXME: movlps
1678 buffer += 4 * x;
1679 texelColor.x.x = *Pointer<Float>(buffer + 0);
1680 texelColor.x.y = *Pointer<Float>(buffer + 4);
1681 buffer += pitchB;
1682 // FIXME: movhps
1683 texelColor.x.z = *Pointer<Float>(buffer + 0);
1684 texelColor.x.w = *Pointer<Float>(buffer + 4);
1685 texelColor.y = texelColor.z = texelColor.w = 1.0f;
1686 break;
1687 case VK_FORMAT_R32G32_SINT:
1688 case VK_FORMAT_R32G32_UINT:
1689 case VK_FORMAT_R32G32_SFLOAT:
1690 buffer += 8 * x;
1691 texelColor.x = *Pointer<Float4>(buffer, 16);
1692 buffer += pitchB;
1693 texelColor.y = *Pointer<Float4>(buffer, 16);
1694 texelColor.z = texelColor.x;
1695 texelColor.x = ShuffleLowHigh(texelColor.x, texelColor.y, 0x0202);
1696 texelColor.z = ShuffleLowHigh(texelColor.z, texelColor.y, 0x1313);
1697 texelColor.y = texelColor.z;
1698 texelColor.z = texelColor.w = 1.0f;
1699 break;
1700 case VK_FORMAT_R32G32B32A32_SFLOAT:
1701 case VK_FORMAT_R32G32B32A32_SINT:
1702 case VK_FORMAT_R32G32B32A32_UINT:
1703 buffer += 16 * x;
1704 texelColor.x = *Pointer<Float4>(buffer + 0, 16);
1705 texelColor.y = *Pointer<Float4>(buffer + 16, 16);
1706 buffer += pitchB;
1707 texelColor.z = *Pointer<Float4>(buffer + 0, 16);
1708 texelColor.w = *Pointer<Float4>(buffer + 16, 16);
1709 transpose4x4(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
1710 break;
1711 case VK_FORMAT_R16_UNORM:
1712 buffer += 2 * x;
1713 texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
1714 texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 2)));
1715 buffer += pitchB;
1716 texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
1717 texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 2)));
1718 texelColor.x *= (1.0f / 0xFFFF);
1719 texelColor.y = texelColor.z = texelColor.w = 1.0f;
1720 break;
1721 case VK_FORMAT_R16_SFLOAT:
1722 buffer += 2 * x;
1723 texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
1724 texelColor.x.y = Float(*Pointer<Half>(buffer + 2));
1725 buffer += pitchB;
1726 texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
1727 texelColor.x.w = Float(*Pointer<Half>(buffer + 2));
1728 texelColor.y = texelColor.z = texelColor.w = 1.0f;
1729 break;
1730 case VK_FORMAT_R16G16_UNORM:
1731 buffer += 4 * x;
1732 texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0)));
1733 texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 2)));
1734 texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 4)));
1735 texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 6)));
1736 buffer += pitchB;
1737 texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0)));
1738 texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 2)));
1739 texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 4)));
1740 texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 6)));
1741 texelColor.x *= (1.0f / 0xFFFF);
1742 texelColor.y *= (1.0f / 0xFFFF);
1743 texelColor.z = texelColor.w = 1.0f;
1744 break;
1745 case VK_FORMAT_R16G16_SFLOAT:
1746 buffer += 4 * x;
1747 texelColor.x.x = Float(*Pointer<Half>(buffer + 0));
1748 texelColor.y.x = Float(*Pointer<Half>(buffer + 2));
1749 texelColor.x.y = Float(*Pointer<Half>(buffer + 4));
1750 texelColor.y.y = Float(*Pointer<Half>(buffer + 6));
1751 buffer += pitchB;
1752 texelColor.x.z = Float(*Pointer<Half>(buffer + 0));
1753 texelColor.y.z = Float(*Pointer<Half>(buffer + 2));
1754 texelColor.x.w = Float(*Pointer<Half>(buffer + 4));
1755 texelColor.y.w = Float(*Pointer<Half>(buffer + 6));
1756 texelColor.z = texelColor.w = 1.0f;
1757 break;
1758 case VK_FORMAT_R16G16B16A16_UNORM:
1759 buffer += 8 * x;
1760 texelColor.x.x = Float(Int(*Pointer<UShort>(buffer + 0x0)));
1761 texelColor.y.x = Float(Int(*Pointer<UShort>(buffer + 0x2)));
1762 texelColor.z.x = Float(Int(*Pointer<UShort>(buffer + 0x4)));
1763 texelColor.w.x = Float(Int(*Pointer<UShort>(buffer + 0x6)));
1764 texelColor.x.y = Float(Int(*Pointer<UShort>(buffer + 0x8)));
1765 texelColor.y.y = Float(Int(*Pointer<UShort>(buffer + 0xa)));
1766 texelColor.z.y = Float(Int(*Pointer<UShort>(buffer + 0xc)));
1767 texelColor.w.y = Float(Int(*Pointer<UShort>(buffer + 0xe)));
1768 buffer += pitchB;
1769 texelColor.x.z = Float(Int(*Pointer<UShort>(buffer + 0x0)));
1770 texelColor.y.z = Float(Int(*Pointer<UShort>(buffer + 0x2)));
1771 texelColor.z.z = Float(Int(*Pointer<UShort>(buffer + 0x4)));
1772 texelColor.w.z = Float(Int(*Pointer<UShort>(buffer + 0x6)));
1773 texelColor.x.w = Float(Int(*Pointer<UShort>(buffer + 0x8)));
1774 texelColor.y.w = Float(Int(*Pointer<UShort>(buffer + 0xa)));
1775 texelColor.z.w = Float(Int(*Pointer<UShort>(buffer + 0xc)));
1776 texelColor.w.w = Float(Int(*Pointer<UShort>(buffer + 0xe)));
1777 texelColor.x *= (1.0f / 0xFFFF);
1778 texelColor.y *= (1.0f / 0xFFFF);
1779 texelColor.z *= (1.0f / 0xFFFF);
1780 texelColor.w *= (1.0f / 0xFFFF);
1781 break;
1782 case VK_FORMAT_R16G16B16A16_SFLOAT:
1783 buffer += 8 * x;
1784 texelColor.x.x = Float(*Pointer<Half>(buffer + 0x0));
1785 texelColor.y.x = Float(*Pointer<Half>(buffer + 0x2));
1786 texelColor.z.x = Float(*Pointer<Half>(buffer + 0x4));
1787 texelColor.w.x = Float(*Pointer<Half>(buffer + 0x6));
1788 texelColor.x.y = Float(*Pointer<Half>(buffer + 0x8));
1789 texelColor.y.y = Float(*Pointer<Half>(buffer + 0xa));
1790 texelColor.z.y = Float(*Pointer<Half>(buffer + 0xc));
1791 texelColor.w.y = Float(*Pointer<Half>(buffer + 0xe));
1792 buffer += pitchB;
1793 texelColor.x.z = Float(*Pointer<Half>(buffer + 0x0));
1794 texelColor.y.z = Float(*Pointer<Half>(buffer + 0x2));
1795 texelColor.z.z = Float(*Pointer<Half>(buffer + 0x4));
1796 texelColor.w.z = Float(*Pointer<Half>(buffer + 0x6));
1797 texelColor.x.w = Float(*Pointer<Half>(buffer + 0x8));
1798 texelColor.y.w = Float(*Pointer<Half>(buffer + 0xa));
1799 texelColor.z.w = Float(*Pointer<Half>(buffer + 0xc));
1800 texelColor.w.w = Float(*Pointer<Half>(buffer + 0xe));
1801 break;
1802 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
1803 buffer += 4 * x;
1804 texelColor.x = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1805 texelColor.y = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1806 buffer += pitchB;
1807 texelColor.z = r11g11b10Unpack(*Pointer<UInt>(buffer + 0));
1808 texelColor.w = r11g11b10Unpack(*Pointer<UInt>(buffer + 4));
1809 transpose4x3(texelColor.x, texelColor.y, texelColor.z, texelColor.w);
1810 texelColor.w = 1.0f;
1811 break;
1812 default:
1813 {
1814 // Attempt to read an integer based format and convert it to float
1815 Vector4s color;
1816 readPixel(index, cBuffer, x, color);
1817 texelColor.x = Float4(As<UShort4>(color.x)) * (1.0f / 0xFFFF);
1818 texelColor.y = Float4(As<UShort4>(color.y)) * (1.0f / 0xFFFF);
1819 texelColor.z = Float4(As<UShort4>(color.z)) * (1.0f / 0xFFFF);
1820 texelColor.w = Float4(As<UShort4>(color.w)) * (1.0f / 0xFFFF);
1821
1822 if(isSRGB(index))
1823 {
1824 texelColor.x = sRGBtoLinear(texelColor.x);
1825 texelColor.y = sRGBtoLinear(texelColor.y);
1826 texelColor.z = sRGBtoLinear(texelColor.z);
1827 }
1828 }
1829 break;
1830 }
1831
1832 ASSERT(SIMD::Width == 4);
1833 SIMD::Float4 destColor;
1834 destColor.x = texelColor.x;
1835 destColor.y = texelColor.y;
1836 destColor.z = texelColor.z;
1837 destColor.w = texelColor.w;
1838
1839 SIMD::Float4 sourceFactor;
1840 SIMD::Float4 destFactor;
1841
1842 blendFactorRGB(sourceFactor, sourceColor, destColor, state.blendState[index].sourceBlendFactor, format);
1843 blendFactorRGB(destFactor, sourceColor, destColor, state.blendState[index].destBlendFactor, format);
1844 blendFactorAlpha(sourceFactor.w, sourceColor.w, destColor.w, state.blendState[index].sourceBlendFactorAlpha, format);
1845 blendFactorAlpha(destFactor.w, sourceColor.w, destColor.w, state.blendState[index].destBlendFactorAlpha, format);
1846
1847 SIMD::Float4 blendedColor;
1848
1849 switch(state.blendState[index].blendOperation)
1850 {
1851 case VK_BLEND_OP_ADD:
1852 blendedColor.x = sourceColor.x * sourceFactor.x + destColor.x * destFactor.x;
1853 blendedColor.y = sourceColor.y * sourceFactor.y + destColor.y * destFactor.y;
1854 blendedColor.z = sourceColor.z * sourceFactor.z + destColor.z * destFactor.z;
1855 break;
1856 case VK_BLEND_OP_SUBTRACT:
1857 blendedColor.x = sourceColor.x * sourceFactor.x - destColor.x * destFactor.x;
1858 blendedColor.y = sourceColor.y * sourceFactor.y - destColor.y * destFactor.y;
1859 blendedColor.z = sourceColor.z * sourceFactor.z - destColor.z * destFactor.z;
1860 break;
1861 case VK_BLEND_OP_REVERSE_SUBTRACT:
1862 blendedColor.x = destColor.x * destFactor.x - sourceColor.x * sourceFactor.x;
1863 blendedColor.y = destColor.y * destFactor.y - sourceColor.y * sourceFactor.y;
1864 blendedColor.z = destColor.z * destFactor.z - sourceColor.z * sourceFactor.z;
1865 break;
1866 case VK_BLEND_OP_MIN:
1867 blendedColor.x = Min(sourceColor.x, destColor.x);
1868 blendedColor.y = Min(sourceColor.y, destColor.y);
1869 blendedColor.z = Min(sourceColor.z, destColor.z);
1870 break;
1871 case VK_BLEND_OP_MAX:
1872 blendedColor.x = Max(sourceColor.x, destColor.x);
1873 blendedColor.y = Max(sourceColor.y, destColor.y);
1874 blendedColor.z = Max(sourceColor.z, destColor.z);
1875 break;
1876 case VK_BLEND_OP_SRC_EXT:
1877 blendedColor.x = sourceColor.x;
1878 blendedColor.y = sourceColor.y;
1879 blendedColor.z = sourceColor.z;
1880 break;
1881 case VK_BLEND_OP_DST_EXT:
1882 blendedColor.x = destColor.x;
1883 blendedColor.y = destColor.y;
1884 blendedColor.z = destColor.z;
1885 break;
1886 case VK_BLEND_OP_ZERO_EXT:
1887 blendedColor.x = 0.0f;
1888 blendedColor.y = 0.0f;
1889 blendedColor.z = 0.0f;
1890 break;
1891 case VK_BLEND_OP_MULTIPLY_EXT:
1892 case VK_BLEND_OP_SCREEN_EXT:
1893 case VK_BLEND_OP_OVERLAY_EXT:
1894 case VK_BLEND_OP_DARKEN_EXT:
1895 case VK_BLEND_OP_LIGHTEN_EXT:
1896 case VK_BLEND_OP_COLORDODGE_EXT:
1897 case VK_BLEND_OP_COLORBURN_EXT:
1898 case VK_BLEND_OP_HARDLIGHT_EXT:
1899 case VK_BLEND_OP_SOFTLIGHT_EXT:
1900 case VK_BLEND_OP_DIFFERENCE_EXT:
1901 case VK_BLEND_OP_EXCLUSION_EXT:
1902 case VK_BLEND_OP_HSL_HUE_EXT:
1903 case VK_BLEND_OP_HSL_SATURATION_EXT:
1904 case VK_BLEND_OP_HSL_COLOR_EXT:
1905 case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1906 blendedColor = computeAdvancedBlendMode(index, sourceColor, destColor, sourceFactor, destFactor);
1907 break;
1908 default:
1909 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperation));
1910 }
1911
1912 switch(state.blendState[index].blendOperationAlpha)
1913 {
1914 case VK_BLEND_OP_ADD:
1915 blendedColor.w = sourceColor.w * sourceFactor.w + destColor.w * destFactor.w;
1916 break;
1917 case VK_BLEND_OP_SUBTRACT:
1918 blendedColor.w = sourceColor.w * sourceFactor.w - destColor.w * destFactor.w;
1919 break;
1920 case VK_BLEND_OP_REVERSE_SUBTRACT:
1921 blendedColor.w = destColor.w * destFactor.w - sourceColor.w * sourceFactor.w;
1922 break;
1923 case VK_BLEND_OP_MIN:
1924 blendedColor.w = Min(sourceColor.w, destColor.w);
1925 break;
1926 case VK_BLEND_OP_MAX:
1927 blendedColor.w = Max(sourceColor.w, destColor.w);
1928 break;
1929 case VK_BLEND_OP_SRC_EXT:
1930 blendedColor.w = sourceColor.w;
1931 break;
1932 case VK_BLEND_OP_DST_EXT:
1933 blendedColor.w = destColor.w;
1934 break;
1935 case VK_BLEND_OP_ZERO_EXT:
1936 blendedColor.w = 0.0f;
1937 break;
1938 case VK_BLEND_OP_MULTIPLY_EXT:
1939 case VK_BLEND_OP_SCREEN_EXT:
1940 case VK_BLEND_OP_OVERLAY_EXT:
1941 case VK_BLEND_OP_DARKEN_EXT:
1942 case VK_BLEND_OP_LIGHTEN_EXT:
1943 case VK_BLEND_OP_COLORDODGE_EXT:
1944 case VK_BLEND_OP_COLORBURN_EXT:
1945 case VK_BLEND_OP_HARDLIGHT_EXT:
1946 case VK_BLEND_OP_SOFTLIGHT_EXT:
1947 case VK_BLEND_OP_DIFFERENCE_EXT:
1948 case VK_BLEND_OP_EXCLUSION_EXT:
1949 case VK_BLEND_OP_HSL_HUE_EXT:
1950 case VK_BLEND_OP_HSL_SATURATION_EXT:
1951 case VK_BLEND_OP_HSL_COLOR_EXT:
1952 case VK_BLEND_OP_HSL_LUMINOSITY_EXT:
1953 // All of the currently supported 'advanced blend modes' compute the alpha the same way.
1954 blendedColor.w = sourceColor.w + destColor.w - (sourceColor.w * destColor.w);
1955 break;
1956 default:
1957 UNSUPPORTED("VkBlendOp: %d", int(state.blendState[index].blendOperationAlpha));
1958 }
1959
1960 return blendedColor;
1961 }
1962
writeColor(int index,const Pointer<Byte> & cBuffer,const Int & x,Vector4f & color,const Int & sMask,const Int & zMask,const Int & cMask)1963 void PixelRoutine::writeColor(int index, const Pointer<Byte> &cBuffer, const Int &x, Vector4f &color, const Int &sMask, const Int &zMask, const Int &cMask)
1964 {
1965 if(isSRGB(index))
1966 {
1967 color.x = linearToSRGB(color.x);
1968 color.y = linearToSRGB(color.y);
1969 color.z = linearToSRGB(color.z);
1970 }
1971
1972 vk::Format format = state.colorFormat[index];
1973 switch(format)
1974 {
1975 case VK_FORMAT_B8G8R8A8_UNORM:
1976 case VK_FORMAT_B8G8R8A8_SRGB:
1977 case VK_FORMAT_R8G8B8A8_UNORM:
1978 case VK_FORMAT_R8G8B8A8_SRGB:
1979 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
1980 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
1981 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1982 color.w = As<Float4>(RoundInt(color.w * 0xFF));
1983 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1984 color.z = As<Float4>(RoundInt(color.z * 0xFF));
1985 // [[fallthrough]]
1986 case VK_FORMAT_R8G8_UNORM:
1987 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1988 color.y = As<Float4>(RoundInt(color.y * 0xFF));
1989 //[[fallthrough]]
1990 case VK_FORMAT_R8_UNORM:
1991 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1992 color.x = As<Float4>(RoundInt(color.x * 0xFF));
1993 break;
1994 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
1995 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
1996 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
1997 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
1998 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
1999 color.w = As<Float4>(RoundInt(color.w * 0xF));
2000 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2001 color.z = As<Float4>(RoundInt(color.z * 0xF));
2002 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2003 color.y = As<Float4>(RoundInt(color.y * 0xF));
2004 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2005 color.x = As<Float4>(RoundInt(color.x * 0xF));
2006 break;
2007 case VK_FORMAT_B5G6R5_UNORM_PACK16:
2008 case VK_FORMAT_R5G6B5_UNORM_PACK16:
2009 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2010 color.z = As<Float4>(RoundInt(color.z * 0x1F));
2011 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2012 color.y = As<Float4>(RoundInt(color.y * 0x3F));
2013 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2014 color.x = As<Float4>(RoundInt(color.x * 0x1F));
2015 break;
2016 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2017 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2018 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2019 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2020 color.w = As<Float4>(RoundInt(color.w));
2021 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2022 color.z = As<Float4>(RoundInt(color.z * 0x1F));
2023 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2024 color.y = As<Float4>(RoundInt(color.y * 0x1F));
2025 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2026 color.x = As<Float4>(RoundInt(color.x * 0x1F));
2027 break;
2028 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2029 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2030 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2031 color.w = As<Float4>(RoundInt(color.w * 0x3));
2032 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2033 color.z = As<Float4>(RoundInt(color.z * 0x3FF));
2034 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2035 color.y = As<Float4>(RoundInt(color.y * 0x3FF));
2036 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2037 color.x = As<Float4>(RoundInt(color.x * 0x3FF));
2038 break;
2039 case VK_FORMAT_R16G16B16A16_UNORM:
2040 color.w = Min(Max(color.w, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2041 color.w = As<Float4>(RoundInt(color.w * 0xFFFF));
2042 color.z = Min(Max(color.z, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2043 color.z = As<Float4>(RoundInt(color.z * 0xFFFF));
2044 // [[fallthrough]]
2045 case VK_FORMAT_R16G16_UNORM:
2046 color.y = Min(Max(color.y, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2047 color.y = As<Float4>(RoundInt(color.y * 0xFFFF));
2048 //[[fallthrough]]
2049 case VK_FORMAT_R16_UNORM:
2050 color.x = Min(Max(color.x, 0.0f), 1.0f); // TODO(b/204560089): Omit clamp if redundant
2051 color.x = As<Float4>(RoundInt(color.x * 0xFFFF));
2052 break;
2053 default:
2054 // TODO(b/204560089): Omit clamp if redundant
2055 if(format.isUnsignedNormalized())
2056 {
2057 color.x = Min(Max(color.x, 0.0f), 1.0f);
2058 color.y = Min(Max(color.y, 0.0f), 1.0f);
2059 color.z = Min(Max(color.z, 0.0f), 1.0f);
2060 color.w = Min(Max(color.w, 0.0f), 1.0f);
2061 }
2062 else if(format.isSignedNormalized())
2063 {
2064 color.x = Min(Max(color.x, -1.0f), 1.0f);
2065 color.y = Min(Max(color.y, -1.0f), 1.0f);
2066 color.z = Min(Max(color.z, -1.0f), 1.0f);
2067 color.w = Min(Max(color.w, -1.0f), 1.0f);
2068 }
2069 }
2070
2071 switch(format)
2072 {
2073 case VK_FORMAT_R16_SFLOAT:
2074 case VK_FORMAT_R32_SFLOAT:
2075 case VK_FORMAT_R32_SINT:
2076 case VK_FORMAT_R32_UINT:
2077 case VK_FORMAT_R16_UNORM:
2078 case VK_FORMAT_R16_SINT:
2079 case VK_FORMAT_R16_UINT:
2080 case VK_FORMAT_R8_SINT:
2081 case VK_FORMAT_R8_UINT:
2082 case VK_FORMAT_R8_UNORM:
2083 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2084 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2085 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2086 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2087 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2088 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2089 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2090 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2091 case VK_FORMAT_B5G6R5_UNORM_PACK16:
2092 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2093 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2094 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2095 case VK_FORMAT_R5G6B5_UNORM_PACK16:
2096 break;
2097 case VK_FORMAT_R16G16_SFLOAT:
2098 case VK_FORMAT_R32G32_SFLOAT:
2099 case VK_FORMAT_R32G32_SINT:
2100 case VK_FORMAT_R32G32_UINT:
2101 case VK_FORMAT_R16G16_UNORM:
2102 case VK_FORMAT_R16G16_SINT:
2103 case VK_FORMAT_R16G16_UINT:
2104 case VK_FORMAT_R8G8_SINT:
2105 case VK_FORMAT_R8G8_UINT:
2106 case VK_FORMAT_R8G8_UNORM:
2107 color.z = color.x;
2108 color.x = UnpackLow(color.x, color.y);
2109 color.z = UnpackHigh(color.z, color.y);
2110 color.y = color.z;
2111 break;
2112 case VK_FORMAT_R16G16B16A16_SFLOAT:
2113 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2114 case VK_FORMAT_R32G32B32A32_SFLOAT:
2115 case VK_FORMAT_R32G32B32A32_SINT:
2116 case VK_FORMAT_R32G32B32A32_UINT:
2117 case VK_FORMAT_R16G16B16A16_UNORM:
2118 case VK_FORMAT_R16G16B16A16_SINT:
2119 case VK_FORMAT_R16G16B16A16_UINT:
2120 case VK_FORMAT_R8G8B8A8_SINT:
2121 case VK_FORMAT_R8G8B8A8_UINT:
2122 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2123 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2124 case VK_FORMAT_R8G8B8A8_UNORM:
2125 case VK_FORMAT_R8G8B8A8_SRGB:
2126 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
2127 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
2128 transpose4x4(color.x, color.y, color.z, color.w);
2129 break;
2130 case VK_FORMAT_B8G8R8A8_UNORM:
2131 case VK_FORMAT_B8G8R8A8_SRGB:
2132 transpose4x4zyxw(color.z, color.y, color.x, color.w);
2133 break;
2134 default:
2135 UNSUPPORTED("VkFormat: %d", int(format));
2136 }
2137
2138 int writeMask = state.colorWriteActive(index);
2139 if(format.isBGRformat())
2140 {
2141 // For BGR formats, flip R and B channels in the channels mask
2142 writeMask = (writeMask & 0x0000000A) | (writeMask & 0x00000001) << 2 | (writeMask & 0x00000004) >> 2;
2143 }
2144
2145 Int xMask; // Combination of all masks
2146
2147 if(state.depthTestActive)
2148 {
2149 xMask = zMask;
2150 }
2151 else
2152 {
2153 xMask = cMask;
2154 }
2155
2156 if(state.stencilActive)
2157 {
2158 xMask &= sMask;
2159 }
2160
2161 Pointer<Byte> buffer = cBuffer;
2162 Int pitchB = *Pointer<Int>(data + OFFSET(DrawData, colorPitchB[index]));
2163 Float4 value;
2164
2165 switch(format)
2166 {
2167 case VK_FORMAT_R32_SFLOAT:
2168 case VK_FORMAT_R32_SINT:
2169 case VK_FORMAT_R32_UINT:
2170 if(writeMask & 0x00000001)
2171 {
2172 buffer += 4 * x;
2173
2174 // FIXME: movlps
2175 value.x = *Pointer<Float>(buffer + 0);
2176 value.y = *Pointer<Float>(buffer + 4);
2177
2178 buffer += pitchB;
2179
2180 // FIXME: movhps
2181 value.z = *Pointer<Float>(buffer + 0);
2182 value.w = *Pointer<Float>(buffer + 4);
2183
2184 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2185 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2186 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2187
2188 // FIXME: movhps
2189 *Pointer<Float>(buffer + 0) = color.x.z;
2190 *Pointer<Float>(buffer + 4) = color.x.w;
2191
2192 buffer -= pitchB;
2193
2194 // FIXME: movlps
2195 *Pointer<Float>(buffer + 0) = color.x.x;
2196 *Pointer<Float>(buffer + 4) = color.x.y;
2197 }
2198 break;
2199 case VK_FORMAT_R16_SFLOAT:
2200 if(writeMask & 0x00000001)
2201 {
2202 buffer += 2 * x;
2203
2204 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 0);
2205 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 1);
2206
2207 buffer += pitchB;
2208
2209 value = Insert(value, Float(*Pointer<Half>(buffer + 0)), 2);
2210 value = Insert(value, Float(*Pointer<Half>(buffer + 2)), 3);
2211
2212 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2213 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2214 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2215
2216 *Pointer<Half>(buffer + 0) = Half(color.x.z);
2217 *Pointer<Half>(buffer + 2) = Half(color.x.w);
2218
2219 buffer -= pitchB;
2220
2221 *Pointer<Half>(buffer + 0) = Half(color.x.x);
2222 *Pointer<Half>(buffer + 2) = Half(color.x.y);
2223 }
2224 break;
2225 case VK_FORMAT_R16_UNORM:
2226 case VK_FORMAT_R16_SINT:
2227 case VK_FORMAT_R16_UINT:
2228 if(writeMask & 0x00000001)
2229 {
2230 buffer += 2 * x;
2231
2232 UShort4 xyzw;
2233 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 0));
2234
2235 buffer += pitchB;
2236
2237 xyzw = As<UShort4>(Insert(As<Int2>(xyzw), *Pointer<Int>(buffer), 1));
2238 value = As<Float4>(Int4(xyzw));
2239
2240 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X) + xMask * 16, 16));
2241 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X) + xMask * 16, 16));
2242 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2243
2244 Float component = color.x.z;
2245 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2246 component = color.x.w;
2247 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2248
2249 buffer -= pitchB;
2250
2251 component = color.x.x;
2252 *Pointer<UShort>(buffer + 0) = UShort(As<Int>(component));
2253 component = color.x.y;
2254 *Pointer<UShort>(buffer + 2) = UShort(As<Int>(component));
2255 }
2256 break;
2257 case VK_FORMAT_R8_SINT:
2258 case VK_FORMAT_R8_UINT:
2259 case VK_FORMAT_R8_UNORM:
2260 if(writeMask & 0x00000001)
2261 {
2262 buffer += x;
2263
2264 UInt xyzw, packedCol;
2265
2266 xyzw = UInt(*Pointer<UShort>(buffer)) & 0xFFFFu;
2267 buffer += pitchB;
2268 xyzw |= UInt(*Pointer<UShort>(buffer)) << 16;
2269
2270 Short4 tmpCol = Short4(As<Int4>(color.x));
2271 if(format == VK_FORMAT_R8_SINT)
2272 {
2273 tmpCol = As<Short4>(PackSigned(tmpCol, tmpCol));
2274 }
2275 else
2276 {
2277 tmpCol = As<Short4>(PackUnsigned(tmpCol, tmpCol));
2278 }
2279 packedCol = Extract(As<Int2>(tmpCol), 0);
2280
2281 packedCol = (packedCol & *Pointer<UInt>(constants + OFFSET(Constants, maskB4Q) + 8 * xMask)) |
2282 (xyzw & *Pointer<UInt>(constants + OFFSET(Constants, invMaskB4Q) + 8 * xMask));
2283
2284 *Pointer<UShort>(buffer) = UShort(packedCol >> 16);
2285 buffer -= pitchB;
2286 *Pointer<UShort>(buffer) = UShort(packedCol);
2287 }
2288 break;
2289 case VK_FORMAT_R32G32_SFLOAT:
2290 case VK_FORMAT_R32G32_SINT:
2291 case VK_FORMAT_R32G32_UINT:
2292 buffer += 8 * x;
2293
2294 value = *Pointer<Float4>(buffer);
2295
2296 if((writeMask & 0x00000003) != 0x00000003)
2297 {
2298 Float4 masked = value;
2299 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[writeMask & 0x3][0])));
2300 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~writeMask & 0x3][0])));
2301 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2302 }
2303
2304 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16, 16));
2305 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ01X) + xMask * 16, 16));
2306 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2307 *Pointer<Float4>(buffer) = color.x;
2308
2309 buffer += pitchB;
2310
2311 value = *Pointer<Float4>(buffer);
2312
2313 if((writeMask & 0x00000003) != 0x00000003)
2314 {
2315 Float4 masked;
2316
2317 masked = value;
2318 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[writeMask & 0x3][0])));
2319 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, maskD01X[~writeMask & 0x3][0])));
2320 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2321 }
2322
2323 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16, 16));
2324 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskQ23X) + xMask * 16, 16));
2325 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2326 *Pointer<Float4>(buffer) = color.y;
2327 break;
2328 case VK_FORMAT_R16G16_SFLOAT:
2329 if((writeMask & 0x00000003) != 0x0)
2330 {
2331 buffer += 4 * x;
2332
2333 UInt2 rgbaMask;
2334 UInt2 packedCol;
2335 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2336 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2337
2338 UShort4 value = *Pointer<UShort4>(buffer);
2339 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2340 if((writeMask & 0x3) != 0x3)
2341 {
2342 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[writeMask & 0x3]));
2343 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2344 mergedMask &= rgbaMask;
2345 }
2346 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2347
2348 buffer += pitchB;
2349
2350 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 0);
2351 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 1);
2352 value = *Pointer<UShort4>(buffer);
2353 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2354 if((writeMask & 0x3) != 0x3)
2355 {
2356 mergedMask &= rgbaMask;
2357 }
2358 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2359 }
2360 break;
2361 case VK_FORMAT_R16G16_UNORM:
2362 case VK_FORMAT_R16G16_SINT:
2363 case VK_FORMAT_R16G16_UINT:
2364 if((writeMask & 0x00000003) != 0x0)
2365 {
2366 buffer += 4 * x;
2367
2368 UInt2 rgbaMask;
2369 UShort4 packedCol = UShort4(As<Int4>(color.x));
2370 UShort4 value = *Pointer<UShort4>(buffer);
2371 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2372 if((writeMask & 0x3) != 0x3)
2373 {
2374 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[writeMask & 0x3]));
2375 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2376 mergedMask &= rgbaMask;
2377 }
2378 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2379
2380 buffer += pitchB;
2381
2382 packedCol = UShort4(As<Int4>(color.y));
2383 value = *Pointer<UShort4>(buffer);
2384 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2385 if((writeMask & 0x3) != 0x3)
2386 {
2387 mergedMask &= rgbaMask;
2388 }
2389 *Pointer<UInt2>(buffer) = (As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(value) & ~mergedMask);
2390 }
2391 break;
2392 case VK_FORMAT_R8G8_SINT:
2393 case VK_FORMAT_R8G8_UINT:
2394 case VK_FORMAT_R8G8_UNORM:
2395 if((writeMask & 0x00000003) != 0x0)
2396 {
2397 buffer += 2 * x;
2398
2399 Int2 xyzw, packedCol;
2400
2401 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 0);
2402 buffer += pitchB;
2403 xyzw = Insert(xyzw, *Pointer<Int>(buffer), 1);
2404
2405 if(format == VK_FORMAT_R8G8_SINT)
2406 {
2407 packedCol = As<Int2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2408 }
2409 else
2410 {
2411 packedCol = As<Int2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2412 }
2413
2414 UInt2 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q) + xMask * 8);
2415 if((writeMask & 0x3) != 0x3)
2416 {
2417 Int tmpMask = *Pointer<Int>(constants + OFFSET(Constants, maskB4Q[5 * (writeMask & 0x3)]));
2418 UInt2 rgbaMask = As<UInt2>(Int2(tmpMask, tmpMask));
2419 mergedMask &= rgbaMask;
2420 }
2421
2422 packedCol = As<Int2>((As<UInt2>(packedCol) & mergedMask) | (As<UInt2>(xyzw) & ~mergedMask));
2423
2424 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 1));
2425 buffer -= pitchB;
2426 *Pointer<UInt>(buffer) = As<UInt>(Extract(packedCol, 0));
2427 }
2428 break;
2429 case VK_FORMAT_R32G32B32A32_SFLOAT:
2430 case VK_FORMAT_R32G32B32A32_SINT:
2431 case VK_FORMAT_R32G32B32A32_UINT:
2432 buffer += 16 * x;
2433
2434 {
2435 value = *Pointer<Float4>(buffer, 16);
2436
2437 if(writeMask != 0x0000000F)
2438 {
2439 Float4 masked = value;
2440 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2441 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2442 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(masked));
2443 }
2444
2445 color.x = As<Float4>(As<Int4>(color.x) & *Pointer<Int4>(constants + OFFSET(Constants, maskX0X) + xMask * 16, 16));
2446 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX0X) + xMask * 16, 16));
2447 color.x = As<Float4>(As<Int4>(color.x) | As<Int4>(value));
2448 *Pointer<Float4>(buffer, 16) = color.x;
2449 }
2450
2451 {
2452 value = *Pointer<Float4>(buffer + 16, 16);
2453
2454 if(writeMask != 0x0000000F)
2455 {
2456 Float4 masked = value;
2457 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2458 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2459 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(masked));
2460 }
2461
2462 color.y = As<Float4>(As<Int4>(color.y) & *Pointer<Int4>(constants + OFFSET(Constants, maskX1X) + xMask * 16, 16));
2463 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX1X) + xMask * 16, 16));
2464 color.y = As<Float4>(As<Int4>(color.y) | As<Int4>(value));
2465 *Pointer<Float4>(buffer + 16, 16) = color.y;
2466 }
2467
2468 buffer += pitchB;
2469
2470 {
2471 value = *Pointer<Float4>(buffer, 16);
2472
2473 if(writeMask != 0x0000000F)
2474 {
2475 Float4 masked = value;
2476 color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2477 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2478 color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(masked));
2479 }
2480
2481 color.z = As<Float4>(As<Int4>(color.z) & *Pointer<Int4>(constants + OFFSET(Constants, maskX2X) + xMask * 16, 16));
2482 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX2X) + xMask * 16, 16));
2483 color.z = As<Float4>(As<Int4>(color.z) | As<Int4>(value));
2484 *Pointer<Float4>(buffer, 16) = color.z;
2485 }
2486
2487 {
2488 value = *Pointer<Float4>(buffer + 16, 16);
2489
2490 if(writeMask != 0x0000000F)
2491 {
2492 Float4 masked = value;
2493 color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskD4X[writeMask])));
2494 masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskD4X[writeMask])));
2495 color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(masked));
2496 }
2497
2498 color.w = As<Float4>(As<Int4>(color.w) & *Pointer<Int4>(constants + OFFSET(Constants, maskX3X) + xMask * 16, 16));
2499 value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants, invMaskX3X) + xMask * 16, 16));
2500 color.w = As<Float4>(As<Int4>(color.w) | As<Int4>(value));
2501 *Pointer<Float4>(buffer + 16, 16) = color.w;
2502 }
2503 break;
2504 case VK_FORMAT_R16G16B16A16_SFLOAT:
2505 if((writeMask & 0x0000000F) != 0x0)
2506 {
2507 buffer += 8 * x;
2508
2509 UInt4 rgbaMask;
2510 UInt4 value = *Pointer<UInt4>(buffer);
2511 UInt4 packedCol;
2512 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.y))) << 16) | UInt(As<UShort>(Half(color.x.x))), 0);
2513 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.x.w))) << 16) | UInt(As<UShort>(Half(color.x.z))), 1);
2514 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.y))) << 16) | UInt(As<UShort>(Half(color.y.x))), 2);
2515 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.y.w))) << 16) | UInt(As<UShort>(Half(color.y.z))), 3);
2516 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2517 if((writeMask & 0xF) != 0xF)
2518 {
2519 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[writeMask]));
2520 rgbaMask = UInt4(tmpMask, tmpMask);
2521 mergedMask &= rgbaMask;
2522 }
2523 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2524
2525 buffer += pitchB;
2526
2527 value = *Pointer<UInt4>(buffer);
2528 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.y))) << 16) | UInt(As<UShort>(Half(color.z.x))), 0);
2529 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.z.w))) << 16) | UInt(As<UShort>(Half(color.z.z))), 1);
2530 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.y))) << 16) | UInt(As<UShort>(Half(color.w.x))), 2);
2531 packedCol = Insert(packedCol, (UInt(As<UShort>(Half(color.w.w))) << 16) | UInt(As<UShort>(Half(color.w.z))), 3);
2532 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2533 if((writeMask & 0xF) != 0xF)
2534 {
2535 mergedMask &= rgbaMask;
2536 }
2537 *Pointer<UInt4>(buffer) = (packedCol & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2538 }
2539 break;
2540 case VK_FORMAT_B10G11R11_UFLOAT_PACK32:
2541 if((writeMask & 0x7) != 0x0)
2542 {
2543 buffer += 4 * x;
2544
2545 UInt4 packedCol;
2546 packedCol = Insert(packedCol, r11g11b10Pack(color.x), 0);
2547 packedCol = Insert(packedCol, r11g11b10Pack(color.y), 1);
2548 packedCol = Insert(packedCol, r11g11b10Pack(color.z), 2);
2549 packedCol = Insert(packedCol, r11g11b10Pack(color.w), 3);
2550
2551 UInt4 value;
2552 value = Insert(value, *Pointer<UInt>(buffer + 0), 0);
2553 value = Insert(value, *Pointer<UInt>(buffer + 4), 1);
2554 buffer += pitchB;
2555 value = Insert(value, *Pointer<UInt>(buffer + 0), 2);
2556 value = Insert(value, *Pointer<UInt>(buffer + 4), 3);
2557
2558 UInt4 mask = *Pointer<UInt4>(constants + OFFSET(Constants, maskD4X[0]) + xMask * 16, 16);
2559 if((writeMask & 0x7) != 0x7)
2560 {
2561 mask &= *Pointer<UInt4>(constants + OFFSET(Constants, mask11X[writeMask & 0x7]), 16);
2562 }
2563 value = (packedCol & mask) | (value & ~mask);
2564
2565 *Pointer<UInt>(buffer + 0) = value.z;
2566 *Pointer<UInt>(buffer + 4) = value.w;
2567 buffer -= pitchB;
2568 *Pointer<UInt>(buffer + 0) = value.x;
2569 *Pointer<UInt>(buffer + 4) = value.y;
2570 }
2571 break;
2572 case VK_FORMAT_R16G16B16A16_UNORM:
2573 case VK_FORMAT_R16G16B16A16_SINT:
2574 case VK_FORMAT_R16G16B16A16_UINT:
2575 if((writeMask & 0x0000000F) != 0x0)
2576 {
2577 buffer += 8 * x;
2578
2579 UInt4 rgbaMask;
2580 UShort8 value = *Pointer<UShort8>(buffer);
2581 UShort8 packedCol = UShort8(UShort4(As<Int4>(color.x)), UShort4(As<Int4>(color.y)));
2582 UInt4 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ01X) + xMask * 16);
2583 if((writeMask & 0xF) != 0xF)
2584 {
2585 UInt2 tmpMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskW4Q[writeMask]));
2586 rgbaMask = UInt4(tmpMask, tmpMask);
2587 mergedMask &= rgbaMask;
2588 }
2589 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2590
2591 buffer += pitchB;
2592
2593 value = *Pointer<UShort8>(buffer);
2594 packedCol = UShort8(UShort4(As<Int4>(color.z)), UShort4(As<Int4>(color.w)));
2595 mergedMask = *Pointer<UInt4>(constants + OFFSET(Constants, maskQ23X) + xMask * 16);
2596 if((writeMask & 0xF) != 0xF)
2597 {
2598 mergedMask &= rgbaMask;
2599 }
2600 *Pointer<UInt4>(buffer) = (As<UInt4>(packedCol) & mergedMask) | (As<UInt4>(value) & ~mergedMask);
2601 }
2602 break;
2603 case VK_FORMAT_B8G8R8A8_UNORM:
2604 case VK_FORMAT_B8G8R8A8_SRGB:
2605 case VK_FORMAT_R8G8B8A8_SINT:
2606 case VK_FORMAT_R8G8B8A8_UINT:
2607 case VK_FORMAT_A8B8G8R8_UINT_PACK32:
2608 case VK_FORMAT_A8B8G8R8_SINT_PACK32:
2609 case VK_FORMAT_R8G8B8A8_UNORM:
2610 case VK_FORMAT_R8G8B8A8_SRGB:
2611 case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
2612 case VK_FORMAT_A8B8G8R8_SRGB_PACK32:
2613 if((writeMask & 0x0000000F) != 0x0)
2614 {
2615 UInt2 value, packedCol, mergedMask;
2616
2617 buffer += 4 * x;
2618
2619 bool isSigned = !format.isUnsigned();
2620
2621 if(isSigned)
2622 {
2623 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2624 }
2625 else
2626 {
2627 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.x)), Short4(As<Int4>(color.y))));
2628 }
2629 value = *Pointer<UInt2>(buffer, 16);
2630 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2631 if(writeMask != 0xF)
2632 {
2633 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[writeMask]));
2634 }
2635 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2636
2637 buffer += pitchB;
2638
2639 if(isSigned)
2640 {
2641 packedCol = As<UInt2>(PackSigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
2642 }
2643 else
2644 {
2645 packedCol = As<UInt2>(PackUnsigned(Short4(As<Int4>(color.z)), Short4(As<Int4>(color.w))));
2646 }
2647 value = *Pointer<UInt2>(buffer, 16);
2648 mergedMask = *Pointer<UInt2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2649 if(writeMask != 0xF)
2650 {
2651 mergedMask &= *Pointer<UInt2>(constants + OFFSET(Constants, maskB4Q[writeMask]));
2652 }
2653 *Pointer<UInt2>(buffer) = (packedCol & mergedMask) | (value & ~mergedMask);
2654 }
2655 break;
2656 case VK_FORMAT_A2B10G10R10_UINT_PACK32:
2657 case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
2658 if((writeMask & 0x0000000F) != 0x0)
2659 {
2660 Int2 mergedMask, packedCol, value;
2661 Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
2662 ((As<Int4>(color.z) & Int4(0x3ff)) << 20) |
2663 ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
2664 ((As<Int4>(color.x) & Int4(0x3ff)));
2665
2666 buffer += 4 * x;
2667 value = *Pointer<Int2>(buffer, 16);
2668 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2669 if(writeMask != 0xF)
2670 {
2671 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2672 }
2673 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2674
2675 buffer += pitchB;
2676
2677 value = *Pointer<Int2>(buffer, 16);
2678 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2679 if(writeMask != 0xF)
2680 {
2681 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2682 }
2683 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2684 }
2685 break;
2686 case VK_FORMAT_A2R10G10B10_UINT_PACK32:
2687 case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
2688 if((writeMask & 0x0000000F) != 0x0)
2689 {
2690 Int2 mergedMask, packedCol, value;
2691 Int4 packed = ((As<Int4>(color.w) & Int4(0x3)) << 30) |
2692 ((As<Int4>(color.x) & Int4(0x3ff)) << 20) |
2693 ((As<Int4>(color.y) & Int4(0x3ff)) << 10) |
2694 ((As<Int4>(color.z) & Int4(0x3ff)));
2695
2696 buffer += 4 * x;
2697 value = *Pointer<Int2>(buffer, 16);
2698 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD01Q) + xMask * 8);
2699 if(writeMask != 0xF)
2700 {
2701 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2702 }
2703 *Pointer<Int2>(buffer) = (As<Int2>(packed) & mergedMask) | (value & ~mergedMask);
2704
2705 buffer += pitchB;
2706
2707 value = *Pointer<Int2>(buffer, 16);
2708 mergedMask = *Pointer<Int2>(constants + OFFSET(Constants, maskD23Q) + xMask * 8);
2709 if(writeMask != 0xF)
2710 {
2711 mergedMask &= *Pointer<Int2>(constants + OFFSET(Constants, mask10Q[writeMask]));
2712 }
2713 *Pointer<Int2>(buffer) = (As<Int2>(Int4(packed.zwww)) & mergedMask) | (value & ~mergedMask);
2714 }
2715 break;
2716 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2717 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2718 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2719 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2720 {
2721 buffer += 2 * x;
2722 Int value = *Pointer<Int>(buffer);
2723
2724 Int channelMask;
2725 Short4 current;
2726 switch(format)
2727 {
2728 case VK_FORMAT_R4G4B4A4_UNORM_PACK16:
2729 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
2730 current = (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 12 |
2731 (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 8 |
2732 (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 4 |
2733 (UShort4(As<Int4>(color.w)) & UShort4(0xF));
2734 break;
2735 case VK_FORMAT_B4G4R4A4_UNORM_PACK16:
2736 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4argbQ[writeMask][0]));
2737 current = (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 12 |
2738 (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 8 |
2739 (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 4 |
2740 (UShort4(As<Int4>(color.w)) & UShort4(0xF));
2741 break;
2742 case VK_FORMAT_A4R4G4B4_UNORM_PACK16:
2743 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
2744 current = (UShort4(As<Int4>(color.w)) & UShort4(0xF)) << 12 |
2745 (UShort4(As<Int4>(color.x)) & UShort4(0xF)) << 8 |
2746 (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 4 |
2747 (UShort4(As<Int4>(color.z)) & UShort4(0xF));
2748 break;
2749 case VK_FORMAT_A4B4G4R4_UNORM_PACK16:
2750 channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask4rgbaQ[writeMask][0]));
2751 current = (UShort4(As<Int4>(color.w)) & UShort4(0xF)) << 12 |
2752 (UShort4(As<Int4>(color.z)) & UShort4(0xF)) << 8 |
2753 (UShort4(As<Int4>(color.y)) & UShort4(0xF)) << 4 |
2754 (UShort4(As<Int4>(color.x)) & UShort4(0xF));
2755 break;
2756 default:
2757 UNREACHABLE("Format: %s", vk::Stringify(format).c_str());
2758 }
2759
2760 Int c01 = Extract(As<Int2>(current), 0);
2761 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2762 if(writeMask != 0x0000000F)
2763 {
2764 mask01 &= channelMask;
2765 }
2766 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2767
2768 buffer += pitchB;
2769 value = *Pointer<Int>(buffer);
2770
2771 Int c23 = Extract(As<Int2>(current), 1);
2772 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2773 if(writeMask != 0x0000000F)
2774 {
2775 mask23 &= channelMask;
2776 }
2777 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2778 }
2779 break;
2780 case VK_FORMAT_R5G5B5A1_UNORM_PACK16:
2781 {
2782 buffer += 2 * x;
2783 Int value = *Pointer<Int>(buffer);
2784
2785 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskr5g5b5a1Q[writeMask][0]));
2786 Short4 current = (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 11 |
2787 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 6 |
2788 (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 1 |
2789 (UShort4(As<Int4>(color.w)) & UShort4(0x1));
2790
2791 Int c01 = Extract(As<Int2>(current), 0);
2792 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2793 if(writeMask != 0x0000000F)
2794 {
2795 mask01 &= channelMask;
2796 }
2797 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2798
2799 buffer += pitchB;
2800 value = *Pointer<Int>(buffer);
2801
2802 Int c23 = Extract(As<Int2>(current), 1);
2803 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2804 if(writeMask != 0x0000000F)
2805 {
2806 mask23 &= channelMask;
2807 }
2808 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2809 }
2810 break;
2811 case VK_FORMAT_B5G5R5A1_UNORM_PACK16:
2812 {
2813 buffer += 2 * x;
2814 Int value = *Pointer<Int>(buffer);
2815
2816 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, maskb5g5r5a1Q[writeMask][0]));
2817 Short4 current = (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 11 |
2818 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 6 |
2819 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 1 |
2820 (UShort4(As<Int4>(color.w)) & UShort4(0x1));
2821
2822 Int c01 = Extract(As<Int2>(current), 0);
2823 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2824 if(writeMask != 0x0000000F)
2825 {
2826 mask01 &= channelMask;
2827 }
2828 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2829
2830 buffer += pitchB;
2831 value = *Pointer<Int>(buffer);
2832
2833 Int c23 = Extract(As<Int2>(current), 1);
2834 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2835 if(writeMask != 0x0000000F)
2836 {
2837 mask23 &= channelMask;
2838 }
2839 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2840 }
2841 break;
2842 case VK_FORMAT_A1R5G5B5_UNORM_PACK16:
2843 {
2844 buffer += 2 * x;
2845 Int value = *Pointer<Int>(buffer);
2846
2847 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask5551Q[writeMask][0]));
2848 Short4 current = (UShort4(As<Int4>(color.w)) & UShort4(0x1)) << 15 |
2849 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 10 |
2850 (UShort4(As<Int4>(color.y)) & UShort4(0x1F)) << 5 |
2851 (UShort4(As<Int4>(color.z)) & UShort4(0x1F));
2852
2853 Int c01 = Extract(As<Int2>(current), 0);
2854 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2855 if(writeMask != 0x0000000F)
2856 {
2857 mask01 &= channelMask;
2858 }
2859 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2860
2861 buffer += pitchB;
2862 value = *Pointer<Int>(buffer);
2863
2864 Int c23 = Extract(As<Int2>(current), 1);
2865 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2866 if(writeMask != 0x0000000F)
2867 {
2868 mask23 &= channelMask;
2869 }
2870 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2871 }
2872 break;
2873 case VK_FORMAT_R5G6B5_UNORM_PACK16:
2874 {
2875 buffer += 2 * x;
2876 Int value = *Pointer<Int>(buffer);
2877
2878 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));
2879 Short4 current = (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) |
2880 (UShort4(As<Int4>(color.y)) & UShort4(0x3F)) << 5 |
2881 (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) << 11;
2882
2883 Int c01 = Extract(As<Int2>(current), 0);
2884 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2885 if((writeMask & 0x00000007) != 0x00000007)
2886 {
2887 mask01 &= channelMask;
2888 }
2889 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2890
2891 buffer += pitchB;
2892 value = *Pointer<Int>(buffer);
2893
2894 Int c23 = Extract(As<Int2>(current), 1);
2895 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2896 if((writeMask & 0x00000007) != 0x00000007)
2897 {
2898 mask23 &= channelMask;
2899 }
2900 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2901 }
2902 break;
2903 case VK_FORMAT_B5G6R5_UNORM_PACK16:
2904 {
2905 buffer += 2 * x;
2906 Int value = *Pointer<Int>(buffer);
2907
2908 Int channelMask = *Pointer<Int>(constants + OFFSET(Constants, mask565Q[writeMask & 0x7][0]));
2909 Short4 current = (UShort4(As<Int4>(color.x)) & UShort4(0x1F)) |
2910 (UShort4(As<Int4>(color.y)) & UShort4(0x3F)) << 5 |
2911 (UShort4(As<Int4>(color.z)) & UShort4(0x1F)) << 11;
2912
2913 Int c01 = Extract(As<Int2>(current), 0);
2914 Int mask01 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][0]) + xMask * 8);
2915 if((writeMask & 0x00000007) != 0x00000007)
2916 {
2917 mask01 &= channelMask;
2918 }
2919 *Pointer<Int>(buffer) = (c01 & mask01) | (value & ~mask01);
2920
2921 buffer += pitchB;
2922 value = *Pointer<Int>(buffer);
2923
2924 Int c23 = Extract(As<Int2>(current), 1);
2925 Int mask23 = *Pointer<Int>(constants + OFFSET(Constants, maskW4Q[0][2]) + xMask * 8);
2926 if((writeMask & 0x00000007) != 0x00000007)
2927 {
2928 mask23 &= channelMask;
2929 }
2930 *Pointer<Int>(buffer) = (c23 & mask23) | (value & ~mask23);
2931 }
2932 break;
2933 default:
2934 UNSUPPORTED("VkFormat: %d", int(format));
2935 }
2936 }
2937
2938 } // namespace sw
2939