// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "VertexRoutine.hpp"

#include "Constants.hpp"
#include "SpirvShader.hpp"
#include "Device/Clipper.hpp"
#include "Device/Renderer.hpp"
#include "Device/Vertex.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"
#include "Vulkan/VkDevice.hpp"

namespace sw {
VertexRoutine::VertexRoutine(
    const VertexProcessor::State &state,
    const vk::PipelineLayout *pipelineLayout,
    const SpirvShader *spirvShader)
    : routine(pipelineLayout)
    , state(state)
    , spirvShader(spirvShader)
{
    spirvShader->emitProlog(&routine);
}

VertexRoutine::~VertexRoutine()
{
}

void VertexRoutine::generate()
{
    Pointer<Byte> cache = task + OFFSET(VertexTask, vertexCache);
    Pointer<Byte> vertexCache = cache + OFFSET(VertexCache, vertex);
    Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache, tag));

    UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask, vertexCount));

    constants = device + OFFSET(vk::Device, constants);

    // Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
    // On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
    // in reverse order to guarantee that the first one doesn't get evicted and can be written out.
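    // Illustrative sketch of the direct-mapped lookup, assuming VertexCache::TAG_MASK is one
    // less than a power-of-two cache size: with a 64-entry cache, TAG_MASK = 63, so vertex
    // index 130 maps to slot 130 & 63 = 2, and the tag check below hits only if index 130
    // was the most recent vertex stored in that slot.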

    Do
    {
        UInt index = *batch;
        UInt cacheIndex = index & VertexCache::TAG_MASK;

        If(tagCache[cacheIndex] != index)
        {
            readInput(batch);
            program(batch, vertexCount);
            computeClipFlags();
            computeCullMask();

            writeCache(vertexCache, tagCache, batch);
        }

        Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));

        // For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
        for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
        {
            writeVertex(vertex, cacheEntry);
            vertex += sizeof(Vertex);
        }

        batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
        vertexCount--;
    }
    Until(vertexCount == 0);

    Return();
}

void VertexRoutine::readInput(Pointer<UInt> &batch)
{
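    // Attributes are read one vec4 location at a time: i indexes individual components,
    // while i / 4 selects the per-location input stream, stride and robustness range.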
    for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
    {
        if(spirvShader->inputs[i + 0].Type != Spirv::ATTRIBTYPE_UNUSED ||
           spirvShader->inputs[i + 1].Type != Spirv::ATTRIBTYPE_UNUSED ||
           spirvShader->inputs[i + 2].Type != Spirv::ATTRIBTYPE_UNUSED ||
           spirvShader->inputs[i + 3].Type != Spirv::ATTRIBTYPE_UNUSED)
        {
            Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void *) * (i / 4));
            UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
            Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
            UInt robustnessSize(0);
            if(state.robustBufferAccess)
            {
                robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
            }

            auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
            routine.inputs[i + 0] = value.x;
            routine.inputs[i + 1] = value.y;
            routine.inputs[i + 2] = value.z;
            routine.inputs[i + 3] = value.w;
        }
    }
}

void VertexRoutine::computeClipFlags()
{
    auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
    if(it != spirvShader->outputBuiltins.end())
    {
        ASSERT(it->second.SizeInComponents == 4);
        auto &pos = routine.getVariable(it->second.Id);
        auto posX = pos[it->second.FirstComponent + 0];
        auto posY = pos[it->second.FirstComponent + 1];
        auto posZ = pos[it->second.FirstComponent + 2];
        auto posW = pos[it->second.FirstComponent + 3];

        SIMD::Int maxX = CmpLT(posW, posX);
        SIMD::Int maxY = CmpLT(posW, posY);
        SIMD::Int minX = CmpNLE(-posW, posX);
        SIMD::Int minY = CmpNLE(-posW, posY);

        clipFlags = maxX & Clipper::CLIP_RIGHT;
        clipFlags |= maxY & Clipper::CLIP_TOP;
        clipFlags |= minX & Clipper::CLIP_LEFT;
        clipFlags |= minY & Clipper::CLIP_BOTTOM;
        if(state.depthClipEnable)
        {
            // If depthClipNegativeOneToOne is enabled, depth values are in [-1, 1] instead of [0, 1].
            SIMD::Int maxZ = CmpLT(posW, posZ);
            SIMD::Int minZ = CmpNLE(state.depthClipNegativeOneToOne ? -posW : 0.0f, posZ);
            clipFlags |= maxZ & Clipper::CLIP_FAR;
            clipFlags |= minZ & Clipper::CLIP_NEAR;
        }

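        // 0x7F7FFFFF is the bit pattern of FLT_MAX, so Abs(pos) <= maxPos fails for
        // infinities and NaNs; CLIP_FINITE is therefore set only for lanes whose x, y
        // and z are all finite.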
        SIMD::Float maxPos = As<SIMD::Float>(SIMD::Int(0x7F7FFFFF));
        SIMD::Int finiteX = CmpLE(Abs(posX), maxPos);
        SIMD::Int finiteY = CmpLE(Abs(posY), maxPos);
        SIMD::Int finiteZ = CmpLE(Abs(posZ), maxPos);

        SIMD::Int finiteXYZ = finiteX & finiteY & finiteZ;
        clipFlags |= finiteXYZ & Clipper::CLIP_FINITE;
    }
}

void VertexRoutine::computeCullMask()
{
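    // Start with all four SIMD lanes visible (0xF); each cull distance can only clear bits.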
    cullMask = Int(15);

    auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
    if(it != spirvShader->outputBuiltins.end())
    {
        auto count = spirvShader->getNumOutputCullDistances();
        for(uint32_t i = 0; i < count; i++)
        {
            const auto &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
            auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
            cullMask &= mask;
        }
    }
}

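// Gathers one vertex attribute for a SIMD group of four indices and converts it to
// floating-point (or bit-cast integer) SoA form, applying robust-buffer-access
// clamping when enabled.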
Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
                                   bool robustBufferAccess, UInt &robustnessSize, Int baseVertex)
{
    Vector4f v;
    // Because of the following rule in the Vulkan spec, we do not care if a very large negative
    // baseVertex would overflow all the way back into a valid region of the index buffer:
    // "Out-of-bounds buffer loads will return any of the following values:
    //  - Values from anywhere within the memory range(s) bound to the buffer (possibly including
    //    bytes of memory past the end of the buffer, up to the end of the bound range)."
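    // Illustrative example (not from the spec): in uint32 arithmetic, index 2 plus
    // baseVertex -3 wraps to 0xFFFFFFFF before being scaled by the stride; the rule
    // above makes the resulting load well-defined, and the robustness clamp below
    // handles the rest.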
    UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);

    Pointer<Byte> source0 = buffer + offsets.x;
    Pointer<Byte> source1 = buffer + offsets.y;
    Pointer<Byte> source2 = buffer + offsets.z;
    Pointer<Byte> source3 = buffer + offsets.w;

    vk::Format format(stream.format);

    UInt4 zero(0);
    if(robustBufferAccess)
    {
        // Prevent integer overflow on the addition below.
        offsets = Min(offsets, UInt4(robustnessSize));

        // "vertex input attributes are considered out of bounds if the offset of the attribute
        //  in the bound vertex buffer range plus the size of the attribute is greater than ..."
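        // For example, with robustnessSize = 64 and a 16-byte attribute at offset 56,
        // the limit is 72 > 64, so that lane's source pointer is redirected to local
        // zero-filled memory instead of reading past the buffer.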
        UInt4 limits = offsets + UInt4(format.bytes());

        Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
        // TODO(b/141124876): Optimize for wide-vector gather operations.
        source0 = IfThenElse(limits.x > robustnessSize, zeroSource, source0);
        source1 = IfThenElse(limits.y > robustnessSize, zeroSource, source1);
        source2 = IfThenElse(limits.z > robustnessSize, zeroSource, source2);
        source3 = IfThenElse(limits.w > robustnessSize, zeroSource, source3);
    }

    int componentCount = format.componentCount();
    bool normalized = !format.isUnnormalizedInteger();
    bool isNativeFloatAttrib = (stream.attribType == Spirv::ATTRIBTYPE_FLOAT) || normalized;
    bool bgra = false;

    switch(stream.format)
    {
    case VK_FORMAT_R32_SFLOAT:
    case VK_FORMAT_R32G32_SFLOAT:
    case VK_FORMAT_R32G32B32_SFLOAT:
    case VK_FORMAT_R32G32B32A32_SFLOAT:
        {
            if(componentCount == 0)
            {
                // Null stream, all default components
            }
            else
            {
                if(componentCount == 1)
                {
                    v.x.x = *Pointer<Float>(source0);
                    v.x.y = *Pointer<Float>(source1);
                    v.x.z = *Pointer<Float>(source2);
                    v.x.w = *Pointer<Float>(source3);
                }
                else
                {
                    v.x = *Pointer<Float4>(source0);
                    v.y = *Pointer<Float4>(source1);
                    v.z = *Pointer<Float4>(source2);
                    v.w = *Pointer<Float4>(source3);

                    transpose4xN(v.x, v.y, v.z, v.w, componentCount);
                }
            }
        }
        break;
    case VK_FORMAT_B8G8R8A8_UNORM:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_R8_UNORM:
    case VK_FORMAT_R8G8_UNORM:
    case VK_FORMAT_R8G8B8A8_UNORM:
    case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
        v.x = Float4(*Pointer<Byte4>(source0));
        v.y = Float4(*Pointer<Byte4>(source1));
        v.z = Float4(*Pointer<Byte4>(source2));
        v.w = Float4(*Pointer<Byte4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);

        if(componentCount >= 1) v.x *= (1.0f / 0xFF);
        if(componentCount >= 2) v.y *= (1.0f / 0xFF);
        if(componentCount >= 3) v.z *= (1.0f / 0xFF);
        if(componentCount >= 4) v.w *= (1.0f / 0xFF);
        break;
    case VK_FORMAT_R8_UINT:
    case VK_FORMAT_R8G8_UINT:
    case VK_FORMAT_R8G8B8A8_UINT:
    case VK_FORMAT_A8B8G8R8_UINT_PACK32:
        v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
        v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
        v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
        v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R8_SNORM:
    case VK_FORMAT_R8G8_SNORM:
    case VK_FORMAT_R8G8B8A8_SNORM:
    case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
        v.x = Float4(*Pointer<SByte4>(source0));
        v.y = Float4(*Pointer<SByte4>(source1));
        v.z = Float4(*Pointer<SByte4>(source2));
        v.w = Float4(*Pointer<SByte4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);

        if(componentCount >= 1) v.x = Max(v.x * (1.0f / 0x7F), Float4(-1.0f));
        if(componentCount >= 2) v.y = Max(v.y * (1.0f / 0x7F), Float4(-1.0f));
        if(componentCount >= 3) v.z = Max(v.z * (1.0f / 0x7F), Float4(-1.0f));
        if(componentCount >= 4) v.w = Max(v.w * (1.0f / 0x7F), Float4(-1.0f));
        break;
    case VK_FORMAT_R8_USCALED:
    case VK_FORMAT_R8G8_USCALED:
    case VK_FORMAT_R8G8B8A8_USCALED:
    case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
        v.x = Float4(*Pointer<Byte4>(source0));
        v.y = Float4(*Pointer<Byte4>(source1));
        v.z = Float4(*Pointer<Byte4>(source2));
        v.w = Float4(*Pointer<Byte4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R8_SSCALED:
    case VK_FORMAT_R8G8_SSCALED:
    case VK_FORMAT_R8G8B8A8_SSCALED:
    case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
        v.x = Float4(*Pointer<SByte4>(source0));
        v.y = Float4(*Pointer<SByte4>(source1));
        v.z = Float4(*Pointer<SByte4>(source2));
        v.w = Float4(*Pointer<SByte4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R8_SINT:
    case VK_FORMAT_R8G8_SINT:
    case VK_FORMAT_R8G8B8A8_SINT:
    case VK_FORMAT_A8B8G8R8_SINT_PACK32:
        v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
        v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
        v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
        v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R16_UNORM:
    case VK_FORMAT_R16G16_UNORM:
    case VK_FORMAT_R16G16B16A16_UNORM:
        v.x = Float4(*Pointer<UShort4>(source0));
        v.y = Float4(*Pointer<UShort4>(source1));
        v.z = Float4(*Pointer<UShort4>(source2));
        v.w = Float4(*Pointer<UShort4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);

        if(componentCount >= 1) v.x *= (1.0f / 0xFFFF);
        if(componentCount >= 2) v.y *= (1.0f / 0xFFFF);
        if(componentCount >= 3) v.z *= (1.0f / 0xFFFF);
        if(componentCount >= 4) v.w *= (1.0f / 0xFFFF);
        break;
    case VK_FORMAT_R16_SNORM:
    case VK_FORMAT_R16G16_SNORM:
    case VK_FORMAT_R16G16B16A16_SNORM:
        v.x = Float4(*Pointer<Short4>(source0));
        v.y = Float4(*Pointer<Short4>(source1));
        v.z = Float4(*Pointer<Short4>(source2));
        v.w = Float4(*Pointer<Short4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);

        if(componentCount >= 1) v.x = Max(v.x * (1.0f / 0x7FFF), Float4(-1.0f));
        if(componentCount >= 2) v.y = Max(v.y * (1.0f / 0x7FFF), Float4(-1.0f));
        if(componentCount >= 3) v.z = Max(v.z * (1.0f / 0x7FFF), Float4(-1.0f));
        if(componentCount >= 4) v.w = Max(v.w * (1.0f / 0x7FFF), Float4(-1.0f));
        break;
    case VK_FORMAT_R16_USCALED:
    case VK_FORMAT_R16G16_USCALED:
    case VK_FORMAT_R16G16B16A16_USCALED:
        v.x = Float4(*Pointer<UShort4>(source0));
        v.y = Float4(*Pointer<UShort4>(source1));
        v.z = Float4(*Pointer<UShort4>(source2));
        v.w = Float4(*Pointer<UShort4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R16_SSCALED:
    case VK_FORMAT_R16G16_SSCALED:
    case VK_FORMAT_R16G16B16A16_SSCALED:
        v.x = Float4(*Pointer<Short4>(source0));
        v.y = Float4(*Pointer<Short4>(source1));
        v.z = Float4(*Pointer<Short4>(source2));
        v.w = Float4(*Pointer<Short4>(source3));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R16_SINT:
    case VK_FORMAT_R16G16_SINT:
    case VK_FORMAT_R16G16B16A16_SINT:
        v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
        v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
        v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
        v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R16_UINT:
    case VK_FORMAT_R16G16_UINT:
    case VK_FORMAT_R16G16B16A16_UINT:
        v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
        v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
        v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
        v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R32_SINT:
    case VK_FORMAT_R32G32_SINT:
    case VK_FORMAT_R32G32B32_SINT:
    case VK_FORMAT_R32G32B32A32_SINT:
        v.x = *Pointer<Float4>(source0);
        v.y = *Pointer<Float4>(source1);
        v.z = *Pointer<Float4>(source2);
        v.w = *Pointer<Float4>(source3);

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R32_UINT:
    case VK_FORMAT_R32G32_UINT:
    case VK_FORMAT_R32G32B32_UINT:
    case VK_FORMAT_R32G32B32A32_UINT:
        v.x = *Pointer<Float4>(source0);
        v.y = *Pointer<Float4>(source1);
        v.z = *Pointer<Float4>(source2);
        v.w = *Pointer<Float4>(source3);

        transpose4xN(v.x, v.y, v.z, v.w, componentCount);
        break;
    case VK_FORMAT_R16_SFLOAT:
    case VK_FORMAT_R16G16_SFLOAT:
    case VK_FORMAT_R16G16B16A16_SFLOAT:
        {
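            // A note on the conversion below: each 16-bit half is widened by indexing the
            // Constants::half2float lookup table, assumed here to hold one pre-converted
            // 32-bit float per possible half bit pattern (hence the * 4 byte scaling).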
            if(componentCount >= 1)
            {
                UShort x0 = *Pointer<UShort>(source0 + 0);
                UShort x1 = *Pointer<UShort>(source1 + 0);
                UShort x2 = *Pointer<UShort>(source2 + 0);
                UShort x3 = *Pointer<UShort>(source3 + 0);

                v.x.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x0) * 4);
                v.x.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x1) * 4);
                v.x.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x2) * 4);
                v.x.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x3) * 4);
            }

            if(componentCount >= 2)
            {
                UShort y0 = *Pointer<UShort>(source0 + 2);
                UShort y1 = *Pointer<UShort>(source1 + 2);
                UShort y2 = *Pointer<UShort>(source2 + 2);
                UShort y3 = *Pointer<UShort>(source3 + 2);

                v.y.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y0) * 4);
                v.y.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y1) * 4);
                v.y.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y2) * 4);
                v.y.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y3) * 4);
            }

            if(componentCount >= 3)
            {
                UShort z0 = *Pointer<UShort>(source0 + 4);
                UShort z1 = *Pointer<UShort>(source1 + 4);
                UShort z2 = *Pointer<UShort>(source2 + 4);
                UShort z3 = *Pointer<UShort>(source3 + 4);

                v.z.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z0) * 4);
                v.z.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z1) * 4);
                v.z.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z2) * 4);
                v.z.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z3) * 4);
            }

            if(componentCount >= 4)
            {
                UShort w0 = *Pointer<UShort>(source0 + 6);
                UShort w1 = *Pointer<UShort>(source1 + 6);
                UShort w2 = *Pointer<UShort>(source2 + 6);
                UShort w3 = *Pointer<UShort>(source3 + 6);

                v.w.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w0) * 4);
                v.w.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w1) * 4);
                v.w.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w2) * 4);
                v.w.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w3) * 4);
            }
        }
        break;
    case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
        {
            Int4 src;
            src = Insert(src, *Pointer<Int>(source0), 0);
            src = Insert(src, *Pointer<Int>(source1), 1);
            src = Insert(src, *Pointer<Int>(source2), 2);
            src = Insert(src, *Pointer<Int>(source3), 3);
            v.x = Float4((src << 22) >> 22);
            v.y = Float4((src << 12) >> 22);
            v.z = Float4((src << 02) >> 22);
            v.w = Float4(src >> 30);

            v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
            v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
            v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
            v.w = Max(v.w, Float4(-1.0f));
        }
        break;
    case VK_FORMAT_A2R10G10B10_SINT_PACK32:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_A2B10G10R10_SINT_PACK32:
        {
            Int4 src;
            src = Insert(src, *Pointer<Int>(source0), 0);
            src = Insert(src, *Pointer<Int>(source1), 1);
            src = Insert(src, *Pointer<Int>(source2), 2);
            src = Insert(src, *Pointer<Int>(source3), 3);
            v.x = As<Float4>((src << 22) >> 22);
            v.y = As<Float4>((src << 12) >> 22);
            v.z = As<Float4>((src << 02) >> 22);
            v.w = As<Float4>(src >> 30);
        }
        break;
    case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
        {
            Int4 src;
            src = Insert(src, *Pointer<Int>(source0), 0);
            src = Insert(src, *Pointer<Int>(source1), 1);
            src = Insert(src, *Pointer<Int>(source2), 2);
            src = Insert(src, *Pointer<Int>(source3), 3);

            v.x = Float4(src & Int4(0x3FF));
            v.y = Float4((src >> 10) & Int4(0x3FF));
            v.z = Float4((src >> 20) & Int4(0x3FF));
            v.w = Float4((src >> 30) & Int4(0x3));

            v.x *= Float4(1.0f / 0x3FF);
            v.y *= Float4(1.0f / 0x3FF);
            v.z *= Float4(1.0f / 0x3FF);
            v.w *= Float4(1.0f / 0x3);
        }
        break;
    case VK_FORMAT_A2R10G10B10_UINT_PACK32:
        bgra = true;
        // [[fallthrough]]
    case VK_FORMAT_A2B10G10R10_UINT_PACK32:
        {
            Int4 src;
            src = Insert(src, *Pointer<Int>(source0), 0);
            src = Insert(src, *Pointer<Int>(source1), 1);
            src = Insert(src, *Pointer<Int>(source2), 2);
            src = Insert(src, *Pointer<Int>(source3), 3);

            v.x = As<Float4>(src & Int4(0x3FF));
            v.y = As<Float4>((src >> 10) & Int4(0x3FF));
            v.z = As<Float4>((src >> 20) & Int4(0x3FF));
            v.w = As<Float4>((src >> 30) & Int4(0x3));
        }
        break;
    default:
        UNSUPPORTED("stream.format %d", int(stream.format));
    }

    if(bgra)
    {
        // Swap red and blue
        Float4 t = v.x;
        v.x = v.z;
        v.z = t;
    }

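    // Components not supplied by the format get defaults: 0 for x, y and z, and 1 for w,
    // expressed as 1.0f for float/normalized attributes or as integer 1 otherwise.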
    if(componentCount < 1) v.x = Float4(0.0f);
    if(componentCount < 2) v.y = Float4(0.0f);
    if(componentCount < 3) v.z = Float4(0.0f);
    if(componentCount < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));

    return v;
}

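// Stores the four vertices just produced by the shader into the vertex cache, along with
// their post-transform data (clip flags, cull mask, projected position and interface outputs).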
void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
    ASSERT(SIMD::Width == 4);

    UInt index0 = batch[0];
    UInt index1 = batch[1];
    UInt index2 = batch[2];
    UInt index3 = batch[3];

    UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
    UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
    UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
    UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;

    // We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
    // Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
    tagCache[cacheIndex3] = index3;
    tagCache[cacheIndex2] = index2;
    tagCache[cacheIndex1] = index1;
    tagCache[cacheIndex0] = index0;

    auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
    if(it != spirvShader->outputBuiltins.end())
    {
        ASSERT(it->second.SizeInComponents == 4);
        auto &position = routine.getVariable(it->second.Id);

        SIMD::Float4 pos;
        pos.x = position[it->second.FirstComponent + 0];
        pos.y = position[it->second.FirstComponent + 1];
        pos.z = position[it->second.FirstComponent + 2];
        pos.w = position[it->second.FirstComponent + 3];

        // Projection and viewport transform.
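        // Lanes where pos.w equals zero would make rhw infinite, so the bits of 1.0f are
        // OR'ed into those lanes (the compare mask ANDed with the constant), turning ±0
        // into ±1 before taking the reciprocal.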
        SIMD::Float w = As<SIMD::Float>(As<SIMD::Int>(pos.w) | (As<SIMD::Int>(CmpEQ(pos.w, 0.0f)) & As<SIMD::Int>(SIMD::Float(1.0f))));
        SIMD::Float rhw = 1.0f / w;

        SIMD::Float4 proj;
        proj.x = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, X0xF))) + pos.x * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, WxF)))));
        proj.y = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, Y0xF))) + pos.y * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, HxF)))));
        proj.z = pos.z * rhw;
        proj.w = rhw;

        Float4 pos_x = Extract128(pos.x, 0);
        Float4 pos_y = Extract128(pos.y, 0);
        Float4 pos_z = Extract128(pos.z, 0);
        Float4 pos_w = Extract128(pos.w, 0);
        transpose4x4(pos_x, pos_y, pos_z, pos_w);

        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos_w;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos_z;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos_y;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos_x;

        *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 3);
        *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 2);
        *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 1);
        *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 0);

        Float4 proj_x = Extract128(proj.x, 0);
        Float4 proj_y = Extract128(proj.y, 0);
        Float4 proj_z = Extract128(proj.z, 0);
        Float4 proj_w = Extract128(proj.w, 0);
        transpose4x4(proj_x, proj_y, proj_z, proj_w);

        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj_w;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj_z;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj_y;
        *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj_x;
    }

    it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
    if(it != spirvShader->outputBuiltins.end())
    {
        ASSERT(it->second.SizeInComponents == 1);
        auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];

        *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, pointSize)) = Extract(psize, 3);
        *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, pointSize)) = Extract(psize, 2);
        *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, pointSize)) = Extract(psize, 1);
        *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, pointSize)) = Extract(psize, 0);
    }

    it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
    if(it != spirvShader->outputBuiltins.end())
    {
        auto count = spirvShader->getNumOutputClipDistances();
        for(unsigned int i = 0; i < count; i++)
        {
            auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 3);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 2);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 1);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 0);
        }
    }

    it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
    if(it != spirvShader->outputBuiltins.end())
    {
        auto count = spirvShader->getNumOutputCullDistances();
        for(unsigned int i = 0; i < count; i++)
        {
            auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 3);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 2);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 1);
            *Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 0);
        }
    }

    *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullMask)) = -((cullMask >> 3) & 1);
    *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullMask)) = -((cullMask >> 2) & 1);
    *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullMask)) = -((cullMask >> 1) & 1);
    *Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullMask)) = -((cullMask >> 0) & 1);

    for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
    {
        if(spirvShader->outputs[i + 0].Type != Spirv::ATTRIBTYPE_UNUSED ||
           spirvShader->outputs[i + 1].Type != Spirv::ATTRIBTYPE_UNUSED ||
           spirvShader->outputs[i + 2].Type != Spirv::ATTRIBTYPE_UNUSED ||
           spirvShader->outputs[i + 3].Type != Spirv::ATTRIBTYPE_UNUSED)
        {
            Vector4f v;
            v.x = Extract128(routine.outputs[i + 0], 0);
            v.y = Extract128(routine.outputs[i + 1], 0);
            v.z = Extract128(routine.outputs[i + 2], 0);
            v.w = Extract128(routine.outputs[i + 3], 0);

            transpose4x4(v.x, v.y, v.z, v.w);

            *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, v[i]), 16) = v.w;
            *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, v[i]), 16) = v.z;
            *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, v[i]), 16) = v.y;
            *Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, v[i]), 16) = v.x;
        }
    }
}

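// Copies one processed vertex from its cache entry into the output vertex buffer.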
void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
{
    *Pointer<Int4>(vertex + OFFSET(Vertex, position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, position));
    *Pointer<Int>(vertex + OFFSET(Vertex, pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, pointSize));

    *Pointer<Int>(vertex + OFFSET(Vertex, clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, clipFlags));
    *Pointer<Int>(vertex + OFFSET(Vertex, cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, cullMask));
    *Pointer<Int4>(vertex + OFFSET(Vertex, projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, projected));

    for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
    {
        if(spirvShader->outputs[i].Type != Spirv::ATTRIBTYPE_UNUSED)
        {
            *Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
        }
    }
    for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
    {
        *Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
    }
    for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
    {
        *Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
    }
}

}  // namespace sw