// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "VertexRoutine.hpp"

#include "Constants.hpp"
#include "SpirvShader.hpp"
#include "Device/Clipper.hpp"
#include "Device/Renderer.hpp"
#include "Device/Vertex.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"
#include "Vulkan/VkDevice.hpp"

namespace sw {

VertexRoutine::VertexRoutine(
    const VertexProcessor::State &state,
    const vk::PipelineLayout *pipelineLayout,
    const SpirvShader *spirvShader)
    : routine(pipelineLayout)
    , state(state)
    , spirvShader(spirvShader)
{
	spirvShader->emitProlog(&routine);
}

VertexRoutine::~VertexRoutine()
{
}

void VertexRoutine::generate()
{
	Pointer<Byte> cache = task + OFFSET(VertexTask, vertexCache);
	Pointer<Byte> vertexCache = cache + OFFSET(VertexCache, vertex);
	Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache, tag));

	UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask, vertexCount));

	constants = device + OFFSET(vk::Device, constants);

	// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
	// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
	// in reverse order to guarantee that the first one doesn't get evicted and can be written out.

	Do
	{
		UInt index = *batch;
		UInt cacheIndex = index & VertexCache::TAG_MASK;

		If(tagCache[cacheIndex] != index)
		{
			readInput(batch);
			program(batch, vertexCount);
			computeClipFlags();
			computeCullMask();

			writeCache(vertexCache, tagCache, batch);
		}

		Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));

		// For points, vertexCount is 1 per primitive, so duplicate the vertex for all 3 vertices of the primitive.
		for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
		{
			writeVertex(vertex, cacheEntry);
			vertex += sizeof(Vertex);
		}

		batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
		vertexCount--;
	}
	Until(vertexCount == 0);

	Return();
}

void VertexRoutine::readInput(Pointer<UInt> &batch)
{
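	// Attributes are read one location (a group of four components) at a time.
	// A vertex buffer stream is only fetched when the shader uses at least one
	// component of that location.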
	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
	{
		if(spirvShader->inputs[i + 0].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 1].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 2].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->inputs[i + 3].Type != Spirv::ATTRIBTYPE_UNUSED)
		{
			Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void *) * (i / 4));
			UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
			Int baseVertex = *Pointer<Int>(data + OFFSET(DrawData, baseVertex));
			UInt robustnessSize(0);
			if(state.robustBufferAccess)
			{
				robustnessSize = *Pointer<UInt>(data + OFFSET(DrawData, robustnessSize) + sizeof(uint32_t) * (i / 4));
			}

			auto value = readStream(input, stride, state.input[i / 4], batch, state.robustBufferAccess, robustnessSize, baseVertex);
			routine.inputs[i + 0] = value.x;
			routine.inputs[i + 1] = value.y;
			routine.inputs[i + 2] = value.z;
			routine.inputs[i + 3] = value.w;
		}
	}
}

void VertexRoutine::computeClipFlags()
{
	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	if(it != spirvShader->outputBuiltins.end())
	{
		assert(it->second.SizeInComponents == 4);
		auto &pos = routine.getVariable(it->second.Id);
		auto posX = pos[it->second.FirstComponent + 0];
		auto posY = pos[it->second.FirstComponent + 1];
		auto posZ = pos[it->second.FirstComponent + 2];
		auto posW = pos[it->second.FirstComponent + 3];

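		// A position is inside the view volume when -w <= x <= w and -w <= y <= w.
		// Each comparison yields a per-lane mask which selects the corresponding
		// clip flag for the lanes (vertices) that fall outside the volume.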
		SIMD::Int maxX = CmpLT(posW, posX);
		SIMD::Int maxY = CmpLT(posW, posY);
		SIMD::Int minX = CmpNLE(-posW, posX);
		SIMD::Int minY = CmpNLE(-posW, posY);

		clipFlags = maxX & Clipper::CLIP_RIGHT;
		clipFlags |= maxY & Clipper::CLIP_TOP;
		clipFlags |= minX & Clipper::CLIP_LEFT;
		clipFlags |= minY & Clipper::CLIP_BOTTOM;
		if(state.depthClipEnable)
		{
			// If depthClipNegativeOneToOne is enabled, depth values are in [-1, 1] instead of [0, 1].
			SIMD::Int maxZ = CmpLT(posW, posZ);
			SIMD::Int minZ = CmpNLE(state.depthClipNegativeOneToOne ? -posW : 0.0f, posZ);
			clipFlags |= maxZ & Clipper::CLIP_FAR;
			clipFlags |= minZ & Clipper::CLIP_NEAR;
		}

		SIMD::Float maxPos = As<SIMD::Float>(SIMD::Int(0x7F7FFFFF));
		SIMD::Int finiteX = CmpLE(Abs(posX), maxPos);
		SIMD::Int finiteY = CmpLE(Abs(posY), maxPos);
		SIMD::Int finiteZ = CmpLE(Abs(posZ), maxPos);

		SIMD::Int finiteXYZ = finiteX & finiteY & finiteZ;
		clipFlags |= finiteXYZ & Clipper::CLIP_FINITE;
	}
}

void VertexRoutine::computeCullMask()
{
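	// Start with all four SIMD lanes visible (0b1111). A lane gets culled when
	// any of its cull distances is negative.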
	cullMask = Int(15);

	auto it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputCullDistances();
		for(uint32_t i = 0; i < count; i++)
		{
			const auto &distance = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			auto mask = SignMask(CmpGE(distance, SIMD::Float(0)));
			cullMask &= mask;
		}
	}
}

Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch,
                                   bool robustBufferAccess, UInt &robustnessSize, Int baseVertex)
{
	Vector4f v;
	// Because of the following rule in the Vulkan spec, we do not care if a very large negative
	// baseVertex would overflow all the way back into a valid region of the index buffer:
	// "Out-of-bounds buffer loads will return any of the following values :
	//  - Values from anywhere within the memory range(s) bound to the buffer (possibly including
	//    bytes of memory past the end of the buffer, up to the end of the bound range)."
	UInt4 offsets = (*Pointer<UInt4>(As<Pointer<UInt4>>(batch)) + As<UInt4>(Int4(baseVertex))) * UInt4(stride);

	Pointer<Byte> source0 = buffer + offsets.x;
	Pointer<Byte> source1 = buffer + offsets.y;
	Pointer<Byte> source2 = buffer + offsets.z;
	Pointer<Byte> source3 = buffer + offsets.w;

	vk::Format format(stream.format);

	UInt4 zero(0);
	if(robustBufferAccess)
	{
		// Prevent integer overflow on the addition below.
		offsets = Min(offsets, UInt4(robustnessSize));

		// "vertex input attributes are considered out of bounds if the offset of the attribute
		//  in the bound vertex buffer range plus the size of the attribute is greater than ..."
		UInt4 limits = offsets + UInt4(format.bytes());

		Pointer<Byte> zeroSource = As<Pointer<Byte>>(&zero);
		// TODO(b/141124876): Optimize for wide-vector gather operations.
		source0 = IfThenElse(limits.x > robustnessSize, zeroSource, source0);
		source1 = IfThenElse(limits.y > robustnessSize, zeroSource, source1);
		source2 = IfThenElse(limits.z > robustnessSize, zeroSource, source2);
		source3 = IfThenElse(limits.w > robustnessSize, zeroSource, source3);
	}

	int componentCount = format.componentCount();
	bool normalized = !format.isUnnormalizedInteger();
	bool isNativeFloatAttrib = (stream.attribType == Spirv::ATTRIBTYPE_FLOAT) || normalized;
	bool bgra = false;

	switch(stream.format)
	{
	case VK_FORMAT_R32_SFLOAT:
	case VK_FORMAT_R32G32_SFLOAT:
	case VK_FORMAT_R32G32B32_SFLOAT:
	case VK_FORMAT_R32G32B32A32_SFLOAT:
		{
			if(componentCount == 0)
			{
				// Null stream, all default components
			}
			else
			{
				if(componentCount == 1)
				{
					v.x.x = *Pointer<Float>(source0);
					v.x.y = *Pointer<Float>(source1);
					v.x.z = *Pointer<Float>(source2);
					v.x.w = *Pointer<Float>(source3);
				}
				else
				{
					v.x = *Pointer<Float4>(source0);
					v.y = *Pointer<Float4>(source1);
					v.z = *Pointer<Float4>(source2);
					v.w = *Pointer<Float4>(source3);

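					// The four vertices were gathered in AoS order (one vertex per
					// register); transpose to SoA so each register holds one
					// component across the SIMD lanes.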
					transpose4xN(v.x, v.y, v.z, v.w, componentCount);
				}
			}
		}
		break;
	case VK_FORMAT_B8G8R8A8_UNORM:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_R8_UNORM:
	case VK_FORMAT_R8G8_UNORM:
	case VK_FORMAT_R8G8B8A8_UNORM:
	case VK_FORMAT_A8B8G8R8_UNORM_PACK32:
		v.x = Float4(*Pointer<Byte4>(source0));
		v.y = Float4(*Pointer<Byte4>(source1));
		v.z = Float4(*Pointer<Byte4>(source2));
		v.w = Float4(*Pointer<Byte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x *= (1.0f / 0xFF);
		if(componentCount >= 2) v.y *= (1.0f / 0xFF);
		if(componentCount >= 3) v.z *= (1.0f / 0xFF);
		if(componentCount >= 4) v.w *= (1.0f / 0xFF);
		break;
	case VK_FORMAT_R8_UINT:
	case VK_FORMAT_R8G8_UINT:
	case VK_FORMAT_R8G8B8A8_UINT:
	case VK_FORMAT_A8B8G8R8_UINT_PACK32:
		v.x = As<Float4>(Int4(*Pointer<Byte4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<Byte4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<Byte4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<Byte4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R8_SNORM:
	case VK_FORMAT_R8G8_SNORM:
	case VK_FORMAT_R8G8B8A8_SNORM:
	case VK_FORMAT_A8B8G8R8_SNORM_PACK32:
		v.x = Float4(*Pointer<SByte4>(source0));
		v.y = Float4(*Pointer<SByte4>(source1));
		v.z = Float4(*Pointer<SByte4>(source2));
		v.w = Float4(*Pointer<SByte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x = Max(v.x * (1.0f / 0x7F), Float4(-1.0f));
		if(componentCount >= 2) v.y = Max(v.y * (1.0f / 0x7F), Float4(-1.0f));
		if(componentCount >= 3) v.z = Max(v.z * (1.0f / 0x7F), Float4(-1.0f));
		if(componentCount >= 4) v.w = Max(v.w * (1.0f / 0x7F), Float4(-1.0f));
		break;
	case VK_FORMAT_R8_USCALED:
	case VK_FORMAT_R8G8_USCALED:
	case VK_FORMAT_R8G8B8A8_USCALED:
	case VK_FORMAT_A8B8G8R8_USCALED_PACK32:
		v.x = Float4(*Pointer<Byte4>(source0));
		v.y = Float4(*Pointer<Byte4>(source1));
		v.z = Float4(*Pointer<Byte4>(source2));
		v.w = Float4(*Pointer<Byte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R8_SSCALED:
	case VK_FORMAT_R8G8_SSCALED:
	case VK_FORMAT_R8G8B8A8_SSCALED:
	case VK_FORMAT_A8B8G8R8_SSCALED_PACK32:
		v.x = Float4(*Pointer<SByte4>(source0));
		v.y = Float4(*Pointer<SByte4>(source1));
		v.z = Float4(*Pointer<SByte4>(source2));
		v.w = Float4(*Pointer<SByte4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R8_SINT:
	case VK_FORMAT_R8G8_SINT:
	case VK_FORMAT_R8G8B8A8_SINT:
	case VK_FORMAT_A8B8G8R8_SINT_PACK32:
		v.x = As<Float4>(Int4(*Pointer<SByte4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<SByte4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<SByte4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<SByte4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_UNORM:
	case VK_FORMAT_R16G16_UNORM:
	case VK_FORMAT_R16G16B16A16_UNORM:
		v.x = Float4(*Pointer<UShort4>(source0));
		v.y = Float4(*Pointer<UShort4>(source1));
		v.z = Float4(*Pointer<UShort4>(source2));
		v.w = Float4(*Pointer<UShort4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x *= (1.0f / 0xFFFF);
		if(componentCount >= 2) v.y *= (1.0f / 0xFFFF);
		if(componentCount >= 3) v.z *= (1.0f / 0xFFFF);
		if(componentCount >= 4) v.w *= (1.0f / 0xFFFF);
		break;
	case VK_FORMAT_R16_SNORM:
	case VK_FORMAT_R16G16_SNORM:
	case VK_FORMAT_R16G16B16A16_SNORM:
		v.x = Float4(*Pointer<Short4>(source0));
		v.y = Float4(*Pointer<Short4>(source1));
		v.z = Float4(*Pointer<Short4>(source2));
		v.w = Float4(*Pointer<Short4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);

		if(componentCount >= 1) v.x = Max(v.x * (1.0f / 0x7FFF), Float4(-1.0f));
		if(componentCount >= 2) v.y = Max(v.y * (1.0f / 0x7FFF), Float4(-1.0f));
		if(componentCount >= 3) v.z = Max(v.z * (1.0f / 0x7FFF), Float4(-1.0f));
		if(componentCount >= 4) v.w = Max(v.w * (1.0f / 0x7FFF), Float4(-1.0f));
		break;
	case VK_FORMAT_R16_USCALED:
	case VK_FORMAT_R16G16_USCALED:
	case VK_FORMAT_R16G16B16A16_USCALED:
		v.x = Float4(*Pointer<UShort4>(source0));
		v.y = Float4(*Pointer<UShort4>(source1));
		v.z = Float4(*Pointer<UShort4>(source2));
		v.w = Float4(*Pointer<UShort4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_SSCALED:
	case VK_FORMAT_R16G16_SSCALED:
	case VK_FORMAT_R16G16B16A16_SSCALED:
		v.x = Float4(*Pointer<Short4>(source0));
		v.y = Float4(*Pointer<Short4>(source1));
		v.z = Float4(*Pointer<Short4>(source2));
		v.w = Float4(*Pointer<Short4>(source3));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_SINT:
	case VK_FORMAT_R16G16_SINT:
	case VK_FORMAT_R16G16B16A16_SINT:
		v.x = As<Float4>(Int4(*Pointer<Short4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<Short4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<Short4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<Short4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_UINT:
	case VK_FORMAT_R16G16_UINT:
	case VK_FORMAT_R16G16B16A16_UINT:
		v.x = As<Float4>(Int4(*Pointer<UShort4>(source0)));
		v.y = As<Float4>(Int4(*Pointer<UShort4>(source1)));
		v.z = As<Float4>(Int4(*Pointer<UShort4>(source2)));
		v.w = As<Float4>(Int4(*Pointer<UShort4>(source3)));

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R32_SINT:
	case VK_FORMAT_R32G32_SINT:
	case VK_FORMAT_R32G32B32_SINT:
	case VK_FORMAT_R32G32B32A32_SINT:
		v.x = *Pointer<Float4>(source0);
		v.y = *Pointer<Float4>(source1);
		v.z = *Pointer<Float4>(source2);
		v.w = *Pointer<Float4>(source3);

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R32_UINT:
	case VK_FORMAT_R32G32_UINT:
	case VK_FORMAT_R32G32B32_UINT:
	case VK_FORMAT_R32G32B32A32_UINT:
		v.x = *Pointer<Float4>(source0);
		v.y = *Pointer<Float4>(source1);
		v.z = *Pointer<Float4>(source2);
		v.w = *Pointer<Float4>(source3);

		transpose4xN(v.x, v.y, v.z, v.w, componentCount);
		break;
	case VK_FORMAT_R16_SFLOAT:
	case VK_FORMAT_R16G16_SFLOAT:
	case VK_FORMAT_R16G16B16A16_SFLOAT:
		{
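			// Halfs are converted through the precomputed half2float table in
			// Constants: the raw 16-bit pattern, scaled by sizeof(float), indexes
			// the corresponding single-precision value.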
			if(componentCount >= 1)
			{
				UShort x0 = *Pointer<UShort>(source0 + 0);
				UShort x1 = *Pointer<UShort>(source1 + 0);
				UShort x2 = *Pointer<UShort>(source2 + 0);
				UShort x3 = *Pointer<UShort>(source3 + 0);

				v.x.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x0) * 4);
				v.x.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x1) * 4);
				v.x.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x2) * 4);
				v.x.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(x3) * 4);
			}

			if(componentCount >= 2)
			{
				UShort y0 = *Pointer<UShort>(source0 + 2);
				UShort y1 = *Pointer<UShort>(source1 + 2);
				UShort y2 = *Pointer<UShort>(source2 + 2);
				UShort y3 = *Pointer<UShort>(source3 + 2);

				v.y.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y0) * 4);
				v.y.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y1) * 4);
				v.y.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y2) * 4);
				v.y.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(y3) * 4);
			}

			if(componentCount >= 3)
			{
				UShort z0 = *Pointer<UShort>(source0 + 4);
				UShort z1 = *Pointer<UShort>(source1 + 4);
				UShort z2 = *Pointer<UShort>(source2 + 4);
				UShort z3 = *Pointer<UShort>(source3 + 4);

				v.z.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z0) * 4);
				v.z.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z1) * 4);
				v.z.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z2) * 4);
				v.z.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(z3) * 4);
			}

			if(componentCount >= 4)
			{
				UShort w0 = *Pointer<UShort>(source0 + 6);
				UShort w1 = *Pointer<UShort>(source1 + 6);
				UShort w2 = *Pointer<UShort>(source2 + 6);
				UShort w3 = *Pointer<UShort>(source3 + 6);

				v.w.x = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w0) * 4);
				v.w.y = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w1) * 4);
				v.w.z = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w2) * 4);
				v.w.w = *Pointer<Float>(constants + OFFSET(Constants, half2float) + Int(w3) * 4);
			}
		}
		break;
	case VK_FORMAT_A2R10G10B10_SNORM_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_SNORM_PACK32:
		{
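			// R, G and B are signed 10-bit fields and A is a signed 2-bit field.
			// Shifting each field to the top of the word and arithmetic-shifting
			// it back down sign-extends it, e.g. (src << 12) >> 22 yields G.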
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);
			v.x = Float4((src << 22) >> 22);
			v.y = Float4((src << 12) >> 22);
			v.z = Float4((src << 02) >> 22);
			v.w = Float4(src >> 30);

			v.x = Max(v.x * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.y = Max(v.y * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.z = Max(v.z * Float4(1.0f / 0x1FF), Float4(-1.0f));
			v.w = Max(v.w, Float4(-1.0f));
		}
		break;
	case VK_FORMAT_A2R10G10B10_SINT_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_SINT_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);
			v.x = As<Float4>((src << 22) >> 22);
			v.y = As<Float4>((src << 12) >> 22);
			v.z = As<Float4>((src << 02) >> 22);
			v.w = As<Float4>(src >> 30);
		}
		break;
	case VK_FORMAT_A2R10G10B10_UNORM_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_UNORM_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			v.x = Float4(src & Int4(0x3FF));
			v.y = Float4((src >> 10) & Int4(0x3FF));
			v.z = Float4((src >> 20) & Int4(0x3FF));
			v.w = Float4((src >> 30) & Int4(0x3));

			v.x *= Float4(1.0f / 0x3FF);
			v.y *= Float4(1.0f / 0x3FF);
			v.z *= Float4(1.0f / 0x3FF);
			v.w *= Float4(1.0f / 0x3);
		}
		break;
	case VK_FORMAT_A2R10G10B10_UINT_PACK32:
		bgra = true;
		// [[fallthrough]]
	case VK_FORMAT_A2B10G10R10_UINT_PACK32:
		{
			Int4 src;
			src = Insert(src, *Pointer<Int>(source0), 0);
			src = Insert(src, *Pointer<Int>(source1), 1);
			src = Insert(src, *Pointer<Int>(source2), 2);
			src = Insert(src, *Pointer<Int>(source3), 3);

			v.x = As<Float4>(src & Int4(0x3FF));
			v.y = As<Float4>((src >> 10) & Int4(0x3FF));
			v.z = As<Float4>((src >> 20) & Int4(0x3FF));
			v.w = As<Float4>((src >> 30) & Int4(0x3));
		}
		break;
	default:
		UNSUPPORTED("stream.format %d", int(stream.format));
	}

	if(bgra)
	{
		// Swap red and blue
		Float4 t = v.x;
		v.x = v.z;
		v.z = t;
	}

	if(componentCount < 1) v.x = Float4(0.0f);
	if(componentCount < 2) v.y = Float4(0.0f);
	if(componentCount < 3) v.z = Float4(0.0f);
	if(componentCount < 4) v.w = isNativeFloatAttrib ? As<Float4>(Float4(1.0f)) : As<Float4>(Int4(1));

	return v;
}

void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
	ASSERT(SIMD::Width == 4);

	UInt index0 = batch[0];
	UInt index1 = batch[1];
	UInt index2 = batch[2];
	UInt index3 = batch[3];

	UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
	UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
	UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
	UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;

	// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
	// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
	tagCache[cacheIndex3] = index3;
	tagCache[cacheIndex2] = index2;
	tagCache[cacheIndex1] = index1;
	tagCache[cacheIndex0] = index0;

	auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
	if(it != spirvShader->outputBuiltins.end())
	{
		assert(it->second.SizeInComponents == 4);
		auto &position = routine.getVariable(it->second.Id);

		SIMD::Float4 pos;
		pos.x = position[it->second.FirstComponent + 0];
		pos.y = position[it->second.FirstComponent + 1];
		pos.z = position[it->second.FirstComponent + 2];
		pos.w = position[it->second.FirstComponent + 3];

		// Projection and viewport transform.
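		// Lanes with w == 0 get the bit pattern of 1.0f OR'ed in, making the
		// divisor ±1.0 so the reciprocal below stays finite.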
		SIMD::Float w = As<SIMD::Float>(As<SIMD::Int>(pos.w) | (As<SIMD::Int>(CmpEQ(pos.w, 0.0f)) & As<SIMD::Int>(SIMD::Float(1.0f))));
		SIMD::Float rhw = 1.0f / w;

		SIMD::Float4 proj;
		proj.x = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, X0xF))) + pos.x * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, WxF)))));
		proj.y = As<Float4>(RoundIntClamped(SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, Y0xF))) + pos.y * rhw * SIMD::Float(*Pointer<Float>(data + OFFSET(DrawData, HxF)))));
		proj.z = pos.z * rhw;
		proj.w = rhw;

		Float4 pos_x = Extract128(pos.x, 0);
		Float4 pos_y = Extract128(pos.y, 0);
		Float4 pos_z = Extract128(pos.z, 0);
		Float4 pos_w = Extract128(pos.w, 0);
		transpose4x4(pos_x, pos_y, pos_z, pos_w);

		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, position), 16) = pos_w;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, position), 16) = pos_z;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, position), 16) = pos_y;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, position), 16) = pos_x;

		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 3);
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 2);
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 1);
		*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipFlags)) = Extract(clipFlags, 0);

		Float4 proj_x = Extract128(proj.x, 0);
		Float4 proj_y = Extract128(proj.y, 0);
		Float4 proj_z = Extract128(proj.z, 0);
		Float4 proj_w = Extract128(proj.w, 0);
		transpose4x4(proj_x, proj_y, proj_z, proj_w);

		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, projected), 16) = proj_w;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, projected), 16) = proj_z;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, projected), 16) = proj_y;
		*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, projected), 16) = proj_x;
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
	if(it != spirvShader->outputBuiltins.end())
	{
		ASSERT(it->second.SizeInComponents == 1);
		auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];

		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, pointSize)) = Extract(psize, 3);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, pointSize)) = Extract(psize, 2);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, pointSize)) = Extract(psize, 1);
		*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, pointSize)) = Extract(psize, 0);
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInClipDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputClipDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, clipDistance[i])) = Extract(dist, 0);
		}
	}

	it = spirvShader->outputBuiltins.find(spv::BuiltInCullDistance);
	if(it != spirvShader->outputBuiltins.end())
	{
		auto count = spirvShader->getNumOutputCullDistances();
		for(unsigned int i = 0; i < count; i++)
		{
			auto dist = routine.getVariable(it->second.Id)[it->second.FirstComponent + i];
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 3);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 2);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 1);
			*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullDistance[i])) = Extract(dist, 0);
		}
	}

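	// Expand each bit of the 4-bit cull mask into a full 0 / -1 integer:
	// -((cullMask >> n) & 1) sign-extends bit n across the 32-bit word.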
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, cullMask)) = -((cullMask >> 3) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, cullMask)) = -((cullMask >> 2) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, cullMask)) = -((cullMask >> 1) & 1);
	*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, cullMask)) = -((cullMask >> 0) & 1);

	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
	{
		if(spirvShader->outputs[i + 0].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 1].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 2].Type != Spirv::ATTRIBTYPE_UNUSED ||
		   spirvShader->outputs[i + 3].Type != Spirv::ATTRIBTYPE_UNUSED)
		{
			Vector4f v;
			v.x = Extract128(routine.outputs[i + 0], 0);
			v.y = Extract128(routine.outputs[i + 1], 0);
			v.z = Extract128(routine.outputs[i + 2], 0);
			v.w = Extract128(routine.outputs[i + 3], 0);

			transpose4x4(v.x, v.y, v.z, v.w);

			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex, v[i]), 16) = v.w;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex, v[i]), 16) = v.z;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex, v[i]), 16) = v.y;
			*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex, v[i]), 16) = v.x;
		}
	}
}

void VertexRoutine::writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry)
{
	*Pointer<Int4>(vertex + OFFSET(Vertex, position)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, position));
	*Pointer<Int>(vertex + OFFSET(Vertex, pointSize)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, pointSize));

	*Pointer<Int>(vertex + OFFSET(Vertex, clipFlags)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, clipFlags));
	*Pointer<Int>(vertex + OFFSET(Vertex, cullMask)) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, cullMask));
	*Pointer<Int4>(vertex + OFFSET(Vertex, projected)) = *Pointer<Int4>(cacheEntry + OFFSET(Vertex, projected));

	for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i++)
	{
		if(spirvShader->outputs[i].Type != Spirv::ATTRIBTYPE_UNUSED)
		{
			*Pointer<Int>(vertex + OFFSET(Vertex, v[i]), 4) = *Pointer<Int>(cacheEntry + OFFSET(Vertex, v[i]), 4);
		}
	}
	for(unsigned int i = 0; i < spirvShader->getNumOutputClipDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, clipDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, clipDistance[i]), 4);
	}
	for(unsigned int i = 0; i < spirvShader->getNumOutputCullDistances(); i++)
	{
		*Pointer<Float>(vertex + OFFSET(Vertex, cullDistance[i]), 4) = *Pointer<Float>(cacheEntry + OFFSET(Vertex, cullDistance[i]), 4);
	}
}

}  // namespace sw