// Copyright 2022 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef rr_SIMD_hpp
#define rr_SIMD_hpp

#include "Reactor.hpp"

#include <functional>
#include <vector>

namespace rr {

namespace scalar {
using Int = rr::Int;
using UInt = rr::UInt;
using Float = rr::Float;
template<class T>
using Pointer = rr::Pointer<T>;
}  // namespace scalar

namespace packed {
using Int4 = rr::Int4;
using UInt4 = rr::UInt4;
using Float4 = rr::Float4;
}  // namespace packed

namespace SIMD {

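// The number of lanes in a SIMD vector. Code using the SIMD types below is
// written to be width-agnostic, although a few of the inline routines in this
// header (e.g. the lane-election path in Store()) still assert a width of 4.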
extern const int Width;

class Int;
class UInt;
class Float;
class Pointer;

class Int : public LValue<SIMD::Int>,
            public XYZW<SIMD::Int>  // TODO(b/214583550): Eliminate and replace with SwizzleQuad() and/or other intrinsics.
{
public:
	explicit Int(RValue<SIMD::Float> cast);

	Int();
	Int(int broadcast);
	Int(int x, int y, int z, int w);
	Int(std::vector<int> v);
	Int(std::function<int(int)> LaneValueProducer);
	Int(RValue<SIMD::Int> rhs);
	Int(const Int &rhs);
	Int(const Reference<SIMD::Int> &rhs);
	Int(RValue<SIMD::UInt> rhs);
	Int(const UInt &rhs);
	Int(const Reference<SIMD::UInt> &rhs);
	Int(RValue<scalar::Int> rhs);
	Int(const scalar::Int &rhs);
	Int(const Reference<scalar::Int> &rhs);

	template<int T>
	Int(const SwizzleMask1<packed::Int4, T> &rhs);

	RValue<SIMD::Int> operator=(int broadcast);
	RValue<SIMD::Int> operator=(RValue<SIMD::Int> rhs);
	RValue<SIMD::Int> operator=(const Int &rhs);
	RValue<SIMD::Int> operator=(const Reference<SIMD::Int> &rhs);

	static Type *type();
	static int element_count() { return SIMD::Width; }
};

class UInt : public LValue<SIMD::UInt>,
             public XYZW<SIMD::UInt>  // TODO(b/214583550): Eliminate and replace with SwizzleQuad() and/or other intrinsics.
{
public:
	explicit UInt(RValue<SIMD::Float> cast);

	UInt();
	UInt(int broadcast);
	UInt(int x, int y, int z, int w);
	UInt(std::vector<int> v);
	UInt(std::function<int(int)> LaneValueProducer);
	UInt(RValue<SIMD::UInt> rhs);
	UInt(const UInt &rhs);
	UInt(const Reference<SIMD::UInt> &rhs);
	UInt(RValue<SIMD::Int> rhs);
	UInt(const Int &rhs);
	UInt(const Reference<SIMD::Int> &rhs);
	UInt(RValue<scalar::UInt> rhs);
	UInt(const scalar::UInt &rhs);
	UInt(const Reference<scalar::UInt> &rhs);

	RValue<SIMD::UInt> operator=(RValue<SIMD::UInt> rhs);
	RValue<SIMD::UInt> operator=(const UInt &rhs);
	RValue<SIMD::UInt> operator=(const Reference<SIMD::UInt> &rhs);

	static Type *type();
	static int element_count() { return SIMD::Width; }
};

class Float : public LValue<SIMD::Float>,
              public XYZW<SIMD::Float>  // TODO(b/214583550): Eliminate and replace with SwizzleQuad() and/or other intrinsics.
{
public:
	explicit Float(RValue<SIMD::Int> cast);
	explicit Float(RValue<SIMD::UInt> cast);

	Float();
	Float(float broadcast);
	Float(float x, float y, float z, float w);
	Float(std::vector<float> v);
	Float(std::function<float(int)> LaneValueProducer);
	Float(RValue<SIMD::Float> rhs);
	Float(const Float &rhs);
	Float(const Reference<SIMD::Float> &rhs);
	Float(RValue<scalar::Float> rhs);
	Float(const scalar::Float &rhs);
	Float(const Reference<scalar::Float> &rhs);

	Float(RValue<packed::Float4> rhs);
	RValue<SIMD::Float> operator=(RValue<packed::Float4> rhs);
	template<int T>
	Float(const SwizzleMask1<packed::Float4, T> &rhs);

	RValue<SIMD::Float> operator=(float broadcast);
	RValue<SIMD::Float> operator=(RValue<SIMD::Float> rhs);
	RValue<SIMD::Float> operator=(const Float &rhs);
	RValue<SIMD::Float> operator=(const Reference<SIMD::Float> &rhs);
	RValue<SIMD::Float> operator=(RValue<scalar::Float> rhs);
	RValue<SIMD::Float> operator=(const scalar::Float &rhs);
	RValue<SIMD::Float> operator=(const Reference<scalar::Float> &rhs);

	static SIMD::Float infinity();

	static Type *type();
	static int element_count() { return SIMD::Width; }
};
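
// Illustrative sketch (not part of the original header): the constructors
// above support broadcasting a scalar, listing individual lanes (when the
// lane count is 4), or computing each lane from its index:
//
//   SIMD::Float a(1.0f);                                  // all lanes 1.0f
//   SIMD::Float b(0.0f, 1.0f, 2.0f, 3.0f);                // explicit lanes
//   SIMD::Float c([](int lane) { return float(lane); });  // per-lane producer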

class Pointer
{
public:
	Pointer(scalar::Pointer<Byte> base, scalar::Int limit);
	Pointer(scalar::Pointer<Byte> base, unsigned int limit);
	Pointer(scalar::Pointer<Byte> base, scalar::Int limit, SIMD::Int offset);
	Pointer(scalar::Pointer<Byte> base, unsigned int limit, SIMD::Int offset);
	Pointer(std::vector<scalar::Pointer<Byte>> pointers);
	explicit Pointer(SIMD::UInt cast);                          // Cast from 32-bit integers to 32-bit pointers
	explicit Pointer(SIMD::UInt castLow, SIMD::UInt castHigh);  // Cast from pairs of 32-bit integers to 64-bit pointers

	Pointer &operator+=(SIMD::Int i);
	Pointer operator+(SIMD::Int i);
	Pointer &operator+=(int i);
	Pointer operator+(int i);

	SIMD::Int offsets() const;

	SIMD::Int isInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

	bool isStaticallyInBounds(unsigned int accessSize, OutOfBoundsBehavior robustness) const;

	Int limit() const;

	// Returns true if all offsets are compile-time static and sequential
	// (N+0*step, N+1*step, N+2*step, N+3*step)
	bool hasStaticSequentialOffsets(unsigned int step) const;

	// Returns true if all offsets are compile-time static and equal
	// (N, N, N, N)
	bool hasStaticEqualOffsets() const;

	template<typename T>
	inline T Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed, int alignment = sizeof(float));

	template<typename T>
	inline void Store(T val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

	template<typename T>
	inline void Store(RValue<T> val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic = false, std::memory_order order = std::memory_order_relaxed);

	scalar::Pointer<Byte> getUniformPointer() const;
	scalar::Pointer<Byte> getPointerForLane(int lane) const;
	static Pointer IfThenElse(SIMD::Int condition, const Pointer &lhs, const Pointer &rhs);

	void castTo(SIMD::UInt &bits) const;                              // Cast from 32-bit pointers to 32-bit integers
	void castTo(SIMD::UInt &lowerBits, SIMD::UInt &upperBits) const;  // Cast from 64-bit pointers to pairs of 32-bit integers

#ifdef ENABLE_RR_PRINT
	std::vector<rr::Value *> getPrintValues() const;
#endif

private:
	// Base address for the pointer, common across all lanes.
	scalar::Pointer<Byte> base;
	// Per-lane addresses for dealing with non-uniform data.
	std::vector<scalar::Pointer<Byte>> pointers;

public:
	// Upper (non-inclusive) limit for offsets from base.
	scalar::Int dynamicLimit;  // If hasDynamicLimit is false, dynamicLimit is zero.
	unsigned int staticLimit = 0;

	// Per-lane offsets from base.
	SIMD::Int dynamicOffsets;  // If hasDynamicOffsets is false, all dynamicOffsets are zero.
	std::vector<int32_t> staticOffsets;

	bool hasDynamicLimit = false;    // True if dynamicLimit is non-zero.
	bool hasDynamicOffsets = false;  // True if any dynamicOffsets are non-zero.
	bool isBasePlusOffset = false;   // True if this uses base+offset. False if this is a collection of pointers.
};
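
// Illustrative sketch (not part of the original header): a masked load and
// store through a base-plus-offset SIMD::Pointer. `buffer` (a
// scalar::Pointer<Byte>) and `size` are assumed to be in scope, and the lane
// count is assumed to be 4 for the explicit offset list.
//
//   SIMD::Pointer ptr(buffer, size, SIMD::Int(0, 4, 8, 12));
//   SIMD::Int mask(~0);  // all lanes enabled
//   SIMD::Float v = ptr.Load<SIMD::Float>(OutOfBoundsBehavior::Nullify, mask);
//   ptr.Store(v, OutOfBoundsBehavior::Nullify, mask);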

}  // namespace SIMD

RValue<SIMD::Int> operator+(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator-(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator*(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator/(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator%(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator&(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator|(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator^(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator<<(RValue<SIMD::Int> lhs, unsigned char rhs);
RValue<SIMD::Int> operator>>(RValue<SIMD::Int> lhs, unsigned char rhs);
RValue<SIMD::Int> operator<<(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator>>(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator+=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator-=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator*=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
//	RValue<SIMD::Int> operator/=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
//	RValue<SIMD::Int> operator%=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator&=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator|=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator^=(SIMD::Int &lhs, RValue<SIMD::Int> rhs);
RValue<SIMD::Int> operator<<=(SIMD::Int &lhs, unsigned char rhs);
RValue<SIMD::Int> operator>>=(SIMD::Int &lhs, unsigned char rhs);
RValue<SIMD::Int> operator+(RValue<SIMD::Int> val);
RValue<SIMD::Int> operator-(RValue<SIMD::Int> val);
RValue<SIMD::Int> operator~(RValue<SIMD::Int> val);
//	RValue<SIMD::Int> operator++(SIMD::Int &val, int);   // Post-increment
//	const Int &operator++(SIMD::Int &val);   // Pre-increment
//	RValue<SIMD::Int> operator--(SIMD::Int &val, int);   // Post-decrement
//	const Int &operator--(SIMD::Int &val);   // Pre-decrement
//	RValue<Bool> operator<(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator<=(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator>(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator>=(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator!=(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);
//	RValue<Bool> operator==(RValue<SIMD::Int> lhs, RValue<SIMD::Int> rhs);

RValue<SIMD::Int> CmpEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpNLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> CmpNLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
inline RValue<SIMD::Int> CmpGT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	return CmpNLE(x, y);
}
inline RValue<SIMD::Int> CmpGE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
{
	return CmpNLT(x, y);
}
RValue<SIMD::Int> Abs(RValue<SIMD::Int> x);
RValue<SIMD::Int> Max(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::Int> Min(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
// Convert to nearest integer. If a converted value is outside of the integer
// range, the returned result is undefined.
RValue<SIMD::Int> RoundInt(RValue<SIMD::Float> cast);
// Rounds to the nearest integer, but clamps very large values to an
// implementation-dependent range.
// Specifically, on x86, values larger than 2147483583.0 are converted to
// 2147483583 (0x7FFFFFBF) instead of producing 0x80000000.
RValue<SIMD::Int> RoundIntClamped(RValue<SIMD::Float> cast);
RValue<scalar::Int> Extract(RValue<SIMD::Int> val, int i);
RValue<SIMD::Int> Insert(RValue<SIMD::Int> val, RValue<scalar::Int> element, int i);
RValue<packed::Int4> Extract128(RValue<SIMD::Int> val, int i);
RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<packed::Int4> element, int i);

RValue<SIMD::UInt> operator+(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator-(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator*(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator/(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator%(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator&(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator|(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator^(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator<<(RValue<SIMD::UInt> lhs, unsigned char rhs);
RValue<SIMD::UInt> operator>>(RValue<SIMD::UInt> lhs, unsigned char rhs);
RValue<SIMD::UInt> operator<<(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator>>(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator+=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator-=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator*=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
//	RValue<SIMD::UInt> operator/=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
//	RValue<SIMD::UInt> operator%=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator&=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator|=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator^=(SIMD::UInt &lhs, RValue<SIMD::UInt> rhs);
RValue<SIMD::UInt> operator<<=(SIMD::UInt &lhs, unsigned char rhs);
RValue<SIMD::UInt> operator>>=(SIMD::UInt &lhs, unsigned char rhs);
RValue<SIMD::UInt> operator+(RValue<SIMD::UInt> val);
RValue<SIMD::UInt> operator-(RValue<SIMD::UInt> val);
RValue<SIMD::UInt> operator~(RValue<SIMD::UInt> val);
//	RValue<SIMD::UInt> operator++(SIMD::UInt &val, int);   // Post-increment
//	const UInt &operator++(SIMD::UInt &val);   // Pre-increment
//	RValue<SIMD::UInt> operator--(SIMD::UInt &val, int);   // Post-decrement
//	const UInt &operator--(SIMD::UInt &val);   // Pre-decrement
//	RValue<Bool> operator<(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator<=(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator>(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator>=(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator!=(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);
//	RValue<Bool> operator==(RValue<SIMD::UInt> lhs, RValue<SIMD::UInt> rhs);

RValue<SIMD::UInt> CmpEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpNEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpNLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> CmpNLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
inline RValue<SIMD::UInt> CmpGT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	return CmpNLE(x, y);
}
inline RValue<SIMD::UInt> CmpGE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
{
	return CmpNLT(x, y);
}
RValue<SIMD::UInt> Max(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<SIMD::UInt> Min(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<scalar::UInt> Extract(RValue<SIMD::UInt> val, int i);
RValue<SIMD::UInt> Insert(RValue<SIMD::UInt> val, RValue<scalar::UInt> element, int i);
RValue<packed::UInt4> Extract128(RValue<SIMD::UInt> val, int i);
RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<packed::UInt4> element, int i);
//	RValue<SIMD::UInt> RoundInt(RValue<SIMD::Float> cast);

RValue<SIMD::Float> operator+(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator-(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator*(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator/(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator%(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator+=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator-=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator*=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator/=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator%=(SIMD::Float &lhs, RValue<SIMD::Float> rhs);
RValue<SIMD::Float> operator+(RValue<SIMD::Float> val);
RValue<SIMD::Float> operator-(RValue<SIMD::Float> val);

// Computes `x * y + z`, which may be fused into one operation to produce a higher-precision result.
RValue<SIMD::Float> MulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z);
// Computes a fused `x * y + z` operation. Caps::fmaIsFast indicates whether it emits an FMA instruction.
RValue<SIMD::Float> FMA(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z);

RValue<SIMD::Float> Abs(RValue<SIMD::Float> x);
RValue<SIMD::Float> Max(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Float> Min(RValue<SIMD::Float> x, RValue<SIMD::Float> y);

RValue<SIMD::Float> Rcp(RValue<SIMD::Float> x, bool relaxedPrecision, bool exactAtPow2 = false);
RValue<SIMD::Float> RcpSqrt(RValue<SIMD::Float> x, bool relaxedPrecision);
RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x);
RValue<SIMD::Float> Insert(RValue<SIMD::Float> val, RValue<rr::Float> element, int i);
RValue<rr::Float> Extract(RValue<SIMD::Float> x, int i);
RValue<packed::Float4> Extract128(RValue<SIMD::Float> val, int i);
RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<packed::Float4> element, int i);

// Ordered comparison functions
RValue<SIMD::Int> CmpEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
inline RValue<SIMD::Int> CmpGT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	return CmpNLE(x, y);
}
inline RValue<SIMD::Int> CmpGE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	return CmpNLT(x, y);
}

// Unordered comparison functions
RValue<SIMD::Int> CmpUEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpULT(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpULE(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpUNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpUNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Int> CmpUNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
inline RValue<SIMD::Int> CmpUGT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	return CmpUNLE(x, y);
}
inline RValue<SIMD::Int> CmpUGE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
{
	return CmpUNLT(x, y);
}

RValue<SIMD::Int> IsInf(RValue<SIMD::Float> x);
RValue<SIMD::Int> IsNan(RValue<SIMD::Float> x);
RValue<SIMD::Float> Round(RValue<SIMD::Float> x);
RValue<SIMD::Float> Trunc(RValue<SIMD::Float> x);
RValue<SIMD::Float> Frac(RValue<SIMD::Float> x);
RValue<SIMD::Float> Floor(RValue<SIMD::Float> x);
RValue<SIMD::Float> Ceil(RValue<SIMD::Float> x);

// Trigonometric functions
RValue<SIMD::Float> Sin(RValue<SIMD::Float> x);
RValue<SIMD::Float> Cos(RValue<SIMD::Float> x);
RValue<SIMD::Float> Tan(RValue<SIMD::Float> x);
RValue<SIMD::Float> Asin(RValue<SIMD::Float> x);
RValue<SIMD::Float> Acos(RValue<SIMD::Float> x);
RValue<SIMD::Float> Atan(RValue<SIMD::Float> x);
RValue<SIMD::Float> Sinh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Cosh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Tanh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Asinh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Acosh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Atanh(RValue<SIMD::Float> x);
RValue<SIMD::Float> Atan2(RValue<SIMD::Float> x, RValue<SIMD::Float> y);

// Exponential functions
RValue<SIMD::Float> Pow(RValue<SIMD::Float> x, RValue<SIMD::Float> y);
RValue<SIMD::Float> Exp(RValue<SIMD::Float> x);
RValue<SIMD::Float> Log(RValue<SIMD::Float> x);
RValue<SIMD::Float> Exp2(RValue<SIMD::Float> x);
RValue<SIMD::Float> Log2(RValue<SIMD::Float> x);

RValue<Int> SignMask(RValue<SIMD::Int> x);
RValue<SIMD::UInt> Ctlz(RValue<SIMD::UInt> x, bool isZeroUndef);
RValue<SIMD::UInt> Cttz(RValue<SIMD::UInt> x, bool isZeroUndef);
RValue<SIMD::Int> MulHigh(RValue<SIMD::Int> x, RValue<SIMD::Int> y);
RValue<SIMD::UInt> MulHigh(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y);
RValue<Bool> AnyTrue(const RValue<SIMD::Int> &bools);
RValue<Bool> AnyFalse(const RValue<SIMD::Int> &bools);
RValue<Bool> Divergent(const RValue<SIMD::Int> &ints);
RValue<SIMD::Int> Swizzle(RValue<SIMD::Int> x, uint16_t select);
RValue<SIMD::UInt> Swizzle(RValue<SIMD::UInt> x, uint16_t select);
RValue<SIMD::Float> Swizzle(RValue<SIMD::Float> x, uint16_t select);
RValue<SIMD::Int> Shuffle(RValue<SIMD::Int> x, RValue<SIMD::Int> y, uint16_t select);
RValue<SIMD::UInt> Shuffle(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y, uint16_t select);
RValue<SIMD::Float> Shuffle(RValue<SIMD::Float> x, RValue<SIMD::Float> y, uint16_t select);
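
// Note (not part of the original header): Swizzle() rearranges the lanes of a
// single vector, while Shuffle() selects lanes from a pair of vectors. In both
// cases `select` packs one source-lane index per destination component (one
// hex nibble each for a four-wide vector) -- this encoding is inferred from
// typical Reactor usage and is an assumption, not specified in this header.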

RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes = false);
RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes = false);
void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment);
void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment);
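
// Illustrative sketch (not part of the original header): gather four floats
// from byte offsets 0, 4, 8 and 12 off `base`, zeroing lanes whose mask bit is
// clear. `base` and `mask` are assumed to be in scope; the offsets are byte
// offsets, matching what SIMD::Pointer::offsets() produces below.
//
//   SIMD::Float v = Gather(base, SIMD::Int(0, 4, 8, 12), mask,
//                          sizeof(float), /*zeroMaskedLanes=*/true);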

template<>
inline RValue<SIMD::Int>::RValue(int i)
    : val(broadcast(i, SIMD::Int::type()))
{
	RR_DEBUG_INFO_EMIT_VAR(val);
}

template<>
inline RValue<SIMD::UInt>::RValue(unsigned int i)
    : val(broadcast(int(i), SIMD::UInt::type()))
{
	RR_DEBUG_INFO_EMIT_VAR(val);
}

template<>
inline RValue<SIMD::Float>::RValue(float f)
    : val(broadcast(f, SIMD::Float::type()))
{
	RR_DEBUG_INFO_EMIT_VAR(val);
}

template<int T>
SIMD::Int::Int(const SwizzleMask1<packed::Int4, T> &rhs)
    : XYZW(this)
{
	*this = rhs.operator RValue<scalar::Int>();
}

template<int T>
SIMD::Float::Float(const SwizzleMask1<packed::Float4, T> &rhs)
    : XYZW(this)
{
	*this = rhs.operator RValue<scalar::Float>();
}

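// Load() below selects between several code paths: per-lane scalar loads when
// this pointer is a collection of independent pointers, a single vector load
// when the offsets are statically sequential and in-bounds, a scalar load
// replicated to all lanes when the offsets are statically equal, and a masked
// Gather() otherwise. Atomic or non-relaxed loads only use a single load when
// the offsets allow it and no lanes are masked off.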
template<typename T>
inline T SIMD::Pointer::Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
	using EL = typename Scalar<T>::Type;

	if(!isBasePlusOffset)
	{
		T out = T(0);
		for(int i = 0; i < SIMD::Width; i++)
		{
			If(Extract(mask, i) != 0)
			{
				auto el = rr::Load(scalar::Pointer<EL>(pointers[i]), alignment, atomic, order);
				out = Insert(out, el, i);
			}
		}
		return out;
	}

	if(isStaticallyInBounds(sizeof(float), robustness))
	{
		// All elements are statically known to be in-bounds.
		// We can avoid costly conditionals on masks.

		if(hasStaticSequentialOffsets(sizeof(float)))
		{
			// Offsets are sequential. Perform a regular load.
			return rr::Load(scalar::Pointer<T>(base + staticOffsets[0]), alignment, atomic, order);
		}

		if(hasStaticEqualOffsets())
		{
			// Load one, replicate.
			return T(*scalar::Pointer<EL>(base + staticOffsets[0], alignment));
		}
	}
	else
	{
		switch(robustness)
		{
		case OutOfBoundsBehavior::Nullify:
		case OutOfBoundsBehavior::RobustBufferAccess:
		case OutOfBoundsBehavior::UndefinedValue:
			mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds reads.
			break;
		case OutOfBoundsBehavior::UndefinedBehavior:
			// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
			break;
		}
	}

	auto offs = offsets();

	if(!atomic && order == std::memory_order_relaxed)
	{
		if(hasStaticEqualOffsets())
		{
			// Load one, replicate.
			// Be careful of the case where the post-bounds-check mask
			// is 0, in which case we must not load.
			T out = T(0);
			If(AnyTrue(mask))
			{
				EL el = *scalar::Pointer<EL>(base + staticOffsets[0], alignment);
				out = T(el);
			}
			return out;
		}

		bool zeroMaskedLanes = true;
		switch(robustness)
		{
		case OutOfBoundsBehavior::Nullify:
		case OutOfBoundsBehavior::RobustBufferAccess:  // Must either return an in-bounds value, or zero.
			zeroMaskedLanes = true;
			break;
		case OutOfBoundsBehavior::UndefinedValue:
		case OutOfBoundsBehavior::UndefinedBehavior:
			zeroMaskedLanes = false;
			break;
		}

		// TODO(b/195446858): Optimize static sequential offsets case by using masked load.

		return Gather(scalar::Pointer<EL>(base), offs, mask, alignment, zeroMaskedLanes);
	}
	else
	{
		T out;
		auto anyLanesDisabled = AnyFalse(mask);
		If(hasStaticEqualOffsets() && !anyLanesDisabled)
		{
			// Load one, replicate.
			auto offset = Extract(offs, 0);
			out = T(rr::Load(scalar::Pointer<EL>(&base[offset]), alignment, atomic, order));
		}
		Else If(hasStaticSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
		{
			// Load all elements in a single SIMD instruction.
			auto offset = Extract(offs, 0);
			out = rr::Load(scalar::Pointer<T>(&base[offset]), alignment, atomic, order);
		}
		Else
		{
			// Divergent offsets or masked lanes.
			out = T(0);
			for(int i = 0; i < SIMD::Width; i++)
			{
				If(Extract(mask, i) != 0)
				{
					auto offset = Extract(offs, i);
					auto el = rr::Load(scalar::Pointer<EL>(&base[offset]), alignment, atomic, order);
					out = Insert(out, el, i);
				}
			}
		}
		return out;
	}
}

template<>
inline SIMD::Pointer SIMD::Pointer::Load(OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */, int alignment /* = sizeof(float) */)
{
	std::vector<scalar::Pointer<Byte>> pointers(SIMD::Width);

	for(int i = 0; i < SIMD::Width; i++)
	{
		If(Extract(mask, i) != 0)
		{
			pointers[i] = rr::Load(scalar::Pointer<scalar::Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
		}
	}

	return SIMD::Pointer(pointers);
}

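// Store() below mirrors the structure of Load(): per-lane scalar stores when
// this pointer is a collection of independent pointers, a single scalar store
// from an elected lane when all offsets are statically equal, a vector
// read-modify-write when the offsets are statically sequential and in-bounds,
// and a masked Scatter() otherwise. Atomic or non-relaxed stores use a single
// vector store only when the offsets are sequential and all lanes are enabled,
// falling back to per-lane stores otherwise.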
template<typename T>
inline void SIMD::Pointer::Store(T val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
	using EL = typename Scalar<T>::Type;
	constexpr size_t alignment = sizeof(float);

	if(!isBasePlusOffset)
	{
		for(int i = 0; i < SIMD::Width; i++)
		{
			If(Extract(mask, i) != 0)
			{
				rr::Store(Extract(val, i), scalar::Pointer<EL>(pointers[i]), alignment, atomic, order);
			}
		}
		return;
	}

	auto offs = offsets();
	switch(robustness)
	{
	case OutOfBoundsBehavior::Nullify:
	case OutOfBoundsBehavior::RobustBufferAccess:       // TODO: Allows writing anywhere within bounds. Could be faster than masking.
	case OutOfBoundsBehavior::UndefinedValue:           // Should not be used for store operations. Treat as robust buffer access.
		mask &= isInBounds(sizeof(float), robustness);  // Disable out-of-bounds writes.
		break;
	case OutOfBoundsBehavior::UndefinedBehavior:
		// Nothing to do. Application/compiler must guarantee no out-of-bounds accesses.
		break;
	}

	if(!atomic && order == std::memory_order_relaxed)
	{
		if(hasStaticEqualOffsets())
		{
			If(AnyTrue(mask))
			{
				assert(SIMD::Width == 4);

				// All equal. One of these writes will win -- elect the winning lane by
				// keeping each mask lane enabled only if no lower-numbered lane is enabled.
				auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
				auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
				auto maskedVal = As<SIMD::Int>(val) & elect;
				auto scalarVal = Extract(maskedVal, 0) |
				                 Extract(maskedVal, 1) |
				                 Extract(maskedVal, 2) |
				                 Extract(maskedVal, 3);
				*scalar::Pointer<EL>(base + staticOffsets[0], alignment) = As<EL>(scalarVal);
			}
		}
		else if(hasStaticSequentialOffsets(sizeof(float)) &&
		        isStaticallyInBounds(sizeof(float), robustness))
		{
			// TODO(b/195446858): Optimize using masked store.
			// Pointer has no elements OOB, and the store is not atomic.
			// Perform a read-modify-write.
			auto p = scalar::Pointer<SIMD::Int>(base + staticOffsets[0], alignment);
			auto prev = *p;
			*p = (prev & ~mask) | (As<SIMD::Int>(val) & mask);
		}
		else
		{
			Scatter(scalar::Pointer<EL>(base), val, offs, mask, alignment);
		}
	}
	else
	{
		auto anyLanesDisabled = AnyFalse(mask);
		If(hasStaticSequentialOffsets(sizeof(float)) && !anyLanesDisabled)
		{
			// Store all elements in a single SIMD instruction.
			auto offset = Extract(offs, 0);
			rr::Store(val, scalar::Pointer<T>(&base[offset]), alignment, atomic, order);
		}
		Else
		{
			// Divergent offsets or masked lanes.
			for(int i = 0; i < SIMD::Width; i++)
			{
				If(Extract(mask, i) != 0)
				{
					auto offset = Extract(offs, i);
					rr::Store(Extract(val, i), scalar::Pointer<EL>(&base[offset]), alignment, atomic, order);
				}
			}
		}
	}
}

template<>
inline void SIMD::Pointer::Store(SIMD::Pointer val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
	constexpr size_t alignment = sizeof(void *);

	for(int i = 0; i < SIMD::Width; i++)
	{
		If(Extract(mask, i) != 0)
		{
			rr::Store(val.getPointerForLane(i), scalar::Pointer<scalar::Pointer<Byte>>(getPointerForLane(i)), alignment, atomic, order);
		}
	}
}

template<typename T>
inline void SIMD::Pointer::Store(RValue<T> val, OutOfBoundsBehavior robustness, SIMD::Int mask, bool atomic /* = false */, std::memory_order order /* = std::memory_order_relaxed */)
{
	Store(T(val), robustness, mask, atomic, order);
}

}  // namespace rr

#endif  // rr_SIMD_hpp