xref: /aosp_15_r20/external/swiftshader/src/Reactor/LLVMReactor.cpp (revision 03ce13f70fcc45d86ee91b7ee4cab1936a95046e)
1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //    http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "LLVMReactor.hpp"
16 
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "LLVMReactorDebugInfo.hpp"
20 #include "PragmaInternals.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "SIMD.hpp"
24 #include "x86.hpp"
25 
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicsX86.h"
28 #include "llvm/Support/Alignment.h"
29 #include "llvm/Support/Error.h"
30 #include "llvm/Support/ManagedStatic.h"
31 
32 #include <fstream>
33 #include <iostream>
34 #include <mutex>
35 #include <numeric>
36 #include <thread>
37 #include <unordered_map>
38 
39 #if defined(__i386__) || defined(__x86_64__)
40 #	include <xmmintrin.h>
41 #endif
42 
43 #include <math.h>
44 
45 #if defined(__x86_64__) && defined(_WIN32)
// Stub definition of the X86CompilationCallback symbol (only compiled for
// 64-bit x86 Windows builds). It reports itself as unimplemented if invoked.
extern "C" void X86CompilationCallback()
{
	UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
}
50 #endif
51 
52 #if !LLVM_ENABLE_THREADS
53 #	error "LLVM_ENABLE_THREADS needs to be enabled"
54 #endif
55 
56 #if LLVM_VERSION_MAJOR < 11
57 namespace llvm {
58 using FixedVectorType = VectorType;
59 }  // namespace llvm
60 #endif
61 
62 namespace {
63 
64 // Used to automatically invoke llvm_shutdown() when driver is unloaded
65 llvm::llvm_shutdown_obj llvmShutdownObj;
66 
67 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
68 // for destructing objects at exit. See crbug.com/1074222
69 thread_local rr::JITBuilder *jit = nullptr;
70 
getNumElements(llvm::FixedVectorType * vec)71 auto getNumElements(llvm::FixedVectorType *vec)
72 {
73 #if LLVM_VERSION_MAJOR >= 11
74 	return vec->getElementCount();
75 #else
76 	return vec->getNumElements();
77 #endif
78 }
79 
lowerPAVG(llvm::Value * x,llvm::Value * y)80 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
81 {
82 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
83 
84 	llvm::VectorType *extTy =
85 	    llvm::VectorType::getExtendedElementVectorType(ty);
86 	x = jit->builder->CreateZExt(x, extTy);
87 	y = jit->builder->CreateZExt(y, extTy);
88 
89 	// (x + y + 1) >> 1
90 	llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
91 	llvm::Value *res = jit->builder->CreateAdd(x, y);
92 	res = jit->builder->CreateAdd(res, one);
93 	res = jit->builder->CreateLShr(res, one);
94 	return jit->builder->CreateTrunc(res, ty);
95 }
96 
lowerPMINMAX(llvm::Value * x,llvm::Value * y,llvm::ICmpInst::Predicate pred)97 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
98                           llvm::ICmpInst::Predicate pred)
99 {
100 	return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
101 }
102 
lowerPCMP(llvm::ICmpInst::Predicate pred,llvm::Value * x,llvm::Value * y,llvm::Type * dstTy)103 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
104                        llvm::Value *y, llvm::Type *dstTy)
105 {
106 	return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
107 }
108 
lowerPFMINMAX(llvm::Value * x,llvm::Value * y,llvm::FCmpInst::Predicate pred)109 [[maybe_unused]] llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
110                                             llvm::FCmpInst::Predicate pred)
111 {
112 	return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
113 }
114 
lowerRound(llvm::Value * x)115 [[maybe_unused]] llvm::Value *lowerRound(llvm::Value *x)
116 {
117 	llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
118 	    jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
119 	return jit->builder->CreateCall(nearbyint, { x });
120 }
121 
lowerRoundInt(llvm::Value * x,llvm::Type * ty)122 [[maybe_unused]] llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
123 {
124 	return jit->builder->CreateFPToSI(lowerRound(x), ty);
125 }
126 
lowerFloor(llvm::Value * x)127 [[maybe_unused]] llvm::Value *lowerFloor(llvm::Value *x)
128 {
129 	llvm::Function *floor = llvm::Intrinsic::getDeclaration(
130 	    jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
131 	return jit->builder->CreateCall(floor, { x });
132 }
133 
lowerTrunc(llvm::Value * x)134 [[maybe_unused]] llvm::Value *lowerTrunc(llvm::Value *x)
135 {
136 	llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
137 	    jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
138 	return jit->builder->CreateCall(trunc, { x });
139 }
140 
lowerSQRT(llvm::Value * x)141 [[maybe_unused]] llvm::Value *lowerSQRT(llvm::Value *x)
142 {
143 	llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
144 	    jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
145 	return jit->builder->CreateCall(sqrt, { x });
146 }
147 
lowerRCP(llvm::Value * x)148 [[maybe_unused]] llvm::Value *lowerRCP(llvm::Value *x)
149 {
150 	llvm::Type *ty = x->getType();
151 	llvm::Constant *one;
152 	if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
153 	{
154 		one = llvm::ConstantVector::getSplat(getNumElements(vectorTy),
155 		                                     llvm::ConstantFP::get(vectorTy->getElementType(), 1));
156 	}
157 	else
158 	{
159 		one = llvm::ConstantFP::get(ty, 1);
160 	}
161 	return jit->builder->CreateFDiv(one, x);
162 }
163 
lowerRSQRT(llvm::Value * x)164 [[maybe_unused]] llvm::Value *lowerRSQRT(llvm::Value *x)
165 {
166 	return lowerRCP(lowerSQRT(x));
167 }
168 
lowerVectorShl(llvm::Value * x,uint64_t scalarY)169 [[maybe_unused]] llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
170 {
171 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
172 	llvm::Value *y = llvm::ConstantVector::getSplat(getNumElements(ty),
173 	                                                llvm::ConstantInt::get(ty->getElementType(), scalarY));
174 	return jit->builder->CreateShl(x, y);
175 }
176 
lowerVectorAShr(llvm::Value * x,uint64_t scalarY)177 [[maybe_unused]] llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
178 {
179 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
180 	llvm::Value *y = llvm::ConstantVector::getSplat(getNumElements(ty),
181 	                                                llvm::ConstantInt::get(ty->getElementType(), scalarY));
182 	return jit->builder->CreateAShr(x, y);
183 }
184 
lowerVectorLShr(llvm::Value * x,uint64_t scalarY)185 [[maybe_unused]] llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
186 {
187 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
188 	llvm::Value *y = llvm::ConstantVector::getSplat(getNumElements(ty),
189 	                                                llvm::ConstantInt::get(ty->getElementType(), scalarY));
190 	return jit->builder->CreateLShr(x, y);
191 }
192 
lowerShuffleVector(llvm::Value * v1,llvm::Value * v2,llvm::ArrayRef<int> select)193 llvm::Value *lowerShuffleVector(llvm::Value *v1, llvm::Value *v2, llvm::ArrayRef<int> select)
194 {
195 	int size = select.size();
196 	const int maxSize = 16;
197 	llvm::Constant *swizzle[maxSize];
198 	ASSERT(size <= maxSize);
199 
200 	for(int i = 0; i < size; i++)
201 	{
202 		swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
203 	}
204 
205 	llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
206 
207 	return jit->builder->CreateShuffleVector(v1, v2, shuffle);
208 }
209 
lowerMulAdd(llvm::Value * x,llvm::Value * y)210 [[maybe_unused]] llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
211 {
212 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
213 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
214 
215 	llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
216 	llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
217 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
218 
219 	llvm::Value *undef = llvm::UndefValue::get(extTy);
220 
221 	llvm::SmallVector<int, 16> evenIdx;
222 	llvm::SmallVector<int, 16> oddIdx;
223 	for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
224 	{
225 		evenIdx.push_back(i);
226 		oddIdx.push_back(i + 1);
227 	}
228 
229 	llvm::Value *lhs = lowerShuffleVector(mult, undef, evenIdx);
230 	llvm::Value *rhs = lowerShuffleVector(mult, undef, oddIdx);
231 	return jit->builder->CreateAdd(lhs, rhs);
232 }
233 
lowerPack(llvm::Value * x,llvm::Value * y,bool isSigned)234 [[maybe_unused]] llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
235 {
236 	llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
237 	llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
238 
239 	llvm::IntegerType *dstElemTy =
240 	    llvm::cast<llvm::IntegerType>(dstTy->getElementType());
241 
242 	uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
243 	ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
244 	llvm::Constant *max, *min;
245 	if(isSigned)
246 	{
247 		max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
248 		min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
249 	}
250 	else
251 	{
252 		max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
253 		min = llvm::ConstantInt::get(srcTy, 0, false);
254 	}
255 
256 	x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
257 	x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
258 	y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
259 	y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
260 
261 	x = jit->builder->CreateTrunc(x, dstTy);
262 	y = jit->builder->CreateTrunc(y, dstTy);
263 
264 	llvm::SmallVector<int, 16> index(srcTy->getNumElements() * 2);
265 	std::iota(index.begin(), index.end(), 0);
266 
267 	return lowerShuffleVector(x, y, index);
268 }
269 
lowerSignMask(llvm::Value * x,llvm::Type * retTy)270 [[maybe_unused]] llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
271 {
272 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
273 	llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
274 	llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
275 
276 	llvm::Value *ret = jit->builder->CreateZExt(
277 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
278 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
279 	{
280 		llvm::Value *elem = jit->builder->CreateZExt(
281 		    jit->builder->CreateExtractElement(cmp, i), retTy);
282 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
283 	}
284 	return ret;
285 }
286 
lowerFPSignMask(llvm::Value * x,llvm::Type * retTy)287 [[maybe_unused]] llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
288 {
289 	llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
290 	llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
291 	llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
292 
293 	llvm::Value *ret = jit->builder->CreateZExt(
294 	    jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
295 	for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
296 	{
297 		llvm::Value *elem = jit->builder->CreateZExt(
298 		    jit->builder->CreateExtractElement(cmp, i), retTy);
299 		ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
300 	}
301 	return ret;
302 }
303 
lowerPUADDSAT(llvm::Value * x,llvm::Value * y)304 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
305 {
306 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
307 }
308 
lowerPSADDSAT(llvm::Value * x,llvm::Value * y)309 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
310 {
311 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
312 }
313 
lowerPUSUBSAT(llvm::Value * x,llvm::Value * y)314 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
315 {
316 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
317 }
318 
lowerPSSUBSAT(llvm::Value * x,llvm::Value * y)319 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
320 {
321 	return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
322 }
323 
lowerMulHigh(llvm::Value * x,llvm::Value * y,bool sext)324 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
325 {
326 	llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
327 	llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
328 
329 	llvm::Value *extX, *extY;
330 	if(sext)
331 	{
332 		extX = jit->builder->CreateSExt(x, extTy);
333 		extY = jit->builder->CreateSExt(y, extTy);
334 	}
335 	else
336 	{
337 		extX = jit->builder->CreateZExt(x, extTy);
338 		extY = jit->builder->CreateZExt(y, extTy);
339 	}
340 
341 	llvm::Value *mult = jit->builder->CreateMul(extX, extY);
342 
343 	llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
344 	llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
345 	return jit->builder->CreateTrunc(mulh, ty);
346 }
347 
348 // TODO(crbug.com/swiftshader/185): A temporary workaround for failing chromium tests.
clampForShift(llvm::Value * rhs)349 llvm::Value *clampForShift(llvm::Value *rhs)
350 {
351 	llvm::Value *max;
352 	if(auto *vec = llvm::dyn_cast<llvm::FixedVectorType>(rhs->getType()))
353 	{
354 		auto N = vec->getElementType()->getIntegerBitWidth() - 1;
355 		max = llvm::ConstantVector::getSplat(getNumElements(vec), llvm::ConstantInt::get(vec->getElementType(), N));
356 	}
357 	else
358 	{
359 		auto N = rhs->getType()->getIntegerBitWidth() - 1;
360 		max = llvm::ConstantInt::get(rhs->getType(), N);
361 	}
362 	return jit->builder->CreateSelect(jit->builder->CreateICmpULE(rhs, max), rhs, max);
363 }
364 
365 }  // namespace
366 
367 namespace rr {
368 
369 const int SIMD::Width = 4;
370 
backendName()371 std::string Caps::backendName()
372 {
373 	return std::string("LLVM ") + LLVM_VERSION_STRING;
374 }
375 
// Reports whether this backend supports coroutines; always true here.
bool Caps::coroutinesSupported()
{
	return true;
}
380 
fmaIsFast()381 bool Caps::fmaIsFast()
382 {
383 	static bool AVX2 = CPUID::supportsAVX2();  // Also checks for FMA support
384 
385 	// If x86 FMA instructions are supported, assume LLVM will emit them instead of making calls to std::fma().
386 	return AVX2;
387 }
388 
389 // The abstract Type* types are implemented as LLVM types, except that
390 // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
391 // and VFP in ARM, and eliminate the overhead of converting them to explicit
392 // 128-bit ones. LLVM types are pointers, so we can represent emulated types
393 // as abstract pointers with small enum values.
// Tags for the emulated short-vector types. The enumerator values are
// reinterpreted as abstract Type* pointers, so their order and the position
// of EmulatedTypeCount must not change without updating asInternalType().
enum InternalType : uintptr_t
{
	// Emulated types:
	Type_v2i32,  // Backed by Int4; logical size 8 bytes.
	Type_v4i16,  // Backed by Short8; logical size 8 bytes.
	Type_v2i16,  // Backed by Short8; logical size 4 bytes.
	Type_v8i8,   // Backed by Byte16; logical size 8 bytes.
	Type_v4i8,   // Backed by Byte16; logical size 4 bytes.
	Type_v2f32,  // Backed by Float4; logical size 8 bytes.
	EmulatedTypeCount,  // Number of emulated types; values below this are tags.
	// Returned by asInternalType() to indicate that the abstract Type*
	// should be interpreted as LLVM type pointer:
	Type_LLVM
};
408 
asInternalType(Type * type)409 inline InternalType asInternalType(Type *type)
410 {
411 	InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
412 	return (t < EmulatedTypeCount) ? t : Type_LLVM;
413 }
414 
T(Type * t)415 llvm::Type *T(Type *t)
416 {
417 	// Use 128-bit vectors to implement logically shorter ones.
418 	switch(asInternalType(t))
419 	{
420 	case Type_v2i32: return T(Int4::type());
421 	case Type_v4i16: return T(Short8::type());
422 	case Type_v2i16: return T(Short8::type());
423 	case Type_v8i8: return T(Byte16::type());
424 	case Type_v4i8: return T(Byte16::type());
425 	case Type_v2f32: return T(Float4::type());
426 	case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
427 	default:
428 		UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
429 		return nullptr;
430 	}
431 }
432 
T(InternalType t)433 Type *T(InternalType t)
434 {
435 	return reinterpret_cast<Type *>(t);
436 }
437 
T(const std::vector<Type * > & t)438 inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
439 {
440 	return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
441 }
442 
B(BasicBlock * t)443 inline llvm::BasicBlock *B(BasicBlock *t)
444 {
445 	return reinterpret_cast<llvm::BasicBlock *>(t);
446 }
447 
B(llvm::BasicBlock * t)448 inline BasicBlock *B(llvm::BasicBlock *t)
449 {
450 	return reinterpret_cast<BasicBlock *>(t);
451 }
452 
typeSize(Type * type)453 static size_t typeSize(Type *type)
454 {
455 	switch(asInternalType(type))
456 	{
457 	case Type_v2i32: return 8;
458 	case Type_v4i16: return 8;
459 	case Type_v2i16: return 4;
460 	case Type_v8i8: return 8;
461 	case Type_v4i8: return 4;
462 	case Type_v2f32: return 8;
463 	case Type_LLVM:
464 		{
465 			llvm::Type *t = T(type);
466 
467 			if(t->isPointerTy())
468 			{
469 				return sizeof(void *);
470 			}
471 
472 			// At this point we should only have LLVM 'primitive' types.
473 			unsigned int bits = t->getPrimitiveSizeInBits();
474 			ASSERT_MSG(bits != 0, "bits: %d", int(bits));
475 
476 			// TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
477 			// but are typically stored as one byte. The DataLayout structure should
478 			// be used here and many other places if this assumption fails.
479 			return (bits + 7) / 8;
480 		}
481 		break;
482 	default:
483 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
484 		return 0;
485 	}
486 }
487 
createFunction(const char * name,llvm::Type * retTy,const std::vector<llvm::Type * > & params)488 static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> &params)
489 {
490 	llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
491 	auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
492 
493 	func->setLinkage(llvm::GlobalValue::ExternalLinkage);
494 	func->setDoesNotThrow();
495 	func->setCallingConv(llvm::CallingConv::C);
496 
497 	if(__has_feature(address_sanitizer))
498 	{
499 		func->addFnAttr(llvm::Attribute::SanitizeAddress);
500 	}
501 
502 	func->addFnAttr("warn-stack-size", "524288");  // Warn when a function uses more than 512 KiB of stack memory
503 
504 	return func;
505 }
506 
// Sets up the per-thread code generation state: allocates the thread-local
// JITBuilder and the unmaterialized-variables set. Both are expected to be
// null on entry, i.e. only one Nucleus may be live per thread at a time.
Nucleus::Nucleus()
{
#if !__has_feature(memory_sanitizer)
	// thread_local variables in shared libraries are initialized at load-time,
	// but this is not observed by MemorySanitizer if the loader itself was not
	// instrumented, leading to false-positive uninitialized variable errors.
	ASSERT(jit == nullptr);
	ASSERT(Variable::unmaterializedVariables == nullptr);
#endif

	// Raw owning pointers; released and reset in ~Nucleus().
	jit = new JITBuilder();
	Variable::unmaterializedVariables = new Variable::UnmaterializedVariables();
}
520 
// Releases the state allocated by the constructor and resets both pointers to
// null so that a new Nucleus can be created on this thread afterwards.
Nucleus::~Nucleus()
{
	delete Variable::unmaterializedVariables;
	Variable::unmaterializedVariables = nullptr;

	delete jit;
	jit = nullptr;
}
529 
// Finalizes the function under construction and compiles it into a Routine.
// If the current insert block does not end in a terminator, a return is
// appended first (void, or an undefined value of the function's return type).
// The optimization passes and code generation then run either inline or, when
// JIT_IN_SEPARATE_THREAD is defined, on a freshly spawned thread that is
// joined before returning.
std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name)
{
	// Make sure the last block is properly terminated.
	if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
	{
		llvm::Type *type = jit->function->getReturnType();

		if(type->isVoidTy())
		{
			createRetVoid();
		}
		else
		{
			createRet(V(llvm::UndefValue::get(type)));
		}
	}

	std::shared_ptr<Routine> routine;

	// The lambda captures `routine` and `name` by reference; this is safe
	// because it is either called directly or on a thread joined below.
	auto acquire = [&](rr::JITBuilder *jit) {
	// ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
	// it needs to only use the jit variable passed in as an argument.

#ifdef ENABLE_RR_DEBUG_INFO
		if(jit->debugInfo != nullptr)
		{
			jit->debugInfo->Finalize();
		}
#endif  // ENABLE_RR_DEBUG_INFO

		// Debugging aid, disabled by default: dump the unoptimized module.
		if(false)
		{
			std::error_code error;
			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
			jit->module->print(file, 0);
		}

		jit->runPasses();

		// Debugging aid, disabled by default: dump the optimized module.
		if(false)
		{
			std::error_code error;
			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
			jit->module->print(file, 0);
		}

		routine = jit->acquireRoutine(name, &jit->function, 1);
	};

#ifdef JIT_IN_SEPARATE_THREAD
	// Perform optimizations and codegen in a separate thread to avoid stack overflow.
	// FIXME(b/149829034): This is not a long-term solution. Reactor has no control
	// over the threading and stack sizes of its users, so this should be addressed
	// at a higher level instead.
	std::thread thread(acquire, jit);
	thread.join();
#else
	acquire(jit);
#endif

	return routine;
}
591 
allocateStackVariable(Type * type,int arraySize)592 Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
593 {
594 	// Need to allocate it in the entry block for mem2reg to work
595 	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();
596 
597 	llvm::Instruction *declaration;
598 
599 #if LLVM_VERSION_MAJOR >= 11
600 	auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
601 #else
602 	auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
603 #endif
604 
605 	if(arraySize)
606 	{
607 		Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
608 		declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
609 	}
610 	else
611 	{
612 		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
613 	}
614 
615 #if LLVM_VERSION_MAJOR >= 16
616 	declaration->insertInto(&entryBlock, entryBlock.begin());
617 #else
618 	entryBlock.getInstList().push_front(declaration);
619 #endif
620 
621 	if(getPragmaState(InitializeLocalVariables))
622 	{
623 		llvm::Type *i8PtrTy = llvm::Type::getInt8Ty(*jit->context)->getPointerTo();
624 		llvm::Type *i32Ty = llvm::Type::getInt32Ty(*jit->context);
625 		llvm::Function *memset = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::memset, { i8PtrTy, i32Ty });
626 
627 		jit->builder->CreateCall(memset, { jit->builder->CreatePointerCast(declaration, i8PtrTy),
628 		                                   V(Nucleus::createConstantByte((unsigned char)0)),
629 		                                   V(Nucleus::createConstantInt((int)typeSize(type) * (arraySize ? arraySize : 1))),
630 		                                   V(Nucleus::createConstantBool(false)) });
631 	}
632 
633 	return V(declaration);
634 }
635 
createBasicBlock()636 BasicBlock *Nucleus::createBasicBlock()
637 {
638 	return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
639 }
640 
getInsertBlock()641 BasicBlock *Nucleus::getInsertBlock()
642 {
643 	return B(jit->builder->GetInsertBlock());
644 }
645 
setInsertBlock(BasicBlock * basicBlock)646 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
647 {
648 	// assert(jit->builder->GetInsertBlock()->back().isTerminator());
649 
650 	jit->builder->SetInsertPoint(B(basicBlock));
651 }
652 
createFunction(Type * ReturnType,const std::vector<Type * > & Params)653 void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
654 {
655 	jit->function = rr::createFunction("", T(ReturnType), T(Params));
656 
657 #ifdef ENABLE_RR_DEBUG_INFO
658 	jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
659 #endif  // ENABLE_RR_DEBUG_INFO
660 
661 	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
662 }
663 
getArgument(unsigned int index)664 Value *Nucleus::getArgument(unsigned int index)
665 {
666 	llvm::Function::arg_iterator args = jit->function->arg_begin();
667 
668 	while(index)
669 	{
670 		args++;
671 		index--;
672 	}
673 
674 	return V(&*args);
675 }
676 
createRetVoid()677 void Nucleus::createRetVoid()
678 {
679 	RR_DEBUG_INFO_UPDATE_LOC();
680 
681 	ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");
682 
683 	// Code generated after this point is unreachable, so any variables
684 	// being read can safely return an undefined value. We have to avoid
685 	// materializing variables after the terminator ret instruction.
686 	Variable::killUnmaterialized();
687 
688 	jit->builder->CreateRetVoid();
689 }
690 
createRet(Value * v)691 void Nucleus::createRet(Value *v)
692 {
693 	RR_DEBUG_INFO_UPDATE_LOC();
694 
695 	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");
696 
697 	// Code generated after this point is unreachable, so any variables
698 	// being read can safely return an undefined value. We have to avoid
699 	// materializing variables after the terminator ret instruction.
700 	Variable::killUnmaterialized();
701 
702 	jit->builder->CreateRet(V(v));
703 }
704 
createBr(BasicBlock * dest)705 void Nucleus::createBr(BasicBlock *dest)
706 {
707 	RR_DEBUG_INFO_UPDATE_LOC();
708 	Variable::materializeAll();
709 
710 	jit->builder->CreateBr(B(dest));
711 }
712 
createCondBr(Value * cond,BasicBlock * ifTrue,BasicBlock * ifFalse)713 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
714 {
715 	RR_DEBUG_INFO_UPDATE_LOC();
716 	Variable::materializeAll();
717 	jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
718 }
719 
createAdd(Value * lhs,Value * rhs)720 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
721 {
722 	RR_DEBUG_INFO_UPDATE_LOC();
723 	return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
724 }
725 
createSub(Value * lhs,Value * rhs)726 Value *Nucleus::createSub(Value *lhs, Value *rhs)
727 {
728 	RR_DEBUG_INFO_UPDATE_LOC();
729 	return V(jit->builder->CreateSub(V(lhs), V(rhs)));
730 }
731 
createMul(Value * lhs,Value * rhs)732 Value *Nucleus::createMul(Value *lhs, Value *rhs)
733 {
734 	RR_DEBUG_INFO_UPDATE_LOC();
735 	return V(jit->builder->CreateMul(V(lhs), V(rhs)));
736 }
737 
createUDiv(Value * lhs,Value * rhs)738 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
739 {
740 	RR_DEBUG_INFO_UPDATE_LOC();
741 	return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
742 }
743 
createSDiv(Value * lhs,Value * rhs)744 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
745 {
746 	RR_DEBUG_INFO_UPDATE_LOC();
747 	return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
748 }
749 
createFAdd(Value * lhs,Value * rhs)750 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
751 {
752 	RR_DEBUG_INFO_UPDATE_LOC();
753 	return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
754 }
755 
createFSub(Value * lhs,Value * rhs)756 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
757 {
758 	RR_DEBUG_INFO_UPDATE_LOC();
759 	return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
760 }
761 
createFMul(Value * lhs,Value * rhs)762 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
763 {
764 	RR_DEBUG_INFO_UPDATE_LOC();
765 	return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
766 }
767 
createFDiv(Value * lhs,Value * rhs)768 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
769 {
770 	RR_DEBUG_INFO_UPDATE_LOC();
771 	return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
772 }
773 
createURem(Value * lhs,Value * rhs)774 Value *Nucleus::createURem(Value *lhs, Value *rhs)
775 {
776 	RR_DEBUG_INFO_UPDATE_LOC();
777 	return V(jit->builder->CreateURem(V(lhs), V(rhs)));
778 }
779 
createSRem(Value * lhs,Value * rhs)780 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
781 {
782 	RR_DEBUG_INFO_UPDATE_LOC();
783 	return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
784 }
785 
createFRem(Value * lhs,Value * rhs)786 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
787 {
788 	RR_DEBUG_INFO_UPDATE_LOC();
789 	return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
790 }
791 
operator %(RValue<Float4> lhs,RValue<Float4> rhs)792 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
793 {
794 	return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
795 }
796 
createShl(Value * lhs,Value * rhs)797 Value *Nucleus::createShl(Value *lhs, Value *rhs)
798 {
799 	RR_DEBUG_INFO_UPDATE_LOC();
800 	auto *clamped_rhs = clampForShift(V(rhs));
801 	return V(jit->builder->CreateShl(V(lhs), clamped_rhs));
802 }
803 
createLShr(Value * lhs,Value * rhs)804 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
805 {
806 	RR_DEBUG_INFO_UPDATE_LOC();
807 	auto *clamped_rhs = clampForShift(V(rhs));
808 	return V(jit->builder->CreateLShr(V(lhs), clamped_rhs));
809 }
810 
createAShr(Value * lhs,Value * rhs)811 Value *Nucleus::createAShr(Value *lhs, Value *rhs)
812 {
813 	RR_DEBUG_INFO_UPDATE_LOC();
814 	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
815 }
816 
createAnd(Value * lhs,Value * rhs)817 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
818 {
819 	RR_DEBUG_INFO_UPDATE_LOC();
820 	return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
821 }
822 
createOr(Value * lhs,Value * rhs)823 Value *Nucleus::createOr(Value *lhs, Value *rhs)
824 {
825 	RR_DEBUG_INFO_UPDATE_LOC();
826 	return V(jit->builder->CreateOr(V(lhs), V(rhs)));
827 }
828 
createXor(Value * lhs,Value * rhs)829 Value *Nucleus::createXor(Value *lhs, Value *rhs)
830 {
831 	RR_DEBUG_INFO_UPDATE_LOC();
832 	return V(jit->builder->CreateXor(V(lhs), V(rhs)));
833 }
834 
createNeg(Value * v)835 Value *Nucleus::createNeg(Value *v)
836 {
837 	RR_DEBUG_INFO_UPDATE_LOC();
838 	return V(jit->builder->CreateNeg(V(v)));
839 }
840 
createFNeg(Value * v)841 Value *Nucleus::createFNeg(Value *v)
842 {
843 	RR_DEBUG_INFO_UPDATE_LOC();
844 	return V(jit->builder->CreateFNeg(V(v)));
845 }
846 
createNot(Value * v)847 Value *Nucleus::createNot(Value *v)
848 {
849 	RR_DEBUG_INFO_UPDATE_LOC();
850 	return V(jit->builder->CreateNot(V(v)));
851 }
852 
createLoad(Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)853 Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
854 {
855 	RR_DEBUG_INFO_UPDATE_LOC();
856 	switch(asInternalType(type))
857 	{
858 	case Type_v2i32:
859 	case Type_v4i16:
860 	case Type_v8i8:
861 	case Type_v2f32:
862 		return createBitCast(
863 		    createInsertElement(
864 		        V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
865 		        createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
866 		        0),
867 		    type);
868 	case Type_v2i16:
869 	case Type_v4i8:
870 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
871 		{
872 			Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
873 			Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
874 			i = createZExt(i, Long::type());
875 			Value *v = createInsertElement(u, i, 0);
876 			return createBitCast(v, type);
877 		}
878 		// Fallthrough to non-emulated case.
879 	case Type_LLVM:
880 		{
881 			auto elTy = T(type);
882 
883 			if(!atomic)
884 			{
885 				return V(jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile));
886 			}
887 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
888 			{
889 				// Integers and pointers can be atomically loaded by setting
890 				// the ordering constraint on the load instruction.
891 				auto load = jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile);
892 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
893 				return V(load);
894 			}
895 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
896 			{
897 				// LLVM claims to support atomic loads of float types as
898 				// above, but certain backends cannot deal with this.
899 				// Load as an integer and bitcast. See b/136037244.
900 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
901 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
902 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
903 				auto load = jit->builder->CreateAlignedLoad(elAsIntTy, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
904 				load->setAtomic(atomicOrdering(atomic, memoryOrder));
905 				auto loadCast = jit->builder->CreateBitCast(load, elTy);
906 				return V(loadCast);
907 			}
908 			else
909 			{
910 				// More exotic types require falling back to the extern:
911 				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
912 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
913 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
914 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
915 				auto i8PtrTy = i8Ty->getPointerTo();
916 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
917 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
918 				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
919 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
920 				auto out = allocateStackVariable(type);
921 				jit->builder->CreateCall(func, {
922 				                                   llvm::ConstantInt::get(sizetTy, size),
923 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
924 				                                   jit->builder->CreatePointerCast(V(out), i8PtrTy),
925 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
926 				                               });
927 				return V(jit->builder->CreateLoad(T(type), V(out)));
928 			}
929 		}
930 	default:
931 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
932 		return nullptr;
933 	}
934 }
935 
createStore(Value * value,Value * ptr,Type * type,bool isVolatile,unsigned int alignment,bool atomic,std::memory_order memoryOrder)936 Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
937 {
938 	RR_DEBUG_INFO_UPDATE_LOC();
939 	switch(asInternalType(type))
940 	{
941 	case Type_v2i32:
942 	case Type_v4i16:
943 	case Type_v8i8:
944 	case Type_v2f32:
945 		createStore(
946 		    createExtractElement(
947 		        createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
948 		    createBitCast(ptr, Pointer<Long>::type()),
949 		    Long::type(), isVolatile, alignment, atomic, memoryOrder);
950 		return value;
951 	case Type_v2i16:
952 	case Type_v4i8:
953 		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
954 		{
955 			createStore(
956 			    createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
957 			    createBitCast(ptr, Pointer<Int>::type()),
958 			    Int::type(), isVolatile, alignment, atomic, memoryOrder);
959 			return value;
960 		}
961 		// Fallthrough to non-emulated case.
962 	case Type_LLVM:
963 		{
964 			auto elTy = T(type);
965 
966 			if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
967 			{
968 				// Mark all memory writes as initialized by calling __msan_unpoison
969 				// void __msan_unpoison(const volatile void *a, size_t size)
970 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
971 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
972 				auto voidPtrTy = i8Ty->getPointerTo();
973 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
974 				auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
975 				auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
976 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
977 
978 				jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
979 				                                 llvm::ConstantInt::get(sizetTy, size) });
980 			}
981 
982 			if(!atomic)
983 			{
984 				jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
985 			}
986 			else if(elTy->isIntegerTy() || elTy->isPointerTy())
987 			{
988 				// Integers and pointers can be atomically stored by setting
989 				// the ordering constraint on the store instruction.
990 				auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
991 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
992 			}
993 			else if(elTy->isFloatTy() || elTy->isDoubleTy())
994 			{
995 				// LLVM claims to support atomic stores of float types as
996 				// above, but certain backends cannot deal with this.
997 				// Store as an bitcast integer. See b/136037244.
998 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
999 				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
1000 				auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
1001 				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
1002 				auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
1003 				store->setAtomic(atomicOrdering(atomic, memoryOrder));
1004 			}
1005 			else
1006 			{
1007 				// More exotic types require falling back to the extern:
1008 				// void __atomic_store(size_t size, void *ptr, void *val, int ordering)
1009 				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1010 				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
1011 				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1012 				auto i8PtrTy = i8Ty->getPointerTo();
1013 				auto voidTy = llvm::Type::getVoidTy(*jit->context);
1014 				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
1015 				auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
1016 				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
1017 				auto copy = allocateStackVariable(type);
1018 				jit->builder->CreateStore(V(value), V(copy));
1019 				jit->builder->CreateCall(func, {
1020 				                                   llvm::ConstantInt::get(sizetTy, size),
1021 				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
1022 				                                   jit->builder->CreatePointerCast(V(copy), i8PtrTy),
1023 				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
1024 				                               });
1025 			}
1026 
1027 			return value;
1028 		}
1029 	default:
1030 		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
1031 		return nullptr;
1032 	}
1033 }
1034 
createMaskedLoad(Value * ptr,Type * elTy,Value * mask,unsigned int alignment,bool zeroMaskedLanes)1035 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1036 {
1037 	RR_DEBUG_INFO_UPDATE_LOC();
1038 
1039 	ASSERT(V(ptr)->getType()->isPointerTy());
1040 	ASSERT(V(mask)->getType()->isVectorTy());
1041 
1042 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1043 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1044 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1045 	auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1046 	auto elVecPtrTy = elVecTy->getPointerTo();
1047 	auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1048 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1049 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1050 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1051 	return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1052 }
1053 
createMaskedStore(Value * ptr,Value * val,Value * mask,unsigned int alignment)1054 void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
1055 {
1056 	RR_DEBUG_INFO_UPDATE_LOC();
1057 
1058 	ASSERT(V(ptr)->getType()->isPointerTy());
1059 	ASSERT(V(val)->getType()->isVectorTy());
1060 	ASSERT(V(mask)->getType()->isVectorTy());
1061 
1062 	auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1063 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1064 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1065 	auto elVecTy = V(val)->getType();
1066 	auto elVecPtrTy = elVecTy->getPointerTo();
1067 	auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1068 	auto align = llvm::ConstantInt::get(i32Ty, alignment);
1069 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
1070 	jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });
1071 
1072 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
1073 	{
1074 		// Mark memory writes as initialized by calling __msan_unpoison
1075 		// void __msan_unpoison(const volatile void *a, size_t size)
1076 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
1077 		auto voidPtrTy = voidTy->getPointerTo();
1078 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
1079 		auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
1080 		auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
1081 		auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());
1082 
1083 		for(unsigned i = 0; i < numEls; i++)
1084 		{
1085 			// Check mask for this element
1086 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1087 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1088 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1089 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1090 			jit->builder->SetInsertPoint(thenBlock);
1091 
1092 			// Insert __msan_unpoison call in conditional block
1093 			auto elPtr = jit->builder->CreateGEP(elVecTy, V(ptr), idx);
1094 			jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
1095 			                                 llvm::ConstantInt::get(sizetTy, size) });
1096 
1097 			jit->builder->CreateBr(mergeBlock);
1098 			jit->builder->SetInsertPoint(mergeBlock);
1099 		}
1100 	}
1101 }
1102 
createGather(llvm::Value * base,llvm::Type * elTy,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment,bool zeroMaskedLanes)1103 static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1104 {
1105 	ASSERT(base->getType()->isPointerTy());
1106 	ASSERT(offsets->getType()->isVectorTy());
1107 	ASSERT(mask->getType()->isVectorTy());
1108 
1109 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1110 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1111 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1112 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1113 	auto i8PtrTy = i8Ty->getPointerTo();
1114 	auto elPtrTy = elTy->getPointerTo();
1115 	auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
1116 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1117 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1118 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1119 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1120 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1121 	auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1122 
1123 	if(!__has_feature(memory_sanitizer))
1124 	{
1125 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1126 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
1127 		return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
1128 	}
1129 	else  // __has_feature(memory_sanitizer)
1130 	{
1131 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
1132 		// Work around it by emulating gather with element-wise loads.
1133 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1134 
1135 		Value *result = Nucleus::allocateStackVariable(T(elVecTy));
1136 		Nucleus::createStore(V(passthrough), result, T(elVecTy));
1137 
1138 		for(unsigned i = 0; i < numEls; i++)
1139 		{
1140 			// Check mask for this element
1141 			Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);
1142 
1143 			If(RValue<Bool>(elementMask))
1144 			{
1145 				Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
1146 				Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1147 
1148 				Value *v = Nucleus::createLoad(result, T(elVecTy));
1149 				v = Nucleus::createInsertElement(v, el, i);
1150 				Nucleus::createStore(v, result, T(elVecTy));
1151 			}
1152 		}
1153 
1154 		return V(Nucleus::createLoad(result, T(elVecTy)));
1155 	}
1156 }
1157 
Gather(RValue<Pointer<Float>> base,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment,bool zeroMaskedLanes)1158 RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1159 {
1160 	return As<SIMD::Float>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1161 }
1162 
Gather(RValue<Pointer<Int>> base,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment,bool zeroMaskedLanes)1163 RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1164 {
1165 	return As<SIMD::Int>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1166 }
1167 
createScatter(llvm::Value * base,llvm::Value * val,llvm::Value * offsets,llvm::Value * mask,unsigned int alignment)1168 static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
1169 {
1170 	ASSERT(base->getType()->isPointerTy());
1171 	ASSERT(val->getType()->isVectorTy());
1172 	ASSERT(offsets->getType()->isVectorTy());
1173 	ASSERT(mask->getType()->isVectorTy());
1174 
1175 	auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
1176 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1177 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1178 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
1179 	auto i8PtrTy = i8Ty->getPointerTo();
1180 	auto elVecTy = val->getType();
1181 	auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
1182 	auto elPtrTy = elTy->getPointerTo();
1183 	auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
1184 
1185 	auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
1186 	auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
1187 	auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
1188 	auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
1189 
1190 	if(!__has_feature(memory_sanitizer))
1191 	{
1192 		auto align = llvm::ConstantInt::get(i32Ty, alignment);
1193 		auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
1194 		jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
1195 	}
1196 	else  // __has_feature(memory_sanitizer)
1197 	{
1198 		// MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
1199 		// Work around it by emulating scatter with element-wise stores.
1200 		// TODO(b/172238865): Remove when supported by MemorySanitizer.
1201 
1202 		for(unsigned i = 0; i < numEls; i++)
1203 		{
1204 			// Check mask for this element
1205 			auto idx = llvm::ConstantInt::get(i32Ty, i);
1206 			auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1207 			auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
1208 			jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
1209 			jit->builder->SetInsertPoint(thenBlock);
1210 
1211 			auto el = jit->builder->CreateExtractElement(val, idx);
1212 			auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
1213 			Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);
1214 
1215 			jit->builder->CreateBr(mergeBlock);
1216 			jit->builder->SetInsertPoint(mergeBlock);
1217 		}
1218 	}
1219 }
1220 
Scatter(RValue<Pointer<Float>> base,RValue<SIMD::Float> val,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment)1221 void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
1222 {
1223 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1224 }
1225 
Scatter(RValue<Pointer<Int>> base,RValue<SIMD::Int> val,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment)1226 void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
1227 {
1228 	return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1229 }
1230 
createFence(std::memory_order memoryOrder)1231 void Nucleus::createFence(std::memory_order memoryOrder)
1232 {
1233 	RR_DEBUG_INFO_UPDATE_LOC();
1234 	jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1235 }
1236 
createGEP(Value * ptr,Type * type,Value * index,bool unsignedIndex)1237 Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
1238 {
1239 	RR_DEBUG_INFO_UPDATE_LOC();
1240 
1241 	if(sizeof(void *) == 8)
1242 	{
1243 		// LLVM manual: "When indexing into an array, pointer or vector,
1244 		// integers of any width are allowed, and they are not required to
1245 		// be constant. These integers are treated as signed values where
1246 		// relevant."
1247 		//
1248 		// Thus if we want indexes to be treated as unsigned we have to
1249 		// zero-extend them ourselves.
1250 		//
1251 		// Note that this is not because we want to address anywhere near
1252 		// 4 GB of data. Instead this is important for performance because
1253 		// x86 supports automatic zero-extending of 32-bit registers to
1254 		// 64-bit. Thus when indexing into an array using a uint32 is
1255 		// actually faster than an int32.
1256 		index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
1257 	}
1258 
1259 	// For non-emulated types we can rely on LLVM's GEP to calculate the
1260 	// effective address correctly.
1261 	if(asInternalType(type) == Type_LLVM)
1262 	{
1263 		return V(jit->builder->CreateGEP(T(type), V(ptr), V(index)));
1264 	}
1265 
1266 	// For emulated types we have to multiply the index by the intended
1267 	// type size ourselves to obain the byte offset.
1268 	index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));
1269 
1270 	// Cast to a byte pointer, apply the byte offset, and cast back to the
1271 	// original pointer type.
1272 	return createBitCast(
1273 	    V(jit->builder->CreateGEP(T(Byte::type()), V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
1274 	    T(llvm::PointerType::get(T(type), 0)));
1275 }
1276 
createAtomicAdd(Value * ptr,Value * value,std::memory_order memoryOrder)1277 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1278 {
1279 	RR_DEBUG_INFO_UPDATE_LOC();
1280 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1281 #if LLVM_VERSION_MAJOR >= 11
1282 	                                       llvm::MaybeAlign(),
1283 #endif
1284 	                                       atomicOrdering(true, memoryOrder)));
1285 }
1286 
createAtomicSub(Value * ptr,Value * value,std::memory_order memoryOrder)1287 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1288 {
1289 	RR_DEBUG_INFO_UPDATE_LOC();
1290 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1291 #if LLVM_VERSION_MAJOR >= 11
1292 	                                       llvm::MaybeAlign(),
1293 #endif
1294 	                                       atomicOrdering(true, memoryOrder)));
1295 }
1296 
createAtomicAnd(Value * ptr,Value * value,std::memory_order memoryOrder)1297 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1298 {
1299 	RR_DEBUG_INFO_UPDATE_LOC();
1300 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1301 #if LLVM_VERSION_MAJOR >= 11
1302 	                                       llvm::MaybeAlign(),
1303 #endif
1304 	                                       atomicOrdering(true, memoryOrder)));
1305 }
1306 
createAtomicOr(Value * ptr,Value * value,std::memory_order memoryOrder)1307 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1308 {
1309 	RR_DEBUG_INFO_UPDATE_LOC();
1310 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1311 #if LLVM_VERSION_MAJOR >= 11
1312 	                                       llvm::MaybeAlign(),
1313 #endif
1314 	                                       atomicOrdering(true, memoryOrder)));
1315 }
1316 
createAtomicXor(Value * ptr,Value * value,std::memory_order memoryOrder)1317 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1318 {
1319 	RR_DEBUG_INFO_UPDATE_LOC();
1320 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1321 #if LLVM_VERSION_MAJOR >= 11
1322 	                                       llvm::MaybeAlign(),
1323 #endif
1324 	                                       atomicOrdering(true, memoryOrder)));
1325 }
1326 
createAtomicMin(Value * ptr,Value * value,std::memory_order memoryOrder)1327 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1328 {
1329 	RR_DEBUG_INFO_UPDATE_LOC();
1330 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1331 #if LLVM_VERSION_MAJOR >= 11
1332 	                                       llvm::MaybeAlign(),
1333 #endif
1334 	                                       atomicOrdering(true, memoryOrder)));
1335 }
1336 
createAtomicMax(Value * ptr,Value * value,std::memory_order memoryOrder)1337 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1338 {
1339 	RR_DEBUG_INFO_UPDATE_LOC();
1340 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1341 #if LLVM_VERSION_MAJOR >= 11
1342 	                                       llvm::MaybeAlign(),
1343 #endif
1344 	                                       atomicOrdering(true, memoryOrder)));
1345 }
1346 
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1347 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1348 {
1349 	RR_DEBUG_INFO_UPDATE_LOC();
1350 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1351 #if LLVM_VERSION_MAJOR >= 11
1352 	                                       llvm::MaybeAlign(),
1353 #endif
1354 	                                       atomicOrdering(true, memoryOrder)));
1355 }
1356 
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1357 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1358 {
1359 	RR_DEBUG_INFO_UPDATE_LOC();
1360 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1361 #if LLVM_VERSION_MAJOR >= 11
1362 	                                       llvm::MaybeAlign(),
1363 #endif
1364 	                                       atomicOrdering(true, memoryOrder)));
1365 }
1366 
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1367 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1368 {
1369 	RR_DEBUG_INFO_UPDATE_LOC();
1370 	return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1371 #if LLVM_VERSION_MAJOR >= 11
1372 	                                       llvm::MaybeAlign(),
1373 #endif
1374 	                                       atomicOrdering(true, memoryOrder)));
1375 }
1376 
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1377 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1378 {
1379 	RR_DEBUG_INFO_UPDATE_LOC();
1380 	// Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
1381 	return V(jit->builder->CreateExtractValue(
1382 	    jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1383 #if LLVM_VERSION_MAJOR >= 11
1384 	                                      llvm::MaybeAlign(),
1385 #endif
1386 	                                      atomicOrdering(true, memoryOrderEqual),
1387 	                                      atomicOrdering(true, memoryOrderUnequal)),
1388 	    llvm::ArrayRef<unsigned>(0u)));
1389 }
1390 
createTrunc(Value * v,Type * destType)1391 Value *Nucleus::createTrunc(Value *v, Type *destType)
1392 {
1393 	RR_DEBUG_INFO_UPDATE_LOC();
1394 	return V(jit->builder->CreateTrunc(V(v), T(destType)));
1395 }
1396 
createZExt(Value * v,Type * destType)1397 Value *Nucleus::createZExt(Value *v, Type *destType)
1398 {
1399 	RR_DEBUG_INFO_UPDATE_LOC();
1400 	return V(jit->builder->CreateZExt(V(v), T(destType)));
1401 }
1402 
createSExt(Value * v,Type * destType)1403 Value *Nucleus::createSExt(Value *v, Type *destType)
1404 {
1405 	RR_DEBUG_INFO_UPDATE_LOC();
1406 	return V(jit->builder->CreateSExt(V(v), T(destType)));
1407 }
1408 
createFPToUI(Value * v,Type * destType)1409 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1410 {
1411 	RR_DEBUG_INFO_UPDATE_LOC();
1412 	return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1413 }
1414 
createFPToSI(Value * v,Type * destType)1415 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1416 {
1417 	RR_DEBUG_INFO_UPDATE_LOC();
1418 	return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1419 }
1420 
createSIToFP(Value * v,Type * destType)1421 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1422 {
1423 	RR_DEBUG_INFO_UPDATE_LOC();
1424 	return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1425 }
1426 
createFPTrunc(Value * v,Type * destType)1427 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1428 {
1429 	RR_DEBUG_INFO_UPDATE_LOC();
1430 	return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1431 }
1432 
createFPExt(Value * v,Type * destType)1433 Value *Nucleus::createFPExt(Value *v, Type *destType)
1434 {
1435 	RR_DEBUG_INFO_UPDATE_LOC();
1436 	return V(jit->builder->CreateFPExt(V(v), T(destType)));
1437 }
1438 
createBitCast(Value * v,Type * destType)1439 Value *Nucleus::createBitCast(Value *v, Type *destType)
1440 {
1441 	RR_DEBUG_INFO_UPDATE_LOC();
1442 	// Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1443 	// support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1444 	// reading back as the destination type.
1445 	if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1446 	{
1447 		Value *readAddress = allocateStackVariable(destType);
1448 		Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1449 		createStore(v, writeAddress, T(V(v)->getType()));
1450 		return createLoad(readAddress, destType);
1451 	}
1452 	else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1453 	{
1454 		Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1455 		createStore(v, writeAddress, T(V(v)->getType()));
1456 		Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1457 		return createLoad(readAddress, destType);
1458 	}
1459 
1460 	return V(jit->builder->CreateBitCast(V(v), T(destType)));
1461 }
1462 
createICmpEQ(Value * lhs,Value * rhs)1463 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1464 {
1465 	RR_DEBUG_INFO_UPDATE_LOC();
1466 	return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1467 }
1468 
createICmpNE(Value * lhs,Value * rhs)1469 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1470 {
1471 	RR_DEBUG_INFO_UPDATE_LOC();
1472 	return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1473 }
1474 
createICmpUGT(Value * lhs,Value * rhs)1475 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1476 {
1477 	RR_DEBUG_INFO_UPDATE_LOC();
1478 	return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1479 }
1480 
createICmpUGE(Value * lhs,Value * rhs)1481 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1482 {
1483 	RR_DEBUG_INFO_UPDATE_LOC();
1484 	return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1485 }
1486 
createICmpULT(Value * lhs,Value * rhs)1487 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1488 {
1489 	RR_DEBUG_INFO_UPDATE_LOC();
1490 	return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1491 }
1492 
createICmpULE(Value * lhs,Value * rhs)1493 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1494 {
1495 	RR_DEBUG_INFO_UPDATE_LOC();
1496 	return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1497 }
1498 
createICmpSGT(Value * lhs,Value * rhs)1499 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1500 {
1501 	RR_DEBUG_INFO_UPDATE_LOC();
1502 	return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1503 }
1504 
createICmpSGE(Value * lhs,Value * rhs)1505 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1506 {
1507 	RR_DEBUG_INFO_UPDATE_LOC();
1508 	return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1509 }
1510 
createICmpSLT(Value * lhs,Value * rhs)1511 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1512 {
1513 	RR_DEBUG_INFO_UPDATE_LOC();
1514 	return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1515 }
1516 
createICmpSLE(Value * lhs,Value * rhs)1517 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1518 {
1519 	RR_DEBUG_INFO_UPDATE_LOC();
1520 	return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1521 }
1522 
createFCmpOEQ(Value * lhs,Value * rhs)1523 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1524 {
1525 	RR_DEBUG_INFO_UPDATE_LOC();
1526 	return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1527 }
1528 
createFCmpOGT(Value * lhs,Value * rhs)1529 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1530 {
1531 	RR_DEBUG_INFO_UPDATE_LOC();
1532 	return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1533 }
1534 
createFCmpOGE(Value * lhs,Value * rhs)1535 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1536 {
1537 	RR_DEBUG_INFO_UPDATE_LOC();
1538 	return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1539 }
1540 
createFCmpOLT(Value * lhs,Value * rhs)1541 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1542 {
1543 	RR_DEBUG_INFO_UPDATE_LOC();
1544 	return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1545 }
1546 
createFCmpOLE(Value * lhs,Value * rhs)1547 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1548 {
1549 	RR_DEBUG_INFO_UPDATE_LOC();
1550 	return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1551 }
1552 
createFCmpONE(Value * lhs,Value * rhs)1553 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1554 {
1555 	RR_DEBUG_INFO_UPDATE_LOC();
1556 	return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1557 }
1558 
createFCmpORD(Value * lhs,Value * rhs)1559 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1560 {
1561 	RR_DEBUG_INFO_UPDATE_LOC();
1562 	return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1563 }
1564 
createFCmpUNO(Value * lhs,Value * rhs)1565 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1566 {
1567 	RR_DEBUG_INFO_UPDATE_LOC();
1568 	return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1569 }
1570 
createFCmpUEQ(Value * lhs,Value * rhs)1571 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1572 {
1573 	RR_DEBUG_INFO_UPDATE_LOC();
1574 	return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1575 }
1576 
createFCmpUGT(Value * lhs,Value * rhs)1577 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1578 {
1579 	RR_DEBUG_INFO_UPDATE_LOC();
1580 	return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1581 }
1582 
createFCmpUGE(Value * lhs,Value * rhs)1583 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1584 {
1585 	RR_DEBUG_INFO_UPDATE_LOC();
1586 	return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1587 }
1588 
createFCmpULT(Value * lhs,Value * rhs)1589 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1590 {
1591 	RR_DEBUG_INFO_UPDATE_LOC();
1592 	return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1593 }
1594 
createFCmpULE(Value * lhs,Value * rhs)1595 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1596 {
1597 	RR_DEBUG_INFO_UPDATE_LOC();
1598 	return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1599 }
1600 
createFCmpUNE(Value * lhs,Value * rhs)1601 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1602 {
1603 	RR_DEBUG_INFO_UPDATE_LOC();
1604 	return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1605 }
1606 
createExtractElement(Value * vector,Type * type,int index)1607 Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
1608 {
1609 	RR_DEBUG_INFO_UPDATE_LOC();
1610 	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
1611 	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
1612 }
1613 
createInsertElement(Value * vector,Value * element,int index)1614 Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
1615 {
1616 	RR_DEBUG_INFO_UPDATE_LOC();
1617 	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
1618 }
1619 
createShuffleVector(Value * v1,Value * v2,std::vector<int> select)1620 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, std::vector<int> select)
1621 {
1622 	RR_DEBUG_INFO_UPDATE_LOC();
1623 
1624 	size_t size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1625 	ASSERT(size == llvm::cast<llvm::FixedVectorType>(V(v2)->getType())->getNumElements());
1626 
1627 	llvm::SmallVector<int, 16> mask;
1628 	const size_t selectSize = select.size();
1629 	for(size_t i = 0; i < size; i++)
1630 	{
1631 		mask.push_back(select[i % selectSize]);
1632 	}
1633 
1634 	return V(lowerShuffleVector(V(v1), V(v2), mask));
1635 }
1636 
createSelect(Value * c,Value * ifTrue,Value * ifFalse)1637 Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
1638 {
1639 	RR_DEBUG_INFO_UPDATE_LOC();
1640 	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
1641 }
1642 
createSwitch(Value * control,BasicBlock * defaultBranch,unsigned numCases)1643 SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
1644 {
1645 	RR_DEBUG_INFO_UPDATE_LOC();
1646 	return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
1647 }
1648 
addSwitchCase(SwitchCases * switchCases,int label,BasicBlock * branch)1649 void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
1650 {
1651 	RR_DEBUG_INFO_UPDATE_LOC();
1652 	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
1653 	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
1654 }
1655 
createUnreachable()1656 void Nucleus::createUnreachable()
1657 {
1658 	RR_DEBUG_INFO_UPDATE_LOC();
1659 	jit->builder->CreateUnreachable();
1660 }
1661 
getType(Value * value)1662 Type *Nucleus::getType(Value *value)
1663 {
1664 	return T(V(value)->getType());
1665 }
1666 
getContainedType(Type * vectorType)1667 Type *Nucleus::getContainedType(Type *vectorType)
1668 {
1669 	return T(T(vectorType)->getContainedType(0));
1670 }
1671 
getPointerType(Type * ElementType)1672 Type *Nucleus::getPointerType(Type *ElementType)
1673 {
1674 	return T(llvm::PointerType::get(T(ElementType), 0));
1675 }
1676 
getNaturalIntType()1677 static llvm::Type *getNaturalIntType()
1678 {
1679 	return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
1680 }
1681 
getPrintfStorageType(Type * valueType)1682 Type *Nucleus::getPrintfStorageType(Type *valueType)
1683 {
1684 	llvm::Type *valueTy = T(valueType);
1685 	if(valueTy->isIntegerTy())
1686 	{
1687 		return T(getNaturalIntType());
1688 	}
1689 	if(valueTy->isFloatTy())
1690 	{
1691 		return T(llvm::Type::getDoubleTy(*jit->context));
1692 	}
1693 
1694 	UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
1695 	return {};
1696 }
1697 
createNullValue(Type * Ty)1698 Value *Nucleus::createNullValue(Type *Ty)
1699 {
1700 	RR_DEBUG_INFO_UPDATE_LOC();
1701 	return V(llvm::Constant::getNullValue(T(Ty)));
1702 }
1703 
createConstantLong(int64_t i)1704 Value *Nucleus::createConstantLong(int64_t i)
1705 {
1706 	RR_DEBUG_INFO_UPDATE_LOC();
1707 	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
1708 }
1709 
createConstantInt(int i)1710 Value *Nucleus::createConstantInt(int i)
1711 {
1712 	RR_DEBUG_INFO_UPDATE_LOC();
1713 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
1714 }
1715 
createConstantInt(unsigned int i)1716 Value *Nucleus::createConstantInt(unsigned int i)
1717 {
1718 	RR_DEBUG_INFO_UPDATE_LOC();
1719 	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
1720 }
1721 
createConstantBool(bool b)1722 Value *Nucleus::createConstantBool(bool b)
1723 {
1724 	RR_DEBUG_INFO_UPDATE_LOC();
1725 	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
1726 }
1727 
createConstantByte(signed char i)1728 Value *Nucleus::createConstantByte(signed char i)
1729 {
1730 	RR_DEBUG_INFO_UPDATE_LOC();
1731 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
1732 }
1733 
createConstantByte(unsigned char i)1734 Value *Nucleus::createConstantByte(unsigned char i)
1735 {
1736 	RR_DEBUG_INFO_UPDATE_LOC();
1737 	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
1738 }
1739 
createConstantShort(short i)1740 Value *Nucleus::createConstantShort(short i)
1741 {
1742 	RR_DEBUG_INFO_UPDATE_LOC();
1743 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
1744 }
1745 
createConstantShort(unsigned short i)1746 Value *Nucleus::createConstantShort(unsigned short i)
1747 {
1748 	RR_DEBUG_INFO_UPDATE_LOC();
1749 	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
1750 }
1751 
createConstantFloat(float x)1752 Value *Nucleus::createConstantFloat(float x)
1753 {
1754 	RR_DEBUG_INFO_UPDATE_LOC();
1755 	return V(llvm::ConstantFP::get(T(Float::type()), x));
1756 }
1757 
createNullPointer(Type * Ty)1758 Value *Nucleus::createNullPointer(Type *Ty)
1759 {
1760 	RR_DEBUG_INFO_UPDATE_LOC();
1761 	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
1762 }
1763 
createConstantVector(std::vector<int64_t> constants,Type * type)1764 Value *Nucleus::createConstantVector(std::vector<int64_t> constants, Type *type)
1765 {
1766 	RR_DEBUG_INFO_UPDATE_LOC();
1767 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1768 	const size_t numConstants = constants.size();                                             // Number of provided constants for the (emulated) type.
1769 	const size_t numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1770 	llvm::SmallVector<llvm::Constant *, 16> constantVector;
1771 
1772 	for(size_t i = 0; i < numElements; i++)
1773 	{
1774 		constantVector.push_back(llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]));
1775 	}
1776 
1777 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector)));
1778 }
1779 
createConstantVector(std::vector<double> constants,Type * type)1780 Value *Nucleus::createConstantVector(std::vector<double> constants, Type *type)
1781 {
1782 	RR_DEBUG_INFO_UPDATE_LOC();
1783 	ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1784 	const size_t numConstants = constants.size();                                             // Number of provided constants for the (emulated) type.
1785 	const size_t numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements();  // Number of elements of the underlying vector type.
1786 	llvm::SmallVector<llvm::Constant *, 16> constantVector;
1787 
1788 	for(size_t i = 0; i < numElements; i++)
1789 	{
1790 		constantVector.push_back(llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]));
1791 	}
1792 
1793 	return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector)));
1794 }
1795 
createConstantString(const char * v)1796 Value *Nucleus::createConstantString(const char *v)
1797 {
1798 	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
1799 	auto ptr = jit->builder->CreateGlobalStringPtr(v);
1800 	return V(ptr);
1801 }
1802 
setOptimizerCallback(OptimizerCallback * callback)1803 void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
1804 {
1805 	// The LLVM backend does not produce optimizer reports.
1806 	(void)callback;
1807 }
1808 
type()1809 Type *Void::type()
1810 {
1811 	return T(llvm::Type::getVoidTy(*jit->context));
1812 }
1813 
type()1814 Type *Bool::type()
1815 {
1816 	return T(llvm::Type::getInt1Ty(*jit->context));
1817 }
1818 
type()1819 Type *Byte::type()
1820 {
1821 	return T(llvm::Type::getInt8Ty(*jit->context));
1822 }
1823 
type()1824 Type *SByte::type()
1825 {
1826 	return T(llvm::Type::getInt8Ty(*jit->context));
1827 }
1828 
type()1829 Type *Short::type()
1830 {
1831 	return T(llvm::Type::getInt16Ty(*jit->context));
1832 }
1833 
type()1834 Type *UShort::type()
1835 {
1836 	return T(llvm::Type::getInt16Ty(*jit->context));
1837 }
1838 
type()1839 Type *Byte4::type()
1840 {
1841 	return T(Type_v4i8);
1842 }
1843 
type()1844 Type *SByte4::type()
1845 {
1846 	return T(Type_v4i8);
1847 }
1848 
AddSat(RValue<Byte8> x,RValue<Byte8> y)1849 RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
1850 {
1851 	RR_DEBUG_INFO_UPDATE_LOC();
1852 #if defined(__i386__) || defined(__x86_64__)
1853 	return x86::paddusb(x, y);
1854 #else
1855 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
1856 #endif
1857 }
1858 
SubSat(RValue<Byte8> x,RValue<Byte8> y)1859 RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
1860 {
1861 	RR_DEBUG_INFO_UPDATE_LOC();
1862 #if defined(__i386__) || defined(__x86_64__)
1863 	return x86::psubusb(x, y);
1864 #else
1865 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
1866 #endif
1867 }
1868 
SignMask(RValue<Byte8> x)1869 RValue<Int> SignMask(RValue<Byte8> x)
1870 {
1871 	RR_DEBUG_INFO_UPDATE_LOC();
1872 #if defined(__i386__) || defined(__x86_64__)
1873 	return x86::pmovmskb(x);
1874 #else
1875 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1876 #endif
1877 }
1878 
1879 //	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
1880 //	{
1881 //#if defined(__i386__) || defined(__x86_64__)
1882 //		return x86::pcmpgtb(x, y);   // FIXME: Signedness
1883 //#else
1884 //		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1885 //#endif
1886 //	}
1887 
CmpEQ(RValue<Byte8> x,RValue<Byte8> y)1888 RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
1889 {
1890 	RR_DEBUG_INFO_UPDATE_LOC();
1891 #if defined(__i386__) || defined(__x86_64__)
1892 	return x86::pcmpeqb(x, y);
1893 #else
1894 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1895 #endif
1896 }
1897 
type()1898 Type *Byte8::type()
1899 {
1900 	return T(Type_v8i8);
1901 }
1902 
AddSat(RValue<SByte8> x,RValue<SByte8> y)1903 RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
1904 {
1905 	RR_DEBUG_INFO_UPDATE_LOC();
1906 #if defined(__i386__) || defined(__x86_64__)
1907 	return x86::paddsb(x, y);
1908 #else
1909 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
1910 #endif
1911 }
1912 
SubSat(RValue<SByte8> x,RValue<SByte8> y)1913 RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
1914 {
1915 	RR_DEBUG_INFO_UPDATE_LOC();
1916 #if defined(__i386__) || defined(__x86_64__)
1917 	return x86::psubsb(x, y);
1918 #else
1919 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
1920 #endif
1921 }
1922 
SignMask(RValue<SByte8> x)1923 RValue<Int> SignMask(RValue<SByte8> x)
1924 {
1925 	RR_DEBUG_INFO_UPDATE_LOC();
1926 #if defined(__i386__) || defined(__x86_64__)
1927 	return x86::pmovmskb(As<Byte8>(x));
1928 #else
1929 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
1930 #endif
1931 }
1932 
CmpGT(RValue<SByte8> x,RValue<SByte8> y)1933 RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
1934 {
1935 	RR_DEBUG_INFO_UPDATE_LOC();
1936 #if defined(__i386__) || defined(__x86_64__)
1937 	return x86::pcmpgtb(x, y);
1938 #else
1939 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
1940 #endif
1941 }
1942 
CmpEQ(RValue<SByte8> x,RValue<SByte8> y)1943 RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
1944 {
1945 	RR_DEBUG_INFO_UPDATE_LOC();
1946 #if defined(__i386__) || defined(__x86_64__)
1947 	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
1948 #else
1949 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
1950 #endif
1951 }
1952 
type()1953 Type *SByte8::type()
1954 {
1955 	return T(Type_v8i8);
1956 }
1957 
type()1958 Type *Byte16::type()
1959 {
1960 	return T(llvm::VectorType::get(T(Byte::type()), 16, false));
1961 }
1962 
type()1963 Type *SByte16::type()
1964 {
1965 	return T(llvm::VectorType::get(T(SByte::type()), 16, false));
1966 }
1967 
type()1968 Type *Short2::type()
1969 {
1970 	return T(Type_v2i16);
1971 }
1972 
type()1973 Type *UShort2::type()
1974 {
1975 	return T(Type_v2i16);
1976 }
1977 
Short4(RValue<Int4> cast)1978 Short4::Short4(RValue<Int4> cast)
1979 {
1980 	RR_DEBUG_INFO_UPDATE_LOC();
1981 	std::vector<int> select = { 0, 2, 4, 6, 0, 2, 4, 6 };
1982 	Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());
1983 
1984 	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
1985 	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();
1986 
1987 	storeValue(short4);
1988 }
1989 
1990 //	Short4::Short4(RValue<Float> cast)
1991 //	{
1992 //	}
1993 
Short4(RValue<Float4> cast)1994 Short4::Short4(RValue<Float4> cast)
1995 {
1996 	RR_DEBUG_INFO_UPDATE_LOC();
1997 	Int4 v4i32 = Int4(cast);
1998 #if defined(__i386__) || defined(__x86_64__)
1999 	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2000 #else
2001 	Value *v = v4i32.loadValue();
2002 	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
2003 #endif
2004 
2005 	storeValue(As<Short4>(Int2(v4i32)).value());
2006 }
2007 
operator <<(RValue<Short4> lhs,unsigned char rhs)2008 RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2009 {
2010 	RR_DEBUG_INFO_UPDATE_LOC();
2011 #if defined(__i386__) || defined(__x86_64__)
2012 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2013 
2014 	return x86::psllw(lhs, rhs);
2015 #else
2016 	return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2017 #endif
2018 }
2019 
operator >>(RValue<Short4> lhs,unsigned char rhs)2020 RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2021 {
2022 	RR_DEBUG_INFO_UPDATE_LOC();
2023 #if defined(__i386__) || defined(__x86_64__)
2024 	return x86::psraw(lhs, rhs);
2025 #else
2026 	return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2027 #endif
2028 }
2029 
Max(RValue<Short4> x,RValue<Short4> y)2030 RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2031 {
2032 	RR_DEBUG_INFO_UPDATE_LOC();
2033 #if defined(__i386__) || defined(__x86_64__)
2034 	return x86::pmaxsw(x, y);
2035 #else
2036 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
2037 #endif
2038 }
2039 
Min(RValue<Short4> x,RValue<Short4> y)2040 RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
2041 {
2042 	RR_DEBUG_INFO_UPDATE_LOC();
2043 #if defined(__i386__) || defined(__x86_64__)
2044 	return x86::pminsw(x, y);
2045 #else
2046 	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
2047 #endif
2048 }
2049 
AddSat(RValue<Short4> x,RValue<Short4> y)2050 RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
2051 {
2052 	RR_DEBUG_INFO_UPDATE_LOC();
2053 #if defined(__i386__) || defined(__x86_64__)
2054 	return x86::paddsw(x, y);
2055 #else
2056 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
2057 #endif
2058 }
2059 
SubSat(RValue<Short4> x,RValue<Short4> y)2060 RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
2061 {
2062 	RR_DEBUG_INFO_UPDATE_LOC();
2063 #if defined(__i386__) || defined(__x86_64__)
2064 	return x86::psubsw(x, y);
2065 #else
2066 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
2067 #endif
2068 }
2069 
MulHigh(RValue<Short4> x,RValue<Short4> y)2070 RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
2071 {
2072 	RR_DEBUG_INFO_UPDATE_LOC();
2073 #if defined(__i386__) || defined(__x86_64__)
2074 	return x86::pmulhw(x, y);
2075 #else
2076 	return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2077 #endif
2078 }
2079 
MulAdd(RValue<Short4> x,RValue<Short4> y)2080 RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
2081 {
2082 	RR_DEBUG_INFO_UPDATE_LOC();
2083 #if defined(__i386__) || defined(__x86_64__)
2084 	return x86::pmaddwd(x, y);
2085 #else
2086 	return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2087 #endif
2088 }
2089 
PackSigned(RValue<Short4> x,RValue<Short4> y)2090 RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
2091 {
2092 	RR_DEBUG_INFO_UPDATE_LOC();
2093 #if defined(__i386__) || defined(__x86_64__)
2094 	auto result = x86::packsswb(x, y);
2095 #else
2096 	auto result = V(lowerPack(V(x.value()), V(y.value()), true));
2097 #endif
2098 	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
2099 }
2100 
PackUnsigned(RValue<Short4> x,RValue<Short4> y)2101 RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
2102 {
2103 	RR_DEBUG_INFO_UPDATE_LOC();
2104 #if defined(__i386__) || defined(__x86_64__)
2105 	auto result = x86::packuswb(x, y);
2106 #else
2107 	auto result = V(lowerPack(V(x.value()), V(y.value()), false));
2108 #endif
2109 	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
2110 }
2111 
CmpGT(RValue<Short4> x,RValue<Short4> y)2112 RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
2113 {
2114 	RR_DEBUG_INFO_UPDATE_LOC();
2115 #if defined(__i386__) || defined(__x86_64__)
2116 	return x86::pcmpgtw(x, y);
2117 #else
2118 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
2119 #endif
2120 }
2121 
CmpEQ(RValue<Short4> x,RValue<Short4> y)2122 RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
2123 {
2124 	RR_DEBUG_INFO_UPDATE_LOC();
2125 #if defined(__i386__) || defined(__x86_64__)
2126 	return x86::pcmpeqw(x, y);
2127 #else
2128 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
2129 #endif
2130 }
2131 
type()2132 Type *Short4::type()
2133 {
2134 	return T(Type_v4i16);
2135 }
2136 
UShort4(RValue<Float4> cast,bool saturate)2137 UShort4::UShort4(RValue<Float4> cast, bool saturate)
2138 {
2139 	RR_DEBUG_INFO_UPDATE_LOC();
2140 	if(saturate)
2141 	{
2142 #if defined(__i386__) || defined(__x86_64__)
2143 		if(CPUID::supportsSSE4_1())
2144 		{
2145 			Int4 int4(Min(cast, Float4(0xFFFF)));  // packusdw takes care of 0x0000 saturation
2146 			*this = As<Short4>(PackUnsigned(int4, int4));
2147 		}
2148 		else
2149 #endif
2150 		{
2151 			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
2152 		}
2153 	}
2154 	else
2155 	{
2156 		*this = Short4(Int4(cast));
2157 	}
2158 }
2159 
operator <<(RValue<UShort4> lhs,unsigned char rhs)2160 RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
2161 {
2162 	RR_DEBUG_INFO_UPDATE_LOC();
2163 #if defined(__i386__) || defined(__x86_64__)
2164 	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));
2165 
2166 	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
2167 #else
2168 	return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2169 #endif
2170 }
2171 
operator >>(RValue<UShort4> lhs,unsigned char rhs)2172 RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
2173 {
2174 	RR_DEBUG_INFO_UPDATE_LOC();
2175 #if defined(__i386__) || defined(__x86_64__)
2176 	//	return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));
2177 
2178 	return x86::psrlw(lhs, rhs);
2179 #else
2180 	return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2181 #endif
2182 }
2183 
Max(RValue<UShort4> x,RValue<UShort4> y)2184 RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
2185 {
2186 	RR_DEBUG_INFO_UPDATE_LOC();
2187 	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2188 }
2189 
Min(RValue<UShort4> x,RValue<UShort4> y)2190 RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
2191 {
2192 	RR_DEBUG_INFO_UPDATE_LOC();
2193 	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
2194 }
2195 
AddSat(RValue<UShort4> x,RValue<UShort4> y)2196 RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
2197 {
2198 	RR_DEBUG_INFO_UPDATE_LOC();
2199 #if defined(__i386__) || defined(__x86_64__)
2200 	return x86::paddusw(x, y);
2201 #else
2202 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
2203 #endif
2204 }
2205 
SubSat(RValue<UShort4> x,RValue<UShort4> y)2206 RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
2207 {
2208 	RR_DEBUG_INFO_UPDATE_LOC();
2209 #if defined(__i386__) || defined(__x86_64__)
2210 	return x86::psubusw(x, y);
2211 #else
2212 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
2213 #endif
2214 }
2215 
MulHigh(RValue<UShort4> x,RValue<UShort4> y)2216 RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
2217 {
2218 	RR_DEBUG_INFO_UPDATE_LOC();
2219 #if defined(__i386__) || defined(__x86_64__)
2220 	return x86::pmulhuw(x, y);
2221 #else
2222 	return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2223 #endif
2224 }
2225 
Average(RValue<UShort4> x,RValue<UShort4> y)2226 RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
2227 {
2228 	RR_DEBUG_INFO_UPDATE_LOC();
2229 #if defined(__i386__) || defined(__x86_64__)
2230 	return x86::pavgw(x, y);
2231 #else
2232 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
2233 #endif
2234 }
2235 
type()2236 Type *UShort4::type()
2237 {
2238 	return T(Type_v4i16);
2239 }
2240 
operator <<(RValue<Short8> lhs,unsigned char rhs)2241 RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
2242 {
2243 	RR_DEBUG_INFO_UPDATE_LOC();
2244 #if defined(__i386__) || defined(__x86_64__)
2245 	return x86::psllw(lhs, rhs);
2246 #else
2247 	return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2248 #endif
2249 }
2250 
operator >>(RValue<Short8> lhs,unsigned char rhs)2251 RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
2252 {
2253 	RR_DEBUG_INFO_UPDATE_LOC();
2254 #if defined(__i386__) || defined(__x86_64__)
2255 	return x86::psraw(lhs, rhs);
2256 #else
2257 	return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2258 #endif
2259 }
2260 
MulAdd(RValue<Short8> x,RValue<Short8> y)2261 RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
2262 {
2263 	RR_DEBUG_INFO_UPDATE_LOC();
2264 #if defined(__i386__) || defined(__x86_64__)
2265 	return x86::pmaddwd(x, y);
2266 #else
2267 	return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
2268 #endif
2269 }
2270 
MulHigh(RValue<Short8> x,RValue<Short8> y)2271 RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
2272 {
2273 	RR_DEBUG_INFO_UPDATE_LOC();
2274 #if defined(__i386__) || defined(__x86_64__)
2275 	return x86::pmulhw(x, y);
2276 #else
2277 	return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2278 #endif
2279 }
2280 
type()2281 Type *Short8::type()
2282 {
2283 	return T(llvm::VectorType::get(T(Short::type()), 8, false));
2284 }
2285 
operator <<(RValue<UShort8> lhs,unsigned char rhs)2286 RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
2287 {
2288 	RR_DEBUG_INFO_UPDATE_LOC();
2289 #if defined(__i386__) || defined(__x86_64__)
2290 	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
2291 #else
2292 	return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
2293 #endif
2294 }
2295 
operator >>(RValue<UShort8> lhs,unsigned char rhs)2296 RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
2297 {
2298 	RR_DEBUG_INFO_UPDATE_LOC();
2299 #if defined(__i386__) || defined(__x86_64__)
2300 	return x86::psrlw(lhs, rhs);  // FIXME: Fallback required
2301 #else
2302 	return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2303 #endif
2304 }
2305 
MulHigh(RValue<UShort8> x,RValue<UShort8> y)2306 RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
2307 {
2308 	RR_DEBUG_INFO_UPDATE_LOC();
2309 #if defined(__i386__) || defined(__x86_64__)
2310 	return x86::pmulhuw(x, y);
2311 #else
2312 	return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2313 #endif
2314 }
2315 
type()2316 Type *UShort8::type()
2317 {
2318 	return T(llvm::VectorType::get(T(UShort::type()), 8, false));
2319 }
2320 
operator ++(Int & val,int)2321 RValue<Int> operator++(Int &val, int)  // Post-increment
2322 {
2323 	RR_DEBUG_INFO_UPDATE_LOC();
2324 	RValue<Int> res = val;
2325 
2326 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2327 	val.storeValue(inc);
2328 
2329 	return res;
2330 }
2331 
operator ++(Int & val)2332 const Int &operator++(Int &val)  // Pre-increment
2333 {
2334 	RR_DEBUG_INFO_UPDATE_LOC();
2335 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2336 	val.storeValue(inc);
2337 
2338 	return val;
2339 }
2340 
operator --(Int & val,int)2341 RValue<Int> operator--(Int &val, int)  // Post-decrement
2342 {
2343 	RR_DEBUG_INFO_UPDATE_LOC();
2344 	RValue<Int> res = val;
2345 
2346 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2347 	val.storeValue(inc);
2348 
2349 	return res;
2350 }
2351 
operator --(Int & val)2352 const Int &operator--(Int &val)  // Pre-decrement
2353 {
2354 	RR_DEBUG_INFO_UPDATE_LOC();
2355 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2356 	val.storeValue(inc);
2357 
2358 	return val;
2359 }
2360 
RoundInt(RValue<Float> cast)2361 RValue<Int> RoundInt(RValue<Float> cast)
2362 {
2363 	RR_DEBUG_INFO_UPDATE_LOC();
2364 #if defined(__i386__) || defined(__x86_64__)
2365 	return x86::cvtss2si(cast);
2366 #else
2367 	return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
2368 #endif
2369 }
2370 
type()2371 Type *Int::type()
2372 {
2373 	return T(llvm::Type::getInt32Ty(*jit->context));
2374 }
2375 
type()2376 Type *Long::type()
2377 {
2378 	return T(llvm::Type::getInt64Ty(*jit->context));
2379 }
2380 
UInt(RValue<Float> cast)2381 UInt::UInt(RValue<Float> cast)
2382 {
2383 	RR_DEBUG_INFO_UPDATE_LOC();
2384 	Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
2385 	storeValue(integer);
2386 }
2387 
operator ++(UInt & val,int)2388 RValue<UInt> operator++(UInt &val, int)  // Post-increment
2389 {
2390 	RR_DEBUG_INFO_UPDATE_LOC();
2391 	RValue<UInt> res = val;
2392 
2393 	Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2394 	val.storeValue(inc);
2395 
2396 	return res;
2397 }
2398 
operator ++(UInt & val)2399 const UInt &operator++(UInt &val)  // Pre-increment
2400 {
2401 	RR_DEBUG_INFO_UPDATE_LOC();
2402 	Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2403 	val.storeValue(inc);
2404 
2405 	return val;
2406 }
2407 
operator --(UInt & val,int)2408 RValue<UInt> operator--(UInt &val, int)  // Post-decrement
2409 {
2410 	RR_DEBUG_INFO_UPDATE_LOC();
2411 	RValue<UInt> res = val;
2412 
2413 	Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2414 	val.storeValue(inc);
2415 
2416 	return res;
2417 }
2418 
operator --(UInt & val)2419 const UInt &operator--(UInt &val)  // Pre-decrement
2420 {
2421 	RR_DEBUG_INFO_UPDATE_LOC();
2422 	Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2423 	val.storeValue(inc);
2424 
2425 	return val;
2426 }
2427 
2428 //	RValue<UInt> RoundUInt(RValue<Float> cast)
2429 //	{
2430 //#if defined(__i386__) || defined(__x86_64__)
2431 //		return x86::cvtss2si(val);   // FIXME: Unsigned
2432 //#else
2433 //		return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2434 //#endif
2435 //	}
2436 
type()2437 Type *UInt::type()
2438 {
2439 	return T(llvm::Type::getInt32Ty(*jit->context));
2440 }
2441 
2442 //	Int2::Int2(RValue<Int> cast)
2443 //	{
2444 //		Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2445 //		Value *vector = Nucleus::createBitCast(extend, Int2::type());
2446 //
2447 //		int shuffle[2] = {0, 0};
2448 //		Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2449 //
2450 //		storeValue(replicate);
2451 //	}
2452 
operator <<(RValue<Int2> lhs,unsigned char rhs)2453 RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
2454 {
2455 	RR_DEBUG_INFO_UPDATE_LOC();
2456 #if defined(__i386__) || defined(__x86_64__)
2457 	//	return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));
2458 
2459 	return x86::pslld(lhs, rhs);
2460 #else
2461 	return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2462 #endif
2463 }
2464 
operator >>(RValue<Int2> lhs,unsigned char rhs)2465 RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
2466 {
2467 	RR_DEBUG_INFO_UPDATE_LOC();
2468 #if defined(__i386__) || defined(__x86_64__)
2469 	//	return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));
2470 
2471 	return x86::psrad(lhs, rhs);
2472 #else
2473 	return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2474 #endif
2475 }
2476 
type()2477 Type *Int2::type()
2478 {
2479 	return T(Type_v2i32);
2480 }
2481 
operator <<(RValue<UInt2> lhs,unsigned char rhs)2482 RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
2483 {
2484 	RR_DEBUG_INFO_UPDATE_LOC();
2485 #if defined(__i386__) || defined(__x86_64__)
2486 	//	return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));
2487 
2488 	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
2489 #else
2490 	return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
2491 #endif
2492 }
2493 
operator >>(RValue<UInt2> lhs,unsigned char rhs)2494 RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
2495 {
2496 	RR_DEBUG_INFO_UPDATE_LOC();
2497 #if defined(__i386__) || defined(__x86_64__)
2498 	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));
2499 
2500 	return x86::psrld(lhs, rhs);
2501 #else
2502 	return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2503 #endif
2504 }
2505 
type()2506 Type *UInt2::type()
2507 {
2508 	return T(Type_v2i32);
2509 }
2510 
Int4(RValue<Byte4> cast)2511 Int4::Int4(RValue<Byte4> cast)
2512     : XYZW(this)
2513 {
2514 	RR_DEBUG_INFO_UPDATE_LOC();
2515 	std::vector<int> swizzle = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
2516 	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2517 	Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);
2518 
2519 	std::vector<int> swizzle2 = { 0, 8, 1, 9, 2, 10, 3, 11 };
2520 	Value *c = Nucleus::createBitCast(b, Short8::type());
2521 	Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);
2522 
2523 	*this = As<Int4>(d);
2524 }
2525 
Int4(RValue<SByte4> cast)2526 Int4::Int4(RValue<SByte4> cast)
2527     : XYZW(this)
2528 {
2529 	RR_DEBUG_INFO_UPDATE_LOC();
2530 	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
2531 	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
2532 	Value *b = Nucleus::createShuffleVector(a, a, swizzle);
2533 
2534 	std::vector<int> swizzle2 = { 0, 0, 1, 1, 2, 2, 3, 3 };
2535 	Value *c = Nucleus::createBitCast(b, Short8::type());
2536 	Value *d = Nucleus::createShuffleVector(c, c, swizzle2);
2537 
2538 	*this = As<Int4>(d) >> 24;
2539 }
2540 
Int4(RValue<Short4> cast)2541 Int4::Int4(RValue<Short4> cast)
2542     : XYZW(this)
2543 {
2544 	RR_DEBUG_INFO_UPDATE_LOC();
2545 	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3 };
2546 	Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
2547 	*this = As<Int4>(c) >> 16;
2548 }
2549 
Int4(RValue<UShort4> cast)2550 Int4::Int4(RValue<UShort4> cast)
2551     : XYZW(this)
2552 {
2553 	RR_DEBUG_INFO_UPDATE_LOC();
2554 	std::vector<int> swizzle = { 0, 8, 1, 9, 2, 10, 3, 11 };
2555 	Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
2556 	*this = As<Int4>(c);
2557 }
2558 
Int4(RValue<Int> rhs)2559 Int4::Int4(RValue<Int> rhs)
2560     : XYZW(this)
2561 {
2562 	RR_DEBUG_INFO_UPDATE_LOC();
2563 	Value *vector = loadValue();
2564 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2565 
2566 	std::vector<int> swizzle = { 0, 0, 0, 0 };
2567 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2568 
2569 	storeValue(replicate);
2570 }
2571 
operator <<(RValue<Int4> lhs,unsigned char rhs)2572 RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
2573 {
2574 	RR_DEBUG_INFO_UPDATE_LOC();
2575 #if defined(__i386__) || defined(__x86_64__)
2576 	return x86::pslld(lhs, rhs);
2577 #else
2578 	return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2579 #endif
2580 }
2581 
operator >>(RValue<Int4> lhs,unsigned char rhs)2582 RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
2583 {
2584 	RR_DEBUG_INFO_UPDATE_LOC();
2585 #if defined(__i386__) || defined(__x86_64__)
2586 	return x86::psrad(lhs, rhs);
2587 #else
2588 	return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
2589 #endif
2590 }
2591 
CmpEQ(RValue<Int4> x,RValue<Int4> y)2592 RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
2593 {
2594 	RR_DEBUG_INFO_UPDATE_LOC();
2595 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2596 }
2597 
CmpLT(RValue<Int4> x,RValue<Int4> y)2598 RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
2599 {
2600 	RR_DEBUG_INFO_UPDATE_LOC();
2601 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
2602 }
2603 
CmpLE(RValue<Int4> x,RValue<Int4> y)2604 RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
2605 {
2606 	RR_DEBUG_INFO_UPDATE_LOC();
2607 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
2608 }
2609 
CmpNEQ(RValue<Int4> x,RValue<Int4> y)2610 RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
2611 {
2612 	RR_DEBUG_INFO_UPDATE_LOC();
2613 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2614 }
2615 
CmpNLT(RValue<Int4> x,RValue<Int4> y)2616 RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
2617 {
2618 	RR_DEBUG_INFO_UPDATE_LOC();
2619 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
2620 }
2621 
CmpNLE(RValue<Int4> x,RValue<Int4> y)2622 RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
2623 {
2624 	RR_DEBUG_INFO_UPDATE_LOC();
2625 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
2626 }
2627 
Abs(RValue<Int4> x)2628 RValue<Int4> Abs(RValue<Int4> x)
2629 {
2630 #if LLVM_VERSION_MAJOR >= 12
2631 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
2632 	return RValue<Int4>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
2633 #else
2634 	auto negative = x >> 31;
2635 	return (x ^ negative) - negative;
2636 #endif
2637 }
2638 
Max(RValue<Int4> x,RValue<Int4> y)2639 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
2640 {
2641 	RR_DEBUG_INFO_UPDATE_LOC();
2642 #if defined(__i386__) || defined(__x86_64__)
2643 	if(CPUID::supportsSSE4_1())
2644 	{
2645 		return x86::pmaxsd(x, y);
2646 	}
2647 	else
2648 #endif
2649 	{
2650 		RValue<Int4> greater = CmpNLE(x, y);
2651 		return (x & greater) | (y & ~greater);
2652 	}
2653 }
2654 
Min(RValue<Int4> x,RValue<Int4> y)2655 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
2656 {
2657 	RR_DEBUG_INFO_UPDATE_LOC();
2658 #if defined(__i386__) || defined(__x86_64__)
2659 	if(CPUID::supportsSSE4_1())
2660 	{
2661 		return x86::pminsd(x, y);
2662 	}
2663 	else
2664 #endif
2665 	{
2666 		RValue<Int4> less = CmpLT(x, y);
2667 		return (x & less) | (y & ~less);
2668 	}
2669 }
2670 
RoundInt(RValue<Float4> cast)2671 RValue<Int4> RoundInt(RValue<Float4> cast)
2672 {
2673 	RR_DEBUG_INFO_UPDATE_LOC();
2674 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
2675 	return x86::cvtps2dq(cast);
2676 #else
2677 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2678 #endif
2679 }
2680 
RoundIntClamped(RValue<Float4> cast)2681 RValue<Int4> RoundIntClamped(RValue<Float4> cast)
2682 {
2683 	RR_DEBUG_INFO_UPDATE_LOC();
2684 
2685 // TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
2686 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
2687 	// cvtps2dq produces 0x80000000, a negative value, for input larger than
2688 	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
2689 	// saturate to 0x80000000.
2690 	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
2691 #elif defined(__arm__) || defined(__aarch64__)
2692 	// ARM saturates to the largest positive or negative integer. Unit tests
2693 	// verify that lowerRoundInt() behaves as desired.
2694 	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
2695 #elif LLVM_VERSION_MAJOR >= 14
2696 	llvm::Value *rounded = lowerRound(V(cast.value()));
2697 	llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
2698 	    jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(Int4::type()), T(Float4::type()) });
2699 	return RValue<Int4>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
2700 #else
2701 	RValue<Float4> clamped = Max(Min(cast, Float4(0x7FFFFF80)), Float4(static_cast<int>(0x80000000)));
2702 	return As<Int4>(V(lowerRoundInt(V(clamped.value()), T(Int4::type()))));
2703 #endif
2704 }
2705 
MulHigh(RValue<Int4> x,RValue<Int4> y)2706 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
2707 {
2708 	RR_DEBUG_INFO_UPDATE_LOC();
2709 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2710 	return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
2711 }
2712 
MulHigh(RValue<UInt4> x,RValue<UInt4> y)2713 RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
2714 {
2715 	RR_DEBUG_INFO_UPDATE_LOC();
2716 	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
2717 	return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
2718 }
2719 
PackSigned(RValue<Int4> x,RValue<Int4> y)2720 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
2721 {
2722 	RR_DEBUG_INFO_UPDATE_LOC();
2723 #if defined(__i386__) || defined(__x86_64__)
2724 	return x86::packssdw(x, y);
2725 #else
2726 	return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
2727 #endif
2728 }
2729 
PackUnsigned(RValue<Int4> x,RValue<Int4> y)2730 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
2731 {
2732 	RR_DEBUG_INFO_UPDATE_LOC();
2733 #if defined(__i386__) || defined(__x86_64__)
2734 	return x86::packusdw(x, y);
2735 #else
2736 	return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
2737 #endif
2738 }
2739 
SignMask(RValue<Int4> x)2740 RValue<Int> SignMask(RValue<Int4> x)
2741 {
2742 	RR_DEBUG_INFO_UPDATE_LOC();
2743 #if defined(__i386__) || defined(__x86_64__)
2744 	return x86::movmskps(As<Float4>(x));
2745 #else
2746 	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
2747 #endif
2748 }
2749 
type()2750 Type *Int4::type()
2751 {
2752 	return T(llvm::VectorType::get(T(Int::type()), 4, false));
2753 }
2754 
UInt4(RValue<Float4> cast)2755 UInt4::UInt4(RValue<Float4> cast)
2756     : XYZW(this)
2757 {
2758 	RR_DEBUG_INFO_UPDATE_LOC();
2759 	Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
2760 	storeValue(xyzw);
2761 }
2762 
UInt4(RValue<UInt> rhs)2763 UInt4::UInt4(RValue<UInt> rhs)
2764     : XYZW(this)
2765 {
2766 	RR_DEBUG_INFO_UPDATE_LOC();
2767 	Value *vector = loadValue();
2768 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
2769 
2770 	std::vector<int> swizzle = { 0, 0, 0, 0 };
2771 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
2772 
2773 	storeValue(replicate);
2774 }
2775 
operator <<(RValue<UInt4> lhs,unsigned char rhs)2776 RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
2777 {
2778 	RR_DEBUG_INFO_UPDATE_LOC();
2779 #if defined(__i386__) || defined(__x86_64__)
2780 	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
2781 #else
2782 	return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
2783 #endif
2784 }
2785 
operator >>(RValue<UInt4> lhs,unsigned char rhs)2786 RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
2787 {
2788 	RR_DEBUG_INFO_UPDATE_LOC();
2789 #if defined(__i386__) || defined(__x86_64__)
2790 	return x86::psrld(lhs, rhs);
2791 #else
2792 	return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
2793 #endif
2794 }
2795 
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2796 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2797 {
2798 	RR_DEBUG_INFO_UPDATE_LOC();
2799 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2800 }
2801 
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2802 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2803 {
2804 	RR_DEBUG_INFO_UPDATE_LOC();
2805 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2806 }
2807 
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2808 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2809 {
2810 	RR_DEBUG_INFO_UPDATE_LOC();
2811 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2812 }
2813 
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2814 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2815 {
2816 	RR_DEBUG_INFO_UPDATE_LOC();
2817 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2818 }
2819 
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2820 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2821 {
2822 	RR_DEBUG_INFO_UPDATE_LOC();
2823 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2824 }
2825 
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2826 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2827 {
2828 	RR_DEBUG_INFO_UPDATE_LOC();
2829 	return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2830 }
2831 
Max(RValue<UInt4> x,RValue<UInt4> y)2832 RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
2833 {
2834 	RR_DEBUG_INFO_UPDATE_LOC();
2835 #if defined(__i386__) || defined(__x86_64__)
2836 	if(CPUID::supportsSSE4_1())
2837 	{
2838 		return x86::pmaxud(x, y);
2839 	}
2840 	else
2841 #endif
2842 	{
2843 		RValue<UInt4> greater = CmpNLE(x, y);
2844 		return (x & greater) | (y & ~greater);
2845 	}
2846 }
2847 
Min(RValue<UInt4> x,RValue<UInt4> y)2848 RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
2849 {
2850 	RR_DEBUG_INFO_UPDATE_LOC();
2851 #if defined(__i386__) || defined(__x86_64__)
2852 	if(CPUID::supportsSSE4_1())
2853 	{
2854 		return x86::pminud(x, y);
2855 	}
2856 	else
2857 #endif
2858 	{
2859 		RValue<UInt4> less = CmpLT(x, y);
2860 		return (x & less) | (y & ~less);
2861 	}
2862 }
2863 
type()2864 Type *UInt4::type()
2865 {
2866 	return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2867 }
2868 
type()2869 Type *Half::type()
2870 {
2871 	return T(llvm::Type::getInt16Ty(*jit->context));
2872 }
2873 
// Reports whether RcpApprox() is usable in this build: true when compiled for
// x86 (32- or 64-bit), false on every other target.
bool HasRcpApprox()
{
#if defined(__i386__) || defined(__x86_64__)
	constexpr bool available = true;
#else
	constexpr bool available = false;
#endif
	return available;
}
2882 
RcpApprox(RValue<Float4> x,bool exactAtPow2)2883 RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
2884 {
2885 #if defined(__i386__) || defined(__x86_64__)
2886 	if(exactAtPow2)
2887 	{
2888 		// rcpps uses a piecewise-linear approximation which minimizes the relative error
2889 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2890 		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2891 	}
2892 	return x86::rcpps(x);
2893 #else
2894 	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2895 	return { 0.0f };
2896 #endif
2897 }
2898 
RcpApprox(RValue<Float> x,bool exactAtPow2)2899 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2900 {
2901 #if defined(__i386__) || defined(__x86_64__)
2902 	if(exactAtPow2)
2903 	{
2904 		// rcpss uses a piecewise-linear approximation which minimizes the relative error
2905 		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2906 		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2907 	}
2908 	return x86::rcpss(x);
2909 #else
2910 	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2911 	return { 0.0f };
2912 #endif
2913 }
2914 
// Reports whether RcpSqrtApprox() is usable in this build: true when compiled
// for x86 (32- or 64-bit), false on every other target.
bool HasRcpSqrtApprox()
{
#if defined(__i386__) || defined(__x86_64__)
	constexpr bool available = true;
#else
	constexpr bool available = false;
#endif
	return available;
}
2923 
RcpSqrtApprox(RValue<Float4> x)2924 RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
2925 {
2926 #if defined(__i386__) || defined(__x86_64__)
2927 	return x86::rsqrtps(x);
2928 #else
2929 	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2930 	return { 0.0f };
2931 #endif
2932 }
2933 
RcpSqrtApprox(RValue<Float> x)2934 RValue<Float> RcpSqrtApprox(RValue<Float> x)
2935 {
2936 #if defined(__i386__) || defined(__x86_64__)
2937 	return x86::rsqrtss(x);
2938 #else
2939 	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2940 	return { 0.0f };
2941 #endif
2942 }
2943 
Sqrt(RValue<Float> x)2944 RValue<Float> Sqrt(RValue<Float> x)
2945 {
2946 	RR_DEBUG_INFO_UPDATE_LOC();
2947 #if defined(__i386__) || defined(__x86_64__)
2948 	return x86::sqrtss(x);
2949 #else
2950 	return As<Float>(V(lowerSQRT(V(x.value()))));
2951 #endif
2952 }
2953 
Round(RValue<Float> x)2954 RValue<Float> Round(RValue<Float> x)
2955 {
2956 	RR_DEBUG_INFO_UPDATE_LOC();
2957 #if defined(__i386__) || defined(__x86_64__)
2958 	if(CPUID::supportsSSE4_1())
2959 	{
2960 		return x86::roundss(x, 0);
2961 	}
2962 	else
2963 	{
2964 		return Float4(Round(Float4(x))).x;
2965 	}
2966 #else
2967 	return RValue<Float>(V(lowerRound(V(x.value()))));
2968 #endif
2969 }
2970 
Trunc(RValue<Float> x)2971 RValue<Float> Trunc(RValue<Float> x)
2972 {
2973 	RR_DEBUG_INFO_UPDATE_LOC();
2974 #if defined(__i386__) || defined(__x86_64__)
2975 	if(CPUID::supportsSSE4_1())
2976 	{
2977 		return x86::roundss(x, 3);
2978 	}
2979 	else
2980 	{
2981 		return Float(Int(x));  // Rounded toward zero
2982 	}
2983 #else
2984 	return RValue<Float>(V(lowerTrunc(V(x.value()))));
2985 #endif
2986 }
2987 
Frac(RValue<Float> x)2988 RValue<Float> Frac(RValue<Float> x)
2989 {
2990 	RR_DEBUG_INFO_UPDATE_LOC();
2991 #if defined(__i386__) || defined(__x86_64__)
2992 	if(CPUID::supportsSSE4_1())
2993 	{
2994 		return x - x86::floorss(x);
2995 	}
2996 	else
2997 	{
2998 		return Float4(Frac(Float4(x))).x;
2999 	}
3000 #else
3001 	// x - floor(x) can be 1.0 for very small negative x.
3002 	// Clamp against the value just below 1.0.
3003 	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
3004 #endif
3005 }
3006 
Floor(RValue<Float> x)3007 RValue<Float> Floor(RValue<Float> x)
3008 {
3009 	RR_DEBUG_INFO_UPDATE_LOC();
3010 #if defined(__i386__) || defined(__x86_64__)
3011 	if(CPUID::supportsSSE4_1())
3012 	{
3013 		return x86::floorss(x);
3014 	}
3015 	else
3016 	{
3017 		return Float4(Floor(Float4(x))).x;
3018 	}
3019 #else
3020 	return RValue<Float>(V(lowerFloor(V(x.value()))));
3021 #endif
3022 }
3023 
Ceil(RValue<Float> x)3024 RValue<Float> Ceil(RValue<Float> x)
3025 {
3026 	RR_DEBUG_INFO_UPDATE_LOC();
3027 #if defined(__i386__) || defined(__x86_64__)
3028 	if(CPUID::supportsSSE4_1())
3029 	{
3030 		return x86::ceilss(x);
3031 	}
3032 	else
3033 #endif
3034 	{
3035 		return Float4(Ceil(Float4(x))).x;
3036 	}
3037 }
3038 
type()3039 Type *Float::type()
3040 {
3041 	return T(llvm::Type::getFloatTy(*jit->context));
3042 }
3043 
type()3044 Type *Float2::type()
3045 {
3046 	return T(Type_v2f32);
3047 }
3048 
Float4(RValue<Float> rhs)3049 Float4::Float4(RValue<Float> rhs)
3050     : XYZW(this)
3051 {
3052 	RR_DEBUG_INFO_UPDATE_LOC();
3053 	Value *vector = loadValue();
3054 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
3055 
3056 	std::vector<int> swizzle = { 0, 0, 0, 0 };
3057 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
3058 
3059 	storeValue(replicate);
3060 }
3061 
MulAdd(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3062 RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3063 {
3064 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(Float4::type()) });
3065 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3066 }
3067 
FMA(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3068 RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3069 {
3070 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(Float4::type()) });
3071 	return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3072 }
3073 
Abs(RValue<Float4> x)3074 RValue<Float4> Abs(RValue<Float4> x)
3075 {
3076 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
3077 	return RValue<Float4>(V(jit->builder->CreateCall(func, V(x.value()))));
3078 }
3079 
Max(RValue<Float4> x,RValue<Float4> y)3080 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
3081 {
3082 	RR_DEBUG_INFO_UPDATE_LOC();
3083 #if defined(__i386__) || defined(__x86_64__)
3084 	return x86::maxps(x, y);
3085 #else
3086 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
3087 #endif
3088 }
3089 
Min(RValue<Float4> x,RValue<Float4> y)3090 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
3091 {
3092 	RR_DEBUG_INFO_UPDATE_LOC();
3093 #if defined(__i386__) || defined(__x86_64__)
3094 	return x86::minps(x, y);
3095 #else
3096 	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
3097 #endif
3098 }
3099 
Sqrt(RValue<Float4> x)3100 RValue<Float4> Sqrt(RValue<Float4> x)
3101 {
3102 	RR_DEBUG_INFO_UPDATE_LOC();
3103 #if defined(__i386__) || defined(__x86_64__)
3104 	return x86::sqrtps(x);
3105 #else
3106 	return As<Float4>(V(lowerSQRT(V(x.value()))));
3107 #endif
3108 }
3109 
SignMask(RValue<Float4> x)3110 RValue<Int> SignMask(RValue<Float4> x)
3111 {
3112 	RR_DEBUG_INFO_UPDATE_LOC();
3113 #if defined(__i386__) || defined(__x86_64__)
3114 	return x86::movmskps(x);
3115 #else
3116 	return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
3117 #endif
3118 }
3119 
CmpEQ(RValue<Float4> x,RValue<Float4> y)3120 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3121 {
3122 	RR_DEBUG_INFO_UPDATE_LOC();
3123 	//	return As<Int4>(x86::cmpeqps(x, y));
3124 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3125 }
3126 
CmpLT(RValue<Float4> x,RValue<Float4> y)3127 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3128 {
3129 	RR_DEBUG_INFO_UPDATE_LOC();
3130 	//	return As<Int4>(x86::cmpltps(x, y));
3131 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3132 }
3133 
CmpLE(RValue<Float4> x,RValue<Float4> y)3134 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3135 {
3136 	RR_DEBUG_INFO_UPDATE_LOC();
3137 	//	return As<Int4>(x86::cmpleps(x, y));
3138 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3139 }
3140 
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3141 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3142 {
3143 	RR_DEBUG_INFO_UPDATE_LOC();
3144 	//	return As<Int4>(x86::cmpneqps(x, y));
3145 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3146 }
3147 
CmpNLT(RValue<Float4> x,RValue<Float4> y)3148 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3149 {
3150 	RR_DEBUG_INFO_UPDATE_LOC();
3151 	//	return As<Int4>(x86::cmpnltps(x, y));
3152 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3153 }
3154 
CmpNLE(RValue<Float4> x,RValue<Float4> y)3155 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3156 {
3157 	RR_DEBUG_INFO_UPDATE_LOC();
3158 	//	return As<Int4>(x86::cmpnleps(x, y));
3159 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3160 }
3161 
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3162 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3163 {
3164 	RR_DEBUG_INFO_UPDATE_LOC();
3165 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3166 }
3167 
CmpULT(RValue<Float4> x,RValue<Float4> y)3168 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3169 {
3170 	RR_DEBUG_INFO_UPDATE_LOC();
3171 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3172 }
3173 
CmpULE(RValue<Float4> x,RValue<Float4> y)3174 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3175 {
3176 	RR_DEBUG_INFO_UPDATE_LOC();
3177 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3178 }
3179 
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3180 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3181 {
3182 	RR_DEBUG_INFO_UPDATE_LOC();
3183 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3184 }
3185 
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3186 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3187 {
3188 	RR_DEBUG_INFO_UPDATE_LOC();
3189 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3190 }
3191 
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3192 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3193 {
3194 	RR_DEBUG_INFO_UPDATE_LOC();
3195 	return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3196 }
3197 
Round(RValue<Float4> x)3198 RValue<Float4> Round(RValue<Float4> x)
3199 {
3200 	RR_DEBUG_INFO_UPDATE_LOC();
3201 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3202 	if(CPUID::supportsSSE4_1())
3203 	{
3204 		return x86::roundps(x, 0);
3205 	}
3206 	else
3207 	{
3208 		return Float4(RoundInt(x));
3209 	}
3210 #else
3211 	return RValue<Float4>(V(lowerRound(V(x.value()))));
3212 #endif
3213 }
3214 
Trunc(RValue<Float4> x)3215 RValue<Float4> Trunc(RValue<Float4> x)
3216 {
3217 	RR_DEBUG_INFO_UPDATE_LOC();
3218 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3219 	if(CPUID::supportsSSE4_1())
3220 	{
3221 		return x86::roundps(x, 3);
3222 	}
3223 	else
3224 	{
3225 		return Float4(Int4(x));
3226 	}
3227 #else
3228 	return RValue<Float4>(V(lowerTrunc(V(x.value()))));
3229 #endif
3230 }
3231 
Frac(RValue<Float4> x)3232 RValue<Float4> Frac(RValue<Float4> x)
3233 {
3234 	RR_DEBUG_INFO_UPDATE_LOC();
3235 	Float4 frc;
3236 
3237 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3238 	if(CPUID::supportsSSE4_1())
3239 	{
3240 		frc = x - x86::floorps(x);
3241 	}
3242 	else
3243 	{
3244 		frc = x - Float4(Int4(x));  // Signed fractional part.
3245 
3246 		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));  // Add 1.0 if negative.
3247 	}
3248 #else
3249 	frc = x - Floor(x);
3250 #endif
3251 
3252 	// x - floor(x) can be 1.0 for very small negative x.
3253 	// Clamp against the value just below 1.0.
3254 	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
3255 }
3256 
Floor(RValue<Float4> x)3257 RValue<Float4> Floor(RValue<Float4> x)
3258 {
3259 	RR_DEBUG_INFO_UPDATE_LOC();
3260 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3261 	if(CPUID::supportsSSE4_1())
3262 	{
3263 		return x86::floorps(x);
3264 	}
3265 	else
3266 	{
3267 		return x - Frac(x);
3268 	}
3269 #else
3270 	return RValue<Float4>(V(lowerFloor(V(x.value()))));
3271 #endif
3272 }
3273 
Ceil(RValue<Float4> x)3274 RValue<Float4> Ceil(RValue<Float4> x)
3275 {
3276 	RR_DEBUG_INFO_UPDATE_LOC();
3277 #if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
3278 	if(CPUID::supportsSSE4_1())
3279 	{
3280 		return x86::ceilps(x);
3281 	}
3282 	else
3283 #endif
3284 	{
3285 		return -Floor(-x);
3286 	}
3287 }
3288 
Ctlz(RValue<UInt> v,bool isZeroUndef)3289 RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
3290 {
3291 	RR_DEBUG_INFO_UPDATE_LOC();
3292 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
3293 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3294 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3295 }
3296 
Ctlz(RValue<UInt4> v,bool isZeroUndef)3297 RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
3298 {
3299 	RR_DEBUG_INFO_UPDATE_LOC();
3300 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
3301 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3302 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3303 }
3304 
Cttz(RValue<UInt> v,bool isZeroUndef)3305 RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
3306 {
3307 	RR_DEBUG_INFO_UPDATE_LOC();
3308 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
3309 	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
3310 	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3311 }
3312 
Cttz(RValue<UInt4> v,bool isZeroUndef)3313 RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
3314 {
3315 	RR_DEBUG_INFO_UPDATE_LOC();
3316 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
3317 	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
3318 	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
3319 }
3320 
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3321 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3322 {
3323 	return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3324 }
3325 
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3326 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3327 {
3328 	return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3329 }
3330 
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3331 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3332 {
3333 	return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3334 }
3335 
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3336 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3337 {
3338 	return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3339 }
3340 
type()3341 Type *Float4::type()
3342 {
3343 	return T(llvm::VectorType::get(T(Float::type()), 4, false));
3344 }
3345 
Ticks()3346 RValue<Long> Ticks()
3347 {
3348 	RR_DEBUG_INFO_UPDATE_LOC();
3349 	llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);
3350 
3351 	return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
3352 }
3353 
ConstantPointer(const void * ptr)3354 RValue<Pointer<Byte>> ConstantPointer(const void *ptr)
3355 {
3356 	RR_DEBUG_INFO_UPDATE_LOC();
3357 	// Note: this should work for 32-bit pointers as well because 'inttoptr'
3358 	// is defined to truncate (and zero extend) if necessary.
3359 	auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
3360 	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
3361 }
3362 
ConstantData(const void * data,size_t size)3363 RValue<Pointer<Byte>> ConstantData(const void *data, size_t size)
3364 {
3365 	RR_DEBUG_INFO_UPDATE_LOC();
3366 	auto str = ::std::string(reinterpret_cast<const char *>(data), size);
3367 	auto ptr = jit->builder->CreateGlobalStringPtr(str);
3368 	return RValue<Pointer<Byte>>(V(ptr));
3369 }
3370 
Call(RValue<Pointer<Byte>> fptr,Type * retTy,std::initializer_list<Value * > args,std::initializer_list<Type * > argTys)3371 Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
3372 {
3373 	// If this is a MemorySanitizer build, but Reactor routine instrumentation is not enabled,
3374 	// mark all call arguments as initialized by calling __msan_unpoison_param().
3375 	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
3376 	{
3377 		// void __msan_unpoison_param(size_t n)
3378 		auto voidTy = llvm::Type::getVoidTy(*jit->context);
3379 		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
3380 		auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy }, false);
3381 		auto func = jit->module->getOrInsertFunction("__msan_unpoison_param", funcTy);
3382 
3383 		jit->builder->CreateCall(func, { llvm::ConstantInt::get(sizetTy, args.size()) });
3384 	}
3385 
3386 	RR_DEBUG_INFO_UPDATE_LOC();
3387 	llvm::SmallVector<llvm::Type *, 8> paramTys;
3388 	for(auto ty : argTys) { paramTys.push_back(T(ty)); }
3389 	auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);
3390 
3391 	auto funcPtrTy = funcTy->getPointerTo();
3392 	auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);
3393 
3394 	llvm::SmallVector<llvm::Value *, 8> arguments;
3395 	for(auto arg : args) { arguments.push_back(V(arg)); }
3396 	return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
3397 }
3398 
Breakpoint()3399 void Breakpoint()
3400 {
3401 	RR_DEBUG_INFO_UPDATE_LOC();
3402 	llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);
3403 
3404 	jit->builder->CreateCall(debugtrap);
3405 }
3406 
3407 }  // namespace rr
3408 
3409 namespace rr {
3410 
3411 #if defined(__i386__) || defined(__x86_64__)
3412 namespace x86 {
3413 
3414 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3415 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
createInstruction(llvm::Intrinsic::ID id,Value * x)3416 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
3417 {
3418 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3419 
3420 	return V(jit->builder->CreateCall(intrinsic, V(x)));
3421 }
3422 
3423 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3424 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
createInstruction(llvm::Intrinsic::ID id,Value * x,Value * y)3425 static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
3426 {
3427 	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);
3428 
3429 	return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
3430 }
3431 
cvtss2si(RValue<Float> val)3432 RValue<Int> cvtss2si(RValue<Float> val)
3433 {
3434 	Float4 vector;
3435 	vector.x = val;
3436 
3437 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
3438 }
3439 
cvtps2dq(RValue<Float4> val)3440 RValue<Int4> cvtps2dq(RValue<Float4> val)
3441 {
3442 	ASSERT(!__has_feature(memory_sanitizer));  // TODO(b/172238865): Not correctly instrumented by MemorySanitizer.
3443 
3444 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
3445 }
3446 
rcpss(RValue<Float> val)3447 RValue<Float> rcpss(RValue<Float> val)
3448 {
3449 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3450 
3451 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
3452 }
3453 
sqrtss(RValue<Float> val)3454 RValue<Float> sqrtss(RValue<Float> val)
3455 {
3456 	return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3457 }
3458 
rsqrtss(RValue<Float> val)3459 RValue<Float> rsqrtss(RValue<Float> val)
3460 {
3461 	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);
3462 
3463 	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
3464 }
3465 
rcpps(RValue<Float4> val)3466 RValue<Float4> rcpps(RValue<Float4> val)
3467 {
3468 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
3469 }
3470 
sqrtps(RValue<Float4> val)3471 RValue<Float4> sqrtps(RValue<Float4> val)
3472 {
3473 	return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3474 }
3475 
rsqrtps(RValue<Float4> val)3476 RValue<Float4> rsqrtps(RValue<Float4> val)
3477 {
3478 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3479 }
3480 
maxps(RValue<Float4> x,RValue<Float4> y)3481 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3482 {
3483 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3484 }
3485 
minps(RValue<Float4> x,RValue<Float4> y)3486 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3487 {
3488 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3489 }
3490 
roundss(RValue<Float> val,unsigned char imm)3491 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3492 {
3493 	llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3494 
3495 	Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3496 	Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3497 
3498 	return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3499 }
3500 
floorss(RValue<Float> val)3501 RValue<Float> floorss(RValue<Float> val)
3502 {
3503 	return roundss(val, 1);
3504 }
3505 
ceilss(RValue<Float> val)3506 RValue<Float> ceilss(RValue<Float> val)
3507 {
3508 	return roundss(val, 2);
3509 }
3510 
roundps(RValue<Float4> val,unsigned char imm)3511 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3512 {
3513 	ASSERT(!__has_feature(memory_sanitizer));  // TODO(b/172238865): Not correctly instrumented by MemorySanitizer.
3514 
3515 	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3516 }
3517 
floorps(RValue<Float4> val)3518 RValue<Float4> floorps(RValue<Float4> val)
3519 {
3520 	return roundps(val, 1);
3521 }
3522 
ceilps(RValue<Float4> val)3523 RValue<Float4> ceilps(RValue<Float4> val)
3524 {
3525 	return roundps(val, 2);
3526 }
3527 
paddsw(RValue<Short4> x,RValue<Short4> y)3528 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3529 {
3530 	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3531 }
3532 
psubsw(RValue<Short4> x,RValue<Short4> y)3533 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3534 {
3535 	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3536 }
3537 
paddusw(RValue<UShort4> x,RValue<UShort4> y)3538 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3539 {
3540 	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3541 }
3542 
psubusw(RValue<UShort4> x,RValue<UShort4> y)3543 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3544 {
3545 	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3546 }
3547 
paddsb(RValue<SByte8> x,RValue<SByte8> y)3548 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3549 {
3550 	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3551 }
3552 
psubsb(RValue<SByte8> x,RValue<SByte8> y)3553 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3554 {
3555 	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3556 }
3557 
paddusb(RValue<Byte8> x,RValue<Byte8> y)3558 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3559 {
3560 	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3561 }
3562 
psubusb(RValue<Byte8> x,RValue<Byte8> y)3563 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3564 {
3565 	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3566 }
3567 
pavgw(RValue<UShort4> x,RValue<UShort4> y)3568 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3569 {
3570 	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3571 }
3572 
pmaxsw(RValue<Short4> x,RValue<Short4> y)3573 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3574 {
3575 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3576 }
3577 
pminsw(RValue<Short4> x,RValue<Short4> y)3578 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3579 {
3580 	return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3581 }
3582 
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3583 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3584 {
3585 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3586 }
3587 
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3588 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3589 {
3590 	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3591 }
3592 
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3593 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3594 {
3595 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3596 }
3597 
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3598 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3599 {
3600 	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3601 }
3602 
packssdw(RValue<Int2> x,RValue<Int2> y)3603 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3604 {
3605 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3606 }
3607 
packssdw(RValue<Int4> x,RValue<Int4> y)3608 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3609 {
3610 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3611 }
3612 
packsswb(RValue<Short4> x,RValue<Short4> y)3613 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3614 {
3615 	return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3616 }
3617 
packuswb(RValue<Short4> x,RValue<Short4> y)3618 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3619 {
3620 	return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3621 }
3622 
packusdw(RValue<Int4> x,RValue<Int4> y)3623 RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
3624 {
3625 	if(CPUID::supportsSSE4_1())
3626 	{
3627 		return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
3628 	}
3629 	else
3630 	{
3631 		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
3632 		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);
3633 
3634 		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
3635 	}
3636 }
3637 
psrlw(RValue<UShort4> x,unsigned char y)3638 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3639 {
3640 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3641 }
3642 
psrlw(RValue<UShort8> x,unsigned char y)3643 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3644 {
3645 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3646 }
3647 
psraw(RValue<Short4> x,unsigned char y)3648 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3649 {
3650 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3651 }
3652 
psraw(RValue<Short8> x,unsigned char y)3653 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3654 {
3655 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3656 }
3657 
psllw(RValue<Short4> x,unsigned char y)3658 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3659 {
3660 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3661 }
3662 
psllw(RValue<Short8> x,unsigned char y)3663 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3664 {
3665 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3666 }
3667 
pslld(RValue<Int2> x,unsigned char y)3668 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3669 {
3670 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3671 }
3672 
pslld(RValue<Int4> x,unsigned char y)3673 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3674 {
3675 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3676 }
3677 
psrad(RValue<Int2> x,unsigned char y)3678 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3679 {
3680 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3681 }
3682 
psrad(RValue<Int4> x,unsigned char y)3683 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3684 {
3685 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3686 }
3687 
psrld(RValue<UInt2> x,unsigned char y)3688 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3689 {
3690 	return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3691 }
3692 
psrld(RValue<UInt4> x,unsigned char y)3693 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3694 {
3695 	return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3696 }
3697 
pmaxsd(RValue<Int4> x,RValue<Int4> y)3698 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3699 {
3700 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3701 }
3702 
pminsd(RValue<Int4> x,RValue<Int4> y)3703 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3704 {
3705 	return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3706 }
3707 
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3708 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3709 {
3710 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3711 }
3712 
pminud(RValue<UInt4> x,RValue<UInt4> y)3713 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3714 {
3715 	return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3716 }
3717 
pmulhw(RValue<Short4> x,RValue<Short4> y)3718 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3719 {
3720 	return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3721 }
3722 
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3723 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3724 {
3725 	return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3726 }
3727 
pmaddwd(RValue<Short4> x,RValue<Short4> y)3728 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3729 {
3730 	return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3731 }
3732 
pmulhw(RValue<Short8> x,RValue<Short8> y)3733 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3734 {
3735 	return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3736 }
3737 
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3738 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3739 {
3740 	return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3741 }
3742 
pmaddwd(RValue<Short8> x,RValue<Short8> y)3743 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3744 {
3745 	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3746 }
3747 
movmskps(RValue<Float4> x)3748 RValue<Int> movmskps(RValue<Float4> x)
3749 {
3750 	Value *v = x.value();
3751 
3752 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
3753 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
3754 	// just the sign bits to avoid false positives.
3755 	if(__has_feature(memory_sanitizer))
3756 	{
3757 		v = As<Float4>(As<Int4>(v) & Int4(0x80000000u)).value();
3758 	}
3759 
3760 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, v));
3761 }
3762 
pmovmskb(RValue<Byte8> x)3763 RValue<Int> pmovmskb(RValue<Byte8> x)
3764 {
3765 	Value *v = x.value();
3766 
3767 	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
3768 	// which makes it look at the entire 128-bit input for undefined bits. Mask off
3769 	// just the sign bits in the lower 64-bit vector to avoid false positives.
3770 	if(__has_feature(memory_sanitizer))
3771 	{
3772 		v = As<Byte16>(As<Int4>(v) & Int4(0x80808080u, 0x80808080u, 0, 0)).value();
3773 	}
3774 
3775 	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, v)) & 0xFF;
3776 }
3777 
3778 }  // namespace x86
3779 #endif  // defined(__i386__) || defined(__x86_64__)
3780 
3781 #ifdef ENABLE_RR_PRINT
VPrintf(const std::vector<Value * > & vals)3782 void VPrintf(const std::vector<Value *> &vals)
3783 {
3784 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
3785 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
3786 	auto funcTy = llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
3787 	auto func = jit->module->getOrInsertFunction("rr::DebugPrintf", funcTy);
3788 	jit->builder->CreateCall(func, V(vals));
3789 }
3790 #endif  // ENABLE_RR_PRINT
3791 
Nop()3792 void Nop()
3793 {
3794 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
3795 	auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
3796 	auto func = jit->module->getOrInsertFunction("nop", funcTy);
3797 	jit->builder->CreateCall(func);
3798 }
3799 
// Forwards to DebugInfo::EmitLocation() when debug info is compiled in and
// active; otherwise does nothing.
void EmitDebugLocation()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(!jit->debugInfo)
	{
		return;
	}

	jit->debugInfo->EmitLocation();
#endif  // ENABLE_RR_DEBUG_INFO
}
3809 
EmitDebugVariable(Value * value)3810 void EmitDebugVariable(Value *value)
3811 {
3812 #ifdef ENABLE_RR_DEBUG_INFO
3813 	if(jit->debugInfo != nullptr)
3814 	{
3815 		jit->debugInfo->EmitVariable(value);
3816 	}
3817 #endif  // ENABLE_RR_DEBUG_INFO
3818 }
3819 
// Forwards to DebugInfo::Flush() when debug info is compiled in and active;
// otherwise does nothing.
void FlushDebug()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(!jit->debugInfo)
	{
		return;
	}

	jit->debugInfo->Flush();
#endif  // ENABLE_RR_DEBUG_INFO
}
3829 
3830 }  // namespace rr
3831 
3832 // ------------------------------  Coroutines ------------------------------
3833 
3834 namespace {
3835 
// Magic values returned by llvm.coro.suspend.
// See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
enum SuspendAction
{
	SuspendActionResume = 0,    // Continue at the resume point.
	SuspendActionDestroy = 1,   // Continue on the destroy path.
	SuspendActionSuspend = -1,  // The coroutine is suspending.
};
3844 
promoteFunctionToCoroutine()3845 void promoteFunctionToCoroutine()
3846 {
3847 	ASSERT(jit->coroutine.id == nullptr);
3848 
3849 	// Types
3850 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
3851 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
3852 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
3853 	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
3854 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
3855 	auto promiseTy = jit->coroutine.yieldType;
3856 	auto promisePtrTy = promiseTy->getPointerTo();
3857 
3858 	// LLVM intrinsics
3859 	auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
3860 	auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
3861 	auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
3862 	auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
3863 	auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
3864 	auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
3865 	auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
3866 	auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
3867 	auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
3868 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
3869 
3870 	auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
3871 	auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
3872 	auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
3873 	auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);
3874 
3875 	auto oldInsertionPoint = jit->builder->saveIP();
3876 
3877 	// Build the coroutine_await() function:
3878 	//
3879 	//    bool coroutine_await(CoroutineHandle* handle, YieldType* out)
3880 	//    {
3881 	//        if(llvm.coro.done(handle))
3882 	//        {
3883 	//            return false;
3884 	//        }
3885 	//        else
3886 	//        {
3887 	//            *value = (T*)llvm.coro.promise(handle);
3888 	//            llvm.coro.resume(handle);
3889 	//            return true;
3890 	//        }
3891 	//    }
3892 	//
3893 	{
3894 		auto args = jit->coroutine.await->arg_begin();
3895 		auto handle = args++;
3896 		auto outPtr = args++;
3897 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
3898 		auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
3899 		auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);
3900 
3901 		auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
3902 		jit->builder->CreateCondBr(done, doneBlock, resumeBlock);
3903 
3904 		jit->builder->SetInsertPoint(doneBlock);
3905 		jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));
3906 
3907 		jit->builder->SetInsertPoint(resumeBlock);
3908 		auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4);  // TODO: Get correct alignment.
3909 		auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
3910 		auto promise = jit->builder->CreateLoad(promiseTy, jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
3911 		jit->builder->CreateStore(promise, outPtr);
3912 		jit->builder->CreateCall(coro_resume, { handle });
3913 		jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
3914 	}
3915 
3916 	// Build the coroutine_destroy() function:
3917 	//
3918 	//    void coroutine_destroy(CoroutineHandle* handle)
3919 	//    {
3920 	//        llvm.coro.destroy(handle);
3921 	//    }
3922 	//
3923 	{
3924 		auto handle = jit->coroutine.destroy->arg_begin();
3925 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
3926 		jit->builder->CreateCall(coro_destroy, { handle });
3927 		jit->builder->CreateRetVoid();
3928 	}
3929 
3930 	// Begin building the main coroutine_begin() function.
3931 	//
3932 	//    CoroutineHandle* coroutine_begin(<Arguments>)
3933 	//    {
3934 	//        YieldType promise;
3935 	//        auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
3936 	//        void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
3937 	//        CoroutineHandle *handle = llvm.coro.begin(id, frame);
3938 	//
3939 	//        ... <REACTOR CODE> ...
3940 	//
3941 	//    end:
3942 	//        SuspendAction action = llvm.coro.suspend(none, true /* final */);  // <-- RESUME POINT
3943 	//        switch(action)
3944 	//        {
3945 	//        case SuspendActionResume:
3946 	//            UNREACHABLE(); // Illegal to resume after final suspend.
3947 	//        case SuspendActionDestroy:
3948 	//            goto destroy;
3949 	//        default: // (SuspendActionSuspend)
3950 	//            goto suspend;
3951 	//        }
3952 	//
3953 	//    destroy:
3954 	//        coroutine_free_frame(llvm.coro.free(id, handle));
3955 	//        goto suspend;
3956 	//
3957 	//    suspend:
3958 	//        llvm.coro.end(handle, false);
3959 	//        return handle;
3960 	//    }
3961 	//
3962 
3963 #ifdef ENABLE_RR_DEBUG_INFO
3964 	jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
3965 #endif  // ENABLE_RR_DEBUG_INFO
3966 
3967 	jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
3968 	jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
3969 	jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);
3970 
3971 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
3972 	jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
3973 	jit->coroutine.id = jit->builder->CreateCall(coro_id, {
3974 	                                                          llvm::ConstantInt::get(i32Ty, 0),
3975 	                                                          jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
3976 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
3977 	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
3978 	                                                      });
3979 	auto size = jit->builder->CreateCall(coro_size, {});
3980 	auto frame = jit->builder->CreateCall(allocFrame, { size });
3981 	jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });
3982 
3983 	// Build the suspend block
3984 	jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
3985 	jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
3986 	jit->builder->CreateRet(jit->coroutine.handle);
3987 
3988 	// Build the end block
3989 	jit->builder->SetInsertPoint(jit->coroutine.endBlock);
3990 	auto action = jit->builder->CreateCall(coro_suspend, {
3991 	                                                         llvm::ConstantTokenNone::get(*jit->context),
3992 	                                                         llvm::ConstantInt::get(i1Ty, 1),  // final: true
3993 	                                                     });
3994 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
3995 	// switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
3996 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
3997 
3998 	// Build the destroy block
3999 	jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
4000 	auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
4001 	jit->builder->CreateCall(freeFrame, { memory });
4002 	jit->builder->CreateBr(jit->coroutine.suspendBlock);
4003 
4004 	// Switch back to original insert point to continue building the coroutine.
4005 	jit->builder->restoreIP(oldInsertionPoint);
4006 }
4007 
4008 }  // anonymous namespace
4009 
4010 namespace rr {
4011 
createCoroutine(Type * YieldType,const std::vector<Type * > & Params)4012 void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
4013 {
4014 	// Coroutines are initially created as a regular function.
4015 	// Upon the first call to Yield(), the function is promoted to a true
4016 	// coroutine.
4017 	auto voidTy = llvm::Type::getVoidTy(*jit->context);
4018 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4019 	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
4020 	auto handleTy = i8PtrTy;
4021 	auto boolTy = i1Ty;
4022 	auto promiseTy = T(YieldType);
4023 	auto promisePtrTy = promiseTy->getPointerTo();
4024 
4025 	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
4026 #if LLVM_VERSION_MAJOR >= 16
4027 	jit->function->setPresplitCoroutine();
4028 #else
4029 	jit->function->addFnAttr("coroutine.presplit", "0");
4030 #endif
4031 	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
4032 	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
4033 	jit->coroutine.yieldType = promiseTy;
4034 	jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);
4035 
4036 	jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
4037 }
4038 
yield(Value * val)4039 void Nucleus::yield(Value *val)
4040 {
4041 	if(jit->coroutine.id == nullptr)
4042 	{
4043 		// First call to yield().
4044 		// Promote the function to a full coroutine.
4045 		promoteFunctionToCoroutine();
4046 		ASSERT(jit->coroutine.id != nullptr);
4047 	}
4048 
4049 	//      promise = val;
4050 	//
4051 	//      auto action = llvm.coro.suspend(none, false /* final */); // <-- RESUME POINT
4052 	//      switch(action)
4053 	//      {
4054 	//      case SuspendActionResume:
4055 	//          goto resume;
4056 	//      case SuspendActionDestroy:
4057 	//          goto destroy;
4058 	//      default: // (SuspendActionSuspend)
4059 	//          goto suspend;
4060 	//      }
4061 	//  resume:
4062 	//
4063 
4064 	RR_DEBUG_INFO_UPDATE_LOC();
4065 	Variable::materializeAll();
4066 
4067 	// Types
4068 	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
4069 	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
4070 
4071 	// Intrinsics
4072 	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);
4073 
4074 	// Create a block to resume execution.
4075 	auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);
4076 
4077 	// Store the promise (yield value)
4078 	jit->builder->CreateStore(V(val), jit->coroutine.promise);
4079 	auto action = jit->builder->CreateCall(coro_suspend, {
4080 	                                                         llvm::ConstantTokenNone::get(*jit->context),
4081 	                                                         llvm::ConstantInt::get(i1Ty, 0),  // final: true
4082 	                                                     });
4083 	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
4084 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
4085 	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);
4086 
4087 	// Continue building in the resume block.
4088 	jit->builder->SetInsertPoint(resumeBlock);
4089 }
4090 
acquireCoroutine(const char * name)4091 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name)
4092 {
4093 	if(jit->coroutine.id)
4094 	{
4095 		jit->builder->CreateBr(jit->coroutine.endBlock);
4096 	}
4097 	else
4098 	{
4099 		// Coroutine without a Yield acts as a regular function.
4100 		// The 'coroutine_begin' function returns a nullptr for the coroutine
4101 		// handle.
4102 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4103 		// The 'coroutine_await' function always returns false (coroutine done).
4104 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4105 		jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4106 		// The 'coroutine_destroy' does nothing, returns void.
4107 		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4108 		jit->builder->CreateRetVoid();
4109 	}
4110 
4111 #ifdef ENABLE_RR_DEBUG_INFO
4112 	if(jit->debugInfo != nullptr)
4113 	{
4114 		jit->debugInfo->Finalize();
4115 	}
4116 #endif  // ENABLE_RR_DEBUG_INFO
4117 
4118 	if(false)
4119 	{
4120 		std::error_code error;
4121 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4122 		jit->module->print(file, 0);
4123 	}
4124 
4125 	jit->runPasses();
4126 
4127 	if(false)
4128 	{
4129 		std::error_code error;
4130 		llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4131 		jit->module->print(file, 0);
4132 	}
4133 
4134 	llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4135 	funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4136 	funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4137 	funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4138 
4139 	auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount);
4140 
4141 	delete jit;
4142 	jit = nullptr;
4143 
4144 	return routine;
4145 }
4146 
invokeCoroutineBegin(Routine & routine,std::function<Nucleus::CoroutineHandle ()> func)4147 Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
4148 {
4149 	return func();
4150 }
4151 
Int(RValue<scalar::Int> rhs)4152 SIMD::Int::Int(RValue<scalar::Int> rhs)
4153     : XYZW(this)
4154 {
4155 	RR_DEBUG_INFO_UPDATE_LOC();
4156 	Value *vector = loadValue();
4157 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
4158 
4159 	std::vector<int> swizzle = { 0 };
4160 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4161 
4162 	storeValue(replicate);
4163 }
4164 
operator <<(RValue<SIMD::Int> lhs,unsigned char rhs)4165 RValue<SIMD::Int> operator<<(RValue<SIMD::Int> lhs, unsigned char rhs)
4166 {
4167 	RR_DEBUG_INFO_UPDATE_LOC();
4168 	return As<SIMD::Int>(V(lowerVectorShl(V(lhs.value()), rhs)));
4169 }
4170 
operator >>(RValue<SIMD::Int> lhs,unsigned char rhs)4171 RValue<SIMD::Int> operator>>(RValue<SIMD::Int> lhs, unsigned char rhs)
4172 {
4173 	RR_DEBUG_INFO_UPDATE_LOC();
4174 	return As<SIMD::Int>(V(lowerVectorAShr(V(lhs.value()), rhs)));
4175 }
4176 
CmpEQ(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4177 RValue<SIMD::Int> CmpEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4178 {
4179 	RR_DEBUG_INFO_UPDATE_LOC();
4180 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), SIMD::Int::type()));
4181 }
4182 
CmpLT(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4183 RValue<SIMD::Int> CmpLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4184 {
4185 	RR_DEBUG_INFO_UPDATE_LOC();
4186 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), SIMD::Int::type()));
4187 }
4188 
CmpLE(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4189 RValue<SIMD::Int> CmpLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4190 {
4191 	RR_DEBUG_INFO_UPDATE_LOC();
4192 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), SIMD::Int::type()));
4193 }
4194 
CmpNEQ(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4195 RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4196 {
4197 	RR_DEBUG_INFO_UPDATE_LOC();
4198 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), SIMD::Int::type()));
4199 }
4200 
CmpNLT(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4201 RValue<SIMD::Int> CmpNLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4202 {
4203 	RR_DEBUG_INFO_UPDATE_LOC();
4204 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), SIMD::Int::type()));
4205 }
4206 
CmpNLE(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4207 RValue<SIMD::Int> CmpNLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4208 {
4209 	RR_DEBUG_INFO_UPDATE_LOC();
4210 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), SIMD::Int::type()));
4211 }
4212 
Abs(RValue<SIMD::Int> x)4213 RValue<SIMD::Int> Abs(RValue<SIMD::Int> x)
4214 {
4215 #if LLVM_VERSION_MAJOR >= 12
4216 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
4217 	return RValue<SIMD::Int>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
4218 #else
4219 	auto negative = x >> 31;
4220 	return (x ^ negative) - negative;
4221 #endif
4222 }
4223 
Max(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4224 RValue<SIMD::Int> Max(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4225 {
4226 	RR_DEBUG_INFO_UPDATE_LOC();
4227 	RValue<SIMD::Int> greater = CmpNLE(x, y);
4228 	return (x & greater) | (y & ~greater);
4229 }
4230 
Min(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4231 RValue<SIMD::Int> Min(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4232 {
4233 	RR_DEBUG_INFO_UPDATE_LOC();
4234 	RValue<SIMD::Int> less = CmpLT(x, y);
4235 	return (x & less) | (y & ~less);
4236 }
4237 
RoundInt(RValue<SIMD::Float> cast)4238 RValue<SIMD::Int> RoundInt(RValue<SIMD::Float> cast)
4239 {
4240 	RR_DEBUG_INFO_UPDATE_LOC();
4241 	return As<SIMD::Int>(V(lowerRoundInt(V(cast.value()), T(SIMD::Int::type()))));
4242 }
4243 
RoundIntClamped(RValue<SIMD::Float> cast)4244 RValue<SIMD::Int> RoundIntClamped(RValue<SIMD::Float> cast)
4245 {
4246 	RR_DEBUG_INFO_UPDATE_LOC();
4247 
4248 // TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
4249 #if defined(__arm__) || defined(__aarch64__)
4250 	// ARM saturates to the largest positive or negative integer. Unit tests
4251 	// verify that lowerRoundInt() behaves as desired.
4252 	return As<SIMD::Int>(V(lowerRoundInt(V(cast.value()), T(SIMD::Int::type()))));
4253 #elif LLVM_VERSION_MAJOR >= 14
4254 	llvm::Value *rounded = lowerRound(V(cast.value()));
4255 	llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
4256 	    jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(SIMD::Int::type()), T(SIMD::Float::type()) });
4257 	return RValue<SIMD::Int>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
4258 #else
4259 	RValue<SIMD::Float> clamped = Max(Min(cast, SIMD::Float(0x7FFFFF80)), SIMD::Float(static_cast<int>(0x80000000)));
4260 	return As<SIMD::Int>(V(lowerRoundInt(V(clamped.value()), T(SIMD::Int::type()))));
4261 #endif
4262 }
4263 
Extract128(RValue<SIMD::Int> val,int i)4264 RValue<Int4> Extract128(RValue<SIMD::Int> val, int i)
4265 {
4266 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4267 
4268 	return As<Int4>(V(jit->builder->CreateExtractElement(v128, i)));
4269 }
4270 
Insert128(RValue<SIMD::Int> val,RValue<Int4> element,int i)4271 RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<Int4> element, int i)
4272 {
4273 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4274 	llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4275 
4276 	return As<SIMD::Int>(V(jit->builder->CreateInsertElement(v128, a, i)));
4277 }
4278 
type()4279 Type *SIMD::Int::type()
4280 {
4281 	return T(llvm::VectorType::get(T(scalar::Int::type()), SIMD::Width, false));
4282 }
4283 
UInt(RValue<SIMD::Float> cast)4284 SIMD::UInt::UInt(RValue<SIMD::Float> cast)
4285     : XYZW(this)
4286 {
4287 	RR_DEBUG_INFO_UPDATE_LOC();
4288 	Value *xyzw = Nucleus::createFPToUI(cast.value(), SIMD::UInt::type());
4289 	storeValue(xyzw);
4290 }
4291 
UInt(RValue<scalar::UInt> rhs)4292 SIMD::UInt::UInt(RValue<scalar::UInt> rhs)
4293     : XYZW(this)
4294 {
4295 	RR_DEBUG_INFO_UPDATE_LOC();
4296 	Value *vector = loadValue();
4297 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
4298 
4299 	std::vector<int> swizzle = { 0 };
4300 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4301 
4302 	storeValue(replicate);
4303 }
4304 
operator <<(RValue<SIMD::UInt> lhs,unsigned char rhs)4305 RValue<SIMD::UInt> operator<<(RValue<SIMD::UInt> lhs, unsigned char rhs)
4306 {
4307 	RR_DEBUG_INFO_UPDATE_LOC();
4308 	return As<SIMD::UInt>(V(lowerVectorShl(V(lhs.value()), rhs)));
4309 }
4310 
operator >>(RValue<SIMD::UInt> lhs,unsigned char rhs)4311 RValue<SIMD::UInt> operator>>(RValue<SIMD::UInt> lhs, unsigned char rhs)
4312 {
4313 	RR_DEBUG_INFO_UPDATE_LOC();
4314 	return As<SIMD::UInt>(V(lowerVectorLShr(V(lhs.value()), rhs)));
4315 }
4316 
CmpEQ(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4317 RValue<SIMD::UInt> CmpEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4318 {
4319 	RR_DEBUG_INFO_UPDATE_LOC();
4320 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), SIMD::Int::type()));
4321 }
4322 
CmpLT(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4323 RValue<SIMD::UInt> CmpLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4324 {
4325 	RR_DEBUG_INFO_UPDATE_LOC();
4326 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), SIMD::Int::type()));
4327 }
4328 
CmpLE(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4329 RValue<SIMD::UInt> CmpLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4330 {
4331 	RR_DEBUG_INFO_UPDATE_LOC();
4332 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), SIMD::Int::type()));
4333 }
4334 
CmpNEQ(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4335 RValue<SIMD::UInt> CmpNEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4336 {
4337 	RR_DEBUG_INFO_UPDATE_LOC();
4338 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), SIMD::Int::type()));
4339 }
4340 
CmpNLT(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4341 RValue<SIMD::UInt> CmpNLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4342 {
4343 	RR_DEBUG_INFO_UPDATE_LOC();
4344 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), SIMD::Int::type()));
4345 }
4346 
CmpNLE(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4347 RValue<SIMD::UInt> CmpNLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4348 {
4349 	RR_DEBUG_INFO_UPDATE_LOC();
4350 	return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), SIMD::Int::type()));
4351 }
4352 
Max(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4353 RValue<SIMD::UInt> Max(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4354 {
4355 	RR_DEBUG_INFO_UPDATE_LOC();
4356 	RValue<SIMD::UInt> greater = CmpNLE(x, y);
4357 	return (x & greater) | (y & ~greater);
4358 }
4359 
Min(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4360 RValue<SIMD::UInt> Min(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4361 {
4362 	RR_DEBUG_INFO_UPDATE_LOC();
4363 	RValue<SIMD::UInt> less = CmpLT(x, y);
4364 	return (x & less) | (y & ~less);
4365 }
4366 
Extract128(RValue<SIMD::UInt> val,int i)4367 RValue<UInt4> Extract128(RValue<SIMD::UInt> val, int i)
4368 {
4369 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4370 
4371 	return As<UInt4>(V(jit->builder->CreateExtractElement(v128, i)));
4372 }
4373 
Insert128(RValue<SIMD::UInt> val,RValue<UInt4> element,int i)4374 RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<UInt4> element, int i)
4375 {
4376 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4377 	llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4378 
4379 	return As<SIMD::UInt>(V(jit->builder->CreateInsertElement(v128, a, i)));
4380 }
4381 
type()4382 Type *SIMD::UInt::type()
4383 {
4384 	return T(llvm::VectorType::get(T(scalar::UInt::type()), SIMD::Width, false));
4385 }
4386 
Float(RValue<scalar::Float> rhs)4387 SIMD::Float::Float(RValue<scalar::Float> rhs)
4388     : XYZW(this)
4389 {
4390 	RR_DEBUG_INFO_UPDATE_LOC();
4391 	Value *vector = loadValue();
4392 	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
4393 
4394 	std::vector<int> swizzle = { 0 };
4395 	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4396 
4397 	storeValue(replicate);
4398 }
4399 
operator %(RValue<SIMD::Float> lhs,RValue<SIMD::Float> rhs)4400 RValue<SIMD::Float> operator%(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs)
4401 {
4402 	return RValue<SIMD::Float>(Nucleus::createFRem(lhs.value(), rhs.value()));
4403 }
4404 
MulAdd(RValue<SIMD::Float> x,RValue<SIMD::Float> y,RValue<SIMD::Float> z)4405 RValue<SIMD::Float> MulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
4406 {
4407 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(SIMD::Float::type()) });
4408 	return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
4409 }
4410 
FMA(RValue<SIMD::Float> x,RValue<SIMD::Float> y,RValue<SIMD::Float> z)4411 RValue<SIMD::Float> FMA(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
4412 {
4413 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(SIMD::Float::type()) });
4414 	return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
4415 }
4416 
Abs(RValue<SIMD::Float> x)4417 RValue<SIMD::Float> Abs(RValue<SIMD::Float> x)
4418 {
4419 	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
4420 	return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, V(x.value()))));
4421 }
4422 
Max(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4423 RValue<SIMD::Float> Max(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4424 {
4425 	RR_DEBUG_INFO_UPDATE_LOC();
4426 	return As<SIMD::Float>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
4427 }
4428 
Min(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4429 RValue<SIMD::Float> Min(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4430 {
4431 	RR_DEBUG_INFO_UPDATE_LOC();
4432 	return As<SIMD::Float>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
4433 }
4434 
Sqrt(RValue<SIMD::Float> x)4435 RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x)
4436 {
4437 	RR_DEBUG_INFO_UPDATE_LOC();
4438 	return As<SIMD::Float>(V(lowerSQRT(V(x.value()))));
4439 }
4440 
CmpEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4441 RValue<SIMD::Int> CmpEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4442 {
4443 	RR_DEBUG_INFO_UPDATE_LOC();
4444 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), SIMD::Int::type()));
4445 }
4446 
CmpLT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4447 RValue<SIMD::Int> CmpLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4448 {
4449 	RR_DEBUG_INFO_UPDATE_LOC();
4450 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), SIMD::Int::type()));
4451 }
4452 
CmpLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4453 RValue<SIMD::Int> CmpLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4454 {
4455 	RR_DEBUG_INFO_UPDATE_LOC();
4456 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), SIMD::Int::type()));
4457 }
4458 
CmpNEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4459 RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4460 {
4461 	RR_DEBUG_INFO_UPDATE_LOC();
4462 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), SIMD::Int::type()));
4463 }
4464 
CmpNLT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4465 RValue<SIMD::Int> CmpNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4466 {
4467 	RR_DEBUG_INFO_UPDATE_LOC();
4468 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), SIMD::Int::type()));
4469 }
4470 
CmpNLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4471 RValue<SIMD::Int> CmpNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4472 {
4473 	RR_DEBUG_INFO_UPDATE_LOC();
4474 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), SIMD::Int::type()));
4475 }
4476 
CmpUEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4477 RValue<SIMD::Int> CmpUEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4478 {
4479 	RR_DEBUG_INFO_UPDATE_LOC();
4480 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), SIMD::Int::type()));
4481 }
4482 
CmpULT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4483 RValue<SIMD::Int> CmpULT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4484 {
4485 	RR_DEBUG_INFO_UPDATE_LOC();
4486 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), SIMD::Int::type()));
4487 }
4488 
CmpULE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4489 RValue<SIMD::Int> CmpULE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4490 {
4491 	RR_DEBUG_INFO_UPDATE_LOC();
4492 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), SIMD::Int::type()));
4493 }
4494 
CmpUNEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4495 RValue<SIMD::Int> CmpUNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4496 {
4497 	RR_DEBUG_INFO_UPDATE_LOC();
4498 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), SIMD::Int::type()));
4499 }
4500 
CmpUNLT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4501 RValue<SIMD::Int> CmpUNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4502 {
4503 	RR_DEBUG_INFO_UPDATE_LOC();
4504 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), SIMD::Int::type()));
4505 }
4506 
CmpUNLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4507 RValue<SIMD::Int> CmpUNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4508 {
4509 	RR_DEBUG_INFO_UPDATE_LOC();
4510 	return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), SIMD::Int::type()));
4511 }
4512 
Round(RValue<SIMD::Float> x)4513 RValue<SIMD::Float> Round(RValue<SIMD::Float> x)
4514 {
4515 	RR_DEBUG_INFO_UPDATE_LOC();
4516 	return RValue<SIMD::Float>(V(lowerRound(V(x.value()))));
4517 }
4518 
Trunc(RValue<SIMD::Float> x)4519 RValue<SIMD::Float> Trunc(RValue<SIMD::Float> x)
4520 {
4521 	RR_DEBUG_INFO_UPDATE_LOC();
4522 	return RValue<SIMD::Float>(V(lowerTrunc(V(x.value()))));
4523 }
4524 
Frac(RValue<SIMD::Float> x)4525 RValue<SIMD::Float> Frac(RValue<SIMD::Float> x)
4526 {
4527 	RR_DEBUG_INFO_UPDATE_LOC();
4528 	SIMD::Float frc = x - Floor(x);
4529 
4530 	// x - floor(x) can be 1.0 for very small negative x.
4531 	// Clamp against the value just below 1.0.
4532 	return Min(frc, As<SIMD::Float>(SIMD::Int(0x3F7FFFFF)));
4533 }
4534 
Floor(RValue<SIMD::Float> x)4535 RValue<SIMD::Float> Floor(RValue<SIMD::Float> x)
4536 {
4537 	RR_DEBUG_INFO_UPDATE_LOC();
4538 	return RValue<SIMD::Float>(V(lowerFloor(V(x.value()))));
4539 }
4540 
Ceil(RValue<SIMD::Float> x)4541 RValue<SIMD::Float> Ceil(RValue<SIMD::Float> x)
4542 {
4543 	RR_DEBUG_INFO_UPDATE_LOC();
4544 	return -Floor(-x);
4545 }
4546 
Extract128(RValue<SIMD::Float> val,int i)4547 RValue<Float4> Extract128(RValue<SIMD::Float> val, int i)
4548 {
4549 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4550 
4551 	return As<Float4>(V(jit->builder->CreateExtractElement(v128, i)));
4552 }
4553 
Insert128(RValue<SIMD::Float> val,RValue<Float4> element,int i)4554 RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<Float4> element, int i)
4555 {
4556 	llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4557 	llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4558 
4559 	return As<SIMD::Float>(V(jit->builder->CreateInsertElement(v128, a, i)));
4560 }
4561 
type()4562 Type *SIMD::Float::type()
4563 {
4564 	return T(llvm::VectorType::get(T(scalar::Float::type()), SIMD::Width, false));
4565 }
4566 
4567 }  // namespace rr
4568