1 // Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "LLVMReactor.hpp"
16
17 #include "CPUID.hpp"
18 #include "Debug.hpp"
19 #include "LLVMReactorDebugInfo.hpp"
20 #include "PragmaInternals.hpp"
21 #include "Print.hpp"
22 #include "Reactor.hpp"
23 #include "SIMD.hpp"
24 #include "x86.hpp"
25
26 #include "llvm/IR/Intrinsics.h"
27 #include "llvm/IR/IntrinsicsX86.h"
28 #include "llvm/Support/Alignment.h"
29 #include "llvm/Support/Error.h"
30 #include "llvm/Support/ManagedStatic.h"
31
32 #include <fstream>
33 #include <iostream>
34 #include <mutex>
35 #include <numeric>
36 #include <thread>
37 #include <unordered_map>
38
39 #if defined(__i386__) || defined(__x86_64__)
40 # include <xmmintrin.h>
41 #endif
42
43 #include <math.h>
44
45 #if defined(__x86_64__) && defined(_WIN32)
// Stub required when linking the LLVM X86 backend on 64-bit Windows. Reactor
// does not use lazy JIT compilation callbacks, so this must never be invoked.
extern "C" void X86CompilationCallback()
{
	UNIMPLEMENTED_NO_BUG("X86CompilationCallback");
}
50 #endif
51
52 #if !LLVM_ENABLE_THREADS
53 # error "LLVM_ENABLE_THREADS needs to be enabled"
54 #endif
55
56 #if LLVM_VERSION_MAJOR < 11
57 namespace llvm {
58 using FixedVectorType = VectorType;
59 } // namespace llvm
60 #endif
61
62 namespace {
63
64 // Used to automatically invoke llvm_shutdown() when driver is unloaded
65 llvm::llvm_shutdown_obj llvmShutdownObj;
66
67 // This has to be a raw pointer because glibc 2.17 doesn't support __cxa_thread_atexit_impl
68 // for destructing objects at exit. See crbug.com/1074222
69 thread_local rr::JITBuilder *jit = nullptr;
70
// Returns the element count of a fixed vector type in whichever form the
// current LLVM version's APIs expect: LLVM 11+ uses llvm::ElementCount
// (e.g. for ConstantVector::getSplat()), older versions take a plain unsigned.
auto getNumElements(llvm::FixedVectorType *vec)
{
#if LLVM_VERSION_MAJOR >= 11
	return vec->getElementCount();
#else
	return vec->getNumElements();
#endif
}
79
lowerPAVG(llvm::Value * x,llvm::Value * y)80 llvm::Value *lowerPAVG(llvm::Value *x, llvm::Value *y)
81 {
82 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
83
84 llvm::VectorType *extTy =
85 llvm::VectorType::getExtendedElementVectorType(ty);
86 x = jit->builder->CreateZExt(x, extTy);
87 y = jit->builder->CreateZExt(y, extTy);
88
89 // (x + y + 1) >> 1
90 llvm::Constant *one = llvm::ConstantInt::get(extTy, 1);
91 llvm::Value *res = jit->builder->CreateAdd(x, y);
92 res = jit->builder->CreateAdd(res, one);
93 res = jit->builder->CreateLShr(res, one);
94 return jit->builder->CreateTrunc(res, ty);
95 }
96
lowerPMINMAX(llvm::Value * x,llvm::Value * y,llvm::ICmpInst::Predicate pred)97 llvm::Value *lowerPMINMAX(llvm::Value *x, llvm::Value *y,
98 llvm::ICmpInst::Predicate pred)
99 {
100 return jit->builder->CreateSelect(jit->builder->CreateICmp(pred, x, y), x, y);
101 }
102
lowerPCMP(llvm::ICmpInst::Predicate pred,llvm::Value * x,llvm::Value * y,llvm::Type * dstTy)103 llvm::Value *lowerPCMP(llvm::ICmpInst::Predicate pred, llvm::Value *x,
104 llvm::Value *y, llvm::Type *dstTy)
105 {
106 return jit->builder->CreateSExt(jit->builder->CreateICmp(pred, x, y), dstTy, "");
107 }
108
lowerPFMINMAX(llvm::Value * x,llvm::Value * y,llvm::FCmpInst::Predicate pred)109 [[maybe_unused]] llvm::Value *lowerPFMINMAX(llvm::Value *x, llvm::Value *y,
110 llvm::FCmpInst::Predicate pred)
111 {
112 return jit->builder->CreateSelect(jit->builder->CreateFCmp(pred, x, y), x, y);
113 }
114
lowerRound(llvm::Value * x)115 [[maybe_unused]] llvm::Value *lowerRound(llvm::Value *x)
116 {
117 llvm::Function *nearbyint = llvm::Intrinsic::getDeclaration(
118 jit->module.get(), llvm::Intrinsic::nearbyint, { x->getType() });
119 return jit->builder->CreateCall(nearbyint, { x });
120 }
121
lowerRoundInt(llvm::Value * x,llvm::Type * ty)122 [[maybe_unused]] llvm::Value *lowerRoundInt(llvm::Value *x, llvm::Type *ty)
123 {
124 return jit->builder->CreateFPToSI(lowerRound(x), ty);
125 }
126
lowerFloor(llvm::Value * x)127 [[maybe_unused]] llvm::Value *lowerFloor(llvm::Value *x)
128 {
129 llvm::Function *floor = llvm::Intrinsic::getDeclaration(
130 jit->module.get(), llvm::Intrinsic::floor, { x->getType() });
131 return jit->builder->CreateCall(floor, { x });
132 }
133
lowerTrunc(llvm::Value * x)134 [[maybe_unused]] llvm::Value *lowerTrunc(llvm::Value *x)
135 {
136 llvm::Function *trunc = llvm::Intrinsic::getDeclaration(
137 jit->module.get(), llvm::Intrinsic::trunc, { x->getType() });
138 return jit->builder->CreateCall(trunc, { x });
139 }
140
lowerSQRT(llvm::Value * x)141 [[maybe_unused]] llvm::Value *lowerSQRT(llvm::Value *x)
142 {
143 llvm::Function *sqrt = llvm::Intrinsic::getDeclaration(
144 jit->module.get(), llvm::Intrinsic::sqrt, { x->getType() });
145 return jit->builder->CreateCall(sqrt, { x });
146 }
147
lowerRCP(llvm::Value * x)148 [[maybe_unused]] llvm::Value *lowerRCP(llvm::Value *x)
149 {
150 llvm::Type *ty = x->getType();
151 llvm::Constant *one;
152 if(llvm::FixedVectorType *vectorTy = llvm::dyn_cast<llvm::FixedVectorType>(ty))
153 {
154 one = llvm::ConstantVector::getSplat(getNumElements(vectorTy),
155 llvm::ConstantFP::get(vectorTy->getElementType(), 1));
156 }
157 else
158 {
159 one = llvm::ConstantFP::get(ty, 1);
160 }
161 return jit->builder->CreateFDiv(one, x);
162 }
163
lowerRSQRT(llvm::Value * x)164 [[maybe_unused]] llvm::Value *lowerRSQRT(llvm::Value *x)
165 {
166 return lowerRCP(lowerSQRT(x));
167 }
168
lowerVectorShl(llvm::Value * x,uint64_t scalarY)169 [[maybe_unused]] llvm::Value *lowerVectorShl(llvm::Value *x, uint64_t scalarY)
170 {
171 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
172 llvm::Value *y = llvm::ConstantVector::getSplat(getNumElements(ty),
173 llvm::ConstantInt::get(ty->getElementType(), scalarY));
174 return jit->builder->CreateShl(x, y);
175 }
176
lowerVectorAShr(llvm::Value * x,uint64_t scalarY)177 [[maybe_unused]] llvm::Value *lowerVectorAShr(llvm::Value *x, uint64_t scalarY)
178 {
179 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
180 llvm::Value *y = llvm::ConstantVector::getSplat(getNumElements(ty),
181 llvm::ConstantInt::get(ty->getElementType(), scalarY));
182 return jit->builder->CreateAShr(x, y);
183 }
184
lowerVectorLShr(llvm::Value * x,uint64_t scalarY)185 [[maybe_unused]] llvm::Value *lowerVectorLShr(llvm::Value *x, uint64_t scalarY)
186 {
187 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
188 llvm::Value *y = llvm::ConstantVector::getSplat(getNumElements(ty),
189 llvm::ConstantInt::get(ty->getElementType(), scalarY));
190 return jit->builder->CreateLShr(x, y);
191 }
192
lowerShuffleVector(llvm::Value * v1,llvm::Value * v2,llvm::ArrayRef<int> select)193 llvm::Value *lowerShuffleVector(llvm::Value *v1, llvm::Value *v2, llvm::ArrayRef<int> select)
194 {
195 int size = select.size();
196 const int maxSize = 16;
197 llvm::Constant *swizzle[maxSize];
198 ASSERT(size <= maxSize);
199
200 for(int i = 0; i < size; i++)
201 {
202 swizzle[i] = llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), select[i]);
203 }
204
205 llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(swizzle, size));
206
207 return jit->builder->CreateShuffleVector(v1, v2, shuffle);
208 }
209
lowerMulAdd(llvm::Value * x,llvm::Value * y)210 [[maybe_unused]] llvm::Value *lowerMulAdd(llvm::Value *x, llvm::Value *y)
211 {
212 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
213 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
214
215 llvm::Value *extX = jit->builder->CreateSExt(x, extTy);
216 llvm::Value *extY = jit->builder->CreateSExt(y, extTy);
217 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
218
219 llvm::Value *undef = llvm::UndefValue::get(extTy);
220
221 llvm::SmallVector<int, 16> evenIdx;
222 llvm::SmallVector<int, 16> oddIdx;
223 for(uint64_t i = 0, n = ty->getNumElements(); i < n; i += 2)
224 {
225 evenIdx.push_back(i);
226 oddIdx.push_back(i + 1);
227 }
228
229 llvm::Value *lhs = lowerShuffleVector(mult, undef, evenIdx);
230 llvm::Value *rhs = lowerShuffleVector(mult, undef, oddIdx);
231 return jit->builder->CreateAdd(lhs, rhs);
232 }
233
lowerPack(llvm::Value * x,llvm::Value * y,bool isSigned)234 [[maybe_unused]] llvm::Value *lowerPack(llvm::Value *x, llvm::Value *y, bool isSigned)
235 {
236 llvm::FixedVectorType *srcTy = llvm::cast<llvm::FixedVectorType>(x->getType());
237 llvm::VectorType *dstTy = llvm::VectorType::getTruncatedElementVectorType(srcTy);
238
239 llvm::IntegerType *dstElemTy =
240 llvm::cast<llvm::IntegerType>(dstTy->getElementType());
241
242 uint64_t truncNumBits = dstElemTy->getIntegerBitWidth();
243 ASSERT_MSG(truncNumBits < 64, "shift 64 must be handled separately. truncNumBits: %d", int(truncNumBits));
244 llvm::Constant *max, *min;
245 if(isSigned)
246 {
247 max = llvm::ConstantInt::get(srcTy, (1LL << (truncNumBits - 1)) - 1, true);
248 min = llvm::ConstantInt::get(srcTy, (-1LL << (truncNumBits - 1)), true);
249 }
250 else
251 {
252 max = llvm::ConstantInt::get(srcTy, (1ULL << truncNumBits) - 1, false);
253 min = llvm::ConstantInt::get(srcTy, 0, false);
254 }
255
256 x = lowerPMINMAX(x, min, llvm::ICmpInst::ICMP_SGT);
257 x = lowerPMINMAX(x, max, llvm::ICmpInst::ICMP_SLT);
258 y = lowerPMINMAX(y, min, llvm::ICmpInst::ICMP_SGT);
259 y = lowerPMINMAX(y, max, llvm::ICmpInst::ICMP_SLT);
260
261 x = jit->builder->CreateTrunc(x, dstTy);
262 y = jit->builder->CreateTrunc(y, dstTy);
263
264 llvm::SmallVector<int, 16> index(srcTy->getNumElements() * 2);
265 std::iota(index.begin(), index.end(), 0);
266
267 return lowerShuffleVector(x, y, index);
268 }
269
lowerSignMask(llvm::Value * x,llvm::Type * retTy)270 [[maybe_unused]] llvm::Value *lowerSignMask(llvm::Value *x, llvm::Type *retTy)
271 {
272 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
273 llvm::Constant *zero = llvm::ConstantInt::get(ty, 0);
274 llvm::Value *cmp = jit->builder->CreateICmpSLT(x, zero);
275
276 llvm::Value *ret = jit->builder->CreateZExt(
277 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
278 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
279 {
280 llvm::Value *elem = jit->builder->CreateZExt(
281 jit->builder->CreateExtractElement(cmp, i), retTy);
282 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
283 }
284 return ret;
285 }
286
lowerFPSignMask(llvm::Value * x,llvm::Type * retTy)287 [[maybe_unused]] llvm::Value *lowerFPSignMask(llvm::Value *x, llvm::Type *retTy)
288 {
289 llvm::FixedVectorType *ty = llvm::cast<llvm::FixedVectorType>(x->getType());
290 llvm::Constant *zero = llvm::ConstantFP::get(ty, 0);
291 llvm::Value *cmp = jit->builder->CreateFCmpULT(x, zero);
292
293 llvm::Value *ret = jit->builder->CreateZExt(
294 jit->builder->CreateExtractElement(cmp, static_cast<uint64_t>(0)), retTy);
295 for(uint64_t i = 1, n = ty->getNumElements(); i < n; ++i)
296 {
297 llvm::Value *elem = jit->builder->CreateZExt(
298 jit->builder->CreateExtractElement(cmp, i), retTy);
299 ret = jit->builder->CreateOr(ret, jit->builder->CreateShl(elem, i));
300 }
301 return ret;
302 }
303
lowerPUADDSAT(llvm::Value * x,llvm::Value * y)304 llvm::Value *lowerPUADDSAT(llvm::Value *x, llvm::Value *y)
305 {
306 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::uadd_sat, x, y);
307 }
308
lowerPSADDSAT(llvm::Value * x,llvm::Value * y)309 llvm::Value *lowerPSADDSAT(llvm::Value *x, llvm::Value *y)
310 {
311 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::sadd_sat, x, y);
312 }
313
lowerPUSUBSAT(llvm::Value * x,llvm::Value * y)314 llvm::Value *lowerPUSUBSAT(llvm::Value *x, llvm::Value *y)
315 {
316 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::usub_sat, x, y);
317 }
318
lowerPSSUBSAT(llvm::Value * x,llvm::Value * y)319 llvm::Value *lowerPSSUBSAT(llvm::Value *x, llvm::Value *y)
320 {
321 return jit->builder->CreateBinaryIntrinsic(llvm::Intrinsic::ssub_sat, x, y);
322 }
323
lowerMulHigh(llvm::Value * x,llvm::Value * y,bool sext)324 llvm::Value *lowerMulHigh(llvm::Value *x, llvm::Value *y, bool sext)
325 {
326 llvm::VectorType *ty = llvm::cast<llvm::VectorType>(x->getType());
327 llvm::VectorType *extTy = llvm::VectorType::getExtendedElementVectorType(ty);
328
329 llvm::Value *extX, *extY;
330 if(sext)
331 {
332 extX = jit->builder->CreateSExt(x, extTy);
333 extY = jit->builder->CreateSExt(y, extTy);
334 }
335 else
336 {
337 extX = jit->builder->CreateZExt(x, extTy);
338 extY = jit->builder->CreateZExt(y, extTy);
339 }
340
341 llvm::Value *mult = jit->builder->CreateMul(extX, extY);
342
343 llvm::IntegerType *intTy = llvm::cast<llvm::IntegerType>(ty->getElementType());
344 llvm::Value *mulh = jit->builder->CreateAShr(mult, intTy->getBitWidth());
345 return jit->builder->CreateTrunc(mulh, ty);
346 }
347
348 // TODO(crbug.com/swiftshader/185): A temporary workaround for failing chromium tests.
clampForShift(llvm::Value * rhs)349 llvm::Value *clampForShift(llvm::Value *rhs)
350 {
351 llvm::Value *max;
352 if(auto *vec = llvm::dyn_cast<llvm::FixedVectorType>(rhs->getType()))
353 {
354 auto N = vec->getElementType()->getIntegerBitWidth() - 1;
355 max = llvm::ConstantVector::getSplat(getNumElements(vec), llvm::ConstantInt::get(vec->getElementType(), N));
356 }
357 else
358 {
359 auto N = rhs->getType()->getIntegerBitWidth() - 1;
360 max = llvm::ConstantInt::get(rhs->getType(), N);
361 }
362 return jit->builder->CreateSelect(jit->builder->CreateICmpULE(rhs, max), rhs, max);
363 }
364
365 } // namespace
366
367 namespace rr {
368
369 const int SIMD::Width = 4;
370
backendName()371 std::string Caps::backendName()
372 {
373 return std::string("LLVM ") + LLVM_VERSION_STRING;
374 }
375
// The LLVM backend supports Reactor coroutines.
bool Caps::coroutinesSupported()
{
	return true;
}
380
fmaIsFast()381 bool Caps::fmaIsFast()
382 {
383 static bool AVX2 = CPUID::supportsAVX2(); // Also checks for FMA support
384
385 // If x86 FMA instructions are supported, assume LLVM will emit them instead of making calls to std::fma().
386 return AVX2;
387 }
388
389 // The abstract Type* types are implemented as LLVM types, except that
390 // 64-bit vectors are emulated using 128-bit ones to avoid use of MMX in x86
391 // and VFP in ARM, and eliminate the overhead of converting them to explicit
392 // 128-bit ones. LLVM types are pointers, so we can represent emulated types
393 // as abstract pointers with small enum values.
enum InternalType : uintptr_t
{
	// Emulated types: short vectors backed by 128-bit storage.
	Type_v2i32,
	Type_v4i16,
	Type_v2i16,
	Type_v8i8,
	Type_v4i8,
	Type_v2f32,
	// Sentinel: any pointer value below this is an emulated-type tag.
	EmulatedTypeCount,
	// Returned by asInternalType() to indicate that the abstract Type*
	// should be interpreted as LLVM type pointer:
	Type_LLVM
};
408
asInternalType(Type * type)409 inline InternalType asInternalType(Type *type)
410 {
411 InternalType t = static_cast<InternalType>(reinterpret_cast<uintptr_t>(type));
412 return (t < EmulatedTypeCount) ? t : Type_LLVM;
413 }
414
T(Type * t)415 llvm::Type *T(Type *t)
416 {
417 // Use 128-bit vectors to implement logically shorter ones.
418 switch(asInternalType(t))
419 {
420 case Type_v2i32: return T(Int4::type());
421 case Type_v4i16: return T(Short8::type());
422 case Type_v2i16: return T(Short8::type());
423 case Type_v8i8: return T(Byte16::type());
424 case Type_v4i8: return T(Byte16::type());
425 case Type_v2f32: return T(Float4::type());
426 case Type_LLVM: return reinterpret_cast<llvm::Type *>(t);
427 default:
428 UNREACHABLE("asInternalType(t): %d", int(asInternalType(t)));
429 return nullptr;
430 }
431 }
432
// Encodes an emulated-type tag as an abstract Type pointer.
Type *T(InternalType t)
{
	return reinterpret_cast<Type *>(t);
}
437
// Reinterprets a vector of abstract Type pointers as llvm::Type pointers.
// Safe only because Type* values are themselves reinterpreted llvm::Type*
// (or small enum tags that are never passed through this overload).
inline const std::vector<llvm::Type *> &T(const std::vector<Type *> &t)
{
	return reinterpret_cast<const std::vector<llvm::Type *> &>(t);
}
442
// Converts an abstract BasicBlock pointer to the underlying llvm::BasicBlock.
inline llvm::BasicBlock *B(BasicBlock *t)
{
	return reinterpret_cast<llvm::BasicBlock *>(t);
}
447
// Wraps an llvm::BasicBlock as an abstract BasicBlock pointer.
inline BasicBlock *B(llvm::BasicBlock *t)
{
	return reinterpret_cast<BasicBlock *>(t);
}
452
typeSize(Type * type)453 static size_t typeSize(Type *type)
454 {
455 switch(asInternalType(type))
456 {
457 case Type_v2i32: return 8;
458 case Type_v4i16: return 8;
459 case Type_v2i16: return 4;
460 case Type_v8i8: return 8;
461 case Type_v4i8: return 4;
462 case Type_v2f32: return 8;
463 case Type_LLVM:
464 {
465 llvm::Type *t = T(type);
466
467 if(t->isPointerTy())
468 {
469 return sizeof(void *);
470 }
471
472 // At this point we should only have LLVM 'primitive' types.
473 unsigned int bits = t->getPrimitiveSizeInBits();
474 ASSERT_MSG(bits != 0, "bits: %d", int(bits));
475
476 // TODO(capn): Booleans are 1 bit integers in LLVM's SSA type system,
477 // but are typically stored as one byte. The DataLayout structure should
478 // be used here and many other places if this assumption fails.
479 return (bits + 7) / 8;
480 }
481 break;
482 default:
483 UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
484 return 0;
485 }
486 }
487
createFunction(const char * name,llvm::Type * retTy,const std::vector<llvm::Type * > & params)488 static llvm::Function *createFunction(const char *name, llvm::Type *retTy, const std::vector<llvm::Type *> ¶ms)
489 {
490 llvm::FunctionType *functionType = llvm::FunctionType::get(retTy, params, false);
491 auto func = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, name, jit->module.get());
492
493 func->setLinkage(llvm::GlobalValue::ExternalLinkage);
494 func->setDoesNotThrow();
495 func->setCallingConv(llvm::CallingConv::C);
496
497 if(__has_feature(address_sanitizer))
498 {
499 func->addFnAttr(llvm::Attribute::SanitizeAddress);
500 }
501
502 func->addFnAttr("warn-stack-size", "524288"); // Warn when a function uses more than 512 KiB of stack memory
503
504 return func;
505 }
506
// Begins construction of a new routine: allocates the per-thread JIT state
// and the unmaterialized-variable tracker used during IR emission.
Nucleus::Nucleus()
{
#if !__has_feature(memory_sanitizer)
	// thread_local variables in shared libraries are initialized at load-time,
	// but this is not observed by MemorySanitizer if the loader itself was not
	// instrumented, leading to false-positive uninitialized variable errors.
	ASSERT(jit == nullptr);
	ASSERT(Variable::unmaterializedVariables == nullptr);
#endif

	jit = new JITBuilder();
	Variable::unmaterializedVariables = new Variable::UnmaterializedVariables();
}
520
// Tears down the per-thread JIT state. Raw pointers are used (rather than
// thread_local objects with destructors) because glibc 2.17 lacks
// __cxa_thread_atexit_impl; see the comment at the `jit` declaration.
Nucleus::~Nucleus()
{
	delete Variable::unmaterializedVariables;
	Variable::unmaterializedVariables = nullptr;

	delete jit;
	jit = nullptr;
}
529
// Finalizes the routine under construction: terminates the current block if
// needed, runs optimization passes, and JIT-compiles the module into a
// callable Routine.
std::shared_ptr<Routine> Nucleus::acquireRoutine(const char *name)
{
	// If the current block lacks a terminator, synthesize a return so the
	// function is valid IR (an undef value for non-void returns).
	if(jit->builder->GetInsertBlock()->empty() || !jit->builder->GetInsertBlock()->back().isTerminator())
	{
		llvm::Type *type = jit->function->getReturnType();

		if(type->isVoidTy())
		{
			createRetVoid();
		}
		else
		{
			createRet(V(llvm::UndefValue::get(type)));
		}
	}

	std::shared_ptr<Routine> routine;

	auto acquire = [&](rr::JITBuilder *jit) {
		// ::jit is thread-local, so when this is executed on a separate thread (see JIT_IN_SEPARATE_THREAD)
		// it needs to only use the jit variable passed in as an argument.

#ifdef ENABLE_RR_DEBUG_INFO
		if(jit->debugInfo != nullptr)
		{
			jit->debugInfo->Finalize();
		}
#endif  // ENABLE_RR_DEBUG_INFO

		// Debugging aid: flip to `if(true)` to dump the unoptimized IR to a file.
		if(false)
		{
			std::error_code error;
			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
			jit->module->print(file, 0);
		}

		jit->runPasses();

		// Debugging aid: flip to `if(true)` to dump the optimized IR to a file.
		if(false)
		{
			std::error_code error;
			llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
			jit->module->print(file, 0);
		}

		routine = jit->acquireRoutine(name, &jit->function, 1);
	};

#ifdef JIT_IN_SEPARATE_THREAD
	// Perform optimizations and codegen in a separate thread to avoid stack overflow.
	// FIXME(b/149829034): This is not a long-term solution. Reactor has no control
	// over the threading and stack sizes of its users, so this should be addressed
	// at a higher level instead.
	std::thread thread(acquire, jit);
	thread.join();
#else
	acquire(jit);
#endif

	return routine;
}
591
// Emits an alloca for a local variable (optionally an array of `arraySize`
// elements) in the function's entry block, zero-initializing it when the
// InitializeLocalVariables pragma is active.
Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
{
	// Need to allocate it in the entry block for mem2reg to work
	llvm::BasicBlock &entryBlock = jit->function->getEntryBlock();

	llvm::Instruction *declaration;

	// Preferred alignment for the type; the MaybeAlign wrapper is only
	// needed for the pre-LLVM-11 integer-returning API.
#if LLVM_VERSION_MAJOR >= 11
	auto align = jit->module->getDataLayout().getPrefTypeAlign(T(type));
#else
	auto align = llvm::MaybeAlign(jit->module->getDataLayout().getPrefTypeAlignment(T(type)));
#endif

	if(arraySize)
	{
		// Use a pointer-width element count to match the target's size_t.
		Value *size = (sizeof(size_t) == 8) ? Nucleus::createConstantLong(arraySize) : Nucleus::createConstantInt(arraySize);
		declaration = new llvm::AllocaInst(T(type), 0, V(size), align);
	}
	else
	{
		declaration = new llvm::AllocaInst(T(type), 0, (llvm::Value *)nullptr, align);
	}

	// Insert at the top of the entry block (API changed in LLVM 16).
#if LLVM_VERSION_MAJOR >= 16
	declaration->insertInto(&entryBlock, entryBlock.begin());
#else
	entryBlock.getInstList().push_front(declaration);
#endif

	if(getPragmaState(InitializeLocalVariables))
	{
		// Zero the allocation with llvm.memset (volatile = false).
		llvm::Type *i8PtrTy = llvm::Type::getInt8Ty(*jit->context)->getPointerTo();
		llvm::Type *i32Ty = llvm::Type::getInt32Ty(*jit->context);
		llvm::Function *memset = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::memset, { i8PtrTy, i32Ty });

		jit->builder->CreateCall(memset, { jit->builder->CreatePointerCast(declaration, i8PtrTy),
		                                   V(Nucleus::createConstantByte((unsigned char)0)),
		                                   V(Nucleus::createConstantInt((int)typeSize(type) * (arraySize ? arraySize : 1))),
		                                   V(Nucleus::createConstantBool(false)) });
	}

	return V(declaration);
}
635
createBasicBlock()636 BasicBlock *Nucleus::createBasicBlock()
637 {
638 return B(llvm::BasicBlock::Create(*jit->context, "", jit->function));
639 }
640
getInsertBlock()641 BasicBlock *Nucleus::getInsertBlock()
642 {
643 return B(jit->builder->GetInsertBlock());
644 }
645
setInsertBlock(BasicBlock * basicBlock)646 void Nucleus::setInsertBlock(BasicBlock *basicBlock)
647 {
648 // assert(jit->builder->GetInsertBlock()->back().isTerminator());
649
650 jit->builder->SetInsertPoint(B(basicBlock));
651 }
652
// Creates the (unnamed) function being built, sets up debug info when
// enabled, and positions the builder at the start of its first basic block.
void Nucleus::createFunction(Type *ReturnType, const std::vector<Type *> &Params)
{
	jit->function = rr::createFunction("", T(ReturnType), T(Params));

#ifdef ENABLE_RR_DEBUG_INFO
	jit->debugInfo = std::make_unique<DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
#endif  // ENABLE_RR_DEBUG_INFO

	jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->function));
}
663
getArgument(unsigned int index)664 Value *Nucleus::getArgument(unsigned int index)
665 {
666 llvm::Function::arg_iterator args = jit->function->arg_begin();
667
668 while(index)
669 {
670 args++;
671 index--;
672 }
673
674 return V(&*args);
675 }
676
// Emits a `ret void` terminator; only valid for void-returning functions.
void Nucleus::createRetVoid()
{
	RR_DEBUG_INFO_UPDATE_LOC();

	ASSERT_MSG(jit->function->getReturnType() == T(Void::type()), "Return type mismatch");

	// Code generated after this point is unreachable, so any variables
	// being read can safely return an undefined value. We have to avoid
	// materializing variables after the terminator ret instruction.
	Variable::killUnmaterialized();

	jit->builder->CreateRetVoid();
}
690
// Emits a `ret <v>` terminator; v's type must match the function return type.
void Nucleus::createRet(Value *v)
{
	RR_DEBUG_INFO_UPDATE_LOC();

	ASSERT_MSG(jit->function->getReturnType() == V(v)->getType(), "Return type mismatch");

	// Code generated after this point is unreachable, so any variables
	// being read can safely return an undefined value. We have to avoid
	// materializing variables after the terminator ret instruction.
	Variable::killUnmaterialized();

	jit->builder->CreateRet(V(v));
}
704
createBr(BasicBlock * dest)705 void Nucleus::createBr(BasicBlock *dest)
706 {
707 RR_DEBUG_INFO_UPDATE_LOC();
708 Variable::materializeAll();
709
710 jit->builder->CreateBr(B(dest));
711 }
712
createCondBr(Value * cond,BasicBlock * ifTrue,BasicBlock * ifFalse)713 void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
714 {
715 RR_DEBUG_INFO_UPDATE_LOC();
716 Variable::materializeAll();
717 jit->builder->CreateCondBr(V(cond), B(ifTrue), B(ifFalse));
718 }
719
createAdd(Value * lhs,Value * rhs)720 Value *Nucleus::createAdd(Value *lhs, Value *rhs)
721 {
722 RR_DEBUG_INFO_UPDATE_LOC();
723 return V(jit->builder->CreateAdd(V(lhs), V(rhs)));
724 }
725
createSub(Value * lhs,Value * rhs)726 Value *Nucleus::createSub(Value *lhs, Value *rhs)
727 {
728 RR_DEBUG_INFO_UPDATE_LOC();
729 return V(jit->builder->CreateSub(V(lhs), V(rhs)));
730 }
731
createMul(Value * lhs,Value * rhs)732 Value *Nucleus::createMul(Value *lhs, Value *rhs)
733 {
734 RR_DEBUG_INFO_UPDATE_LOC();
735 return V(jit->builder->CreateMul(V(lhs), V(rhs)));
736 }
737
createUDiv(Value * lhs,Value * rhs)738 Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
739 {
740 RR_DEBUG_INFO_UPDATE_LOC();
741 return V(jit->builder->CreateUDiv(V(lhs), V(rhs)));
742 }
743
createSDiv(Value * lhs,Value * rhs)744 Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
745 {
746 RR_DEBUG_INFO_UPDATE_LOC();
747 return V(jit->builder->CreateSDiv(V(lhs), V(rhs)));
748 }
749
createFAdd(Value * lhs,Value * rhs)750 Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
751 {
752 RR_DEBUG_INFO_UPDATE_LOC();
753 return V(jit->builder->CreateFAdd(V(lhs), V(rhs)));
754 }
755
createFSub(Value * lhs,Value * rhs)756 Value *Nucleus::createFSub(Value *lhs, Value *rhs)
757 {
758 RR_DEBUG_INFO_UPDATE_LOC();
759 return V(jit->builder->CreateFSub(V(lhs), V(rhs)));
760 }
761
createFMul(Value * lhs,Value * rhs)762 Value *Nucleus::createFMul(Value *lhs, Value *rhs)
763 {
764 RR_DEBUG_INFO_UPDATE_LOC();
765 return V(jit->builder->CreateFMul(V(lhs), V(rhs)));
766 }
767
createFDiv(Value * lhs,Value * rhs)768 Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
769 {
770 RR_DEBUG_INFO_UPDATE_LOC();
771 return V(jit->builder->CreateFDiv(V(lhs), V(rhs)));
772 }
773
createURem(Value * lhs,Value * rhs)774 Value *Nucleus::createURem(Value *lhs, Value *rhs)
775 {
776 RR_DEBUG_INFO_UPDATE_LOC();
777 return V(jit->builder->CreateURem(V(lhs), V(rhs)));
778 }
779
createSRem(Value * lhs,Value * rhs)780 Value *Nucleus::createSRem(Value *lhs, Value *rhs)
781 {
782 RR_DEBUG_INFO_UPDATE_LOC();
783 return V(jit->builder->CreateSRem(V(lhs), V(rhs)));
784 }
785
createFRem(Value * lhs,Value * rhs)786 Value *Nucleus::createFRem(Value *lhs, Value *rhs)
787 {
788 RR_DEBUG_INFO_UPDATE_LOC();
789 return V(jit->builder->CreateFRem(V(lhs), V(rhs)));
790 }
791
operator %(RValue<Float4> lhs,RValue<Float4> rhs)792 RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
793 {
794 return RValue<Float4>(Nucleus::createFRem(lhs.value(), rhs.value()));
795 }
796
createShl(Value * lhs,Value * rhs)797 Value *Nucleus::createShl(Value *lhs, Value *rhs)
798 {
799 RR_DEBUG_INFO_UPDATE_LOC();
800 auto *clamped_rhs = clampForShift(V(rhs));
801 return V(jit->builder->CreateShl(V(lhs), clamped_rhs));
802 }
803
createLShr(Value * lhs,Value * rhs)804 Value *Nucleus::createLShr(Value *lhs, Value *rhs)
805 {
806 RR_DEBUG_INFO_UPDATE_LOC();
807 auto *clamped_rhs = clampForShift(V(rhs));
808 return V(jit->builder->CreateLShr(V(lhs), clamped_rhs));
809 }
810
// Arithmetic right shift.
// NOTE(review): unlike createShl/createLShr, the shift amount is NOT run
// through clampForShift() here — confirm whether the crbug.com/swiftshader/185
// workaround was intentionally omitted for arithmetic shifts.
Value *Nucleus::createAShr(Value *lhs, Value *rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(jit->builder->CreateAShr(V(lhs), V(rhs)));
}
816
createAnd(Value * lhs,Value * rhs)817 Value *Nucleus::createAnd(Value *lhs, Value *rhs)
818 {
819 RR_DEBUG_INFO_UPDATE_LOC();
820 return V(jit->builder->CreateAnd(V(lhs), V(rhs)));
821 }
822
createOr(Value * lhs,Value * rhs)823 Value *Nucleus::createOr(Value *lhs, Value *rhs)
824 {
825 RR_DEBUG_INFO_UPDATE_LOC();
826 return V(jit->builder->CreateOr(V(lhs), V(rhs)));
827 }
828
createXor(Value * lhs,Value * rhs)829 Value *Nucleus::createXor(Value *lhs, Value *rhs)
830 {
831 RR_DEBUG_INFO_UPDATE_LOC();
832 return V(jit->builder->CreateXor(V(lhs), V(rhs)));
833 }
834
createNeg(Value * v)835 Value *Nucleus::createNeg(Value *v)
836 {
837 RR_DEBUG_INFO_UPDATE_LOC();
838 return V(jit->builder->CreateNeg(V(v)));
839 }
840
createFNeg(Value * v)841 Value *Nucleus::createFNeg(Value *v)
842 {
843 RR_DEBUG_INFO_UPDATE_LOC();
844 return V(jit->builder->CreateFNeg(V(v)));
845 }
846
createNot(Value * v)847 Value *Nucleus::createNot(Value *v)
848 {
849 RR_DEBUG_INFO_UPDATE_LOC();
850 return V(jit->builder->CreateNot(V(v)));
851 }
852
// Emits a load of `type` from `ptr`, handling the emulated short vector
// types (loaded via 64- or 32-bit scalars into a 128-bit vector) and the
// various atomic cases LLVM backends can't all express directly.
Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	switch(asInternalType(type))
	{
	// 64-bit emulated vectors: load as a single i64 into lane 0 of a
	// <2 x i64> and bitcast to the logical type.
	case Type_v2i32:
	case Type_v4i16:
	case Type_v8i8:
	case Type_v2f32:
		return createBitCast(
		    createInsertElement(
		        V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false))),
		        createLoad(createBitCast(ptr, Pointer<Long>::type()), Long::type(), isVolatile, alignment, atomic, memoryOrder),
		        0),
		    type);
	// 32-bit emulated vectors: load as an i32, widen to i64, place in lane 0.
	case Type_v2i16:
	case Type_v4i8:
		if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
		{
			Value *u = V(llvm::UndefValue::get(llvm::VectorType::get(T(Long::type()), 2, false)));
			Value *i = createLoad(createBitCast(ptr, Pointer<Int>::type()), Int::type(), isVolatile, alignment, atomic, memoryOrder);
			i = createZExt(i, Long::type());
			Value *v = createInsertElement(u, i, 0);
			return createBitCast(v, type);
		}
		// Fallthrough to non-emulated case.
	case Type_LLVM:
		{
			auto elTy = T(type);

			if(!atomic)
			{
				return V(jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile));
			}
			else if(elTy->isIntegerTy() || elTy->isPointerTy())
			{
				// Integers and pointers can be atomically loaded by setting
				// the ordering constraint on the load instruction.
				auto load = jit->builder->CreateAlignedLoad(elTy, V(ptr), llvm::MaybeAlign(alignment), isVolatile);
				load->setAtomic(atomicOrdering(atomic, memoryOrder));
				return V(load);
			}
			else if(elTy->isFloatTy() || elTy->isDoubleTy())
			{
				// LLVM claims to support atomic loads of float types as
				// above, but certain backends cannot deal with this.
				// Load as an integer and bitcast. See b/136037244.
				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
				auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
				auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
				auto load = jit->builder->CreateAlignedLoad(elAsIntTy, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
				load->setAtomic(atomicOrdering(atomic, memoryOrder));
				auto loadCast = jit->builder->CreateBitCast(load, elTy);
				return V(loadCast);
			}
			else
			{
				// More exotic types require falling back to the extern:
				// void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
				auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
				auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
				auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
				auto i8PtrTy = i8Ty->getPointerTo();
				auto voidTy = llvm::Type::getVoidTy(*jit->context);
				auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
				auto func = jit->module->getOrInsertFunction("__atomic_load", funcTy);
				auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
				// Load into a stack temporary, then read the result back.
				auto out = allocateStackVariable(type);
				jit->builder->CreateCall(func, {
				                                   llvm::ConstantInt::get(sizetTy, size),
				                                   jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
				                                   jit->builder->CreatePointerCast(V(out), i8PtrTy),
				                                   llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
				                               });
				return V(jit->builder->CreateLoad(T(type), V(out)));
			}
		}
	default:
		UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
		return nullptr;
	}
}
935
// Emits a store of 'value' (of Reactor type 'type') to 'ptr'.
// Three cases are handled:
//  - 64-bit emulated vector types (v2i32/v4i16/v8i8/v2f32): these are backed by
//    128-bit vectors, so only the low 64 bits are extracted and stored as a Long.
//  - 32-bit emulated vector types (v2i16/v4i8): stored as a single Int, except
//    for local variables (alignment == 0), which occupy a full 128-bit vector
//    and fall through to the Type_LLVM path.
//  - Native LLVM types: stored directly, with atomic handling as required.
// Returns 'value' so stores can be chained.
Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int alignment, bool atomic, std::memory_order memoryOrder)
{
    RR_DEBUG_INFO_UPDATE_LOC();
    switch(asInternalType(type))
    {
    case Type_v2i32:
    case Type_v4i16:
    case Type_v8i8:
    case Type_v2f32:
        // Reinterpret as <2 x i64>, keep lane 0 (the low 64 bits), store as Long.
        createStore(
            createExtractElement(
                createBitCast(value, T(llvm::VectorType::get(T(Long::type()), 2, false))), Long::type(), 0),
            createBitCast(ptr, Pointer<Long>::type()),
            Long::type(), isVolatile, alignment, atomic, memoryOrder);
        return value;
    case Type_v2i16:
    case Type_v4i8:
        if(alignment != 0)  // Not a local variable (all vectors are 128-bit).
        {
            // Store only the low 32 bits, as an Int.
            createStore(
                createExtractElement(createBitCast(value, Int4::type()), Int::type(), 0),
                createBitCast(ptr, Pointer<Int>::type()),
                Int::type(), isVolatile, alignment, atomic, memoryOrder);
            return value;
        }
        // Fallthrough to non-emulated case.
    case Type_LLVM:
    {
        auto elTy = T(type);

        if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
        {
            // Mark all memory writes as initialized by calling __msan_unpoison
            // void __msan_unpoison(const volatile void *a, size_t size)
            auto voidTy = llvm::Type::getVoidTy(*jit->context);
            auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
            auto voidPtrTy = i8Ty->getPointerTo();
            auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
            auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
            auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
            auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);

            jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(V(ptr), voidPtrTy),
                                             llvm::ConstantInt::get(sizetTy, size) });
        }

        if(!atomic)
        {
            jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
        }
        else if(elTy->isIntegerTy() || elTy->isPointerTy())
        {
            // Integers and pointers can be atomically stored by setting
            // the ordering constraint on the store instruction.
            auto store = jit->builder->CreateAlignedStore(V(value), V(ptr), llvm::MaybeAlign(alignment), isVolatile);
            store->setAtomic(atomicOrdering(atomic, memoryOrder));
        }
        else if(elTy->isFloatTy() || elTy->isDoubleTy())
        {
            // LLVM claims to support atomic stores of float types as
            // above, but certain backends cannot deal with this.
            // Store as an bitcast integer. See b/136037244.
            auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
            auto elAsIntTy = llvm::IntegerType::get(*jit->context, size * 8);
            auto valCast = jit->builder->CreateBitCast(V(value), elAsIntTy);
            auto ptrCast = jit->builder->CreatePointerCast(V(ptr), elAsIntTy->getPointerTo());
            auto store = jit->builder->CreateAlignedStore(valCast, ptrCast, llvm::MaybeAlign(alignment), isVolatile);
            store->setAtomic(atomicOrdering(atomic, memoryOrder));
        }
        else
        {
            // More exotic types require falling back to the extern:
            // void __atomic_store(size_t size, void *ptr, void *val, int ordering)
            auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
            auto intTy = llvm::IntegerType::get(*jit->context, sizeof(int) * 8);
            auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
            auto i8PtrTy = i8Ty->getPointerTo();
            auto voidTy = llvm::Type::getVoidTy(*jit->context);
            auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy, i8PtrTy, i8PtrTy, intTy }, false);
            auto func = jit->module->getOrInsertFunction("__atomic_store", funcTy);
            auto size = jit->module->getDataLayout().getTypeStoreSize(elTy);
            // The value is spilled to the stack so its address can be passed to the helper.
            auto copy = allocateStackVariable(type);
            jit->builder->CreateStore(V(value), V(copy));
            jit->builder->CreateCall(func, {
                                               llvm::ConstantInt::get(sizetTy, size),
                                               jit->builder->CreatePointerCast(V(ptr), i8PtrTy),
                                               jit->builder->CreatePointerCast(V(copy), i8PtrTy),
                                               llvm::ConstantInt::get(intTy, uint64_t(atomicOrdering(true, memoryOrder))),
                                           });
        }

        return value;
    }
    default:
        UNREACHABLE("asInternalType(type): %d", int(asInternalType(type)));
        return nullptr;
    }
}
1034
createMaskedLoad(Value * ptr,Type * elTy,Value * mask,unsigned int alignment,bool zeroMaskedLanes)1035 Value *Nucleus::createMaskedLoad(Value *ptr, Type *elTy, Value *mask, unsigned int alignment, bool zeroMaskedLanes)
1036 {
1037 RR_DEBUG_INFO_UPDATE_LOC();
1038
1039 ASSERT(V(ptr)->getType()->isPointerTy());
1040 ASSERT(V(mask)->getType()->isVectorTy());
1041
1042 auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
1043 auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
1044 auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
1045 auto elVecTy = llvm::VectorType::get(T(elTy), numEls, false);
1046 auto elVecPtrTy = elVecTy->getPointerTo();
1047 auto i8Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false); // vec<int, int, ...> -> vec<bool, bool, ...>
1048 auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);
1049 auto align = llvm::ConstantInt::get(i32Ty, alignment);
1050 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_load, { elVecTy, elVecPtrTy });
1051 return V(jit->builder->CreateCall(func, { V(ptr), align, i8Mask, passthrough }));
1052 }
1053
// Emits an llvm.masked.store: stores 'val' element-wise to 'ptr', writing only
// the lanes whose mask bit is set. Under MemorySanitizer (when Reactor code is
// not itself instrumented) each written lane is additionally unpoisoned, guarded
// by a per-lane branch on the mask.
void Nucleus::createMaskedStore(Value *ptr, Value *val, Value *mask, unsigned int alignment)
{
    RR_DEBUG_INFO_UPDATE_LOC();

    ASSERT(V(ptr)->getType()->isPointerTy());
    ASSERT(V(val)->getType()->isVectorTy());
    ASSERT(V(mask)->getType()->isVectorTy());

    auto numEls = llvm::cast<llvm::FixedVectorType>(V(mask)->getType())->getNumElements();
    auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
    auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
    auto elVecTy = V(val)->getType();
    auto elVecPtrTy = elVecTy->getPointerTo();
    auto i1Mask = jit->builder->CreateIntCast(V(mask), llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
    auto align = llvm::ConstantInt::get(i32Ty, alignment);
    auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_store, { elVecTy, elVecPtrTy });
    jit->builder->CreateCall(func, { V(val), V(ptr), align, i1Mask });

    if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
    {
        // Mark memory writes as initialized by calling __msan_unpoison
        // void __msan_unpoison(const volatile void *a, size_t size)
        auto voidTy = llvm::Type::getVoidTy(*jit->context);
        auto voidPtrTy = voidTy->getPointerTo();
        auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
        auto funcTy = llvm::FunctionType::get(voidTy, { voidPtrTy, sizetTy }, false);
        auto func = jit->module->getOrInsertFunction("__msan_unpoison", funcTy);
        auto size = jit->module->getDataLayout().getTypeStoreSize(llvm::cast<llvm::VectorType>(elVecTy)->getElementType());

        for(unsigned i = 0; i < numEls; i++)
        {
            // Check mask for this element
            auto idx = llvm::ConstantInt::get(i32Ty, i);
            auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
            auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
            jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
            jit->builder->SetInsertPoint(thenBlock);

            // Insert __msan_unpoison call in conditional block
            auto elPtr = jit->builder->CreateGEP(elVecTy, V(ptr), idx);
            jit->builder->CreateCall(func, { jit->builder->CreatePointerCast(elPtr, voidPtrTy),
                                             llvm::ConstantInt::get(sizetTy, size) });

            jit->builder->CreateBr(mergeBlock);
            jit->builder->SetInsertPoint(mergeBlock);
        }
    }
}
1102
// Emits a masked gather: loads one element of type 'elTy' per active mask lane
// from 'base' + per-lane byte 'offsets'. Inactive lanes are zeroed when
// 'zeroMaskedLanes' is true, otherwise left undefined. Under MemorySanitizer the
// llvm.masked.gather intrinsic is not instrumented, so the gather is emulated
// with per-lane conditional scalar loads instead.
static llvm::Value *createGather(llvm::Value *base, llvm::Type *elTy, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment, bool zeroMaskedLanes)
{
    ASSERT(base->getType()->isPointerTy());
    ASSERT(offsets->getType()->isVectorTy());
    ASSERT(mask->getType()->isVectorTy());

    auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
    auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
    auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
    auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
    auto i8PtrTy = i8Ty->getPointerTo();
    auto elPtrTy = elTy->getPointerTo();
    auto elVecTy = llvm::VectorType::get(elTy, numEls, false);
    auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);
    // Offsets are byte offsets: GEP on an i8 base pointer, then cast to element pointers.
    auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
    auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
    auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
    auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>
    auto passthrough = zeroMaskedLanes ? llvm::Constant::getNullValue(elVecTy) : llvm::UndefValue::get(elVecTy);

    if(!__has_feature(memory_sanitizer))
    {
        auto align = llvm::ConstantInt::get(i32Ty, alignment);
        auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_gather, { elVecTy, elPtrVecTy });
        return jit->builder->CreateCall(func, { elPtrs, align, i1Mask, passthrough });
    }
    else  // __has_feature(memory_sanitizer)
    {
        // MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_gather
        // Work around it by emulating gather with element-wise loads.
        // TODO(b/172238865): Remove when supported by MemorySanitizer.

        // Accumulate the result in a stack slot, starting from the passthrough value.
        Value *result = Nucleus::allocateStackVariable(T(elVecTy));
        Nucleus::createStore(V(passthrough), result, T(elVecTy));

        for(unsigned i = 0; i < numEls; i++)
        {
            // Check mask for this element
            Value *elementMask = Nucleus::createExtractElement(V(i1Mask), T(i1Ty), i);

            If(RValue<Bool>(elementMask))
            {
                Value *elPtr = Nucleus::createExtractElement(V(elPtrs), T(elPtrTy), i);
                Value *el = Nucleus::createLoad(elPtr, T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);

                Value *v = Nucleus::createLoad(result, T(elVecTy));
                v = Nucleus::createInsertElement(v, el, i);
                Nucleus::createStore(v, result, T(elVecTy));
            }
        }

        return V(Nucleus::createLoad(result, T(elVecTy)));
    }
}
1157
Gather(RValue<Pointer<Float>> base,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment,bool zeroMaskedLanes)1158 RValue<SIMD::Float> Gather(RValue<Pointer<Float>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1159 {
1160 return As<SIMD::Float>(V(createGather(V(base.value()), T(Float::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1161 }
1162
Gather(RValue<Pointer<Int>> base,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment,bool zeroMaskedLanes)1163 RValue<SIMD::Int> Gather(RValue<Pointer<Int>> base, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment, bool zeroMaskedLanes /* = false */)
1164 {
1165 return As<SIMD::Int>(V(createGather(V(base.value()), T(Int::type()), V(offsets.value()), V(mask.value()), alignment, zeroMaskedLanes)));
1166 }
1167
// Emits a masked scatter: stores one lane of 'val' per active mask lane to
// 'base' + per-lane byte 'offsets'. Under MemorySanitizer the
// llvm.masked.scatter intrinsic is not instrumented, so the scatter is emulated
// with per-lane conditional scalar stores.
static void createScatter(llvm::Value *base, llvm::Value *val, llvm::Value *offsets, llvm::Value *mask, unsigned int alignment)
{
    ASSERT(base->getType()->isPointerTy());
    ASSERT(val->getType()->isVectorTy());
    ASSERT(offsets->getType()->isVectorTy());
    ASSERT(mask->getType()->isVectorTy());

    auto numEls = llvm::cast<llvm::FixedVectorType>(mask->getType())->getNumElements();
    auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
    auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
    auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
    auto i8PtrTy = i8Ty->getPointerTo();
    auto elVecTy = val->getType();
    auto elTy = llvm::cast<llvm::VectorType>(elVecTy)->getElementType();
    auto elPtrTy = elTy->getPointerTo();
    auto elPtrVecTy = llvm::VectorType::get(elPtrTy, numEls, false);

    // Offsets are byte offsets: GEP on an i8 base pointer, then cast to element pointers.
    auto i8Base = jit->builder->CreatePointerCast(base, i8PtrTy);
    auto i8Ptrs = jit->builder->CreateGEP(i8Ty, i8Base, offsets);
    auto elPtrs = jit->builder->CreatePointerCast(i8Ptrs, elPtrVecTy);
    auto i1Mask = jit->builder->CreateIntCast(mask, llvm::VectorType::get(i1Ty, numEls, false), false);  // vec<int, int, ...> -> vec<bool, bool, ...>

    if(!__has_feature(memory_sanitizer))
    {
        auto align = llvm::ConstantInt::get(i32Ty, alignment);
        auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::masked_scatter, { elVecTy, elPtrVecTy });
        jit->builder->CreateCall(func, { val, elPtrs, align, i1Mask });
    }
    else  // __has_feature(memory_sanitizer)
    {
        // MemorySanitizer currently does not support instrumenting llvm::Intrinsic::masked_scatter
        // Work around it by emulating scatter with element-wise stores.
        // TODO(b/172238865): Remove when supported by MemorySanitizer.

        for(unsigned i = 0; i < numEls; i++)
        {
            // Check mask for this element
            auto idx = llvm::ConstantInt::get(i32Ty, i);
            auto thenBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
            auto mergeBlock = llvm::BasicBlock::Create(*jit->context, "", jit->function);
            jit->builder->CreateCondBr(jit->builder->CreateExtractElement(i1Mask, idx), thenBlock, mergeBlock);
            jit->builder->SetInsertPoint(thenBlock);

            auto el = jit->builder->CreateExtractElement(val, idx);
            auto elPtr = jit->builder->CreateExtractElement(elPtrs, idx);
            Nucleus::createStore(V(el), V(elPtr), T(elTy), /*isVolatile */ false, alignment, /* atomic */ false, std::memory_order_relaxed);

            jit->builder->CreateBr(mergeBlock);
            jit->builder->SetInsertPoint(mergeBlock);
        }
    }
}
1220
Scatter(RValue<Pointer<Float>> base,RValue<SIMD::Float> val,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment)1221 void Scatter(RValue<Pointer<Float>> base, RValue<SIMD::Float> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
1222 {
1223 return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1224 }
1225
Scatter(RValue<Pointer<Int>> base,RValue<SIMD::Int> val,RValue<SIMD::Int> offsets,RValue<SIMD::Int> mask,unsigned int alignment)1226 void Scatter(RValue<Pointer<Int>> base, RValue<SIMD::Int> val, RValue<SIMD::Int> offsets, RValue<SIMD::Int> mask, unsigned int alignment)
1227 {
1228 return createScatter(V(base.value()), V(val.value()), V(offsets.value()), V(mask.value()), alignment);
1229 }
1230
createFence(std::memory_order memoryOrder)1231 void Nucleus::createFence(std::memory_order memoryOrder)
1232 {
1233 RR_DEBUG_INFO_UPDATE_LOC();
1234 jit->builder->CreateFence(atomicOrdering(true, memoryOrder));
1235 }
1236
// Computes the address of element 'index' in an array of 'type' starting at 'ptr'.
// Native LLVM types use a plain GEP; emulated narrow vector types need a manual
// byte-offset computation because their in-memory size differs from their LLVM
// representation.
Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index, bool unsignedIndex)
{
    RR_DEBUG_INFO_UPDATE_LOC();

    if(sizeof(void *) == 8)
    {
        // LLVM manual: "When indexing into an array, pointer or vector,
        // integers of any width are allowed, and they are not required to
        // be constant. These integers are treated as signed values where
        // relevant."
        //
        // Thus if we want indexes to be treated as unsigned we have to
        // zero-extend them ourselves.
        //
        // Note that this is not because we want to address anywhere near
        // 4 GB of data. Instead this is important for performance because
        // x86 supports automatic zero-extending of 32-bit registers to
        // 64-bit. Thus when indexing into an array using a uint32 is
        // actually faster than an int32.
        index = unsignedIndex ? createZExt(index, Long::type()) : createSExt(index, Long::type());
    }

    // For non-emulated types we can rely on LLVM's GEP to calculate the
    // effective address correctly.
    if(asInternalType(type) == Type_LLVM)
    {
        return V(jit->builder->CreateGEP(T(type), V(ptr), V(index)));
    }

    // For emulated types we have to multiply the index by the intended
    // type size ourselves to obtain the byte offset.
    index = (sizeof(void *) == 8) ? createMul(index, createConstantLong((int64_t)typeSize(type))) : createMul(index, createConstantInt((int)typeSize(type)));

    // Cast to a byte pointer, apply the byte offset, and cast back to the
    // original pointer type.
    return createBitCast(
        V(jit->builder->CreateGEP(T(Byte::type()), V(createBitCast(ptr, T(llvm::PointerType::get(T(Byte::type()), 0)))), V(index))),
        T(llvm::PointerType::get(T(type), 0)));
}
1276
createAtomicAdd(Value * ptr,Value * value,std::memory_order memoryOrder)1277 Value *Nucleus::createAtomicAdd(Value *ptr, Value *value, std::memory_order memoryOrder)
1278 {
1279 RR_DEBUG_INFO_UPDATE_LOC();
1280 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Add, V(ptr), V(value),
1281 #if LLVM_VERSION_MAJOR >= 11
1282 llvm::MaybeAlign(),
1283 #endif
1284 atomicOrdering(true, memoryOrder)));
1285 }
1286
createAtomicSub(Value * ptr,Value * value,std::memory_order memoryOrder)1287 Value *Nucleus::createAtomicSub(Value *ptr, Value *value, std::memory_order memoryOrder)
1288 {
1289 RR_DEBUG_INFO_UPDATE_LOC();
1290 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Sub, V(ptr), V(value),
1291 #if LLVM_VERSION_MAJOR >= 11
1292 llvm::MaybeAlign(),
1293 #endif
1294 atomicOrdering(true, memoryOrder)));
1295 }
1296
createAtomicAnd(Value * ptr,Value * value,std::memory_order memoryOrder)1297 Value *Nucleus::createAtomicAnd(Value *ptr, Value *value, std::memory_order memoryOrder)
1298 {
1299 RR_DEBUG_INFO_UPDATE_LOC();
1300 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::And, V(ptr), V(value),
1301 #if LLVM_VERSION_MAJOR >= 11
1302 llvm::MaybeAlign(),
1303 #endif
1304 atomicOrdering(true, memoryOrder)));
1305 }
1306
createAtomicOr(Value * ptr,Value * value,std::memory_order memoryOrder)1307 Value *Nucleus::createAtomicOr(Value *ptr, Value *value, std::memory_order memoryOrder)
1308 {
1309 RR_DEBUG_INFO_UPDATE_LOC();
1310 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Or, V(ptr), V(value),
1311 #if LLVM_VERSION_MAJOR >= 11
1312 llvm::MaybeAlign(),
1313 #endif
1314 atomicOrdering(true, memoryOrder)));
1315 }
1316
createAtomicXor(Value * ptr,Value * value,std::memory_order memoryOrder)1317 Value *Nucleus::createAtomicXor(Value *ptr, Value *value, std::memory_order memoryOrder)
1318 {
1319 RR_DEBUG_INFO_UPDATE_LOC();
1320 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xor, V(ptr), V(value),
1321 #if LLVM_VERSION_MAJOR >= 11
1322 llvm::MaybeAlign(),
1323 #endif
1324 atomicOrdering(true, memoryOrder)));
1325 }
1326
createAtomicMin(Value * ptr,Value * value,std::memory_order memoryOrder)1327 Value *Nucleus::createAtomicMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1328 {
1329 RR_DEBUG_INFO_UPDATE_LOC();
1330 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Min, V(ptr), V(value),
1331 #if LLVM_VERSION_MAJOR >= 11
1332 llvm::MaybeAlign(),
1333 #endif
1334 atomicOrdering(true, memoryOrder)));
1335 }
1336
createAtomicMax(Value * ptr,Value * value,std::memory_order memoryOrder)1337 Value *Nucleus::createAtomicMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1338 {
1339 RR_DEBUG_INFO_UPDATE_LOC();
1340 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Max, V(ptr), V(value),
1341 #if LLVM_VERSION_MAJOR >= 11
1342 llvm::MaybeAlign(),
1343 #endif
1344 atomicOrdering(true, memoryOrder)));
1345 }
1346
createAtomicUMin(Value * ptr,Value * value,std::memory_order memoryOrder)1347 Value *Nucleus::createAtomicUMin(Value *ptr, Value *value, std::memory_order memoryOrder)
1348 {
1349 RR_DEBUG_INFO_UPDATE_LOC();
1350 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMin, V(ptr), V(value),
1351 #if LLVM_VERSION_MAJOR >= 11
1352 llvm::MaybeAlign(),
1353 #endif
1354 atomicOrdering(true, memoryOrder)));
1355 }
1356
createAtomicUMax(Value * ptr,Value * value,std::memory_order memoryOrder)1357 Value *Nucleus::createAtomicUMax(Value *ptr, Value *value, std::memory_order memoryOrder)
1358 {
1359 RR_DEBUG_INFO_UPDATE_LOC();
1360 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::UMax, V(ptr), V(value),
1361 #if LLVM_VERSION_MAJOR >= 11
1362 llvm::MaybeAlign(),
1363 #endif
1364 atomicOrdering(true, memoryOrder)));
1365 }
1366
createAtomicExchange(Value * ptr,Value * value,std::memory_order memoryOrder)1367 Value *Nucleus::createAtomicExchange(Value *ptr, Value *value, std::memory_order memoryOrder)
1368 {
1369 RR_DEBUG_INFO_UPDATE_LOC();
1370 return V(jit->builder->CreateAtomicRMW(llvm::AtomicRMWInst::Xchg, V(ptr), V(value),
1371 #if LLVM_VERSION_MAJOR >= 11
1372 llvm::MaybeAlign(),
1373 #endif
1374 atomicOrdering(true, memoryOrder)));
1375 }
1376
createAtomicCompareExchange(Value * ptr,Value * value,Value * compare,std::memory_order memoryOrderEqual,std::memory_order memoryOrderUnequal)1377 Value *Nucleus::createAtomicCompareExchange(Value *ptr, Value *value, Value *compare, std::memory_order memoryOrderEqual, std::memory_order memoryOrderUnequal)
1378 {
1379 RR_DEBUG_INFO_UPDATE_LOC();
1380 // Note: AtomicCmpXchgInstruction returns a 2-member struct containing {result, success-flag}, not the result directly.
1381 return V(jit->builder->CreateExtractValue(
1382 jit->builder->CreateAtomicCmpXchg(V(ptr), V(compare), V(value),
1383 #if LLVM_VERSION_MAJOR >= 11
1384 llvm::MaybeAlign(),
1385 #endif
1386 atomicOrdering(true, memoryOrderEqual),
1387 atomicOrdering(true, memoryOrderUnequal)),
1388 llvm::ArrayRef<unsigned>(0u)));
1389 }
1390
createTrunc(Value * v,Type * destType)1391 Value *Nucleus::createTrunc(Value *v, Type *destType)
1392 {
1393 RR_DEBUG_INFO_UPDATE_LOC();
1394 return V(jit->builder->CreateTrunc(V(v), T(destType)));
1395 }
1396
createZExt(Value * v,Type * destType)1397 Value *Nucleus::createZExt(Value *v, Type *destType)
1398 {
1399 RR_DEBUG_INFO_UPDATE_LOC();
1400 return V(jit->builder->CreateZExt(V(v), T(destType)));
1401 }
1402
createSExt(Value * v,Type * destType)1403 Value *Nucleus::createSExt(Value *v, Type *destType)
1404 {
1405 RR_DEBUG_INFO_UPDATE_LOC();
1406 return V(jit->builder->CreateSExt(V(v), T(destType)));
1407 }
1408
createFPToUI(Value * v,Type * destType)1409 Value *Nucleus::createFPToUI(Value *v, Type *destType)
1410 {
1411 RR_DEBUG_INFO_UPDATE_LOC();
1412 return V(jit->builder->CreateFPToUI(V(v), T(destType)));
1413 }
1414
createFPToSI(Value * v,Type * destType)1415 Value *Nucleus::createFPToSI(Value *v, Type *destType)
1416 {
1417 RR_DEBUG_INFO_UPDATE_LOC();
1418 return V(jit->builder->CreateFPToSI(V(v), T(destType)));
1419 }
1420
createSIToFP(Value * v,Type * destType)1421 Value *Nucleus::createSIToFP(Value *v, Type *destType)
1422 {
1423 RR_DEBUG_INFO_UPDATE_LOC();
1424 return V(jit->builder->CreateSIToFP(V(v), T(destType)));
1425 }
1426
createFPTrunc(Value * v,Type * destType)1427 Value *Nucleus::createFPTrunc(Value *v, Type *destType)
1428 {
1429 RR_DEBUG_INFO_UPDATE_LOC();
1430 return V(jit->builder->CreateFPTrunc(V(v), T(destType)));
1431 }
1432
createFPExt(Value * v,Type * destType)1433 Value *Nucleus::createFPExt(Value *v, Type *destType)
1434 {
1435 RR_DEBUG_INFO_UPDATE_LOC();
1436 return V(jit->builder->CreateFPExt(V(v), T(destType)));
1437 }
1438
createBitCast(Value * v,Type * destType)1439 Value *Nucleus::createBitCast(Value *v, Type *destType)
1440 {
1441 RR_DEBUG_INFO_UPDATE_LOC();
1442 // Bitcasts must be between types of the same logical size. But with emulated narrow vectors we need
1443 // support for casting between scalars and wide vectors. Emulate them by writing to the stack and
1444 // reading back as the destination type.
1445 if(!V(v)->getType()->isVectorTy() && T(destType)->isVectorTy())
1446 {
1447 Value *readAddress = allocateStackVariable(destType);
1448 Value *writeAddress = createBitCast(readAddress, T(llvm::PointerType::get(V(v)->getType(), 0)));
1449 createStore(v, writeAddress, T(V(v)->getType()));
1450 return createLoad(readAddress, destType);
1451 }
1452 else if(V(v)->getType()->isVectorTy() && !T(destType)->isVectorTy())
1453 {
1454 Value *writeAddress = allocateStackVariable(T(V(v)->getType()));
1455 createStore(v, writeAddress, T(V(v)->getType()));
1456 Value *readAddress = createBitCast(writeAddress, T(llvm::PointerType::get(T(destType), 0)));
1457 return createLoad(readAddress, destType);
1458 }
1459
1460 return V(jit->builder->CreateBitCast(V(v), T(destType)));
1461 }
1462
createICmpEQ(Value * lhs,Value * rhs)1463 Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
1464 {
1465 RR_DEBUG_INFO_UPDATE_LOC();
1466 return V(jit->builder->CreateICmpEQ(V(lhs), V(rhs)));
1467 }
1468
createICmpNE(Value * lhs,Value * rhs)1469 Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
1470 {
1471 RR_DEBUG_INFO_UPDATE_LOC();
1472 return V(jit->builder->CreateICmpNE(V(lhs), V(rhs)));
1473 }
1474
createICmpUGT(Value * lhs,Value * rhs)1475 Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
1476 {
1477 RR_DEBUG_INFO_UPDATE_LOC();
1478 return V(jit->builder->CreateICmpUGT(V(lhs), V(rhs)));
1479 }
1480
createICmpUGE(Value * lhs,Value * rhs)1481 Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
1482 {
1483 RR_DEBUG_INFO_UPDATE_LOC();
1484 return V(jit->builder->CreateICmpUGE(V(lhs), V(rhs)));
1485 }
1486
createICmpULT(Value * lhs,Value * rhs)1487 Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
1488 {
1489 RR_DEBUG_INFO_UPDATE_LOC();
1490 return V(jit->builder->CreateICmpULT(V(lhs), V(rhs)));
1491 }
1492
createICmpULE(Value * lhs,Value * rhs)1493 Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
1494 {
1495 RR_DEBUG_INFO_UPDATE_LOC();
1496 return V(jit->builder->CreateICmpULE(V(lhs), V(rhs)));
1497 }
1498
createICmpSGT(Value * lhs,Value * rhs)1499 Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
1500 {
1501 RR_DEBUG_INFO_UPDATE_LOC();
1502 return V(jit->builder->CreateICmpSGT(V(lhs), V(rhs)));
1503 }
1504
createICmpSGE(Value * lhs,Value * rhs)1505 Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
1506 {
1507 RR_DEBUG_INFO_UPDATE_LOC();
1508 return V(jit->builder->CreateICmpSGE(V(lhs), V(rhs)));
1509 }
1510
createICmpSLT(Value * lhs,Value * rhs)1511 Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
1512 {
1513 RR_DEBUG_INFO_UPDATE_LOC();
1514 return V(jit->builder->CreateICmpSLT(V(lhs), V(rhs)));
1515 }
1516
createICmpSLE(Value * lhs,Value * rhs)1517 Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
1518 {
1519 RR_DEBUG_INFO_UPDATE_LOC();
1520 return V(jit->builder->CreateICmpSLE(V(lhs), V(rhs)));
1521 }
1522
createFCmpOEQ(Value * lhs,Value * rhs)1523 Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
1524 {
1525 RR_DEBUG_INFO_UPDATE_LOC();
1526 return V(jit->builder->CreateFCmpOEQ(V(lhs), V(rhs)));
1527 }
1528
createFCmpOGT(Value * lhs,Value * rhs)1529 Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
1530 {
1531 RR_DEBUG_INFO_UPDATE_LOC();
1532 return V(jit->builder->CreateFCmpOGT(V(lhs), V(rhs)));
1533 }
1534
createFCmpOGE(Value * lhs,Value * rhs)1535 Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
1536 {
1537 RR_DEBUG_INFO_UPDATE_LOC();
1538 return V(jit->builder->CreateFCmpOGE(V(lhs), V(rhs)));
1539 }
1540
createFCmpOLT(Value * lhs,Value * rhs)1541 Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
1542 {
1543 RR_DEBUG_INFO_UPDATE_LOC();
1544 return V(jit->builder->CreateFCmpOLT(V(lhs), V(rhs)));
1545 }
1546
createFCmpOLE(Value * lhs,Value * rhs)1547 Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
1548 {
1549 RR_DEBUG_INFO_UPDATE_LOC();
1550 return V(jit->builder->CreateFCmpOLE(V(lhs), V(rhs)));
1551 }
1552
createFCmpONE(Value * lhs,Value * rhs)1553 Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
1554 {
1555 RR_DEBUG_INFO_UPDATE_LOC();
1556 return V(jit->builder->CreateFCmpONE(V(lhs), V(rhs)));
1557 }
1558
createFCmpORD(Value * lhs,Value * rhs)1559 Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
1560 {
1561 RR_DEBUG_INFO_UPDATE_LOC();
1562 return V(jit->builder->CreateFCmpORD(V(lhs), V(rhs)));
1563 }
1564
createFCmpUNO(Value * lhs,Value * rhs)1565 Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
1566 {
1567 RR_DEBUG_INFO_UPDATE_LOC();
1568 return V(jit->builder->CreateFCmpUNO(V(lhs), V(rhs)));
1569 }
1570
createFCmpUEQ(Value * lhs,Value * rhs)1571 Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
1572 {
1573 RR_DEBUG_INFO_UPDATE_LOC();
1574 return V(jit->builder->CreateFCmpUEQ(V(lhs), V(rhs)));
1575 }
1576
createFCmpUGT(Value * lhs,Value * rhs)1577 Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
1578 {
1579 RR_DEBUG_INFO_UPDATE_LOC();
1580 return V(jit->builder->CreateFCmpUGT(V(lhs), V(rhs)));
1581 }
1582
createFCmpUGE(Value * lhs,Value * rhs)1583 Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
1584 {
1585 RR_DEBUG_INFO_UPDATE_LOC();
1586 return V(jit->builder->CreateFCmpUGE(V(lhs), V(rhs)));
1587 }
1588
createFCmpULT(Value * lhs,Value * rhs)1589 Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
1590 {
1591 RR_DEBUG_INFO_UPDATE_LOC();
1592 return V(jit->builder->CreateFCmpULT(V(lhs), V(rhs)));
1593 }
1594
createFCmpULE(Value * lhs,Value * rhs)1595 Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
1596 {
1597 RR_DEBUG_INFO_UPDATE_LOC();
1598 return V(jit->builder->CreateFCmpULE(V(lhs), V(rhs)));
1599 }
1600
createFCmpUNE(Value * lhs,Value * rhs)1601 Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
1602 {
1603 RR_DEBUG_INFO_UPDATE_LOC();
1604 return V(jit->builder->CreateFCmpUNE(V(lhs), V(rhs)));
1605 }
1606
// Extracts lane 'index' from 'vector'. 'type' is only used to sanity-check
// that the vector's element type matches what the caller expects.
Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	ASSERT(V(vector)->getType()->getContainedType(0) == T(type));
	return V(jit->builder->CreateExtractElement(V(vector), V(createConstantInt(index))));
}

// Returns a copy of 'vector' with lane 'index' replaced by 'element'.
Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(jit->builder->CreateInsertElement(V(vector), V(element), V(createConstantInt(index))));
}
1619
createShuffleVector(Value * v1,Value * v2,std::vector<int> select)1620 Value *Nucleus::createShuffleVector(Value *v1, Value *v2, std::vector<int> select)
1621 {
1622 RR_DEBUG_INFO_UPDATE_LOC();
1623
1624 size_t size = llvm::cast<llvm::FixedVectorType>(V(v1)->getType())->getNumElements();
1625 ASSERT(size == llvm::cast<llvm::FixedVectorType>(V(v2)->getType())->getNumElements());
1626
1627 llvm::SmallVector<int, 16> mask;
1628 const size_t selectSize = select.size();
1629 for(size_t i = 0; i < size; i++)
1630 {
1631 mask.push_back(select[i % selectSize]);
1632 }
1633
1634 return V(lowerShuffleVector(V(v1), V(v2), mask));
1635 }
1636
// Lowers to an LLVM 'select': c ? ifTrue : ifFalse (per lane for vectors).
Value *Nucleus::createSelect(Value *c, Value *ifTrue, Value *ifFalse)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(jit->builder->CreateSelect(V(c), V(ifTrue), V(ifFalse)));
}

// Creates a switch on 'control'. 'numCases' is only an allocation hint for
// LLVM; actual cases are attached afterwards via addSwitchCase().
SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return reinterpret_cast<SwitchCases *>(jit->builder->CreateSwitch(V(control), B(defaultBranch), numCases));
}

// Adds the case 'label' -> 'branch' to a switch created by createSwitch().
// SwitchCases is an opaque alias for llvm::SwitchInst.
void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	llvm::SwitchInst *sw = reinterpret_cast<llvm::SwitchInst *>(switchCases);
	sw->addCase(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), label, true), B(branch));
}

// Terminates the current basic block with an 'unreachable' instruction.
void Nucleus::createUnreachable()
{
	RR_DEBUG_INFO_UPDATE_LOC();
	jit->builder->CreateUnreachable();
}
1661
// Returns the Reactor type of an IR value.
Type *Nucleus::getType(Value *value)
{
	return T(V(value)->getType());
}

// Returns the element type of a vector (or other contained) type.
Type *Nucleus::getContainedType(Type *vectorType)
{
	return T(T(vectorType)->getContainedType(0));
}

// Pointer-to-ElementType in the default address space (0).
Type *Nucleus::getPointerType(Type *ElementType)
{
	return T(llvm::PointerType::get(T(ElementType), 0));
}

// Integer type with the same bit width as the host's C 'int'.
static llvm::Type *getNaturalIntType()
{
	return llvm::Type::getIntNTy(*jit->context, sizeof(int) * 8);
}
1681
// Returns the type a value must be widened to before being passed to a
// printf-style variadic call: integers widen to the natural int width and
// floats widen to double, mirroring C's default argument promotions.
Type *Nucleus::getPrintfStorageType(Type *valueType)
{
	llvm::Type *valueTy = T(valueType);
	if(valueTy->isIntegerTy())
	{
		return T(getNaturalIntType());
	}
	if(valueTy->isFloatTy())
	{
		return T(llvm::Type::getDoubleTy(*jit->context));
	}

	UNIMPLEMENTED_NO_BUG("getPrintfStorageType: add more cases as needed");
	return {};
}
1697
// Zero/null constant of the given type (works for scalars, vectors, pointers).
Value *Nucleus::createNullValue(Type *Ty)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::Constant::getNullValue(T(Ty)));
}

// The createConstant* helpers below build scalar LLVM constants. The final
// boolean argument of ConstantInt::get is 'isSigned', i.e. whether the
// value should be sign-extended to the constant's width.
Value *Nucleus::createConstantLong(int64_t i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), i, true));
}

Value *Nucleus::createConstantInt(int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, true));
}

Value *Nucleus::createConstantInt(unsigned int i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantInt::get(llvm::Type::getInt32Ty(*jit->context), i, false));
}

Value *Nucleus::createConstantBool(bool b)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantInt::get(llvm::Type::getInt1Ty(*jit->context), b));
}

Value *Nucleus::createConstantByte(signed char i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, true));
}

Value *Nucleus::createConstantByte(unsigned char i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantInt::get(llvm::Type::getInt8Ty(*jit->context), i, false));
}

Value *Nucleus::createConstantShort(short i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, true));
}

Value *Nucleus::createConstantShort(unsigned short i)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantInt::get(llvm::Type::getInt16Ty(*jit->context), i, false));
}

Value *Nucleus::createConstantFloat(float x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantFP::get(T(Float::type()), x));
}

// Null pointer-to-Ty in the default address space (0).
Value *Nucleus::createNullPointer(Type *Ty)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(T(Ty), 0)));
}
1763
createConstantVector(std::vector<int64_t> constants,Type * type)1764 Value *Nucleus::createConstantVector(std::vector<int64_t> constants, Type *type)
1765 {
1766 RR_DEBUG_INFO_UPDATE_LOC();
1767 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1768 const size_t numConstants = constants.size(); // Number of provided constants for the (emulated) type.
1769 const size_t numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1770 llvm::SmallVector<llvm::Constant *, 16> constantVector;
1771
1772 for(size_t i = 0; i < numElements; i++)
1773 {
1774 constantVector.push_back(llvm::ConstantInt::get(T(type)->getContainedType(0), constants[i % numConstants]));
1775 }
1776
1777 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector)));
1778 }
1779
createConstantVector(std::vector<double> constants,Type * type)1780 Value *Nucleus::createConstantVector(std::vector<double> constants, Type *type)
1781 {
1782 RR_DEBUG_INFO_UPDATE_LOC();
1783 ASSERT(llvm::isa<llvm::VectorType>(T(type)));
1784 const size_t numConstants = constants.size(); // Number of provided constants for the (emulated) type.
1785 const size_t numElements = llvm::cast<llvm::FixedVectorType>(T(type))->getNumElements(); // Number of elements of the underlying vector type.
1786 llvm::SmallVector<llvm::Constant *, 16> constantVector;
1787
1788 for(size_t i = 0; i < numElements; i++)
1789 {
1790 constantVector.push_back(llvm::ConstantFP::get(T(type)->getContainedType(0), constants[i % numConstants]));
1791 }
1792
1793 return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant *>(constantVector)));
1794 }
1795
// Emits 'v' as a global string constant and returns a pointer to it.
Value *Nucleus::createConstantString(const char *v)
{
	// NOTE: Do not call RR_DEBUG_INFO_UPDATE_LOC() here to avoid recursion when called from rr::Printv
	auto ptr = jit->builder->CreateGlobalStringPtr(v);
	return V(ptr);
}

// No-op for this backend; kept to satisfy the Nucleus interface.
void Nucleus::setOptimizerCallback(OptimizerCallback *callback)
{
	// The LLVM backend does not produce optimizer reports.
	(void)callback;
}
1808
// The type() methods below return the LLVM type representing each Reactor
// type. Signed and unsigned variants (Byte/SByte, Short/UShort) share the
// same LLVM storage type; signedness is a property of the operations.
Type *Void::type()
{
	return T(llvm::Type::getVoidTy(*jit->context));
}

Type *Bool::type()
{
	return T(llvm::Type::getInt1Ty(*jit->context));
}

Type *Byte::type()
{
	return T(llvm::Type::getInt8Ty(*jit->context));
}

Type *SByte::type()
{
	return T(llvm::Type::getInt8Ty(*jit->context));
}

Type *Short::type()
{
	return T(llvm::Type::getInt16Ty(*jit->context));
}

Type *UShort::type()
{
	return T(llvm::Type::getInt16Ty(*jit->context));
}

// Byte4/SByte4 share the emulated 4 x i8 storage type Type_v4i8.
Type *Byte4::type()
{
	return T(Type_v4i8);
}

Type *SByte4::type()
{
	return T(Type_v4i8);
}
1848
// Saturated unsigned add: lanes clamp to 0xFF instead of wrapping.
RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::paddusb(x, y);
#else
	return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
#endif
}

// Saturated unsigned subtract: lanes clamp to 0x00 instead of wrapping.
RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psubusb(x, y);
#else
	return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
#endif
}

// Gathers the most significant bit of each byte lane into an integer bitmask.
RValue<Int> SignMask(RValue<Byte8> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pmovmskb(x);
#else
	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
#endif
}

//	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
//	{
//#if defined(__i386__) || defined(__x86_64__)
//		return x86::pcmpgtb(x, y);  // FIXME: Signedness
//#else
//		return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
//#endif
//	}

// Per-lane equality: each result byte is 0xFF when equal, 0x00 otherwise.
RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pcmpeqb(x, y);
#else
	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
#endif
}

Type *Byte8::type()
{
	return T(Type_v8i8);
}
1902
// Saturated signed add: lanes clamp to [-128, 127].
RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::paddsb(x, y);
#else
	return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
#endif
}

// Saturated signed subtract: lanes clamp to [-128, 127].
RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psubsb(x, y);
#else
	return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
#endif
}

// Gathers the sign bit of each byte lane into an integer bitmask.
RValue<Int> SignMask(RValue<SByte8> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pmovmskb(As<Byte8>(x));
#else
	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
#endif
}

// Per-lane signed greater-than: 0xFF when x > y, 0x00 otherwise.
RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pcmpgtb(x, y);
#else
	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
#endif
}

// Per-lane equality: 0xFF when equal, 0x00 otherwise.
RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
#else
	return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
#endif
}

Type *SByte8::type()
{
	return T(Type_v8i8);
}

Type *Byte16::type()
{
	return T(llvm::VectorType::get(T(Byte::type()), 16, false));
}

Type *SByte16::type()
{
	return T(llvm::VectorType::get(T(SByte::type()), 16, false));
}

// Short2/UShort2 share the emulated 2 x i16 storage type Type_v2i16.
Type *Short2::type()
{
	return T(Type_v2i16);
}

Type *UShort2::type()
{
	return T(Type_v2i16);
}
1977
// Truncates each 32-bit lane to 16 bits: reinterpret as eight shorts and
// gather the even-indexed ones (the low half of each int on little-endian),
// then narrow the result to a Short4.
Short4::Short4(RValue<Int4> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	std::vector<int> select = { 0, 2, 4, 6, 0, 2, 4, 6 };
	Value *short8 = Nucleus::createBitCast(cast.value(), Short8::type());

	Value *packed = Nucleus::createShuffleVector(short8, short8, select);
	Value *short4 = As<Short4>(Int2(As<Int4>(packed))).value();

	storeValue(short4);
}

//	Short4::Short4(RValue<Float> cast)
//	{
//	}

// Float-to-short conversion: convert to Int4 first, then pack with signed
// saturation so each lane clamps to [-32768, 32767].
Short4::Short4(RValue<Float4> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Int4 v4i32 = Int4(cast);
#if defined(__i386__) || defined(__x86_64__)
	v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
#else
	Value *v = v4i32.loadValue();
	v4i32 = As<Int4>(V(lowerPack(V(v), V(v), true)));
#endif

	storeValue(As<Short4>(Int2(v4i32)).value());
}
2007
// Logical left shift of each 16-bit lane by an immediate count.
RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));

	return x86::psllw(lhs, rhs);
#else
	return As<Short4>(V(lowerVectorShl(V(lhs.value()), rhs)));
#endif
}

// Arithmetic right shift of each 16-bit lane by an immediate count.
RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psraw(lhs, rhs);
#else
	return As<Short4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
#endif
}

// Per-lane signed maximum.
RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pmaxsw(x, y);
#else
	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
#endif
}

// Per-lane signed minimum.
RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pminsw(x, y);
#else
	return RValue<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
#endif
}

// Saturated signed add: lanes clamp to [-32768, 32767].
RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::paddsw(x, y);
#else
	return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
#endif
}

// Saturated signed subtract: lanes clamp to [-32768, 32767].
RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psubsw(x, y);
#else
	return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
#endif
}

// High 16 bits of the 32-bit signed product of each lane pair.
RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pmulhw(x, y);
#else
	return As<Short4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
#endif
}

// Multiplies adjacent 16-bit lane pairs and sums each pair into a 32-bit
// lane (pmaddwd semantics).
RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pmaddwd(x, y);
#else
	return As<Int2>(V(lowerMulAdd(V(x.value()), V(y.value()))));
#endif
}

// Packs two Short4s into eight bytes with signed saturation. The Swizzle
// rearranges the 128-bit intermediate so the packed bytes land in the low
// 64 bits extracted by As<SByte8>.
RValue<SByte8> PackSigned(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	auto result = x86::packsswb(x, y);
#else
	auto result = V(lowerPack(V(x.value()), V(y.value()), true));
#endif
	return As<SByte8>(Swizzle(As<Int4>(result), 0x0202));
}

// Packs two Short4s into eight bytes with unsigned saturation.
RValue<Byte8> PackUnsigned(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	auto result = x86::packuswb(x, y);
#else
	auto result = V(lowerPack(V(x.value()), V(y.value()), false));
#endif
	return As<Byte8>(Swizzle(As<Int4>(result), 0x0202));
}

// Per-lane signed greater-than: 0xFFFF when x > y, 0x0000 otherwise.
RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pcmpgtw(x, y);
#else
	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
#endif
}

// Per-lane equality: 0xFFFF when equal, 0x0000 otherwise.
RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pcmpeqw(x, y);
#else
	return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
#endif
}

Type *Short4::type()
{
	return T(Type_v4i16);
}
2136
// Float-to-UShort4 conversion. With 'saturate', input is clamped to
// [0, 0xFFFF]: on SSE4.1, packusdw handles the low-end clamp, so only the
// high end is clamped in float; otherwise both ends are clamped in float
// before the integer conversion. Without 'saturate', lanes simply truncate
// through the Short4(Int4) path.
UShort4::UShort4(RValue<Float4> cast, bool saturate)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	if(saturate)
	{
#if defined(__i386__) || defined(__x86_64__)
		if(CPUID::supportsSSE4_1())
		{
			Int4 int4(Min(cast, Float4(0xFFFF)));  // packusdw takes care of 0x0000 saturation
			*this = As<Short4>(PackUnsigned(int4, int4));
		}
		else
#endif
		{
			*this = Short4(Int4(Max(Min(cast, Float4(0xFFFF)), Float4(0x0000))));
		}
	}
	else
	{
		*this = Short4(Int4(cast));
	}
}
2159
// Logical left shift per 16-bit lane; left shift is signedness-agnostic,
// so the signed (Short4) implementation is reused on x86.
RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	//	return RValue<Short4>(Nucleus::createShl(lhs.value(), rhs.value()));

	return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
#else
	return As<UShort4>(V(lowerVectorShl(V(lhs.value()), rhs)));
#endif
}

// Logical right shift per 16-bit lane (zero-filling).
RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	//	return RValue<Short4>(Nucleus::createLShr(lhs.value(), rhs.value()));

	return x86::psrlw(lhs, rhs);
#else
	return As<UShort4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
#endif
}

// Unsigned max via the signed Max: biasing both operands by 0x8000 maps
// unsigned order onto signed order; the bias is added back afterwards.
RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
}

// Unsigned min via the signed Min, using the same 0x8000 bias trick.
RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
}

// Saturated unsigned add: lanes clamp to 0xFFFF.
RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::paddusw(x, y);
#else
	return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
#endif
}

// Saturated unsigned subtract: lanes clamp to 0x0000.
RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psubusw(x, y);
#else
	return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
#endif
}

// High 16 bits of the 32-bit unsigned product of each lane pair.
RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pmulhuw(x, y);
#else
	return As<UShort4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
#endif
}

// Rounding average per lane: (x + y + 1) >> 1 (pavgw semantics).
RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pavgw(x, y);
#else
	return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
#endif
}

Type *UShort4::type()
{
	return T(Type_v4i16);
}
2240
// Logical left shift of each 16-bit lane by an immediate count.
RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psllw(lhs, rhs);
#else
	return As<Short8>(V(lowerVectorShl(V(lhs.value()), rhs)));
#endif
}

// Arithmetic right shift of each 16-bit lane by an immediate count.
RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psraw(lhs, rhs);
#else
	return As<Short8>(V(lowerVectorAShr(V(lhs.value()), rhs)));
#endif
}

// Multiplies adjacent 16-bit lane pairs and sums each pair into a 32-bit
// lane (pmaddwd semantics).
RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pmaddwd(x, y);
#else
	return As<Int4>(V(lowerMulAdd(V(x.value()), V(y.value()))));
#endif
}

// High 16 bits of the 32-bit signed product of each lane pair.
RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pmulhw(x, y);
#else
	return As<Short8>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
#endif
}

Type *Short8::type()
{
	return T(llvm::VectorType::get(T(Short::type()), 8, false));
}

// Left shift is signedness-agnostic; reuse the signed (Short8) version.
RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));
#else
	return As<UShort8>(V(lowerVectorShl(V(lhs.value()), rhs)));
#endif
}

// Logical right shift of each 16-bit lane (zero-filling).
RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psrlw(lhs, rhs);  // FIXME: Fallback required
#else
	return As<UShort8>(V(lowerVectorLShr(V(lhs.value()), rhs)));
#endif
}

// High 16 bits of the 32-bit unsigned product of each lane pair.
RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pmulhuw(x, y);
#else
	return As<UShort8>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
#endif
}

Type *UShort8::type()
{
	return T(llvm::VectorType::get(T(UShort::type()), 8, false));
}
2320
operator ++(Int & val,int)2321 RValue<Int> operator++(Int &val, int) // Post-increment
2322 {
2323 RR_DEBUG_INFO_UPDATE_LOC();
2324 RValue<Int> res = val;
2325
2326 Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2327 val.storeValue(inc);
2328
2329 return res;
2330 }
2331
operator ++(Int & val)2332 const Int &operator++(Int &val) // Pre-increment
2333 {
2334 RR_DEBUG_INFO_UPDATE_LOC();
2335 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2336 val.storeValue(inc);
2337
2338 return val;
2339 }
2340
operator --(Int & val,int)2341 RValue<Int> operator--(Int &val, int) // Post-decrement
2342 {
2343 RR_DEBUG_INFO_UPDATE_LOC();
2344 RValue<Int> res = val;
2345
2346 Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2347 val.storeValue(inc);
2348
2349 return res;
2350 }
2351
operator --(Int & val)2352 const Int &operator--(Int &val) // Pre-decrement
2353 {
2354 RR_DEBUG_INFO_UPDATE_LOC();
2355 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2356 val.storeValue(inc);
2357
2358 return val;
2359 }
2360
// Rounds a float to the nearest integer (cvtss2si follows the current x86
// rounding mode, round-to-nearest-even by default).
RValue<Int> RoundInt(RValue<Float> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::cvtss2si(cast);
#else
	return RValue<Int>(V(lowerRoundInt(V(cast.value()), T(Int::type()))));
#endif
}

Type *Int::type()
{
	return T(llvm::Type::getInt32Ty(*jit->context));
}

Type *Long::type()
{
	return T(llvm::Type::getInt64Ty(*jit->context));
}
2380
// Converts a float to an unsigned int via fptoui (behavior for negative or
// out-of-range inputs follows LLVM's fptoui semantics).
UInt::UInt(RValue<Float> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Value *integer = Nucleus::createFPToUI(cast.value(), UInt::type());
	storeValue(integer);
}
2387
operator ++(UInt & val,int)2388 RValue<UInt> operator++(UInt &val, int) // Post-increment
2389 {
2390 RR_DEBUG_INFO_UPDATE_LOC();
2391 RValue<UInt> res = val;
2392
2393 Value *inc = Nucleus::createAdd(res.value(), Nucleus::createConstantInt(1));
2394 val.storeValue(inc);
2395
2396 return res;
2397 }
2398
operator ++(UInt & val)2399 const UInt &operator++(UInt &val) // Pre-increment
2400 {
2401 RR_DEBUG_INFO_UPDATE_LOC();
2402 Value *inc = Nucleus::createAdd(val.loadValue(), Nucleus::createConstantInt(1));
2403 val.storeValue(inc);
2404
2405 return val;
2406 }
2407
operator --(UInt & val,int)2408 RValue<UInt> operator--(UInt &val, int) // Post-decrement
2409 {
2410 RR_DEBUG_INFO_UPDATE_LOC();
2411 RValue<UInt> res = val;
2412
2413 Value *inc = Nucleus::createSub(res.value(), Nucleus::createConstantInt(1));
2414 val.storeValue(inc);
2415
2416 return res;
2417 }
2418
operator --(UInt & val)2419 const UInt &operator--(UInt &val) // Pre-decrement
2420 {
2421 RR_DEBUG_INFO_UPDATE_LOC();
2422 Value *inc = Nucleus::createSub(val.loadValue(), Nucleus::createConstantInt(1));
2423 val.storeValue(inc);
2424
2425 return val;
2426 }
2427
2428 // RValue<UInt> RoundUInt(RValue<Float> cast)
2429 // {
2430 //#if defined(__i386__) || defined(__x86_64__)
2431 // return x86::cvtss2si(val); // FIXME: Unsigned
2432 //#else
2433 // return IfThenElse(cast > 0.0f, Int(cast + 0.5f), Int(cast - 0.5f));
2434 //#endif
2435 // }
2436
// UInt shares Int's i32 storage type; signedness lives in the operations.
Type *UInt::type()
{
	return T(llvm::Type::getInt32Ty(*jit->context));
}
2441
2442 // Int2::Int2(RValue<Int> cast)
2443 // {
2444 // Value *extend = Nucleus::createZExt(cast.value(), Long::type());
2445 // Value *vector = Nucleus::createBitCast(extend, Int2::type());
2446 //
2447 // int shuffle[2] = {0, 0};
2448 // Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
2449 //
2450 // storeValue(replicate);
2451 // }
2452
// Logical left shift of each 32-bit lane by an immediate count.
RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	//	return RValue<Int2>(Nucleus::createShl(lhs.value(), rhs.value()));

	return x86::pslld(lhs, rhs);
#else
	return As<Int2>(V(lowerVectorShl(V(lhs.value()), rhs)));
#endif
}

// Arithmetic right shift of each 32-bit lane (sign-filling).
RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	//	return RValue<Int2>(Nucleus::createAShr(lhs.value(), rhs.value()));

	return x86::psrad(lhs, rhs);
#else
	return As<Int2>(V(lowerVectorAShr(V(lhs.value()), rhs)));
#endif
}

Type *Int2::type()
{
	return T(Type_v2i32);
}

// Left shift is signedness-agnostic; reuse the signed (Int2) version.
RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	//	return RValue<UInt2>(Nucleus::createShl(lhs.value(), rhs.value()));

	return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
#else
	return As<UInt2>(V(lowerVectorShl(V(lhs.value()), rhs)));
#endif
}

// Logical right shift of each 32-bit lane (zero-filling).
RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value(), rhs.value()));

	return x86::psrld(lhs, rhs);
#else
	return As<UInt2>(V(lowerVectorLShr(V(lhs.value()), rhs)));
#endif
}

Type *UInt2::type()
{
	return T(Type_v2i32);
}
2510
// Zero-extends four bytes to four ints: interleave the bytes with a zero
// vector to widen to shorts, then interleave those with zero shorts to
// widen to ints.
Int4::Int4(RValue<Byte4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	std::vector<int> swizzle = { 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23 };
	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
	Value *b = Nucleus::createShuffleVector(a, Nucleus::createNullValue(Byte16::type()), swizzle);

	std::vector<int> swizzle2 = { 0, 8, 1, 9, 2, 10, 3, 11 };
	Value *c = Nucleus::createBitCast(b, Short8::type());
	Value *d = Nucleus::createShuffleVector(c, Nucleus::createNullValue(Short8::type()), swizzle2);

	*this = As<Int4>(d);
}

// Sign-extends four signed bytes to ints: duplicate each byte up to the
// full 32-bit lane width, then arithmetic-shift right by 24 so the sign
// bit fills the upper bits.
Int4::Int4(RValue<SByte4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 };
	Value *a = Nucleus::createBitCast(cast.value(), Byte16::type());
	Value *b = Nucleus::createShuffleVector(a, a, swizzle);

	std::vector<int> swizzle2 = { 0, 0, 1, 1, 2, 2, 3, 3 };
	Value *c = Nucleus::createBitCast(b, Short8::type());
	Value *d = Nucleus::createShuffleVector(c, c, swizzle2);

	*this = As<Int4>(d) >> 24;
}

// Sign-extends four shorts to ints: duplicate each short into both halves
// of a 32-bit lane, then arithmetic-shift right by 16.
Int4::Int4(RValue<Short4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	std::vector<int> swizzle = { 0, 0, 1, 1, 2, 2, 3, 3 };
	Value *c = Nucleus::createShuffleVector(cast.value(), cast.value(), swizzle);
	*this = As<Int4>(c) >> 16;
}

// Zero-extends four unsigned shorts to ints by interleaving with zeros.
Int4::Int4(RValue<UShort4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	std::vector<int> swizzle = { 0, 8, 1, 9, 2, 10, 3, 11 };
	Value *c = Nucleus::createShuffleVector(cast.value(), Short8(0, 0, 0, 0, 0, 0, 0, 0).loadValue(), swizzle);
	*this = As<Int4>(c);
}

// Broadcasts a scalar Int to all four lanes: insert into lane 0, then
// shuffle-splat that lane across the vector.
Int4::Int4(RValue<Int> rhs)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Value *vector = loadValue();
	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);

	std::vector<int> swizzle = { 0, 0, 0, 0 };
	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);

	storeValue(replicate);
}
2571
// Logical left shift of each 32-bit lane by an immediate count.
RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::pslld(lhs, rhs);
#else
	return As<Int4>(V(lowerVectorShl(V(lhs.value()), rhs)));
#endif
}

// Arithmetic right shift of each 32-bit lane (sign-filling).
RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psrad(lhs, rhs);
#else
	return As<Int4>(V(lowerVectorAShr(V(lhs.value()), rhs)));
#endif
}

// The comparisons below produce per-lane masks: the i1 comparison result
// is sign-extended, yielding all-ones (0xFFFFFFFF) when the predicate
// holds and all-zeros otherwise.
RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
}

RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), Int4::type()));
}

RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), Int4::type()));
}

RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
}

// Not-less-than is the signed greater-or-equal predicate.
RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), Int4::type()));
}

// Not-less-or-equal is the signed greater-than predicate.
RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), Int4::type()));
}
2627
Abs(RValue<Int4> x)2628 RValue<Int4> Abs(RValue<Int4> x)
2629 {
2630 #if LLVM_VERSION_MAJOR >= 12
2631 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
2632 return RValue<Int4>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
2633 #else
2634 auto negative = x >> 31;
2635 return (x ^ negative) - negative;
2636 #endif
2637 }
2638
// Lane-wise signed maximum. Uses pmaxsd when SSE4.1 is available; otherwise
// selects via a comparison mask: (x & (x > y)) | (y & ~(x > y)).
RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	if(CPUID::supportsSSE4_1())
	{
		return x86::pmaxsd(x, y);
	}
	else
#endif
	{
		RValue<Int4> greater = CmpNLE(x, y);
		return (x & greater) | (y & ~greater);
	}
}
2654
// Lane-wise signed minimum. Uses pminsd when SSE4.1 is available; otherwise
// selects via a comparison mask: (x & (x < y)) | (y & ~(x < y)).
RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	if(CPUID::supportsSSE4_1())
	{
		return x86::pminsd(x, y);
	}
	else
#endif
	{
		RValue<Int4> less = CmpLT(x, y);
		return (x & less) | (y & ~less);
	}
}
2670
// Converts floats to ints with round-to-nearest. On x86 (unless running under
// MemorySanitizer, which doesn't instrument cvtps2dq correctly) this is a
// single instruction; elsewhere it is lowered generically.
RValue<Int4> RoundInt(RValue<Float4> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
	return x86::cvtps2dq(cast);
#else
	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
#endif
}
2680
// Like RoundInt(), but saturates out-of-range inputs instead of producing
// platform-dependent garbage. Each branch documents its platform's quirk.
RValue<Int4> RoundIntClamped(RValue<Float4> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();

	// TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
#if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
	// cvtps2dq produces 0x80000000, a negative value, for input larger than
	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
	// saturate to 0x80000000.
	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
#elif defined(__arm__) || defined(__aarch64__)
	// ARM saturates to the largest positive or negative integer. Unit tests
	// verify that lowerRoundInt() behaves as desired.
	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
#elif LLVM_VERSION_MAJOR >= 14
	// Round first, then use the saturating conversion intrinsic.
	llvm::Value *rounded = lowerRound(V(cast.value()));
	llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
	    jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(Int4::type()), T(Float4::type()) });
	return RValue<Int4>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
#else
	// Manual clamp to the representable range before converting.
	RValue<Float4> clamped = Max(Min(cast, Float4(0x7FFFFF80)), Float4(static_cast<int>(0x80000000)));
	return As<Int4>(V(lowerRoundInt(V(clamped.value()), T(Int4::type()))));
#endif
}
2705
// Returns the high 32 bits of the lane-wise signed 64-bit product of x and y.
RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
	return As<Int4>(V(lowerMulHigh(V(x.value()), V(y.value()), true)));
}
2712
// Returns the high 32 bits of the lane-wise unsigned 64-bit product of x and y.
RValue<UInt4> MulHigh(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// TODO: For x86, build an intrinsics version of this which uses shuffles + pmuludq.
	return As<UInt4>(V(lowerMulHigh(V(x.value()), V(y.value()), false)));
}
2719
// Narrows two Int4 vectors into one Short8 with signed saturation
// (values clamp to [-32768, 32767]); x supplies the low four lanes.
RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::packssdw(x, y);
#else
	return As<Short8>(V(lowerPack(V(x.value()), V(y.value()), true)));
#endif
}
2729
// Narrows two Int4 vectors into one UShort8 with unsigned saturation
// (values clamp to [0, 65535]); x supplies the low four lanes.
RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::packusdw(x, y);
#else
	return As<UShort8>(V(lowerPack(V(x.value()), V(y.value()), false)));
#endif
}
2739
// Collects the sign bit of each lane into the low 4 bits of a scalar Int.
// On x86 movmskps works on any 4x32-bit vector, hence the Float4 reinterpret.
RValue<Int> SignMask(RValue<Int4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::movmskps(As<Float4>(x));
#else
	return As<Int>(V(lowerSignMask(V(x.value()), T(Int::type()))));
#endif
}
2749
type()2750 Type *Int4::type()
2751 {
2752 return T(llvm::VectorType::get(T(Int::type()), 4, false));
2753 }
2754
// Converts floats to unsigned ints via LLVM fptoui.
// NOTE(review): fptoui on negative or out-of-range inputs yields poison in
// LLVM — callers presumably guarantee the input range; verify at call sites.
UInt4::UInt4(RValue<Float4> cast)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Value *xyzw = Nucleus::createFPToUI(cast.value(), UInt4::type());
	storeValue(xyzw);
}
2762
// Broadcast constructor: replicates a scalar UInt into all four lanes.
UInt4::UInt4(RValue<UInt> rhs)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Value *vector = loadValue();
	// Insert the scalar into lane 0, then splat it with a {0,0,0,0} shuffle.
	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);

	std::vector<int> swizzle = { 0, 0, 0, 0 };
	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);

	storeValue(replicate);
}
2775
// Lane-wise left shift by an immediate count. pslld is sign-agnostic, so the
// signed x86 helper is reused via bit-reinterpretation.
RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
#else
	return As<UInt4>(V(lowerVectorShl(V(lhs.value()), rhs)));
#endif
}
2785
// Lane-wise logical (zero-filling) right shift by an immediate count.
RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::psrld(lhs, rhs);
#else
	return As<UInt4>(V(lowerVectorLShr(V(lhs.value()), rhs)));
#endif
}
2795
CmpEQ(RValue<UInt4> x,RValue<UInt4> y)2796 RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
2797 {
2798 RR_DEBUG_INFO_UPDATE_LOC();
2799 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), Int4::type()));
2800 }
2801
CmpLT(RValue<UInt4> x,RValue<UInt4> y)2802 RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
2803 {
2804 RR_DEBUG_INFO_UPDATE_LOC();
2805 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), Int4::type()));
2806 }
2807
CmpLE(RValue<UInt4> x,RValue<UInt4> y)2808 RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
2809 {
2810 RR_DEBUG_INFO_UPDATE_LOC();
2811 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), Int4::type()));
2812 }
2813
CmpNEQ(RValue<UInt4> x,RValue<UInt4> y)2814 RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
2815 {
2816 RR_DEBUG_INFO_UPDATE_LOC();
2817 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), Int4::type()));
2818 }
2819
CmpNLT(RValue<UInt4> x,RValue<UInt4> y)2820 RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
2821 {
2822 RR_DEBUG_INFO_UPDATE_LOC();
2823 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), Int4::type()));
2824 }
2825
CmpNLE(RValue<UInt4> x,RValue<UInt4> y)2826 RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
2827 {
2828 RR_DEBUG_INFO_UPDATE_LOC();
2829 return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), Int4::type()));
2830 }
2831
// Lane-wise unsigned maximum. Uses pmaxud when SSE4.1 is available; otherwise
// selects via an unsigned comparison mask.
RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	if(CPUID::supportsSSE4_1())
	{
		return x86::pmaxud(x, y);
	}
	else
#endif
	{
		RValue<UInt4> greater = CmpNLE(x, y);
		return (x & greater) | (y & ~greater);
	}
}
2847
// Lane-wise unsigned minimum. Uses pminud when SSE4.1 is available; otherwise
// selects via an unsigned comparison mask.
RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	if(CPUID::supportsSSE4_1())
	{
		return x86::pminud(x, y);
	}
	else
#endif
	{
		RValue<UInt4> less = CmpLT(x, y);
		return (x & less) | (y & ~less);
	}
}
2863
type()2864 Type *UInt4::type()
2865 {
2866 return T(llvm::VectorType::get(T(UInt::type()), 4, false));
2867 }
2868
// Half-precision floats are stored as raw 16-bit integers; conversions are
// done explicitly elsewhere rather than using LLVM's half type.
Type *Half::type()
{
	return T(llvm::Type::getInt16Ty(*jit->context));
}
2873
// True when the target provides a fast reciprocal approximation (x86 rcpps/rcpss).
bool HasRcpApprox()
{
#if defined(__i386__) || defined(__x86_64__)
	return true;
#else
	return false;
#endif
}
2882
// Fast approximate lane-wise reciprocal. Only callable when HasRcpApprox()
// returns true; other platforms hit UNREACHABLE.
RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
{
#if defined(__i386__) || defined(__x86_64__)
	if(exactAtPow2)
	{
		// rcpps uses a piecewise-linear approximation which minimizes the relative error
		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
	}
	return x86::rcpps(x);
#else
	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
	return { 0.0f };
#endif
}
2898
RcpApprox(RValue<Float> x,bool exactAtPow2)2899 RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
2900 {
2901 #if defined(__i386__) || defined(__x86_64__)
2902 if(exactAtPow2)
2903 {
2904 // rcpss uses a piecewise-linear approximation which minimizes the relative error
2905 // but is not exact at power-of-two values. Rectify by multiplying by the inverse.
2906 return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
2907 }
2908 return x86::rcpss(x);
2909 #else
2910 UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
2911 return { 0.0f };
2912 #endif
2913 }
2914
// True when the target provides a fast reciprocal square root approximation
// (x86 rsqrtps/rsqrtss).
bool HasRcpSqrtApprox()
{
#if defined(__i386__) || defined(__x86_64__)
	return true;
#else
	return false;
#endif
}
2923
// Fast approximate lane-wise 1/sqrt(x). Only callable when HasRcpSqrtApprox()
// returns true; other platforms hit UNREACHABLE.
RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
{
#if defined(__i386__) || defined(__x86_64__)
	return x86::rsqrtps(x);
#else
	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
	return { 0.0f };
#endif
}
2933
RcpSqrtApprox(RValue<Float> x)2934 RValue<Float> RcpSqrtApprox(RValue<Float> x)
2935 {
2936 #if defined(__i386__) || defined(__x86_64__)
2937 return x86::rsqrtss(x);
2938 #else
2939 UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
2940 return { 0.0f };
2941 #endif
2942 }
2943
// Scalar square root (exact, not the approximate rsqrt path).
RValue<Float> Sqrt(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::sqrtss(x);
#else
	return As<Float>(V(lowerSQRT(V(x.value()))));
#endif
}
2953
// Rounds to nearest integer-valued float. imm 0 selects round-to-nearest in
// roundss; pre-SSE4.1 falls back to the vector Round and extracts lane x.
RValue<Float> Round(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	if(CPUID::supportsSSE4_1())
	{
		return x86::roundss(x, 0);
	}
	else
	{
		return Float4(Round(Float4(x))).x;
	}
#else
	return RValue<Float>(V(lowerRound(V(x.value()))));
#endif
}
2970
// Rounds toward zero. imm 3 selects truncation in roundss; pre-SSE4.1 uses
// a float->int->float round trip, which truncates by definition.
RValue<Float> Trunc(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	if(CPUID::supportsSSE4_1())
	{
		return x86::roundss(x, 3);
	}
	else
	{
		return Float(Int(x));  // Rounded toward zero
	}
#else
	return RValue<Float>(V(lowerTrunc(V(x.value()))));
#endif
}
2987
// Fractional part: x - floor(x), always in [0, 1).
RValue<Float> Frac(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	if(CPUID::supportsSSE4_1())
	{
		return x - x86::floorss(x);
	}
	else
	{
		return Float4(Frac(Float4(x))).x;
	}
#else
	// x - floor(x) can be 1.0 for very small negative x.
	// Clamp against the value just below 1.0.
	return Min(x - Floor(x), As<Float>(Int(0x3F7FFFFF)));
#endif
}
3006
// Rounds toward negative infinity.
RValue<Float> Floor(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	if(CPUID::supportsSSE4_1())
	{
		return x86::floorss(x);
	}
	else
	{
		// Pre-SSE4.1: use the vector fallback and extract lane x.
		return Float4(Floor(Float4(x))).x;
	}
#else
	return RValue<Float>(V(lowerFloor(V(x.value()))));
#endif
}
3023
// Rounds toward positive infinity.
RValue<Float> Ceil(RValue<Float> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	if(CPUID::supportsSSE4_1())
	{
		return x86::ceilss(x);
	}
	else
#endif
	{
		// Shared fallback: the vector Ceil handles non-SSE4.1 and non-x86 alike.
		return Float4(Ceil(Float4(x))).x;
	}
}
3038
// The LLVM type backing Float: a scalar 32-bit float.
Type *Float::type()
{
	return T(llvm::Type::getFloatTy(*jit->context));
}
3043
// The LLVM type backing Float2: the backend's cached <2 x float> type.
Type *Float2::type()
{
	return T(Type_v2f32);
}
3048
// Broadcast constructor: replicates a scalar Float into all four lanes.
Float4::Float4(RValue<Float> rhs)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Value *vector = loadValue();
	// Insert the scalar into lane 0, then splat it with a {0,0,0,0} shuffle.
	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);

	std::vector<int> swizzle = { 0, 0, 0, 0 };
	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);

	storeValue(replicate);
}
3061
MulAdd(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3062 RValue<Float4> MulAdd(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3063 {
3064 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(Float4::type()) });
3065 return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3066 }
3067
FMA(RValue<Float4> x,RValue<Float4> y,RValue<Float4> z)3068 RValue<Float4> FMA(RValue<Float4> x, RValue<Float4> y, RValue<Float4> z)
3069 {
3070 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(Float4::type()) });
3071 return RValue<Float4>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
3072 }
3073
Abs(RValue<Float4> x)3074 RValue<Float4> Abs(RValue<Float4> x)
3075 {
3076 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
3077 return RValue<Float4>(V(jit->builder->CreateCall(func, V(x.value()))));
3078 }
3079
// Lane-wise maximum with x86 maxps semantics: if either operand is NaN (or
// both are +/-0), the second operand (y) is returned.
RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::maxps(x, y);
#else
	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
#endif
}
3089
// Lane-wise minimum with x86 minps semantics: if either operand is NaN (or
// both are +/-0), the second operand (y) is returned.
RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::minps(x, y);
#else
	return As<Float4>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
#endif
}
3099
// Lane-wise square root (exact, not the approximate rsqrt path).
RValue<Float4> Sqrt(RValue<Float4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::sqrtps(x);
#else
	return As<Float4>(V(lowerSQRT(V(x.value()))));
#endif
}
3109
// Collects the sign bit of each lane into the low 4 bits of a scalar Int.
RValue<Int> SignMask(RValue<Float4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if defined(__i386__) || defined(__x86_64__)
	return x86::movmskps(x);
#else
	return As<Int>(V(lowerFPSignMask(V(x.value()), T(Int::type()))));
#endif
}
3119
CmpEQ(RValue<Float4> x,RValue<Float4> y)3120 RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
3121 {
3122 RR_DEBUG_INFO_UPDATE_LOC();
3123 // return As<Int4>(x86::cmpeqps(x, y));
3124 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), Int4::type()));
3125 }
3126
CmpLT(RValue<Float4> x,RValue<Float4> y)3127 RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
3128 {
3129 RR_DEBUG_INFO_UPDATE_LOC();
3130 // return As<Int4>(x86::cmpltps(x, y));
3131 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), Int4::type()));
3132 }
3133
CmpLE(RValue<Float4> x,RValue<Float4> y)3134 RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
3135 {
3136 RR_DEBUG_INFO_UPDATE_LOC();
3137 // return As<Int4>(x86::cmpleps(x, y));
3138 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), Int4::type()));
3139 }
3140
CmpNEQ(RValue<Float4> x,RValue<Float4> y)3141 RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
3142 {
3143 RR_DEBUG_INFO_UPDATE_LOC();
3144 // return As<Int4>(x86::cmpneqps(x, y));
3145 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), Int4::type()));
3146 }
3147
CmpNLT(RValue<Float4> x,RValue<Float4> y)3148 RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
3149 {
3150 RR_DEBUG_INFO_UPDATE_LOC();
3151 // return As<Int4>(x86::cmpnltps(x, y));
3152 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), Int4::type()));
3153 }
3154
CmpNLE(RValue<Float4> x,RValue<Float4> y)3155 RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
3156 {
3157 RR_DEBUG_INFO_UPDATE_LOC();
3158 // return As<Int4>(x86::cmpnleps(x, y));
3159 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), Int4::type()));
3160 }
3161
CmpUEQ(RValue<Float4> x,RValue<Float4> y)3162 RValue<Int4> CmpUEQ(RValue<Float4> x, RValue<Float4> y)
3163 {
3164 RR_DEBUG_INFO_UPDATE_LOC();
3165 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), Int4::type()));
3166 }
3167
CmpULT(RValue<Float4> x,RValue<Float4> y)3168 RValue<Int4> CmpULT(RValue<Float4> x, RValue<Float4> y)
3169 {
3170 RR_DEBUG_INFO_UPDATE_LOC();
3171 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), Int4::type()));
3172 }
3173
CmpULE(RValue<Float4> x,RValue<Float4> y)3174 RValue<Int4> CmpULE(RValue<Float4> x, RValue<Float4> y)
3175 {
3176 RR_DEBUG_INFO_UPDATE_LOC();
3177 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), Int4::type()));
3178 }
3179
CmpUNEQ(RValue<Float4> x,RValue<Float4> y)3180 RValue<Int4> CmpUNEQ(RValue<Float4> x, RValue<Float4> y)
3181 {
3182 RR_DEBUG_INFO_UPDATE_LOC();
3183 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), Int4::type()));
3184 }
3185
CmpUNLT(RValue<Float4> x,RValue<Float4> y)3186 RValue<Int4> CmpUNLT(RValue<Float4> x, RValue<Float4> y)
3187 {
3188 RR_DEBUG_INFO_UPDATE_LOC();
3189 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), Int4::type()));
3190 }
3191
CmpUNLE(RValue<Float4> x,RValue<Float4> y)3192 RValue<Int4> CmpUNLE(RValue<Float4> x, RValue<Float4> y)
3193 {
3194 RR_DEBUG_INFO_UPDATE_LOC();
3195 return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), Int4::type()));
3196 }
3197
// Rounds each lane to the nearest integer-valued float. imm 0 selects
// round-to-nearest in roundps; the MSan exclusion mirrors cvtps2dq/roundps
// not being correctly instrumented (b/172238865).
RValue<Float4> Round(RValue<Float4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
	if(CPUID::supportsSSE4_1())
	{
		return x86::roundps(x, 0);
	}
	else
	{
		// Pre-SSE4.1: round via float->int->float conversion.
		return Float4(RoundInt(x));
	}
#else
	return RValue<Float4>(V(lowerRound(V(x.value()))));
#endif
}
3214
// Rounds each lane toward zero. imm 3 selects truncation in roundps;
// pre-SSE4.1 truncates via the Int4 conversion round trip.
RValue<Float4> Trunc(RValue<Float4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
	if(CPUID::supportsSSE4_1())
	{
		return x86::roundps(x, 3);
	}
	else
	{
		return Float4(Int4(x));
	}
#else
	return RValue<Float4>(V(lowerTrunc(V(x.value()))));
#endif
}
3231
// Lane-wise fractional part: x - floor(x), clamped into [0, 1).
RValue<Float4> Frac(RValue<Float4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	Float4 frc;

#if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
	if(CPUID::supportsSSE4_1())
	{
		frc = x - x86::floorps(x);
	}
	else
	{
		frc = x - Float4(Int4(x));  // Signed fractional part.

		frc += As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1.0f)));  // Add 1.0 if negative.
	}
#else
	frc = x - Floor(x);
#endif

	// x - floor(x) can be 1.0 for very small negative x.
	// Clamp against the value just below 1.0.
	return Min(frc, As<Float4>(Int4(0x3F7FFFFF)));
}
3256
// Rounds each lane toward negative infinity.
RValue<Float4> Floor(RValue<Float4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
	if(CPUID::supportsSSE4_1())
	{
		return x86::floorps(x);
	}
	else
	{
		// Pre-SSE4.1: floor(x) = x - frac(x).
		return x - Frac(x);
	}
#else
	return RValue<Float4>(V(lowerFloor(V(x.value()))));
#endif
}
3273
// Rounds each lane toward positive infinity.
RValue<Float4> Ceil(RValue<Float4> x)
{
	RR_DEBUG_INFO_UPDATE_LOC();
#if(defined(__i386__) || defined(__x86_64__)) && !__has_feature(memory_sanitizer)
	if(CPUID::supportsSSE4_1())
	{
		return x86::ceilps(x);
	}
	else
#endif
	{
		// Shared fallback: ceil(x) = -floor(-x).
		return -Floor(-x);
	}
}
3288
// Count leading zeros via llvm.ctlz. The second operand tells LLVM whether
// an input of 0 is undefined (allows a bare BSR/CLZ) or must return 32.
RValue<UInt> Ctlz(RValue<UInt> v, bool isZeroUndef)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt::type()) });
	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
}
3296
// Lane-wise count of leading zeros via llvm.ctlz (see scalar overload for
// the is_zero_undef semantics).
RValue<UInt4> Ctlz(RValue<UInt4> v, bool isZeroUndef)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::ctlz, { T(UInt4::type()) });
	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
}
3304
// Count trailing zeros via llvm.cttz. The second operand tells LLVM whether
// an input of 0 is undefined (allows a bare BSF/CTZ) or must return 32.
RValue<UInt> Cttz(RValue<UInt> v, bool isZeroUndef)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt::type()) });
	return RValue<UInt>(V(jit->builder->CreateCall(func, { V(v.value()),
	                                                       isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
}
3312
// Lane-wise count of trailing zeros via llvm.cttz (see scalar overload for
// the is_zero_undef semantics).
RValue<UInt4> Cttz(RValue<UInt4> v, bool isZeroUndef)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::cttz, { T(UInt4::type()) });
	return RValue<UInt4>(V(jit->builder->CreateCall(func, { V(v.value()),
	                                                        isZeroUndef ? llvm::ConstantInt::getTrue(*jit->context) : llvm::ConstantInt::getFalse(*jit->context) })));
}
3320
MinAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3321 RValue<Int> MinAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3322 {
3323 return RValue<Int>(Nucleus::createAtomicMin(x.value(), y.value(), memoryOrder));
3324 }
3325
MinAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3326 RValue<UInt> MinAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3327 {
3328 return RValue<UInt>(Nucleus::createAtomicUMin(x.value(), y.value(), memoryOrder));
3329 }
3330
MaxAtomic(RValue<Pointer<Int>> x,RValue<Int> y,std::memory_order memoryOrder)3331 RValue<Int> MaxAtomic(RValue<Pointer<Int>> x, RValue<Int> y, std::memory_order memoryOrder)
3332 {
3333 return RValue<Int>(Nucleus::createAtomicMax(x.value(), y.value(), memoryOrder));
3334 }
3335
MaxAtomic(RValue<Pointer<UInt>> x,RValue<UInt> y,std::memory_order memoryOrder)3336 RValue<UInt> MaxAtomic(RValue<Pointer<UInt>> x, RValue<UInt> y, std::memory_order memoryOrder)
3337 {
3338 return RValue<UInt>(Nucleus::createAtomicUMax(x.value(), y.value(), memoryOrder));
3339 }
3340
type()3341 Type *Float4::type()
3342 {
3343 return T(llvm::VectorType::get(T(Float::type()), 4, false));
3344 }
3345
// Reads the CPU cycle counter (llvm.readcyclecounter, e.g. RDTSC on x86).
RValue<Long> Ticks()
{
	RR_DEBUG_INFO_UPDATE_LOC();
	llvm::Function *rdtsc = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::readcyclecounter);

	return RValue<Long>(V(jit->builder->CreateCall(rdtsc)));
}
3353
// Embeds a host pointer into the generated code as an inttoptr constant.
// The pointed-to object must outlive every execution of the routine.
RValue<Pointer<Byte>> ConstantPointer(const void *ptr)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Note: this should work for 32-bit pointers as well because 'inttoptr'
	// is defined to truncate (and zero extend) if necessary.
	auto ptrAsInt = llvm::ConstantInt::get(llvm::Type::getInt64Ty(*jit->context), reinterpret_cast<uintptr_t>(ptr));
	return RValue<Pointer<Byte>>(V(jit->builder->CreateIntToPtr(ptrAsInt, T(Pointer<Byte>::type()))));
}
3362
// Copies 'size' bytes into a private constant global in the module and
// returns a pointer to it — unlike ConstantPointer(), the caller's buffer
// need not outlive the routine. Embedded NUL bytes are preserved since the
// std::string carries an explicit length.
RValue<Pointer<Byte>> ConstantData(const void *data, size_t size)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	auto str = ::std::string(reinterpret_cast<const char *>(data), size);
	auto ptr = jit->builder->CreateGlobalStringPtr(str);
	return RValue<Pointer<Byte>>(V(ptr));
}
3370
// Emits an indirect call through 'fptr' with the given return type and
// argument types/values. Returns the call's result Value.
Value *Call(RValue<Pointer<Byte>> fptr, Type *retTy, std::initializer_list<Value *> args, std::initializer_list<Type *> argTys)
{
	// If this is a MemorySanitizer build, but Reactor routine instrumentation is not enabled,
	// mark all call arguments as initialized by calling __msan_unpoison_param().
	if(__has_feature(memory_sanitizer) && !jit->msanInstrumentation)
	{
		// void __msan_unpoison_param(size_t n)
		auto voidTy = llvm::Type::getVoidTy(*jit->context);
		auto sizetTy = llvm::IntegerType::get(*jit->context, sizeof(size_t) * 8);
		auto funcTy = llvm::FunctionType::get(voidTy, { sizetTy }, false);
		auto func = jit->module->getOrInsertFunction("__msan_unpoison_param", funcTy);

		jit->builder->CreateCall(func, { llvm::ConstantInt::get(sizetTy, args.size()) });
	}

	RR_DEBUG_INFO_UPDATE_LOC();
	// Build the callee's function type from the declared argument types.
	llvm::SmallVector<llvm::Type *, 8> paramTys;
	for(auto ty : argTys) { paramTys.push_back(T(ty)); }
	auto funcTy = llvm::FunctionType::get(T(retTy), paramTys, false);

	// Cast the raw byte pointer to the matching function pointer type.
	auto funcPtrTy = funcTy->getPointerTo();
	auto funcPtr = jit->builder->CreatePointerCast(V(fptr.value()), funcPtrTy);

	llvm::SmallVector<llvm::Value *, 8> arguments;
	for(auto arg : args) { arguments.push_back(V(arg)); }
	return V(jit->builder->CreateCall(funcTy, funcPtr, arguments));
}
3398
// Emits a debugger trap (llvm.debugtrap, e.g. int3 on x86) at this point in
// the generated routine.
void Breakpoint()
{
	RR_DEBUG_INFO_UPDATE_LOC();
	llvm::Function *debugtrap = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::debugtrap);

	jit->builder->CreateCall(debugtrap);
}
3406
3407 } // namespace rr
3408
3409 namespace rr {
3410
3411 #if defined(__i386__) || defined(__x86_64__)
3412 namespace x86 {
3413
3414 // Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
3415 // implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
// Differs from IRBuilder<>::CreateUnaryIntrinsic() in that it only accepts native instruction intrinsics which have
// implicit types, such as 'x86_sse_rcp_ps' operating on v4f32, while 'sqrt' requires explicitly specifying the operand type.
static Value *createInstruction(llvm::Intrinsic::ID id, Value *x)
{
	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);

	return V(jit->builder->CreateCall(intrinsic, V(x)));
}
3422
3423 // Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
3424 // implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
// Differs from IRBuilder<>::CreateBinaryIntrinsic() in that it only accepts native instruction intrinsics which have
// implicit types, such as 'x86_sse_max_ps' operating on v4f32, while 'sadd_sat' requires explicitly specifying the operand types.
static Value *createInstruction(llvm::Intrinsic::ID id, Value *x, Value *y)
{
	llvm::Function *intrinsic = llvm::Intrinsic::getDeclaration(jit->module.get(), id);

	return V(jit->builder->CreateCall(intrinsic, { V(x), V(y) }));
}
3431
// Scalar float-to-int conversion. cvtss2si takes a v4f32 operand, so the
// scalar is placed in lane x first. Rounding follows the current MXCSR mode
// (round-to-nearest by default).
RValue<Int> cvtss2si(RValue<Float> val)
{
	Float4 vector;
	vector.x = val;

	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_cvtss2si, RValue<Float4>(vector).value()));
}
3439
// Packed float-to-int conversion with MXCSR rounding (round-to-nearest by
// default). Out-of-range lanes produce 0x80000000.
RValue<Int4> cvtps2dq(RValue<Float4> val)
{
	ASSERT(!__has_feature(memory_sanitizer));  // TODO(b/172238865): Not correctly instrumented by MemorySanitizer.

	return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_cvtps2dq, val.value()));
}
3446
// Scalar approximate reciprocal: widen to a vector for the ss instruction,
// then extract lane 0. Upper lanes are undef and ignored.
RValue<Float> rcpss(RValue<Float> val)
{
	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);

	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rcp_ss, vector), Float::type(), 0));
}
3453
// Scalar square root via the generic llvm.sqrt intrinsic (lowers to sqrtss on x86).
RValue<Float> sqrtss(RValue<Float> val)
{
	return RValue<Float>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
}
3458
// Scalar approximate reciprocal square root: widen to a vector for the ss
// instruction, then extract lane 0. Upper lanes are undef and ignored.
RValue<Float> rsqrtss(RValue<Float> val)
{
	Value *vector = Nucleus::createInsertElement(V(llvm::UndefValue::get(T(Float4::type()))), val.value(), 0);

	return RValue<Float>(Nucleus::createExtractElement(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ss, vector), Float::type(), 0));
}
3465
// Packed approximate reciprocal (SSE rcpps).
RValue<Float4> rcpps(RValue<Float4> val)
{
	return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rcp_ps, val.value()));
}
3470
sqrtps(RValue<Float4> val)3471 RValue<Float4> sqrtps(RValue<Float4> val)
3472 {
3473 return RValue<Float4>(V(jit->builder->CreateUnaryIntrinsic(llvm::Intrinsic::sqrt, V(val.value()))));
3474 }
3475
rsqrtps(RValue<Float4> val)3476 RValue<Float4> rsqrtps(RValue<Float4> val)
3477 {
3478 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_rsqrt_ps, val.value()));
3479 }
3480
maxps(RValue<Float4> x,RValue<Float4> y)3481 RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
3482 {
3483 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_max_ps, x.value(), y.value()));
3484 }
3485
minps(RValue<Float4> x,RValue<Float4> y)3486 RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
3487 {
3488 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse_min_ps, x.value(), y.value()));
3489 }
3490
roundss(RValue<Float> val,unsigned char imm)3491 RValue<Float> roundss(RValue<Float> val, unsigned char imm)
3492 {
3493 llvm::Function *roundss = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::x86_sse41_round_ss);
3494
3495 Value *undef = V(llvm::UndefValue::get(T(Float4::type())));
3496 Value *vector = Nucleus::createInsertElement(undef, val.value(), 0);
3497
3498 return RValue<Float>(Nucleus::createExtractElement(V(jit->builder->CreateCall(roundss, { V(undef), V(vector), V(Nucleus::createConstantInt(imm)) })), Float::type(), 0));
3499 }
3500
RValue<Float> floorss(RValue<Float> val)
{
	// Rounding-mode immediate 1 = round toward negative infinity (floor).
	return roundss(val, 1);
}

RValue<Float> ceilss(RValue<Float> val)
{
	// Rounding-mode immediate 2 = round toward positive infinity (ceil).
	return roundss(val, 2);
}
3510
roundps(RValue<Float4> val,unsigned char imm)3511 RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
3512 {
3513 ASSERT(!__has_feature(memory_sanitizer)); // TODO(b/172238865): Not correctly instrumented by MemorySanitizer.
3514
3515 return RValue<Float4>(createInstruction(llvm::Intrinsic::x86_sse41_round_ps, val.value(), Nucleus::createConstantInt(imm)));
3516 }
3517
RValue<Float4> floorps(RValue<Float4> val)
{
	// Rounding-mode immediate 1 = round toward negative infinity (floor).
	return roundps(val, 1);
}

RValue<Float4> ceilps(RValue<Float4> val)
{
	// Rounding-mode immediate 2 = round toward positive infinity (ceil).
	return roundps(val, 2);
}
3527
paddsw(RValue<Short4> x,RValue<Short4> y)3528 RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
3529 {
3530 return As<Short4>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3531 }
3532
psubsw(RValue<Short4> x,RValue<Short4> y)3533 RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
3534 {
3535 return As<Short4>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3536 }
3537
paddusw(RValue<UShort4> x,RValue<UShort4> y)3538 RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
3539 {
3540 return As<UShort4>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3541 }
3542
psubusw(RValue<UShort4> x,RValue<UShort4> y)3543 RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
3544 {
3545 return As<UShort4>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3546 }
3547
paddsb(RValue<SByte8> x,RValue<SByte8> y)3548 RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
3549 {
3550 return As<SByte8>(V(lowerPSADDSAT(V(x.value()), V(y.value()))));
3551 }
3552
psubsb(RValue<SByte8> x,RValue<SByte8> y)3553 RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
3554 {
3555 return As<SByte8>(V(lowerPSSUBSAT(V(x.value()), V(y.value()))));
3556 }
3557
paddusb(RValue<Byte8> x,RValue<Byte8> y)3558 RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
3559 {
3560 return As<Byte8>(V(lowerPUADDSAT(V(x.value()), V(y.value()))));
3561 }
3562
psubusb(RValue<Byte8> x,RValue<Byte8> y)3563 RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
3564 {
3565 return As<Byte8>(V(lowerPUSUBSAT(V(x.value()), V(y.value()))));
3566 }
3567
pavgw(RValue<UShort4> x,RValue<UShort4> y)3568 RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
3569 {
3570 return As<UShort4>(V(lowerPAVG(V(x.value()), V(y.value()))));
3571 }
3572
pmaxsw(RValue<Short4> x,RValue<Short4> y)3573 RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
3574 {
3575 return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3576 }
3577
pminsw(RValue<Short4> x,RValue<Short4> y)3578 RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
3579 {
3580 return As<Short4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3581 }
3582
pcmpgtw(RValue<Short4> x,RValue<Short4> y)3583 RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
3584 {
3585 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Short4::type()))));
3586 }
3587
pcmpeqw(RValue<Short4> x,RValue<Short4> y)3588 RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
3589 {
3590 return As<Short4>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Short4::type()))));
3591 }
3592
pcmpgtb(RValue<SByte8> x,RValue<SByte8> y)3593 RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
3594 {
3595 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_SGT, V(x.value()), V(y.value()), T(Byte8::type()))));
3596 }
3597
pcmpeqb(RValue<Byte8> x,RValue<Byte8> y)3598 RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
3599 {
3600 return As<Byte8>(V(lowerPCMP(llvm::ICmpInst::ICMP_EQ, V(x.value()), V(y.value()), T(Byte8::type()))));
3601 }
3602
packssdw(RValue<Int2> x,RValue<Int2> y)3603 RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
3604 {
3605 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3606 }
3607
packssdw(RValue<Int4> x,RValue<Int4> y)3608 RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
3609 {
3610 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_packssdw_128, x.value(), y.value()));
3611 }
3612
packsswb(RValue<Short4> x,RValue<Short4> y)3613 RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
3614 {
3615 return As<SByte8>(createInstruction(llvm::Intrinsic::x86_sse2_packsswb_128, x.value(), y.value()));
3616 }
3617
packuswb(RValue<Short4> x,RValue<Short4> y)3618 RValue<Byte8> packuswb(RValue<Short4> x, RValue<Short4> y)
3619 {
3620 return As<Byte8>(createInstruction(llvm::Intrinsic::x86_sse2_packuswb_128, x.value(), y.value()));
3621 }
3622
// Unsigned saturating pack of signed 32-bit lanes into unsigned 16-bit lanes.
RValue<UShort8> packusdw(RValue<Int4> x, RValue<Int4> y)
{
	if(CPUID::supportsSSE4_1())
	{
		// Native SSE4.1 instruction.
		return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse41_packusdw, x.value(), y.value()));
	}
	else
	{
		// SSE2 fallback using the signed pack:
		// 1. Clamp negative lanes to zero: x & ~(x >> 31).
		// 2. Bias into signed 16-bit range by subtracting 0x8000.
		// 3. Signed saturating pack, then add the bias back (as unsigned).
		RValue<Int4> bx = (x & ~(x >> 31)) - Int4(0x8000);
		RValue<Int4> by = (y & ~(y >> 31)) - Int4(0x8000);

		return As<UShort8>(packssdw(bx, by) + Short8(0x8000u));
	}
}
3637
psrlw(RValue<UShort4> x,unsigned char y)3638 RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
3639 {
3640 return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3641 }
3642
psrlw(RValue<UShort8> x,unsigned char y)3643 RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
3644 {
3645 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_w, x.value(), Nucleus::createConstantInt(y)));
3646 }
3647
psraw(RValue<Short4> x,unsigned char y)3648 RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
3649 {
3650 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3651 }
3652
psraw(RValue<Short8> x,unsigned char y)3653 RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
3654 {
3655 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_w, x.value(), Nucleus::createConstantInt(y)));
3656 }
3657
psllw(RValue<Short4> x,unsigned char y)3658 RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
3659 {
3660 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3661 }
3662
psllw(RValue<Short8> x,unsigned char y)3663 RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
3664 {
3665 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_w, x.value(), Nucleus::createConstantInt(y)));
3666 }
3667
pslld(RValue<Int2> x,unsigned char y)3668 RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
3669 {
3670 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3671 }
3672
pslld(RValue<Int4> x,unsigned char y)3673 RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
3674 {
3675 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pslli_d, x.value(), Nucleus::createConstantInt(y)));
3676 }
3677
psrad(RValue<Int2> x,unsigned char y)3678 RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
3679 {
3680 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3681 }
3682
psrad(RValue<Int4> x,unsigned char y)3683 RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
3684 {
3685 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_psrai_d, x.value(), Nucleus::createConstantInt(y)));
3686 }
3687
psrld(RValue<UInt2> x,unsigned char y)3688 RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
3689 {
3690 return As<UInt2>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3691 }
3692
psrld(RValue<UInt4> x,unsigned char y)3693 RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
3694 {
3695 return RValue<UInt4>(createInstruction(llvm::Intrinsic::x86_sse2_psrli_d, x.value(), Nucleus::createConstantInt(y)));
3696 }
3697
pmaxsd(RValue<Int4> x,RValue<Int4> y)3698 RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
3699 {
3700 return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SGT)));
3701 }
3702
pminsd(RValue<Int4> x,RValue<Int4> y)3703 RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
3704 {
3705 return RValue<Int4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_SLT)));
3706 }
3707
pmaxud(RValue<UInt4> x,RValue<UInt4> y)3708 RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
3709 {
3710 return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_UGT)));
3711 }
3712
pminud(RValue<UInt4> x,RValue<UInt4> y)3713 RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
3714 {
3715 return RValue<UInt4>(V(lowerPMINMAX(V(x.value()), V(y.value()), llvm::ICmpInst::ICMP_ULT)));
3716 }
3717
pmulhw(RValue<Short4> x,RValue<Short4> y)3718 RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
3719 {
3720 return As<Short4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3721 }
3722
pmulhuw(RValue<UShort4> x,RValue<UShort4> y)3723 RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
3724 {
3725 return As<UShort4>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3726 }
3727
pmaddwd(RValue<Short4> x,RValue<Short4> y)3728 RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
3729 {
3730 return As<Int2>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3731 }
3732
pmulhw(RValue<Short8> x,RValue<Short8> y)3733 RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
3734 {
3735 return RValue<Short8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulh_w, x.value(), y.value()));
3736 }
3737
pmulhuw(RValue<UShort8> x,RValue<UShort8> y)3738 RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
3739 {
3740 return RValue<UShort8>(createInstruction(llvm::Intrinsic::x86_sse2_pmulhu_w, x.value(), y.value()));
3741 }
3742
pmaddwd(RValue<Short8> x,RValue<Short8> y)3743 RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
3744 {
3745 return RValue<Int4>(createInstruction(llvm::Intrinsic::x86_sse2_pmadd_wd, x.value(), y.value()));
3746 }
3747
// Extracts the sign bits of the four float lanes into the low 4 bits of an Int.
RValue<Int> movmskps(RValue<Float4> x)
{
	Value *v = x.value();

	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
	// which makes it look at the entire 128-bit input for undefined bits. Mask off
	// just the sign bits to avoid false positives.
	if(__has_feature(memory_sanitizer))
	{
		v = As<Float4>(As<Int4>(v) & Int4(0x80000000u)).value();
	}

	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse_movmsk_ps, v));
}

// Extracts the sign bits of the eight bytes into the low 8 bits of an Int.
RValue<Int> pmovmskb(RValue<Byte8> x)
{
	Value *v = x.value();

	// TODO(b/172238865): MemorySanitizer does not support movmsk instructions,
	// which makes it look at the entire 128-bit input for undefined bits. Mask off
	// just the sign bits in the lower 64-bit vector to avoid false positives.
	if(__has_feature(memory_sanitizer))
	{
		v = As<Byte16>(As<Int4>(v) & Int4(0x80808080u, 0x80808080u, 0, 0)).value();
	}

	// The intrinsic operates on 16 bytes; keep only the mask bits of the low 8.
	return RValue<Int>(createInstruction(llvm::Intrinsic::x86_sse2_pmovmskb_128, v)) & 0xFF;
}
3777
3778 } // namespace x86
3779 #endif // defined(__i386__) || defined(__x86_64__)
3780
3781 #ifdef ENABLE_RR_PRINT
// Emits a call to rr::DebugPrintf with the given arguments. The function is
// declared variadic (printf-style): i32(i8*, ...), so 'vals' is expected to
// start with the format string pointer.
void VPrintf(const std::vector<Value *> &vals)
{
	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
	auto funcTy = llvm::FunctionType::get(i32Ty, { i8PtrTy }, true);
	auto func = jit->module->getOrInsertFunction("rr::DebugPrintf", funcTy);
	jit->builder->CreateCall(func, V(vals));
}
3790 #endif // ENABLE_RR_PRINT
3791
Nop()3792 void Nop()
3793 {
3794 auto voidTy = llvm::Type::getVoidTy(*jit->context);
3795 auto funcTy = llvm::FunctionType::get(voidTy, {}, false);
3796 auto func = jit->module->getOrInsertFunction("nop", funcTy);
3797 jit->builder->CreateCall(func);
3798 }
3799
// Records the current source location in the debug info, when debug info
// generation is compiled in and active for this JIT session.
void EmitDebugLocation()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(jit->debugInfo != nullptr)
	{
		jit->debugInfo->EmitLocation();
	}
#endif  // ENABLE_RR_DEBUG_INFO
}

// Registers 'value' with the debug info so it can be inspected by name.
void EmitDebugVariable(Value *value)
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(jit->debugInfo != nullptr)
	{
		jit->debugInfo->EmitVariable(value);
	}
#endif  // ENABLE_RR_DEBUG_INFO
}

// Flushes any pending debug info state.
void FlushDebug()
{
#ifdef ENABLE_RR_DEBUG_INFO
	if(jit->debugInfo != nullptr)
	{
		jit->debugInfo->Flush();
	}
#endif  // ENABLE_RR_DEBUG_INFO
}
3829
3830 } // namespace rr
3831
3832 // ------------------------------ Coroutines ------------------------------
3833
3834 namespace {
3835
3836 // Magic values retuned by llvm.coro.suspend.
3837 // See: https://llvm.org/docs/Coroutines.html#llvm-coro-suspend-intrinsic
enum SuspendAction
{
	SuspendActionSuspend = -1,  // suspend: return control to the caller/resumer
	SuspendActionResume = 0,    // resume: continue executing the coroutine body
	SuspendActionDestroy = 1    // destroy: branch to the cleanup/destroy path
};
3844
// Rebuilds the function under construction as a real LLVM coroutine, and
// fills in the bodies of coroutine_await() and coroutine_destroy().
// Called lazily on the first Yield() (see Nucleus::yield()); coroutines that
// never yield remain plain functions (see Nucleus::acquireCoroutine()).
void promoteFunctionToCoroutine()
{
	ASSERT(jit->coroutine.id == nullptr);  // must only be promoted once

	// Types
	auto voidTy = llvm::Type::getVoidTy(*jit->context);
	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);
	auto i32Ty = llvm::Type::getInt32Ty(*jit->context);
	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
	auto promiseTy = jit->coroutine.yieldType;
	auto promisePtrTy = promiseTy->getPointerTo();

	// LLVM intrinsics
	auto coro_id = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_id);
	auto coro_size = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_size, { i32Ty });
	auto coro_begin = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_begin);
	auto coro_resume = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_resume);
	auto coro_end = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_end);
	auto coro_free = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_free);
	auto coro_destroy = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_destroy);
	auto coro_promise = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_promise);
	auto coro_done = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_done);
	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);

	// External frame allocation helpers, resolved at link/JIT time.
	auto allocFrameTy = llvm::FunctionType::get(i8PtrTy, { i32Ty }, false);
	auto allocFrame = jit->module->getOrInsertFunction("coroutine_alloc_frame", allocFrameTy);
	auto freeFrameTy = llvm::FunctionType::get(voidTy, { i8PtrTy }, false);
	auto freeFrame = jit->module->getOrInsertFunction("coroutine_free_frame", freeFrameTy);

	// Remember where Reactor code was being emitted so we can resume there.
	auto oldInsertionPoint = jit->builder->saveIP();

	// Build the coroutine_await() function:
	//
	//    bool coroutine_await(CoroutineHandle* handle, YieldType* out)
	//    {
	//        if(llvm.coro.done(handle))
	//        {
	//            return false;
	//        }
	//        else
	//        {
	//            *value = (T*)llvm.coro.promise(handle);
	//            llvm.coro.resume(handle);
	//            return true;
	//        }
	//    }
	//
	{
		auto args = jit->coroutine.await->arg_begin();
		auto handle = args++;
		auto outPtr = args++;
		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "co_await", jit->coroutine.await));
		auto doneBlock = llvm::BasicBlock::Create(*jit->context, "done", jit->coroutine.await);
		auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->coroutine.await);

		auto done = jit->builder->CreateCall(coro_done, { handle }, "done");
		jit->builder->CreateCondBr(done, doneBlock, resumeBlock);

		jit->builder->SetInsertPoint(doneBlock);
		jit->builder->CreateRet(llvm::ConstantInt::getFalse(i1Ty));

		jit->builder->SetInsertPoint(resumeBlock);
		auto promiseAlignment = llvm::ConstantInt::get(i32Ty, 4);  // TODO: Get correct alignment.
		auto promisePtr = jit->builder->CreateCall(coro_promise, { handle, promiseAlignment, llvm::ConstantInt::get(i1Ty, 0) });
		auto promise = jit->builder->CreateLoad(promiseTy, jit->builder->CreatePointerCast(promisePtr, promisePtrTy));
		jit->builder->CreateStore(promise, outPtr);
		jit->builder->CreateCall(coro_resume, { handle });
		jit->builder->CreateRet(llvm::ConstantInt::getTrue(i1Ty));
	}

	// Build the coroutine_destroy() function:
	//
	//    void coroutine_destroy(CoroutineHandle* handle)
	//    {
	//        llvm.coro.destroy(handle);
	//    }
	//
	{
		auto handle = jit->coroutine.destroy->arg_begin();
		jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
		jit->builder->CreateCall(coro_destroy, { handle });
		jit->builder->CreateRetVoid();
	}

	// Begin building the main coroutine_begin() function.
	//
	//    CoroutineHandle* coroutine_begin(<Arguments>)
	//    {
	//        YieldType promise;
	//        auto id = llvm.coro.id(0, &promise, nullptr, nullptr);
	//        void* frame = coroutine_alloc_frame(llvm.coro.size.i32());
	//        CoroutineHandle *handle = llvm.coro.begin(id, frame);
	//
	//        ... <REACTOR CODE> ...
	//
	//    end:
	//        SuspendAction action = llvm.coro.suspend(none, true /* final */);  // <-- RESUME POINT
	//        switch(action)
	//        {
	//        case SuspendActionResume:
	//            UNREACHABLE();  // Illegal to resume after final suspend.
	//        case SuspendActionDestroy:
	//            goto destroy;
	//        default:  // (SuspendActionSuspend)
	//            goto suspend;
	//        }
	//
	//    destroy:
	//        coroutine_free_frame(llvm.coro.free(id, handle));
	//        goto suspend;
	//
	//    suspend:
	//        llvm.coro.end(handle, false);
	//        return handle;
	//    }
	//

#ifdef ENABLE_RR_DEBUG_INFO
	jit->debugInfo = std::make_unique<rr::DebugInfo>(jit->builder.get(), jit->context.get(), jit->module.get(), jit->function);
#endif  // ENABLE_RR_DEBUG_INFO

	jit->coroutine.suspendBlock = llvm::BasicBlock::Create(*jit->context, "suspend", jit->function);
	jit->coroutine.endBlock = llvm::BasicBlock::Create(*jit->context, "end", jit->function);
	jit->coroutine.destroyBlock = llvm::BasicBlock::Create(*jit->context, "destroy", jit->function);

	// The coro.id/alloc/begin calls must precede any already-emitted Reactor
	// code, so insert them at the very start of the entry block.
	jit->builder->SetInsertPoint(jit->coroutine.entryBlock, jit->coroutine.entryBlock->begin());
	jit->coroutine.promise = jit->builder->CreateAlloca(promiseTy, nullptr, "promise");
	jit->coroutine.id = jit->builder->CreateCall(coro_id, {
	                                                          llvm::ConstantInt::get(i32Ty, 0),
	                                                          jit->builder->CreatePointerCast(jit->coroutine.promise, i8PtrTy),
	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
	                                                          llvm::ConstantPointerNull::get(i8PtrTy),
	                                                      });
	auto size = jit->builder->CreateCall(coro_size, {});
	auto frame = jit->builder->CreateCall(allocFrame, { size });
	jit->coroutine.handle = jit->builder->CreateCall(coro_begin, { jit->coroutine.id, frame });

	// Build the suspend block
	jit->builder->SetInsertPoint(jit->coroutine.suspendBlock);
	jit->builder->CreateCall(coro_end, { jit->coroutine.handle, llvm::ConstantInt::get(i1Ty, 0) });
	jit->builder->CreateRet(jit->coroutine.handle);

	// Build the end block
	jit->builder->SetInsertPoint(jit->coroutine.endBlock);
	auto action = jit->builder->CreateCall(coro_suspend, {
	                                                         llvm::ConstantTokenNone::get(*jit->context),
	                                                         llvm::ConstantInt::get(i1Ty, 1),  // final: true
	                                                     });
	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
	// switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), trapBlock); // TODO: Trap attempting to resume after final suspend
	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);

	// Build the destroy block
	jit->builder->SetInsertPoint(jit->coroutine.destroyBlock);
	auto memory = jit->builder->CreateCall(coro_free, { jit->coroutine.id, jit->coroutine.handle });
	jit->builder->CreateCall(freeFrame, { memory });
	jit->builder->CreateBr(jit->coroutine.suspendBlock);

	// Switch back to original insert point to continue building the coroutine.
	jit->builder->restoreIP(oldInsertionPoint);
}
4007
4008 } // anonymous namespace
4009
4010 namespace rr {
4011
// Sets up the three coroutine entry points (begin/await/destroy) and starts
// emitting code into coroutine_begin's entry block.
void Nucleus::createCoroutine(Type *YieldType, const std::vector<Type *> &Params)
{
	// Coroutines are initially created as a regular function.
	// Upon the first call to Yield(), the function is promoted to a true
	// coroutine.
	auto voidTy = llvm::Type::getVoidTy(*jit->context);
	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
	auto i8PtrTy = llvm::Type::getInt8PtrTy(*jit->context);
	auto handleTy = i8PtrTy;  // CoroutineHandle is an opaque i8*
	auto boolTy = i1Ty;
	auto promiseTy = T(YieldType);
	auto promisePtrTy = promiseTy->getPointerTo();

	jit->function = rr::createFunction("coroutine_begin", handleTy, T(Params));
	// Mark the function as a not-yet-split coroutine; the attribute spelling
	// changed in LLVM 16.
#if LLVM_VERSION_MAJOR >= 16
	jit->function->setPresplitCoroutine();
#else
	jit->function->addFnAttr("coroutine.presplit", "0");
#endif
	jit->coroutine.await = rr::createFunction("coroutine_await", boolTy, { handleTy, promisePtrTy });
	jit->coroutine.destroy = rr::createFunction("coroutine_destroy", voidTy, { handleTy });
	jit->coroutine.yieldType = promiseTy;
	jit->coroutine.entryBlock = llvm::BasicBlock::Create(*jit->context, "function", jit->function);

	jit->builder->SetInsertPoint(jit->coroutine.entryBlock);
}
4038
// Emits a yield point: stores 'val' into the promise, suspends the coroutine,
// and continues emission in a fresh resume block.
void Nucleus::yield(Value *val)
{
	if(jit->coroutine.id == nullptr)
	{
		// First call to yield().
		// Promote the function to a full coroutine.
		promoteFunctionToCoroutine();
		ASSERT(jit->coroutine.id != nullptr);
	}

	//    promise = val;
	//
	//    auto action = llvm.coro.suspend(none, false /* final */);  // <-- RESUME POINT
	//    switch(action)
	//    {
	//    case SuspendActionResume:
	//        goto resume;
	//    case SuspendActionDestroy:
	//        goto destroy;
	//    default: // (SuspendActionSuspend)
	//        goto suspend;
	//    }
	//    resume:
	//

	RR_DEBUG_INFO_UPDATE_LOC();
	Variable::materializeAll();

	// Types
	auto i1Ty = llvm::Type::getInt1Ty(*jit->context);
	auto i8Ty = llvm::Type::getInt8Ty(*jit->context);

	// Intrinsics
	auto coro_suspend = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::coro_suspend);

	// Create a block to resume execution.
	auto resumeBlock = llvm::BasicBlock::Create(*jit->context, "resume", jit->function);

	// Store the promise (yield value)
	jit->builder->CreateStore(V(val), jit->coroutine.promise);
	auto action = jit->builder->CreateCall(coro_suspend, {
	                                                         llvm::ConstantTokenNone::get(*jit->context),
	                                                         llvm::ConstantInt::get(i1Ty, 0),  // final: false (this is not the final suspend)
	                                                     });
	auto switch_ = jit->builder->CreateSwitch(action, jit->coroutine.suspendBlock, 3);
	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionResume), resumeBlock);
	switch_->addCase(llvm::ConstantInt::get(i8Ty, SuspendActionDestroy), jit->coroutine.destroyBlock);

	// Continue building in the resume block.
	jit->builder->SetInsertPoint(resumeBlock);
}
4090
acquireCoroutine(const char * name)4091 std::shared_ptr<Routine> Nucleus::acquireCoroutine(const char *name)
4092 {
4093 if(jit->coroutine.id)
4094 {
4095 jit->builder->CreateBr(jit->coroutine.endBlock);
4096 }
4097 else
4098 {
4099 // Coroutine without a Yield acts as a regular function.
4100 // The 'coroutine_begin' function returns a nullptr for the coroutine
4101 // handle.
4102 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->function->getReturnType()));
4103 // The 'coroutine_await' function always returns false (coroutine done).
4104 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.await));
4105 jit->builder->CreateRet(llvm::Constant::getNullValue(jit->coroutine.await->getReturnType()));
4106 // The 'coroutine_destroy' does nothing, returns void.
4107 jit->builder->SetInsertPoint(llvm::BasicBlock::Create(*jit->context, "", jit->coroutine.destroy));
4108 jit->builder->CreateRetVoid();
4109 }
4110
4111 #ifdef ENABLE_RR_DEBUG_INFO
4112 if(jit->debugInfo != nullptr)
4113 {
4114 jit->debugInfo->Finalize();
4115 }
4116 #endif // ENABLE_RR_DEBUG_INFO
4117
4118 if(false)
4119 {
4120 std::error_code error;
4121 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-unopt.txt", error);
4122 jit->module->print(file, 0);
4123 }
4124
4125 jit->runPasses();
4126
4127 if(false)
4128 {
4129 std::error_code error;
4130 llvm::raw_fd_ostream file(std::string(name) + "-llvm-dump-opt.txt", error);
4131 jit->module->print(file, 0);
4132 }
4133
4134 llvm::Function *funcs[Nucleus::CoroutineEntryCount];
4135 funcs[Nucleus::CoroutineEntryBegin] = jit->function;
4136 funcs[Nucleus::CoroutineEntryAwait] = jit->coroutine.await;
4137 funcs[Nucleus::CoroutineEntryDestroy] = jit->coroutine.destroy;
4138
4139 auto routine = jit->acquireRoutine(name, funcs, Nucleus::CoroutineEntryCount);
4140
4141 delete jit;
4142 jit = nullptr;
4143
4144 return routine;
4145 }
4146
// Runs the 'coroutine_begin' entry point synchronously and returns the
// resulting coroutine handle. 'routine' is unused by this backend.
Nucleus::CoroutineHandle Nucleus::invokeCoroutineBegin(Routine &routine, std::function<Nucleus::CoroutineHandle()> func)
{
	return func();
}
4151
// Broadcast constructor: replicates a scalar int across all SIMD lanes.
SIMD::Int::Int(RValue<scalar::Int> rhs)
    : XYZW(this)
{
	RR_DEBUG_INFO_UPDATE_LOC();
	// Insert the scalar into lane 0, then shuffle with a single-index swizzle
	// which replicates lane 0 into every lane of the result.
	Value *vector = loadValue();
	Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);

	std::vector<int> swizzle = { 0 };
	Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);

	storeValue(replicate);
}
4164
operator <<(RValue<SIMD::Int> lhs,unsigned char rhs)4165 RValue<SIMD::Int> operator<<(RValue<SIMD::Int> lhs, unsigned char rhs)
4166 {
4167 RR_DEBUG_INFO_UPDATE_LOC();
4168 return As<SIMD::Int>(V(lowerVectorShl(V(lhs.value()), rhs)));
4169 }
4170
operator >>(RValue<SIMD::Int> lhs,unsigned char rhs)4171 RValue<SIMD::Int> operator>>(RValue<SIMD::Int> lhs, unsigned char rhs)
4172 {
4173 RR_DEBUG_INFO_UPDATE_LOC();
4174 return As<SIMD::Int>(V(lowerVectorAShr(V(lhs.value()), rhs)));
4175 }
4176
CmpEQ(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4177 RValue<SIMD::Int> CmpEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4178 {
4179 RR_DEBUG_INFO_UPDATE_LOC();
4180 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), SIMD::Int::type()));
4181 }
4182
CmpLT(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4183 RValue<SIMD::Int> CmpLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4184 {
4185 RR_DEBUG_INFO_UPDATE_LOC();
4186 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value(), y.value()), SIMD::Int::type()));
4187 }
4188
CmpLE(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4189 RValue<SIMD::Int> CmpLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4190 {
4191 RR_DEBUG_INFO_UPDATE_LOC();
4192 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value(), y.value()), SIMD::Int::type()));
4193 }
4194
CmpNEQ(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4195 RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4196 {
4197 RR_DEBUG_INFO_UPDATE_LOC();
4198 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), SIMD::Int::type()));
4199 }
4200
CmpNLT(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4201 RValue<SIMD::Int> CmpNLT(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4202 {
4203 RR_DEBUG_INFO_UPDATE_LOC();
4204 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value(), y.value()), SIMD::Int::type()));
4205 }
4206
CmpNLE(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4207 RValue<SIMD::Int> CmpNLE(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4208 {
4209 RR_DEBUG_INFO_UPDATE_LOC();
4210 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value(), y.value()), SIMD::Int::type()));
4211 }
4212
// Lane-wise absolute value of signed 32-bit ints.
RValue<SIMD::Int> Abs(RValue<SIMD::Int> x)
{
#if LLVM_VERSION_MAJOR >= 12
	// llvm.abs's second operand (is_int_min_poison) is false, so the result
	// for INT_MIN is INT_MIN rather than poison.
	auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::abs, { V(x.value())->getType() });
	return RValue<SIMD::Int>(V(jit->builder->CreateCall(func, { V(x.value()), llvm::ConstantInt::getFalse(*jit->context) })));
#else
	// Branchless fallback: for negative lanes, (x ^ -1) - (-1) == -x.
	auto negative = x >> 31;
	return (x ^ negative) - negative;
#endif
}
4223
Max(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4224 RValue<SIMD::Int> Max(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4225 {
4226 RR_DEBUG_INFO_UPDATE_LOC();
4227 RValue<SIMD::Int> greater = CmpNLE(x, y);
4228 return (x & greater) | (y & ~greater);
4229 }
4230
Min(RValue<SIMD::Int> x,RValue<SIMD::Int> y)4231 RValue<SIMD::Int> Min(RValue<SIMD::Int> x, RValue<SIMD::Int> y)
4232 {
4233 RR_DEBUG_INFO_UPDATE_LOC();
4234 RValue<SIMD::Int> less = CmpLT(x, y);
4235 return (x & less) | (y & ~less);
4236 }
4237
RoundInt(RValue<SIMD::Float> cast)4238 RValue<SIMD::Int> RoundInt(RValue<SIMD::Float> cast)
4239 {
4240 RR_DEBUG_INFO_UPDATE_LOC();
4241 return As<SIMD::Int>(V(lowerRoundInt(V(cast.value()), T(SIMD::Int::type()))));
4242 }
4243
// Like RoundInt(), but with the result clamped to the representable int32
// range instead of being undefined on overflow.
RValue<SIMD::Int> RoundIntClamped(RValue<SIMD::Float> cast)
{
	RR_DEBUG_INFO_UPDATE_LOC();

	// TODO(b/165000222): Check if fptosi_sat produces optimal code for x86 and ARM.
#if defined(__arm__) || defined(__aarch64__)
	// ARM saturates to the largest positive or negative integer. Unit tests
	// verify that lowerRoundInt() behaves as desired.
	return As<SIMD::Int>(V(lowerRoundInt(V(cast.value()), T(SIMD::Int::type()))));
#elif LLVM_VERSION_MAJOR >= 14
	// Round first, then use the saturating float-to-signed-int conversion.
	llvm::Value *rounded = lowerRound(V(cast.value()));
	llvm::Function *fptosi_sat = llvm::Intrinsic::getDeclaration(
	    jit->module.get(), llvm::Intrinsic::fptosi_sat, { T(SIMD::Int::type()), T(SIMD::Float::type()) });
	return RValue<SIMD::Int>(V(jit->builder->CreateCall(fptosi_sat, { rounded })));
#else
	// Clamp the float input to the int32 range before converting.
	// 0x7FFFFF80 is the largest float-representable value not exceeding INT_MAX.
	RValue<SIMD::Float> clamped = Max(Min(cast, SIMD::Float(0x7FFFFF80)), SIMD::Float(static_cast<int>(0x80000000)));
	return As<SIMD::Int>(V(lowerRoundInt(V(clamped.value()), T(SIMD::Int::type()))));
#endif
}
4263
Extract128(RValue<SIMD::Int> val,int i)4264 RValue<Int4> Extract128(RValue<SIMD::Int> val, int i)
4265 {
4266 llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4267
4268 return As<Int4>(V(jit->builder->CreateExtractElement(v128, i)));
4269 }
4270
Insert128(RValue<SIMD::Int> val,RValue<Int4> element,int i)4271 RValue<SIMD::Int> Insert128(RValue<SIMD::Int> val, RValue<Int4> element, int i)
4272 {
4273 llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4274 llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4275
4276 return As<SIMD::Int>(V(jit->builder->CreateInsertElement(v128, a, i)));
4277 }
4278
type()4279 Type *SIMD::Int::type()
4280 {
4281 return T(llvm::VectorType::get(T(scalar::Int::type()), SIMD::Width, false));
4282 }
4283
UInt(RValue<SIMD::Float> cast)4284 SIMD::UInt::UInt(RValue<SIMD::Float> cast)
4285 : XYZW(this)
4286 {
4287 RR_DEBUG_INFO_UPDATE_LOC();
4288 Value *xyzw = Nucleus::createFPToUI(cast.value(), SIMD::UInt::type());
4289 storeValue(xyzw);
4290 }
4291
UInt(RValue<scalar::UInt> rhs)4292 SIMD::UInt::UInt(RValue<scalar::UInt> rhs)
4293 : XYZW(this)
4294 {
4295 RR_DEBUG_INFO_UPDATE_LOC();
4296 Value *vector = loadValue();
4297 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
4298
4299 std::vector<int> swizzle = { 0 };
4300 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4301
4302 storeValue(replicate);
4303 }
4304
operator <<(RValue<SIMD::UInt> lhs,unsigned char rhs)4305 RValue<SIMD::UInt> operator<<(RValue<SIMD::UInt> lhs, unsigned char rhs)
4306 {
4307 RR_DEBUG_INFO_UPDATE_LOC();
4308 return As<SIMD::UInt>(V(lowerVectorShl(V(lhs.value()), rhs)));
4309 }
4310
operator >>(RValue<SIMD::UInt> lhs,unsigned char rhs)4311 RValue<SIMD::UInt> operator>>(RValue<SIMD::UInt> lhs, unsigned char rhs)
4312 {
4313 RR_DEBUG_INFO_UPDATE_LOC();
4314 return As<SIMD::UInt>(V(lowerVectorLShr(V(lhs.value()), rhs)));
4315 }
4316
CmpEQ(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4317 RValue<SIMD::UInt> CmpEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4318 {
4319 RR_DEBUG_INFO_UPDATE_LOC();
4320 return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value(), y.value()), SIMD::Int::type()));
4321 }
4322
CmpLT(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4323 RValue<SIMD::UInt> CmpLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4324 {
4325 RR_DEBUG_INFO_UPDATE_LOC();
4326 return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpULT(x.value(), y.value()), SIMD::Int::type()));
4327 }
4328
CmpLE(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4329 RValue<SIMD::UInt> CmpLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4330 {
4331 RR_DEBUG_INFO_UPDATE_LOC();
4332 return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpULE(x.value(), y.value()), SIMD::Int::type()));
4333 }
4334
CmpNEQ(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4335 RValue<SIMD::UInt> CmpNEQ(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4336 {
4337 RR_DEBUG_INFO_UPDATE_LOC();
4338 return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpNE(x.value(), y.value()), SIMD::Int::type()));
4339 }
4340
CmpNLT(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4341 RValue<SIMD::UInt> CmpNLT(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4342 {
4343 RR_DEBUG_INFO_UPDATE_LOC();
4344 return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value(), y.value()), SIMD::Int::type()));
4345 }
4346
CmpNLE(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4347 RValue<SIMD::UInt> CmpNLE(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4348 {
4349 RR_DEBUG_INFO_UPDATE_LOC();
4350 return RValue<SIMD::UInt>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value(), y.value()), SIMD::Int::type()));
4351 }
4352
Max(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4353 RValue<SIMD::UInt> Max(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4354 {
4355 RR_DEBUG_INFO_UPDATE_LOC();
4356 RValue<SIMD::UInt> greater = CmpNLE(x, y);
4357 return (x & greater) | (y & ~greater);
4358 }
4359
Min(RValue<SIMD::UInt> x,RValue<SIMD::UInt> y)4360 RValue<SIMD::UInt> Min(RValue<SIMD::UInt> x, RValue<SIMD::UInt> y)
4361 {
4362 RR_DEBUG_INFO_UPDATE_LOC();
4363 RValue<SIMD::UInt> less = CmpLT(x, y);
4364 return (x & less) | (y & ~less);
4365 }
4366
Extract128(RValue<SIMD::UInt> val,int i)4367 RValue<UInt4> Extract128(RValue<SIMD::UInt> val, int i)
4368 {
4369 llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4370
4371 return As<UInt4>(V(jit->builder->CreateExtractElement(v128, i)));
4372 }
4373
Insert128(RValue<SIMD::UInt> val,RValue<UInt4> element,int i)4374 RValue<SIMD::UInt> Insert128(RValue<SIMD::UInt> val, RValue<UInt4> element, int i)
4375 {
4376 llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4377 llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4378
4379 return As<SIMD::UInt>(V(jit->builder->CreateInsertElement(v128, a, i)));
4380 }
4381
type()4382 Type *SIMD::UInt::type()
4383 {
4384 return T(llvm::VectorType::get(T(scalar::UInt::type()), SIMD::Width, false));
4385 }
4386
Float(RValue<scalar::Float> rhs)4387 SIMD::Float::Float(RValue<scalar::Float> rhs)
4388 : XYZW(this)
4389 {
4390 RR_DEBUG_INFO_UPDATE_LOC();
4391 Value *vector = loadValue();
4392 Value *insert = Nucleus::createInsertElement(vector, rhs.value(), 0);
4393
4394 std::vector<int> swizzle = { 0 };
4395 Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
4396
4397 storeValue(replicate);
4398 }
4399
operator %(RValue<SIMD::Float> lhs,RValue<SIMD::Float> rhs)4400 RValue<SIMD::Float> operator%(RValue<SIMD::Float> lhs, RValue<SIMD::Float> rhs)
4401 {
4402 return RValue<SIMD::Float>(Nucleus::createFRem(lhs.value(), rhs.value()));
4403 }
4404
MulAdd(RValue<SIMD::Float> x,RValue<SIMD::Float> y,RValue<SIMD::Float> z)4405 RValue<SIMD::Float> MulAdd(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
4406 {
4407 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fmuladd, { T(SIMD::Float::type()) });
4408 return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
4409 }
4410
FMA(RValue<SIMD::Float> x,RValue<SIMD::Float> y,RValue<SIMD::Float> z)4411 RValue<SIMD::Float> FMA(RValue<SIMD::Float> x, RValue<SIMD::Float> y, RValue<SIMD::Float> z)
4412 {
4413 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fma, { T(SIMD::Float::type()) });
4414 return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, { V(x.value()), V(y.value()), V(z.value()) })));
4415 }
4416
Abs(RValue<SIMD::Float> x)4417 RValue<SIMD::Float> Abs(RValue<SIMD::Float> x)
4418 {
4419 auto func = llvm::Intrinsic::getDeclaration(jit->module.get(), llvm::Intrinsic::fabs, { V(x.value())->getType() });
4420 return RValue<SIMD::Float>(V(jit->builder->CreateCall(func, V(x.value()))));
4421 }
4422
Max(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4423 RValue<SIMD::Float> Max(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4424 {
4425 RR_DEBUG_INFO_UPDATE_LOC();
4426 return As<SIMD::Float>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OGT)));
4427 }
4428
Min(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4429 RValue<SIMD::Float> Min(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4430 {
4431 RR_DEBUG_INFO_UPDATE_LOC();
4432 return As<SIMD::Float>(V(lowerPFMINMAX(V(x.value()), V(y.value()), llvm::FCmpInst::FCMP_OLT)));
4433 }
4434
Sqrt(RValue<SIMD::Float> x)4435 RValue<SIMD::Float> Sqrt(RValue<SIMD::Float> x)
4436 {
4437 RR_DEBUG_INFO_UPDATE_LOC();
4438 return As<SIMD::Float>(V(lowerSQRT(V(x.value()))));
4439 }
4440
CmpEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4441 RValue<SIMD::Int> CmpEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4442 {
4443 RR_DEBUG_INFO_UPDATE_LOC();
4444 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value(), y.value()), SIMD::Int::type()));
4445 }
4446
CmpLT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4447 RValue<SIMD::Int> CmpLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4448 {
4449 RR_DEBUG_INFO_UPDATE_LOC();
4450 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value(), y.value()), SIMD::Int::type()));
4451 }
4452
CmpLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4453 RValue<SIMD::Int> CmpLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4454 {
4455 RR_DEBUG_INFO_UPDATE_LOC();
4456 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value(), y.value()), SIMD::Int::type()));
4457 }
4458
CmpNEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4459 RValue<SIMD::Int> CmpNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4460 {
4461 RR_DEBUG_INFO_UPDATE_LOC();
4462 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value(), y.value()), SIMD::Int::type()));
4463 }
4464
CmpNLT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4465 RValue<SIMD::Int> CmpNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4466 {
4467 RR_DEBUG_INFO_UPDATE_LOC();
4468 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value(), y.value()), SIMD::Int::type()));
4469 }
4470
CmpNLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4471 RValue<SIMD::Int> CmpNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4472 {
4473 RR_DEBUG_INFO_UPDATE_LOC();
4474 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value(), y.value()), SIMD::Int::type()));
4475 }
4476
CmpUEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4477 RValue<SIMD::Int> CmpUEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4478 {
4479 RR_DEBUG_INFO_UPDATE_LOC();
4480 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUEQ(x.value(), y.value()), SIMD::Int::type()));
4481 }
4482
CmpULT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4483 RValue<SIMD::Int> CmpULT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4484 {
4485 RR_DEBUG_INFO_UPDATE_LOC();
4486 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpULT(x.value(), y.value()), SIMD::Int::type()));
4487 }
4488
CmpULE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4489 RValue<SIMD::Int> CmpULE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4490 {
4491 RR_DEBUG_INFO_UPDATE_LOC();
4492 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpULE(x.value(), y.value()), SIMD::Int::type()));
4493 }
4494
CmpUNEQ(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4495 RValue<SIMD::Int> CmpUNEQ(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4496 {
4497 RR_DEBUG_INFO_UPDATE_LOC();
4498 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUNE(x.value(), y.value()), SIMD::Int::type()));
4499 }
4500
CmpUNLT(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4501 RValue<SIMD::Int> CmpUNLT(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4502 {
4503 RR_DEBUG_INFO_UPDATE_LOC();
4504 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUGE(x.value(), y.value()), SIMD::Int::type()));
4505 }
4506
CmpUNLE(RValue<SIMD::Float> x,RValue<SIMD::Float> y)4507 RValue<SIMD::Int> CmpUNLE(RValue<SIMD::Float> x, RValue<SIMD::Float> y)
4508 {
4509 RR_DEBUG_INFO_UPDATE_LOC();
4510 return RValue<SIMD::Int>(Nucleus::createSExt(Nucleus::createFCmpUGT(x.value(), y.value()), SIMD::Int::type()));
4511 }
4512
Round(RValue<SIMD::Float> x)4513 RValue<SIMD::Float> Round(RValue<SIMD::Float> x)
4514 {
4515 RR_DEBUG_INFO_UPDATE_LOC();
4516 return RValue<SIMD::Float>(V(lowerRound(V(x.value()))));
4517 }
4518
Trunc(RValue<SIMD::Float> x)4519 RValue<SIMD::Float> Trunc(RValue<SIMD::Float> x)
4520 {
4521 RR_DEBUG_INFO_UPDATE_LOC();
4522 return RValue<SIMD::Float>(V(lowerTrunc(V(x.value()))));
4523 }
4524
Frac(RValue<SIMD::Float> x)4525 RValue<SIMD::Float> Frac(RValue<SIMD::Float> x)
4526 {
4527 RR_DEBUG_INFO_UPDATE_LOC();
4528 SIMD::Float frc = x - Floor(x);
4529
4530 // x - floor(x) can be 1.0 for very small negative x.
4531 // Clamp against the value just below 1.0.
4532 return Min(frc, As<SIMD::Float>(SIMD::Int(0x3F7FFFFF)));
4533 }
4534
Floor(RValue<SIMD::Float> x)4535 RValue<SIMD::Float> Floor(RValue<SIMD::Float> x)
4536 {
4537 RR_DEBUG_INFO_UPDATE_LOC();
4538 return RValue<SIMD::Float>(V(lowerFloor(V(x.value()))));
4539 }
4540
Ceil(RValue<SIMD::Float> x)4541 RValue<SIMD::Float> Ceil(RValue<SIMD::Float> x)
4542 {
4543 RR_DEBUG_INFO_UPDATE_LOC();
4544 return -Floor(-x);
4545 }
4546
Extract128(RValue<SIMD::Float> val,int i)4547 RValue<Float4> Extract128(RValue<SIMD::Float> val, int i)
4548 {
4549 llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4550
4551 return As<Float4>(V(jit->builder->CreateExtractElement(v128, i)));
4552 }
4553
Insert128(RValue<SIMD::Float> val,RValue<Float4> element,int i)4554 RValue<SIMD::Float> Insert128(RValue<SIMD::Float> val, RValue<Float4> element, int i)
4555 {
4556 llvm::Value *v128 = jit->builder->CreateBitCast(V(val.value()), llvm::FixedVectorType::get(llvm::IntegerType::get(*jit->context, 128), SIMD::Width / 4));
4557 llvm::Value *a = jit->builder->CreateBitCast(V(element.value()), llvm::IntegerType::get(*jit->context, 128));
4558
4559 return As<SIMD::Float>(V(jit->builder->CreateInsertElement(v128, a, i)));
4560 }
4561
type()4562 Type *SIMD::Float::type()
4563 {
4564 return T(llvm::VectorType::get(T(scalar::Float::type()), SIMD::Width, false));
4565 }
4566
4567 } // namespace rr
4568