1 //===- subzero/src/IceTargetLoweringX8664.cpp - x86-64 lowering -----------===//
2 //
3 // The Subzero Code Generator
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Implements the TargetLoweringX8664 class, which consists almost
12 /// entirely of the lowering sequence for each high-level instruction.
13 ///
14 //===----------------------------------------------------------------------===//
15 #include "IceTargetLoweringX8664.h"
16
17 #include "IceCfg.h"
18 #include "IceCfgNode.h"
19 #include "IceClFlags.h"
20 #include "IceDefs.h"
21 #include "IceELFObjectWriter.h"
22 #include "IceGlobalInits.h"
23 #include "IceInstVarIter.h"
24 #include "IceInstX8664.h"
25 #include "IceLiveness.h"
26 #include "IceOperand.h"
27 #include "IcePhiLoweringImpl.h"
28 #include "IceTargetLoweringX8664.def"
29 #include "IceUtils.h"
30 #include "IceVariableSplitting.h"
31
32 #include "llvm/Support/MathExtras.h"
33
34 #include <stack>
35
36 #if defined(_WIN64)
37 extern "C" void __chkstk();
38 #endif
39
40 namespace X8664 {
41
42 std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
43 return ::Ice::X8664::TargetX8664::create(Func);
44 }
45
46 std::unique_ptr<::Ice::TargetDataLowering>
47 createTargetDataLowering(::Ice::GlobalContext *Ctx) {
48 return ::Ice::X8664::TargetDataX8664::create(Ctx);
49 }
50
51 std::unique_ptr<::Ice::TargetHeaderLowering>
52 createTargetHeaderLowering(::Ice::GlobalContext *Ctx) {
53 return ::Ice::X8664::TargetHeaderX86::create(Ctx);
54 }
55
56 void staticInit(::Ice::GlobalContext *Ctx) {
57 ::Ice::X8664::TargetX8664::staticInit(Ctx);
58 }
59
60 bool shouldBePooled(const class ::Ice::Constant *C) {
61 return ::Ice::X8664::TargetX8664::shouldBePooled(C);
62 }
63
64 ::Ice::Type getPointerType() {
65 return ::Ice::X8664::TargetX8664::getPointerType();
66 }
67
68 } // namespace X8664
69
70 namespace Ice {
71 namespace X8664 {
72
73 /// The number of bits in a byte
74 static constexpr uint32_t X86_CHAR_BIT = 8;
75 /// Size of the return address on the stack
76 static constexpr uint32_t X86_RET_IP_SIZE_BYTES = 8;
77
78 /// \name Limits for unrolling memory intrinsics.
79 /// @{
80 static constexpr uint32_t MEMCPY_UNROLL_LIMIT = 8;
81 static constexpr uint32_t MEMMOVE_UNROLL_LIMIT = 8;
82 static constexpr uint32_t MEMSET_UNROLL_LIMIT = 8;
83 /// @}
84
85 // The Microsoft x64 ABI requires the caller to allocate a minimum 32 byte
86 // "shadow store" (aka "home space") so that the callee may copy the 4
87 // register args to it.
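// For illustration: with 8-byte words that is 4 * 8 = 32 bytes, reserved just
// below the return address so the callee can home the register arguments
// (rcx, rdx, r8, r9 in the Microsoft ABI) when it needs their addresses.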
88 SizeT getShadowStoreSize() {
89 #if defined(_WIN64)
90 static const SizeT ShadowStoreSize = 4 * typeWidthInBytes(WordType);
91 return ShadowStoreSize;
92 #else
93 return 0;
94 #endif
95 }
96
97 BoolFoldingEntry::BoolFoldingEntry(Inst *I)
98 : Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)) {}
99
100 BoolFolding::BoolFoldingProducerKind
101 BoolFolding::getProducerKind(const Inst *Instr) {
102 if (llvm::isa<InstIcmp>(Instr)) {
103 return PK_Icmp32;
104 }
105 if (llvm::isa<InstFcmp>(Instr))
106 return PK_Fcmp;
107 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
108 switch (Arith->getOp()) {
109 default:
110 return PK_None;
111 case InstArithmetic::And:
112 case InstArithmetic::Or:
113 return PK_Arith;
114 }
115 }
116 return PK_None; // TODO(stichnot): remove this
117
118 if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
119 switch (Cast->getCastKind()) {
120 default:
121 return PK_None;
122 case InstCast::Trunc:
123 return PK_Trunc;
124 }
125 }
126 return PK_None;
127 }
128
129 BoolFolding::BoolFoldingConsumerKind
130 BoolFolding::getConsumerKind(const Inst *Instr) {
131 if (llvm::isa<InstBr>(Instr))
132 return CK_Br;
133 if (llvm::isa<InstSelect>(Instr))
134 return CK_Select;
135 return CK_None; // TODO(stichnot): remove this
136
137 if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
138 switch (Cast->getCastKind()) {
139 default:
140 return CK_None;
141 case InstCast::Sext:
142 return CK_Sext;
143 case InstCast::Zext:
144 return CK_Zext;
145 }
146 }
147 return CK_None;
148 }
149
150 /// Returns true if the producing instruction has a "complex" lowering sequence.
151 /// This generally means that its lowering sequence requires more than one
152 /// conditional branch, namely 64-bit integer compares and some floating-point
153 /// compares. When this is true and there is more than one consumer, we prefer
154 /// to disable the folding, since duplicating the multi-branch lowering for each consumer would add branches.
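///
/// For example (a sketch, not tied to the exact TableFcmp entries): an fcmp
/// whose condition also needs a parity check lowers to two conditional
/// branches,
///   ucomiss xmm0, xmm1
///   jne <target>
///   jp  <target>
/// so copying that sequence once per consumer would add branches rather than
/// remove them.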
155
156 bool BoolFolding::hasComplexLowering(const Inst *Instr) {
157 switch (getProducerKind(Instr)) {
158 default:
159 return false;
160 case PK_Icmp64:
161 return false;
162 case PK_Fcmp:
163 return TargetX8664::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()]
164 .C2 != CondX86::Br_None;
165 }
166 }
167
168 bool BoolFolding::isValidFolding(
169 BoolFolding::BoolFoldingProducerKind ProducerKind,
170 BoolFolding::BoolFoldingConsumerKind ConsumerKind) {
171 switch (ProducerKind) {
172 default:
173 return false;
174 case PK_Icmp32:
175 case PK_Icmp64:
176 case PK_Fcmp:
177 return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
178 case PK_Arith:
179 return ConsumerKind == CK_Br;
180 }
181 }
182
183 void BoolFolding::init(CfgNode *Node) {
184 Producers.clear();
185 for (Inst &Instr : Node->getInsts()) {
186 if (Instr.isDeleted())
187 continue;
188 invalidateProducersOnStore(&Instr);
189 // Check whether Instr is a valid producer.
190 Variable *Var = Instr.getDest();
191 if (Var) { // only consider instructions with an actual dest var
192 if (isBooleanType(Var->getType())) { // only bool-type dest vars
193 if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
194 Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
195 }
196 }
197 }
198 // Check each src variable against the map.
199 FOREACH_VAR_IN_INST(Var, Instr) {
200 SizeT VarNum = Var->getIndex();
201 if (!containsValid(VarNum))
202 continue;
203 // All valid consumers use Var as the first source operand
204 if (IndexOfVarOperandInInst(Var) != 0) {
205 setInvalid(VarNum);
206 continue;
207 }
208 // Consumer instructions must be white-listed
209 BoolFolding::BoolFoldingConsumerKind ConsumerKind =
210 getConsumerKind(&Instr);
211 if (ConsumerKind == CK_None) {
212 setInvalid(VarNum);
213 continue;
214 }
215 BoolFolding::BoolFoldingProducerKind ProducerKind =
216 getProducerKind(Producers[VarNum].Instr);
217 if (!isValidFolding(ProducerKind, ConsumerKind)) {
218 setInvalid(VarNum);
219 continue;
220 }
221 // Avoid creating multiple copies of complex producer instructions.
222 if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
223 setInvalid(VarNum);
224 continue;
225 }
226 ++Producers[VarNum].NumUses;
227 if (Instr.isLastUse(Var)) {
228 Producers[VarNum].IsLiveOut = false;
229 }
230 }
231 }
232 for (auto &I : Producers) {
233 // Ignore entries previously marked invalid.
234 if (I.second.Instr == nullptr)
235 continue;
236 // Disable the producer if its dest may be live beyond this block.
237 if (I.second.IsLiveOut) {
238 setInvalid(I.first);
239 continue;
240 }
241 // Mark as "dead" rather than outright deleting. This is so that other
242 // peephole style optimizations during or before lowering have access to
243 // this instruction in undeleted form. See for example
244 // tryOptimizedCmpxchgCmpBr().
245 I.second.Instr->setDead();
246 }
247 }
248
249 const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
250 auto *Var = llvm::dyn_cast<const Variable>(Opnd);
251 if (Var == nullptr)
252 return nullptr;
253 SizeT VarNum = Var->getIndex();
254 auto Element = Producers.find(VarNum);
255 if (Element == Producers.end())
256 return nullptr;
257 return Element->second.Instr;
258 }
259
260 void BoolFolding::dump(const Cfg *Func) const {
261 if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
262 return;
263 OstreamLocker L(Func->getContext());
264 Ostream &Str = Func->getContext()->getStrDump();
265 for (auto &I : Producers) {
266 if (I.second.Instr == nullptr)
267 continue;
268 Str << "Found foldable producer:\n ";
269 I.second.Instr->dump(Func);
270 Str << "\n";
271 }
272 }
273
274 /// If the given instruction has potential memory side effects (e.g. store, rmw,
275 /// or a call instruction with potential memory side effects), then we must not
276 /// allow a pre-store Producer instruction with memory operands to be folded
277 /// into a post-store Consumer instruction. If this is detected, the Producer
278 /// is invalidated.
279 ///
280 /// We use the Producer's IsLiveOut field to determine whether any potential
281 /// Consumers come after this store instruction. The IsLiveOut field is
282 /// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
283 /// sees the variable's definitive last use (indicating the variable is not in
284 /// the node's live-out set). Thus if we see here that IsLiveOut is false, we
285 /// know that there can be no consumers after the store, and therefore we know
286 /// the folding is safe despite the store instruction.
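///
/// A short sketch of the hazard this guards against (names are illustrative):
///   %flag = icmp eq i32 [%addr], 1   ; producer that reads memory
///   store i32 0, i32* %addr          ; may clobber what the producer read
///   br i1 %flag, ...                 ; consumer; folding the producer down
///                                    ; here would re-read %addr after the store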
287
288 void BoolFolding::invalidateProducersOnStore(const Inst *Instr) {
289 if (!Instr->isMemoryWrite())
290 return;
291 for (auto &ProducerPair : Producers) {
292 if (!ProducerPair.second.IsLiveOut)
293 continue;
294 Inst *PInst = ProducerPair.second.Instr;
295 if (PInst == nullptr)
296 continue;
297 bool HasMemOperand = false;
298 const SizeT SrcSize = PInst->getSrcSize();
299 for (SizeT I = 0; I < SrcSize; ++I) {
300 if (llvm::isa<X86OperandMem>(PInst->getSrc(I))) {
301 HasMemOperand = true;
302 break;
303 }
304 }
305 if (!HasMemOperand)
306 continue;
307 setInvalid(ProducerPair.first);
308 }
309 }
310
311 void TargetX8664::initNodeForLowering(CfgNode *Node) {
312 FoldingInfo.init(Node);
313 FoldingInfo.dump(Func);
314 }
315
316 TargetX8664::TargetX8664(Cfg *Func) : TargetX86(Func) {}
317
318 void TargetX8664::staticInit(GlobalContext *Ctx) {
319 RegNumT::setLimit(RegX8664::Reg_NUM);
320 RegX8664::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
321 for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
322 TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
323 filterTypeToRegisterSet(Ctx, RegX8664::Reg_NUM, TypeToRegisterSet.data(),
324 TypeToRegisterSet.size(), RegX8664::getRegName,
325 getRegClassName);
326 }
327
328 bool TargetX8664::shouldBePooled(const Constant *C) {
329 if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
330 return !Utils::isPositiveZero(ConstFloat->getValue());
331 }
332 if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
333 return !Utils::isPositiveZero(ConstDouble->getValue());
334 }
335 return false;
336 }
337
338 ::Ice::Type TargetX8664::getPointerType() { return ::Ice::IceType_i64; }
339
340 void TargetX8664::translateO2() {
341 TimerMarker T(TimerStack::TT_O2, Func);
342
343 genTargetHelperCalls();
344 Func->dump("After target helper call insertion");
345
346 // Merge Alloca instructions, and lay out the stack.
347 static constexpr bool SortAndCombineAllocas = true;
348 Func->processAllocas(SortAndCombineAllocas);
349 Func->dump("After Alloca processing");
350
351 // Run this early so it can be used to focus optimizations on potentially hot
352 // code.
353 // TODO(stichnot,ascull): currently only used for regalloc not
354 // expensive high level optimizations which could be focused on potentially
355 // hot code.
356 Func->generateLoopInfo();
357 Func->dump("After loop analysis");
358 if (getFlags().getLoopInvariantCodeMotion()) {
359 Func->loopInvariantCodeMotion();
360 Func->dump("After LICM");
361 }
362
363 if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
364 Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
365 Func->dump("After Local CSE");
366 Func->floatConstantCSE();
367 }
368 if (getFlags().getEnableShortCircuit()) {
369 Func->shortCircuitJumps();
370 Func->dump("After Short Circuiting");
371 }
372
373 if (!getFlags().getEnablePhiEdgeSplit()) {
374 // Lower Phi instructions.
375 Func->placePhiLoads();
376 if (Func->hasError())
377 return;
378 Func->placePhiStores();
379 if (Func->hasError())
380 return;
381 Func->deletePhis();
382 if (Func->hasError())
383 return;
384 Func->dump("After Phi lowering");
385 }
386
387 // Address mode optimization.
388 Func->getVMetadata()->init(VMK_SingleDefs);
389 Func->doAddressOpt();
390 Func->materializeVectorShuffles();
391
392 // Find read-modify-write opportunities. Do this after address mode
393 // optimization so that doAddressOpt() doesn't need to be applied to RMW
394 // instructions as well.
395 findRMW();
396 Func->dump("After RMW transform");
397
398 // Argument lowering
399 Func->doArgLowering();
400
401 // Target lowering. This requires liveness analysis for some parts of the
402 // lowering decisions, such as compare/branch fusing. If non-lightweight
403 // liveness analysis is used, the instructions need to be renumbered first.
404 // TODO: This renumbering should only be necessary if we're actually
405 // calculating live intervals, which we only do for register allocation.
406 Func->renumberInstructions();
407 if (Func->hasError())
408 return;
409
410 // TODO: It should be sufficient to use the fastest liveness calculation,
411 // i.e. livenessLightweight(). However, for some reason that slows down the
412 // rest of the translation. Investigate.
413 Func->liveness(Liveness_Basic);
414 if (Func->hasError())
415 return;
416 Func->dump("After x86 address mode opt");
417
418 doLoadOpt();
419
420 Func->genCode();
421 if (Func->hasError())
422 return;
423 Func->dump("After x86 codegen");
424 splitBlockLocalVariables(Func);
425
426 // Register allocation. This requires instruction renumbering and full
427 // liveness analysis. Loops must be identified before liveness so variable
428 // use weights are correct.
429 Func->renumberInstructions();
430 if (Func->hasError())
431 return;
432 Func->liveness(Liveness_Intervals);
433 if (Func->hasError())
434 return;
435 // The post-codegen dump is done here, after liveness analysis and associated
436 // cleanup, to make the dump cleaner and more useful.
437 Func->dump("After initial x86 codegen");
438 // Validate the live range computations. The expensive validation call is
439 // deliberately only made when assertions are enabled.
440 assert(Func->validateLiveness());
441 Func->getVMetadata()->init(VMK_All);
442 regAlloc(RAK_Global);
443 if (Func->hasError())
444 return;
445 Func->dump("After linear scan regalloc");
446
447 if (getFlags().getEnablePhiEdgeSplit()) {
448 Func->advancedPhiLowering();
449 Func->dump("After advanced Phi lowering");
450 }
451
452 // Stack frame mapping.
453 Func->genFrame();
454 if (Func->hasError())
455 return;
456 Func->dump("After stack frame mapping");
457
458 Func->contractEmptyNodes();
459 Func->reorderNodes();
460
461 // Branch optimization. This needs to be done just before code emission. In
462 // particular, no transformations that insert or reorder CfgNodes should be
463 // done after branch optimization. We go ahead and do it before nop insertion
464 // to reduce the amount of work needed for searching for opportunities.
465 Func->doBranchOpt();
466 Func->dump("After branch optimization");
467 }
468
469 void TargetX8664::translateOm1() {
470 TimerMarker T(TimerStack::TT_Om1, Func);
471
472 genTargetHelperCalls();
473
474 // Do not merge Alloca instructions, and lay out the stack.
475 // static constexpr bool SortAndCombineAllocas = false;
476 static constexpr bool SortAndCombineAllocas =
477 true; // TODO(b/171222930): Fix Win32 bug when this is false
478 Func->processAllocas(SortAndCombineAllocas);
479 Func->dump("After Alloca processing");
480
481 Func->placePhiLoads();
482 if (Func->hasError())
483 return;
484 Func->placePhiStores();
485 if (Func->hasError())
486 return;
487 Func->deletePhis();
488 if (Func->hasError())
489 return;
490 Func->dump("After Phi lowering");
491
492 Func->doArgLowering();
493 Func->genCode();
494 if (Func->hasError())
495 return;
496 Func->dump("After initial x86 codegen");
497
498 regAlloc(RAK_InfOnly);
499 if (Func->hasError())
500 return;
501 Func->dump("After regalloc of infinite-weight variables");
502
503 Func->genFrame();
504 if (Func->hasError())
505 return;
506 Func->dump("After stack frame mapping");
507 }
508
509 inline bool canRMW(const InstArithmetic *Arith) {
510 Type Ty = Arith->getDest()->getType();
511 // X86 vector instructions write to a register and have no RMW option.
512 if (isVectorType(Ty))
513 return false;
514 bool isI64 = Ty == IceType_i64;
515
516 switch (Arith->getOp()) {
517 // Not handled for lack of simple lowering:
518 // shift on i64
519 // mul, udiv, urem, sdiv, srem, frem
520 // Not handled for lack of RMW instructions:
521 // fadd, fsub, fmul, fdiv (also vector types)
522 default:
523 return false;
524 case InstArithmetic::Add:
525 case InstArithmetic::Sub:
526 case InstArithmetic::And:
527 case InstArithmetic::Or:
528 case InstArithmetic::Xor:
529 return true;
530 case InstArithmetic::Shl:
531 case InstArithmetic::Lshr:
532 case InstArithmetic::Ashr:
533 return false; // TODO(stichnot): implement
534 return !isI64;
535 }
536 }
537
538 bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
539 if (A == B)
540 return true;
541 if (auto *MemA = llvm::dyn_cast<X86OperandMem>(A)) {
542 if (auto *MemB = llvm::dyn_cast<X86OperandMem>(B)) {
543 return MemA->getBase() == MemB->getBase() &&
544 MemA->getOffset() == MemB->getOffset() &&
545 MemA->getIndex() == MemB->getIndex() &&
546 MemA->getShift() == MemB->getShift() &&
547 MemA->getSegmentRegister() == MemB->getSegmentRegister();
548 }
549 }
550 return false;
551 }
552
553 void TargetX8664::findRMW() {
554 TimerMarker _(TimerStack::TT_findRMW, Func);
555 Func->dump("Before RMW");
556 if (Func->isVerbose(IceV_RMW))
557 Func->getContext()->lockStr();
558 for (CfgNode *Node : Func->getNodes()) {
559 // Walk through the instructions, considering each sequence of 3
560 // instructions, and look for the particular RMW pattern. Note that this
561 // search can be "broken" (false negatives) if there are intervening
562 // deleted instructions, or intervening instructions that could be safely
563 // moved out of the way to reveal an RMW pattern.
564 auto E = Node->getInsts().end();
565 auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
566 for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
567 // Make I3 skip over deleted instructions.
568 while (I3 != E && I3->isDeleted())
569 ++I3;
570 if (I1 == E || I2 == E || I3 == E)
571 continue;
572 assert(!I1->isDeleted());
573 assert(!I2->isDeleted());
574 assert(!I3->isDeleted());
575 auto *Load = llvm::dyn_cast<InstLoad>(I1);
576 auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
577 auto *Store = llvm::dyn_cast<InstStore>(I3);
578 if (!Load || !Arith || !Store)
579 continue;
580 // Look for:
581 // a = Load addr
582 // b = <op> a, other
583 // Store b, addr
584 // Change to:
585 // a = Load addr
586 // b = <op> a, other
587 // x = FakeDef
588 // RMW <op>, addr, other, x
589 // b = Store b, addr, x
590 // Note that inferTwoAddress() makes sure setDestRedefined() gets called
591 // on the updated Store instruction, to avoid liveness problems later.
592 //
593 // With this transformation, the Store instruction acquires a Dest
594 // variable and is now subject to dead code elimination if there are no
595 // more uses of "b". Variable "x" is a beacon for determining whether the
596 // Store instruction gets dead-code eliminated. If the Store instruction
597 // is eliminated, then it must be the case that the RMW instruction ends
598 // x's live range, and therefore the RMW instruction will be retained and
599 // later lowered. On the other hand, if the RMW instruction does not end
600 // x's live range, then the Store instruction must still be present, and
601 // therefore the RMW instruction is ignored during lowering because it is
602 // redundant with the Store instruction.
603 //
604 // Note that if "a" has further uses, the RMW transformation may still
605 // trigger, resulting in two loads and one store, which is worse than the
606 // original one load and one store. However, this is probably rare, and
607 // caching probably keeps it just as fast.
608 if (!isSameMemAddressOperand(Load->getLoadAddress(),
609 Store->getStoreAddress()))
610 continue;
611 Operand *ArithSrcFromLoad = Arith->getSrc(0);
612 Operand *ArithSrcOther = Arith->getSrc(1);
613 if (ArithSrcFromLoad != Load->getDest()) {
614 if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
615 continue;
616 std::swap(ArithSrcFromLoad, ArithSrcOther);
617 }
618 if (Arith->getDest() != Store->getData())
619 continue;
620 if (!canRMW(Arith))
621 continue;
622 if (Func->isVerbose(IceV_RMW)) {
623 Ostream &Str = Func->getContext()->getStrDump();
624 Str << "Found RMW in " << Func->getFunctionName() << ":\n ";
625 Load->dump(Func);
626 Str << "\n ";
627 Arith->dump(Func);
628 Str << "\n ";
629 Store->dump(Func);
630 Str << "\n";
631 }
632 Variable *Beacon = Func->makeVariable(IceType_i32);
633 Beacon->setMustNotHaveReg();
634 Store->setRmwBeacon(Beacon);
635 auto *BeaconDef = InstFakeDef::create(Func, Beacon);
636 Node->getInsts().insert(I3, BeaconDef);
637 auto *RMW =
638 InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
639 Beacon, Arith->getOp());
640 Node->getInsts().insert(I3, RMW);
641 }
642 }
643 if (Func->isVerbose(IceV_RMW))
644 Func->getContext()->unlockStr();
645 }
646
647 /// Value is in bytes. Return Value adjusted to the next highest multiple of
648 /// the stack alignment.
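///
/// For example, assuming a 16-byte stack alignment, applyStackAlignment(20)
/// yields 32, and applyStackAlignment(32) stays 32.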
649 uint32_t TargetX8664::applyStackAlignment(uint32_t Value) {
650 return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
651 }
652
653 // Converts a ConstantInteger32 operand into its constant value, or
654 // MemoryOrderInvalid if the operand is not a ConstantInteger32.
655 inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
656 if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
657 return Integer->getValue();
658 return Intrinsics::MemoryOrderInvalid;
659 }
660
661 /// Determines whether the dest of a Load instruction can be folded into one of
662 /// the src operands of a 2-operand instruction. This is true as long as the
663 /// load dest matches exactly one of the binary instruction's src operands.
664 /// Replaces Src0 or Src1 with LoadSrc if the answer is true.
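///
/// Sketch of the intended effect (pseudo-IR):
///   a = load addr
///   c = add a, b        ; a's only use
/// is folded so the add reads straight from memory:
///   c = add [addr], b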
665 inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
666 Operand *&Src0, Operand *&Src1) {
667 if (Src0 == LoadDest && Src1 != LoadDest) {
668 Src0 = LoadSrc;
669 return true;
670 }
671 if (Src0 != LoadDest && Src1 == LoadDest) {
672 Src1 = LoadSrc;
673 return true;
674 }
675 return false;
676 }
677
678 void TargetX8664::doLoadOpt() {
679 TimerMarker _(TimerStack::TT_loadOpt, Func);
680 for (CfgNode *Node : Func->getNodes()) {
681 Context.init(Node);
682 while (!Context.atEnd()) {
683 Variable *LoadDest = nullptr;
684 Operand *LoadSrc = nullptr;
685 Inst *CurInst = iteratorToInst(Context.getCur());
686 Inst *Next = Context.getNextInst();
687 // Determine whether the current instruction is a Load instruction or
688 // equivalent.
689 if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
690 // An InstLoad qualifies unless it uses a 64-bit absolute address,
691 // which requires legalization to insert a copy to register.
692 // TODO(b/148272103): Fold these after legalization.
693 if (!llvm::isa<Constant>(Load->getLoadAddress())) {
694 LoadDest = Load->getDest();
695 constexpr bool DoLegalize = false;
696 LoadSrc = formMemoryOperand(Load->getLoadAddress(),
697 LoadDest->getType(), DoLegalize);
698 }
699 } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
700 // An AtomicLoad intrinsic qualifies as long as it has a valid memory
701 // ordering, and can be implemented in a single instruction (i.e., not
702 // i64 on x86-32).
703 Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
704 if (ID == Intrinsics::AtomicLoad &&
705 Intrinsics::isMemoryOrderValid(
706 ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
707 LoadDest = Intrin->getDest();
708 constexpr bool DoLegalize = false;
709 LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
710 DoLegalize);
711 }
712 }
713 // A Load instruction can be folded into the following instruction only
714 // if the following instruction ends the Load's Dest variable's live
715 // range.
716 if (LoadDest && Next && Next->isLastUse(LoadDest)) {
717 assert(LoadSrc);
718 Inst *NewInst = nullptr;
719 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
720 Operand *Src0 = Arith->getSrc(0);
721 Operand *Src1 = Arith->getSrc(1);
722 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
723 NewInst = InstArithmetic::create(Func, Arith->getOp(),
724 Arith->getDest(), Src0, Src1);
725 }
726 } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
727 Operand *Src0 = Icmp->getSrc(0);
728 Operand *Src1 = Icmp->getSrc(1);
729 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
730 NewInst = InstIcmp::create(Func, Icmp->getCondition(),
731 Icmp->getDest(), Src0, Src1);
732 }
733 } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
734 Operand *Src0 = Fcmp->getSrc(0);
735 Operand *Src1 = Fcmp->getSrc(1);
736 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
737 NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
738 Fcmp->getDest(), Src0, Src1);
739 }
740 } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
741 Operand *Src0 = Select->getTrueOperand();
742 Operand *Src1 = Select->getFalseOperand();
743 if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
744 NewInst = InstSelect::create(Func, Select->getDest(),
745 Select->getCondition(), Src0, Src1);
746 }
747 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
748 // The load dest can always be folded into a Cast instruction.
749 auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
750 if (Src0 == LoadDest) {
751 NewInst = InstCast::create(Func, Cast->getCastKind(),
752 Cast->getDest(), LoadSrc);
753 }
754 }
755 if (NewInst) {
756 CurInst->setDeleted();
757 Next->setDeleted();
758 Context.insert(NewInst);
759 // Update NewInst->LiveRangesEnded so that target lowering may
760 // benefit. Also update NewInst->HasSideEffects.
761 NewInst->spliceLivenessInfo(Next, CurInst);
762 }
763 }
764 Context.advanceCur();
765 Context.advanceNext();
766 }
767 }
768 Func->dump("After load optimization");
769 }
770
771 bool TargetX8664::doBranchOpt(Inst *I, const CfgNode *NextNode) {
772 if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
773 return Br->optimizeBranch(NextNode);
774 }
775 return false;
776 }
777
778 Variable *TargetX8664::getPhysicalRegister(RegNumT RegNum, Type Ty) {
779 if (Ty == IceType_void)
780 Ty = IceType_i32;
781 if (PhysicalRegisters[Ty].empty())
782 PhysicalRegisters[Ty].resize(RegX8664::Reg_NUM);
783 assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
784 Variable *Reg = PhysicalRegisters[Ty][RegNum];
785 if (Reg == nullptr) {
786 Reg = Func->makeVariable(Ty);
787 Reg->setRegNum(RegNum);
788 PhysicalRegisters[Ty][RegNum] = Reg;
789 // Specially mark a named physical register as an "argument" so that it is
790 // considered live upon function entry. Otherwise it's possible to get
791 // liveness validation errors for saving callee-save registers.
792 Func->addImplicitArg(Reg);
793 // Don't bother tracking the live range of a named physical register.
794 Reg->setIgnoreLiveness();
795 }
796 assert(RegX8664::getGprForType(Ty, RegNum) == RegNum);
797 return Reg;
798 }
799
800 const char *TargetX8664::getRegName(RegNumT RegNum, Type Ty) const {
801 return RegX8664::getRegName(RegX8664::getGprForType(Ty, RegNum));
802 }
803
804 void TargetX8664::emitVariable(const Variable *Var) const {
805 if (!BuildDefs::dump())
806 return;
807 Ostream &Str = Ctx->getStrEmit();
808 if (Var->hasReg()) {
809 Str << "%" << getRegName(Var->getRegNum(), Var->getType());
810 return;
811 }
812 if (Var->mustHaveReg()) {
813 llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
814 ") has no register assigned - function " +
815 Func->getFunctionName());
816 }
817 const int32_t Offset = Var->getStackOffset();
818 auto BaseRegNum = Var->getBaseRegNum();
819 if (BaseRegNum.hasNoValue())
820 BaseRegNum = getFrameOrStackReg();
821
822 // Print in the form "Offset(%reg)", omitting Offset when it is 0.
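// For illustration: a spill slot 8 bytes below the frame pointer is emitted
// as "-8(%rbp)", and a slot at offset 0 from the stack pointer as "(%rsp)".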
823 if (getFlags().getDecorateAsm()) {
824 Str << Var->getSymbolicStackOffset();
825 } else if (Offset != 0) {
826 Str << Offset;
827 }
828 const Type FrameSPTy = WordType;
829 Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
830 }
831
832 void TargetX8664::addProlog(CfgNode *Node) {
833 // Stack frame layout:
834 //
835 // +------------------------+ ^ +
836 // | 1. return address | |
837 // +------------------------+ v -
838 // | 2. preserved registers |
839 // +------------------------+ <--- BasePointer (if used)
840 // | 3. padding |
841 // +------------------------+
842 // | 4. global spill area |
843 // +------------------------+
844 // | 5. padding |
845 // +------------------------+
846 // | 6. local spill area |
847 // +------------------------+
848 // | 7. padding |
849 // +------------------------+
850 // | 7.5 shadow (WinX64) |
851 // +------------------------+
852 // | 8. allocas |
853 // +------------------------+
854 // | 9. padding |
855 // +------------------------+
856 // | 10. out args |
857 // +------------------------+ <--- StackPointer
858 //
859 // The following variables record the size in bytes of the given areas:
860 // * X86_RET_IP_SIZE_BYTES: area 1
861 // * PreservedRegsSizeBytes: area 2
862 // * SpillAreaPaddingBytes: area 3
863 // * GlobalsSize: area 4
864 // * LocalsSlotsPaddingBytes: area 5
865 // * GlobalsAndSubsequentPaddingSize: areas 4 - 5
866 // * LocalsSpillAreaSize: area 6
867 // * FixedAllocaSizeBytes: areas 7 - 8
868 // * SpillAreaSizeBytes: areas 3 - 10
869 // * maxOutArgsSizeBytes(): areas 9 - 10
870
871 // Determine stack frame offsets for each Variable without a register
872 // assignment. This can be done as one variable per stack slot. Or, do
873 // coalescing by running the register allocator again with an infinite set of
874 // registers (as a side effect, this gives variables a second chance at
875 // physical register assignment).
876 //
877 // A middle ground approach is to leverage sparsity and allocate one block of
878 // space on the frame for globals (variables with multi-block lifetime), and
879 // one block to share for locals (single-block lifetime).
880
881 const SizeT ShadowStoreSize = getShadowStoreSize();
882
883 // StackPointer: points just past return address of calling function
884
885 Context.init(Node);
886 Context.setInsertPoint(Context.getCur());
887
888 SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
889 RegsUsed = SmallBitVector(CalleeSaves.size());
890 VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
891 size_t GlobalsSize = 0;
892 // If there is a separate locals area, this represents that area. Otherwise
893 // it counts any variable not counted by GlobalsSize.
894 SpillAreaSizeBytes = 0;
895 // If there is a separate locals area, this specifies the alignment for it.
896 uint32_t LocalsSlotsAlignmentBytes = 0;
897 // The entire spill locations area gets aligned to largest natural alignment
898 // of the variables that have a spill slot.
899 uint32_t SpillAreaAlignmentBytes = 0;
900 // A spill slot linked to a variable with a stack slot should reuse that
901 // stack slot.
902 std::function<bool(Variable *)> TargetVarHook =
903 [&VariablesLinkedToSpillSlots](Variable *Var) {
904 // TODO(stichnot): Refactor this into the base class.
905 Variable *Root = Var->getLinkedToStackRoot();
906 if (Root != nullptr) {
907 assert(!Root->hasReg());
908 if (!Root->hasReg()) {
909 VariablesLinkedToSpillSlots.push_back(Var);
910 return true;
911 }
912 }
913 return false;
914 };
915
916 // Compute the list of spilled variables and bounds for GlobalsSize, etc.
917 getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
918 &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
919 &LocalsSlotsAlignmentBytes, TargetVarHook);
920 uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
921 SpillAreaSizeBytes += GlobalsSize;
922
923 // Add push instructions for preserved registers.
924 uint32_t NumCallee = 0;
925 size_t PreservedRegsSizeBytes = 0;
926 SmallBitVector Pushed(CalleeSaves.size());
927 for (RegNumT i : RegNumBVIter(CalleeSaves)) {
928 const auto Canonical = RegX8664::getBaseReg(i);
929 assert(Canonical == RegX8664::getBaseReg(Canonical));
930 if (RegsUsed[i]) {
931 Pushed[Canonical] = true;
932 }
933 }
934 for (RegNumT RegNum : RegNumBVIter(Pushed)) {
935 assert(RegNum == RegX8664::getBaseReg(RegNum));
936 ++NumCallee;
937 if (RegX8664::isXmm(RegNum)) {
938 PreservedRegsSizeBytes += 16;
939 } else {
940 PreservedRegsSizeBytes += typeWidthInBytes(WordType);
941 }
942 _push_reg(RegNum);
943 }
944 Ctx->statsUpdateRegistersSaved(NumCallee);
945
946 // StackPointer: points past preserved registers at start of spill area
947
948 // Generate "push frameptr; mov frameptr, stackptr"
949 if (IsEbpBasedFrame) {
950 assert(
951 (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
952 0);
953 PreservedRegsSizeBytes += typeWidthInBytes(WordType);
954 _link_bp();
955 }
956
957 // Align the variables area. SpillAreaPaddingBytes is the size of the region
958 // after the preserved registers and before the spill areas.
959 // LocalsSlotsPaddingBytes is the amount of padding between the globals and
960 // locals area if they are separate.
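// For example (illustrative numbers): if the return address plus pushed
// registers total 24 bytes and the spill area needs 16-byte alignment, then
// SpillAreaPaddingBytes comes out to 8.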
961 assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
962 uint32_t SpillAreaPaddingBytes = 0;
963 uint32_t LocalsSlotsPaddingBytes = 0;
964 alignStackSpillAreas(X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
965 SpillAreaAlignmentBytes, GlobalsSize,
966 LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
967 &LocalsSlotsPaddingBytes);
968 SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
969 uint32_t GlobalsAndSubsequentPaddingSize =
970 GlobalsSize + LocalsSlotsPaddingBytes;
971
972 // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
973 // fixed allocations in the prolog.
974 if (PrologEmitsFixedAllocas)
975 SpillAreaSizeBytes += FixedAllocaSizeBytes;
976
977 // Win64 ABI: add space for shadow store (aka home space)
978 SpillAreaSizeBytes += ShadowStoreSize;
979
980 // Entering the function has made the stack pointer unaligned. Re-align it by
981 // adjusting the stack size.
982 // Note that StackOffset does not include the spill area. It's the offset from
983 // the base stack pointer (ebp), whether we set it or not, to the first stack
984 // arg (if any). StackSize, on the other hand, does include the spill area.
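// Illustrative numbers: with only the return address above (StackOffset = 8),
// SpillAreaSizeBytes = 20, no out args, and a 16-byte RequiredStackAlignment,
// StackSize is rounded up to 32 and SpillAreaSizeBytes becomes 32 - 8 = 24.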
985 const uint32_t StackOffset =
986 ShadowStoreSize + X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
987 uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
988 RequiredStackAlignment);
989 StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
990 RequiredStackAlignment);
991 SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any
992
993 if (SpillAreaSizeBytes) {
994 auto *Func = Node->getCfg();
995 if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
996 Func->setError("Stack size limit exceeded");
997 }
998
999 emitStackProbe(SpillAreaSizeBytes);
1000
1001 // Generate "sub stackptr, SpillAreaSizeBytes"
1002 _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
1003 }
1004
1005 // StackPointer: points just past the spill area (end of stack frame)
1006
1007 // Account for known-frame-offset alloca instructions that were not already
1008 // combined into the prolog.
1009 if (!PrologEmitsFixedAllocas)
1010 SpillAreaSizeBytes += FixedAllocaSizeBytes;
1011
1012 Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
1013
1014 // Fill in stack offsets for stack args, and copy args into registers for
1015 // those that were register-allocated. Args are pushed right to left, so
1016 // Arg[0] is closest to the stack/frame pointer.
1017 RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
1018 Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, WordType);
1019 size_t BasicFrameOffset = StackOffset;
1020 if (!IsEbpBasedFrame)
1021 BasicFrameOffset += SpillAreaSizeBytes;
1022
1023 const VarList &Args = Func->getArgs();
1024 size_t InArgsSizeBytes = 0;
1025 unsigned NumXmmArgs = 0;
1026 unsigned NumGPRArgs = 0;
1027 for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
1028 Variable *Arg = Args[i];
1029 // Skip arguments passed in registers.
1030 if (isVectorType(Arg->getType())) {
1031 if (RegX8664::getRegisterForXmmArgNum(
1032 RegX8664::getArgIndex(i, NumXmmArgs))
1033 .hasValue()) {
1034 ++NumXmmArgs;
1035 continue;
1036 }
1037 } else if (isScalarFloatingType(Arg->getType())) {
1038 if (RegX8664::getRegisterForXmmArgNum(
1039 RegX8664::getArgIndex(i, NumXmmArgs))
1040 .hasValue()) {
1041 ++NumXmmArgs;
1042 continue;
1043 }
1044 } else {
1045 assert(isScalarIntegerType(Arg->getType()));
1046 if (RegX8664::getRegisterForGprArgNum(
1047 WordType, RegX8664::getArgIndex(i, NumGPRArgs))
1048 .hasValue()) {
1049 ++NumGPRArgs;
1050 continue;
1051 }
1052 }
1053 // For esp-based frames where the allocas are done outside the prolog, the
1054 // esp value may not stabilize to its home value until after all the
1055 // fixed-size alloca instructions have executed. In this case, a stack
1056 // adjustment is needed when accessing in-args in order to copy them into
1057 // registers.
1058 size_t StackAdjBytes = 0;
1059 if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
1060 StackAdjBytes -= FixedAllocaSizeBytes;
1061 finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
1062 InArgsSizeBytes);
1063 }
1064
1065 // Fill in stack offsets for locals.
1066 assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
1067 SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
1068 IsEbpBasedFrame && !needsStackPointerAlignment());
1069 // Assign stack offsets to variables that have been linked to spilled
1070 // variables.
1071 for (Variable *Var : VariablesLinkedToSpillSlots) {
1072 const Variable *Root = Var->getLinkedToStackRoot();
1073 assert(Root != nullptr);
1074 Var->setStackOffset(Root->getStackOffset());
1075
1076 // If the stack root variable is an arg, make this variable an arg too so
1077 // that stackVarToAsmAddress uses the correct base pointer (e.g. ebp on
1078 // x86).
1079 Var->setIsArg(Root->getIsArg());
1080 }
1081 this->HasComputedFrame = true;
1082
1083 if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
1084 OstreamLocker L(Func->getContext());
1085 Ostream &Str = Func->getContext()->getStrDump();
1086
1087 Str << "Stack layout:\n";
1088 uint32_t EspAdjustmentPaddingSize =
1089 SpillAreaSizeBytes - LocalsSpillAreaSize -
1090 GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
1091 maxOutArgsSizeBytes();
1092 Str << " in-args = " << InArgsSizeBytes << " bytes\n"
1093 << " return address = " << X86_RET_IP_SIZE_BYTES << " bytes\n"
1094 << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
1095 << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
1096 << " globals spill area = " << GlobalsSize << " bytes\n"
1097 << " globals-locals spill areas intermediate padding = "
1098 << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
1099 << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
1100 << " esp alignment padding = " << EspAdjustmentPaddingSize
1101 << " bytes\n";
1102
1103 Str << "Stack details:\n"
1104 << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
1105 << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
1106 << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
1107 << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
1108 << " bytes\n"
1109 << " is ebp based = " << IsEbpBasedFrame << "\n";
1110 }
1111 }
1112
1113 /// Helper function for addProlog().
1114 ///
1115 /// This assumes Arg is an argument passed on the stack. This sets the frame
1116 /// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
1117 /// I64 arg that has been split into Lo and Hi components, it calls itself
1118 /// recursively on the components, taking care to handle Lo first because of the
1119 /// little-endian architecture. Lastly, this function generates an instruction
1120 /// to copy Arg into its assigned register if applicable.
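///
/// Hedged example: with BasicFrameOffset = 16 and InArgsSizeBytes already at
/// 8, a stack-passed i32 Arg is assigned stack offset 24; if that Arg also has
/// a register, a mov from 24(%rbp) (or 24(%rsp)) into the register is emitted.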
1121
1122 void TargetX8664::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
1123 size_t BasicFrameOffset,
1124 size_t StackAdjBytes,
1125 size_t &InArgsSizeBytes) {
1126 Type Ty = Arg->getType();
1127 if (isVectorType(Ty)) {
1128 InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
1129 }
1130 Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
1131 InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
1132 if (Arg->hasReg()) {
1133 auto *Mem = X86OperandMem::create(
1134 Func, Ty, FramePtr,
1135 Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
1136 if (isVectorType(Arg->getType())) {
1137 _movp(Arg, Mem);
1138 } else {
1139 _mov(Arg, Mem);
1140 }
1141 // This argument-copying instruction uses an explicit X86OperandMem
1142 // operand instead of a Variable, so its fill-from-stack operation has to
1143 // be tracked separately for statistics.
1144 Ctx->statsUpdateFills();
1145 }
1146 }
1147
1148 void TargetX8664::addEpilog(CfgNode *Node) {
1149 InstList &Insts = Node->getInsts();
1150 InstList::reverse_iterator RI, E;
1151 for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
1152 if (llvm::isa<Insts::Ret>(*RI))
1153 break;
1154 }
1155 if (RI == E)
1156 return;
1157
1158 // Convert the reverse_iterator position into its corresponding (forward)
1159 // iterator position.
1160 InstList::iterator InsertPoint = reverseToForwardIterator(RI);
1161 --InsertPoint;
1162 Context.init(Node);
1163 Context.setInsertPoint(InsertPoint);
1164
1165 if (IsEbpBasedFrame) {
1166 _unlink_bp();
1167 } else {
1168 // add stackptr, SpillAreaSizeBytes
1169 if (SpillAreaSizeBytes != 0) {
1170 _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
1171 }
1172 }
1173
1174 // Add pop instructions for preserved registers.
1175 SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
1176 SmallBitVector Popped(CalleeSaves.size());
1177 for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
1178 const auto RegNum = RegNumT::fromInt(i);
1179 if (RegNum == getFrameReg() && IsEbpBasedFrame)
1180 continue;
1181 const RegNumT Canonical = RegX8664::getBaseReg(RegNum);
1182 if (CalleeSaves[i] && RegsUsed[i]) {
1183 Popped[Canonical] = true;
1184 }
1185 }
1186 for (int32_t i = Popped.size() - 1; i >= 0; --i) {
1187 if (!Popped[i])
1188 continue;
1189 const auto RegNum = RegNumT::fromInt(i);
1190 assert(RegNum == RegX8664::getBaseReg(RegNum));
1191 _pop_reg(RegNum);
1192 }
1193 }
1194
1195 Type TargetX8664::stackSlotType() { return WordType; }
1196
1197 SmallBitVector TargetX8664::getRegisterSet(RegSetMask Include,
1198 RegSetMask Exclude) const {
1199 return RegX8664::getRegisterSet(getFlags(), Include, Exclude);
1200 }
1201
1202 void TargetX8664::lowerAlloca(const InstAlloca *Instr) {
1203 // For default align=0, set it to the real value 1, to avoid any
1204 // bit-manipulation problems below.
1205 const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());
1206
1207 // LLVM enforces power of 2 alignment.
1208 assert(llvm::isPowerOf2_32(AlignmentParam));
1209
1210 const uint32_t Alignment = std::max(AlignmentParam, RequiredStackAlignment);
1211 const bool OverAligned = Alignment > RequiredStackAlignment;
1212 const bool OptM1 = Func->getOptLevel() == Opt_m1;
1213 const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
1214 const bool UseFramePointer =
1215 hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;
1216
1217 if (UseFramePointer)
1218 setHasFramePointer();
1219
1220 Variable *esp = getPhysicalRegister(getStackReg(), WordType);
1221 if (OverAligned) {
1222 _and(esp, Ctx->getConstantInt32(-Alignment));
1223 }
1224
1225 Variable *Dest = Instr->getDest();
1226 Operand *TotalSize = legalize(Instr->getSizeInBytes());
1227
1228 if (const auto *ConstantTotalSize =
1229 llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
1230 const uint32_t Value =
1231 Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
1232 if (UseFramePointer) {
1233 _sub_sp(Ctx->getConstantInt32(Value));
1234 } else {
1235 // If we don't need a Frame Pointer, this alloca has a known offset to the
1236 // stack pointer. We don't need to adjust the stack pointer, nor assign any
1237 // value to Dest, as Dest is rematerializable.
1238 assert(Dest->isRematerializable());
1239 FixedAllocaSizeBytes += Value;
1240 Context.insert<InstFakeDef>(Dest);
1241 }
1242 } else {
1243 // Non-constant sizes need to be adjusted to the next highest multiple of
1244 // the required alignment at runtime.
1245 Variable *T = nullptr;
1246 if (TotalSize->getType() != IceType_i64) {
1247 T = makeReg(IceType_i64);
1248 _movzx(T, TotalSize);
1249 } else {
1250 T = makeReg(IceType_i32);
1251 _mov(T, TotalSize);
1252 }
1253 _add(T, Ctx->getConstantInt32(Alignment - 1));
1254 _and(T, Ctx->getConstantInt32(-Alignment));
1255 _sub_sp(T);
1256 }
1257 // Add enough to the returned address to account for the out args area.
1258 uint32_t OutArgsSize = maxOutArgsSizeBytes();
1259 if (OutArgsSize > 0) {
1260 Variable *T = makeReg(Dest->getType());
1261 auto *CalculateOperand = X86OperandMem::create(
1262 Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
1263 _lea(T, CalculateOperand);
1264 _mov(Dest, T);
1265 } else {
1266 _mov(Dest, esp);
1267 }
1268 }
1269
1270 void TargetX8664::lowerArguments() {
1271 const bool OptM1 = Func->getOptLevel() == Opt_m1;
1272 VarList &Args = Func->getArgs();
1273 unsigned NumXmmArgs = 0;
1274 bool XmmSlotsRemain = true;
1275 unsigned NumGprArgs = 0;
1276 bool GprSlotsRemain = true;
1277
1278 Context.init(Func->getEntryNode());
1279 Context.setInsertPoint(Context.getCur());
1280
1281 for (SizeT i = 0, End = Args.size();
1282 i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
1283 Variable *Arg = Args[i];
1284 Type Ty = Arg->getType();
1285 Variable *RegisterArg = nullptr;
1286 RegNumT RegNum;
1287 if (isVectorType(Ty)) {
1288 RegNum = RegX8664::getRegisterForXmmArgNum(
1289 RegX8664::getArgIndex(i, NumXmmArgs));
1290 if (RegNum.hasNoValue()) {
1291 XmmSlotsRemain = false;
1292 continue;
1293 }
1294 ++NumXmmArgs;
1295 RegisterArg = Func->makeVariable(Ty);
1296 } else if (isScalarFloatingType(Ty)) {
1297 RegNum = RegX8664::getRegisterForXmmArgNum(
1298 RegX8664::getArgIndex(i, NumXmmArgs));
1299 if (RegNum.hasNoValue()) {
1300 XmmSlotsRemain = false;
1301 continue;
1302 }
1303 ++NumXmmArgs;
1304 RegisterArg = Func->makeVariable(Ty);
1305 } else if (isScalarIntegerType(Ty)) {
1306 RegNum = RegX8664::getRegisterForGprArgNum(
1307 Ty, RegX8664::getArgIndex(i, NumGprArgs));
1308 if (RegNum.hasNoValue()) {
1309 GprSlotsRemain = false;
1310 continue;
1311 }
1312 ++NumGprArgs;
1313 RegisterArg = Func->makeVariable(Ty);
1314 }
1315 assert(RegNum.hasValue());
1316 assert(RegisterArg != nullptr);
1317 // Replace Arg in the argument list with the home register. Then generate
1318 // an instruction in the prolog to copy the home register to the assigned
1319 // location of Arg.
1320 if (BuildDefs::dump())
1321 RegisterArg->setName(Func, "home_reg:" + Arg->getName());
1322 RegisterArg->setRegNum(RegNum);
1323 RegisterArg->setIsArg();
1324 Arg->setIsArg(false);
1325
1326 Args[i] = RegisterArg;
1327 // When not Om1, do the assignment through a temporary, instead of directly
1328 // from the pre-colored variable, so that a subsequent availabilityGet()
1329 // call has a chance to work. (In Om1, don't bother creating extra
1330 // instructions with extra variables to register-allocate.)
1331 if (OptM1) {
1332 Context.insert<InstAssign>(Arg, RegisterArg);
1333 } else {
1334 Variable *Tmp = makeReg(RegisterArg->getType());
1335 Context.insert<InstAssign>(Tmp, RegisterArg);
1336 Context.insert<InstAssign>(Arg, Tmp);
1337 }
1338 }
1339 if (!OptM1)
1340 Context.availabilityUpdate();
1341 }
1342
1343 /// Strength-reduce scalar integer multiplication by a constant (for i32 or
1344 /// narrower) for certain constants. The lea instruction can be used to multiply
1345 /// by 3, 5, or 9, and the lsh instruction can be used to multiply by powers of
1346 /// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
1347 /// lea-based multiplies by 5, combined with left-shifting by 2.
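///
/// A sketch of the multiply-by-100 case mentioned above (T holds Src0):
///   lea T, [T + T*4]   ; T *= 5
///   lea T, [T + T*4]   ; T *= 5
///   shl T, 2           ; T *= 4, giving Src0 * 100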
1348
1349 bool TargetX8664::optimizeScalarMul(Variable *Dest, Operand *Src0,
1350 int32_t Src1) {
1351 // Disable this optimization for Om1 and O0, just to keep things simple
1352 // there.
1353 if (Func->getOptLevel() < Opt_1)
1354 return false;
1355 Type Ty = Dest->getType();
1356 if (Src1 == -1) {
1357 Variable *T = nullptr;
1358 _mov(T, Src0);
1359 _neg(T);
1360 _mov(Dest, T);
1361 return true;
1362 }
1363 if (Src1 == 0) {
1364 _mov(Dest, Ctx->getConstantZero(Ty));
1365 return true;
1366 }
1367 if (Src1 == 1) {
1368 Variable *T = nullptr;
1369 _mov(T, Src0);
1370 _mov(Dest, T);
1371 return true;
1372 }
1373 // Don't bother with the edge case where Src1 == MININT.
1374 if (Src1 == -Src1)
1375 return false;
1376 const bool Src1IsNegative = Src1 < 0;
1377 if (Src1IsNegative)
1378 Src1 = -Src1;
1379 uint32_t Count9 = 0;
1380 uint32_t Count5 = 0;
1381 uint32_t Count3 = 0;
1382 uint32_t Count2 = 0;
1383 uint32_t CountOps = 0;
1384 while (Src1 > 1) {
1385 if (Src1 % 9 == 0) {
1386 ++CountOps;
1387 ++Count9;
1388 Src1 /= 9;
1389 } else if (Src1 % 5 == 0) {
1390 ++CountOps;
1391 ++Count5;
1392 Src1 /= 5;
1393 } else if (Src1 % 3 == 0) {
1394 ++CountOps;
1395 ++Count3;
1396 Src1 /= 3;
1397 } else if (Src1 % 2 == 0) {
1398 if (Count2 == 0)
1399 ++CountOps;
1400 ++Count2;
1401 Src1 /= 2;
1402 } else {
1403 return false;
1404 }
1405 }
1406 // Lea optimization only works for i32 and i64 types here, not i8 or i16.
1407 if (Ty != IceType_i32 && Ty != IceType_i64 && (Count3 || Count5 || Count9))
1408 return false;
1409 // Limit the number of lea/shl operations for a single multiply, to a
1410 // somewhat arbitrary choice of 3.
1411 constexpr uint32_t MaxOpsForOptimizedMul = 3;
1412 if (CountOps > MaxOpsForOptimizedMul)
1413 return false;
1414 Variable *T = makeReg(WordType);
1415 if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
1416 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
1417 _movzx(T, Src0RM);
1418 } else {
1419 _mov(T, Src0);
1420 }
1421 Constant *Zero = Ctx->getConstantZero(IceType_i32);
1422 for (uint32_t i = 0; i < Count9; ++i) {
1423 constexpr uint16_t Shift = 3; // log2(9-1)
1424 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1425 }
1426 for (uint32_t i = 0; i < Count5; ++i) {
1427 constexpr uint16_t Shift = 2; // log2(5-1)
1428 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1429 }
1430 for (uint32_t i = 0; i < Count3; ++i) {
1431 constexpr uint16_t Shift = 1; // log2(3-1)
1432 _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1433 }
1434 if (Count2) {
1435 _shl(T, Ctx->getConstantInt(Ty, Count2));
1436 }
1437 if (Src1IsNegative)
1438 _neg(T);
1439 _mov(Dest, T);
1440 return true;
1441 }
1442
1443 void TargetX8664::lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo,
1444 Operand *Src0Hi, Operand *Src1Lo,
1445 Variable *DestLo, Variable *DestHi) {
1446 // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
1447 Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1448 Constant *Zero = Ctx->getConstantZero(IceType_i32);
1449 Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1450 if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
1451 uint32_t ShiftAmount = ConstantShiftAmount->getValue();
1452 if (ShiftAmount > 32) {
1453 Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
1454 switch (Op) {
1455 default:
1456 assert(0 && "non-shift op");
1457 break;
1458 case InstArithmetic::Shl: {
1459 // a=b<<c ==>
1460 // t2 = b.lo
1461 // t2 = shl t2, ShiftAmount-32
1462 // t3 = t2
1463 // t2 = 0
1464 _mov(T_2, Src0Lo);
1465 _shl(T_2, ReducedShift);
1466 _mov(DestHi, T_2);
1467 _mov(DestLo, Zero);
1468 } break;
1469 case InstArithmetic::Lshr: {
1470 // a=b>>c (unsigned) ==>
1471 // t2 = b.hi
1472 // t2 = shr t2, ShiftAmount-32
1473 // a.lo = t2
1474 // a.hi = 0
1475 _mov(T_2, Src0Hi);
1476 _shr(T_2, ReducedShift);
1477 _mov(DestLo, T_2);
1478 _mov(DestHi, Zero);
1479 } break;
1480 case InstArithmetic::Ashr: {
1481 // a=b>>c (signed) ==>
1482 // t3 = b.hi
1483 // t3 = sar t3, 0x1f
1484 // t2 = b.hi
1485 // t2 = shrd t2, t3, ShiftAmount-32
1486 // a.lo = t2
1487 // a.hi = t3
1488 _mov(T_3, Src0Hi);
1489 _sar(T_3, SignExtend);
1490 _mov(T_2, Src0Hi);
1491 _shrd(T_2, T_3, ReducedShift);
1492 _mov(DestLo, T_2);
1493 _mov(DestHi, T_3);
1494 } break;
1495 }
1496 } else if (ShiftAmount == 32) {
1497 switch (Op) {
1498 default:
1499 assert(0 && "non-shift op");
1500 break;
1501 case InstArithmetic::Shl: {
1502 // a=b<<c ==>
1503 // t2 = b.lo
1504 // a.hi = t2
1505 // a.lo = 0
1506 _mov(T_2, Src0Lo);
1507 _mov(DestHi, T_2);
1508 _mov(DestLo, Zero);
1509 } break;
1510 case InstArithmetic::Lshr: {
1511 // a=b>>c (unsigned) ==>
1512 // t2 = b.hi
1513 // a.lo = t2
1514 // a.hi = 0
1515 _mov(T_2, Src0Hi);
1516 _mov(DestLo, T_2);
1517 _mov(DestHi, Zero);
1518 } break;
1519 case InstArithmetic::Ashr: {
1520 // a=b>>c (signed) ==>
1521 // t2 = b.hi
1522 // a.lo = t2
1523 // t3 = b.hi
1524 // t3 = sar t3, 0x1f
1525 // a.hi = t3
1526 _mov(T_2, Src0Hi);
1527 _mov(DestLo, T_2);
1528 _mov(T_3, Src0Hi);
1529 _sar(T_3, SignExtend);
1530 _mov(DestHi, T_3);
1531 } break;
1532 }
1533 } else {
1534 // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1535 // t2 = b.lo
1536 // t3 = b.hi
1537 _mov(T_2, Src0Lo);
1538 _mov(T_3, Src0Hi);
1539 switch (Op) {
1540 default:
1541 assert(0 && "non-shift op");
1542 break;
1543 case InstArithmetic::Shl: {
1544 // a=b<<c ==>
1545 // t3 = shld t3, t2, ShiftAmount
1546 // t2 = shl t2, ShiftAmount
1547 _shld(T_3, T_2, ConstantShiftAmount);
1548 _shl(T_2, ConstantShiftAmount);
1549 } break;
1550 case InstArithmetic::Lshr: {
1551 // a=b>>c (unsigned) ==>
1552 // t2 = shrd t2, t3, ShiftAmount
1553 // t3 = shr t3, ShiftAmount
1554 _shrd(T_2, T_3, ConstantShiftAmount);
1555 _shr(T_3, ConstantShiftAmount);
1556 } break;
1557 case InstArithmetic::Ashr: {
1558 // a=b>>c (signed) ==>
1559 // t2 = shrd t2, t3, ShiftAmount
1560 // t3 = sar t3, ShiftAmount
1561 _shrd(T_2, T_3, ConstantShiftAmount);
1562 _sar(T_3, ConstantShiftAmount);
1563 } break;
1564 }
1565 // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1566 // a.lo = t2
1567 // a.hi = t3
1568 _mov(DestLo, T_2);
1569 _mov(DestHi, T_3);
1570 }
1571 } else {
1572 // NON-CONSTANT CASES.
1573 Constant *BitTest = Ctx->getConstantInt32(0x20);
1574 InstX86Label *Label = InstX86Label::create(Func, this);
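    // Note: the hardware masks a 32-bit shift count to its low 5 bits, so for
    // counts in [32, 63] the shld/shrd + shl/shr/sar pair below produces the
    // wrong halves on its own; the "test t1, 0x20" and the conditional fixup
    // that follows handle exactly that case.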
1575 // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1576 // t1:ecx = c.lo & 0xff
1577 // t2 = b.lo
1578 // t3 = b.hi
1579 T_1 = copyToReg8(Src1Lo, RegX8664::Reg_cl);
1580 _mov(T_2, Src0Lo);
1581 _mov(T_3, Src0Hi);
1582 switch (Op) {
1583 default:
1584 assert(0 && "non-shift op");
1585 break;
1586 case InstArithmetic::Shl: {
1587 // a=b<<c ==>
1588 // t3 = shld t3, t2, t1
1589 // t2 = shl t2, t1
1590 // test t1, 0x20
1591 // je L1
1592 // use(t3)
1593 // t3 = t2
1594 // t2 = 0
1595 _shld(T_3, T_2, T_1);
1596 _shl(T_2, T_1);
1597 _test(T_1, BitTest);
1598 _br(CondX86::Br_e, Label);
1599 // T_2 and T_3 are being assigned again because of the intra-block control
1600 // flow, so we need to use _redefined to avoid liveness problems.
1601 _redefined(_mov(T_3, T_2));
1602 _redefined(_mov(T_2, Zero));
1603 } break;
1604 case InstArithmetic::Lshr: {
1605 // a=b>>c (unsigned) ==>
1606 // t2 = shrd t2, t3, t1
1607 // t3 = shr t3, t1
1608 // test t1, 0x20
1609 // je L1
1610 // use(t2)
1611 // t2 = t3
1612 // t3 = 0
1613 _shrd(T_2, T_3, T_1);
1614 _shr(T_3, T_1);
1615 _test(T_1, BitTest);
1616 _br(CondX86::Br_e, Label);
1617 // T_2 and T_3 are being assigned again because of the intra-block control
1618 // flow, so we need to use _redefined to avoid liveness problems.
1619 _redefined(_mov(T_2, T_3));
1620 _redefined(_mov(T_3, Zero));
1621 } break;
1622 case InstArithmetic::Ashr: {
1623 // a=b>>c (signed) ==>
1624 // t2 = shrd t2, t3, t1
1625 // t3 = sar t3, t1
1626 // test t1, 0x20
1627 // je L1
1628 // use(t2)
1629 // t2 = t3
1630 // t3 = sar t3, 0x1f
1631 Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1632 _shrd(T_2, T_3, T_1);
1633 _sar(T_3, T_1);
1634 _test(T_1, BitTest);
1635 _br(CondX86::Br_e, Label);
1636 // T_2 and T_3 are being assigned again because of the intra-block control
1637 // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
1638 // doesn't need special treatment because it is reassigned via _sar
1639 // instead of _mov.
1640 _redefined(_mov(T_2, T_3));
1641 _sar(T_3, SignExtend);
1642 } break;
1643 }
1644 // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1645 // L1:
1646 // a.lo = t2
1647 // a.hi = t3
1648 Context.insert(Label);
1649 _mov(DestLo, T_2);
1650 _mov(DestHi, T_3);
1651 }
1652 }
1653
1654 void TargetX8664::lowerArithmetic(const InstArithmetic *Instr) {
1655 Variable *Dest = Instr->getDest();
1656 if (Dest->isRematerializable()) {
1657 Context.insert<InstFakeDef>(Dest);
1658 return;
1659 }
1660 Type Ty = Dest->getType();
1661 Operand *Src0 = legalize(Instr->getSrc(0));
1662 Operand *Src1 = legalize(Instr->getSrc(1));
1663 if (Instr->isCommutative()) {
1664 uint32_t SwapCount = 0;
1665 if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
1666 std::swap(Src0, Src1);
1667 ++SwapCount;
1668 }
1669 if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
1670 std::swap(Src0, Src1);
1671 ++SwapCount;
1672 }
1673 // Improve two-address code patterns by avoiding a copy to the dest
1674 // register when one of the source operands ends its lifetime here.
1675 if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
1676 std::swap(Src0, Src1);
1677 ++SwapCount;
1678 }
1679 assert(SwapCount <= 1);
1680 (void)SwapCount;
1681 }
1682 if (isVectorType(Ty)) {
1683 // TODO: Trap on integer divide and integer modulo by zero. See:
1684 // https://code.google.com/p/nativeclient/issues/detail?id=3899
1685 if (llvm::isa<X86OperandMem>(Src1))
1686 Src1 = legalizeToReg(Src1);
1687 switch (Instr->getOp()) {
1688 case InstArithmetic::_num:
1689 llvm_unreachable("Unknown arithmetic operator");
1690 break;
1691 case InstArithmetic::Add: {
1692 Variable *T = makeReg(Ty);
1693 _movp(T, Src0);
1694 _padd(T, Src1);
1695 _movp(Dest, T);
1696 } break;
1697 case InstArithmetic::And: {
1698 Variable *T = makeReg(Ty);
1699 _movp(T, Src0);
1700 _pand(T, Src1);
1701 _movp(Dest, T);
1702 } break;
1703 case InstArithmetic::Or: {
1704 Variable *T = makeReg(Ty);
1705 _movp(T, Src0);
1706 _por(T, Src1);
1707 _movp(Dest, T);
1708 } break;
1709 case InstArithmetic::Xor: {
1710 Variable *T = makeReg(Ty);
1711 _movp(T, Src0);
1712 _pxor(T, Src1);
1713 _movp(Dest, T);
1714 } break;
1715 case InstArithmetic::Sub: {
1716 Variable *T = makeReg(Ty);
1717 _movp(T, Src0);
1718 _psub(T, Src1);
1719 _movp(Dest, T);
1720 } break;
1721 case InstArithmetic::Mul: {
1722 bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
1723 bool InstructionSetIsValidForPmull =
1724 Ty == IceType_v8i16 || InstructionSet >= SSE4_1;
1725 if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
1726 Variable *T = makeReg(Ty);
1727 _movp(T, Src0);
1728 _pmull(T, Src0 == Src1 ? T : Src1);
1729 _movp(Dest, T);
1730 } else if (Ty == IceType_v4i32) {
1731 // Lowering sequence:
1732 // Note: The mask arguments have index 0 on the left.
1733 //
1734 // movups T1, Src0
1735 // pshufd T2, Src0, {1,0,3,0}
1736 // pshufd T3, Src1, {1,0,3,0}
1737 // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
1738 // pmuludq T1, Src1
1739 // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
1740 // pmuludq T2, T3
1741 // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
1742 // shufps T1, T2, {0,2,0,2}
1743 // pshufd T4, T1, {0,2,1,3}
1744 // movups Dest, T4
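        //
        // pmuludq multiplies only the even-numbered doubleword lanes (0 and 2),
        // producing 64-bit products, which is why two pmuludq ops plus the
        // shuffles above are needed to collect all four low 32-bit products. A
        // single pmulld would suffice, but it requires SSE4.1 and is handled by
        // the pmull path above.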
1745
1746 // Mask that directs pshufd to create a vector with entries
1747 // Src[1, 0, 3, 0]
1748 constexpr unsigned Constant1030 = 0x31;
1749 Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
1750 // Mask that directs shufps to create a vector with entries
1751 // Dest[0, 2], Src[0, 2]
1752 constexpr unsigned Mask0202 = 0x88;
1753 // Mask that directs pshufd to create a vector with entries
1754 // Src[0, 2, 1, 3]
1755 constexpr unsigned Mask0213 = 0xd8;
1756 Variable *T1 = makeReg(IceType_v4i32);
1757 Variable *T2 = makeReg(IceType_v4i32);
1758 Variable *T3 = makeReg(IceType_v4i32);
1759 Variable *T4 = makeReg(IceType_v4i32);
1760 _movp(T1, Src0);
1761 _pshufd(T2, Src0, Mask1030);
1762 _pshufd(T3, Src1, Mask1030);
1763 _pmuludq(T1, Src1);
1764 _pmuludq(T2, T3);
1765 _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
1766 _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
1767 _movp(Dest, T4);
1768 } else if (Ty == IceType_v16i8) {
1769 llvm::report_fatal_error("Scalarized operation was expected");
1770 } else {
1771 llvm::report_fatal_error("Invalid vector multiply type");
1772 }
1773 } break;
1774 case InstArithmetic::Shl: {
1775 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1776 Variable *T = makeReg(Ty);
1777 _movp(T, Src0);
1778 _psll(T, Src1);
1779 _movp(Dest, T);
1780 } break;
1781 case InstArithmetic::Lshr: {
1782 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1783 Variable *T = makeReg(Ty);
1784 _movp(T, Src0);
1785 _psrl(T, Src1);
1786 _movp(Dest, T);
1787 } break;
1788 case InstArithmetic::Ashr: {
1789 assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1790 Variable *T = makeReg(Ty);
1791 _movp(T, Src0);
1792 _psra(T, Src1);
1793 _movp(Dest, T);
1794 } break;
1795 case InstArithmetic::Udiv:
1796 case InstArithmetic::Urem:
1797 case InstArithmetic::Sdiv:
1798 case InstArithmetic::Srem:
1799 llvm::report_fatal_error("Scalarized operation was expected");
1800 break;
1801 case InstArithmetic::Fadd: {
1802 Variable *T = makeReg(Ty);
1803 _movp(T, Src0);
1804 _addps(T, Src1);
1805 _movp(Dest, T);
1806 } break;
1807 case InstArithmetic::Fsub: {
1808 Variable *T = makeReg(Ty);
1809 _movp(T, Src0);
1810 _subps(T, Src1);
1811 _movp(Dest, T);
1812 } break;
1813 case InstArithmetic::Fmul: {
1814 Variable *T = makeReg(Ty);
1815 _movp(T, Src0);
1816 _mulps(T, Src0 == Src1 ? T : Src1);
1817 _movp(Dest, T);
1818 } break;
1819 case InstArithmetic::Fdiv: {
1820 Variable *T = makeReg(Ty);
1821 _movp(T, Src0);
1822 _divps(T, Src1);
1823 _movp(Dest, T);
1824 } break;
1825 case InstArithmetic::Frem:
1826 llvm::report_fatal_error("Scalarized operation was expected");
1827 break;
1828 }
1829 return;
1830 }
1831 Variable *T_edx = nullptr;
1832 Variable *T = nullptr;
1833 switch (Instr->getOp()) {
1834 case InstArithmetic::_num:
1835 llvm_unreachable("Unknown arithmetic operator");
1836 break;
1837 case InstArithmetic::Add: {
1838 const bool ValidType = Ty == IceType_i32 || Ty == IceType_i64;
1839 auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
1840 const bool ValidKind =
1841 Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
1842 llvm::isa<ConstantRelocatable>(Const));
1843 if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
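      // lea folds the add into a single instruction that writes a fresh
      // destination register and, unlike add, does not clobber EFLAGS.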
1844 auto *Var = legalizeToReg(Src0);
1845 auto *Mem = X86OperandMem::create(Func, IceType_void, Var, Const);
1846 T = makeReg(Ty);
1847 _lea(T, Mem);
1848 _mov(Dest, T);
1849 break;
1850 }
1851 _mov(T, Src0);
1852 _add(T, Src1);
1853 _mov(Dest, T);
1854 } break;
1855 case InstArithmetic::And:
1856 _mov(T, Src0);
1857 _and(T, Src1);
1858 _mov(Dest, T);
1859 break;
1860 case InstArithmetic::Or:
1861 _mov(T, Src0);
1862 _or(T, Src1);
1863 _mov(Dest, T);
1864 break;
1865 case InstArithmetic::Xor:
1866 _mov(T, Src0);
1867 _xor(T, Src1);
1868 _mov(Dest, T);
1869 break;
1870 case InstArithmetic::Sub:
1871 _mov(T, Src0);
1872 _sub(T, Src1);
1873 _mov(Dest, T);
1874 break;
1875 case InstArithmetic::Mul:
1876 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1877 if (optimizeScalarMul(Dest, Src0, C->getValue()))
1878 return;
1879 }
1880 // The 8-bit version of imul only allows the form "imul r/m8" where T must
1881 // be in al.
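    // (The one-operand "imul r/m8" form multiplies al by the operand and leaves
    // the 16-bit product in ax; only the low byte, al, is used as the i8
    // result.)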
1882 if (isByteSizedArithType(Ty)) {
1883 _mov(T, Src0, RegX8664::Reg_al);
1884 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1885 _imul(T, Src0 == Src1 ? T : Src1);
1886 _mov(Dest, T);
1887 } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1888 T = makeReg(Ty);
1889 Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
1890 _imul_imm(T, Src0, ImmConst);
1891 _mov(Dest, T);
1892 } else {
1893 _mov(T, Src0);
1894 // No need to legalize Src1 to Reg | Mem because the Imm case is handled
1895 // already by the ConstantInteger32 case above.
1896 _imul(T, Src0 == Src1 ? T : Src1);
1897 _mov(Dest, T);
1898 }
1899 break;
1900 case InstArithmetic::Shl:
1901 _mov(T, Src0);
1902 if (!llvm::isa<ConstantInteger32>(Src1) &&
1903 !llvm::isa<ConstantInteger64>(Src1))
1904 Src1 = copyToReg8(Src1, RegX8664::Reg_cl);
1905 _shl(T, Src1);
1906 _mov(Dest, T);
1907 break;
1908 case InstArithmetic::Lshr:
1909 _mov(T, Src0);
1910 if (!llvm::isa<ConstantInteger32>(Src1) &&
1911 !llvm::isa<ConstantInteger64>(Src1))
1912 Src1 = copyToReg8(Src1, RegX8664::Reg_cl);
1913 _shr(T, Src1);
1914 _mov(Dest, T);
1915 break;
1916 case InstArithmetic::Ashr:
1917 _mov(T, Src0);
1918 if (!llvm::isa<ConstantInteger32>(Src1) &&
1919 !llvm::isa<ConstantInteger64>(Src1))
1920 Src1 = copyToReg8(Src1, RegX8664::Reg_cl);
1921 _sar(T, Src1);
1922 _mov(Dest, T);
1923 break;
1924 case InstArithmetic::Udiv: {
1925     // div and idiv are among the few arithmetic operators that do not allow
1926     // an immediate operand.
1927 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1928 RegNumT Eax;
1929 RegNumT Edx;
1930 switch (Ty) {
1931 default:
1932 llvm::report_fatal_error("Bad type for udiv");
1933 case IceType_i64:
1934 Eax = RegX8664::Reg_rax;
1935 Edx = RegX8664::Reg_rdx;
1936 break;
1937 case IceType_i32:
1938 Eax = RegX8664::Reg_eax;
1939 Edx = RegX8664::Reg_edx;
1940 break;
1941 case IceType_i16:
1942 Eax = RegX8664::Reg_ax;
1943 Edx = RegX8664::Reg_dx;
1944 break;
1945 case IceType_i8:
1946 Eax = RegX8664::Reg_al;
1947 Edx = RegX8664::Reg_ah;
1948 break;
1949 }
1950 T_edx = makeReg(Ty, Edx);
1951 _mov(T, Src0, Eax);
1952 _mov(T_edx, Ctx->getConstantZero(Ty));
1953 _div(T_edx, Src1, T);
1954 _redefined(Context.insert<InstFakeDef>(T, T_edx));
1955 _mov(Dest, T);
1956 } break;
1957 case InstArithmetic::Sdiv:
1958 // TODO(stichnot): Enable this after doing better performance and cross
1959 // testing.
1960 if (false && Func->getOptLevel() >= Opt_1) {
1961 // Optimize division by constant power of 2, but not for Om1 or O0, just
1962 // to keep things simple there.
1963 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1964 const int32_t Divisor = C->getValue();
1965 const uint32_t UDivisor = Divisor;
1966 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
1967 uint32_t LogDiv = llvm::Log2_32(UDivisor);
1968 // LLVM does the following for dest=src/(1<<log):
1969 // t=src
1970 // sar t,typewidth-1 // -1 if src is negative, 0 if not
1971 // shr t,typewidth-log
1972 // add t,src
1973 // sar t,log
1974 // dest=t
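          // Worked example: for src = -7 and log = 2 (divide by 4), the sar
          // gives -1, the shr gives 3, the add gives -4, and the final sar by 2
          // gives -1, matching C's truncating -7 / 4 == -1.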
1975 uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
1976 _mov(T, Src0);
1977 // If for some reason we are dividing by 1, just treat it like an
1978 // assignment.
1979 if (LogDiv > 0) {
1980 // The initial sar is unnecessary when dividing by 2.
1981 if (LogDiv > 1)
1982 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
1983 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
1984 _add(T, Src0);
1985 _sar(T, Ctx->getConstantInt(Ty, LogDiv));
1986 }
1987 _mov(Dest, T);
1988 return;
1989 }
1990 }
1991 }
1992 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1993 switch (Ty) {
1994 default:
1995 llvm::report_fatal_error("Bad type for sdiv");
1996 case IceType_i64:
1997 T_edx = makeReg(Ty, RegX8664::Reg_rdx);
1998 _mov(T, Src0, RegX8664::Reg_rax);
1999 break;
2000 case IceType_i32:
2001 T_edx = makeReg(Ty, RegX8664::Reg_edx);
2002 _mov(T, Src0, RegX8664::Reg_eax);
2003 break;
2004 case IceType_i16:
2005 T_edx = makeReg(Ty, RegX8664::Reg_dx);
2006 _mov(T, Src0, RegX8664::Reg_ax);
2007 break;
2008 case IceType_i8:
2009 T_edx = makeReg(IceType_i16, RegX8664::Reg_ax);
2010 _mov(T, Src0, RegX8664::Reg_al);
2011 break;
2012 }
2013 _cbwdq(T_edx, T);
2014 _idiv(T_edx, Src1, T);
2015 _redefined(Context.insert<InstFakeDef>(T, T_edx));
2016 _mov(Dest, T);
2017 break;
2018 case InstArithmetic::Urem: {
2019 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2020 RegNumT Eax;
2021 RegNumT Edx;
2022 switch (Ty) {
2023 default:
2024 llvm::report_fatal_error("Bad type for urem");
2025 case IceType_i64:
2026 Eax = RegX8664::Reg_rax;
2027 Edx = RegX8664::Reg_rdx;
2028 break;
2029 case IceType_i32:
2030 Eax = RegX8664::Reg_eax;
2031 Edx = RegX8664::Reg_edx;
2032 break;
2033 case IceType_i16:
2034 Eax = RegX8664::Reg_ax;
2035 Edx = RegX8664::Reg_dx;
2036 break;
2037 case IceType_i8:
2038 Eax = RegX8664::Reg_al;
2039 Edx = RegX8664::Reg_ah;
2040 break;
2041 }
2042 T_edx = makeReg(Ty, Edx);
2043 _mov(T_edx, Ctx->getConstantZero(Ty));
2044 _mov(T, Src0, Eax);
2045 _div(T, Src1, T_edx);
2046 _redefined(Context.insert<InstFakeDef>(T_edx, T));
2047 if (Ty == IceType_i8) {
2048 // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2049 // moved into a general 8-bit register.
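      // (Instructions that need a REX prefix, e.g. to reach sil/dil/r8b..r15b,
      // cannot encode ah, hence the detour through a legacy low-byte register.)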
2050 auto *T_AhRcvr = makeReg(Ty);
2051 T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2052 _mov(T_AhRcvr, T_edx);
2053 T_edx = T_AhRcvr;
2054 }
2055 _mov(Dest, T_edx);
2056 } break;
2057 case InstArithmetic::Srem: {
2058 // TODO(stichnot): Enable this after doing better performance and cross
2059 // testing.
2060 if (false && Func->getOptLevel() >= Opt_1) {
2061 // Optimize mod by constant power of 2, but not for Om1 or O0, just to
2062 // keep things simple there.
2063 if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2064 const int32_t Divisor = C->getValue();
2065 const uint32_t UDivisor = Divisor;
2066 if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2067 uint32_t LogDiv = llvm::Log2_32(UDivisor);
2068 // LLVM does the following for dest=src%(1<<log):
2069 // t=src
2070 // sar t,typewidth-1 // -1 if src is negative, 0 if not
2071 // shr t,typewidth-log
2072 // add t,src
2073 // and t, -(1<<log)
2074 // sub t,src
2075 // neg t
2076 // dest=t
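          // Worked example: for src = -7 and log = 2 (modulo 4), the sequence
          // computes t = -1, then 3, then -4, then -4 & -4 = -4, then
          // -4 - (-7) = 3, and the final neg gives -3, matching C's
          // -7 % 4 == -3.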
2077 uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
2078 // If for some reason we are dividing by 1, just assign 0.
2079 if (LogDiv == 0) {
2080 _mov(Dest, Ctx->getConstantZero(Ty));
2081 return;
2082 }
2083 _mov(T, Src0);
2084 // The initial sar is unnecessary when dividing by 2.
2085 if (LogDiv > 1)
2086 _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2087 _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2088 _add(T, Src0);
2089 _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
2090 _sub(T, Src0);
2091 _neg(T);
2092 _mov(Dest, T);
2093 return;
2094 }
2095 }
2096 }
2097 Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2098 RegNumT Eax;
2099 RegNumT Edx;
2100 switch (Ty) {
2101 default:
2102 llvm::report_fatal_error("Bad type for srem");
2103 case IceType_i64:
2104 Eax = RegX8664::Reg_rax;
2105 Edx = RegX8664::Reg_rdx;
2106 break;
2107 case IceType_i32:
2108 Eax = RegX8664::Reg_eax;
2109 Edx = RegX8664::Reg_edx;
2110 break;
2111 case IceType_i16:
2112 Eax = RegX8664::Reg_ax;
2113 Edx = RegX8664::Reg_dx;
2114 break;
2115 case IceType_i8:
2116 Eax = RegX8664::Reg_al;
2117 Edx = RegX8664::Reg_ah;
2118 break;
2119 }
2120 T_edx = makeReg(Ty, Edx);
2121 _mov(T, Src0, Eax);
2122 _cbwdq(T_edx, T);
2123 _idiv(T, Src1, T_edx);
2124 _redefined(Context.insert<InstFakeDef>(T_edx, T));
2125 if (Ty == IceType_i8) {
2126 // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2127 // moved into a general 8-bit register.
2128 auto *T_AhRcvr = makeReg(Ty);
2129 T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2130 _mov(T_AhRcvr, T_edx);
2131 T_edx = T_AhRcvr;
2132 }
2133 _mov(Dest, T_edx);
2134 } break;
2135 case InstArithmetic::Fadd:
2136 _mov(T, Src0);
2137 _addss(T, Src1);
2138 _mov(Dest, T);
2139 break;
2140 case InstArithmetic::Fsub:
2141 _mov(T, Src0);
2142 _subss(T, Src1);
2143 _mov(Dest, T);
2144 break;
2145 case InstArithmetic::Fmul:
2146 _mov(T, Src0);
2147 _mulss(T, Src0 == Src1 ? T : Src1);
2148 _mov(Dest, T);
2149 break;
2150 case InstArithmetic::Fdiv:
2151 _mov(T, Src0);
2152 _divss(T, Src1);
2153 _mov(Dest, T);
2154 break;
2155 case InstArithmetic::Frem:
2156 llvm::report_fatal_error("Helper call was expected");
2157 break;
2158 }
2159 }
2160
2161 void TargetX8664::lowerAssign(const InstAssign *Instr) {
2162 Variable *Dest = Instr->getDest();
2163 if (Dest->isRematerializable()) {
2164 Context.insert<InstFakeDef>(Dest);
2165 return;
2166 }
2167 Operand *Src = Instr->getSrc(0);
2168 assert(Dest->getType() == Src->getType());
2169 lowerMove(Dest, Src, false);
2170 }
2171
2172 void TargetX8664::lowerBr(const InstBr *Br) {
2173 if (Br->isUnconditional()) {
2174 _br(Br->getTargetUnconditional());
2175 return;
2176 }
2177 Operand *Cond = Br->getCondition();
2178
2179 // Handle folding opportunities.
2180 if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
2181 assert(Producer->isDeleted());
2182 switch (BoolFolding::getProducerKind(Producer)) {
2183 default:
2184 break;
2185 case BoolFolding::PK_Icmp32:
2186 case BoolFolding::PK_Icmp64: {
2187 lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
2188 return;
2189 }
2190 case BoolFolding::PK_Fcmp: {
2191 lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
2192 return;
2193 }
2194 case BoolFolding::PK_Arith: {
2195 lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
2196 return;
2197 }
2198 }
2199 }
2200 Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
2201 Constant *Zero = Ctx->getConstantZero(IceType_i32);
2202 _cmp(Src0, Zero);
2203 _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
2204 }
2205
2206 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
2207 // OperandList in lowerCall; std::max() is not constexpr until C++14, so it cannot be used here.
2208 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
2209 return S0 < S1 ? S1 : S0;
2210 }
2211
2212 void TargetX8664::lowerCall(const InstCall *Instr) {
2213 // Common x86-64 calling convention lowering:
2214 //
2215 // * At the point before the call, the stack must be aligned to 16 bytes.
2216 //
2217 // * Non-register arguments are pushed onto the stack in right-to-left order,
2218 // such that the left-most argument ends up on the top of the stack at the
2219 // lowest memory address.
2220 //
2221 // * Stack arguments of vector type are aligned to start at the next highest
2222 //   multiple of 16 bytes. Other stack arguments are aligned to the next word
2223 //   size boundary (8 bytes on x86-64).
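  //
  // Illustrative example with a hypothetical signature: for a call
  // f(i32 a, float b, <4 x i32> c), a is assigned the first available GPR
  // argument register while b and c are assigned XMM argument registers;
  // anything that does not fit in registers is written to the stack parameter
  // area computed below.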
2224
2225 constexpr SizeT MaxOperands =
2226 constexprMax(RegX8664::X86_MAX_XMM_ARGS, RegX8664::X86_MAX_GPR_ARGS);
2227 using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
2228
2229 OperandList XmmArgs;
2230 llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
2231 CfgVector<std::pair<const Type, Operand *>> GprArgs;
2232 CfgVector<SizeT> GprArgIndices;
2233 OperandList StackArgs, StackArgLocations;
2234 uint32_t ParameterAreaSizeBytes = 0;
2235
2236 ParameterAreaSizeBytes += getShadowStoreSize();
2237
2238 // Classify each argument operand according to the location where the argument
2239 // is passed.
2240 for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
2241 Operand *Arg = Instr->getArg(i);
2242 const Type Ty = Arg->getType();
2243 // The PNaCl ABI requires the width of arguments to be at least 32 bits.
2244 assert(typeWidthInBytes(Ty) >= 4);
2245 if (isVectorType(Ty) && RegX8664::getRegisterForXmmArgNum(
2246 RegX8664::getArgIndex(i, XmmArgs.size()))
2247 .hasValue()) {
2248 XmmArgs.push_back(Arg);
2249 XmmArgIndices.push_back(i);
2250 } else if (isScalarFloatingType(Ty) &&
2251 RegX8664::getRegisterForXmmArgNum(
2252 RegX8664::getArgIndex(i, XmmArgs.size()))
2253 .hasValue()) {
2254 XmmArgs.push_back(Arg);
2255 XmmArgIndices.push_back(i);
2256 } else if (isScalarIntegerType(Ty) &&
2257 RegX8664::getRegisterForGprArgNum(
2258 Ty, RegX8664::getArgIndex(i, GprArgs.size()))
2259 .hasValue()) {
2260 GprArgs.emplace_back(Ty, Arg);
2261 GprArgIndices.push_back(i);
2262 } else {
2263 // Place on stack.
2264 StackArgs.push_back(Arg);
2265 if (isVectorType(Arg->getType())) {
2266 ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
2267 }
2268 Variable *esp = getPhysicalRegister(getStackReg(), WordType);
2269 Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
2270 StackArgLocations.push_back(X86OperandMem::create(Func, Ty, esp, Loc));
2271 ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
2272 }
2273 }
2274 // Adjust the parameter area so that the stack is aligned. It is assumed that
2275 // the stack is already aligned at the start of the calling sequence.
2276 ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
2277 assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
2278 // Copy arguments that are passed on the stack to the appropriate stack
2279 // locations. We make sure legalize() is called on each argument at this
2280 // point, to allow availabilityGet() to work.
2281 for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
2282 lowerStore(
2283 InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
2284 }
2285 // Copy arguments to be passed in registers to the appropriate registers.
2286 for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
2287 XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
2288 RegX8664::getRegisterForXmmArgNum(
2289 RegX8664::getArgIndex(XmmArgIndices[i], i)));
2290 }
2291 // Materialize moves for arguments passed in GPRs.
2292 for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
2293 const Type SignatureTy = GprArgs[i].first;
2294 Operand *Arg =
2295 legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
2296 GprArgs[i].second = legalizeToReg(
2297 Arg, RegX8664::getRegisterForGprArgNum(
2298 Arg->getType(), RegX8664::getArgIndex(GprArgIndices[i], i)));
2299 assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
2300 assert(SignatureTy == Arg->getType());
2301 (void)SignatureTy;
2302 }
2303 // Generate a FakeUse of register arguments so that they do not get dead code
2304 // eliminated as a result of the FakeKill of scratch registers after the call.
2305 // These need to be right before the call instruction.
2306 for (auto *Arg : XmmArgs) {
2307 Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
2308 }
2309 for (auto &ArgPair : GprArgs) {
2310 Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
2311 }
2312 // Generate the call instruction. Assign its result to a temporary with high
2313 // register allocation weight.
2314 Variable *Dest = Instr->getDest();
2315 const Type DestTy = Dest ? Dest->getType() : IceType_void;
2316 Variable *ReturnReg = nullptr;
2317 if (Dest) {
2318 switch (DestTy) {
2319 case IceType_NUM:
2320 case IceType_void:
2321 case IceType_i1:
2322 case IceType_i8:
2323 case IceType_i16:
2324 llvm::report_fatal_error("Invalid Call dest type");
2325 break;
2326 case IceType_i32:
2327 ReturnReg = makeReg(DestTy, RegX8664::Reg_eax);
2328 break;
2329 case IceType_i64:
2330 ReturnReg = makeReg(IceType_i64, RegX8664::Reg_rax);
2332 break;
2333 case IceType_f32:
2334 case IceType_f64:
2335 case IceType_v4i1:
2336 case IceType_v8i1:
2337 case IceType_v16i1:
2338 case IceType_v16i8:
2339 case IceType_v8i16:
2340 case IceType_v4i32:
2341 case IceType_v4f32:
2342 ReturnReg = makeReg(DestTy, RegX8664::Reg_xmm0);
2343 break;
2344 }
2345 }
2346 // Emit the call to the function.
2347 Operand *CallTarget =
2348 legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
2349 size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
2350 Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
2351 // Mark the call as killing all the caller-save registers.
2352 Context.insert<InstFakeKill>(NewCall);
2353 // Generate a FakeUse to keep the call live if necessary.
2354 if (Instr->hasSideEffects() && ReturnReg) {
2355 Context.insert<InstFakeUse>(ReturnReg);
2356 }
2357 // Process the return value, if any.
2358 if (Dest == nullptr)
2359 return;
2360 // Assign the result of the call to Dest. Route it through a temporary so
2361 // that the local register availability peephole can be subsequently used.
2362 Variable *Tmp = nullptr;
2363 if (isVectorType(DestTy)) {
2364 assert(ReturnReg && "Vector type requires a return register");
2365 Tmp = makeReg(DestTy);
2366 _movp(Tmp, ReturnReg);
2367 _movp(Dest, Tmp);
2368 } else if (isScalarFloatingType(DestTy)) {
2369 assert(ReturnReg && "FP type requires a return register");
2370 _mov(Tmp, ReturnReg);
2371 _mov(Dest, Tmp);
2372 } else {
2373 assert(isScalarIntegerType(DestTy));
2374 assert(ReturnReg && "Integer type requires a return register");
2375 _mov(Tmp, ReturnReg);
2376 _mov(Dest, Tmp);
2377 }
2378 }
2379
2380 void TargetX8664::lowerCast(const InstCast *Instr) {
2381 // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
2382 InstCast::OpKind CastKind = Instr->getCastKind();
2383 Variable *Dest = Instr->getDest();
2384 Type DestTy = Dest->getType();
2385 switch (CastKind) {
2386 default:
2387 Func->setError("Cast type not supported");
2388 return;
2389 case InstCast::Sext: {
2390 // Src0RM is the source operand legalized to physical register or memory,
2391 // but not immediate, since the relevant x86 native instructions don't
2392 // allow an immediate operand. If the operand is an immediate, we could
2393 // consider computing the strength-reduced result at translation time, but
2394 // we're unlikely to see something like that in the bitcode that the
2395 // optimizer wouldn't have already taken care of.
2396 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2397 if (isVectorType(DestTy)) {
2398 if (DestTy == IceType_v16i8) {
2399 // onemask = materialize(1,1,...); dst = (src & onemask) > 0
2400 Variable *OneMask = makeVectorOfOnes(DestTy);
2401 Variable *T = makeReg(DestTy);
2402 _movp(T, Src0RM);
2403 _pand(T, OneMask);
2404 Variable *Zeros = makeVectorOfZeros(DestTy);
2405 _pcmpgt(T, Zeros);
2406 _movp(Dest, T);
2407 } else {
2408         // width = width(elty) - 1; dest = (src << width) >> width
2409 SizeT ShiftAmount =
2410 X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 1;
2411 Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
2412 Variable *T = makeReg(DestTy);
2413 _movp(T, Src0RM);
2414 _psll(T, ShiftConstant);
2415 _psra(T, ShiftConstant);
2416 _movp(Dest, T);
2417 }
2418 } else if (Src0RM->getType() == IceType_i1) {
2419 // t1 = src
2420 // shl t1, dst_bitwidth - 1
2421 // sar t1, dst_bitwidth - 1
2422 // dst = t1
2423 size_t DestBits = X86_CHAR_BIT * typeWidthInBytes(DestTy);
2424 Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
2425 Variable *T = makeReg(DestTy);
2426 if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
2427 _mov(T, Src0RM);
2428 } else {
2429 // Widen the source using movsx or movzx. (It doesn't matter which one,
2430 // since the following shl/sar overwrite the bits.)
2431 _movzx(T, Src0RM);
2432 }
2433 _shl(T, ShiftAmount);
2434 _sar(T, ShiftAmount);
2435 _mov(Dest, T);
2436 } else {
2437 // t1 = movsx src; dst = t1
2438 Variable *T = makeReg(DestTy);
2439 _movsx(T, Src0RM);
2440 _mov(Dest, T);
2441 }
2442 break;
2443 }
2444 case InstCast::Zext: {
2445 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2446 if (isVectorType(DestTy)) {
2447 // onemask = materialize(1,1,...); dest = onemask & src
2448 Variable *OneMask = makeVectorOfOnes(DestTy);
2449 Variable *T = makeReg(DestTy);
2450 _movp(T, Src0RM);
2451 _pand(T, OneMask);
2452 _movp(Dest, T);
2453 } else if (Src0RM->getType() == IceType_i1) {
2454 // t = Src0RM; Dest = t
2455 Variable *T = nullptr;
2456 if (DestTy == IceType_i8) {
2457 _mov(T, Src0RM);
2458 } else {
2459 assert(DestTy != IceType_i1);
2460         // Use a 32-bit register for both 16-bit and 32-bit destinations, since
2461         // 32-bit ops are shorter. For an i64 destination, widen T to 64 bits so
2462         // that T, if written to the stack (i.e., in -Om1), is fully zero-extended.
2463 T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
2464 _movzx(T, Src0RM);
2465 }
2466 _mov(Dest, T);
2467 } else {
2468 // t1 = movzx src; dst = t1
2469 Variable *T = makeReg(DestTy);
2470 _movzx(T, Src0RM);
2471 _mov(Dest, T);
2472 }
2473 break;
2474 }
2475 case InstCast::Trunc: {
2476 if (isVectorType(DestTy)) {
2477 // onemask = materialize(1,1,...); dst = src & onemask
2478 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2479 Type Src0Ty = Src0RM->getType();
2480 Variable *OneMask = makeVectorOfOnes(Src0Ty);
2481 Variable *T = makeReg(DestTy);
2482 _movp(T, Src0RM);
2483 _pand(T, OneMask);
2484 _movp(Dest, T);
2485 } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
2486 // Make sure we truncate from and into valid registers.
2487 Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2488 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2489 Variable *T = copyToReg8(Src0RM);
2490 if (DestTy == IceType_i1)
2491 _and(T, Ctx->getConstantInt1(1));
2492 _mov(Dest, T);
2493 } else {
2494 Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2495 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2496 // t1 = trunc Src0RM; Dest = t1
2497 Variable *T = makeReg(DestTy);
2498 _mov(T, Src0RM);
2499 _mov(Dest, T);
2500 }
2501 break;
2502 }
2503 case InstCast::Fptrunc:
2504 case InstCast::Fpext: {
2505 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2506 // t1 = cvt Src0RM; Dest = t1
2507 Variable *T = makeReg(DestTy);
2508 _cvt(T, Src0RM, Insts::Cvt::Float2float);
2509 _mov(Dest, T);
2510 break;
2511 }
2512 case InstCast::Fptosi:
2513 if (isVectorType(DestTy)) {
2514 assert(DestTy == IceType_v4i32);
2515 assert(Instr->getSrc(0)->getType() == IceType_v4f32);
2516 Operand *Src0R = legalizeToReg(Instr->getSrc(0));
2517 Variable *T = makeReg(DestTy);
2518 _cvt(T, Src0R, Insts::Cvt::Tps2dq);
2519 _movp(Dest, T);
2520 } else {
2521 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2522 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
2523 Variable *T_1 = nullptr;
2524 if (DestTy == IceType_i64) {
2525 T_1 = makeReg(IceType_i64);
2526 } else {
2527 assert(DestTy != IceType_i64);
2528 T_1 = makeReg(IceType_i32);
2529 }
2530 // cvt() requires its integer argument to be a GPR.
2531 Variable *T_2 = makeReg(DestTy);
2532 if (isByteSizedType(DestTy)) {
2533 assert(T_1->getType() == IceType_i32);
2534 T_1->setRegClass(RCX86_Is32To8);
2535 T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2536 }
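      // Tss2si is the truncating scalar float-to-int conversion (cvttss2si /
      // cvttsd2si), which matches fptosi's round-toward-zero semantics.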
2537 _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
2538 _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2539 if (DestTy == IceType_i1)
2540 _and(T_2, Ctx->getConstantInt1(1));
2541 _mov(Dest, T_2);
2542 }
2543 break;
2544 case InstCast::Fptoui:
2545 if (isVectorType(DestTy)) {
2546 llvm::report_fatal_error("Helper call was expected");
2547 } else if (DestTy == IceType_i64) {
2548 llvm::report_fatal_error("Helper call was expected");
2549 } else {
2550 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2551 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
2552 assert(DestTy != IceType_i64);
2553 Variable *T_1 = nullptr;
2554 if (DestTy == IceType_i32) {
2555 T_1 = makeReg(IceType_i64);
2556 } else {
2557 assert(DestTy != IceType_i32);
2558 T_1 = makeReg(IceType_i32);
2559 }
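      // SSE has no unsigned scalar conversion, so a 32-bit unsigned result is
      // produced by converting into a 64-bit register with the signed form and
      // keeping the low 32 bits; narrower destinations fit safely in a 32-bit
      // signed conversion.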
2560 Variable *T_2 = makeReg(DestTy);
2561 if (isByteSizedType(DestTy)) {
2562 assert(T_1->getType() == IceType_i32);
2563 T_1->setRegClass(RCX86_Is32To8);
2564 T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2565 }
2566 _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
2567 _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2568 if (DestTy == IceType_i1)
2569 _and(T_2, Ctx->getConstantInt1(1));
2570 _mov(Dest, T_2);
2571 }
2572 break;
2573 case InstCast::Sitofp:
2574 if (isVectorType(DestTy)) {
2575 assert(DestTy == IceType_v4f32);
2576 assert(Instr->getSrc(0)->getType() == IceType_v4i32);
2577 Operand *Src0R = legalizeToReg(Instr->getSrc(0));
2578 Variable *T = makeReg(DestTy);
2579 _cvt(T, Src0R, Insts::Cvt::Dq2ps);
2580 _movp(Dest, T);
2581 } else {
2582 Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2583 // Sign-extend the operand.
2584 // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
2585 Variable *T_1 = nullptr;
2586 if (Src0RM->getType() == IceType_i64) {
2587 T_1 = makeReg(IceType_i64);
2588 } else {
2589 assert(Src0RM->getType() != IceType_i64);
2590 T_1 = makeReg(IceType_i32);
2591 }
2592 Variable *T_2 = makeReg(DestTy);
2593 if (Src0RM->getType() == T_1->getType())
2594 _mov(T_1, Src0RM);
2595 else
2596 _movsx(T_1, Src0RM);
2597 _cvt(T_2, T_1, Insts::Cvt::Si2ss);
2598 _mov(Dest, T_2);
2599 }
2600 break;
2601 case InstCast::Uitofp: {
2602 Operand *Src0 = Instr->getSrc(0);
2603 if (isVectorType(Src0->getType())) {
2604 llvm::report_fatal_error("Helper call was expected");
2605 } else if (Src0->getType() == IceType_i64) {
2606 llvm::report_fatal_error("Helper call was expected");
2607 } else {
2608 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2609 // Zero-extend the operand.
2610 // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
2611 Variable *T_1 = nullptr;
2612 if (Src0RM->getType() == IceType_i32) {
2613 T_1 = makeReg(IceType_i64);
2614 } else {
2615 assert(Src0RM->getType() != IceType_i64);
2616 T_1 = makeReg(IceType_i32);
2617 }
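      // A 32-bit unsigned source may have its top bit set, which the 32-bit
      // signed conversion would misread as negative, so it is zero-extended
      // into a 64-bit register and converted with the 64-bit signed form;
      // narrower sources are safely zero-extended into 32 bits.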
2618 Variable *T_2 = makeReg(DestTy);
2619 if (Src0RM->getType() == T_1->getType())
2620 _mov(T_1, Src0RM);
2621 else
2622 _movzx(T_1, Src0RM)->setMustKeep();
2623 _cvt(T_2, T_1, Insts::Cvt::Si2ss);
2624 _mov(Dest, T_2);
2625 }
2626 break;
2627 }
2628 case InstCast::Bitcast: {
2629 Operand *Src0 = Instr->getSrc(0);
2630 if (DestTy == Src0->getType()) {
2631 auto *Assign = InstAssign::create(Func, Dest, Src0);
2632 lowerAssign(Assign);
2633 return;
2634 }
2635 switch (DestTy) {
2636 default:
2637 llvm_unreachable("Unexpected Bitcast dest type");
2638 case IceType_i8: {
2639 llvm::report_fatal_error("Helper call was expected");
2640 } break;
2641 case IceType_i16: {
2642 llvm::report_fatal_error("Helper call was expected");
2643 } break;
2644 case IceType_i32:
2645 case IceType_f32: {
2646 Variable *Src0R = legalizeToReg(Src0);
2647 Variable *T = makeReg(DestTy);
2648 _movd(T, Src0R);
2649 _mov(Dest, T);
2650 } break;
2651 case IceType_i64: {
2652 assert(Src0->getType() == IceType_f64);
2653 Variable *Src0R = legalizeToReg(Src0);
2654 Variable *T = makeReg(IceType_i64);
2655 _movd(T, Src0R);
2656 _mov(Dest, T);
2657 } break;
2658 case IceType_f64: {
2659 assert(Src0->getType() == IceType_i64);
2660 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2661 Variable *T = makeReg(IceType_f64);
2662 _movd(T, Src0RM);
2663 _mov(Dest, T);
2664 } break;
2665 case IceType_v8i1: {
2666 llvm::report_fatal_error("Helper call was expected");
2667 } break;
2668 case IceType_v16i1: {
2669 llvm::report_fatal_error("Helper call was expected");
2670 } break;
2671 case IceType_v8i16:
2672 case IceType_v16i8:
2673 case IceType_v4i32:
2674 case IceType_v4f32: {
2675 if (Src0->getType() == IceType_i32) {
2676 // Bitcast requires equal type sizes, which isn't strictly the case
2677 // between scalars and vectors, but to emulate v4i8 vectors one has to
2678 // use v16i8 vectors.
2679 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2680 Variable *T = makeReg(DestTy);
2681 _movd(T, Src0RM);
2682 _mov(Dest, T);
2683 } else {
2684 _movp(Dest, legalizeToReg(Src0));
2685 }
2686 } break;
2687 }
2688 break;
2689 }
2690 }
2691 }
2692
2693 void TargetX8664::lowerExtractElement(const InstExtractElement *Instr) {
2694 Operand *SourceVectNotLegalized = Instr->getSrc(0);
2695 auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
2696 // Only constant indices are allowed in PNaCl IR.
2697 assert(ElementIndex);
2698
2699 unsigned Index = ElementIndex->getValue();
2700 Type Ty = SourceVectNotLegalized->getType();
2701 Type ElementTy = typeElementType(Ty);
2702 Type InVectorElementTy = InstX86Base::getInVectorElementType(Ty);
2703
2704 // TODO(wala): Determine the best lowering sequences for each type.
2705 bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
2706 (InstructionSet >= SSE4_1 && Ty != IceType_v4f32);
2707 Variable *ExtractedElementR =
2708 makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
2709 if (CanUsePextr) {
2710 // Use pextrb, pextrw, or pextrd. The "b" and "w" versions clear the upper
2711 // bits of the destination register, so we represent this by always
2712 // extracting into an i32 register. The _mov into Dest below will do
2713 // truncation as necessary.
2714 Constant *Mask = Ctx->getConstantInt32(Index);
2715 Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
2716 _pextr(ExtractedElementR, SourceVectR, Mask);
2717 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
2718 // Use pshufd and movd/movss.
2719 Variable *T = nullptr;
2720 if (Index) {
2721 // The shuffle only needs to occur if the element to be extracted is not
2722 // at the lowest index.
2723 Constant *Mask = Ctx->getConstantInt32(Index);
2724 T = makeReg(Ty);
2725 _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
2726 } else {
2727 T = legalizeToReg(SourceVectNotLegalized);
2728 }
2729
2730 if (InVectorElementTy == IceType_i32) {
2731 _movd(ExtractedElementR, T);
2732 } else { // Ty == IceType_f32
2733 // TODO(wala): _movss is only used here because _mov does not allow a
2734 // vector source and a scalar destination. _mov should be able to be
2735 // used here.
2736 // _movss is a binary instruction, so the FakeDef is needed to keep the
2737 // live range analysis consistent.
2738 Context.insert<InstFakeDef>(ExtractedElementR);
2739 _movss(ExtractedElementR, T);
2740 }
2741 } else {
2742 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
2743 // Spill the value to a stack slot and do the extraction in memory.
2744 //
2745 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
2746 // for legalizing to mem is implemented.
2747 Variable *Slot = Func->makeVariable(Ty);
2748 Slot->setMustNotHaveReg();
2749 _movp(Slot, legalizeToReg(SourceVectNotLegalized));
2750
2751 // Compute the location of the element in memory.
2752 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
2753 X86OperandMem *Loc =
2754 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
2755 _mov(ExtractedElementR, Loc);
2756 }
2757
2758 if (ElementTy == IceType_i1) {
2759 // Truncate extracted integers to i1s if necessary.
2760 Variable *T = makeReg(IceType_i1);
2761 InstCast *Cast =
2762 InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
2763 lowerCast(Cast);
2764 ExtractedElementR = T;
2765 }
2766
2767 // Copy the element to the destination.
2768 Variable *Dest = Instr->getDest();
2769 _mov(Dest, ExtractedElementR);
2770 }
2771
2772 void TargetX8664::lowerFcmp(const InstFcmp *Fcmp) {
2773 Variable *Dest = Fcmp->getDest();
2774
2775 if (isVectorType(Dest->getType())) {
2776 lowerFcmpVector(Fcmp);
2777 } else {
2778 constexpr Inst *Consumer = nullptr;
2779 lowerFcmpAndConsumer(Fcmp, Consumer);
2780 }
2781 }
2782
2783 void TargetX8664::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
2784 const Inst *Consumer) {
2785 Operand *Src0 = Fcmp->getSrc(0);
2786 Operand *Src1 = Fcmp->getSrc(1);
2787 Variable *Dest = Fcmp->getDest();
2788
2789 if (Consumer != nullptr) {
2790 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
2791 if (lowerOptimizeFcmpSelect(Fcmp, Select))
2792 return;
2793 }
2794 }
2795
2796 if (isVectorType(Dest->getType())) {
2797 lowerFcmp(Fcmp);
2798 if (Consumer != nullptr)
2799 lowerSelectVector(llvm::cast<InstSelect>(Consumer));
2800 return;
2801 }
2802
2803 // Lowering a = fcmp cond, b, c
2804 // ucomiss b, c /* only if C1 != Br_None */
2805 // /* but swap b,c order if SwapOperands==true */
2806 // mov a, <default>
2807 // j<C1> label /* only if C1 != Br_None */
2808 // j<C2> label /* only if C2 != Br_None */
2809 // FakeUse(a) /* only if C1 != Br_None */
2810 // mov a, !<default> /* only if C1 != Br_None */
2811 // label: /* only if C1 != Br_None */
2812 //
2813 // setcc lowering when C1 != Br_None && C2 == Br_None:
2814 // ucomiss b, c /* but swap b,c order if SwapOperands==true */
2815 // setcc a, C1
2816 InstFcmp::FCond Condition = Fcmp->getCondition();
2817 assert(static_cast<size_t>(Condition) < TableFcmpSize);
2818 if (TableFcmp[Condition].SwapScalarOperands)
2819 std::swap(Src0, Src1);
2820 const bool HasC1 = (TableFcmp[Condition].C1 != CondX86::Br_None);
2821 const bool HasC2 = (TableFcmp[Condition].C2 != CondX86::Br_None);
2822 if (HasC1) {
2823 Src0 = legalize(Src0);
2824 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
2825 Variable *T = nullptr;
2826 _mov(T, Src0);
2827 _ucomiss(T, Src1RM);
2828 if (!HasC2) {
2829 assert(TableFcmp[Condition].Default);
2830 setccOrConsumer(TableFcmp[Condition].C1, Dest, Consumer);
2831 return;
2832 }
2833 }
2834 int32_t IntDefault = TableFcmp[Condition].Default;
2835 if (Consumer == nullptr) {
2836 Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
2837 _mov(Dest, Default);
2838 if (HasC1) {
2839 InstX86Label *Label = InstX86Label::create(Func, this);
2840 _br(TableFcmp[Condition].C1, Label);
2841 if (HasC2) {
2842 _br(TableFcmp[Condition].C2, Label);
2843 }
2844 Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
2845 _redefined(_mov(Dest, NonDefault));
2846 Context.insert(Label);
2847 }
2848 return;
2849 }
2850 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
2851 CfgNode *TrueSucc = Br->getTargetTrue();
2852 CfgNode *FalseSucc = Br->getTargetFalse();
2853 if (IntDefault != 0)
2854 std::swap(TrueSucc, FalseSucc);
2855 if (HasC1) {
2856 _br(TableFcmp[Condition].C1, FalseSucc);
2857 if (HasC2) {
2858 _br(TableFcmp[Condition].C2, FalseSucc);
2859 }
2860 _br(TrueSucc);
2861 return;
2862 }
2863 _br(FalseSucc);
2864 return;
2865 }
2866 if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
2867 Operand *SrcT = Select->getTrueOperand();
2868 Operand *SrcF = Select->getFalseOperand();
2869 Variable *SelectDest = Select->getDest();
2870 if (IntDefault != 0)
2871 std::swap(SrcT, SrcF);
2872 lowerMove(SelectDest, SrcF, false);
2873 if (HasC1) {
2874 InstX86Label *Label = InstX86Label::create(Func, this);
2875 _br(TableFcmp[Condition].C1, Label);
2876 if (HasC2) {
2877 _br(TableFcmp[Condition].C2, Label);
2878 }
2879 static constexpr bool IsRedefinition = true;
2880 lowerMove(SelectDest, SrcT, IsRedefinition);
2881 Context.insert(Label);
2882 }
2883 return;
2884 }
2885 llvm::report_fatal_error("Unexpected consumer type");
2886 }
2887
2888 void TargetX8664::lowerFcmpVector(const InstFcmp *Fcmp) {
2889 Operand *Src0 = Fcmp->getSrc(0);
2890 Operand *Src1 = Fcmp->getSrc(1);
2891 Variable *Dest = Fcmp->getDest();
2892
2893 if (!isVectorType(Dest->getType()))
2894 llvm::report_fatal_error("Expected vector compare");
2895
2896 InstFcmp::FCond Condition = Fcmp->getCondition();
2897 assert(static_cast<size_t>(Condition) < TableFcmpSize);
2898
2899 if (TableFcmp[Condition].SwapVectorOperands)
2900 std::swap(Src0, Src1);
2901
2902 Variable *T = nullptr;
2903
2904 if (Condition == InstFcmp::True) {
2905 // makeVectorOfOnes() requires an integer vector type.
2906 T = makeVectorOfMinusOnes(IceType_v4i32);
2907 } else if (Condition == InstFcmp::False) {
2908 T = makeVectorOfZeros(Dest->getType());
2909 } else {
2910 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2911 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
2912 if (llvm::isa<X86OperandMem>(Src1RM))
2913 Src1RM = legalizeToReg(Src1RM);
2914
2915 switch (Condition) {
2916 default: {
2917 const CmppsCond Predicate = TableFcmp[Condition].Predicate;
2918 assert(Predicate != CondX86::Cmpps_Invalid);
2919 T = makeReg(Src0RM->getType());
2920 _movp(T, Src0RM);
2921 _cmpps(T, Src1RM, Predicate);
2922 } break;
2923 case InstFcmp::One: {
2924 // Check both unequal and ordered.
2925 T = makeReg(Src0RM->getType());
2926 Variable *T2 = makeReg(Src0RM->getType());
2927 _movp(T, Src0RM);
2928 _cmpps(T, Src1RM, CondX86::Cmpps_neq);
2929 _movp(T2, Src0RM);
2930 _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
2931 _pand(T, T2);
2932 } break;
2933 case InstFcmp::Ueq: {
2934 // Check both equal or unordered.
2935 T = makeReg(Src0RM->getType());
2936 Variable *T2 = makeReg(Src0RM->getType());
2937 _movp(T, Src0RM);
2938 _cmpps(T, Src1RM, CondX86::Cmpps_eq);
2939 _movp(T2, Src0RM);
2940 _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
2941 _por(T, T2);
2942 } break;
2943 }
2944 }
2945
2946 assert(T != nullptr);
2947 _movp(Dest, T);
2948 eliminateNextVectorSextInstruction(Dest);
2949 }
2950
2951 inline bool isZero(const Operand *Opnd) {
2952 if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
2953 return C64->getValue() == 0;
2954 if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
2955 return C32->getValue() == 0;
2956 return false;
2957 }
2958
2959 void TargetX8664::lowerIcmpAndConsumer(const InstIcmp *Icmp,
2960 const Inst *Consumer) {
2961 Operand *Src0 = legalize(Icmp->getSrc(0));
2962 Operand *Src1 = legalize(Icmp->getSrc(1));
2963 Variable *Dest = Icmp->getDest();
2964
2965 if (isVectorType(Dest->getType())) {
2966 lowerIcmp(Icmp);
2967 if (Consumer != nullptr)
2968 lowerSelectVector(llvm::cast<InstSelect>(Consumer));
2969 return;
2970 }
2971
2972 // cmp b, c
2973 if (isZero(Src1)) {
2974 switch (Icmp->getCondition()) {
2975 default:
2976 break;
2977 case InstIcmp::Uge:
2978 movOrConsumer(true, Dest, Consumer);
2979 return;
2980 case InstIcmp::Ult:
2981 movOrConsumer(false, Dest, Consumer);
2982 return;
2983 }
2984 }
2985 Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
2986 _cmp(Src0RM, Src1);
2987 setccOrConsumer(getIcmp32Mapping(Icmp->getCondition()), Dest, Consumer);
2988 }
2989
2990 void TargetX8664::lowerIcmpVector(const InstIcmp *Icmp) {
2991 Operand *Src0 = legalize(Icmp->getSrc(0));
2992 Operand *Src1 = legalize(Icmp->getSrc(1));
2993 Variable *Dest = Icmp->getDest();
2994
2995 if (!isVectorType(Dest->getType()))
2996 llvm::report_fatal_error("Expected a vector compare");
2997
2998 Type Ty = Src0->getType();
2999 // Promote i1 vectors to 128 bit integer vector types.
3000 if (typeElementType(Ty) == IceType_i1) {
3001 Type NewTy = IceType_NUM;
3002 switch (Ty) {
3003 default:
3004 llvm::report_fatal_error("unexpected type");
3005 break;
3006 case IceType_v4i1:
3007 NewTy = IceType_v4i32;
3008 break;
3009 case IceType_v8i1:
3010 NewTy = IceType_v8i16;
3011 break;
3012 case IceType_v16i1:
3013 NewTy = IceType_v16i8;
3014 break;
3015 }
3016 Variable *NewSrc0 = Func->makeVariable(NewTy);
3017 Variable *NewSrc1 = Func->makeVariable(NewTy);
3018 lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
3019 lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
3020 Src0 = NewSrc0;
3021 Src1 = NewSrc1;
3022 Ty = NewTy;
3023 }
3024
3025 InstIcmp::ICond Condition = Icmp->getCondition();
3026
3027 Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3028 Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3029
3030 // SSE2 only has signed comparison operations. Transform unsigned inputs in
3031 // a manner that allows for the use of signed comparison operations by
3032 // flipping the high order bits.
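  // For example, with i32 lanes, xoring each lane with 0x80000000 maps the
  // unsigned range [0, 0xffffffff] monotonically onto the signed range
  // [INT32_MIN, INT32_MAX], so a signed compare of the adjusted operands gives
  // the same result as an unsigned compare of the originals.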
3033 if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
3034 Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
3035 Variable *T0 = makeReg(Ty);
3036 Variable *T1 = makeReg(Ty);
3037 Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
3038 _movp(T0, Src0RM);
3039 _pxor(T0, HighOrderBits);
3040 _movp(T1, Src1RM);
3041 _pxor(T1, HighOrderBits);
3042 Src0RM = T0;
3043 Src1RM = T1;
3044 }
3045
3046 Variable *T = makeReg(Ty);
3047 switch (Condition) {
3048 default:
3049 llvm_unreachable("unexpected condition");
3050 break;
3051 case InstIcmp::Eq: {
3052 if (llvm::isa<X86OperandMem>(Src1RM))
3053 Src1RM = legalizeToReg(Src1RM);
3054 _movp(T, Src0RM);
3055 _pcmpeq(T, Src1RM);
3056 } break;
3057 case InstIcmp::Ne: {
3058 if (llvm::isa<X86OperandMem>(Src1RM))
3059 Src1RM = legalizeToReg(Src1RM);
3060 _movp(T, Src0RM);
3061 _pcmpeq(T, Src1RM);
3062 Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3063 _pxor(T, MinusOne);
3064 } break;
3065 case InstIcmp::Ugt:
3066 case InstIcmp::Sgt: {
3067 if (llvm::isa<X86OperandMem>(Src1RM))
3068 Src1RM = legalizeToReg(Src1RM);
3069 _movp(T, Src0RM);
3070 _pcmpgt(T, Src1RM);
3071 } break;
3072 case InstIcmp::Uge:
3073 case InstIcmp::Sge: {
3074 // !(Src1RM > Src0RM)
3075 if (llvm::isa<X86OperandMem>(Src0RM))
3076 Src0RM = legalizeToReg(Src0RM);
3077 _movp(T, Src1RM);
3078 _pcmpgt(T, Src0RM);
3079 Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3080 _pxor(T, MinusOne);
3081 } break;
3082 case InstIcmp::Ult:
3083 case InstIcmp::Slt: {
3084 if (llvm::isa<X86OperandMem>(Src0RM))
3085 Src0RM = legalizeToReg(Src0RM);
3086 _movp(T, Src1RM);
3087 _pcmpgt(T, Src0RM);
3088 } break;
3089 case InstIcmp::Ule:
3090 case InstIcmp::Sle: {
3091 // !(Src0RM > Src1RM)
3092 if (llvm::isa<X86OperandMem>(Src1RM))
3093 Src1RM = legalizeToReg(Src1RM);
3094 _movp(T, Src0RM);
3095 _pcmpgt(T, Src1RM);
3096 Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3097 _pxor(T, MinusOne);
3098 } break;
3099 }
3100
3101 _movp(Dest, T);
3102 eliminateNextVectorSextInstruction(Dest);
3103 }
3104
3105 void TargetX8664::setccOrConsumer(BrCond Condition, Variable *Dest,
3106 const Inst *Consumer) {
3107 if (Consumer == nullptr) {
3108 _setcc(Dest, Condition);
3109 return;
3110 }
3111 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3112 _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
3113 return;
3114 }
3115 if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3116 Operand *SrcT = Select->getTrueOperand();
3117 Operand *SrcF = Select->getFalseOperand();
3118 Variable *SelectDest = Select->getDest();
3119 lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
3120 return;
3121 }
3122 llvm::report_fatal_error("Unexpected consumer type");
3123 }
3124
3125 void TargetX8664::movOrConsumer(bool IcmpResult, Variable *Dest,
3126 const Inst *Consumer) {
3127 if (Consumer == nullptr) {
3128 _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3129 return;
3130 }
3131 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3132     // TODO(sehr,stichnot): This could be done with a single unconditional
3133     // branch instruction, but subzero doesn't currently know how to handle the
3134     // resulting control flow graph changes; teaching it to would eliminate the mov and cmp.
3135 _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3136 _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
3137 _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3138 return;
3139 }
3140 if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3141 Operand *Src = nullptr;
3142 if (IcmpResult) {
3143 Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
3144 } else {
3145 Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
3146 }
3147 Variable *SelectDest = Select->getDest();
3148 lowerMove(SelectDest, Src, false);
3149 return;
3150 }
3151 llvm::report_fatal_error("Unexpected consumer type");
3152 }
3153
3154 void TargetX8664::lowerArithAndConsumer(const InstArithmetic *Arith,
3155 const Inst *Consumer) {
3156 Variable *T = nullptr;
3157 Operand *Src0 = legalize(Arith->getSrc(0));
3158 Operand *Src1 = legalize(Arith->getSrc(1));
3159 Variable *Dest = Arith->getDest();
3160 switch (Arith->getOp()) {
3161 default:
3162 llvm_unreachable("arithmetic operator not AND or OR");
3163 break;
3164 case InstArithmetic::And:
3165 _mov(T, Src0);
3166 // Test cannot have an address in the second position. Since T is
3167 // guaranteed to be a register and Src1 could be a memory load, ensure
3168 // that the second argument is a register.
3169 if (llvm::isa<Constant>(Src1))
3170 _test(T, Src1);
3171 else
3172 _test(Src1, T);
3173 break;
3174 case InstArithmetic::Or:
3175 _mov(T, Src0);
3176 _or(T, Src1);
3177 break;
3178 }
3179
3180 if (Consumer == nullptr) {
3181 llvm::report_fatal_error("Expected a consumer instruction");
3182 }
3183 if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3184 Context.insert<InstFakeUse>(T);
3185 Context.insert<InstFakeDef>(Dest);
3186 _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3187 return;
3188 }
3189 llvm::report_fatal_error("Unexpected consumer type");
3190 }
3191
3192 void TargetX8664::lowerInsertElement(const InstInsertElement *Instr) {
3193 Operand *SourceVectNotLegalized = Instr->getSrc(0);
3194 Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
3195 auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
3196 // Only constant indices are allowed in PNaCl IR.
3197 assert(ElementIndex);
3198 unsigned Index = ElementIndex->getValue();
3199 assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
3200
3201 Type Ty = SourceVectNotLegalized->getType();
3202 Type ElementTy = typeElementType(Ty);
3203 Type InVectorElementTy = InstX86Base::getInVectorElementType(Ty);
3204
3205 if (ElementTy == IceType_i1) {
3206 // Expand the element to the appropriate size for it to be inserted in the
3207 // vector.
3208 Variable *Expanded = Func->makeVariable(InVectorElementTy);
3209 auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
3210 ElementToInsertNotLegalized);
3211 lowerCast(Cast);
3212 ElementToInsertNotLegalized = Expanded;
3213 }
3214
3215 if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
3216 // Use insertps, pinsrb, pinsrw, or pinsrd.
3217 Operand *ElementRM =
3218 legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3219 Operand *SourceVectRM =
3220 legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3221 Variable *T = makeReg(Ty);
3222 _movp(T, SourceVectRM);
3223 if (Ty == IceType_v4f32) {
3224 _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
3225 } else {
3226 // For the pinsrb and pinsrw instructions, when the source operand is a
3227 // register, it must be a full r32 register like eax, and not ax/al/ah.
3228       // For filetype=asm, InstX86Pinsr::emit() compensates for the use of r16
3229       // and r8 by converting them through getBaseReg(), while emitIAS()
3230       // validates that the original and base register encodings are the same.
3232 if (ElementRM->getType() == IceType_i8 &&
3233 llvm::isa<Variable>(ElementRM)) {
3234 // Don't use ah/bh/ch/dh for pinsrb.
3235 ElementRM = copyToReg8(ElementRM);
3236 }
3237 _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
3238 }
3239 _movp(Instr->getDest(), T);
3240 } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3241 // Use shufps or movss.
3242 Variable *ElementR = nullptr;
3243 Operand *SourceVectRM =
3244 legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3245
3246 if (InVectorElementTy == IceType_f32) {
3247 // ElementR will be in an XMM register since it is floating point.
3248 ElementR = legalizeToReg(ElementToInsertNotLegalized);
3249 } else {
3250 // Copy an integer to an XMM register.
3251 Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3252 ElementR = makeReg(Ty);
3253 _movd(ElementR, T);
3254 }
3255
3256 if (Index == 0) {
3257 Variable *T = makeReg(Ty);
3258 _movp(T, SourceVectRM);
3259 _movss(T, ElementR);
3260 _movp(Instr->getDest(), T);
3261 return;
3262 }
3263
3264 // shufps treats the source and destination operands as vectors of four
3265 // doublewords. The destination's two high doublewords are selected from
3266 // the source operand and the two low doublewords are selected from the
3267     // (original value of the) destination operand. An insertelement operation
3268 // can be effected with a sequence of two shufps operations with
3269 // appropriate masks. In all cases below, Element[0] is being inserted into
3270 // SourceVectOperand. Indices are ordered from left to right.
3271 //
3272 // insertelement into index 1 (result is stored in ElementR):
3273 // ElementR := ElementR[0, 0] SourceVectRM[0, 0]
3274 // ElementR := ElementR[3, 0] SourceVectRM[2, 3]
3275 //
3276 // insertelement into index 2 (result is stored in T):
3277 // T := SourceVectRM
3278 // ElementR := ElementR[0, 0] T[0, 3]
3279 // T := T[0, 1] ElementR[0, 3]
3280 //
3281 // insertelement into index 3 (result is stored in T):
3282 // T := SourceVectRM
3283 // ElementR := ElementR[0, 0] T[0, 2]
3284 // T := T[0, 1] ElementR[3, 0]
3285 const unsigned char Mask1[3] = {0, 192, 128};
3286 const unsigned char Mask2[3] = {227, 196, 52};
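    // Worked decoding of one mask: shufps selects with four 2-bit fields, low
    // bits choosing the low lanes. Mask2[0] == 227 == 0b11'10'00'11 yields
    // lanes { ElementR[3], ElementR[0], SourceVectRM[2], SourceVectRM[3] },
    // i.e. the "ElementR := ElementR[3, 0] SourceVectRM[2, 3]" step above.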
3287
3288 Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
3289 Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
3290
3291 if (Index == 1) {
3292 _shufps(ElementR, SourceVectRM, Mask1Constant);
3293 _shufps(ElementR, SourceVectRM, Mask2Constant);
3294 _movp(Instr->getDest(), ElementR);
3295 } else {
3296 Variable *T = makeReg(Ty);
3297 _movp(T, SourceVectRM);
3298 _shufps(ElementR, T, Mask1Constant);
3299 _shufps(T, ElementR, Mask2Constant);
3300 _movp(Instr->getDest(), T);
3301 }
3302 } else {
3303 assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3304 // Spill the value to a stack slot and perform the insertion in memory.
3305 //
3306 // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3307 // for legalizing to mem is implemented.
3308 Variable *Slot = Func->makeVariable(Ty);
3309 Slot->setMustNotHaveReg();
3310 _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3311
3312 // Compute the location of the position to insert in memory.
3313 unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3314 X86OperandMem *Loc =
3315 getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3316 _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
3317
3318 Variable *T = makeReg(Ty);
3319 _movp(T, Slot);
3320 _movp(Instr->getDest(), T);
3321 }
3322 }
3323
3324 void TargetX8664::lowerIntrinsic(const InstIntrinsic *Instr) {
3325 switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
3326 case Intrinsics::AtomicCmpxchg: {
3327 if (!Intrinsics::isMemoryOrderValid(
3328 ID, getConstantMemoryOrder(Instr->getArg(3)),
3329 getConstantMemoryOrder(Instr->getArg(4)))) {
3330 Func->setError("Unexpected memory ordering for AtomicCmpxchg");
3331 return;
3332 }
3333 Variable *DestPrev = Instr->getDest();
3334 Operand *PtrToMem = legalize(Instr->getArg(0));
3335 Operand *Expected = legalize(Instr->getArg(1));
3336 Operand *Desired = legalize(Instr->getArg(2));
3337 if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
3338 return;
3339 lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
3340 return;
3341 }
3342 case Intrinsics::AtomicFence:
3343 if (!Intrinsics::isMemoryOrderValid(
3344 ID, getConstantMemoryOrder(Instr->getArg(0)))) {
3345 Func->setError("Unexpected memory ordering for AtomicFence");
3346 return;
3347 }
3348 _mfence();
3349 return;
3350 case Intrinsics::AtomicFenceAll:
3351     // NOTE: FenceAll should prevent any load/store from being moved across the
3352     // fence (both atomic and non-atomic). The emitted mfence instruction is
3353     // currently marked coarsely as "HasSideEffects".
3354 _mfence();
3355 return;
3356 case Intrinsics::AtomicIsLockFree: {
3357 // X86 is always lock free for 8/16/32/64 bit accesses.
3358 // TODO(jvoung): Since the result is constant when given a constant byte
3359 // size, this opens up DCE opportunities.
3360 Operand *ByteSize = Instr->getArg(0);
3361 Variable *Dest = Instr->getDest();
3362 if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
3363 Constant *Result;
3364 switch (CI->getValue()) {
3365 default:
3366 // Some x86-64 processors support the cmpxchg16b instruction, which can
3367 // make 16-byte operations lock free (when used with the LOCK prefix).
3368       // However, this lowering does not make use of it, so just return 0 even
3369       // for 16-byte and larger sizes.
3370 Result = Ctx->getConstantZero(IceType_i32);
3371 break;
3372 case 1:
3373 case 2:
3374 case 4:
3375 case 8:
3376 Result = Ctx->getConstantInt32(1);
3377 break;
3378 }
3379 _mov(Dest, Result);
3380 return;
3381 }
3382 // The PNaCl ABI requires the byte size to be a compile-time constant.
3383 Func->setError("AtomicIsLockFree byte size should be compile-time const");
3384 return;
3385 }
3386 case Intrinsics::AtomicLoad: {
3387 // We require the memory address to be naturally aligned. Given that is the
3388 // case, then normal loads are atomic.
3389 if (!Intrinsics::isMemoryOrderValid(
3390 ID, getConstantMemoryOrder(Instr->getArg(1)))) {
3391 Func->setError("Unexpected memory ordering for AtomicLoad");
3392 return;
3393 }
3394 Variable *Dest = Instr->getDest();
3395 auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
3396 lowerLoad(Load);
3397 // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
3398 // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
3399 // the FakeUse on the last-inserted instruction's dest.
3400 Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
3401 return;
3402 }
3403 case Intrinsics::AtomicRMW:
3404 if (!Intrinsics::isMemoryOrderValid(
3405 ID, getConstantMemoryOrder(Instr->getArg(3)))) {
3406 Func->setError("Unexpected memory ordering for AtomicRMW");
3407 return;
3408 }
3409 lowerAtomicRMW(
3410 Instr->getDest(),
3411 static_cast<uint32_t>(
3412 llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
3413 Instr->getArg(1), Instr->getArg(2));
3414 return;
3415 case Intrinsics::AtomicStore: {
3416 if (!Intrinsics::isMemoryOrderValid(
3417 ID, getConstantMemoryOrder(Instr->getArg(2)))) {
3418 Func->setError("Unexpected memory ordering for AtomicStore");
3419 return;
3420 }
3421 // We require the memory address to be naturally aligned. Given that is the
3422 // case, then normal stores are atomic. Add a fence after the store to make
3423 // it visible.
3424 Operand *Value = Instr->getArg(0);
3425 Operand *Ptr = Instr->getArg(1);
3426 auto *Store = InstStore::create(Func, Value, Ptr);
3427 lowerStore(Store);
3428 _mfence();
3429 return;
3430 }
3431 case Intrinsics::Bswap: {
3432 Variable *Dest = Instr->getDest();
3433 Operand *Val = Instr->getArg(0);
3434 // Use rotate left for 16-bit bswap.
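    // (e.g. 0xAABB rotated left by 8 becomes 0xBBAA, the byte-swapped value).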
3435 if (Val->getType() == IceType_i64 || Val->getType() == IceType_i32) {
3436 Variable *T = legalizeToReg(Val);
3437 _bswap(T);
3438 _mov(Dest, T);
3439 } else {
3440 assert(Val->getType() == IceType_i16);
3441 Constant *Eight = Ctx->getConstantInt16(8);
3442 Variable *T = nullptr;
3443 Val = legalize(Val);
3444 _mov(T, Val);
3445 _rol(T, Eight);
3446 _mov(Dest, T);
3447 }
3448 return;
3449 }
3450 case Intrinsics::Ctpop: {
3451 Variable *Dest = Instr->getDest();
3452 Variable *T = nullptr;
3453 Operand *Val = Instr->getArg(0);
3454 Type ValTy = Val->getType();
3455 assert(ValTy == IceType_i32 || ValTy == IceType_i64);
3456
3457 T = makeReg(IceType_i64);
3458 if (ValTy == IceType_i32) {
3459       // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
3460 // converting it to a 64-bit value, and using ctpop_i64. _movzx should
3461 // ensure we will not have any bits set on Val's upper 32 bits.
3462 Variable *V = makeReg(IceType_i64);
3463 Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
3464 _movzx(V, ValRM);
3465 Val = V;
3466 }
3467 ValTy = IceType_i64;
3468
3469 InstCall *Call =
3470 makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
3471 : RuntimeHelper::H_call_ctpop_i64,
3472 T, 1);
3473 Call->addArg(Val);
3474 lowerCall(Call);
3475 // The popcount helpers always return 32-bit values, while the intrinsic's
3476 // signature matches the native POPCNT instruction and fills a 64-bit reg
3477 // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
3478 // the user doesn't do that in the IR. If the user does that in the IR,
3479     // then this zeroing instruction is dead and gets optimized out.
3480 assert(Val->getType() == IceType_i64);
3481 // T is 64 bit. It needs to be copied to dest. We need to:
3482 //
3483 // T_1.32 = trunc T.64 to i32
3484 // T_2.64 = zext T_1.32 to i64
3485 // Dest.<<right_size>> = T_2.<<right_size>>
3486 //
3487 // which ensures the upper 32 bits will always be cleared. Just doing a
3488 //
3489 // mov Dest.32 = trunc T.32 to i32
3490 //
3491 // is dangerous because there's a chance the compiler will optimize this
3492     // copy out. To use _movzx we need two new registers (one 32- and another
3493     // 64-bit wide).
3494 Variable *T_1 = makeReg(IceType_i32);
3495 _mov(T_1, T);
3496 Variable *T_2 = makeReg(IceType_i64);
3497 _movzx(T_2, T_1);
3498 _mov(Dest, T_2);
3499 return;
3500 }
3501 case Intrinsics::Ctlz: {
3502 // The "is zero undef" parameter is ignored and we always return a
3503 // well-defined value.
3504 Operand *Val = legalize(Instr->getArg(0));
3505 Operand *FirstVal = Val;
3506 Operand *SecondVal = nullptr;
3507 constexpr bool IsCttz = false;
3508 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
3509 SecondVal);
3510 return;
3511 }
3512 case Intrinsics::Cttz: {
3513 // The "is zero undef" parameter is ignored and we always return a
3514 // well-defined value.
3515 Operand *Val = legalize(Instr->getArg(0));
3516 Operand *FirstVal = Val;
3517 Operand *SecondVal = nullptr;
3518 constexpr bool IsCttz = true;
3519 lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
3520 SecondVal);
3521 return;
3522 }
3523 case Intrinsics::Fabs: {
3524 Operand *Src = legalize(Instr->getArg(0));
3525 Type Ty = Src->getType();
3526 Variable *Dest = Instr->getDest();
3527 Variable *T = makeVectorOfFabsMask(Ty);
3528 // The pand instruction operates on an m128 memory operand, so if Src is an
3529 // f32 or f64, we need to make sure it's in a register.
3530 if (isVectorType(Ty)) {
3531 if (llvm::isa<X86OperandMem>(Src))
3532 Src = legalizeToReg(Src);
3533 } else {
3534 Src = legalizeToReg(Src);
3535 }
3536 _pand(T, Src);
3537 if (isVectorType(Ty))
3538 _movp(Dest, T);
3539 else
3540 _mov(Dest, T);
3541 return;
3542 }
3543 case Intrinsics::Longjmp: {
3544 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
3545 Call->addArg(Instr->getArg(0));
3546 Call->addArg(Instr->getArg(1));
3547 lowerCall(Call);
3548 return;
3549 }
3550 case Intrinsics::Memcpy: {
3551 lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3552 return;
3553 }
3554 case Intrinsics::Memmove: {
3555 lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3556 return;
3557 }
3558 case Intrinsics::Memset: {
3559 lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3560 return;
3561 }
3562 case Intrinsics::Setjmp: {
3563 InstCall *Call =
3564 makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
3565 Call->addArg(Instr->getArg(0));
3566 lowerCall(Call);
3567 return;
3568 }
3569 case Intrinsics::Sqrt: {
3570 Operand *Src = legalize(Instr->getArg(0));
3571 Variable *Dest = Instr->getDest();
3572 Variable *T = makeReg(Dest->getType());
3573 _sqrt(T, Src);
3574 if (isVectorType(Dest->getType())) {
3575 _movp(Dest, T);
3576 } else {
3577 _mov(Dest, T);
3578 }
3579 return;
3580 }
3581 case Intrinsics::Stacksave: {
3582 Variable *rsp =
3583 Func->getTarget()->getPhysicalRegister(getStackReg(), WordType);
3584 Variable *Dest = Instr->getDest();
3585 _mov(Dest, rsp);
3586 return;
3587 }
3588 case Intrinsics::Stackrestore: {
3589 Operand *Src = Instr->getArg(0);
3590 _mov_sp(Src);
3591 return;
3592 }
3593
3594 case Intrinsics::Trap:
3595 _ud2();
3596 return;
3597 case Intrinsics::LoadSubVector: {
3598 assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
3599 "LoadSubVector second argument must be a constant");
3600 Variable *Dest = Instr->getDest();
3601 Type Ty = Dest->getType();
3602 auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
3603 Operand *Addr = Instr->getArg(0);
3604 X86OperandMem *Src = formMemoryOperand(Addr, Ty);
3605 doMockBoundsCheck(Src);
3606
3607 if (Dest->isRematerializable()) {
3608 Context.insert<InstFakeDef>(Dest);
3609 return;
3610 }
3611
3612 auto *T = makeReg(Ty);
3613 switch (SubVectorSize->getValue()) {
3614 case 4:
3615 _movd(T, Src);
3616 break;
3617 case 8:
3618 _movq(T, Src);
3619 break;
3620 default:
3621 Func->setError("Unexpected size for LoadSubVector");
3622 return;
3623 }
3624 _movp(Dest, T);
3625 return;
3626 }
3627 case Intrinsics::StoreSubVector: {
3628 assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
3629 "StoreSubVector third argument must be a constant");
3630 auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
3631 Operand *Value = Instr->getArg(0);
3632 Operand *Addr = Instr->getArg(1);
3633 X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
3634 doMockBoundsCheck(NewAddr);
3635
3636 Value = legalizeToReg(Value);
3637
3638 switch (SubVectorSize->getValue()) {
3639 case 4:
3640 _stored(Value, NewAddr);
3641 break;
3642 case 8:
3643 _storeq(Value, NewAddr);
3644 break;
3645 default:
3646 Func->setError("Unexpected size for StoreSubVector");
3647 return;
3648 }
3649 return;
3650 }
3651 case Intrinsics::VectorPackSigned: {
3652 Operand *Src0 = Instr->getArg(0);
3653 Operand *Src1 = Instr->getArg(1);
3654 Variable *Dest = Instr->getDest();
3655 auto *T = makeReg(Src0->getType());
3656 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3657 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3658 _movp(T, Src0RM);
3659 _packss(T, Src1RM);
3660 _movp(Dest, T);
3661 return;
3662 }
3663 case Intrinsics::VectorPackUnsigned: {
3664 Operand *Src0 = Instr->getArg(0);
3665 Operand *Src1 = Instr->getArg(1);
3666 Variable *Dest = Instr->getDest();
3667 auto *T = makeReg(Src0->getType());
3668 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3669 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3670 _movp(T, Src0RM);
3671 _packus(T, Src1RM);
3672 _movp(Dest, T);
3673 return;
3674 }
3675 case Intrinsics::SignMask: {
3676 Operand *SrcReg = legalizeToReg(Instr->getArg(0));
3677 Variable *Dest = Instr->getDest();
3678 Variable *T = makeReg(IceType_i32);
3679 if (SrcReg->getType() == IceType_v4f32 ||
3680 SrcReg->getType() == IceType_v4i32 ||
3681 SrcReg->getType() == IceType_v16i8) {
3682 _movmsk(T, SrcReg);
3683 } else {
3684 // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
3685 llvm::report_fatal_error("Invalid type for SignMask intrinsic");
3686 }
3687 _mov(Dest, T);
3688 return;
3689 }
3690 case Intrinsics::MultiplyHighSigned: {
3691 Operand *Src0 = Instr->getArg(0);
3692 Operand *Src1 = Instr->getArg(1);
3693 Variable *Dest = Instr->getDest();
3694 auto *T = makeReg(Dest->getType());
3695 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3696 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3697 _movp(T, Src0RM);
3698 _pmulhw(T, Src1RM);
3699 _movp(Dest, T);
3700 return;
3701 }
3702 case Intrinsics::MultiplyHighUnsigned: {
3703 Operand *Src0 = Instr->getArg(0);
3704 Operand *Src1 = Instr->getArg(1);
3705 Variable *Dest = Instr->getDest();
3706 auto *T = makeReg(Dest->getType());
3707 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3708 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3709 _movp(T, Src0RM);
3710 _pmulhuw(T, Src1RM);
3711 _movp(Dest, T);
3712 return;
3713 }
3714 case Intrinsics::MultiplyAddPairs: {
3715 Operand *Src0 = Instr->getArg(0);
3716 Operand *Src1 = Instr->getArg(1);
3717 Variable *Dest = Instr->getDest();
3718 auto *T = makeReg(Dest->getType());
3719 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3720 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3721 _movp(T, Src0RM);
3722 _pmaddwd(T, Src1RM);
3723 _movp(Dest, T);
3724 return;
3725 }
3726 case Intrinsics::AddSaturateSigned: {
3727 Operand *Src0 = Instr->getArg(0);
3728 Operand *Src1 = Instr->getArg(1);
3729 Variable *Dest = Instr->getDest();
3730 auto *T = makeReg(Dest->getType());
3731 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3732 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3733 _movp(T, Src0RM);
3734 _padds(T, Src1RM);
3735 _movp(Dest, T);
3736 return;
3737 }
3738 case Intrinsics::SubtractSaturateSigned: {
3739 Operand *Src0 = Instr->getArg(0);
3740 Operand *Src1 = Instr->getArg(1);
3741 Variable *Dest = Instr->getDest();
3742 auto *T = makeReg(Dest->getType());
3743 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3744 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3745 _movp(T, Src0RM);
3746 _psubs(T, Src1RM);
3747 _movp(Dest, T);
3748 return;
3749 }
3750 case Intrinsics::AddSaturateUnsigned: {
3751 Operand *Src0 = Instr->getArg(0);
3752 Operand *Src1 = Instr->getArg(1);
3753 Variable *Dest = Instr->getDest();
3754 auto *T = makeReg(Dest->getType());
3755 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3756 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3757 _movp(T, Src0RM);
3758 _paddus(T, Src1RM);
3759 _movp(Dest, T);
3760 return;
3761 }
3762 case Intrinsics::SubtractSaturateUnsigned: {
3763 Operand *Src0 = Instr->getArg(0);
3764 Operand *Src1 = Instr->getArg(1);
3765 Variable *Dest = Instr->getDest();
3766 auto *T = makeReg(Dest->getType());
3767 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3768 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3769 _movp(T, Src0RM);
3770 _psubus(T, Src1RM);
3771 _movp(Dest, T);
3772 return;
3773 }
3774 case Intrinsics::Nearbyint: {
3775 Operand *Src = Instr->getArg(0);
3776 Variable *Dest = Instr->getDest();
3777 Type DestTy = Dest->getType();
3778 if (isVectorType(DestTy)) {
3779 assert(DestTy == IceType_v4i32);
3780 assert(Src->getType() == IceType_v4f32);
3781 Operand *Src0R = legalizeToReg(Src);
3782 Variable *T = makeReg(DestTy);
3783 _cvt(T, Src0R, Insts::Cvt::Ps2dq);
3784 _movp(Dest, T);
3785 } else {
3786 Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
3787 // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
3788 Variable *T_1 = nullptr;
3789 if (DestTy == IceType_i64) {
3790 T_1 = makeReg(IceType_i64);
3791 } else {
3792 assert(DestTy != IceType_i64);
3793 T_1 = makeReg(IceType_i32);
3794 }
3795 // cvt() requires its integer argument to be a GPR.
3796 Variable *T_2 = makeReg(DestTy);
3797 if (isByteSizedType(DestTy)) {
3798 assert(T_1->getType() == IceType_i32);
3799 T_1->setRegClass(RCX86_Is32To8);
3800 T_2->setRegClass(RCX86_IsTrunc8Rcvr);
3801 }
3802 _cvt(T_1, Src0RM, Insts::Cvt::Ss2si);
3803 _mov(T_2, T_1); // T_1 and T_2 may have different integer types
3804 if (DestTy == IceType_i1)
3805 _and(T_2, Ctx->getConstantInt1(1));
3806 _mov(Dest, T_2);
3807 }
3808 return;
3809 }
3810 case Intrinsics::Round: {
3811 assert(InstructionSet >= SSE4_1);
3812 Variable *Dest = Instr->getDest();
3813 Operand *Src = Instr->getArg(0);
3814 Operand *Mode = Instr->getArg(1);
3815 assert(llvm::isa<ConstantInteger32>(Mode) &&
3816 "Round last argument must be a constant");
3817 auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
3818 int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
3819 (void)Imm;
3820 assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
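    // The immediate uses the SSE4.1 roundss/roundps encoding: 0 = round to
    // nearest (even), 1 = floor, 2 = ceil, 3 = truncate toward zero.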
3821 auto *T = makeReg(Dest->getType());
3822 _round(T, SrcRM, Mode);
3823 _movp(Dest, T);
3824 return;
3825 }
3826 default: // UnknownIntrinsic
3827 Func->setError("Unexpected intrinsic");
3828 return;
3829 }
3830 return;
3831 }
3832
3833 void TargetX8664::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
3834 Operand *Expected, Operand *Desired) {
3835 Type Ty = Expected->getType();
3836 RegNumT Eax;
3837 switch (Ty) {
3838 default:
3839 llvm::report_fatal_error("Bad type for cmpxchg");
3840 case IceType_i64:
3841 Eax = RegX8664::Reg_rax;
3842 break;
3843 case IceType_i32:
3844 Eax = RegX8664::Reg_eax;
3845 break;
3846 case IceType_i16:
3847 Eax = RegX8664::Reg_ax;
3848 break;
3849 case IceType_i8:
3850 Eax = RegX8664::Reg_al;
3851 break;
3852 }
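  // Sketch of the emitted sequence (i32 case; other sizes use the matching
  // accumulator subregister):
  //   mov eax, <expected>
  //   lock cmpxchg [<ptr>], <desired>
  //   mov <dest_prev>, eax
  // cmpxchg compares eax with the memory operand, stores <desired> on a
  // match, and otherwise loads the current memory value into eax.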
3853 Variable *T_eax = makeReg(Ty, Eax);
3854 _mov(T_eax, Expected);
3855 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
3856 Variable *DesiredReg = legalizeToReg(Desired);
3857 constexpr bool Locked = true;
3858 _cmpxchg(Addr, T_eax, DesiredReg, Locked);
3859 _mov(DestPrev, T_eax);
3860 }
3861
3862 bool TargetX8664::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
3863 Operand *Expected,
3864 Operand *Desired) {
3865 if (Func->getOptLevel() == Opt_m1)
3866 return false;
3867 // Peek ahead a few instructions and see how Dest is used.
3868 // It's very common to have:
3869 //
3870 // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
3871 // [%y_phi = ...] // list of phi stores
3872 // %p = icmp eq i32 %x, %expected
3873 // br i1 %p, label %l1, label %l2
3874 //
3875 // which we can optimize into:
3876 //
3877 // %x = <cmpxchg code>
3878 // [%y_phi = ...] // list of phi stores
3879 // br eq, %l1, %l2
3880 InstList::iterator I = Context.getCur();
3881 // I is currently the InstIntrinsic. Peek past that.
3882 // This assumes that the atomic cmpxchg has not been lowered yet,
3883   // so that the instructions seen in the scan from "Cur" are simple.
3884 assert(llvm::isa<InstIntrinsic>(*I));
3885 Inst *NextInst = Context.getNextInst(I);
3886 if (!NextInst)
3887 return false;
3888 // There might be phi assignments right before the compare+branch, since this
3889 // could be a backward branch for a loop. This placement of assignments is
3890 // determined by placePhiStores().
3891 CfgVector<InstAssign *> PhiAssigns;
3892 while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
3893 if (PhiAssign->getDest() == Dest)
3894 return false;
3895 PhiAssigns.push_back(PhiAssign);
3896 NextInst = Context.getNextInst(I);
3897 if (!NextInst)
3898 return false;
3899 }
3900 if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
3901 if (!(NextCmp->getCondition() == InstIcmp::Eq &&
3902 ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
3903 (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
3904 return false;
3905 }
3906 NextInst = Context.getNextInst(I);
3907 if (!NextInst)
3908 return false;
3909 if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
3910 if (!NextBr->isUnconditional() &&
3911 NextCmp->getDest() == NextBr->getCondition() &&
3912 NextBr->isLastUse(NextCmp->getDest())) {
3913 lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
3914 for (size_t i = 0; i < PhiAssigns.size(); ++i) {
3915 // Lower the phi assignments now, before the branch (same placement
3916 // as before).
3917 InstAssign *PhiAssign = PhiAssigns[i];
3918 PhiAssign->setDeleted();
3919 lowerAssign(PhiAssign);
3920 Context.advanceNext();
3921 }
3922 _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
3923 // Skip over the old compare and branch, by deleting them.
3924 NextCmp->setDeleted();
3925 NextBr->setDeleted();
3926 Context.advanceNext();
3927 Context.advanceNext();
3928 return true;
3929 }
3930 }
3931 }
3932 return false;
3933 }
3934
3935 void TargetX8664::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
3936 Operand *Ptr, Operand *Val) {
3937 bool NeedsCmpxchg = false;
3938 LowerBinOp Op_Lo = nullptr;
3939 LowerBinOp Op_Hi = nullptr;
3940 switch (Operation) {
3941 default:
3942 Func->setError("Unknown AtomicRMW operation");
3943 return;
3944 case Intrinsics::AtomicAdd: {
3945 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
3946 constexpr bool Locked = true;
3947 Variable *T = nullptr;
3948 _mov(T, Val);
3949 _xadd(Addr, T, Locked);
3950 _mov(Dest, T);
3951 return;
3952 }
3953 case Intrinsics::AtomicSub: {
3954 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
3955 constexpr bool Locked = true;
3956 Variable *T = nullptr;
3957 _mov(T, Val);
3958 _neg(T);
3959 _xadd(Addr, T, Locked);
3960 _mov(Dest, T);
3961 return;
3962 }
3963 case Intrinsics::AtomicOr:
3964 // TODO(jvoung): If Dest is null or dead, then some of these
3965 // operations do not need an "exchange", but just a locked op.
3966 // That appears to be "worth" it for sub, or, and, and xor.
3967 // xadd is probably fine vs lock add for add, and xchg is fine
3968 // vs an atomic store.
3969 NeedsCmpxchg = true;
3970 Op_Lo = &TargetX8664::_or;
3971 Op_Hi = &TargetX8664::_or;
3972 break;
3973 case Intrinsics::AtomicAnd:
3974 NeedsCmpxchg = true;
3975 Op_Lo = &TargetX8664::_and;
3976 Op_Hi = &TargetX8664::_and;
3977 break;
3978 case Intrinsics::AtomicXor:
3979 NeedsCmpxchg = true;
3980 Op_Lo = &TargetX8664::_xor;
3981 Op_Hi = &TargetX8664::_xor;
3982 break;
3983 case Intrinsics::AtomicExchange:
3984 X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
3985 Variable *T = nullptr;
3986 _mov(T, Val);
3987 _xchg(Addr, T);
3988 _mov(Dest, T);
3989 return;
3990 }
3991 // Otherwise, we need a cmpxchg loop.
3992 (void)NeedsCmpxchg;
3993 assert(NeedsCmpxchg);
3994 expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
3995 }
3996
3997 void TargetX8664::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
3998 Variable *Dest, Operand *Ptr,
3999 Operand *Val) {
4000   // Expand a more complex RMW operation as a cmpxchg loop. The accumulator
4001   // register (al/ax/eax/rax, chosen by the operand type) is implicitly read
4002   // and written by cmpxchg:
4003   //
4004   // mov <acc>, [ptr]
4005   // .LABEL:
4006   // mov <reg>, <acc>
4007   // <op> <reg>, <desired_adj>
4008   // lock cmpxchg [ptr], <reg>
4009   // jne .LABEL
4010   // mov <dest>, <acc>
4011   //
4012   // This single-register form is used for all sizes, including i64, since
4013   // x86-64 has a 64-bit cmpxchg. Only Op_Lo is applied here; Op_Hi mirrors
4014   // the 32-bit target's lowering, which splits i64 into lo/hi halves around
4015   // a cmpxchg8b loop, and is unused in this expansion.
4024 Val = legalize(Val);
4025 Type Ty = Val->getType();
4026 X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4027 RegNumT Eax;
4028 switch (Ty) {
4029 default:
4030 llvm::report_fatal_error("Bad type for atomicRMW");
4031 case IceType_i64:
4032 Eax = RegX8664::Reg_rax;
4033 break;
4034 case IceType_i32:
4035 Eax = RegX8664::Reg_eax;
4036 break;
4037 case IceType_i16:
4038 Eax = RegX8664::Reg_ax;
4039 break;
4040 case IceType_i8:
4041 Eax = RegX8664::Reg_al;
4042 break;
4043 }
4044 Variable *T_eax = makeReg(Ty, Eax);
4045 _mov(T_eax, Addr);
4046 auto *Label = Context.insert<InstX86Label>(this);
4047 // We want to pick a different register for T than Eax, so don't use
4048 // _mov(T == nullptr, T_eax).
4049 Variable *T = makeReg(Ty);
4050 _mov(T, T_eax);
4051 (this->*Op_Lo)(T, Val);
4052 constexpr bool Locked = true;
4053 _cmpxchg(Addr, T_eax, T, Locked);
4054 _br(CondX86::Br_ne, Label);
4055 // If Val is a variable, model the extended live range of Val through
4056 // the end of the loop, since it will be re-used by the loop.
4057 if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
4058 Context.insert<InstFakeUse>(ValVar);
4059 }
4060 // The address base (if any) is also reused in the loop.
4061 if (Variable *Base = Addr->getBase())
4062 Context.insert<InstFakeUse>(Base);
4063 _mov(Dest, T_eax);
4064 }
4065
4066 /// Lowers count {trailing, leading} zeros intrinsic.
4067 ///
4068 /// We could do constant folding here, but that should have
4069 /// been done by the front-end/middle-end optimizations.
4070
4071 void TargetX8664::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
4072 Operand *FirstVal, Operand *SecondVal) {
4073 // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
4074 // Then the instructions will handle the Val == 0 case much more simply
4075 // and won't require conversion from bit position to number of zeros.
4076 //
4077 // Otherwise:
4078 // bsr IF_NOT_ZERO, Val
4079 // mov T_DEST, ((Ty == i32) ? 63 : 127)
4080 // cmovne T_DEST, IF_NOT_ZERO
4081 // xor T_DEST, ((Ty == i32) ? 31 : 63)
4082 // mov DEST, T_DEST
4083 //
4084 // NOTE: T_DEST must be a register because cmov requires its dest to be a
4085 // register. Also, bsf and bsr require their dest to be a register.
4086 //
4087 // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
4088 // E.g., for 000... 00001100, bsr will say that the most significant bit
4089 // set is at position 3, while the number of leading zeros is 28. Xor is
4090 // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
4091 // all-zeros case).
4092 //
4093   // Cttz is similar, but uses bsf instead, doesn't require the xor
4094   // bit-position conversion, and the speculation is reversed.
4095
4096 // TODO(jpp): refactor this method.
4097 assert(Ty == IceType_i32 || Ty == IceType_i64);
4098 const Type DestTy = Dest->getType();
4099 Variable *T = makeReg(DestTy);
4100 Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
4101 if (Cttz) {
4102 _bsf(T, FirstValRM);
4103 } else {
4104 _bsr(T, FirstValRM);
4105 }
4106 Variable *T_Dest = makeReg(DestTy);
4107 Constant *_31 = Ctx->getConstantInt32(31);
4108 Constant *_32 = Ctx->getConstantInt(DestTy, 32);
4109 Constant *_63 = Ctx->getConstantInt(DestTy, 63);
4110 Constant *_64 = Ctx->getConstantInt(DestTy, 64);
4111 if (Cttz) {
4112 if (DestTy == IceType_i64) {
4113 _mov(T_Dest, _64);
4114 } else {
4115 _mov(T_Dest, _32);
4116 }
4117 } else {
4118 Constant *_127 = Ctx->getConstantInt(DestTy, 127);
4119 if (DestTy == IceType_i64) {
4120 _mov(T_Dest, _127);
4121 } else {
4122 _mov(T_Dest, _63);
4123 }
4124 }
4125 _cmov(T_Dest, T, CondX86::Br_ne);
4126 if (!Cttz) {
4127 if (DestTy == IceType_i64) {
4128 // Even though there's a _63 available at this point, that constant might
4129 // not be an i32, which will cause the xor emission to fail.
4130 Constant *_63 = Ctx->getConstantInt32(63);
4131 _xor(T_Dest, _63);
4132 } else {
4133 _xor(T_Dest, _31);
4134 }
4135 }
4136 _mov(Dest, T_Dest);
4137 }
4138
4139 void TargetX8664::typedLoad(Type Ty, Variable *Dest, Variable *Base,
4140 Constant *Offset) {
4141 // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4142 // legalize Mem properly.
4143 if (Offset)
4144 assert(!llvm::isa<ConstantRelocatable>(Offset));
4145
4146 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4147
4148 if (isVectorType(Ty))
4149 _movp(Dest, Mem);
4150 else if (Ty == IceType_f64)
4151 _movq(Dest, Mem);
4152 else
4153 _mov(Dest, Mem);
4154 }
4155
4156 void TargetX8664::typedStore(Type Ty, Variable *Value, Variable *Base,
4157 Constant *Offset) {
4158 // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4159 // legalize Mem properly.
4160 if (Offset)
4161 assert(!llvm::isa<ConstantRelocatable>(Offset));
4162
4163 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4164
4165 if (isVectorType(Ty))
4166 _storep(Value, Mem);
4167 else if (Ty == IceType_f64)
4168 _storeq(Value, Mem);
4169 else
4170 _store(Value, Mem);
4171 }
4172
4173 void TargetX8664::copyMemory(Type Ty, Variable *Dest, Variable *Src,
4174 int32_t OffsetAmt) {
4175 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4176 // TODO(ascull): this or add nullptr test to _movp, _movq
4177 Variable *Data = makeReg(Ty);
4178
4179 typedLoad(Ty, Data, Src, Offset);
4180 typedStore(Ty, Data, Dest, Offset);
4181 }
4182
4183 void TargetX8664::lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count) {
4184 // There is a load and store for each chunk in the unroll
4185 constexpr uint32_t BytesPerStorep = 16;
4186
4187 // Check if the operands are constants
4188 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4189 const bool IsCountConst = CountConst != nullptr;
4190 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4191
4192 if (shouldOptimizeMemIntrins() && IsCountConst &&
4193 CountValue <= BytesPerStorep * MEMCPY_UNROLL_LIMIT) {
4194 // Unlikely, but nothing to do if it does happen
4195 if (CountValue == 0)
4196 return;
4197
4198 Variable *SrcBase = legalizeToReg(Src);
4199 Variable *DestBase = legalizeToReg(Dest);
4200
4201 // Find the largest type that can be used and use it as much as possible in
4202 // reverse order. Then handle any remainder with overlapping copies. Since
4203 // the remainder will be at the end, there will be reduced pressure on the
4204 // memory unit as the accesses to the same memory are far apart.
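    // For example (assuming a 16-byte vector copy is available), a constant
    // 21-byte memcpy becomes one 16-byte copy at offset 0 plus a final
    // overlapping copy covering the last bytes (e.g. an 8-byte copy at offset
    // 13); re-copying a few bytes is harmless since memcpy's source and
    // destination do not overlap.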
4205 Type Ty = largestTypeInSize(CountValue);
4206 uint32_t TyWidth = typeWidthInBytes(Ty);
4207
4208 uint32_t RemainingBytes = CountValue;
4209 int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
4210 while (RemainingBytes >= TyWidth) {
4211 copyMemory(Ty, DestBase, SrcBase, Offset);
4212 RemainingBytes -= TyWidth;
4213 Offset -= TyWidth;
4214 }
4215
4216 if (RemainingBytes == 0)
4217 return;
4218
4219 // Lower the remaining bytes. Adjust to larger types in order to make use
4220 // of overlaps in the copies.
4221 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
4222 Offset = CountValue - typeWidthInBytes(LeftOverTy);
4223 copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
4224 return;
4225 }
4226
4227 // Fall back on a function call
4228 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
4229 Call->addArg(Dest);
4230 Call->addArg(Src);
4231 Call->addArg(Count);
4232 lowerCall(Call);
4233 }
4234
4235 void TargetX8664::lowerMemmove(Operand *Dest, Operand *Src, Operand *Count) {
4236 // There is a load and store for each chunk in the unroll
4237 constexpr uint32_t BytesPerStorep = 16;
4238
4239 // Check if the operands are constants
4240 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4241 const bool IsCountConst = CountConst != nullptr;
4242 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4243
4244 if (shouldOptimizeMemIntrins() && IsCountConst &&
4245 CountValue <= BytesPerStorep * MEMMOVE_UNROLL_LIMIT) {
4246 // Unlikely, but nothing to do if it does happen
4247 if (CountValue == 0)
4248 return;
4249
4250 Variable *SrcBase = legalizeToReg(Src);
4251 Variable *DestBase = legalizeToReg(Dest);
4252
4253 std::tuple<Type, Constant *, Variable *> Moves[MEMMOVE_UNROLL_LIMIT];
4254 Constant *Offset;
4255 Variable *Reg;
4256
4257 // Copy the data into registers as the source and destination could overlap
4258 // so make sure not to clobber the memory. This also means overlapping
4259 // moves can be used as we are taking a safe snapshot of the memory.
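    // E.g. for an overlapping 24-byte memmove, a 16-byte and an 8-byte chunk
    // are both loaded into registers before any store is emitted, so no
    // source byte can be clobbered before it has been read.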
4260 Type Ty = largestTypeInSize(CountValue);
4261 uint32_t TyWidth = typeWidthInBytes(Ty);
4262
4263 uint32_t RemainingBytes = CountValue;
4264 int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
4265 size_t N = 0;
4266 while (RemainingBytes >= TyWidth) {
4267 assert(N <= MEMMOVE_UNROLL_LIMIT);
4268 Offset = Ctx->getConstantInt32(OffsetAmt);
4269 Reg = makeReg(Ty);
4270 typedLoad(Ty, Reg, SrcBase, Offset);
4271 RemainingBytes -= TyWidth;
4272 OffsetAmt -= TyWidth;
4273 Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4274 }
4275
4276 if (RemainingBytes != 0) {
4277 // Lower the remaining bytes. Adjust to larger types in order to make use
4278 // of overlaps in the copies.
4279 assert(N <= MEMMOVE_UNROLL_LIMIT);
4280 Ty = firstTypeThatFitsSize(RemainingBytes);
4281 Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
4282 Reg = makeReg(Ty);
4283 typedLoad(Ty, Reg, SrcBase, Offset);
4284 Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4285 }
4286
4287 // Copy the data out into the destination memory
4288 for (size_t i = 0; i < N; ++i) {
4289 std::tie(Ty, Offset, Reg) = Moves[i];
4290 typedStore(Ty, Reg, DestBase, Offset);
4291 }
4292
4293 return;
4294 }
4295
4296 // Fall back on a function call
4297 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
4298 Call->addArg(Dest);
4299 Call->addArg(Src);
4300 Call->addArg(Count);
4301 lowerCall(Call);
4302 }
4303
4304 void TargetX8664::lowerMemset(Operand *Dest, Operand *Val, Operand *Count) {
4305 constexpr uint32_t BytesPerStorep = 16;
4306 constexpr uint32_t BytesPerStoreq = 8;
4307 constexpr uint32_t BytesPerStorei32 = 4;
4308 assert(Val->getType() == IceType_i8);
4309
4310 // Check if the operands are constants
4311 const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4312 const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
4313 const bool IsCountConst = CountConst != nullptr;
4314 const bool IsValConst = ValConst != nullptr;
4315 const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4316 const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
4317
4318 // Unlikely, but nothing to do if it does happen
4319 if (IsCountConst && CountValue == 0)
4320 return;
4321
4322 // TODO(ascull): if the count is constant but val is not it would be possible
4323 // to inline by spreading the value across 4 bytes and accessing subregs e.g.
4324 // eax, ax and al.
4325 if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
4326 Variable *Base = nullptr;
4327 Variable *VecReg = nullptr;
4328 const uint32_t MaskValue = (ValValue & 0xff);
4329 const uint32_t SpreadValue =
4330 (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
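    // e.g. a memset value of 0xAB gives SpreadValue 0xABABABAB, so one 32-bit
    // store writes four bytes of the fill value at a time.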
4331
4332 auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
4333 uint32_t OffsetAmt) {
4334 assert(Base != nullptr);
4335 Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4336
4337 // TODO(ascull): is 64-bit better with vector or scalar movq?
4338 auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4339 if (isVectorType(Ty)) {
4340 assert(VecReg != nullptr);
4341 _storep(VecReg, Mem);
4342 } else if (Ty == IceType_f64) {
4343 assert(VecReg != nullptr);
4344 _storeq(VecReg, Mem);
4345 } else {
4346 assert(Ty != IceType_i64);
4347 _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
4348 }
4349 };
4350
4351 // Find the largest type that can be used and use it as much as possible in
4352 // reverse order. Then handle any remainder with overlapping copies. Since
4353     // the remainder will be at the end, there will be reduced pressure on the
4354     // memory unit as the accesses to the same memory are far apart.
4355 Type Ty = IceType_void;
4356 if (ValValue == 0 && CountValue >= BytesPerStoreq &&
4357 CountValue <= BytesPerStorep * MEMSET_UNROLL_LIMIT) {
4358 // When the value is zero it can be loaded into a vector register cheaply
4359 // using the xor trick.
4360 Base = legalizeToReg(Dest);
4361 VecReg = makeVectorOfZeros(IceType_v16i8);
4362 Ty = largestTypeInSize(CountValue);
4363 } else if (CountValue <= BytesPerStorei32 * MEMSET_UNROLL_LIMIT) {
4364 // When the value is non-zero or the count is small we can't use vector
4365 // instructions so are limited to 32-bit stores.
4366 Base = legalizeToReg(Dest);
4367 constexpr uint32_t MaxSize = 4;
4368 Ty = largestTypeInSize(CountValue, MaxSize);
4369 }
4370
4371 if (Base) {
4372 uint32_t TyWidth = typeWidthInBytes(Ty);
4373
4374 uint32_t RemainingBytes = CountValue;
4375 uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
4376 while (RemainingBytes >= TyWidth) {
4377 lowerSet(Ty, Offset);
4378 RemainingBytes -= TyWidth;
4379 Offset -= TyWidth;
4380 }
4381
4382 if (RemainingBytes == 0)
4383 return;
4384
4385 // Lower the remaining bytes. Adjust to larger types in order to make use
4386 // of overlaps in the copies.
4387 Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
4388 Offset = CountValue - typeWidthInBytes(LeftOverTy);
4389 lowerSet(LeftOverTy, Offset);
4390 return;
4391 }
4392 }
4393
4394 // Fall back on calling the memset function. The value operand needs to be
4395 // extended to a stack slot size because the PNaCl ABI requires arguments to
4396 // be at least 32 bits wide.
4397 Operand *ValExt;
4398 if (IsValConst) {
4399 ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
4400 } else {
4401 Variable *ValExtVar = Func->makeVariable(stackSlotType());
4402 lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
4403 ValExt = ValExtVar;
4404 }
4405 InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
4406 Call->addArg(Dest);
4407 Call->addArg(ValExt);
4408 Call->addArg(Count);
4409 lowerCall(Call);
4410 }
4411
4412 class AddressOptimizer {
4413 AddressOptimizer() = delete;
4414 AddressOptimizer(const AddressOptimizer &) = delete;
4415 AddressOptimizer &operator=(const AddressOptimizer &) = delete;
4416
4417 public:
4418   explicit AddressOptimizer(const Cfg *Func)
4419 : Func(Func), VMetadata(Func->getVMetadata()) {}
4420
4421 inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
4422 int32_t Offset, const Variable *Base,
4423 const Variable *Index, uint16_t Shift,
4424 const Inst *Reason) const;
4425
4426 inline const Inst *matchAssign(Variable **Var,
4427 ConstantRelocatable **Relocatable,
4428 int32_t *Offset);
4429
4430 inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
4431 uint16_t *Shift);
4432
4433 inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
4434
4435 inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
4436 const uint16_t Shift,
4437 ConstantRelocatable **Relocatable,
4438 int32_t *Offset);
4439
4440 private:
4441 const Cfg *const Func;
4442 const VariablesMetadata *const VMetadata;
4443
4444   static bool isAdd(const Inst *Instr) {
4445 if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
4446 return (Arith->getOp() == InstArithmetic::Add);
4447 }
4448 return false;
4449 }
4450 };
4451
4452 void AddressOptimizer::dumpAddressOpt(
4453 const ConstantRelocatable *const Relocatable, int32_t Offset,
4454 const Variable *Base, const Variable *Index, uint16_t Shift,
4455 const Inst *Reason) const {
4456 if (!BuildDefs::dump())
4457 return;
4458 if (!Func->isVerbose(IceV_AddrOpt))
4459 return;
4460 OstreamLocker L(Func->getContext());
4461 Ostream &Str = Func->getContext()->getStrDump();
4462 Str << "Instruction: ";
4463 Reason->dumpDecorated(Func);
4464 Str << " results in Base=";
4465 if (Base)
4466 Base->dump(Func);
4467 else
4468 Str << "<null>";
4469 Str << ", Index=";
4470 if (Index)
4471 Index->dump(Func);
4472 else
4473 Str << "<null>";
4474 Str << ", Shift=" << Shift << ", Offset=" << Offset
4475 << ", Relocatable=" << Relocatable << "\n";
4476 }
4477
4478 const Inst *AddressOptimizer::matchAssign(Variable **Var,
4479 ConstantRelocatable **Relocatable,
4480 int32_t *Offset) {
4481 // Var originates from Var=SrcVar ==> set Var:=SrcVar
4482 if (*Var == nullptr)
4483 return nullptr;
4484 if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
4485 assert(!VMetadata->isMultiDef(*Var));
4486 if (llvm::isa<InstAssign>(VarAssign)) {
4487 Operand *SrcOp = VarAssign->getSrc(0);
4488 assert(SrcOp);
4489 if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
4490 if (!VMetadata->isMultiDef(SrcVar) &&
4491 // TODO: ensure SrcVar stays single-BB
4492 true) {
4493 *Var = SrcVar;
4494 return VarAssign;
4495 }
4496 } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
4497 int32_t MoreOffset = Const->getValue();
4498 if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
4499 return nullptr;
4500 *Var = nullptr;
4501 *Offset += MoreOffset;
4502 return VarAssign;
4503 } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
4504 if (*Relocatable == nullptr) {
4505 // It is always safe to fold a relocatable through assignment -- the
4506 // assignment frees a slot in the address operand that can be used to
4507 // hold the Sandbox Pointer -- if any.
4508 *Var = nullptr;
4509 *Relocatable = AddReloc;
4510 return VarAssign;
4511 }
4512 }
4513 }
4514 }
4515 return nullptr;
4516 }
4517
4518 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
4519 Variable **Index,
4520 uint16_t *Shift) {
4521 // Index==nullptr && Base is Base=Var1+Var2 ==>
4522 // set Base=Var1, Index=Var2, Shift=0
4523 if (*Base == nullptr)
4524 return nullptr;
4525 if (*Index != nullptr)
4526 return nullptr;
4527 auto *BaseInst = VMetadata->getSingleDefinition(*Base);
4528 if (BaseInst == nullptr)
4529 return nullptr;
4530 assert(!VMetadata->isMultiDef(*Base));
4531 if (BaseInst->getSrcSize() < 2)
4532 return nullptr;
4533 if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
4534 if (VMetadata->isMultiDef(Var1))
4535 return nullptr;
4536 if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
4537 if (VMetadata->isMultiDef(Var2))
4538 return nullptr;
4539 if (isAdd(BaseInst) &&
4540 // TODO: ensure Var1 and Var2 stay single-BB
4541 true) {
4542 *Base = Var1;
4543 *Index = Var2;
4544 *Shift = 0; // should already have been 0
4545 return BaseInst;
4546 }
4547 }
4548 }
4549 return nullptr;
4550 }
4551
4552 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
4553 uint16_t *Shift) {
4554 // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
4555 // Index=Var, Shift+=log2(Const)
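  // E.g. with Shift==0, Index = Var * 4 (or Var << 2) becomes Index = Var,
  // Shift = 2, matching the x86 scaled-index addressing form (scale 1, 2, 4,
  // or 8).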
4556 if (*Index == nullptr)
4557 return nullptr;
4558 auto *IndexInst = VMetadata->getSingleDefinition(*Index);
4559 if (IndexInst == nullptr)
4560 return nullptr;
4561 assert(!VMetadata->isMultiDef(*Index));
4562
4563 // When using an unsigned 32-bit array index on x64, it gets zero-extended
4564 // before the shift & add. The explicit zero extension can be eliminated
4565 // because x86 32-bit operations automatically get zero-extended into the
4566 // corresponding 64-bit register.
4567 if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
4568 if (CastInst->getCastKind() == InstCast::Zext) {
4569 if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
4570 if (Var->getType() == IceType_i32 &&
4571 CastInst->getDest()->getType() == IceType_i64) {
4572 IndexInst = VMetadata->getSingleDefinition(Var);
4573 }
4574 }
4575 }
4576 }
4577
4578 if (IndexInst->getSrcSize() < 2)
4579 return nullptr;
4580 if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
4581 if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
4582 if (auto *Const =
4583 llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
4584 if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
4585 return nullptr;
4586 switch (ArithInst->getOp()) {
4587 default:
4588 return nullptr;
4589 case InstArithmetic::Mul: {
4590 uint32_t Mult = Const->getValue();
4591 uint32_t LogMult;
4592 switch (Mult) {
4593 case 1:
4594 LogMult = 0;
4595 break;
4596 case 2:
4597 LogMult = 1;
4598 break;
4599 case 4:
4600 LogMult = 2;
4601 break;
4602 case 8:
4603 LogMult = 3;
4604 break;
4605 default:
4606 return nullptr;
4607 }
4608 if (*Shift + LogMult <= 3) {
4609 *Index = Var;
4610 *Shift += LogMult;
4611 return IndexInst;
4612 }
4613 }
4614 case InstArithmetic::Shl: {
4615 uint32_t ShiftAmount = Const->getValue();
4616 switch (ShiftAmount) {
4617 case 0:
4618 case 1:
4619 case 2:
4620 case 3:
4621 break;
4622 default:
4623 return nullptr;
4624 }
4625 if (*Shift + ShiftAmount <= 3) {
4626 *Index = Var;
4627 *Shift += ShiftAmount;
4628 return IndexInst;
4629 }
4630 }
4631 }
4632 }
4633 }
4634 }
4635 return nullptr;
4636 }
4637
4638 const Inst *AddressOptimizer::matchOffsetIndexOrBase(
4639 Variable **IndexOrBase, const uint16_t Shift,
4640 ConstantRelocatable **Relocatable, int32_t *Offset) {
4641 // Base is Base=Var+Const || Base is Base=Const+Var ==>
4642 // set Base=Var, Offset+=Const
4643 // Base is Base=Var-Const ==>
4644 // set Base=Var, Offset-=Const
4645 // Index is Index=Var+Const ==>
4646 // set Index=Var, Offset+=(Const<<Shift)
4647 // Index is Index=Const+Var ==>
4648 // set Index=Var, Offset+=(Const<<Shift)
4649 // Index is Index=Var-Const ==>
4650 // set Index=Var, Offset-=(Const<<Shift)
4651 // Treat Index=Var Or Const as Index=Var + Const
4652 // when Var = Var' << N and log2(Const) <= N
4653 // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
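  // E.g. if Var = Var' << 2, the two low bits of Var are known to be zero, so
  // "Var | 3" equals "Var + 3" and the constant can be folded into the
  // addressing-mode offset.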
4654
4655 if (*IndexOrBase == nullptr) {
4656 return nullptr;
4657 }
4658 const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
4659 if (Definition == nullptr) {
4660 return nullptr;
4661 }
4662 assert(!VMetadata->isMultiDef(*IndexOrBase));
4663 if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
4664 switch (ArithInst->getOp()) {
4665 case InstArithmetic::Add:
4666 case InstArithmetic::Sub:
4667 case InstArithmetic::Or:
4668 break;
4669 default:
4670 return nullptr;
4671 }
4672
4673 Operand *Src0 = ArithInst->getSrc(0);
4674 Operand *Src1 = ArithInst->getSrc(1);
4675 auto *Var0 = llvm::dyn_cast<Variable>(Src0);
4676 auto *Var1 = llvm::dyn_cast<Variable>(Src1);
4677 auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
4678 auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
4679 auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
4680 auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
4681
4682 bool IsAdd = false;
4683 if (ArithInst->getOp() == InstArithmetic::Or) {
4684 Variable *Var = nullptr;
4685 ConstantInteger32 *Const = nullptr;
4686 if (Var0 && Const1) {
4687 Var = Var0;
4688 Const = Const1;
4689 } else if (Const0 && Var1) {
4690 Var = Var1;
4691 Const = Const0;
4692 } else {
4693 return nullptr;
4694 }
4695 auto *VarDef =
4696 llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
4697 if (VarDef == nullptr)
4698 return nullptr;
4699
4700 SizeT ZeroesAvailable = 0;
4701 if (VarDef->getOp() == InstArithmetic::Shl) {
4702 if (auto *ConstInt =
4703 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
4704 ZeroesAvailable = ConstInt->getValue();
4705 }
4706 } else if (VarDef->getOp() == InstArithmetic::Mul) {
4707 SizeT PowerOfTwo = 0;
4708 if (auto *MultConst =
4709 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
4710 if (llvm::isPowerOf2_32(MultConst->getValue())) {
4711 PowerOfTwo += MultConst->getValue();
4712 }
4713 }
4714 if (auto *MultConst =
4715 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
4716 if (llvm::isPowerOf2_32(MultConst->getValue())) {
4717 PowerOfTwo += MultConst->getValue();
4718 }
4719 }
4720 ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
4721 }
4722 SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
4723 if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
4724 return nullptr;
4725 IsAdd = true; // treat it as an add if the above conditions hold
4726 } else {
4727 IsAdd = ArithInst->getOp() == InstArithmetic::Add;
4728 }
4729
4730 Variable *NewIndexOrBase = nullptr;
4731 int32_t NewOffset = 0;
4732 ConstantRelocatable *NewRelocatable = *Relocatable;
4733 if (Var0 && Var1)
4734 // TODO(sehr): merge base/index splitting into here.
4735 return nullptr;
4736 if (!IsAdd && Var1)
4737 return nullptr;
4738 if (Var0)
4739 NewIndexOrBase = Var0;
4740 else if (Var1)
4741 NewIndexOrBase = Var1;
4742 // Don't know how to add/subtract two relocatables.
4743 if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
4744 return nullptr;
4745 // Don't know how to subtract a relocatable.
4746 if (!IsAdd && Reloc1)
4747 return nullptr;
4748 // Incorporate ConstantRelocatables.
4749 if (Reloc0)
4750 NewRelocatable = Reloc0;
4751 else if (Reloc1)
4752 NewRelocatable = Reloc1;
4753 // Compute the updated constant offset.
4754 if (Const0) {
4755 const int32_t MoreOffset =
4756 IsAdd ? Const0->getValue() : -Const0->getValue();
4757 if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
4758 return nullptr;
4759 NewOffset += MoreOffset;
4760 }
4761 if (Const1) {
4762 const int32_t MoreOffset =
4763 IsAdd ? Const1->getValue() : -Const1->getValue();
4764 if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
4765 return nullptr;
4766 NewOffset += MoreOffset;
4767 }
4768 if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
4769 return nullptr;
4770 *IndexOrBase = NewIndexOrBase;
4771 *Offset += (NewOffset << Shift);
4772 // Shift is always zero if this is called with the base
4773 *Relocatable = NewRelocatable;
4774 return Definition;
4775 }
4776 return nullptr;
4777 }
4778
4779 X86OperandMem *TargetX8664::computeAddressOpt(const Inst *Instr, Type MemType,
4780 Operand *Addr) {
4781 Func->resetCurrentNode();
4782 if (Func->isVerbose(IceV_AddrOpt)) {
4783 OstreamLocker L(Func->getContext());
4784 Ostream &Str = Func->getContext()->getStrDump();
4785 Str << "\nStarting computeAddressOpt for instruction:\n ";
4786 Instr->dumpDecorated(Func);
4787 }
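  // As a rough example of what the matchers below can do, a chain such as
  //   t1 = a + 12;  t2 = i * 4;  addr = t1 + t2
  // can be folded into the single memory operand 12(%a,%i,4), provided the
  // intermediate variables are single-definition and the usual single-block
  // restrictions hold.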
4788
4789 OptAddr NewAddr;
4790 NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
4791 if (NewAddr.Base == nullptr)
4792 return nullptr;
4793
4794 // If the Base has more than one use or is live across multiple blocks, then
4795 // don't go further. Alternatively (?), never consider a transformation that
4796 // would change a variable that is currently *not* live across basic block
4797 // boundaries into one that *is*.
4798 if (!getFlags().getLoopInvariantCodeMotion()) {
4799 // Need multi block address opt when licm is enabled.
4800 // Might make sense to restrict to current node and loop header.
4801 if (Func->getVMetadata()->isMultiBlock(
4802 NewAddr.Base) /* || Base->getUseCount() > 1*/)
4803 return nullptr;
4804 }
4805 AddressOptimizer AddrOpt(Func);
4806 const bool MockBounds = getFlags().getMockBoundsCheck();
4807 const Inst *Reason = nullptr;
4808 bool AddressWasOptimized = false;
4809 // The following unnamed struct identifies the address mode formation steps
4810 // that could potentially create an invalid memory operand (i.e., no free
4811 // slots for RebasePtr.) We add all those variables to this struct so that we
4812 // can use memset() to reset all members to false.
4813 struct {
4814 bool AssignBase = false;
4815 bool AssignIndex = false;
4816 bool OffsetFromBase = false;
4817 bool OffsetFromIndex = false;
4818 bool CombinedBaseIndex = false;
4819 } Skip;
4820 // NewAddrCheckpoint is used to rollback the address being formed in case an
4821 // invalid address is formed.
4822 OptAddr NewAddrCheckpoint;
4823 Reason = Instr;
4824 do {
4825 if (Reason) {
4826 AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
4827 NewAddr.Index, NewAddr.Shift, Reason);
4828 AddressWasOptimized = true;
4829 Reason = nullptr;
4830 memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
4831 }
4832
4833 NewAddrCheckpoint = NewAddr;
4834
4835 // Update Base and Index to follow through assignments to definitions.
4836 if (!Skip.AssignBase &&
4837 (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
4838 &NewAddr.Offset))) {
4839 // Assignments of Base from a Relocatable or ConstantInt32 can result
4840 // in Base becoming nullptr. To avoid code duplication in this loop we
4841 // prefer that Base be non-nullptr if possible.
4842 if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
4843 NewAddr.Shift == 0) {
4844 std::swap(NewAddr.Base, NewAddr.Index);
4845 }
4846 continue;
4847 }
4848     if (!Skip.AssignIndex &&
4849 (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
4850 &NewAddr.Offset))) {
4851 continue;
4852 }
4853
4854 if (!MockBounds) {
4855 // Transition from:
4856 // <Relocatable + Offset>(Base) to
4857 // <Relocatable + Offset>(Base, Index)
4858 if (!Skip.CombinedBaseIndex &&
4859 (Reason = AddrOpt.matchCombinedBaseIndex(
4860 &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
4861 continue;
4862 }
4863
4864 // Recognize multiply/shift and update Shift amount.
4865 // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
4866 // Index=Var, Shift+=Const
4867 // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
4868 // Index=Var, Shift+=log2(Const)
4869 if ((Reason =
4870 AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
4871 continue;
4872 }
4873
4874 // If Shift is zero, the choice of Base and Index was purely arbitrary.
4875 // Recognize multiply/shift and set Shift amount.
4876 // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
4877 // swap(Index,Base)
4878 // Similar for Base=Const*Var and Base=Var<<Const
4879 if (NewAddr.Shift == 0 &&
4880 (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
4881 std::swap(NewAddr.Base, NewAddr.Index);
4882 continue;
4883 }
4884 }
4885
4886 // Update Offset to reflect additions/subtractions with constants and
4887 // relocatables.
4888 // TODO: consider overflow issues with respect to Offset.
4889 if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
4890 &NewAddr.Base, /*Shift =*/0,
4891 &NewAddr.Relocatable, &NewAddr.Offset))) {
4892 continue;
4893 }
4894 if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
4895 &NewAddr.Index, NewAddr.Shift,
4896 &NewAddr.Relocatable, &NewAddr.Offset))) {
4897 continue;
4898 }
4899
4900 break;
4901 } while (Reason);
4902
4903 if (!AddressWasOptimized) {
4904 return nullptr;
4905 }
4906
4907 Constant *OffsetOp = nullptr;
4908 if (NewAddr.Relocatable == nullptr) {
4909 OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
4910 } else {
4911 OffsetOp =
4912 Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
4913 NewAddr.Relocatable->getName());
4914 }
4915 // Vanilla ICE load instructions should not use the segment registers, and
4916 // computeAddressOpt only works at the level of Variables and Constants, not
4917 // other X86OperandMem, so there should be no mention of segment
4918 // registers there either.
4919 static constexpr auto SegmentReg =
4920 X86OperandMem::SegmentRegisters::DefaultSegment;
4921
4922 return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
4923 NewAddr.Index, NewAddr.Shift, SegmentReg);
4924 }
4925
4926 /// Add a mock bounds check on the memory address before using it as a load or
4927 /// store operand. The basic idea is that given a memory operand [reg], we
4928 /// would first add bounds-check code something like:
4929 ///
4930 /// cmp reg, <lb>
4931 /// jl out_of_line_error
4932 /// cmp reg, <ub>
4933 /// jg out_of_line_error
4934 ///
4935 /// In reality, the specific code will depend on how <lb> and <ub> are
4936 /// represented, e.g. an immediate, a global, or a function argument.
4937 ///
4938 /// As such, we need to enforce that the memory operand does not have the form
4939 /// [reg1+reg2], because then there is no simple cmp instruction that would
4940 /// suffice. However, we consider [reg+offset] to be OK because the offset is
4941 /// usually small, and so <ub> could have a safety buffer built in and then we
4942 /// could instead branch to a custom out_of_line_error that does the precise
4943 /// check and jumps back if it turns out OK.
4944 ///
4945 /// For the purpose of mocking the bounds check, we'll do something like this:
4946 ///
4947 /// cmp reg, 0
4948 /// je label
4949 /// cmp reg, 1
4950 /// je label
4951 /// label:
4952 ///
4953 /// Also note that we don't need to add a bounds check to a dereference of a
4954 /// simple global variable address.
4955
4956 void TargetX8664::doMockBoundsCheck(Operand *Opnd) {
4957 if (!getFlags().getMockBoundsCheck())
4958 return;
4959 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
4960 if (Mem->getIndex()) {
4961 llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
4962 }
4963 Opnd = Mem->getBase();
4964 }
4965 // At this point Opnd could be nullptr, or Variable, or Constant, or perhaps
4966 // something else. We only care if it is Variable.
4967 auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
4968 if (Var == nullptr)
4969 return;
4970 // We use lowerStore() to copy out-args onto the stack. This creates a memory
4971 // operand with the stack pointer as the base register. Don't do bounds
4972 // checks on that.
4973 if (Var->getRegNum() == getStackReg())
4974 return;
4975
4976 auto *Label = InstX86Label::create(Func, this);
4977 _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
4978 _br(CondX86::Br_e, Label);
4979 _cmp(Opnd, Ctx->getConstantInt32(1));
4980 _br(CondX86::Br_e, Label);
4981 Context.insert(Label);
4982 }
4983
4984 void TargetX8664::lowerLoad(const InstLoad *Load) {
4985 // A Load instruction can be treated the same as an Assign instruction, after
4986 // the source operand is transformed into an X86OperandMem operand. Note that
4987 // the address mode optimization already creates an X86OperandMem operand, so
4988 // it doesn't need another level of transformation.
4989 Variable *DestLoad = Load->getDest();
4990 Type Ty = DestLoad->getType();
4991 Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty);
4992 doMockBoundsCheck(Src0);
4993 auto *Assign = InstAssign::create(Func, DestLoad, Src0);
4994 lowerAssign(Assign);
4995 }
4996
4997 void TargetX8664::doAddressOptOther() {
4998 // Inverts some Icmp instructions which helps doAddressOptLoad later.
4999 // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
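  // Illustrative example (not from the original source): for
  //   b = icmp slt x, y
  // where x (but not y) is defined by a single-block load, the compare is
  // rewritten as
  //   b = icmp sgt y, x
  // so the load feeding x ends up in the second operand position, where it
  // can later be folded into the compare as a memory operand.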
5000 Inst *Instr = iteratorToInst(Context.getCur());
5001 auto *VMetadata = Func->getVMetadata();
5002 if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
5003 if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
5004 llvm::isa<Constant>(Icmp->getSrc(1)))
5005 return;
5006 auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
5007 if (Var0 == nullptr)
5008 return;
5009 if (!VMetadata->isTracked(Var0))
5010 return;
5011 auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
5012 if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
5013 return;
5014 if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
5015 return;
5016
5017 auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
5018 if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
5019 auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
5020 if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
5021 llvm::isa<InstLoad>(Op1Def)) {
5022 return; // Both are loads
5023 }
5024 }
5025 Icmp->reverseConditionAndOperands();
5026 }
5027 }
5028
5029 void TargetX8664::doAddressOptLoad() {
5030 Inst *Instr = iteratorToInst(Context.getCur());
5031 Operand *Addr = Instr->getSrc(0);
5032 Variable *Dest = Instr->getDest();
5033 if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
5034 Instr->setDeleted();
5035 Context.insert<InstLoad>(Dest, OptAddr);
5036 }
5037 }
5038
5039 void TargetX8664::doAddressOptLoadSubVector() {
5040 auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
5041 Operand *Addr = Intrinsic->getArg(0);
5042 Variable *Dest = Intrinsic->getDest();
5043 if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
5044 Intrinsic->setDeleted();
5045 const Ice::Intrinsics::IntrinsicInfo Info = {
5046 Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
5047 Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
5048 auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info);
5049 NewLoad->addArg(OptAddr);
5050 NewLoad->addArg(Intrinsic->getArg(1));
5051 }
5052 }
5053
5054 void TargetX8664::lowerPhi(const InstPhi * /*Instr*/) {
5055 Func->setError("Phi found in regular instruction list");
5056 }
5057
5058 void TargetX8664::lowerRet(const InstRet *Instr) {
5059 Variable *Reg = nullptr;
5060 if (Instr->hasRetValue()) {
5061 Operand *RetValue = legalize(Instr->getRetValue());
5062 const Type ReturnType = RetValue->getType();
5063 assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
5064 (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
5065 Reg = moveReturnValueToRegister(RetValue, ReturnType);
5066 }
5067 // Add a ret instruction even if sandboxing is enabled, because addEpilog
5068 // explicitly looks for a ret instruction as a marker for where to insert the
5069 // frame removal instructions.
5070 _ret(Reg);
5071 // Add a fake use of esp to make sure esp stays alive for the entire
5072 // function. Otherwise post-call esp adjustments get dead-code eliminated.
5073 keepEspLiveAtExit();
5074 }
5075
5076 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
5077 SizeT Index3) {
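  // Worked example (illustrative, not from the original source): indices
  // (1, 0, 3, 2) give 0b10110001 = 0xB1, so "pshufd xmm, xmm, 0xB1" swaps
  // each adjacent pair of 32-bit lanes.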
5078 const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
5079 ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
5080 assert(Mask < 256);
5081 return Mask;
5082 }
5083
5084 Variable *TargetX8664::lowerShuffleVector_AllFromSameSrc(
5085 Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
5086 constexpr SizeT SrcBit = 1 << 2;
5087 assert((Index0 & SrcBit) == (Index1 & SrcBit));
5088 assert((Index0 & SrcBit) == (Index2 & SrcBit));
5089 assert((Index0 & SrcBit) == (Index3 & SrcBit));
5090 (void)SrcBit;
5091
5092 const Type SrcTy = Src->getType();
5093 auto *T = makeReg(SrcTy);
5094 auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
5095 auto *Mask =
5096 Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5097 _pshufd(T, SrcRM, Mask);
5098 return T;
5099 }
5100
5101 Variable *
5102 TargetX8664::lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
5103 SizeT Index1, Operand *Src1,
5104 SizeT Index2, SizeT Index3) {
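  // A minimal sketch of the intent (assuming the usual shufps semantics):
  // T is initialized from Src0, and shufps takes the low half of the result
  // from its destination and the high half from its source, so the low two
  // lanes of the result come from Src0 (selected by Index0/Index1) and the
  // high two lanes come from Src1 (selected by Index2/Index3).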
5105 constexpr SizeT SrcBit = 1 << 2;
5106 assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
5107 assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
5108 (void)SrcBit;
5109
5110 const Type SrcTy = Src0->getType();
5111 assert(Src1->getType() == SrcTy);
5112 auto *T = makeReg(SrcTy);
5113 auto *Src0R = legalizeToReg(Src0);
5114 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5115 auto *Mask =
5116 Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5117 _movp(T, Src0R);
5118 _shufps(T, Src1RM, Mask);
5119 return T;
5120 }
5121
5122 Variable *TargetX8664::lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
5123 SizeT Index0,
5124 Operand *Src1,
5125 SizeT Index1) {
5126 return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
5127 Index1, IGNORE_INDEX);
5128 }
5129
5130 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
5131 SizeT Index3) {
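  // Worked example (illustrative, not from the original source): shuffle
  // indexes (0, 5, 1, 4) over two v4 sources have SrcBit set only for
  // elements 1 and 3, so this returns 0b1010, selecting the
  // CASE_SRCS_IN(0, 1, 0, 1) arm in lowerShuffleVector().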
5132 constexpr SizeT SrcBit = 1 << 2;
5133 const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
5134 const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
5135 const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
5136 const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
5137 return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
5138 }
5139
5140 GlobalString TargetX8664::lowerShuffleVector_NewMaskName() {
5141 GlobalString FuncName = Func->getFunctionName();
5142 const SizeT Id = PshufbMaskCount++;
5143 if (!BuildDefs::dump() || !FuncName.hasStdString()) {
5144 return GlobalString::createWithString(
5145 Ctx,
5146 "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
5147 }
5148 return GlobalString::createWithString(
5149 Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
5150 }
5151
5152 ConstantRelocatable *TargetX8664::lowerShuffleVector_CreatePshufbMask(
5153 int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
5154 int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
5155 int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
5156 int8_t Idx15) {
5157 static constexpr uint8_t NumElements = 16;
5158 const char Initializer[NumElements] = {
5159 Idx0, Idx1, Idx2, Idx3, Idx4, Idx5, Idx6, Idx7,
5160 Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
5161 };
5162
5163 static constexpr Type V4VectorType = IceType_v4i32;
5164 const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
5165 auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
5166 GlobalString MaskName = lowerShuffleVector_NewMaskName();
5167 Mask->setIsConstant(true);
5168 Mask->addInitializer(VariableDeclaration::DataInitializer::create(
5169 Func->getGlobalPool(), Initializer, NumElements));
5170 Mask->setName(MaskName);
5171 // Mask needs to be 16-byte aligned, or pshufb will seg fault.
5172 Mask->setAlignment(MaskAlignment);
5173 Func->addGlobal(Mask);
5174
5175 constexpr RelocOffsetT Offset = 0;
5176 return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
5177 }
5178
5179 void TargetX8664::lowerShuffleVector_UsingPshufb(
5180 Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
5181 int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
5182 int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
5183 int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
5184 const Type DestTy = Dest->getType();
5185 static constexpr bool NotRebased = false;
5186 static constexpr Variable *NoBase = nullptr;
5187 // We use void for the memory operand instead of DestTy because using the
5188 // latter causes a validation failure: the X86 Inst layer complains that
5189 // vector mem operands could be under aligned. Thus, using void we avoid the
5190 // validation error. Note that the mask global declaration is aligned, so it
5191 // can be used as an XMM mem operand.
5192 static constexpr Type MaskType = IceType_void;
5193 #define IDX_IN_SRC(N, S) \
5194 ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
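  // IDX_IN_SRC(N, S) keeps byte index N when it selects from source S (bit 4
  // of N distinguishes the two 16-byte sources) and otherwise yields
  // CLEAR_ALL_BITS so that pshufb zeroes the lane. Illustrative example:
  // IDX_IN_SRC(18, 1) == 2, while IDX_IN_SRC(18, 0) == CLEAR_ALL_BITS.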
5195 auto *Mask0M = X86OperandMem::create(
5196 Func, MaskType, NoBase,
5197 lowerShuffleVector_CreatePshufbMask(
5198 IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
5199 IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
5200 IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
5201 IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
5202 IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
5203 IDX_IN_SRC(Idx15, 0)),
5204 NotRebased);
5205
5206 auto *T0 = makeReg(DestTy);
5207 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5208 _movp(T0, Src0RM);
5209
5210 _pshufb(T0, Mask0M);
5211
5212 if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
5213 Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
5214 Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
5215 Idx15 >= 16) {
5216 auto *Mask1M = X86OperandMem::create(
5217 Func, MaskType, NoBase,
5218 lowerShuffleVector_CreatePshufbMask(
5219 IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
5220 IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
5221 IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
5222 IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
5223 IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
5224 IDX_IN_SRC(Idx15, 1)),
5225 NotRebased);
5226 #undef IDX_IN_SRC
5227 auto *T1 = makeReg(DestTy);
5228 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5229 _movp(T1, Src1RM);
5230 _pshufb(T1, Mask1M);
5231 _por(T0, T1);
5232 }
5233
5234 _movp(Dest, T0);
5235 }
5236
5237 void TargetX8664::lowerShuffleVector(const InstShuffleVector *Instr) {
5238 auto *Dest = Instr->getDest();
5239 const Type DestTy = Dest->getType();
5240 auto *Src0 = Instr->getSrc(0);
5241 auto *Src1 = Instr->getSrc(1);
5242 const SizeT NumElements = typeNumElements(DestTy);
5243
5244 auto *T = makeReg(DestTy);
5245
5246 switch (DestTy) {
5247 default:
5248 llvm::report_fatal_error("Unexpected vector type.");
5249 case IceType_v16i1:
5250 case IceType_v16i8: {
5251 static constexpr SizeT ExpectedNumElements = 16;
5252 assert(ExpectedNumElements == Instr->getNumIndexes());
5253 (void)ExpectedNumElements;
5254
5255 if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
5256 auto *T = makeReg(DestTy);
5257 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5258 _movp(T, Src0RM);
5259 _punpckl(T, Src0RM);
5260 _movp(Dest, T);
5261 return;
5262 }
5263
5264 if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
5265 23)) {
5266 auto *T = makeReg(DestTy);
5267 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5268 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5269 _movp(T, Src0RM);
5270 _punpckl(T, Src1RM);
5271 _movp(Dest, T);
5272 return;
5273 }
5274
5275 if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
5276 15, 15)) {
5277 auto *T = makeReg(DestTy);
5278 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5279 _movp(T, Src0RM);
5280 _punpckh(T, Src0RM);
5281 _movp(Dest, T);
5282 return;
5283 }
5284
5285 if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
5286 15, 31)) {
5287 auto *T = makeReg(DestTy);
5288 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5289 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5290 _movp(T, Src0RM);
5291 _punpckh(T, Src1RM);
5292 _movp(Dest, T);
5293 return;
5294 }
5295
5296 if (InstructionSet < SSE4_1) {
5297 // TODO(jpp): figure out how to lower with sse2.
5298 break;
5299 }
5300
5301 const SizeT Index0 = Instr->getIndexValue(0);
5302 const SizeT Index1 = Instr->getIndexValue(1);
5303 const SizeT Index2 = Instr->getIndexValue(2);
5304 const SizeT Index3 = Instr->getIndexValue(3);
5305 const SizeT Index4 = Instr->getIndexValue(4);
5306 const SizeT Index5 = Instr->getIndexValue(5);
5307 const SizeT Index6 = Instr->getIndexValue(6);
5308 const SizeT Index7 = Instr->getIndexValue(7);
5309 const SizeT Index8 = Instr->getIndexValue(8);
5310 const SizeT Index9 = Instr->getIndexValue(9);
5311 const SizeT Index10 = Instr->getIndexValue(10);
5312 const SizeT Index11 = Instr->getIndexValue(11);
5313 const SizeT Index12 = Instr->getIndexValue(12);
5314 const SizeT Index13 = Instr->getIndexValue(13);
5315 const SizeT Index14 = Instr->getIndexValue(14);
5316 const SizeT Index15 = Instr->getIndexValue(15);
5317
5318 lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
5319 Index3, Index4, Index5, Index6, Index7,
5320 Index8, Index9, Index10, Index11, Index12,
5321 Index13, Index14, Index15);
5322 return;
5323 }
5324 case IceType_v8i1:
5325 case IceType_v8i16: {
5326 static constexpr SizeT ExpectedNumElements = 8;
5327 assert(ExpectedNumElements == Instr->getNumIndexes());
5328 (void)ExpectedNumElements;
5329
5330 if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
5331 auto *T = makeReg(DestTy);
5332 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5333 _movp(T, Src0RM);
5334 _punpckl(T, Src0RM);
5335 _movp(Dest, T);
5336 return;
5337 }
5338
5339 if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
5340 auto *T = makeReg(DestTy);
5341 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5342 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5343 _movp(T, Src0RM);
5344 _punpckl(T, Src1RM);
5345 _movp(Dest, T);
5346 return;
5347 }
5348
5349 if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
5350 auto *T = makeReg(DestTy);
5351 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5352 _movp(T, Src0RM);
5353 _punpckh(T, Src0RM);
5354 _movp(Dest, T);
5355 return;
5356 }
5357
5358 if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
5359 auto *T = makeReg(DestTy);
5360 auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5361 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5362 _movp(T, Src0RM);
5363 _punpckh(T, Src1RM);
5364 _movp(Dest, T);
5365 return;
5366 }
5367
5368 if (InstructionSet < SSE4_1) {
5369 // TODO(jpp): figure out how to lower with sse2.
5370 break;
5371 }
5372
5373 const SizeT Index0 = Instr->getIndexValue(0);
5374 const SizeT Index1 = Instr->getIndexValue(1);
5375 const SizeT Index2 = Instr->getIndexValue(2);
5376 const SizeT Index3 = Instr->getIndexValue(3);
5377 const SizeT Index4 = Instr->getIndexValue(4);
5378 const SizeT Index5 = Instr->getIndexValue(5);
5379 const SizeT Index6 = Instr->getIndexValue(6);
5380 const SizeT Index7 = Instr->getIndexValue(7);
5381
5382 #define TO_BYTE_INDEX(I) ((I) << 1)
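    // Each 16-bit lane index is expanded into the pair of byte indexes
    // (2*I, 2*I + 1) so that the byte-granular pshufb moves both halves of
    // the word; e.g. (illustrative) lane index 3 becomes byte indexes 6 and 7.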
5383 lowerShuffleVector_UsingPshufb(
5384 Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
5385 TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
5386 TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
5387 TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
5388 TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
5389 TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
5390 TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
5391 TO_BYTE_INDEX(Index7) + 1);
5392 #undef TO_BYTE_INDEX
5393 return;
5394 }
5395 case IceType_v4i1:
5396 case IceType_v4i32:
5397 case IceType_v4f32: {
5398 static constexpr SizeT ExpectedNumElements = 4;
5399 assert(ExpectedNumElements == Instr->getNumIndexes());
5400 const SizeT Index0 = Instr->getIndexValue(0);
5401 const SizeT Index1 = Instr->getIndexValue(1);
5402 const SizeT Index2 = Instr->getIndexValue(2);
5403 const SizeT Index3 = Instr->getIndexValue(3);
5404 Variable *T = nullptr;
5405 switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
5406 #define CASE_SRCS_IN(S0, S1, S2, S3) \
5407 case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
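    // Each Si bit records whether result element i comes from Src1 (1) or
    // Src0 (0); e.g. CASE_SRCS_IN(0, 1, 0, 1) handles shuffles whose
    // elements 0 and 2 come from Src0 and whose elements 1 and 3 come from
    // Src1.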
5408 CASE_SRCS_IN(0, 0, 0, 0) : {
5409 T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
5410 Index3);
5411 }
5412 break;
5413 CASE_SRCS_IN(0, 0, 0, 1) : {
5414 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
5415 Src1, Index3);
5416 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
5417 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5418 }
5419 break;
5420 CASE_SRCS_IN(0, 0, 1, 0) : {
5421 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
5422 Src0, Index3);
5423 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
5424 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5425 }
5426 break;
5427 CASE_SRCS_IN(0, 0, 1, 1) : {
5428 T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
5429 Index2, Index3);
5430 }
5431 break;
5432 CASE_SRCS_IN(0, 1, 0, 0) : {
5433 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
5434 Src1, Index1);
5435 T = lowerShuffleVector_TwoFromSameSrc(
5436 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
5437 }
5438 break;
5439 CASE_SRCS_IN(0, 1, 0, 1) : {
5440 if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
5441 (Index3 - ExpectedNumElements) == 1) {
5442 auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5443 auto *Src0R = legalizeToReg(Src0);
5444 T = makeReg(DestTy);
5445 _movp(T, Src0R);
5446 _punpckl(T, Src1RM);
5447 } else if (Index0 == Index2 && Index1 == Index3) {
5448 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
5449 Src0, Index0, Src1, Index1);
5450 T = lowerShuffleVector_AllFromSameSrc(
5451 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
5452 UNIFIED_INDEX_1);
5453 } else {
5454 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
5455 Src0, Index0, Src1, Index1);
5456 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
5457 Src0, Index2, Src1, Index3);
5458 T = lowerShuffleVector_TwoFromSameSrc(
5459 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
5460 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5461 }
5462 }
5463 break;
5464 CASE_SRCS_IN(0, 1, 1, 0) : {
5465 if (Index0 == Index3 && Index1 == Index2) {
5466 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
5467 Src0, Index0, Src1, Index1);
5468 T = lowerShuffleVector_AllFromSameSrc(
5469 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
5470 UNIFIED_INDEX_0);
5471 } else {
5472 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
5473 Src0, Index0, Src1, Index1);
5474 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
5475 Src1, Index2, Src0, Index3);
5476 T = lowerShuffleVector_TwoFromSameSrc(
5477 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
5478 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5479 }
5480 }
5481 break;
5482 CASE_SRCS_IN(0, 1, 1, 1) : {
5483 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
5484 Src1, Index1);
5485 T = lowerShuffleVector_TwoFromSameSrc(
5486 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
5487 }
5488 break;
5489 CASE_SRCS_IN(1, 0, 0, 0) : {
5490 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
5491 Src0, Index1);
5492 T = lowerShuffleVector_TwoFromSameSrc(
5493 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
5494 }
5495 break;
5496 CASE_SRCS_IN(1, 0, 0, 1) : {
5497 if (Index0 == Index3 && Index1 == Index2) {
5498 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
5499 Src1, Index0, Src0, Index1);
5500 T = lowerShuffleVector_AllFromSameSrc(
5501 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
5502 UNIFIED_INDEX_0);
5503 } else {
5504 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
5505 Src1, Index0, Src0, Index1);
5506 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
5507 Src0, Index2, Src1, Index3);
5508 T = lowerShuffleVector_TwoFromSameSrc(
5509 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
5510 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5511 }
5512 }
5513 break;
5514 CASE_SRCS_IN(1, 0, 1, 0) : {
5515 if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
5516 (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
5517 auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
5518 auto *Src0R = legalizeToReg(Src1);
5519 T = makeReg(DestTy);
5520 _movp(T, Src0R);
5521 _punpckl(T, Src1RM);
5522 } else if (Index0 == Index2 && Index1 == Index3) {
5523 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
5524 Src1, Index0, Src0, Index1);
5525 T = lowerShuffleVector_AllFromSameSrc(
5526 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
5527 UNIFIED_INDEX_1);
5528 } else {
5529 auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
5530 Src1, Index0, Src0, Index1);
5531 auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
5532 Src1, Index2, Src0, Index3);
5533 T = lowerShuffleVector_TwoFromSameSrc(
5534 Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
5535 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5536 }
5537 }
5538 break;
5539 CASE_SRCS_IN(1, 0, 1, 1) : {
5540 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
5541 Src0, Index1);
5542 T = lowerShuffleVector_TwoFromSameSrc(
5543 Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
5544 }
5545 break;
5546 CASE_SRCS_IN(1, 1, 0, 0) : {
5547 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
5548 Index2, Index3);
5549 }
5550 break;
5551 CASE_SRCS_IN(1, 1, 0, 1) : {
5552 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
5553 Src1, Index3);
5554 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
5555 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5556 }
5557 break;
5558 CASE_SRCS_IN(1, 1, 1, 0) : {
5559 auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
5560 Src0, Index3);
5561 T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
5562 UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5563 }
5564 break;
5565 CASE_SRCS_IN(1, 1, 1, 1) : {
5566 T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
5567 Index3);
5568 }
5569 break;
5570 #undef CASE_SRCS_IN
5571 }
5572
5573 assert(T != nullptr);
5574 assert(T->getType() == DestTy);
5575 _movp(Dest, T);
5576 return;
5577 } break;
5578 }
5579
5580 // Unoptimized shuffle. Perform a series of inserts and extracts.
5581 Context.insert<InstFakeDef>(T);
5582 const Type ElementType = typeElementType(DestTy);
5583 for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
5584 auto *Index = Instr->getIndex(I);
5585 const SizeT Elem = Index->getValue();
5586 auto *ExtElmt = makeReg(ElementType);
5587 if (Elem < NumElements) {
5588 lowerExtractElement(
5589 InstExtractElement::create(Func, ExtElmt, Src0, Index));
5590 } else {
5591 lowerExtractElement(InstExtractElement::create(
5592 Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
5593 }
5594 auto *NewT = makeReg(DestTy);
5595 lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
5596 Ctx->getConstantInt32(I)));
5597 T = NewT;
5598 }
5599 _movp(Dest, T);
5600 }
5601
5602 void TargetX8664::lowerSelect(const InstSelect *Select) {
5603 Variable *Dest = Select->getDest();
5604
5605 Operand *Condition = Select->getCondition();
5606 // Handle folding opportunities.
5607 if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
5608 assert(Producer->isDeleted());
5609 switch (BoolFolding::getProducerKind(Producer)) {
5610 default:
5611 break;
5612 case BoolFolding::PK_Icmp32:
5613 case BoolFolding::PK_Icmp64: {
5614 lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
5615 return;
5616 }
5617 case BoolFolding::PK_Fcmp: {
5618 lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
5619 return;
5620 }
5621 }
5622 }
5623
5624 if (isVectorType(Dest->getType())) {
5625 lowerSelectVector(Select);
5626 return;
5627 }
5628
5629 Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
5630 Operand *Zero = Ctx->getConstantZero(IceType_i32);
5631 _cmp(CmpResult, Zero);
5632 Operand *SrcT = Select->getTrueOperand();
5633 Operand *SrcF = Select->getFalseOperand();
5634 const BrCond Cond = CondX86::Br_ne;
5635 lowerSelectMove(Dest, Cond, SrcT, SrcF);
5636 }
5637
5638 void TargetX8664::lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
5639 Operand *SrcF) {
5640 Type DestTy = Dest->getType();
5641 if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
5642 // The cmov instruction doesn't allow 8-bit or FP operands, so we need
5643 // explicit control flow.
5644 // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
5645 auto *Label = InstX86Label::create(Func, this);
5646 SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
5647 _mov(Dest, SrcT);
5648 _br(Cond, Label);
5649 SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
5650 _redefined(_mov(Dest, SrcF));
5651 Context.insert(Label);
5652 return;
5653 }
5654 // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
5655 // But if SrcT is immediate, we might be able to do better, as the cmov
5656 // instruction doesn't allow an immediate operand:
5657 // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
5658 if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
5659 std::swap(SrcT, SrcF);
5660 Cond = InstX86Base::getOppositeCondition(Cond);
5661 }
5662
5663 assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
5664 DestTy == IceType_i64);
5665 lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
5666 }
5667
5668 void TargetX8664::lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
5669 Operand *SrcF) {
5670 Variable *T = nullptr;
5671 SrcF = legalize(SrcF);
5672 _mov(T, SrcF);
5673 SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
5674 _cmov(T, SrcT, Cond);
5675 _mov(Dest, T);
5676 }
5677
5678 void TargetX8664::lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition) {
5679 assert(Dest->getType() == Src->getType());
5680 assert(!Dest->isRematerializable());
5681 Operand *SrcLegal;
5682 if (Dest->hasReg()) {
5683 // If Dest already has a physical register, then only basic legalization
5684 // is needed, as the source operand can be a register, immediate, or
5685 // memory.
5686 SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
5687 } else {
5688 // If Dest could be a stack operand, then RI must be a physical register
5689 // or a scalar integer immediate.
5690 SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
5691 }
5692 if (isVectorType(Dest->getType())) {
5693 _redefined(_movp(Dest, SrcLegal), IsRedefinition);
5694 } else {
5695 _redefined(_mov(Dest, SrcLegal), IsRedefinition);
5696 }
5697 }
5698
5699 bool TargetX8664::lowerOptimizeFcmpSelect(const InstFcmp *Fcmp,
5700 const InstSelect *Select) {
5701 Operand *CmpSrc0 = Fcmp->getSrc(0);
5702 Operand *CmpSrc1 = Fcmp->getSrc(1);
5703 Operand *SelectSrcT = Select->getTrueOperand();
5704 Operand *SelectSrcF = Select->getFalseOperand();
5705 Variable *SelectDest = Select->getDest();
5706
5707 // TODO(capn): also handle swapped compare/select operand order.
5708 if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
5709 return false;
5710
5711 // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
5712 InstFcmp::FCond Condition = Fcmp->getCondition();
5713 switch (Condition) {
5714 default:
5715 return false;
5716 case InstFcmp::True:
5717 break;
5718 case InstFcmp::False:
5719 break;
5720 case InstFcmp::Ogt: {
5721 Variable *T = makeReg(SelectDest->getType());
5722 if (isScalarFloatingType(SelectSrcT->getType())) {
5723 _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
5724 _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
5725 _mov(SelectDest, T);
5726 } else {
5727 _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
5728 _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
5729 _movp(SelectDest, T);
5730 }
5731 return true;
5732 } break;
5733 case InstFcmp::Olt: {
5734 Variable *T = makeReg(SelectSrcT->getType());
5735 if (isScalarFloatingType(SelectSrcT->getType())) {
5736 _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
5737 _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
5738 _mov(SelectDest, T);
5739 } else {
5740 _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
5741 _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
5742 _movp(SelectDest, T);
5743 }
5744 return true;
5745 } break;
5746 }
5747 return false;
5748 }
5749
5750 void TargetX8664::lowerIcmp(const InstIcmp *Icmp) {
5751 Variable *Dest = Icmp->getDest();
5752 if (isVectorType(Dest->getType())) {
5753 lowerIcmpVector(Icmp);
5754 } else {
5755 constexpr Inst *Consumer = nullptr;
5756 lowerIcmpAndConsumer(Icmp, Consumer);
5757 }
5758 }
5759
5760 void TargetX8664::lowerSelectVector(const InstSelect *Instr) {
5761 Variable *Dest = Instr->getDest();
5762 Type DestTy = Dest->getType();
5763 Operand *SrcT = Instr->getTrueOperand();
5764 Operand *SrcF = Instr->getFalseOperand();
5765 Operand *Condition = Instr->getCondition();
5766
5767 if (!isVectorType(DestTy))
5768 llvm::report_fatal_error("Expected a vector select");
5769
5770 Type SrcTy = SrcT->getType();
5771 Variable *T = makeReg(SrcTy);
5772 Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
5773 Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
5774
5775 if (InstructionSet >= SSE4_1) {
5776 // TODO(wala): If the condition operand is a constant, use blendps or
5777 // pblendw.
5778 //
5779 // Use blendvps or pblendvb to implement select.
5780 if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
5781 SrcTy == IceType_v4f32) {
5782 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
5783 Variable *xmm0 = makeReg(IceType_v4i32, RegX8664::Reg_xmm0);
5784 _movp(xmm0, ConditionRM);
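      // blendvps selects on the sign bit of each 32-bit lane, so shift the
      // boolean condition (0 or 1) up into bit 31 of every element.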
5785 _psll(xmm0, Ctx->getConstantInt8(31));
5786 _movp(T, SrcFRM);
5787 _blendvps(T, SrcTRM, xmm0);
5788 _movp(Dest, T);
5789 } else {
5790 assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
5791 Type SignExtTy =
5792 Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
5793 Variable *xmm0 = makeReg(SignExtTy, RegX8664::Reg_xmm0);
5794 lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
5795 _movp(T, SrcFRM);
5796 _pblendvb(T, SrcTRM, xmm0);
5797 _movp(Dest, T);
5798 }
5799 return;
5800 }
5801 // Lower select without SSE4.1:
5802 // a=d?b:c ==>
5803 // if elementtype(d) != i1:
5804 // d=sext(d);
5805 // a=(b&d)|(c&~d);
5806 Variable *T2 = makeReg(SrcTy);
5807 // Sign extend the condition operand if applicable.
5808 if (SrcTy == IceType_v4f32) {
5809 // The sext operation takes only integer arguments.
5810 Variable *T3 = Func->makeVariable(IceType_v4i32);
5811 lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
5812 _movp(T, T3);
5813 } else if (typeElementType(SrcTy) != IceType_i1) {
5814 lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
5815 } else {
5816 Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
5817 _movp(T, ConditionRM);
5818 }
5819 _movp(T2, T);
5820 _pand(T, SrcTRM);
5821 _pandn(T2, SrcFRM);
5822 _por(T, T2);
5823 _movp(Dest, T);
5824
5825 return;
5826 }
5827
5828 void TargetX8664::lowerStore(const InstStore *Instr) {
5829 Operand *Value = Instr->getData();
5830 Operand *Addr = Instr->getStoreAddress();
5831 X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
5832 doMockBoundsCheck(NewAddr);
5833 Type Ty = NewAddr->getType();
5834
5835 if (isVectorType(Ty)) {
5836 _storep(legalizeToReg(Value), NewAddr);
5837 } else {
5838 Value = legalize(Value, Legal_Reg | Legal_Imm);
5839 _store(Value, NewAddr);
5840 }
5841 }
5842
5843 void TargetX8664::doAddressOptStore() {
5844 auto *Instr = llvm::cast<InstStore>(Context.getCur());
5845 Operand *Addr = Instr->getStoreAddress();
5846 Operand *Data = Instr->getData();
5847 if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
5848 Instr->setDeleted();
5849 auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
5850 if (Instr->getDest())
5851 NewStore->setRmwBeacon(Instr->getRmwBeacon());
5852 }
5853 }
5854
5855 void TargetX8664::doAddressOptStoreSubVector() {
5856 auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
5857 Operand *Addr = Intrinsic->getArg(1);
5858 Operand *Data = Intrinsic->getArg(0);
5859 if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
5860 Intrinsic->setDeleted();
5861 const Ice::Intrinsics::IntrinsicInfo Info = {
5862 Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
5863 Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
5864 auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info);
5865 NewStore->addArg(Data);
5866 NewStore->addArg(OptAddr);
5867 NewStore->addArg(Intrinsic->getArg(2));
5868 }
5869 }
5870
5871 Operand *TargetX8664::lowerCmpRange(Operand *Comparison, uint64_t Min,
5872 uint64_t Max) {
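  // Illustrative example (not from the original source): for a case range
  // [10, 14] this emits "sub t, 10" followed by "cmp t, 4"; the caller then
  // branches on an unsigned condition (ja/jbe), so values outside the range
  // wrap around to large unsigned numbers and fail the comparison.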
5873 // Subtracting 0 is a nop so don't do it
5874 if (Min != 0) {
5875 // Avoid clobbering the comparison by copying it
5876 Variable *T = nullptr;
5877 _mov(T, Comparison);
5878 _sub(T, Ctx->getConstantInt32(Min));
5879 Comparison = T;
5880 }
5881
5882 _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
5883
5884 return Comparison;
5885 }
5886
5887 void TargetX8664::lowerCaseCluster(const CaseCluster &Case, Operand *Comparison,
5888 bool DoneCmp, CfgNode *DefaultTarget) {
5889 switch (Case.getKind()) {
5890 case CaseCluster::JumpTable: {
5891 InstX86Label *SkipJumpTable;
5892
5893 Operand *RangeIndex =
5894 lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
5895 if (DefaultTarget == nullptr) {
5896 // Skip over jump table logic if comparison not in range and no default
5897 SkipJumpTable = InstX86Label::create(Func, this);
5898 _br(CondX86::Br_a, SkipJumpTable);
5899 } else {
5900 _br(CondX86::Br_a, DefaultTarget);
5901 }
5902
5903 InstJumpTable *JumpTable = Case.getJumpTable();
5904 Context.insert(JumpTable);
5905
5906 // Make sure the index is a register of the same width as the base
5907 Variable *Index;
5908 const Type PointerType = getPointerType();
5909 if (RangeIndex->getType() != PointerType) {
5910 Index = makeReg(PointerType);
5911 if (RangeIndex->getType() == IceType_i64) {
5912 _mov(Index, RangeIndex); // trunc
5913 } else {
5914 Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
5915 _movzx(Index, RangeIndexRM);
5916 }
5917 } else {
5918 Index = legalizeToReg(RangeIndex);
5919 }
5920
5921 constexpr RelocOffsetT RelocOffset = 0;
5922 constexpr Variable *NoBase = nullptr;
5923 constexpr Constant *NoOffset = nullptr;
5924 auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
5925 Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
5926 uint16_t Shift = typeWidthInBytesLog2(PointerType);
5927 constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
5928
5929 Variable *Target = nullptr;
5930 if (PointerType == IceType_i32) {
5931 _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
5932 Index, Shift, Segment));
5933 } else {
5934 auto *Base = makeReg(IceType_i64);
5935 _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
5936 _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
5937 Index, Shift, Segment));
5938 }
5939
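    // Illustrative sketch of the emitted sequence for the 64-bit pointer
    // case (not literal output):
    //   lea  base, [JumpTableSym]
    //   mov  target, [base + index*8]
    //   jmp  target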
5940 lowerIndirectJump(Target);
5941
5942 if (DefaultTarget == nullptr)
5943 Context.insert(SkipJumpTable);
5944 return;
5945 }
5946 case CaseCluster::Range: {
5947 if (Case.isUnitRange()) {
5948 // Single item
5949 if (!DoneCmp) {
5950 Constant *Value = Ctx->getConstantInt32(Case.getLow());
5951 _cmp(Comparison, Value);
5952 }
5953 _br(CondX86::Br_e, Case.getTarget());
5954 } else if (DoneCmp && Case.isPairRange()) {
5955         // Range of two items with first item already compared against
5956 _br(CondX86::Br_e, Case.getTarget());
5957 Constant *Value = Ctx->getConstantInt32(Case.getHigh());
5958 _cmp(Comparison, Value);
5959 _br(CondX86::Br_e, Case.getTarget());
5960 } else {
5961 // Range
5962 lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
5963 _br(CondX86::Br_be, Case.getTarget());
5964 }
5965 if (DefaultTarget != nullptr)
5966 _br(DefaultTarget);
5967 return;
5968 }
5969 }
5970 }
5971
5972 void TargetX8664::lowerSwitch(const InstSwitch *Instr) {
5973 // Group cases together and navigate through them with a binary search
5974 CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
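  // Illustrative example (not from the original source): a switch over cases
  // {0, 1, 2, 3, 10, 11, 40} might be clustered into a dense run suitable for
  // a jump table, a small contiguous range, and a lone unit range; the binary
  // search below then dispatches between those clusters.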
5975 Operand *Src0 = Instr->getComparison();
5976 CfgNode *DefaultTarget = Instr->getLabelDefault();
5977
5978 assert(CaseClusters.size() != 0); // Should always be at least one
5979
5980 if (CaseClusters.size() == 1) {
5981 // Jump straight to default if needed. Currently a common case as jump
5982 // tables occur on their own.
5983 constexpr bool DoneCmp = false;
5984 lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
5985 return;
5986 }
5987
5988 // Going to be using multiple times so get it in a register early
5989 Variable *Comparison = legalizeToReg(Src0);
5990
5991 // A span is over the clusters
5992 struct SearchSpan {
5993 SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
5994 : Begin(Begin), Size(Size), Label(Label) {}
5995
5996 SizeT Begin;
5997 SizeT Size;
5998 InstX86Label *Label;
5999 };
6000 // The stack will only grow to the height of the tree so 12 should be plenty
6001 std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
6002 SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
6003 bool DoneCmp = false;
6004
6005 while (!SearchSpanStack.empty()) {
6006 SearchSpan Span = SearchSpanStack.top();
6007 SearchSpanStack.pop();
6008
6009 if (Span.Label != nullptr)
6010 Context.insert(Span.Label);
6011
6012 switch (Span.Size) {
6013 case 0:
6014 llvm::report_fatal_error("Invalid SearchSpan size");
6015 break;
6016
6017 case 1:
6018 lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
6019 SearchSpanStack.empty() ? nullptr : DefaultTarget);
6020 DoneCmp = false;
6021 break;
6022
6023 case 2: {
6024 const CaseCluster *CaseA = &CaseClusters[Span.Begin];
6025 const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
6026
6027 // Placing a range last may allow register clobbering during the range
6028 // test. That means there is no need to clone the register. If it is a
6029 // unit range the comparison may have already been done in the binary
6030 // search (DoneCmp) and so it should be placed first. If this is a range
6031 // of two items and the comparison with the low value has already been
6032 // done, comparing with the other element is cheaper than a range test.
6033 // If the low end of the range is zero then there is no subtraction and
6034 // nothing to be gained.
6035 if (!CaseA->isUnitRange() &&
6036 !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
6037 std::swap(CaseA, CaseB);
6038 DoneCmp = false;
6039 }
6040
6041 lowerCaseCluster(*CaseA, Comparison, DoneCmp);
6042 DoneCmp = false;
6043 lowerCaseCluster(*CaseB, Comparison, DoneCmp,
6044 SearchSpanStack.empty() ? nullptr : DefaultTarget);
6045 } break;
6046
6047 default:
6048 // Pick the middle item and branch b or ae
6049 SizeT PivotIndex = Span.Begin + (Span.Size / 2);
6050 const CaseCluster &Pivot = CaseClusters[PivotIndex];
6051 Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
6052 InstX86Label *Label = InstX86Label::create(Func, this);
6053 _cmp(Comparison, Value);
6054       // TODO(ascull): does it always have to be far?
6055 _br(CondX86::Br_b, Label, InstX86Br::Far);
6056 // Lower the left and (pivot+right) sides, falling through to the right
6057 SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
6058 SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
6059 DoneCmp = true;
6060 break;
6061 }
6062 }
6063
6064 _br(DefaultTarget);
6065 }
6066
6067 /// The following pattern occurs often in lowered C and C++ code:
6068 ///
6069 /// %cmp = fcmp/icmp pred <n x ty> %src0, %src1
6070 /// %cmp.ext = sext <n x i1> %cmp to <n x ty>
6071 ///
6072 /// We can eliminate the sext operation by copying the result of pcmpeqd,
6073 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of the
6074 /// sext operation.
6075
6076 void TargetX8664::eliminateNextVectorSextInstruction(
6077 Variable *SignExtendedResult) {
6078 if (auto *NextCast =
6079 llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
6080 if (NextCast->getCastKind() == InstCast::Sext &&
6081 NextCast->getSrc(0) == SignExtendedResult) {
6082 NextCast->setDeleted();
6083 _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
6084 // Skip over the instruction.
6085 Context.advanceNext();
6086 }
6087 }
6088 }
6089
6090 void TargetX8664::lowerUnreachable(const InstUnreachable * /*Instr*/) {
6091 _ud2();
6092 // Add a fake use of esp to make sure esp adjustments after the unreachable
6093 // do not get dead-code eliminated.
6094 keepEspLiveAtExit();
6095 }
6096
6097 void TargetX8664::lowerBreakpoint(const InstBreakpoint * /*Instr*/) { _int3(); }
6098
6099 void TargetX8664::lowerRMW(const InstX86FakeRMW *RMW) {
6100 // If the beacon variable's live range does not end in this instruction, then
6101 // it must end in the modified Store instruction that follows. This means
6102 // that the original Store instruction is still there, either because the
6103 // value being stored is used beyond the Store instruction, or because dead
6104 // code elimination did not happen. In either case, we cancel RMW lowering
6105 // (and the caller deletes the RMW instruction).
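  // Illustrative example (not from the original source): the pattern
  //   a  = load [addr]
  //   a2 = a + 1
  //   store a2, [addr]
  // is what the fake-RMW rewrite targets; when this lowering succeeds it
  // becomes a single "add DWORD PTR [addr], 1", and the beacon's last use
  // ending here tells us the original store really was eliminated.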
6106 if (!RMW->isLastUse(RMW->getBeacon()))
6107 return;
6108 Operand *Src = RMW->getData();
6109 Type Ty = Src->getType();
6110 X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
6111 doMockBoundsCheck(Addr);
6112 switch (RMW->getOp()) {
6113 default:
6114 // TODO(stichnot): Implement other arithmetic operators.
6115 break;
6116 case InstArithmetic::Add:
6117 Src = legalize(Src, Legal_Reg | Legal_Imm);
6118 _add_rmw(Addr, Src);
6119 return;
6120 case InstArithmetic::Sub:
6121 Src = legalize(Src, Legal_Reg | Legal_Imm);
6122 _sub_rmw(Addr, Src);
6123 return;
6124 case InstArithmetic::And:
6125 Src = legalize(Src, Legal_Reg | Legal_Imm);
6126 _and_rmw(Addr, Src);
6127 return;
6128 case InstArithmetic::Or:
6129 Src = legalize(Src, Legal_Reg | Legal_Imm);
6130 _or_rmw(Addr, Src);
6131 return;
6132 case InstArithmetic::Xor:
6133 Src = legalize(Src, Legal_Reg | Legal_Imm);
6134 _xor_rmw(Addr, Src);
6135 return;
6136 }
6137 llvm::report_fatal_error("Couldn't lower RMW instruction");
6138 }
6139
6140 void TargetX8664::lowerOther(const Inst *Instr) {
6141 if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
6142 lowerRMW(RMW);
6143 } else {
6144 TargetLowering::lowerOther(Instr);
6145 }
6146 }
6147
6148 void TargetX8664::prelowerPhis() {
6149 // On x86-64 we don't need to prelower phis -- the architecture can handle
6150   // 64-bit integers natively.
6151 return;
6152 }
6153
6154 void TargetX8664::genTargetHelperCallFor(Inst *Instr) {
6155 uint32_t StackArgumentsSize = 0;
6156 if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
6157 RuntimeHelper HelperID = RuntimeHelper::H_Num;
6158 Variable *Dest = Arith->getDest();
6159 Type DestTy = Dest->getType();
6160 if (isVectorType(DestTy)) {
6161 Variable *Dest = Arith->getDest();
6162 Operand *Src0 = Arith->getSrc(0);
6163 Operand *Src1 = Arith->getSrc(1);
6164 switch (Arith->getOp()) {
6165 default:
6166 return;
6167 case InstArithmetic::Mul:
6168 if (DestTy == IceType_v16i8) {
6169 scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6170 Arith->setDeleted();
6171 }
6172 return;
6173 case InstArithmetic::Shl:
6174 case InstArithmetic::Lshr:
6175 case InstArithmetic::Ashr:
6176 if (llvm::isa<Constant>(Src1)) {
6177 return;
6178 }
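        // Fall through: a non-constant shift amount is scalarized below.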
6179 case InstArithmetic::Udiv:
6180 case InstArithmetic::Urem:
6181 case InstArithmetic::Sdiv:
6182 case InstArithmetic::Srem:
6183 case InstArithmetic::Frem:
6184 scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6185 Arith->setDeleted();
6186 return;
6187 }
6188 } else {
6189 switch (Arith->getOp()) {
6190 default:
6191 return;
6192 case InstArithmetic::Frem:
6193 if (isFloat32Asserting32Or64(DestTy))
6194 HelperID = RuntimeHelper::H_frem_f32;
6195 else
6196 HelperID = RuntimeHelper::H_frem_f64;
6197 }
6198 }
6199 constexpr SizeT MaxSrcs = 2;
6200 InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
6201 Call->addArg(Arith->getSrc(0));
6202 Call->addArg(Arith->getSrc(1));
6203 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
6204 Context.insert(Call);
6205 Arith->setDeleted();
6206 } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
6207 InstCast::OpKind CastKind = Cast->getCastKind();
6208 Operand *Src0 = Cast->getSrc(0);
6209 const Type SrcType = Src0->getType();
6210 Variable *Dest = Cast->getDest();
6211 const Type DestTy = Dest->getType();
6212 RuntimeHelper HelperID = RuntimeHelper::H_Num;
6213 Variable *CallDest = Dest;
6214 switch (CastKind) {
6215 default:
6216 return;
6217 case InstCast::Fptoui:
6218 if (isVectorType(DestTy)) {
6219 assert(DestTy == IceType_v4i32);
6220 assert(SrcType == IceType_v4f32);
6221 HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
6222 } else if (DestTy == IceType_i64) {
6223 HelperID = isFloat32Asserting32Or64(SrcType)
6224 ? RuntimeHelper::H_fptoui_f32_i64
6225 : RuntimeHelper::H_fptoui_f64_i64;
6226 } else {
6227 return;
6228 }
6229 break;
6230 case InstCast::Uitofp:
6231 if (isVectorType(SrcType)) {
6232 assert(DestTy == IceType_v4f32);
6233 assert(SrcType == IceType_v4i32);
6234 HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
6235 } else if (SrcType == IceType_i64) {
6236 if (isInt32Asserting32Or64(SrcType)) {
6237 HelperID = isFloat32Asserting32Or64(DestTy)
6238 ? RuntimeHelper::H_uitofp_i32_f32
6239 : RuntimeHelper::H_uitofp_i32_f64;
6240 } else {
6241 HelperID = isFloat32Asserting32Or64(DestTy)
6242 ? RuntimeHelper::H_uitofp_i64_f32
6243 : RuntimeHelper::H_uitofp_i64_f64;
6244 }
6245 } else {
6246 return;
6247 }
6248 break;
6249 case InstCast::Bitcast: {
6250 if (DestTy == Src0->getType())
6251 return;
6252 switch (DestTy) {
6253 default:
6254 return;
6255 case IceType_i8:
6256 assert(Src0->getType() == IceType_v8i1);
6257 HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
6258 CallDest = Func->makeVariable(IceType_i32);
6259 break;
6260 case IceType_i16:
6261 assert(Src0->getType() == IceType_v16i1);
6262 HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
6263 CallDest = Func->makeVariable(IceType_i32);
6264 break;
6265 case IceType_v8i1: {
6266 assert(Src0->getType() == IceType_i8);
6267 HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
6268 Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
6269 // Arguments to functions are required to be at least 32 bits wide.
6270 Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
6271 Src0 = Src0AsI32;
6272 } break;
6273 case IceType_v16i1: {
6274 assert(Src0->getType() == IceType_i16);
6275 HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
6276 Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
6277 // Arguments to functions are required to be at least 32 bits wide.
6278 Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
6279 Src0 = Src0AsI32;
6280 } break;
6281 }
6282 } break;
6283 }
6284 constexpr SizeT MaxSrcs = 1;
6285 InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
6286 Call->addArg(Src0);
6287 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
6288 Context.insert(Call);
6289 // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
6290 // result to the appropriate type as necessary.
6291 if (CallDest->getType() != Dest->getType())
6292 Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
6293 Cast->setDeleted();
6294 } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) {
6295 CfgVector<Type> ArgTypes;
6296 Type ReturnType = IceType_void;
6297 switch (Intrinsic->getIntrinsicID()) {
6298 default:
6299 return;
6300 case Intrinsics::Ctpop: {
6301 Operand *Val = Intrinsic->getArg(0);
6302 Type ValTy = Val->getType();
6303 if (ValTy == IceType_i64)
6304 ArgTypes = {IceType_i64};
6305 else
6306 ArgTypes = {IceType_i32};
6307 ReturnType = IceType_i32;
6308 } break;
6309 case Intrinsics::Longjmp:
6310 ArgTypes = {IceType_i32, IceType_i32};
6311 ReturnType = IceType_void;
6312 break;
6313 case Intrinsics::Memcpy:
6314 ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
6315 ReturnType = IceType_void;
6316 break;
6317 case Intrinsics::Memmove:
6318 ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
6319 ReturnType = IceType_void;
6320 break;
6321 case Intrinsics::Memset:
6322 ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
6323 ReturnType = IceType_void;
6324 break;
6325 case Intrinsics::Setjmp:
6326 ArgTypes = {IceType_i32};
6327 ReturnType = IceType_i32;
6328 break;
6329 }
6330 StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
6331 } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
6332 StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
6333 } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
6334 if (!Ret->hasRetValue())
6335 return;
6336 Operand *RetValue = Ret->getRetValue();
6337 Type ReturnType = RetValue->getType();
6338 if (!isScalarFloatingType(ReturnType))
6339 return;
6340 StackArgumentsSize = typeWidthInBytes(ReturnType);
6341 } else {
6342 return;
6343 }
6344 StackArgumentsSize = applyStackAlignment(StackArgumentsSize);
6345 updateMaxOutArgsSizeBytes(StackArgumentsSize);
6346 }
6347
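/// Computes the stack space needed for the arguments that do not land in
/// registers. Sketch of the accounting (illustrative, assuming the usual
/// System V argument registers): an argument that fits in an XMM or GPR
/// argument register consumes only a register slot; any remaining argument
/// contributes typeWidthInBytesOnStack(Ty) bytes, with vector arguments first
/// padded to the stack alignment via applyStackAlignment().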
6348 uint32_t
6349 TargetX8664::getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
6350 Type ReturnType) {
6351 uint32_t OutArgumentsSizeBytes = 0;
6352 uint32_t XmmArgCount = 0;
6353 uint32_t GprArgCount = 0;
6354 for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
6355 Type Ty = ArgTypes[i];
6356 // The PNaCl ABI requires the width of arguments to be at least 32 bits.
6357 assert(typeWidthInBytes(Ty) >= 4);
6358 if (isVectorType(Ty) &&
6359 RegX8664::getRegisterForXmmArgNum(RegX8664::getArgIndex(i, XmmArgCount))
6360 .hasValue()) {
6361 ++XmmArgCount;
6362 } else if (isScalarFloatingType(Ty) &&
6363 RegX8664::getRegisterForXmmArgNum(
6364 RegX8664::getArgIndex(i, XmmArgCount))
6365 .hasValue()) {
6366 ++XmmArgCount;
6367 } else if (isScalarIntegerType(Ty) &&
6368 RegX8664::getRegisterForGprArgNum(
6369 Ty, RegX8664::getArgIndex(i, GprArgCount))
6370 .hasValue()) {
6371 // The 64 bit ABI allows some integers to be passed in GPRs.
6372 ++GprArgCount;
6373 } else {
6374 if (isVectorType(Ty)) {
6375 OutArgumentsSizeBytes = applyStackAlignment(OutArgumentsSizeBytes);
6376 }
6377 OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
6378 }
6379 }
6380 return OutArgumentsSizeBytes;
6381 }
6382
6383 uint32_t TargetX8664::getCallStackArgumentsSizeBytes(const InstCall *Instr) {
6384 // Build a vector of the arguments' types.
6385 const SizeT NumArgs = Instr->getNumArgs();
6386 CfgVector<Type> ArgTypes;
6387 ArgTypes.reserve(NumArgs);
6388 for (SizeT i = 0; i < NumArgs; ++i) {
6389 Operand *Arg = Instr->getArg(i);
6390 ArgTypes.emplace_back(Arg->getType());
6391 }
6392 // Compute the return type (if any).
6393 Type ReturnType = IceType_void;
6394 Variable *Dest = Instr->getDest();
6395 if (Dest != nullptr)
6396 ReturnType = Dest->getType();
6397 return getShadowStoreSize() +
6398 getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
6399 }
6400
6401 Variable *TargetX8664::makeZeroedRegister(Type Ty, RegNumT RegNum) {
6402 Variable *Reg = makeReg(Ty, RegNum);
6403 switch (Ty) {
6404 case IceType_i1:
6405 case IceType_i8:
6406 case IceType_i16:
6407 case IceType_i32:
6408 case IceType_i64:
6409 // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
6410 _mov(Reg, Ctx->getConstantZero(Ty));
6411 break;
6412 case IceType_f32:
6413 case IceType_f64:
6414 Context.insert<InstFakeDef>(Reg);
6415 _xorps(Reg, Reg);
6416 break;
6417 default:
6418 // All vector types use the same pxor instruction.
6419 assert(isVectorType(Ty));
6420 Context.insert<InstFakeDef>(Reg);
6421 _pxor(Reg, Reg);
6422 break;
6423 }
6424 return Reg;
6425 }
6426
6427 // There is no support for loading or emitting vector constants, so the vector
6428 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
6429 // initialized with register operations.
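// For example, makeVectorOfOnes(IceType_v4i32) is currently synthesized with
// a sequence roughly like (illustrative registers):
//   pxor    %xmm0, %xmm0      ; makeVectorOfZeros
//   pcmpeqd %xmm1, %xmm1      ; makeVectorOfMinusOnes: all bits set
//   psubd   %xmm1, %xmm0      ; 0 - (-1) == 1 in each lane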
6430 //
6431 // TODO(wala): Add limited support for vector constants so that complex
6432 // initialization in registers is unnecessary.
6433
6434 Variable *TargetX8664::makeVectorOfZeros(Type Ty, RegNumT RegNum) {
6435 return makeZeroedRegister(Ty, RegNum);
6436 }
6437
6438 Variable *TargetX8664::makeVectorOfMinusOnes(Type Ty, RegNumT RegNum) {
6439 Variable *MinusOnes = makeReg(Ty, RegNum);
6440 // Insert a FakeDef so the live range of MinusOnes is not overestimated.
6441 Context.insert<InstFakeDef>(MinusOnes);
6442 if (Ty == IceType_f64)
6443 // Making a vector of minus ones of type f64 is currently only used for the
6444 // fabs intrinsic. To use the f64 type to create this mask with pcmpeqq
6445 // requires SSE 4.1. Since we're just creating a mask, pcmpeqd does the
6446 // same job and only requires SSE2.
6447 _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
6448 else
6449 _pcmpeq(MinusOnes, MinusOnes);
6450 return MinusOnes;
6451 }
6452
6453 Variable *TargetX8664::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
6454 Variable *Dest = makeVectorOfZeros(Ty, RegNum);
6455 Variable *MinusOne = makeVectorOfMinusOnes(Ty);
6456 _psub(Dest, MinusOne);
6457 return Dest;
6458 }
6459
6460 Variable *TargetX8664::makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum) {
6461 assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
6462 Ty == IceType_v16i8);
6463 if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
6464 Variable *Reg = makeVectorOfOnes(Ty, RegNum);
6465 SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
6466 _psll(Reg, Ctx->getConstantInt8(Shift));
6467 return Reg;
6468 } else {
6469 // SSE has no left shift operation for vectors of 8 bit integers.
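// Instead, broadcast the 0x80 byte pattern; the emitted sequence is roughly
// (illustrative registers):
//   mov    $0x80808080, %eax
//   movd   %eax, %xmm0
//   pshufd $0x0, %xmm0, %xmm0  ; replicate the low dword into all lanes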
6470 constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
6471 Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
6472 Variable *Reg = makeReg(Ty, RegNum);
6473 _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
6474 _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
6475 return Reg;
6476 }
6477 }
6478
6479 /// Construct a mask in a register that can be and'ed with a floating-point
6480 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
6481 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of
6482 /// ones logically right shifted one bit.
6483 // TODO(stichnot): Fix the wala TODO above, to represent vector constants
6484 // in memory.
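// Illustrative sequence for the f32/v4f32 case (registers arbitrary):
//   pcmpeqd %xmm0, %xmm0      ; all bits set
//   psrld   $1, %xmm0         ; each lane becomes 0x7fffffff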
6485
6486 Variable *TargetX8664::makeVectorOfFabsMask(Type Ty, RegNumT RegNum) {
6487 Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
6488 _psrl(Reg, Ctx->getConstantInt8(1));
6489 return Reg;
6490 }
6491
6492 X86OperandMem *TargetX8664::getMemoryOperandForStackSlot(Type Ty,
6493 Variable *Slot,
6494 uint32_t Offset) {
6495 // Ensure that Loc is a stack slot.
6496 assert(Slot->mustNotHaveReg());
6497 assert(Slot->getRegNum().hasNoValue());
6498 // Compute the location of Loc in memory.
6499 // TODO(wala,stichnot): lea should not be required. The address of the
6500 // stack slot is known at compile time (although not until after
6501 // addProlog()).
6502 const Type PointerType = getPointerType();
6503 Variable *Loc = makeReg(PointerType);
6504 _lea(Loc, Slot);
6505 Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
6506 return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
6507 }
6508
6509 /// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
6510 /// Src is assumed to already be legalized. If the source operand is known to
6511 /// be a memory or immediate operand, a simple mov will suffice. But if the
6512 /// source operand can be a physical register, then it must first be copied into
6513 /// a physical register that is truncable to 8-bit, then truncated into a
6514 /// physical register that can receive a truncation, and finally copied into the
6515 /// result 8-bit register (which in general can be any 8-bit register). For
6516 /// example, moving %ebp into %ah may be accomplished as:
6517 /// movl %ebp, %edx
6518 /// mov_trunc %edx, %dl // this redundant assignment is ultimately elided
6519 /// movb %dl, %ah
6520 /// On the other hand, moving a memory or immediate operand into ah:
6521 /// movb 4(%ebp), %ah
6522 /// movb $my_imm, %ah
6523 ///
6524 /// Note #1. On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
6525 /// encodable, so RegNum=Reg_ah should NOT be given as an argument. Instead,
6526 /// use RegNum=RegNumT() and then let the caller do a separate copy into
6527 /// Reg_ah.
6528 ///
6529 /// Note #2. ConstantRelocatable operands are also put through this process
6530 /// (not truncated directly) because our ELF emitter does R_386_32 relocations
6531 /// but not R_386_8 relocations.
6532 ///
6533 /// Note #3. If Src is a Variable, the result will be an infinite-weight i8
6534 /// Variable with the RCX86_IsTrunc8Rcvr register class. As such, this helper
6535 /// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
6536 /// to the pinsrb instruction.
6537
6538 Variable *TargetX8664::copyToReg8(Operand *Src, RegNumT RegNum) {
6539 Type Ty = Src->getType();
6540 assert(isScalarIntegerType(Ty));
6541 assert(Ty != IceType_i1);
6542 Variable *Reg = makeReg(IceType_i8, RegNum);
6543 Reg->setRegClass(RCX86_IsTrunc8Rcvr);
6544 if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
6545 Variable *SrcTruncable = makeReg(Ty);
6546 switch (Ty) {
6547 case IceType_i64:
6548 SrcTruncable->setRegClass(RCX86_Is64To8);
6549 break;
6550 case IceType_i32:
6551 SrcTruncable->setRegClass(RCX86_Is32To8);
6552 break;
6553 case IceType_i16:
6554 SrcTruncable->setRegClass(RCX86_Is16To8);
6555 break;
6556 default:
6557 // i8 - just use default register class
6558 break;
6559 }
6560 Variable *SrcRcvr = makeReg(IceType_i8);
6561 SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
6562 _mov(SrcTruncable, Src);
6563 _mov(SrcRcvr, SrcTruncable);
6564 Src = SrcRcvr;
6565 }
6566 _mov(Reg, Src);
6567 return Reg;
6568 }
6569
6570 /// Helper for legalize() to emit the right code to lower an operand to a
6571 /// register of the appropriate type.
6572
6573 Variable *TargetX8664::copyToReg(Operand *Src, RegNumT RegNum) {
6574 Type Ty = Src->getType();
6575 Variable *Reg = makeReg(Ty, RegNum);
6576 if (isVectorType(Ty)) {
6577 _movp(Reg, Src);
6578 } else {
6579 _mov(Reg, Src);
6580 }
6581 return Reg;
6582 }
6583
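/// legalize() forces From into one of the operand forms permitted by Allowed.
/// A typical (illustrative) call pattern from an instruction lowering is:
///   Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
///   Variable *T = makeReg(DestTy);
///   _mov(T, Src0RM);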
6584 Operand *TargetX8664::legalize(Operand *From, LegalMask Allowed,
6585 RegNumT RegNum) {
6586 const Type Ty = From->getType();
6587 // Assert that a physical register is allowed. To date, all calls to
6588 // legalize() allow a physical register. If a physical register needs to be
6589 // explicitly disallowed, then new code will need to be written to force a
6590 // spill.
6591 assert(Allowed & Legal_Reg);
6592 // If we're asking for a specific physical register, make sure we're not
6593 // allowing any other operand kinds. (This could be future work, e.g. allow
6594 // the shl shift amount to be either an immediate or in ecx.)
6595 assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
6596
6597 // Substitute with an available infinite-weight variable if possible. Only do
6598 // this when we are not asking for a specific register, and when the
6599 // substitution is not locked to a specific register, and when the types
6600 // match, in order to capture the vast majority of opportunities and avoid
6601 // corner cases in the lowering.
6602 if (RegNum.hasNoValue()) {
6603 if (Variable *Subst = getContext().availabilityGet(From)) {
6604 // At this point we know there is a potential substitution available.
6605 if (Subst->mustHaveReg() && !Subst->hasReg()) {
6606 // At this point we know the substitution will have a register.
6607 if (From->getType() == Subst->getType()) {
6608 // At this point we know the substitution's register is compatible.
6609 return Subst;
6610 }
6611 }
6612 }
6613 }
6614
6615 if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
6616 // Before doing anything with a Mem operand, we need to ensure that the
6617 // Base and Index components are in physical registers.
6618 Variable *Base = Mem->getBase();
6619 Variable *Index = Mem->getIndex();
6620 Constant *Offset = Mem->getOffset();
6621 Variable *RegBase = nullptr;
6622 Variable *RegIndex = nullptr;
6623 uint16_t Shift = Mem->getShift();
6624 if (Base) {
6625 RegBase = llvm::cast<Variable>(
6626 legalize(Base, Legal_Reg | Legal_Rematerializable));
6627 }
6628 if (Index) {
6629 // TODO(jpp): perhaps we should only allow Legal_Reg if
6630 // Base->isRematerializable.
6631 RegIndex = llvm::cast<Variable>(
6632 legalize(Index, Legal_Reg | Legal_Rematerializable));
6633 }
6634
6635 if (Base != RegBase || Index != RegIndex) {
6636 Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
6637 Mem->getSegmentRegister());
6638 }
6639
6640 From = Mem;
6641
6642 if (!(Allowed & Legal_Mem)) {
6643 From = copyToReg(From, RegNum);
6644 }
6645 return From;
6646 }
6647
6648 if (auto *Const = llvm::dyn_cast<Constant>(From)) {
6649 if (llvm::isa<ConstantUndef>(Const)) {
6650 From = legalizeUndef(Const, RegNum);
6651 if (isVectorType(Ty))
6652 return From;
6653 Const = llvm::cast<Constant>(From);
6654 }
6655 // There should be no constants of vector type (other than undef).
6656 assert(!isVectorType(Ty));
6657
6658 // If the operand is a 64 bit constant integer we need to legalize it to a
6659 // register in x86-64.
6660 if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
6661 if (!Utils::IsInt(32, C64->getValue())) {
6662 if (RegNum.hasValue()) {
6663 assert(RegX8664::getGprForType(IceType_i64, RegNum) == RegNum);
6664 }
6665 return copyToReg(Const, RegNum);
6666 }
6667 }
6668
6669 if (!llvm::isa<ConstantRelocatable>(Const)) {
6670 if (isScalarFloatingType(Ty)) {
6671 // Convert a scalar floating point constant into an explicit memory
6672 // operand.
6673 if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
6674 if (Utils::isPositiveZero(ConstFloat->getValue()))
6675 return makeZeroedRegister(Ty, RegNum);
6676 } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
6677 if (Utils::isPositiveZero(ConstDouble->getValue()))
6678 return makeZeroedRegister(Ty, RegNum);
6679 }
6680
6681 auto *CFrom = llvm::cast<Constant>(From);
6682 assert(CFrom->getShouldBePooled());
6683 Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
6684 auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
6685 From = Mem;
6686 }
6687 }
6688
6689 bool NeedsReg = false;
6690 if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
6691 // Immediate specifically not allowed.
6692 NeedsReg = true;
6693 if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
6694 // On x86, FP constants are lowered to mem operands.
6695 NeedsReg = true;
6696 if (NeedsReg) {
6697 From = copyToReg(From, RegNum);
6698 }
6699 return From;
6700 }
6701
6702 if (auto *Var = llvm::dyn_cast<Variable>(From)) {
6703 // Check if the variable is guaranteed a physical register. This can happen
6704 // either when the variable is pre-colored or when it is assigned infinite
6705 // weight.
6706 bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
6707 bool MustRematerialize =
6708 (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
6709 // We need a new physical register for the operand if:
6710 // - Mem is not allowed and Var isn't guaranteed a physical register, or
6711 // - RegNum is required and Var->getRegNum() doesn't match, or
6712 // - Var is a rematerializable variable and rematerializable pass-through is
6713 // not allowed (in which case we need a lea instruction).
6714 if (MustRematerialize) {
6715 Variable *NewVar = makeReg(Ty, RegNum);
6716 // Since Var is rematerializable, the offset will be added when the lea is
6717 // emitted.
6718 constexpr Constant *NoOffset = nullptr;
6719 auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
6720 _lea(NewVar, Mem);
6721 From = NewVar;
6722 } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
6723 (RegNum.hasValue() && RegNum != Var->getRegNum())) {
6724 From = copyToReg(From, RegNum);
6725 }
6726 return From;
6727 }
6728
6729 llvm::report_fatal_error("Unhandled operand kind in legalize()");
6730 return From;
6731 }
6732
6733 /// Provide a trivial wrapper to legalize() for this common usage.
6734
6735 Variable *TargetX8664::legalizeToReg(Operand *From, RegNumT RegNum) {
6736 return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
6737 }
6738
6739 /// Legalize undef values to concrete values.
6740
6741 Operand *TargetX8664::legalizeUndef(Operand *From, RegNumT RegNum) {
6742 Type Ty = From->getType();
6743 if (llvm::isa<ConstantUndef>(From)) {
6744 // Lower undefs to zero. Another option is to lower undefs to an
6745 // uninitialized register; however, using an uninitialized register results
6746 // in less predictable code.
6747 //
6748 // If in the future the implementation is changed to lower undef values to
6749 // uninitialized registers, a FakeDef will be needed:
6750 // Context.insert<InstFakeDef>(Reg);
6751 // This is in order to ensure that the live range of Reg is not
6752 // overestimated. If the constant being lowered is a 64 bit value, then
6753 // the result should be split and the lo and hi components will need to go
6754 // in uninitialized registers.
6755 if (isVectorType(Ty))
6756 return makeVectorOfZeros(Ty, RegNum);
6757 return Ctx->getConstantZero(Ty);
6758 }
6759 return From;
6760 }
6761
6762 /// For the cmp instruction, if Src1 is an immediate, or known to be a physical
6763 /// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
6764 /// copied into a physical register. (Actually, either Src0 or Src1 can be
6765 /// chosen for the physical register, but unfortunately we have to commit to one
6766 /// or the other before register allocation.)
6767
6768 Operand *TargetX8664::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
6769 bool IsSrc1ImmOrReg = false;
6770 if (llvm::isa<Constant>(Src1)) {
6771 IsSrc1ImmOrReg = true;
6772 } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
6773 if (Var->hasReg())
6774 IsSrc1ImmOrReg = true;
6775 }
6776 return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
6777 }
6778
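/// Wraps Opnd in an X86OperandMem if it is not one already. Illustrative
/// outcomes: a plain Variable becomes a base-register operand, a 32-bit
/// constant or relocatable becomes an absolute-address operand, and a
/// ConstantInteger64 that does not fit in 32 bits is first legalized into a
/// register and then used as the base.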
6779 X86OperandMem *TargetX8664::formMemoryOperand(Operand *Opnd, Type Ty,
6780 bool DoLegalize) {
6781 auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
6782 // It may be the case that address mode optimization already creates an
6783 // X86OperandMem, so in that case it wouldn't need another level of
6784 // transformation.
6785 if (!Mem) {
6786 auto *Base = llvm::dyn_cast<Variable>(Opnd);
6787 auto *Offset = llvm::dyn_cast<Constant>(Opnd);
6788 assert(Base || Offset);
6789 if (Offset) {
6790 if (!llvm::isa<ConstantRelocatable>(Offset)) {
6791 if (llvm::isa<ConstantInteger64>(Offset)) {
6792 // Memory operands cannot have 64-bit immediates, so they must be
6793 // legalized into a register only.
6794 Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
6795 Offset = nullptr;
6796 } else {
6797 Offset = llvm::cast<Constant>(legalize(Offset));
6798
6799 assert(llvm::isa<ConstantInteger32>(Offset) ||
6800 llvm::isa<ConstantRelocatable>(Offset));
6801 }
6802 }
6803 }
6804 Mem = X86OperandMem::create(Func, Ty, Base, Offset);
6805 }
6806 return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
6807 }
6808
6809 Variable *TargetX8664::makeReg(Type Type, RegNumT RegNum) {
6810 Variable *Reg = Func->makeVariable(Type);
6811 if (RegNum.hasValue())
6812 Reg->setRegNum(RegNum);
6813 else
6814 Reg->setMustHaveReg();
6815 return Reg;
6816 }
6817
6818 const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
6819 IceType_v16i8};
6820
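// Worked example: with TypeForSize as above, largestTypeInSize(11) returns
// IceType_f64 (the largest power-of-two-sized chunk, 8 bytes, that fits in 11
// bytes), while firstTypeThatFitsSize(11) returns IceType_v16i8 (the smallest
// entry whose 16-byte width covers all 11 bytes).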
6821 Type TargetX8664::largestTypeInSize(uint32_t Size, uint32_t MaxSize) {
6822 assert(Size != 0);
6823 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
6824 uint32_t MaxIndex = MaxSize == NoSizeLimit
6825 ? llvm::array_lengthof(TypeForSize) - 1
6826 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
6827 return TypeForSize[std::min(TyIndex, MaxIndex)];
6828 }
6829
6830 Type TargetX8664::firstTypeThatFitsSize(uint32_t Size, uint32_t MaxSize) {
6831 assert(Size != 0);
6832 uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
6833 if (!llvm::isPowerOf2_32(Size))
6834 ++TyIndex;
6835 uint32_t MaxIndex = MaxSize == NoSizeLimit
6836 ? llvm::array_lengthof(TypeForSize) - 1
6837 : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
6838 return TypeForSize[std::min(TyIndex, MaxIndex)];
6839 }
6840
6841 void TargetX8664::postLower() {
6842 if (Func->getOptLevel() == Opt_m1)
6843 return;
6844 markRedefinitions();
6845 Context.availabilityUpdate();
6846 }
6847
6848 void TargetX8664::emit(const ConstantInteger32 *C) const {
6849 if (!BuildDefs::dump())
6850 return;
6851 Ostream &Str = Ctx->getStrEmit();
6852 Str << "$" << C->getValue();
6853 }
6854
6855 void TargetX8664::emit(const ConstantInteger64 *C) const {
6856 if (!BuildDefs::dump())
6857 return;
6858 Ostream &Str = Ctx->getStrEmit();
6859 Str << "$" << C->getValue();
6860 }
6861
6862 void TargetX8664::emit(const ConstantFloat *C) const {
6863 if (!BuildDefs::dump())
6864 return;
6865 Ostream &Str = Ctx->getStrEmit();
6866 Str << C->getLabelName();
6867 }
6868
6869 void TargetX8664::emit(const ConstantDouble *C) const {
6870 if (!BuildDefs::dump())
6871 return;
6872 Ostream &Str = Ctx->getStrEmit();
6873 Str << C->getLabelName();
6874 }
6875
6876 void TargetX8664::emit(const ConstantUndef *) const {
6877 llvm::report_fatal_error("undef value encountered by emitter.");
6878 }
6879
6880 void TargetX8664::emit(const ConstantRelocatable *C) const {
6881 if (!BuildDefs::dump())
6882 return;
6883 Ostream &Str = Ctx->getStrEmit();
6884 Str << "$";
6885 emitWithoutPrefix(C);
6886 }
6887
6888 void TargetX8664::emitJumpTable(const Cfg *,
6889 const InstJumpTable *JumpTable) const {
6890 if (!BuildDefs::dump())
6891 return;
6892 Ostream &Str = Ctx->getStrEmit();
6893 Str << "\t.section\t.rodata." << JumpTable->getSectionName()
6894 << ",\"a\",@progbits\n"
6895 "\t.align\t"
6896 << typeWidthInBytes(getPointerType()) << "\n"
6897 << JumpTable->getName() << ":";
6898
6899 for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
6900 Str << "\n\t.var\t" << JumpTable->getTarget(I)->getAsmName();
6901 Str << "\n";
6902 }
6903
6904 const TargetX8664::TableFcmpType TargetX8664::TableFcmp[] = {
6905 #define X(val, dflt, swapS, C1, C2, swapV, pred) \
6906 {dflt, swapS, CondX86::C1, CondX86::C2, swapV, CondX86::pred},
6907 FCMPX8664_TABLE
6908 #undef X
6909 };
6910
6911 const size_t TargetX8664::TableFcmpSize = llvm::array_lengthof(TableFcmp);
6912
6913 const TargetX8664::TableIcmp32Type TargetX8664::TableIcmp32[] = {
6914 #define X(val, C_32, C1_64, C2_64, C3_64) {CondX86::C_32},
6915 ICMPX8664_TABLE
6916 #undef X
6917 };
6918
6919 const size_t TargetX8664::TableIcmp32Size = llvm::array_lengthof(TableIcmp32);
6920
6921 std::array<SmallBitVector, RCX86_NUM> TargetX8664::TypeToRegisterSet = {{}};
6922
6923 std::array<SmallBitVector, RCX86_NUM> TargetX8664::TypeToRegisterSetUnfiltered =
6924 {{}};
6925
6926 std::array<SmallBitVector, RegX8664::Reg_NUM> TargetX8664::RegisterAliases = {
6927 {}};
6928
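// Emits one read-only section per pooled constant type for textual asm
// output. An illustrative entry for a pooled 32-bit float with value 1.0
// (label, asm tag, and type name shown are assumptions about T and
// getLabelName()):
//   <pool_label>:
//           .long   0x3f800000      /* float 1 */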
6929 template <typename T>
6930 void TargetDataX8664::emitConstantPool(GlobalContext *Ctx) {
6931 if (!BuildDefs::dump())
6932 return;
6933 Ostream &Str = Ctx->getStrEmit();
6934 Type Ty = T::Ty;
6935 SizeT Align = typeAlignInBytes(Ty);
6936 ConstantList Pool = Ctx->getConstantPool(Ty);
6937
6938 Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
6939 << "\n";
6940 Str << "\t.align\t" << Align << "\n";
6941
6942 for (Constant *C : Pool) {
6943 if (!C->getShouldBePooled())
6944 continue;
6945 auto *Const = llvm::cast<typename T::IceType>(C);
6946 typename T::IceType::PrimType Value = Const->getValue();
6947 // Use memcpy() to copy bits from Value into RawValue in a way that avoids
6948 // breaking strict-aliasing rules.
6949 typename T::PrimitiveIntType RawValue;
6950 memcpy(&RawValue, &Value, sizeof(Value));
6951 char buf[30];
6952 int CharsPrinted =
6953 snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
6954 assert(CharsPrinted >= 0);
6955 assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
6956 (void)CharsPrinted; // avoid warnings if asserts are disabled
6957 Str << Const->getLabelName();
6958 Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
6959 << Value << " */\n";
6960 }
6961 }
6962
6963 void TargetDataX8664::lowerConstants() {
6964 if (getFlags().getDisableTranslation())
6965 return;
6966 switch (getFlags().getOutFileType()) {
6967 case FT_Elf: {
6968 ELFObjectWriter *Writer = Ctx->getObjectWriter();
6969
6970 Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
6971 Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
6972 Writer->writeConstantPool<ConstantInteger32>(IceType_i32);
6973
6974 Writer->writeConstantPool<ConstantFloat>(IceType_f32);
6975 Writer->writeConstantPool<ConstantDouble>(IceType_f64);
6976 } break;
6977 case FT_Asm:
6978 case FT_Iasm: {
6979 OstreamLocker L(Ctx);
6980
6981 emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
6982 emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
6983 emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);
6984
6985 emitConstantPool<PoolTypeConverter<float>>(Ctx);
6986 emitConstantPool<PoolTypeConverter<double>>(Ctx);
6987 } break;
6988 }
6989 }
6990
6991 void TargetDataX8664::lowerJumpTables() {
6992 const bool IsPIC = false;
6993 switch (getFlags().getOutFileType()) {
6994 case FT_Elf: {
6995 ELFObjectWriter *Writer = Ctx->getObjectWriter();
6996 constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
6997 const FixupKind RelocationKind =
6998 (getPointerType() == IceType_i32) ? FK_Abs : FK_Abs64;
6999 for (const JumpTableData &JT : Ctx->getJumpTables())
7000 Writer->writeJumpTable(JT, RelocationKind, IsPIC);
7001 } break;
7002 case FT_Asm:
7003 // Already emitted from Cfg
7004 break;
7005 case FT_Iasm: {
7006 if (!BuildDefs::dump())
7007 return;
7008 Ostream &Str = Ctx->getStrEmit();
7009 const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
7010 for (const JumpTableData &JT : Ctx->getJumpTables()) {
7011 Str << "\t.section\t" << Prefix << JT.getSectionName()
7012 << ",\"a\",@progbits\n"
7013 "\t.align\t"
7014 << typeWidthInBytes(getPointerType()) << "\n"
7015 << JT.getName().toString() << ":";
7016
7017 // On X8664 ILP32, pointers are 32-bit, hence the use of .long.
7018 for (intptr_t TargetOffset : JT.getTargetOffsets())
7019 Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
7020 Str << "\n";
7021 }
7022 } break;
7023 }
7024 }
7025
7026 void TargetDataX8664::lowerGlobals(const VariableDeclarationList &Vars,
7027 const std::string &SectionSuffix) {
7028 const bool IsPIC = false;
7029 switch (getFlags().getOutFileType()) {
7030 case FT_Elf: {
7031 ELFObjectWriter *Writer = Ctx->getObjectWriter();
7032 Writer->writeDataSection(Vars, FK_Abs, SectionSuffix, IsPIC);
7033 } break;
7034 case FT_Asm:
7035 case FT_Iasm: {
7036 OstreamLocker L(Ctx);
7037 for (const VariableDeclaration *Var : Vars) {
7038 if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
7039 emitGlobal(*Var, SectionSuffix);
7040 }
7041 }
7042 } break;
7043 }
7044 }
7045
7046 //------------------------------------------------------------------------------
7047 // __ ______ __ __ ______ ______ __ __ __ ______
7048 // /\ \ /\ __ \/\ \ _ \ \/\ ___\/\ == \/\ \/\ "-.\ \/\ ___\
7049 // \ \ \___\ \ \/\ \ \ \/ ".\ \ \ __\\ \ __<\ \ \ \ \-. \ \ \__ \
7050 // \ \_____\ \_____\ \__/".~\_\ \_____\ \_\ \_\ \_\ \_\\"\_\ \_____\
7051 // \/_____/\/_____/\/_/ \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
7052 //
7053 //------------------------------------------------------------------------------
7054 void TargetX8664::_add_sp(Operand *Adjustment) {
7055 Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, IceType_i64);
7056 _add(rsp, Adjustment);
7057 }
7058
7059 void TargetX8664::_mov_sp(Operand *NewValue) {
7060 Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, IceType_i64);
7061 _redefined(_mov(rsp, NewValue));
7062 }
7063
7064 void TargetX8664::_link_bp() {
7065 Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, WordType);
7066 Variable *rbp = getPhysicalRegister(RegX8664::Reg_rbp, WordType);
7067
7068 _push(rbp);
7069 _mov(rbp, rsp);
7070 // Keep rbp live for late-stage liveness analysis (e.g. asm-verbose mode).
7071 Context.insert<InstFakeUse>(rbp);
7072 }
7073
7074 void TargetX8664::_unlink_bp() {
7075 Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, IceType_i64);
7076 Variable *rbp = getPhysicalRegister(RegX8664::Reg_rbp, IceType_i64);
7077 // For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
7078 // use of rsp before the assignment of rsp=rbp keeps previous rsp
7079 // adjustments from being dead-code eliminated.
7080 Context.insert<InstFakeUse>(rsp);
7081
7082 _mov(rsp, rbp);
7083 _pop(rbp);
7084 }
7085
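// There is no push instruction for XMM registers, so an XMM push is emitted
// as a stack adjustment plus a store, roughly (illustrative register):
//   sub    $16, %rsp
//   movups %xmm6, (%rsp)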
7086 void TargetX8664::_push_reg(RegNumT RegNum) {
7087 if (RegX8664::isXmm(RegNum)) {
7088 Variable *reg = getPhysicalRegister(RegNum, IceType_v4f32);
7089 Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, WordType);
7090 auto *address = X86OperandMem::create(Func, reg->getType(), rsp, nullptr);
7091 _sub_sp(
7092 Ctx->getConstantInt32(16)); // TODO(capn): accumulate all the offsets
7093 // and adjust the stack pointer once.
7094 _storep(reg, address);
7095 } else {
7096 _push(getPhysicalRegister(RegNum, WordType));
7097 }
7098 }
7099
7100 void TargetX8664::_pop_reg(RegNumT RegNum) {
7101 if (RegX8664::isXmm(RegNum)) {
7102 Variable *reg = getPhysicalRegister(RegNum, IceType_v4f32);
7103 Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, WordType);
7104 auto *address = X86OperandMem::create(Func, reg->getType(), rsp, nullptr);
7105 _movp(reg, address);
7106 _add_sp(
7107 Ctx->getConstantInt32(16)); // TODO(capn): accumulate all the offsets
7108 // and adjust the stack pointer once.
7109 } else {
7110 _pop(getPhysicalRegister(RegNum, WordType));
7111 }
7112 }
7113
7114 void TargetX8664::_sub_sp(Operand *Adjustment) {
7115 Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, WordType);
7116
7117 _sub(rsp, Adjustment);
7118
7119 // Add a fake use of the stack pointer, to prevent the stack pointer adjustment
7120 // from being dead-code eliminated in a function that doesn't return.
7121 Context.insert<InstFakeUse>(rsp);
7122 }
7123
7124 void TargetX8664::lowerIndirectJump(Variable *JumpTarget) {
7125 if (JumpTarget->getType() != IceType_i64) {
7126 Variable *T = makeReg(IceType_i64);
7127 _movzx(T, JumpTarget);
7128 JumpTarget = T;
7129 }
7130
7131 _jmp(JumpTarget);
7132 }
7133
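// Illustrative System V variadic call with one floating-point argument (the
// register choices mirror the code below; the callee address is a placeholder
// and the exact encoding may differ):
//   mov  $callee, %r11        ; 64-bit call targets go through a register
//   mov  $1, %rax             ; number of XMM arguments for varargs
//   call *%r11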
7134 Inst *TargetX8664::emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
7135 size_t NumVariadicFpArgs) {
7136 if (CallTarget->getType() == IceType_i64) {
7137 // x86-64 does not support 64-bit direct calls, so write the value to a
7138 // register and make an indirect call for Constant call targets.
7139 RegNumT TargetReg = {};
7140
7141 // System V: force r11 when calling a variadic function so that rax isn't
7142 // used, since rax stores the number of FP args (see NumVariadicFpArgs
7143 // usage below).
7144 #if !defined(SUBZERO_USE_MICROSOFT_ABI)
7145 if (NumVariadicFpArgs > 0)
7146 TargetReg = RegX8664::Reg_r11;
7147 #endif
7148
7149 if (llvm::isa<Constant>(CallTarget)) {
7150 Variable *T = makeReg(IceType_i64, TargetReg);
7151 _mov(T, CallTarget);
7152 CallTarget = T;
7153 } else if (llvm::isa<Variable>(CallTarget)) {
7154 Operand *T = legalizeToReg(CallTarget, TargetReg);
7155 CallTarget = T;
7156 }
7157 }
7158
7159 // System V: store number of FP args in RAX for variadic calls
7160 #if !defined(SUBZERO_USE_MICROSOFT_ABI)
7161 if (NumVariadicFpArgs > 0) {
7162 // Store number of FP args (stored in XMM registers) in RAX for variadic
7163 // calls
7164 auto *NumFpArgs = Ctx->getConstantInt64(NumVariadicFpArgs);
7165 Variable *NumFpArgsReg = legalizeToReg(NumFpArgs, RegX8664::Reg_rax);
7166 Context.insert<InstFakeUse>(NumFpArgsReg);
7167 }
7168 #endif
7169
7170 return Context.insert<Insts::Call>(ReturnReg, CallTarget);
7171 }
7172
7173 Variable *TargetX8664::moveReturnValueToRegister(Operand *Value,
7174 Type ReturnType) {
7175 if (isVectorType(ReturnType) || isScalarFloatingType(ReturnType)) {
7176 return legalizeToReg(Value, RegX8664::Reg_xmm0);
7177 } else {
7178 assert(ReturnType == IceType_i32 || ReturnType == IceType_i64);
7179 Variable *Reg = nullptr;
7180 _mov(Reg, Value, RegX8664::getGprForType(ReturnType, RegX8664::Reg_rax));
7181 return Reg;
7182 }
7183 }
7184
7185 void TargetX8664::emitStackProbe(size_t StackSizeBytes) {
7186 #if defined(_WIN64)
7187 // Mirroring the behavior of MSVC here, which emits a _chkstk when locals are
7188 // >= 4KB, rather than the 8KB claimed by the docs.
7189 if (StackSizeBytes >= 4096) {
7190 // __chkstk on Win64 probes the stack up to RSP - EAX, but does not clobber
7191 // RSP, so we don't need to save and restore it.
7192
7193 Variable *EAX = makeReg(IceType_i32, RegX8664::Reg_eax);
7194 _mov(EAX, Ctx->getConstantInt32(StackSizeBytes));
7195
7196 auto *CallTarget =
7197 Ctx->getConstantInt64(reinterpret_cast<int64_t>(&__chkstk));
7198 Operand *CallTargetReg = legalizeToReg(CallTarget, RegX8664::Reg_r11);
7199 emitCallToTarget(CallTargetReg, nullptr);
7200 }
7201 #endif
7202 }
7203
7204 // In some cases, there are x-macros tables for both high-level and low-level
7205 // instructions/operands that use the same enum key value. The tables are kept
7206 // separate to maintain a proper separation between abstraction layers. There
7207 // is a risk that the tables could get out of sync if enum values are reordered
7208 // or if entries are added or deleted. The following dummy namespaces use
7209 // static_asserts to ensure everything is kept in sync.
7210
7211 namespace {
7212 // Validate the enum values in FCMPX8664_TABLE.
7213 namespace dummy1 {
7214 // Define a temporary set of enum values based on low-level table entries.
7215 enum _tmp_enum {
7216 #define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
7217 FCMPX8664_TABLE
7218 #undef X
7219 _num
7220 };
7221 // Define a set of constants based on high-level table entries.
7222 #define X(tag, str) static const int _table1_##tag = InstFcmp::tag;
7223 ICEINSTFCMP_TABLE
7224 #undef X
7225 // Define a set of constants based on low-level table entries, and ensure the
7226 // table entry keys are consistent.
7227 #define X(val, dflt, swapS, C1, C2, swapV, pred) \
7228 static const int _table2_##val = _tmp_##val; \
7229 static_assert( \
7230 _table1_##val == _table2_##val, \
7231 "Inconsistency between FCMPX8664_TABLE and ICEINSTFCMP_TABLE");
7232 FCMPX8664_TABLE
7233 #undef X
7234 // Repeat the static asserts with respect to the high-level table entries in
7235 // case the high-level table has extra entries.
7236 #define X(tag, str) \
7237 static_assert( \
7238 _table1_##tag == _table2_##tag, \
7239 "Inconsistency between FCMPX8664_TABLE and ICEINSTFCMP_TABLE");
7240 ICEINSTFCMP_TABLE
7241 #undef X
7242 } // end of namespace dummy1
7243
7244 // Validate the enum values in ICMPX8664_TABLE.
7245 namespace dummy2 {
7246 // Define a temporary set of enum values based on low-level table entries.
7247 enum _tmp_enum {
7248 #define X(val, C_32, C1_64, C2_64, C3_64) _tmp_##val,
7249 ICMPX8664_TABLE
7250 #undef X
7251 _num
7252 };
7253 // Define a set of constants based on high-level table entries.
7254 #define X(tag, reverse, str) static const int _table1_##tag = InstIcmp::tag;
7255 ICEINSTICMP_TABLE
7256 #undef X
7257 // Define a set of constants based on low-level table entries, and ensure the
7258 // table entry keys are consistent.
7259 #define X(val, C_32, C1_64, C2_64, C3_64) \
7260 static const int _table2_##val = _tmp_##val; \
7261 static_assert( \
7262 _table1_##val == _table2_##val, \
7263 "Inconsistency between ICMPX8664_TABLE and ICEINSTICMP_TABLE");
7264 ICMPX8664_TABLE
7265 #undef X
7266 // Repeat the static asserts with respect to the high-level table entries in
7267 // case the high-level table has extra entries.
7268 #define X(tag, reverse, str) \
7269 static_assert( \
7270 _table1_##tag == _table2_##tag, \
7271 "Inconsistency between ICMPX8664_TABLE and ICEINSTICMP_TABLE");
7272 ICEINSTICMP_TABLE
7273 #undef X
7274 } // end of namespace dummy2
7275
7276 // Validate the enum values in ICETYPEX86_TABLE.
7277 namespace dummy3 {
7278 // Define a temporary set of enum values based on low-level table entries.
7279 enum _tmp_enum {
7280 #define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld) \
7281 _tmp_##tag,
7282 ICETYPEX86_TABLE
7283 #undef X
7284 _num
7285 };
7286 // Define a set of constants based on high-level table entries.
7287 #define X(tag, sizeLog2, align, elts, elty, str, rcstr) \
7288 static const int _table1_##tag = IceType_##tag;
7289 ICETYPE_TABLE
7290 #undef X
7291 // Define a set of constants based on low-level table entries, and ensure the
7292 // table entry keys are consistent.
7293 #define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld) \
7294 static const int _table2_##tag = _tmp_##tag; \
7295 static_assert(_table1_##tag == _table2_##tag, \
7296 "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
7297 ICETYPEX86_TABLE
7298 #undef X
7299 // Repeat the static asserts with respect to the high-level table entries in
7300 // case the high-level table has extra entries.
7301 #define X(tag, sizeLog2, align, elts, elty, str, rcstr) \
7302 static_assert(_table1_##tag == _table2_##tag, \
7303 "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
7304 ICETYPE_TABLE
7305 #undef X
7306
7307 } // end of namespace dummy3
7308 } // end of anonymous namespace
7309
7310 } // end of namespace X8664
7311 } // end of namespace Ice
7312