//===- subzero/src/IceTargetLoweringX8664.cpp - x86-64 lowering -----------===//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the TargetLoweringX8664 class, which consists almost
/// entirely of the lowering sequence for each high-level instruction.
///
//===----------------------------------------------------------------------===//
#include "IceTargetLoweringX8664.h"

#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstVarIter.h"
#include "IceInstX8664.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IcePhiLoweringImpl.h"
#include "IceTargetLoweringX8664.def"
#include "IceUtils.h"
#include "IceVariableSplitting.h"

#include "llvm/Support/MathExtras.h"

#include <stack>

#if defined(_WIN64)
extern "C" void __chkstk();
#endif

namespace X8664 {

std::unique_ptr<::Ice::TargetLowering> createTargetLowering(::Ice::Cfg *Func) {
  return ::Ice::X8664::TargetX8664::create(Func);
}

std::unique_ptr<::Ice::TargetDataLowering>
createTargetDataLowering(::Ice::GlobalContext *Ctx) {
  return ::Ice::X8664::TargetDataX8664::create(Ctx);
}

std::unique_ptr<::Ice::TargetHeaderLowering>
createTargetHeaderLowering(::Ice::GlobalContext *Ctx) {
  return ::Ice::X8664::TargetHeaderX86::create(Ctx);
}

void staticInit(::Ice::GlobalContext *Ctx) {
  ::Ice::X8664::TargetX8664::staticInit(Ctx);
}

bool shouldBePooled(const class ::Ice::Constant *C) {
  return ::Ice::X8664::TargetX8664::shouldBePooled(C);
}

::Ice::Type getPointerType() {
  return ::Ice::X8664::TargetX8664::getPointerType();
}

} // namespace X8664

namespace Ice {
namespace X8664 {

/// The number of bits in a byte
static constexpr uint32_t X86_CHAR_BIT = 8;
/// Size of the return address on the stack
static constexpr uint32_t X86_RET_IP_SIZE_BYTES = 8;

/// \name Limits for unrolling memory intrinsics.
/// @{
static constexpr uint32_t MEMCPY_UNROLL_LIMIT = 8;
static constexpr uint32_t MEMMOVE_UNROLL_LIMIT = 8;
static constexpr uint32_t MEMSET_UNROLL_LIMIT = 8;
/// @}

// The Microsoft x64 ABI requires the caller to allocate a minimum 32-byte
// "shadow store" (aka "home space") so that the callee may copy the 4
// register args to it.
SizeT getShadowStoreSize() {
#if defined(_WIN64)
  static const SizeT ShadowStoreSize = 4 * typeWidthInBytes(WordType);
  return ShadowStoreSize;
#else
  return 0;
#endif
}
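
// A sketch of what this means for a Win64 caller (illustrative only; the
// exact frame layout is up to the rest of the lowering):
//
//   sub  rsp, 40      ; 32-byte shadow store plus 8 bytes for alignment
//   call callee       ; callee may spill rcx/rdx/r8/r9 into the 32 bytes
//                     ; just above its return address
//
// On non-Windows targets the size is 0, as returned above.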

BoolFoldingEntry::BoolFoldingEntry(Inst *I)
    : Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)) {}

BoolFolding::BoolFoldingProducerKind
BoolFolding::getProducerKind(const Inst *Instr) {
  if (llvm::isa<InstIcmp>(Instr)) {
    return PK_Icmp32;
  }
  if (llvm::isa<InstFcmp>(Instr))
    return PK_Fcmp;
  if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
    switch (Arith->getOp()) {
    default:
      return PK_None;
    case InstArithmetic::And:
    case InstArithmetic::Or:
      return PK_Arith;
    }
  }
  return PK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return PK_None;
    case InstCast::Trunc:
      return PK_Trunc;
    }
  }
  return PK_None;
}

BoolFolding::BoolFoldingConsumerKind
BoolFolding::getConsumerKind(const Inst *Instr) {
  if (llvm::isa<InstBr>(Instr))
    return CK_Br;
  if (llvm::isa<InstSelect>(Instr))
    return CK_Select;
  return CK_None; // TODO(stichnot): remove this

  if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
    switch (Cast->getCastKind()) {
    default:
      return CK_None;
    case InstCast::Sext:
      return CK_Sext;
    case InstCast::Zext:
      return CK_Zext;
    }
  }
  return CK_None;
}
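
// A minimal sketch (with hypothetical operands) of the folding these
// producer/consumer kinds enable:
//
//   b = icmp eq i32 a, 0      ; producer (PK_Icmp32)
//   br i1 b, label L1, L2     ; consumer (CK_Br)
//
// can be lowered as a fused "cmp a, 0; je L1; jmp L2" instead of
// materializing b in a register with setcc and testing it again before
// branching.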

/// Returns true if the producing instruction has a "complex" lowering sequence.
/// This generally means that its lowering sequence requires more than one
/// conditional branch, namely 64-bit integer compares and some floating-point
/// compares. When this is true, and there is more than one consumer, we prefer
/// to disable the folding optimization because disabling it minimizes the
/// total number of branches.

bool BoolFolding::hasComplexLowering(const Inst *Instr) {
  switch (getProducerKind(Instr)) {
  default:
    return false;
  case PK_Icmp64:
    return false;
  case PK_Fcmp:
    return TargetX8664::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()]
               .C2 != CondX86::Br_None;
  }
}
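
// For example (a sketch, assuming the usual SSE compare lowering): an fcmp
// whose TableFcmp entry carries a second condition (C2 != Br_None) needs two
// branches, because ucomiss/ucomisd sets PF for unordered operands; "ordered
// not-equal" lowers roughly to "ucomiss; jp <false>; jne <true>". Duplicating
// such a producer for each consumer would duplicate both branches.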

bool BoolFolding::isValidFolding(
    BoolFolding::BoolFoldingProducerKind ProducerKind,
    BoolFolding::BoolFoldingConsumerKind ConsumerKind) {
  switch (ProducerKind) {
  default:
    return false;
  case PK_Icmp32:
  case PK_Icmp64:
  case PK_Fcmp:
    return (ConsumerKind == CK_Br) || (ConsumerKind == CK_Select);
  case PK_Arith:
    return ConsumerKind == CK_Br;
  }
}

void BoolFolding::init(CfgNode *Node) {
  Producers.clear();
  for (Inst &Instr : Node->getInsts()) {
    if (Instr.isDeleted())
      continue;
    invalidateProducersOnStore(&Instr);
    // Check whether Instr is a valid producer.
    Variable *Var = Instr.getDest();
    if (Var) { // only consider instructions with an actual dest var
      if (isBooleanType(Var->getType())) {        // only bool-type dest vars
        if (getProducerKind(&Instr) != PK_None) { // white-listed instructions
          Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
        }
      }
    }
    // Check each src variable against the map.
    FOREACH_VAR_IN_INST(Var, Instr) {
      SizeT VarNum = Var->getIndex();
      if (!containsValid(VarNum))
        continue;
      // All valid consumers use Var as the first source operand
      if (IndexOfVarOperandInInst(Var) != 0) {
        setInvalid(VarNum);
        continue;
      }
      // Consumer instructions must be white-listed
      BoolFolding::BoolFoldingConsumerKind ConsumerKind =
          getConsumerKind(&Instr);
      if (ConsumerKind == CK_None) {
        setInvalid(VarNum);
        continue;
      }
      BoolFolding::BoolFoldingProducerKind ProducerKind =
          getProducerKind(Producers[VarNum].Instr);
      if (!isValidFolding(ProducerKind, ConsumerKind)) {
        setInvalid(VarNum);
        continue;
      }
      // Avoid creating multiple copies of complex producer instructions.
      if (Producers[VarNum].IsComplex && Producers[VarNum].NumUses > 0) {
        setInvalid(VarNum);
        continue;
      }
      ++Producers[VarNum].NumUses;
      if (Instr.isLastUse(Var)) {
        Producers[VarNum].IsLiveOut = false;
      }
    }
  }
  for (auto &I : Producers) {
    // Ignore entries previously marked invalid.
    if (I.second.Instr == nullptr)
      continue;
    // Disable the producer if its dest may be live beyond this block.
    if (I.second.IsLiveOut) {
      setInvalid(I.first);
      continue;
    }
    // Mark as "dead" rather than outright deleting. This is so that other
    // peephole style optimizations during or before lowering have access to
    // this instruction in undeleted form. See for example
    // tryOptimizedCmpxchgCmpBr().
    I.second.Instr->setDead();
  }
}

const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
  auto *Var = llvm::dyn_cast<const Variable>(Opnd);
  if (Var == nullptr)
    return nullptr;
  SizeT VarNum = Var->getIndex();
  auto Element = Producers.find(VarNum);
  if (Element == Producers.end())
    return nullptr;
  return Element->second.Instr;
}

void BoolFolding::dump(const Cfg *Func) const {
  if (!BuildDefs::dump() || !Func->isVerbose(IceV_Folding))
    return;
  OstreamLocker L(Func->getContext());
  Ostream &Str = Func->getContext()->getStrDump();
  for (auto &I : Producers) {
    if (I.second.Instr == nullptr)
      continue;
    Str << "Found foldable producer:\n  ";
    I.second.Instr->dump(Func);
    Str << "\n";
  }
}

/// If the given instruction has potential memory side effects (e.g. store, rmw,
/// or a call instruction with potential memory side effects), then we must not
/// allow a pre-store Producer instruction with memory operands to be folded
/// into a post-store Consumer instruction.  If this is detected, the Producer
/// is invalidated.
///
/// We use the Producer's IsLiveOut field to determine whether any potential
/// Consumers come after this store instruction.  The IsLiveOut field is
/// initialized to true, and BoolFolding::init() sets IsLiveOut to false when it
/// sees the variable's definitive last use (indicating the variable is not in
/// the node's live-out set).  Thus if we see here that IsLiveOut is false, we
/// know that there can be no consumers after the store, and therefore we know
/// the folding is safe despite the store instruction.

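// A sketch of the hazard this guards against (hypothetical operands):
//
//   b = icmp eq i32 [mem], 0   ; producer with a memory operand
//   store i32 1, [mem]         ; may alias the producer's operand
//   br i1 b, label L1, L2      ; consumer after the store
//
// Folding the producer into the consumer would move the memory read past the
// store, changing the value compared, so the producer must be invalidated
// unless IsLiveOut is already false.
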
void BoolFolding::invalidateProducersOnStore(const Inst *Instr) {
  if (!Instr->isMemoryWrite())
    return;
  for (auto &ProducerPair : Producers) {
    if (!ProducerPair.second.IsLiveOut)
      continue;
    Inst *PInst = ProducerPair.second.Instr;
    if (PInst == nullptr)
      continue;
    bool HasMemOperand = false;
    const SizeT SrcSize = PInst->getSrcSize();
    for (SizeT I = 0; I < SrcSize; ++I) {
      if (llvm::isa<X86OperandMem>(PInst->getSrc(I))) {
        HasMemOperand = true;
        break;
      }
    }
    if (!HasMemOperand)
      continue;
    setInvalid(ProducerPair.first);
  }
}

void TargetX8664::initNodeForLowering(CfgNode *Node) {
  FoldingInfo.init(Node);
  FoldingInfo.dump(Func);
}

TargetX8664::TargetX8664(Cfg *Func) : TargetX86(Func) {}

void TargetX8664::staticInit(GlobalContext *Ctx) {
  RegNumT::setLimit(RegX8664::Reg_NUM);
  RegX8664::initRegisterSet(getFlags(), &TypeToRegisterSet, &RegisterAliases);
  for (size_t i = 0; i < TypeToRegisterSet.size(); ++i)
    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
  filterTypeToRegisterSet(Ctx, RegX8664::Reg_NUM, TypeToRegisterSet.data(),
                          TypeToRegisterSet.size(), RegX8664::getRegName,
                          getRegClassName);
}

bool TargetX8664::shouldBePooled(const Constant *C) {
  if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(C)) {
    return !Utils::isPositiveZero(ConstFloat->getValue());
  }
  if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
    return !Utils::isPositiveZero(ConstDouble->getValue());
  }
  return false;
}

::Ice::Type TargetX8664::getPointerType() { return ::Ice::IceType_i64; }

void TargetX8664::translateO2() {
  TimerMarker T(TimerStack::TT_O2, Func);

  genTargetHelperCalls();
  Func->dump("After target helper call insertion");

  // Merge Alloca instructions, and lay out the stack.
  static constexpr bool SortAndCombineAllocas = true;
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  // Run this early so it can be used to focus optimizations on potentially hot
  // code.
  // TODO(stichnot,ascull): currently only used for regalloc, not for the
  // expensive high-level optimizations that could be focused on potentially
  // hot code.
  Func->generateLoopInfo();
  Func->dump("After loop analysis");
  if (getFlags().getLoopInvariantCodeMotion()) {
    Func->loopInvariantCodeMotion();
    Func->dump("After LICM");
  }

  if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
    Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
    Func->dump("After Local CSE");
    Func->floatConstantCSE();
  }
  if (getFlags().getEnableShortCircuit()) {
    Func->shortCircuitJumps();
    Func->dump("After Short Circuiting");
  }

  if (!getFlags().getEnablePhiEdgeSplit()) {
    // Lower Phi instructions.
    Func->placePhiLoads();
    if (Func->hasError())
      return;
    Func->placePhiStores();
    if (Func->hasError())
      return;
    Func->deletePhis();
    if (Func->hasError())
      return;
    Func->dump("After Phi lowering");
  }

  // Address mode optimization.
  Func->getVMetadata()->init(VMK_SingleDefs);
  Func->doAddressOpt();
  Func->materializeVectorShuffles();

  // Find read-modify-write opportunities. Do this after address mode
  // optimization so that doAddressOpt() doesn't need to be applied to RMW
  // instructions as well.
  findRMW();
  Func->dump("After RMW transform");

  // Argument lowering
  Func->doArgLowering();

  // Target lowering. This requires liveness analysis for some parts of the
  // lowering decisions, such as compare/branch fusing. If non-lightweight
  // liveness analysis is used, the instructions need to be renumbered first.
  // TODO: This renumbering should only be necessary if we're actually
  // calculating live intervals, which we only do for register allocation.
  Func->renumberInstructions();
  if (Func->hasError())
    return;

  // TODO: It should be sufficient to use the fastest liveness calculation,
  // i.e. livenessLightweight(). However, for some reason that slows down the
  // rest of the translation. Investigate.
  Func->liveness(Liveness_Basic);
  if (Func->hasError())
    return;
  Func->dump("After x86 address mode opt");

  doLoadOpt();

  Func->genCode();
  if (Func->hasError())
    return;
  Func->dump("After x86 codegen");
  splitBlockLocalVariables(Func);

  // Register allocation. This requires instruction renumbering and full
  // liveness analysis. Loops must be identified before liveness so variable
  // use weights are correct.
  Func->renumberInstructions();
  if (Func->hasError())
    return;
  Func->liveness(Liveness_Intervals);
  if (Func->hasError())
    return;
  // The post-codegen dump is done here, after liveness analysis and associated
  // cleanup, to make the dump cleaner and more useful.
  Func->dump("After initial x86 codegen");
  // Validate the live range computations. The expensive validation call is
  // deliberately only made when assertions are enabled.
  assert(Func->validateLiveness());
  Func->getVMetadata()->init(VMK_All);
  regAlloc(RAK_Global);
  if (Func->hasError())
    return;
  Func->dump("After linear scan regalloc");

  if (getFlags().getEnablePhiEdgeSplit()) {
    Func->advancedPhiLowering();
    Func->dump("After advanced Phi lowering");
  }

  // Stack frame mapping.
  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");

  Func->contractEmptyNodes();
  Func->reorderNodes();

  // Branch optimization.  This needs to be done just before code emission. In
  // particular, no transformations that insert or reorder CfgNodes should be
  // done after branch optimization. We go ahead and do it before nop insertion
  // to reduce the amount of work needed for searching for opportunities.
  Func->doBranchOpt();
  Func->dump("After branch optimization");
}

void TargetX8664::translateOm1() {
  TimerMarker T(TimerStack::TT_Om1, Func);

  genTargetHelperCalls();

  // Do not merge Alloca instructions, and lay out the stack.
  // static constexpr bool SortAndCombineAllocas = false;
  static constexpr bool SortAndCombineAllocas =
      true; // TODO(b/171222930): Fix Win32 bug when this is false
  Func->processAllocas(SortAndCombineAllocas);
  Func->dump("After Alloca processing");

  Func->placePhiLoads();
  if (Func->hasError())
    return;
  Func->placePhiStores();
  if (Func->hasError())
    return;
  Func->deletePhis();
  if (Func->hasError())
    return;
  Func->dump("After Phi lowering");

  Func->doArgLowering();
  Func->genCode();
  if (Func->hasError())
    return;
  Func->dump("After initial x86 codegen");

  regAlloc(RAK_InfOnly);
  if (Func->hasError())
    return;
  Func->dump("After regalloc of infinite-weight variables");

  Func->genFrame();
  if (Func->hasError())
    return;
  Func->dump("After stack frame mapping");
}

inline bool canRMW(const InstArithmetic *Arith) {
  Type Ty = Arith->getDest()->getType();
  // X86 vector instructions write to a register and have no RMW option.
  if (isVectorType(Ty))
    return false;
  bool isI64 = Ty == IceType_i64;

  switch (Arith->getOp()) {
  // Not handled for lack of simple lowering:
  //   shift on i64
  //   mul, udiv, urem, sdiv, srem, frem
  // Not handled for lack of RMW instructions:
  //   fadd, fsub, fmul, fdiv (also vector types)
  default:
    return false;
  case InstArithmetic::Add:
  case InstArithmetic::Sub:
  case InstArithmetic::And:
  case InstArithmetic::Or:
  case InstArithmetic::Xor:
    return true;
  case InstArithmetic::Shl:
  case InstArithmetic::Lshr:
  case InstArithmetic::Ashr:
    return false; // TODO(stichnot): implement
    return !isI64;
  }
}

bool isSameMemAddressOperand(const Operand *A, const Operand *B) {
  if (A == B)
    return true;
  if (auto *MemA = llvm::dyn_cast<X86OperandMem>(A)) {
    if (auto *MemB = llvm::dyn_cast<X86OperandMem>(B)) {
      return MemA->getBase() == MemB->getBase() &&
             MemA->getOffset() == MemB->getOffset() &&
             MemA->getIndex() == MemB->getIndex() &&
             MemA->getShift() == MemB->getShift() &&
             MemA->getSegmentRegister() == MemB->getSegmentRegister();
    }
  }
  return false;
}

void TargetX8664::findRMW() {
  TimerMarker _(TimerStack::TT_findRMW, Func);
  Func->dump("Before RMW");
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->lockStr();
  for (CfgNode *Node : Func->getNodes()) {
    // Walk through the instructions, considering each sequence of 3
    // instructions, and look for the particular RMW pattern. Note that this
    // search can be "broken" (false negatives) if there are intervening
    // deleted instructions, or intervening instructions that could be safely
    // moved out of the way to reveal an RMW pattern.
    auto E = Node->getInsts().end();
    auto I1 = E, I2 = E, I3 = Node->getInsts().begin();
    for (; I3 != E; I1 = I2, I2 = I3, ++I3) {
      // Make I3 skip over deleted instructions.
      while (I3 != E && I3->isDeleted())
        ++I3;
      if (I1 == E || I2 == E || I3 == E)
        continue;
      assert(!I1->isDeleted());
      assert(!I2->isDeleted());
      assert(!I3->isDeleted());
      auto *Load = llvm::dyn_cast<InstLoad>(I1);
      auto *Arith = llvm::dyn_cast<InstArithmetic>(I2);
      auto *Store = llvm::dyn_cast<InstStore>(I3);
      if (!Load || !Arith || !Store)
        continue;
      // Look for:
      //   a = Load addr
      //   b = <op> a, other
      //   Store b, addr
      // Change to:
      //   a = Load addr
      //   b = <op> a, other
      //   x = FakeDef
      //   RMW <op>, addr, other, x
      //   b = Store b, addr, x
      // Note that inferTwoAddress() makes sure setDestRedefined() gets called
      // on the updated Store instruction, to avoid liveness problems later.
      //
      // With this transformation, the Store instruction acquires a Dest
      // variable and is now subject to dead code elimination if there are no
      // more uses of "b".  Variable "x" is a beacon for determining whether the
      // Store instruction gets dead-code eliminated.  If the Store instruction
      // is eliminated, then it must be the case that the RMW instruction ends
      // x's live range, and therefore the RMW instruction will be retained and
      // later lowered.  On the other hand, if the RMW instruction does not end
      // x's live range, then the Store instruction must still be present, and
      // therefore the RMW instruction is ignored during lowering because it is
      // redundant with the Store instruction.
      //
      // Note that if "a" has further uses, the RMW transformation may still
      // trigger, resulting in two loads and one store, which is worse than the
      // original one load and one store.  However, this is probably rare, and
      // caching probably keeps it just as fast.
      if (!isSameMemAddressOperand(Load->getLoadAddress(),
                                   Store->getStoreAddress()))
        continue;
      Operand *ArithSrcFromLoad = Arith->getSrc(0);
      Operand *ArithSrcOther = Arith->getSrc(1);
      if (ArithSrcFromLoad != Load->getDest()) {
        if (!Arith->isCommutative() || ArithSrcOther != Load->getDest())
          continue;
        std::swap(ArithSrcFromLoad, ArithSrcOther);
      }
      if (Arith->getDest() != Store->getData())
        continue;
      if (!canRMW(Arith))
        continue;
      if (Func->isVerbose(IceV_RMW)) {
        Ostream &Str = Func->getContext()->getStrDump();
        Str << "Found RMW in " << Func->getFunctionName() << ":\n  ";
        Load->dump(Func);
        Str << "\n  ";
        Arith->dump(Func);
        Str << "\n  ";
        Store->dump(Func);
        Str << "\n";
      }
      Variable *Beacon = Func->makeVariable(IceType_i32);
      Beacon->setMustNotHaveReg();
      Store->setRmwBeacon(Beacon);
      auto *BeaconDef = InstFakeDef::create(Func, Beacon);
      Node->getInsts().insert(I3, BeaconDef);
      auto *RMW =
          InstX86FakeRMW::create(Func, ArithSrcOther, Store->getStoreAddress(),
                                 Beacon, Arith->getOp());
      Node->getInsts().insert(I3, RMW);
    }
  }
  if (Func->isVerbose(IceV_RMW))
    Func->getContext()->unlockStr();
}

/// Value is in bytes. Return Value adjusted to the next highest multiple of
/// the stack alignment.
uint32_t TargetX8664::applyStackAlignment(uint32_t Value) {
  return Utils::applyAlignment(Value, X86_STACK_ALIGNMENT_BYTES);
}
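
// For example, with a 16-byte stack alignment, applyStackAlignment(20)
// returns 32, and applyStackAlignment(32) returns 32 (already aligned).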

// Converts a ConstantInteger32 operand into its constant value, or
// MemoryOrderInvalid if the operand is not a ConstantInteger32.
inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
  if (auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
    return Integer->getValue();
  return Intrinsics::MemoryOrderInvalid;
}

/// Determines whether the dest of a Load instruction can be folded into one of
/// the src operands of a 2-operand instruction. This is true as long as the
/// load dest matches exactly one of the binary instruction's src operands.
/// Replaces Src0 or Src1 with LoadSrc if the answer is true.
inline bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
                                      Operand *&Src0, Operand *&Src1) {
  if (Src0 == LoadDest && Src1 != LoadDest) {
    Src0 = LoadSrc;
    return true;
  }
  if (Src0 != LoadDest && Src1 == LoadDest) {
    Src1 = LoadSrc;
    return true;
  }
  return false;
}
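
// A sketch of the rewrite this enables in doLoadOpt() below (hypothetical
// operands):
//
//   a = load [addr]     ; a's live range ends at the add
//   c = add a, b        ; becomes:  c = add [addr], b
//
// so the load disappears into the consuming instruction's source operand.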

void TargetX8664::doLoadOpt() {
  TimerMarker _(TimerStack::TT_loadOpt, Func);
  for (CfgNode *Node : Func->getNodes()) {
    Context.init(Node);
    while (!Context.atEnd()) {
      Variable *LoadDest = nullptr;
      Operand *LoadSrc = nullptr;
      Inst *CurInst = iteratorToInst(Context.getCur());
      Inst *Next = Context.getNextInst();
      // Determine whether the current instruction is a Load instruction or
      // equivalent.
      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
        // An InstLoad qualifies unless it uses a 64-bit absolute address,
        // which requires legalization to insert a copy to register.
        // TODO(b/148272103): Fold these after legalization.
        if (!llvm::isa<Constant>(Load->getLoadAddress())) {
          LoadDest = Load->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Load->getLoadAddress(),
                                      LoadDest->getType(), DoLegalize);
        }
      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsic>(CurInst)) {
        // An AtomicLoad intrinsic qualifies as long as it has a valid memory
        // ordering, and can be implemented in a single instruction (i.e., not
        // i64 on x86-32).
        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicID();
        if (ID == Intrinsics::AtomicLoad &&
            Intrinsics::isMemoryOrderValid(
                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
          LoadDest = Intrin->getDest();
          constexpr bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
                                      DoLegalize);
        }
      }
      // A Load instruction can be folded into the following instruction only
      // if the following instruction ends the Load's Dest variable's live
      // range.
      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
        assert(LoadSrc);
        Inst *NewInst = nullptr;
        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
          Operand *Src0 = Arith->getSrc(0);
          Operand *Src1 = Arith->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstArithmetic::create(Func, Arith->getOp(),
                                             Arith->getDest(), Src0, Src1);
          }
        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
          Operand *Src0 = Icmp->getSrc(0);
          Operand *Src1 = Icmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
                                       Icmp->getDest(), Src0, Src1);
          }
        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
          Operand *Src0 = Fcmp->getSrc(0);
          Operand *Src1 = Fcmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
                                       Fcmp->getDest(), Src0, Src1);
          }
        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
          Operand *Src0 = Select->getTrueOperand();
          Operand *Src1 = Select->getFalseOperand();
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstSelect::create(Func, Select->getDest(),
                                         Select->getCondition(), Src0, Src1);
          }
        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
          // The load dest can always be folded into a Cast instruction.
          auto *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
          if (Src0 == LoadDest) {
            NewInst = InstCast::create(Func, Cast->getCastKind(),
                                       Cast->getDest(), LoadSrc);
          }
        }
        if (NewInst) {
          CurInst->setDeleted();
          Next->setDeleted();
          Context.insert(NewInst);
          // Update NewInst->LiveRangesEnded so that target lowering may
          // benefit. Also update NewInst->HasSideEffects.
          NewInst->spliceLivenessInfo(Next, CurInst);
        }
      }
      Context.advanceCur();
      Context.advanceNext();
    }
  }
  Func->dump("After load optimization");
}

bool TargetX8664::doBranchOpt(Inst *I, const CfgNode *NextNode) {
  if (auto *Br = llvm::dyn_cast<InstX86Br>(I)) {
    return Br->optimizeBranch(NextNode);
  }
  return false;
}

Variable *TargetX8664::getPhysicalRegister(RegNumT RegNum, Type Ty) {
  if (Ty == IceType_void)
    Ty = IceType_i32;
  if (PhysicalRegisters[Ty].empty())
    PhysicalRegisters[Ty].resize(RegX8664::Reg_NUM);
  assert(unsigned(RegNum) < PhysicalRegisters[Ty].size());
  Variable *Reg = PhysicalRegisters[Ty][RegNum];
  if (Reg == nullptr) {
    Reg = Func->makeVariable(Ty);
    Reg->setRegNum(RegNum);
    PhysicalRegisters[Ty][RegNum] = Reg;
    // Specially mark a named physical register as an "argument" so that it is
    // considered live upon function entry.  Otherwise it's possible to get
    // liveness validation errors for saving callee-save registers.
    Func->addImplicitArg(Reg);
    // Don't bother tracking the live range of a named physical register.
    Reg->setIgnoreLiveness();
  }
  assert(RegX8664::getGprForType(Ty, RegNum) == RegNum);
  return Reg;
}

const char *TargetX8664::getRegName(RegNumT RegNum, Type Ty) const {
  return RegX8664::getRegName(RegX8664::getGprForType(Ty, RegNum));
}

void TargetX8664::emitVariable(const Variable *Var) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  if (Var->hasReg()) {
    Str << "%" << getRegName(Var->getRegNum(), Var->getType());
    return;
  }
  if (Var->mustHaveReg()) {
    llvm::report_fatal_error("Infinite-weight Variable (" + Var->getName() +
                             ") has no register assigned - function " +
                             Func->getFunctionName());
  }
  const int32_t Offset = Var->getStackOffset();
  auto BaseRegNum = Var->getBaseRegNum();
  if (BaseRegNum.hasNoValue())
    BaseRegNum = getFrameOrStackReg();

  // Print in the form "Offset(%reg)", omitting Offset when it is 0.
  if (getFlags().getDecorateAsm()) {
    Str << Var->getSymbolicStackOffset();
  } else if (Offset != 0) {
    Str << Offset;
  }
  const Type FrameSPTy = WordType;
  Str << "(%" << getRegName(BaseRegNum, FrameSPTy) << ")";
}
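
// For example (illustrative): a variable assigned to a register emits
// "%rax"; a spilled variable at offset 16 from the base register emits
// "16(%rbp)"; and a zero offset emits just "(%rsp)".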

void TargetX8664::addProlog(CfgNode *Node) {
  // Stack frame layout:
  //
  // +------------------------+  ^ +
  // | 1. return address      |  |
  // +------------------------+  v -
  // | 2. preserved registers |
  // +------------------------+ <--- BasePointer (if used)
  // | 3. padding             |
  // +------------------------+
  // | 4. global spill area   |
  // +------------------------+
  // | 5. padding             |
  // +------------------------+
  // | 6. local spill area    |
  // +------------------------+
  // | 7. padding             |
  // +------------------------+
  // | 7.5 shadow (WinX64)    |
  // +------------------------+
  // | 8. allocas             |
  // +------------------------+
  // | 9. padding             |
  // +------------------------+
  // | 10. out args           |
  // +------------------------+ <--- StackPointer
  //
  // The following variables record the size in bytes of the given areas:
  //  * X86_RET_IP_SIZE_BYTES:   area 1
  //  * PreservedRegsSizeBytes:  area 2
  //  * SpillAreaPaddingBytes:   area 3
  //  * GlobalsSize:             area 4
  //  * LocalsSlotsPaddingBytes: area 5
  //  * GlobalsAndSubsequentPaddingSize: areas 4 - 5
  //  * LocalsSpillAreaSize:     area 6
  //  * FixedAllocaSizeBytes:    areas 7 - 8
  //  * SpillAreaSizeBytes:      areas 3 - 10
  //  * maxOutArgsSizeBytes():   areas 9 - 10

  // Determine stack frame offsets for each Variable without a register
  // assignment. This can be done as one variable per stack slot. Or, do
  // coalescing by running the register allocator again with an infinite set of
  // registers (as a side effect, this gives variables a second chance at
  // physical register assignment).
  //
  // A middle ground approach is to leverage sparsity and allocate one block of
  // space on the frame for globals (variables with multi-block lifetime), and
  // one block to share for locals (single-block lifetime).

  const SizeT ShadowStoreSize = getShadowStoreSize();

  // StackPointer: points just past return address of calling function

  Context.init(Node);
  Context.setInsertPoint(Context.getCur());

  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  RegsUsed = SmallBitVector(CalleeSaves.size());
  VarList SortedSpilledVariables, VariablesLinkedToSpillSlots;
  size_t GlobalsSize = 0;
  // If there is a separate locals area, this represents that area. Otherwise
  // it counts any variable not counted by GlobalsSize.
  SpillAreaSizeBytes = 0;
  // If there is a separate locals area, this specifies the alignment for it.
  uint32_t LocalsSlotsAlignmentBytes = 0;
  // The entire spill locations area gets aligned to largest natural alignment
  // of the variables that have a spill slot.
  uint32_t SpillAreaAlignmentBytes = 0;
  // A spill slot linked to a variable with a stack slot should reuse that
  // stack slot.
  std::function<bool(Variable *)> TargetVarHook =
      [&VariablesLinkedToSpillSlots](Variable *Var) {
        // TODO(stichnot): Refactor this into the base class.
        Variable *Root = Var->getLinkedToStackRoot();
        if (Root != nullptr) {
          assert(!Root->hasReg());
          if (!Root->hasReg()) {
            VariablesLinkedToSpillSlots.push_back(Var);
            return true;
          }
        }
        return false;
      };

  // Compute the list of spilled variables and bounds for GlobalsSize, etc.
  getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
                        &SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
                        &LocalsSlotsAlignmentBytes, TargetVarHook);
  uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
  SpillAreaSizeBytes += GlobalsSize;

  // Add push instructions for preserved registers.
  uint32_t NumCallee = 0;
  size_t PreservedRegsSizeBytes = 0;
  SmallBitVector Pushed(CalleeSaves.size());
  for (RegNumT i : RegNumBVIter(CalleeSaves)) {
    const auto Canonical = RegX8664::getBaseReg(i);
    assert(Canonical == RegX8664::getBaseReg(Canonical));
    if (RegsUsed[i]) {
      Pushed[Canonical] = true;
    }
  }
  for (RegNumT RegNum : RegNumBVIter(Pushed)) {
    assert(RegNum == RegX8664::getBaseReg(RegNum));
    ++NumCallee;
    if (RegX8664::isXmm(RegNum)) {
      PreservedRegsSizeBytes += 16;
    } else {
      PreservedRegsSizeBytes += typeWidthInBytes(WordType);
    }
    _push_reg(RegNum);
  }
  Ctx->statsUpdateRegistersSaved(NumCallee);

  // StackPointer: points past preserved registers at start of spill area

  // Generate "push frameptr; mov frameptr, stackptr"
  if (IsEbpBasedFrame) {
    assert(
        (RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)).count() ==
        0);
    PreservedRegsSizeBytes += typeWidthInBytes(WordType);
    _link_bp();
  }

  // Align the variables area. SpillAreaPaddingBytes is the size of the region
  // after the preserved registers and before the spill areas.
  // LocalsSlotsPaddingBytes is the amount of padding between the globals and
  // locals area if they are separate.
  assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
  uint32_t SpillAreaPaddingBytes = 0;
  uint32_t LocalsSlotsPaddingBytes = 0;
  alignStackSpillAreas(X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes,
                       SpillAreaAlignmentBytes, GlobalsSize,
                       LocalsSlotsAlignmentBytes, &SpillAreaPaddingBytes,
                       &LocalsSlotsPaddingBytes);
  SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
  uint32_t GlobalsAndSubsequentPaddingSize =
      GlobalsSize + LocalsSlotsPaddingBytes;

  // Combine fixed allocations into SpillAreaSizeBytes if we are emitting the
  // fixed allocations in the prolog.
  if (PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  // Win64 ABI: add space for shadow store (aka home space)
  SpillAreaSizeBytes += ShadowStoreSize;

  // Entering the function has made the stack pointer unaligned. Re-align it by
  // adjusting the stack size.
  // Note that StackOffset does not include the spill area. It's the offset
  // from the base stack pointer (ebp), whether we set it or not, to the first
  // stack arg (if any). StackSize, on the other hand, does include the spill
  // area.
  const uint32_t StackOffset =
      ShadowStoreSize + X86_RET_IP_SIZE_BYTES + PreservedRegsSizeBytes;
  uint32_t StackSize = Utils::applyAlignment(StackOffset + SpillAreaSizeBytes,
                                             RequiredStackAlignment);
  StackSize = Utils::applyAlignment(StackSize + maxOutArgsSizeBytes(),
                                    RequiredStackAlignment);
  SpillAreaSizeBytes = StackSize - StackOffset; // Adjust for alignment, if any

  if (SpillAreaSizeBytes) {
    auto *Func = Node->getCfg();
    if (SpillAreaSizeBytes > Func->getStackSizeLimit()) {
      Func->setError("Stack size limit exceeded");
    }

    emitStackProbe(SpillAreaSizeBytes);

    // Generate "sub stackptr, SpillAreaSizeBytes"
    _sub_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
  }

  // StackPointer: points just past the spill area (end of stack frame)

  // Account for known-frame-offset alloca instructions that were not already
  // combined into the prolog.
  if (!PrologEmitsFixedAllocas)
    SpillAreaSizeBytes += FixedAllocaSizeBytes;

  Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);

  // Fill in stack offsets for stack args, and copy args into registers for
  // those that were register-allocated. Args are pushed right to left, so
  // Arg[0] is closest to the stack/frame pointer.
  RegNumT FrameOrStackReg = IsEbpBasedFrame ? getFrameReg() : getStackReg();
  Variable *FramePtr = getPhysicalRegister(FrameOrStackReg, WordType);
  size_t BasicFrameOffset = StackOffset;
  if (!IsEbpBasedFrame)
    BasicFrameOffset += SpillAreaSizeBytes;

  const VarList &Args = Func->getArgs();
  size_t InArgsSizeBytes = 0;
  unsigned NumXmmArgs = 0;
  unsigned NumGPRArgs = 0;
  for (SizeT i = 0, NumArgs = Args.size(); i < NumArgs; ++i) {
    Variable *Arg = Args[i];
    // Skip arguments passed in registers.
    if (isVectorType(Arg->getType())) {
      if (RegX8664::getRegisterForXmmArgNum(
              RegX8664::getArgIndex(i, NumXmmArgs))
              .hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else if (isScalarFloatingType(Arg->getType())) {
      if (RegX8664::getRegisterForXmmArgNum(
              RegX8664::getArgIndex(i, NumXmmArgs))
              .hasValue()) {
        ++NumXmmArgs;
        continue;
      }
    } else {
      assert(isScalarIntegerType(Arg->getType()));
      if (RegX8664::getRegisterForGprArgNum(
              WordType, RegX8664::getArgIndex(i, NumGPRArgs))
              .hasValue()) {
        ++NumGPRArgs;
        continue;
      }
    }
    // For esp-based frames where the allocas are done outside the prolog, the
    // esp value may not stabilize to its home value until after all the
    // fixed-size alloca instructions have executed.  In this case, a stack
    // adjustment is needed when accessing in-args in order to copy them into
    // registers.
    size_t StackAdjBytes = 0;
    if (!IsEbpBasedFrame && !PrologEmitsFixedAllocas)
      StackAdjBytes -= FixedAllocaSizeBytes;
    finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, StackAdjBytes,
                           InArgsSizeBytes);
  }

  // Fill in stack offsets for locals.
  assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
                      SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
                      IsEbpBasedFrame && !needsStackPointerAlignment());
  // Assign stack offsets to variables that have been linked to spilled
  // variables.
  for (Variable *Var : VariablesLinkedToSpillSlots) {
    const Variable *Root = Var->getLinkedToStackRoot();
    assert(Root != nullptr);
    Var->setStackOffset(Root->getStackOffset());

    // If the stack root variable is an arg, make this variable an arg too so
    // that stackVarToAsmAddress uses the correct base pointer (e.g. ebp on
    // x86).
    Var->setIsArg(Root->getIsArg());
  }
  this->HasComputedFrame = true;

  if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
    OstreamLocker L(Func->getContext());
    Ostream &Str = Func->getContext()->getStrDump();

    Str << "Stack layout:\n";
    uint32_t EspAdjustmentPaddingSize =
        SpillAreaSizeBytes - LocalsSpillAreaSize -
        GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes -
        maxOutArgsSizeBytes();
    Str << " in-args = " << InArgsSizeBytes << " bytes\n"
        << " return address = " << X86_RET_IP_SIZE_BYTES << " bytes\n"
        << " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
        << " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
        << " globals spill area = " << GlobalsSize << " bytes\n"
        << " globals-locals spill areas intermediate padding = "
        << GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
        << " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
        << " esp alignment padding = " << EspAdjustmentPaddingSize
        << " bytes\n";

    Str << "Stack details:\n"
        << " esp adjustment = " << SpillAreaSizeBytes << " bytes\n"
        << " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
        << " outgoing args size = " << maxOutArgsSizeBytes() << " bytes\n"
        << " locals spill area alignment = " << LocalsSlotsAlignmentBytes
        << " bytes\n"
        << " is ebp based = " << IsEbpBasedFrame << "\n";
  }
}

/// Helper function for addProlog().
///
/// This assumes Arg is an argument passed on the stack. This sets the frame
/// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
/// I64 arg that has been split into Lo and Hi components, it calls itself
/// recursively on the components, taking care to handle Lo first because of the
/// little-endian architecture. Lastly, this function generates an instruction
/// to copy Arg into its assigned register if applicable.

void TargetX8664::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
                                         size_t BasicFrameOffset,
                                         size_t StackAdjBytes,
                                         size_t &InArgsSizeBytes) {
  Type Ty = Arg->getType();
  if (isVectorType(Ty)) {
    InArgsSizeBytes = applyStackAlignment(InArgsSizeBytes);
  }
  Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
  InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
  if (Arg->hasReg()) {
    auto *Mem = X86OperandMem::create(
        Func, Ty, FramePtr,
        Ctx->getConstantInt32(Arg->getStackOffset() + StackAdjBytes));
    if (isVectorType(Arg->getType())) {
      _movp(Arg, Mem);
    } else {
      _mov(Arg, Mem);
    }
    // This argument-copying instruction uses an explicit X86OperandMem
    // operand instead of a Variable, so its fill-from-stack operation has to
    // be tracked separately for statistics.
    Ctx->statsUpdateFills();
  }
}

void TargetX8664::addEpilog(CfgNode *Node) {
  InstList &Insts = Node->getInsts();
  InstList::reverse_iterator RI, E;
  for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
    if (llvm::isa<Insts::Ret>(*RI))
      break;
  }
  if (RI == E)
    return;

  // Convert the reverse_iterator position into its corresponding (forward)
  // iterator position.
  InstList::iterator InsertPoint = reverseToForwardIterator(RI);
  --InsertPoint;
  Context.init(Node);
  Context.setInsertPoint(InsertPoint);

  if (IsEbpBasedFrame) {
    _unlink_bp();
  } else {
    // add stackptr, SpillAreaSizeBytes
    if (SpillAreaSizeBytes != 0) {
      _add_sp(Ctx->getConstantInt32(SpillAreaSizeBytes));
    }
  }

  // Add pop instructions for preserved registers.
  SmallBitVector CalleeSaves = getRegisterSet(RegSet_CalleeSave, RegSet_None);
  SmallBitVector Popped(CalleeSaves.size());
  for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
    const auto RegNum = RegNumT::fromInt(i);
    if (RegNum == getFrameReg() && IsEbpBasedFrame)
      continue;
    const RegNumT Canonical = RegX8664::getBaseReg(RegNum);
    if (CalleeSaves[i] && RegsUsed[i]) {
      Popped[Canonical] = true;
    }
  }
  for (int32_t i = Popped.size() - 1; i >= 0; --i) {
    if (!Popped[i])
      continue;
    const auto RegNum = RegNumT::fromInt(i);
    assert(RegNum == RegX8664::getBaseReg(RegNum));
    _pop_reg(RegNum);
  }
}

Type TargetX8664::stackSlotType() { return WordType; }

SmallBitVector TargetX8664::getRegisterSet(RegSetMask Include,
                                           RegSetMask Exclude) const {
  return RegX8664::getRegisterSet(getFlags(), Include, Exclude);
}

void TargetX8664::lowerAlloca(const InstAlloca *Instr) {
  // For default align=0, set it to the real value 1, to avoid any
  // bit-manipulation problems below.
  const uint32_t AlignmentParam = std::max(1u, Instr->getAlignInBytes());

  // LLVM enforces power of 2 alignment.
  assert(llvm::isPowerOf2_32(AlignmentParam));

  const uint32_t Alignment = std::max(AlignmentParam, RequiredStackAlignment);
  const bool OverAligned = Alignment > RequiredStackAlignment;
  const bool OptM1 = Func->getOptLevel() == Opt_m1;
  const bool AllocaWithKnownOffset = Instr->getKnownFrameOffset();
  const bool UseFramePointer =
      hasFramePointer() || OverAligned || !AllocaWithKnownOffset || OptM1;

  if (UseFramePointer)
    setHasFramePointer();

  Variable *esp = getPhysicalRegister(getStackReg(), WordType);
  if (OverAligned) {
    _and(esp, Ctx->getConstantInt32(-Alignment));
  }

  Variable *Dest = Instr->getDest();
  Operand *TotalSize = legalize(Instr->getSizeInBytes());

  if (const auto *ConstantTotalSize =
          llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
    const uint32_t Value =
        Utils::applyAlignment(ConstantTotalSize->getValue(), Alignment);
    if (UseFramePointer) {
      _sub_sp(Ctx->getConstantInt32(Value));
    } else {
      // If we don't need a Frame Pointer, this alloca has a known offset to the
      // stack pointer. We don't need to adjust the stack pointer, nor assign
      // any value to Dest, as Dest is rematerializable.
      assert(Dest->isRematerializable());
      FixedAllocaSizeBytes += Value;
      Context.insert<InstFakeDef>(Dest);
    }
  } else {
    // Non-constant sizes need to be adjusted to the next highest multiple of
    // the required alignment at runtime.
    Variable *T = nullptr;
    if (TotalSize->getType() != IceType_i64) {
      T = makeReg(IceType_i64);
      _movzx(T, TotalSize);
    } else {
      T = makeReg(IceType_i32);
      _mov(T, TotalSize);
    }
    _add(T, Ctx->getConstantInt32(Alignment - 1));
    _and(T, Ctx->getConstantInt32(-Alignment));
    _sub_sp(T);
  }
  // Add enough to the returned address to account for the out args area.
  uint32_t OutArgsSize = maxOutArgsSizeBytes();
  if (OutArgsSize > 0) {
    Variable *T = makeReg(Dest->getType());
    auto *CalculateOperand = X86OperandMem::create(
        Func, IceType_void, esp, Ctx->getConstantInt(IceType_i32, OutArgsSize));
    _lea(T, CalculateOperand);
    _mov(Dest, T);
  } else {
    _mov(Dest, esp);
  }
}

void TargetX8664::lowerArguments() {
  const bool OptM1 = Func->getOptLevel() == Opt_m1;
  VarList &Args = Func->getArgs();
  unsigned NumXmmArgs = 0;
  bool XmmSlotsRemain = true;
  unsigned NumGprArgs = 0;
  bool GprSlotsRemain = true;

  Context.init(Func->getEntryNode());
  Context.setInsertPoint(Context.getCur());

  for (SizeT i = 0, End = Args.size();
       i < End && (XmmSlotsRemain || GprSlotsRemain); ++i) {
    Variable *Arg = Args[i];
    Type Ty = Arg->getType();
    Variable *RegisterArg = nullptr;
    RegNumT RegNum;
    if (isVectorType(Ty)) {
      RegNum = RegX8664::getRegisterForXmmArgNum(
          RegX8664::getArgIndex(i, NumXmmArgs));
      if (RegNum.hasNoValue()) {
        XmmSlotsRemain = false;
        continue;
      }
      ++NumXmmArgs;
      RegisterArg = Func->makeVariable(Ty);
    } else if (isScalarFloatingType(Ty)) {
      RegNum = RegX8664::getRegisterForXmmArgNum(
          RegX8664::getArgIndex(i, NumXmmArgs));
      if (RegNum.hasNoValue()) {
        XmmSlotsRemain = false;
        continue;
      }
      ++NumXmmArgs;
      RegisterArg = Func->makeVariable(Ty);
    } else if (isScalarIntegerType(Ty)) {
      RegNum = RegX8664::getRegisterForGprArgNum(
          Ty, RegX8664::getArgIndex(i, NumGprArgs));
      if (RegNum.hasNoValue()) {
        GprSlotsRemain = false;
        continue;
      }
      ++NumGprArgs;
      RegisterArg = Func->makeVariable(Ty);
    }
    assert(RegNum.hasValue());
    assert(RegisterArg != nullptr);
    // Replace Arg in the argument list with the home register. Then generate
    // an instruction in the prolog to copy the home register to the assigned
    // location of Arg.
    if (BuildDefs::dump())
      RegisterArg->setName(Func, "home_reg:" + Arg->getName());
    RegisterArg->setRegNum(RegNum);
    RegisterArg->setIsArg();
    Arg->setIsArg(false);

    Args[i] = RegisterArg;
    // When not Om1, do the assignment through a temporary, instead of directly
    // from the pre-colored variable, so that a subsequent availabilityGet()
    // call has a chance to work.  (In Om1, don't bother creating extra
    // instructions with extra variables to register-allocate.)
    if (OptM1) {
      Context.insert<InstAssign>(Arg, RegisterArg);
    } else {
      Variable *Tmp = makeReg(RegisterArg->getType());
      Context.insert<InstAssign>(Tmp, RegisterArg);
      Context.insert<InstAssign>(Arg, Tmp);
    }
  }
  if (!OptM1)
    Context.availabilityUpdate();
}

/// Strength-reduce scalar integer multiplication by a constant (for i32 or
/// narrower) for certain constants. The lea instruction can be used to multiply
/// by 3, 5, or 9, and the shl instruction can be used to multiply by powers of
/// 2. These can be combined such that e.g. multiplying by 100 can be done as 2
/// lea-based multiplies by 5, combined with left-shifting by 2.

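// For example (a sketch of the emitted sequence), multiplying t by
// 100 = 5 * 5 * 4 becomes:
//
//   lea t, [t + 4*t]   ; t *= 5
//   lea t, [t + 4*t]   ; t *= 5
//   shl t, 2           ; t *= 4
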
optimizeScalarMul(Variable * Dest,Operand * Src0,int32_t Src1)1349 bool TargetX8664::optimizeScalarMul(Variable *Dest, Operand *Src0,
1350                                     int32_t Src1) {
1351   // Disable this optimization for Om1 and O0, just to keep things simple
1352   // there.
1353   if (Func->getOptLevel() < Opt_1)
1354     return false;
1355   Type Ty = Dest->getType();
1356   if (Src1 == -1) {
1357     Variable *T = nullptr;
1358     _mov(T, Src0);
1359     _neg(T);
1360     _mov(Dest, T);
1361     return true;
1362   }
1363   if (Src1 == 0) {
1364     _mov(Dest, Ctx->getConstantZero(Ty));
1365     return true;
1366   }
1367   if (Src1 == 1) {
1368     Variable *T = nullptr;
1369     _mov(T, Src0);
1370     _mov(Dest, T);
1371     return true;
1372   }
1373   // Don't bother with the edge case where Src1 == MININT.
1374   if (Src1 == -Src1)
1375     return false;
1376   const bool Src1IsNegative = Src1 < 0;
1377   if (Src1IsNegative)
1378     Src1 = -Src1;
1379   uint32_t Count9 = 0;
1380   uint32_t Count5 = 0;
1381   uint32_t Count3 = 0;
1382   uint32_t Count2 = 0;
1383   uint32_t CountOps = 0;
1384   while (Src1 > 1) {
1385     if (Src1 % 9 == 0) {
1386       ++CountOps;
1387       ++Count9;
1388       Src1 /= 9;
1389     } else if (Src1 % 5 == 0) {
1390       ++CountOps;
1391       ++Count5;
1392       Src1 /= 5;
1393     } else if (Src1 % 3 == 0) {
1394       ++CountOps;
1395       ++Count3;
1396       Src1 /= 3;
1397     } else if (Src1 % 2 == 0) {
1398       if (Count2 == 0)
1399         ++CountOps;
1400       ++Count2;
1401       Src1 /= 2;
1402     } else {
1403       return false;
1404     }
1405   }
1406   // The lea-based optimization only works for i32 and i64 types, not i8 or i16.
1407   if (Ty != IceType_i32 && Ty != IceType_i64 && (Count3 || Count5 || Count9))
1408     return false;
1409   // Limit the number of lea/shl operations for a single multiply, to a
1410   // somewhat arbitrary choice of 3.
1411   constexpr uint32_t MaxOpsForOptimizedMul = 3;
1412   if (CountOps > MaxOpsForOptimizedMul)
1413     return false;
1414   Variable *T = makeReg(WordType);
1415   if (typeWidthInBytes(Src0->getType()) < typeWidthInBytes(T->getType())) {
1416     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
1417     _movzx(T, Src0RM);
1418   } else {
1419     _mov(T, Src0);
1420   }
1421   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1422   for (uint32_t i = 0; i < Count9; ++i) {
1423     constexpr uint16_t Shift = 3; // log2(9-1)
1424     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1425   }
1426   for (uint32_t i = 0; i < Count5; ++i) {
1427     constexpr uint16_t Shift = 2; // log2(5-1)
1428     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1429   }
1430   for (uint32_t i = 0; i < Count3; ++i) {
1431     constexpr uint16_t Shift = 1; // log2(3-1)
1432     _lea(T, X86OperandMem::create(Func, IceType_void, T, Zero, T, Shift));
1433   }
1434   if (Count2) {
1435     _shl(T, Ctx->getConstantInt(Ty, Count2));
1436   }
1437   if (Src1IsNegative)
1438     _neg(T);
1439   _mov(Dest, T);
1440   return true;
1441 }
1442 
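// lowerShift64 implements a 64-bit shift in terms of the operand's two 32-bit
// halves. For example (an illustrative sketch), a constant "a = b << 40" takes
// the ShiftAmount > 32 path below and reduces to:
//   t2 = b.lo ; t2 = shl t2, 8 ; a.hi = t2 ; a.lo = 0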
1443 void TargetX8664::lowerShift64(InstArithmetic::OpKind Op, Operand *Src0Lo,
1444                                Operand *Src0Hi, Operand *Src1Lo,
1445                                Variable *DestLo, Variable *DestHi) {
1446   // TODO: Refactor the similarities between Shl, Lshr, and Ashr.
1447   Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
1448   Constant *Zero = Ctx->getConstantZero(IceType_i32);
1449   Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1450   if (auto *ConstantShiftAmount = llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
1451     uint32_t ShiftAmount = ConstantShiftAmount->getValue();
1452     if (ShiftAmount > 32) {
1453       Constant *ReducedShift = Ctx->getConstantInt32(ShiftAmount - 32);
1454       switch (Op) {
1455       default:
1456         assert(0 && "non-shift op");
1457         break;
1458       case InstArithmetic::Shl: {
1459         // a=b<<c ==>
1460         //   t2 = b.lo
1461         //   t2 = shl t2, ShiftAmount-32
1462         //   t3 = t2
1463         //   t2 = 0
1464         _mov(T_2, Src0Lo);
1465         _shl(T_2, ReducedShift);
1466         _mov(DestHi, T_2);
1467         _mov(DestLo, Zero);
1468       } break;
1469       case InstArithmetic::Lshr: {
1470         // a=b>>c (unsigned) ==>
1471         //   t2 = b.hi
1472         //   t2 = shr t2, ShiftAmount-32
1473         //   a.lo = t2
1474         //   a.hi = 0
1475         _mov(T_2, Src0Hi);
1476         _shr(T_2, ReducedShift);
1477         _mov(DestLo, T_2);
1478         _mov(DestHi, Zero);
1479       } break;
1480       case InstArithmetic::Ashr: {
1481         // a=b>>c (signed) ==>
1482         //   t3 = b.hi
1483         //   t3 = sar t3, 0x1f
1484         //   t2 = b.hi
1485         //   t2 = shrd t2, t3, ShiftAmount-32
1486         //   a.lo = t2
1487         //   a.hi = t3
1488         _mov(T_3, Src0Hi);
1489         _sar(T_3, SignExtend);
1490         _mov(T_2, Src0Hi);
1491         _shrd(T_2, T_3, ReducedShift);
1492         _mov(DestLo, T_2);
1493         _mov(DestHi, T_3);
1494       } break;
1495       }
1496     } else if (ShiftAmount == 32) {
1497       switch (Op) {
1498       default:
1499         assert(0 && "non-shift op");
1500         break;
1501       case InstArithmetic::Shl: {
1502         // a=b<<c ==>
1503         //   t2 = b.lo
1504         //   a.hi = t2
1505         //   a.lo = 0
1506         _mov(T_2, Src0Lo);
1507         _mov(DestHi, T_2);
1508         _mov(DestLo, Zero);
1509       } break;
1510       case InstArithmetic::Lshr: {
1511         // a=b>>c (unsigned) ==>
1512         //   t2 = b.hi
1513         //   a.lo = t2
1514         //   a.hi = 0
1515         _mov(T_2, Src0Hi);
1516         _mov(DestLo, T_2);
1517         _mov(DestHi, Zero);
1518       } break;
1519       case InstArithmetic::Ashr: {
1520         // a=b>>c (signed) ==>
1521         //   t2 = b.hi
1522         //   a.lo = t2
1523         //   t3 = b.hi
1524         //   t3 = sar t3, 0x1f
1525         //   a.hi = t3
1526         _mov(T_2, Src0Hi);
1527         _mov(DestLo, T_2);
1528         _mov(T_3, Src0Hi);
1529         _sar(T_3, SignExtend);
1530         _mov(DestHi, T_3);
1531       } break;
1532       }
1533     } else {
1534       // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1535       //   t2 = b.lo
1536       //   t3 = b.hi
1537       _mov(T_2, Src0Lo);
1538       _mov(T_3, Src0Hi);
1539       switch (Op) {
1540       default:
1541         assert(0 && "non-shift op");
1542         break;
1543       case InstArithmetic::Shl: {
1544         // a=b<<c ==>
1545         //   t3 = shld t3, t2, ShiftAmount
1546         //   t2 = shl t2, ShiftAmount
1547         _shld(T_3, T_2, ConstantShiftAmount);
1548         _shl(T_2, ConstantShiftAmount);
1549       } break;
1550       case InstArithmetic::Lshr: {
1551         // a=b>>c (unsigned) ==>
1552         //   t2 = shrd t2, t3, ShiftAmount
1553         //   t3 = shr t3, ShiftAmount
1554         _shrd(T_2, T_3, ConstantShiftAmount);
1555         _shr(T_3, ConstantShiftAmount);
1556       } break;
1557       case InstArithmetic::Ashr: {
1558         // a=b>>c (signed) ==>
1559         //   t2 = shrd t2, t3, ShiftAmount
1560         //   t3 = sar t3, ShiftAmount
1561         _shrd(T_2, T_3, ConstantShiftAmount);
1562         _sar(T_3, ConstantShiftAmount);
1563       } break;
1564       }
1565       // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1566       //   a.lo = t2
1567       //   a.hi = t3
1568       _mov(DestLo, T_2);
1569       _mov(DestHi, T_3);
1570     }
1571   } else {
1572     // NON-CONSTANT CASES.
1573     Constant *BitTest = Ctx->getConstantInt32(0x20);
1574     InstX86Label *Label = InstX86Label::create(Func, this);
1575     // COMMON PREFIX OF: a=b SHIFT_OP c ==>
1576     //   t1:ecx = c.lo & 0xff
1577     //   t2 = b.lo
1578     //   t3 = b.hi
1579     T_1 = copyToReg8(Src1Lo, RegX8664::Reg_cl);
1580     _mov(T_2, Src0Lo);
1581     _mov(T_3, Src0Hi);
1582     switch (Op) {
1583     default:
1584       assert(0 && "non-shift op");
1585       break;
1586     case InstArithmetic::Shl: {
1587       // a=b<<c ==>
1588       //   t3 = shld t3, t2, t1
1589       //   t2 = shl t2, t1
1590       //   test t1, 0x20
1591       //   je L1
1592       //   use(t3)
1593       //   t3 = t2
1594       //   t2 = 0
1595       _shld(T_3, T_2, T_1);
1596       _shl(T_2, T_1);
1597       _test(T_1, BitTest);
1598       _br(CondX86::Br_e, Label);
1599       // T_2 and T_3 are being assigned again because of the intra-block control
1600       // flow, so we need to use _redefined to avoid liveness problems.
1601       _redefined(_mov(T_3, T_2));
1602       _redefined(_mov(T_2, Zero));
1603     } break;
1604     case InstArithmetic::Lshr: {
1605       // a=b>>c (unsigned) ==>
1606       //   t2 = shrd t2, t3, t1
1607       //   t3 = shr t3, t1
1608       //   test t1, 0x20
1609       //   je L1
1610       //   use(t2)
1611       //   t2 = t3
1612       //   t3 = 0
1613       _shrd(T_2, T_3, T_1);
1614       _shr(T_3, T_1);
1615       _test(T_1, BitTest);
1616       _br(CondX86::Br_e, Label);
1617       // T_2 and T_3 are being assigned again because of the intra-block control
1618       // flow, so we need to use _redefined to avoid liveness problems.
1619       _redefined(_mov(T_2, T_3));
1620       _redefined(_mov(T_3, Zero));
1621     } break;
1622     case InstArithmetic::Ashr: {
1623       // a=b>>c (signed) ==>
1624       //   t2 = shrd t2, t3, t1
1625       //   t3 = sar t3, t1
1626       //   test t1, 0x20
1627       //   je L1
1628       //   use(t2)
1629       //   t2 = t3
1630       //   t3 = sar t3, 0x1f
1631       Constant *SignExtend = Ctx->getConstantInt32(0x1f);
1632       _shrd(T_2, T_3, T_1);
1633       _sar(T_3, T_1);
1634       _test(T_1, BitTest);
1635       _br(CondX86::Br_e, Label);
1636       // T_2 and T_3 are being assigned again because of the intra-block control
1637       // flow, so T_2 needs to use _redefined to avoid liveness problems. T_3
1638       // doesn't need special treatment because it is reassigned via _sar
1639       // instead of _mov.
1640       _redefined(_mov(T_2, T_3));
1641       _sar(T_3, SignExtend);
1642     } break;
1643     }
1644     // COMMON SUFFIX OF: a=b SHIFT_OP c ==>
1645     // L1:
1646     //   a.lo = t2
1647     //   a.hi = t3
1648     Context.insert(Label);
1649     _mov(DestLo, T_2);
1650     _mov(DestHi, T_3);
1651   }
1652 }
1653 
1654 void TargetX8664::lowerArithmetic(const InstArithmetic *Instr) {
1655   Variable *Dest = Instr->getDest();
1656   if (Dest->isRematerializable()) {
1657     Context.insert<InstFakeDef>(Dest);
1658     return;
1659   }
1660   Type Ty = Dest->getType();
1661   Operand *Src0 = legalize(Instr->getSrc(0));
1662   Operand *Src1 = legalize(Instr->getSrc(1));
1663   if (Instr->isCommutative()) {
1664     uint32_t SwapCount = 0;
1665     if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1)) {
1666       std::swap(Src0, Src1);
1667       ++SwapCount;
1668     }
1669     if (llvm::isa<Constant>(Src0) && !llvm::isa<Constant>(Src1)) {
1670       std::swap(Src0, Src1);
1671       ++SwapCount;
1672     }
1673     // Improve two-address code patterns by avoiding a copy to the dest
1674     // register when one of the source operands ends its lifetime here.
1675     if (!Instr->isLastUse(Src0) && Instr->isLastUse(Src1)) {
1676       std::swap(Src0, Src1);
1677       ++SwapCount;
1678     }
1679     assert(SwapCount <= 1);
1680     (void)SwapCount;
1681   }
1682   if (isVectorType(Ty)) {
1683     // TODO: Trap on integer divide and integer modulo by zero. See:
1684     // https://code.google.com/p/nativeclient/issues/detail?id=3899
1685     if (llvm::isa<X86OperandMem>(Src1))
1686       Src1 = legalizeToReg(Src1);
1687     switch (Instr->getOp()) {
1688     case InstArithmetic::_num:
1689       llvm_unreachable("Unknown arithmetic operator");
1690       break;
1691     case InstArithmetic::Add: {
1692       Variable *T = makeReg(Ty);
1693       _movp(T, Src0);
1694       _padd(T, Src1);
1695       _movp(Dest, T);
1696     } break;
1697     case InstArithmetic::And: {
1698       Variable *T = makeReg(Ty);
1699       _movp(T, Src0);
1700       _pand(T, Src1);
1701       _movp(Dest, T);
1702     } break;
1703     case InstArithmetic::Or: {
1704       Variable *T = makeReg(Ty);
1705       _movp(T, Src0);
1706       _por(T, Src1);
1707       _movp(Dest, T);
1708     } break;
1709     case InstArithmetic::Xor: {
1710       Variable *T = makeReg(Ty);
1711       _movp(T, Src0);
1712       _pxor(T, Src1);
1713       _movp(Dest, T);
1714     } break;
1715     case InstArithmetic::Sub: {
1716       Variable *T = makeReg(Ty);
1717       _movp(T, Src0);
1718       _psub(T, Src1);
1719       _movp(Dest, T);
1720     } break;
1721     case InstArithmetic::Mul: {
1722       bool TypesAreValidForPmull = Ty == IceType_v4i32 || Ty == IceType_v8i16;
1723       bool InstructionSetIsValidForPmull =
1724           Ty == IceType_v8i16 || InstructionSet >= SSE4_1;
1725       if (TypesAreValidForPmull && InstructionSetIsValidForPmull) {
1726         Variable *T = makeReg(Ty);
1727         _movp(T, Src0);
1728         _pmull(T, Src0 == Src1 ? T : Src1);
1729         _movp(Dest, T);
1730       } else if (Ty == IceType_v4i32) {
1731         // Lowering sequence:
1732         // Note: The mask arguments have index 0 on the left.
1733         //
1734         // movups  T1, Src0
1735         // pshufd  T2, Src0, {1,0,3,0}
1736         // pshufd  T3, Src1, {1,0,3,0}
1737         // # T1 = {Src0[0] * Src1[0], Src0[2] * Src1[2]}
1738         // pmuludq T1, Src1
1739         // # T2 = {Src0[1] * Src1[1], Src0[3] * Src1[3]}
1740         // pmuludq T2, T3
1741         // # T1 = {lo(T1[0]), lo(T1[2]), lo(T2[0]), lo(T2[2])}
1742         // shufps  T1, T2, {0,2,0,2}
1743         // pshufd  T4, T1, {0,2,1,3}
1744         // movups  Dest, T4
1745 
1746         // Mask that directs pshufd to create a vector with entries
1747         // Src[1, 0, 3, 0]
1748         constexpr unsigned Constant1030 = 0x31;
1749         Constant *Mask1030 = Ctx->getConstantInt32(Constant1030);
1750         // Mask that directs shufps to create a vector with entries
1751         // Dest[0, 2], Src[0, 2]
1752         constexpr unsigned Mask0202 = 0x88;
1753         // Mask that directs pshufd to create a vector with entries
1754         // Src[0, 2, 1, 3]
1755         constexpr unsigned Mask0213 = 0xd8;
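        // (Each pshufd/shufps immediate packs four 2-bit source indices,
        // lowest field first: e.g. 0x31 == 0b00'11'00'01 selects elements
        // {1, 0, 3, 0}.)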
1756         Variable *T1 = makeReg(IceType_v4i32);
1757         Variable *T2 = makeReg(IceType_v4i32);
1758         Variable *T3 = makeReg(IceType_v4i32);
1759         Variable *T4 = makeReg(IceType_v4i32);
1760         _movp(T1, Src0);
1761         _pshufd(T2, Src0, Mask1030);
1762         _pshufd(T3, Src1, Mask1030);
1763         _pmuludq(T1, Src1);
1764         _pmuludq(T2, T3);
1765         _shufps(T1, T2, Ctx->getConstantInt32(Mask0202));
1766         _pshufd(T4, T1, Ctx->getConstantInt32(Mask0213));
1767         _movp(Dest, T4);
1768       } else if (Ty == IceType_v16i8) {
1769         llvm::report_fatal_error("Scalarized operation was expected");
1770       } else {
1771         llvm::report_fatal_error("Invalid vector multiply type");
1772       }
1773     } break;
1774     case InstArithmetic::Shl: {
1775       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1776       Variable *T = makeReg(Ty);
1777       _movp(T, Src0);
1778       _psll(T, Src1);
1779       _movp(Dest, T);
1780     } break;
1781     case InstArithmetic::Lshr: {
1782       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1783       Variable *T = makeReg(Ty);
1784       _movp(T, Src0);
1785       _psrl(T, Src1);
1786       _movp(Dest, T);
1787     } break;
1788     case InstArithmetic::Ashr: {
1789       assert(llvm::isa<Constant>(Src1) && "Non-constant shift not scalarized");
1790       Variable *T = makeReg(Ty);
1791       _movp(T, Src0);
1792       _psra(T, Src1);
1793       _movp(Dest, T);
1794     } break;
1795     case InstArithmetic::Udiv:
1796     case InstArithmetic::Urem:
1797     case InstArithmetic::Sdiv:
1798     case InstArithmetic::Srem:
1799       llvm::report_fatal_error("Scalarized operation was expected");
1800       break;
1801     case InstArithmetic::Fadd: {
1802       Variable *T = makeReg(Ty);
1803       _movp(T, Src0);
1804       _addps(T, Src1);
1805       _movp(Dest, T);
1806     } break;
1807     case InstArithmetic::Fsub: {
1808       Variable *T = makeReg(Ty);
1809       _movp(T, Src0);
1810       _subps(T, Src1);
1811       _movp(Dest, T);
1812     } break;
1813     case InstArithmetic::Fmul: {
1814       Variable *T = makeReg(Ty);
1815       _movp(T, Src0);
1816       _mulps(T, Src0 == Src1 ? T : Src1);
1817       _movp(Dest, T);
1818     } break;
1819     case InstArithmetic::Fdiv: {
1820       Variable *T = makeReg(Ty);
1821       _movp(T, Src0);
1822       _divps(T, Src1);
1823       _movp(Dest, T);
1824     } break;
1825     case InstArithmetic::Frem:
1826       llvm::report_fatal_error("Scalarized operation was expected");
1827       break;
1828     }
1829     return;
1830   }
1831   Variable *T_edx = nullptr;
1832   Variable *T = nullptr;
1833   switch (Instr->getOp()) {
1834   case InstArithmetic::_num:
1835     llvm_unreachable("Unknown arithmetic operator");
1836     break;
1837   case InstArithmetic::Add: {
1838     const bool ValidType = Ty == IceType_i32 || Ty == IceType_i64;
1839     auto *Const = llvm::dyn_cast<Constant>(Instr->getSrc(1));
1840     const bool ValidKind =
1841         Const != nullptr && (llvm::isa<ConstantInteger32>(Const) ||
1842                              llvm::isa<ConstantRelocatable>(Const));
1843     if (getFlags().getAggressiveLea() && ValidType && ValidKind) {
1844       auto *Var = legalizeToReg(Src0);
1845       auto *Mem = X86OperandMem::create(Func, IceType_void, Var, Const);
1846       T = makeReg(Ty);
1847       _lea(T, Mem);
1848       _mov(Dest, T);
1849       break;
1850     }
1851     _mov(T, Src0);
1852     _add(T, Src1);
1853     _mov(Dest, T);
1854   } break;
1855   case InstArithmetic::And:
1856     _mov(T, Src0);
1857     _and(T, Src1);
1858     _mov(Dest, T);
1859     break;
1860   case InstArithmetic::Or:
1861     _mov(T, Src0);
1862     _or(T, Src1);
1863     _mov(Dest, T);
1864     break;
1865   case InstArithmetic::Xor:
1866     _mov(T, Src0);
1867     _xor(T, Src1);
1868     _mov(Dest, T);
1869     break;
1870   case InstArithmetic::Sub:
1871     _mov(T, Src0);
1872     _sub(T, Src1);
1873     _mov(Dest, T);
1874     break;
1875   case InstArithmetic::Mul:
1876     if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1877       if (optimizeScalarMul(Dest, Src0, C->getValue()))
1878         return;
1879     }
1880     // The 8-bit version of imul only allows the form "imul r/m8" where T must
1881     // be in al.
1882     if (isByteSizedArithType(Ty)) {
1883       _mov(T, Src0, RegX8664::Reg_al);
1884       Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1885       _imul(T, Src0 == Src1 ? T : Src1);
1886       _mov(Dest, T);
1887     } else if (auto *ImmConst = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1888       T = makeReg(Ty);
1889       Src0 = legalize(Src0, Legal_Reg | Legal_Mem);
1890       _imul_imm(T, Src0, ImmConst);
1891       _mov(Dest, T);
1892     } else {
1893       _mov(T, Src0);
1894       // No need to legalize Src1 to Reg | Mem because the Imm case is handled
1895       // already by the ConstantInteger32 case above.
1896       _imul(T, Src0 == Src1 ? T : Src1);
1897       _mov(Dest, T);
1898     }
1899     break;
1900   case InstArithmetic::Shl:
1901     _mov(T, Src0);
1902     if (!llvm::isa<ConstantInteger32>(Src1) &&
1903         !llvm::isa<ConstantInteger64>(Src1))
1904       Src1 = copyToReg8(Src1, RegX8664::Reg_cl);
1905     _shl(T, Src1);
1906     _mov(Dest, T);
1907     break;
1908   case InstArithmetic::Lshr:
1909     _mov(T, Src0);
1910     if (!llvm::isa<ConstantInteger32>(Src1) &&
1911         !llvm::isa<ConstantInteger64>(Src1))
1912       Src1 = copyToReg8(Src1, RegX8664::Reg_cl);
1913     _shr(T, Src1);
1914     _mov(Dest, T);
1915     break;
1916   case InstArithmetic::Ashr:
1917     _mov(T, Src0);
1918     if (!llvm::isa<ConstantInteger32>(Src1) &&
1919         !llvm::isa<ConstantInteger64>(Src1))
1920       Src1 = copyToReg8(Src1, RegX8664::Reg_cl);
1921     _sar(T, Src1);
1922     _mov(Dest, T);
1923     break;
1924   case InstArithmetic::Udiv: {
1925     // div and idiv are among the few arithmetic operators that do not allow
1926     // an immediate operand.
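    // The hardware divide implicitly takes its dividend in rdx:rax (edx:eax,
    // dx:ax, or ah:al for the narrower types) and writes the quotient to
    // rax/eax/ax/al and the remainder to rdx/edx/dx/ah, which is why the
    // operands are pinned to specific registers below.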
1927     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1928     RegNumT Eax;
1929     RegNumT Edx;
1930     switch (Ty) {
1931     default:
1932       llvm::report_fatal_error("Bad type for udiv");
1933     case IceType_i64:
1934       Eax = RegX8664::Reg_rax;
1935       Edx = RegX8664::Reg_rdx;
1936       break;
1937     case IceType_i32:
1938       Eax = RegX8664::Reg_eax;
1939       Edx = RegX8664::Reg_edx;
1940       break;
1941     case IceType_i16:
1942       Eax = RegX8664::Reg_ax;
1943       Edx = RegX8664::Reg_dx;
1944       break;
1945     case IceType_i8:
1946       Eax = RegX8664::Reg_al;
1947       Edx = RegX8664::Reg_ah;
1948       break;
1949     }
1950     T_edx = makeReg(Ty, Edx);
1951     _mov(T, Src0, Eax);
1952     _mov(T_edx, Ctx->getConstantZero(Ty));
1953     _div(T_edx, Src1, T);
1954     _redefined(Context.insert<InstFakeDef>(T, T_edx));
1955     _mov(Dest, T);
1956   } break;
1957   case InstArithmetic::Sdiv:
1958     // TODO(stichnot): Enable this after doing better performance and cross
1959     // testing.
1960     if (false && Func->getOptLevel() >= Opt_1) {
1961       // Optimize division by constant power of 2, but not for Om1 or O0, just
1962       // to keep things simple there.
1963       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
1964         const int32_t Divisor = C->getValue();
1965         const uint32_t UDivisor = Divisor;
1966         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
1967           uint32_t LogDiv = llvm::Log2_32(UDivisor);
1968           // LLVM does the following for dest=src/(1<<log):
1969           //   t=src
1970           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
1971           //   shr t,typewidth-log
1972           //   add t,src
1973           //   sar t,log
1974           //   dest=t
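          // For example (i32, src = -7, log = 2): sar yields -1, shr of that
          // yields the bias 3, -7 + 3 = -4, and sar(-4, 2) = -1, matching
          // C's truncating division -7 / 4 == -1.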
1975           uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
1976           _mov(T, Src0);
1977           // If for some reason we are dividing by 1, just treat it like an
1978           // assignment.
1979           if (LogDiv > 0) {
1980             // The initial sar is unnecessary when dividing by 2.
1981             if (LogDiv > 1)
1982               _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
1983             _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
1984             _add(T, Src0);
1985             _sar(T, Ctx->getConstantInt(Ty, LogDiv));
1986           }
1987           _mov(Dest, T);
1988           return;
1989         }
1990       }
1991     }
1992     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
1993     switch (Ty) {
1994     default:
1995       llvm::report_fatal_error("Bad type for sdiv");
1996     case IceType_i64:
1997       T_edx = makeReg(Ty, RegX8664::Reg_rdx);
1998       _mov(T, Src0, RegX8664::Reg_rax);
1999       break;
2000     case IceType_i32:
2001       T_edx = makeReg(Ty, RegX8664::Reg_edx);
2002       _mov(T, Src0, RegX8664::Reg_eax);
2003       break;
2004     case IceType_i16:
2005       T_edx = makeReg(Ty, RegX8664::Reg_dx);
2006       _mov(T, Src0, RegX8664::Reg_ax);
2007       break;
2008     case IceType_i8:
2009       T_edx = makeReg(IceType_i16, RegX8664::Reg_ax);
2010       _mov(T, Src0, RegX8664::Reg_al);
2011       break;
2012     }
2013     _cbwdq(T_edx, T);
2014     _idiv(T_edx, Src1, T);
2015     _redefined(Context.insert<InstFakeDef>(T, T_edx));
2016     _mov(Dest, T);
2017     break;
2018   case InstArithmetic::Urem: {
2019     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2020     RegNumT Eax;
2021     RegNumT Edx;
2022     switch (Ty) {
2023     default:
2024       llvm::report_fatal_error("Bad type for urem");
2025     case IceType_i64:
2026       Eax = RegX8664::Reg_rax;
2027       Edx = RegX8664::Reg_rdx;
2028       break;
2029     case IceType_i32:
2030       Eax = RegX8664::Reg_eax;
2031       Edx = RegX8664::Reg_edx;
2032       break;
2033     case IceType_i16:
2034       Eax = RegX8664::Reg_ax;
2035       Edx = RegX8664::Reg_dx;
2036       break;
2037     case IceType_i8:
2038       Eax = RegX8664::Reg_al;
2039       Edx = RegX8664::Reg_ah;
2040       break;
2041     }
2042     T_edx = makeReg(Ty, Edx);
2043     _mov(T_edx, Ctx->getConstantZero(Ty));
2044     _mov(T, Src0, Eax);
2045     _div(T, Src1, T_edx);
2046     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2047     if (Ty == IceType_i8) {
2048       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2049       // moved into a general 8-bit register.
2050       auto *T_AhRcvr = makeReg(Ty);
2051       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2052       _mov(T_AhRcvr, T_edx);
2053       T_edx = T_AhRcvr;
2054     }
2055     _mov(Dest, T_edx);
2056   } break;
2057   case InstArithmetic::Srem: {
2058     // TODO(stichnot): Enable this after doing better performance and cross
2059     // testing.
2060     if (false && Func->getOptLevel() >= Opt_1) {
2061       // Optimize mod by constant power of 2, but not for Om1 or O0, just to
2062       // keep things simple there.
2063       if (auto *C = llvm::dyn_cast<ConstantInteger32>(Src1)) {
2064         const int32_t Divisor = C->getValue();
2065         const uint32_t UDivisor = Divisor;
2066         if (Divisor > 0 && llvm::isPowerOf2_32(UDivisor)) {
2067           uint32_t LogDiv = llvm::Log2_32(UDivisor);
2068           // LLVM does the following for dest=src%(1<<log):
2069           //   t=src
2070           //   sar t,typewidth-1 // -1 if src is negative, 0 if not
2071           //   shr t,typewidth-log
2072           //   add t,src
2073           //   and t, -(1<<log)
2074           //   sub t,src
2075           //   neg t
2076           //   dest=t
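          // For example (i32, src = -7, log = 2): the first three steps
          // produce the biased value -4, the and with -(1<<2) leaves -4,
          // then -4 - (-7) = 3 and neg yields -3, matching C's -7 % 4 == -3.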
2077           uint32_t TypeWidth = X86_CHAR_BIT * typeWidthInBytes(Ty);
2078           // If for some reason we are dividing by 1, just assign 0.
2079           if (LogDiv == 0) {
2080             _mov(Dest, Ctx->getConstantZero(Ty));
2081             return;
2082           }
2083           _mov(T, Src0);
2084           // The initial sar is unnecessary when dividing by 2.
2085           if (LogDiv > 1)
2086             _sar(T, Ctx->getConstantInt(Ty, TypeWidth - 1));
2087           _shr(T, Ctx->getConstantInt(Ty, TypeWidth - LogDiv));
2088           _add(T, Src0);
2089           _and(T, Ctx->getConstantInt(Ty, -(1 << LogDiv)));
2090           _sub(T, Src0);
2091           _neg(T);
2092           _mov(Dest, T);
2093           return;
2094         }
2095       }
2096     }
2097     Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
2098     RegNumT Eax;
2099     RegNumT Edx;
2100     switch (Ty) {
2101     default:
2102       llvm::report_fatal_error("Bad type for srem");
2103     case IceType_i64:
2104       Eax = RegX8664::Reg_rax;
2105       Edx = RegX8664::Reg_rdx;
2106       break;
2107     case IceType_i32:
2108       Eax = RegX8664::Reg_eax;
2109       Edx = RegX8664::Reg_edx;
2110       break;
2111     case IceType_i16:
2112       Eax = RegX8664::Reg_ax;
2113       Edx = RegX8664::Reg_dx;
2114       break;
2115     case IceType_i8:
2116       Eax = RegX8664::Reg_al;
2117       Edx = RegX8664::Reg_ah;
2118       break;
2119     }
2120     T_edx = makeReg(Ty, Edx);
2121     _mov(T, Src0, Eax);
2122     _cbwdq(T_edx, T);
2123     _idiv(T, Src1, T_edx);
2124     _redefined(Context.insert<InstFakeDef>(T_edx, T));
2125     if (Ty == IceType_i8) {
2126       // Register ah must be moved into one of {al,bl,cl,dl} before it can be
2127       // moved into a general 8-bit register.
2128       auto *T_AhRcvr = makeReg(Ty);
2129       T_AhRcvr->setRegClass(RCX86_IsAhRcvr);
2130       _mov(T_AhRcvr, T_edx);
2131       T_edx = T_AhRcvr;
2132     }
2133     _mov(Dest, T_edx);
2134   } break;
2135   case InstArithmetic::Fadd:
2136     _mov(T, Src0);
2137     _addss(T, Src1);
2138     _mov(Dest, T);
2139     break;
2140   case InstArithmetic::Fsub:
2141     _mov(T, Src0);
2142     _subss(T, Src1);
2143     _mov(Dest, T);
2144     break;
2145   case InstArithmetic::Fmul:
2146     _mov(T, Src0);
2147     _mulss(T, Src0 == Src1 ? T : Src1);
2148     _mov(Dest, T);
2149     break;
2150   case InstArithmetic::Fdiv:
2151     _mov(T, Src0);
2152     _divss(T, Src1);
2153     _mov(Dest, T);
2154     break;
2155   case InstArithmetic::Frem:
2156     llvm::report_fatal_error("Helper call was expected");
2157     break;
2158   }
2159 }
2160 
2161 void TargetX8664::lowerAssign(const InstAssign *Instr) {
2162   Variable *Dest = Instr->getDest();
2163   if (Dest->isRematerializable()) {
2164     Context.insert<InstFakeDef>(Dest);
2165     return;
2166   }
2167   Operand *Src = Instr->getSrc(0);
2168   assert(Dest->getType() == Src->getType());
2169   lowerMove(Dest, Src, false);
2170 }
2171 
2172 void TargetX8664::lowerBr(const InstBr *Br) {
2173   if (Br->isUnconditional()) {
2174     _br(Br->getTargetUnconditional());
2175     return;
2176   }
2177   Operand *Cond = Br->getCondition();
2178 
2179   // Handle folding opportunities.
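  // For example, when the branch condition is produced by a single-use icmp,
  // the (already deleted) icmp producer is lowered fused with this branch as
  // a cmp + jcc pair instead of first materializing a boolean register.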
2180   if (const Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
2181     assert(Producer->isDeleted());
2182     switch (BoolFolding::getProducerKind(Producer)) {
2183     default:
2184       break;
2185     case BoolFolding::PK_Icmp32:
2186     case BoolFolding::PK_Icmp64: {
2187       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Br);
2188       return;
2189     }
2190     case BoolFolding::PK_Fcmp: {
2191       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Br);
2192       return;
2193     }
2194     case BoolFolding::PK_Arith: {
2195       lowerArithAndConsumer(llvm::cast<InstArithmetic>(Producer), Br);
2196       return;
2197     }
2198     }
2199   }
2200   Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
2201   Constant *Zero = Ctx->getConstantZero(IceType_i32);
2202   _cmp(Src0, Zero);
2203   _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
2204 }
2205 
2206 // constexprMax returns a (constexpr) max(S0, S1), and it is used for defining
2207 // OperandList in lowerCall. std::max() is not constexpr until C++14.
2208 inline constexpr SizeT constexprMax(SizeT S0, SizeT S1) {
2209   return S0 < S1 ? S1 : S0;
2210 }
2211 
2212 void TargetX8664::lowerCall(const InstCall *Instr) {
2213   // Common x86-64 calling convention lowering:
2214   //
2215   // * At the point before the call, the stack must be aligned to 16 bytes.
2216   //
2217   // * Non-register arguments are pushed onto the stack in right-to-left order,
2218   // such that the left-most argument ends up on the top of the stack at the
2219   // lowest memory address.
2220   //
2221   // * Stack arguments of vector type are aligned to start at the next highest
2222   // multiple of 16 bytes. Other stack arguments are aligned to the next word
2223   // size boundary (8 bytes on x86-64).
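  //
  // For example (an illustrative sketch, ignoring the Win64 shadow store):
  // passing {i64 a, v4f32 b, i64 c} entirely on the stack places a at offset
  // 0, b at offset 16 (aligned up from 8), and c at offset 32, giving a
  // 40-byte parameter area that is then rounded up to 48 for 16-byte
  // alignment.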
2224 
2225   constexpr SizeT MaxOperands =
2226       constexprMax(RegX8664::X86_MAX_XMM_ARGS, RegX8664::X86_MAX_GPR_ARGS);
2227   using OperandList = llvm::SmallVector<Operand *, MaxOperands>;
2228 
2229   OperandList XmmArgs;
2230   llvm::SmallVector<SizeT, MaxOperands> XmmArgIndices;
2231   CfgVector<std::pair<const Type, Operand *>> GprArgs;
2232   CfgVector<SizeT> GprArgIndices;
2233   OperandList StackArgs, StackArgLocations;
2234   uint32_t ParameterAreaSizeBytes = 0;
2235 
2236   ParameterAreaSizeBytes += getShadowStoreSize();
2237 
2238   // Classify each argument operand according to the location where the argument
2239   // is passed.
2240   for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
2241     Operand *Arg = Instr->getArg(i);
2242     const Type Ty = Arg->getType();
2243     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
2244     assert(typeWidthInBytes(Ty) >= 4);
2245     if (isVectorType(Ty) && RegX8664::getRegisterForXmmArgNum(
2246                                 RegX8664::getArgIndex(i, XmmArgs.size()))
2247                                 .hasValue()) {
2248       XmmArgs.push_back(Arg);
2249       XmmArgIndices.push_back(i);
2250     } else if (isScalarFloatingType(Ty) &&
2251                RegX8664::getRegisterForXmmArgNum(
2252                    RegX8664::getArgIndex(i, XmmArgs.size()))
2253                    .hasValue()) {
2254       XmmArgs.push_back(Arg);
2255       XmmArgIndices.push_back(i);
2256     } else if (isScalarIntegerType(Ty) &&
2257                RegX8664::getRegisterForGprArgNum(
2258                    Ty, RegX8664::getArgIndex(i, GprArgs.size()))
2259                    .hasValue()) {
2260       GprArgs.emplace_back(Ty, Arg);
2261       GprArgIndices.push_back(i);
2262     } else {
2263       // Place on stack.
2264       StackArgs.push_back(Arg);
2265       if (isVectorType(Arg->getType())) {
2266         ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
2267       }
2268       Variable *esp = getPhysicalRegister(getStackReg(), WordType);
2269       Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
2270       StackArgLocations.push_back(X86OperandMem::create(Func, Ty, esp, Loc));
2271       ParameterAreaSizeBytes += typeWidthInBytesOnStack(Arg->getType());
2272     }
2273   }
2274   // Adjust the parameter area so that the stack is aligned. It is assumed that
2275   // the stack is already aligned at the start of the calling sequence.
2276   ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
2277   assert(ParameterAreaSizeBytes <= maxOutArgsSizeBytes());
2278   // Copy arguments that are passed on the stack to the appropriate stack
2279   // locations.  We make sure legalize() is called on each argument at this
2280   // point, to allow availabilityGet() to work.
2281   for (SizeT i = 0, NumStackArgs = StackArgs.size(); i < NumStackArgs; ++i) {
2282     lowerStore(
2283         InstStore::create(Func, legalize(StackArgs[i]), StackArgLocations[i]));
2284   }
2285   // Copy arguments to be passed in registers to the appropriate registers.
2286   for (SizeT i = 0, NumXmmArgs = XmmArgs.size(); i < NumXmmArgs; ++i) {
2287     XmmArgs[i] = legalizeToReg(legalize(XmmArgs[i]),
2288                                RegX8664::getRegisterForXmmArgNum(
2289                                    RegX8664::getArgIndex(XmmArgIndices[i], i)));
2290   }
2291   // Materialize moves for arguments passed in GPRs.
2292   for (SizeT i = 0, NumGprArgs = GprArgs.size(); i < NumGprArgs; ++i) {
2293     const Type SignatureTy = GprArgs[i].first;
2294     Operand *Arg =
2295         legalize(GprArgs[i].second, Legal_Default | Legal_Rematerializable);
2296     GprArgs[i].second = legalizeToReg(
2297         Arg, RegX8664::getRegisterForGprArgNum(
2298                  Arg->getType(), RegX8664::getArgIndex(GprArgIndices[i], i)));
2299     assert(SignatureTy == IceType_i64 || SignatureTy == IceType_i32);
2300     assert(SignatureTy == Arg->getType());
2301     (void)SignatureTy;
2302   }
2303   // Generate a FakeUse of register arguments so that they do not get dead code
2304   // eliminated as a result of the FakeKill of scratch registers after the call.
2305   // These need to be right before the call instruction.
2306   for (auto *Arg : XmmArgs) {
2307     Context.insert<InstFakeUse>(llvm::cast<Variable>(Arg));
2308   }
2309   for (auto &ArgPair : GprArgs) {
2310     Context.insert<InstFakeUse>(llvm::cast<Variable>(ArgPair.second));
2311   }
2312   // Generate the call instruction. Assign its result to a temporary with high
2313   // register allocation weight.
2314   Variable *Dest = Instr->getDest();
2315   const Type DestTy = Dest ? Dest->getType() : IceType_void;
2316   Variable *ReturnReg = nullptr;
2317   if (Dest) {
2318     switch (DestTy) {
2319     case IceType_NUM:
2320     case IceType_void:
2321     case IceType_i1:
2322     case IceType_i8:
2323     case IceType_i16:
2324       llvm::report_fatal_error("Invalid Call dest type");
2325       break;
2326     case IceType_i32:
2327       ReturnReg = makeReg(DestTy, RegX8664::Reg_eax);
2328       break;
2329     case IceType_i64:
2330       ReturnReg = makeReg(IceType_i64, RegX8664::Reg_rax);
2332       break;
2333     case IceType_f32:
2334     case IceType_f64:
2335     case IceType_v4i1:
2336     case IceType_v8i1:
2337     case IceType_v16i1:
2338     case IceType_v16i8:
2339     case IceType_v8i16:
2340     case IceType_v4i32:
2341     case IceType_v4f32:
2342       ReturnReg = makeReg(DestTy, RegX8664::Reg_xmm0);
2343       break;
2344     }
2345   }
2346   // Emit the call to the function.
2347   Operand *CallTarget =
2348       legalize(Instr->getCallTarget(), Legal_Reg | Legal_Imm | Legal_AddrAbs);
2349   size_t NumVariadicFpArgs = Instr->isVariadic() ? XmmArgs.size() : 0;
2350   Inst *NewCall = emitCallToTarget(CallTarget, ReturnReg, NumVariadicFpArgs);
2351   // Mark the call as killing all the caller-save registers.
2352   Context.insert<InstFakeKill>(NewCall);
2353   // Generate a FakeUse to keep the call live if necessary.
2354   if (Instr->hasSideEffects() && ReturnReg) {
2355     Context.insert<InstFakeUse>(ReturnReg);
2356   }
2357   // Process the return value, if any.
2358   if (Dest == nullptr)
2359     return;
2360   // Assign the result of the call to Dest.  Route it through a temporary so
2361   // that the local register availability peephole can be subsequently used.
2362   Variable *Tmp = nullptr;
2363   if (isVectorType(DestTy)) {
2364     assert(ReturnReg && "Vector type requires a return register");
2365     Tmp = makeReg(DestTy);
2366     _movp(Tmp, ReturnReg);
2367     _movp(Dest, Tmp);
2368   } else if (isScalarFloatingType(DestTy)) {
2369     assert(ReturnReg && "FP type requires a return register");
2370     _mov(Tmp, ReturnReg);
2371     _mov(Dest, Tmp);
2372   } else {
2373     assert(isScalarIntegerType(DestTy));
2374     assert(ReturnReg && "Integer type requires a return register");
2375     _mov(Tmp, ReturnReg);
2376     _mov(Dest, Tmp);
2377   }
2378 }
2379 
2380 void TargetX8664::lowerCast(const InstCast *Instr) {
2381   // a = cast(b) ==> t=cast(b); a=t; (link t->b, link a->t, no overlap)
2382   InstCast::OpKind CastKind = Instr->getCastKind();
2383   Variable *Dest = Instr->getDest();
2384   Type DestTy = Dest->getType();
2385   switch (CastKind) {
2386   default:
2387     Func->setError("Cast type not supported");
2388     return;
2389   case InstCast::Sext: {
2390     // Src0RM is the source operand legalized to physical register or memory,
2391     // but not immediate, since the relevant x86 native instructions don't
2392     // allow an immediate operand. If the operand is an immediate, we could
2393     // consider computing the strength-reduced result at translation time, but
2394     // we're unlikely to see something like that in the bitcode that the
2395     // optimizer wouldn't have already taken care of.
2396     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2397     if (isVectorType(DestTy)) {
2398       if (DestTy == IceType_v16i8) {
2399         // onemask = materialize(1,1,...); dst = (src & onemask) > 0
2400         Variable *OneMask = makeVectorOfOnes(DestTy);
2401         Variable *T = makeReg(DestTy);
2402         _movp(T, Src0RM);
2403         _pand(T, OneMask);
2404         Variable *Zeros = makeVectorOfZeros(DestTy);
2405         _pcmpgt(T, Zeros);
2406         _movp(Dest, T);
2407       } else {
2408         // width = width(elty) - 1; dest = (src << width) >> width
2409         SizeT ShiftAmount =
2410             X86_CHAR_BIT * typeWidthInBytes(typeElementType(DestTy)) - 1;
2411         Constant *ShiftConstant = Ctx->getConstantInt8(ShiftAmount);
2412         Variable *T = makeReg(DestTy);
2413         _movp(T, Src0RM);
2414         _psll(T, ShiftConstant);
2415         _psra(T, ShiftConstant);
2416         _movp(Dest, T);
2417       }
2418     } else if (Src0RM->getType() == IceType_i1) {
2419       // t1 = src
2420       // shl t1, dst_bitwidth - 1
2421       // sar t1, dst_bitwidth - 1
2422       // dst = t1
2423       size_t DestBits = X86_CHAR_BIT * typeWidthInBytes(DestTy);
2424       Constant *ShiftAmount = Ctx->getConstantInt32(DestBits - 1);
2425       Variable *T = makeReg(DestTy);
2426       if (typeWidthInBytes(DestTy) <= typeWidthInBytes(Src0RM->getType())) {
2427         _mov(T, Src0RM);
2428       } else {
2429         // Widen the source using movsx or movzx. (It doesn't matter which one,
2430         // since the following shl/sar overwrite the bits.)
2431         _movzx(T, Src0RM);
2432       }
2433       _shl(T, ShiftAmount);
2434       _sar(T, ShiftAmount);
2435       _mov(Dest, T);
2436     } else {
2437       // t1 = movsx src; dst = t1
2438       Variable *T = makeReg(DestTy);
2439       _movsx(T, Src0RM);
2440       _mov(Dest, T);
2441     }
2442     break;
2443   }
2444   case InstCast::Zext: {
2445     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2446     if (isVectorType(DestTy)) {
2447       // onemask = materialize(1,1,...); dest = onemask & src
2448       Variable *OneMask = makeVectorOfOnes(DestTy);
2449       Variable *T = makeReg(DestTy);
2450       _movp(T, Src0RM);
2451       _pand(T, OneMask);
2452       _movp(Dest, T);
2453     } else if (Src0RM->getType() == IceType_i1) {
2454       // t = Src0RM; Dest = t
2455       Variable *T = nullptr;
2456       if (DestTy == IceType_i8) {
2457         _mov(T, Src0RM);
2458       } else {
2459         assert(DestTy != IceType_i1);
2460         // Use 32-bit for both 16-bit and 32-bit, since 32-bit ops are shorter.
2461         // In x86-64 we need to widen T to 64 bits to ensure that T, if
2462         // written to the stack (i.e., in -Om1), will be fully zero-extended.
2463         T = makeReg(DestTy == IceType_i64 ? IceType_i64 : IceType_i32);
2464         _movzx(T, Src0RM);
2465       }
2466       _mov(Dest, T);
2467     } else {
2468       // t1 = movzx src; dst = t1
2469       Variable *T = makeReg(DestTy);
2470       _movzx(T, Src0RM);
2471       _mov(Dest, T);
2472     }
2473     break;
2474   }
2475   case InstCast::Trunc: {
2476     if (isVectorType(DestTy)) {
2477       // onemask = materialize(1,1,...); dst = src & onemask
2478       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2479       Type Src0Ty = Src0RM->getType();
2480       Variable *OneMask = makeVectorOfOnes(Src0Ty);
2481       Variable *T = makeReg(DestTy);
2482       _movp(T, Src0RM);
2483       _pand(T, OneMask);
2484       _movp(Dest, T);
2485     } else if (DestTy == IceType_i1 || DestTy == IceType_i8) {
2486       // Make sure we truncate from and into valid registers.
2487       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2488       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2489       Variable *T = copyToReg8(Src0RM);
2490       if (DestTy == IceType_i1)
2491         _and(T, Ctx->getConstantInt1(1));
2492       _mov(Dest, T);
2493     } else {
2494       Operand *Src0 = legalizeUndef(Instr->getSrc(0));
2495       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2496       // t1 = trunc Src0RM; Dest = t1
2497       Variable *T = makeReg(DestTy);
2498       _mov(T, Src0RM);
2499       _mov(Dest, T);
2500     }
2501     break;
2502   }
2503   case InstCast::Fptrunc:
2504   case InstCast::Fpext: {
2505     Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2506     // t1 = cvt Src0RM; Dest = t1
2507     Variable *T = makeReg(DestTy);
2508     _cvt(T, Src0RM, Insts::Cvt::Float2float);
2509     _mov(Dest, T);
2510     break;
2511   }
2512   case InstCast::Fptosi:
2513     if (isVectorType(DestTy)) {
2514       assert(DestTy == IceType_v4i32);
2515       assert(Instr->getSrc(0)->getType() == IceType_v4f32);
2516       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
2517       Variable *T = makeReg(DestTy);
2518       _cvt(T, Src0R, Insts::Cvt::Tps2dq);
2519       _movp(Dest, T);
2520     } else {
2521       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2522       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
2523       Variable *T_1 = nullptr;
2524       if (DestTy == IceType_i64) {
2525         T_1 = makeReg(IceType_i64);
2526       } else {
2527         assert(DestTy != IceType_i64);
2528         T_1 = makeReg(IceType_i32);
2529       }
2530       // cvt() requires its integer argument to be a GPR.
2531       Variable *T_2 = makeReg(DestTy);
2532       if (isByteSizedType(DestTy)) {
2533         assert(T_1->getType() == IceType_i32);
2534         T_1->setRegClass(RCX86_Is32To8);
2535         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2536       }
2537       _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
2538       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2539       if (DestTy == IceType_i1)
2540         _and(T_2, Ctx->getConstantInt1(1));
2541       _mov(Dest, T_2);
2542     }
2543     break;
2544   case InstCast::Fptoui:
2545     if (isVectorType(DestTy)) {
2546       llvm::report_fatal_error("Helper call was expected");
2547     } else if (DestTy == IceType_i64) {
2548       llvm::report_fatal_error("Helper call was expected");
2549     } else {
2550       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2551       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
2552       assert(DestTy != IceType_i64);
2553       Variable *T_1 = nullptr;
2554       if (DestTy == IceType_i32) {
2555         T_1 = makeReg(IceType_i64);
2556       } else {
2557         assert(DestTy != IceType_i32);
2558         T_1 = makeReg(IceType_i32);
2559       }
2560       Variable *T_2 = makeReg(DestTy);
2561       if (isByteSizedType(DestTy)) {
2562         assert(T_1->getType() == IceType_i32);
2563         T_1->setRegClass(RCX86_Is32To8);
2564         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
2565       }
2566       _cvt(T_1, Src0RM, Insts::Cvt::Tss2si);
2567       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
2568       if (DestTy == IceType_i1)
2569         _and(T_2, Ctx->getConstantInt1(1));
2570       _mov(Dest, T_2);
2571     }
2572     break;
2573   case InstCast::Sitofp:
2574     if (isVectorType(DestTy)) {
2575       assert(DestTy == IceType_v4f32);
2576       assert(Instr->getSrc(0)->getType() == IceType_v4i32);
2577       Operand *Src0R = legalizeToReg(Instr->getSrc(0));
2578       Variable *T = makeReg(DestTy);
2579       _cvt(T, Src0R, Insts::Cvt::Dq2ps);
2580       _movp(Dest, T);
2581     } else {
2582       Operand *Src0RM = legalize(Instr->getSrc(0), Legal_Reg | Legal_Mem);
2583       // Sign-extend the operand.
2584       // t1.i32 = movsx Src0RM; t2 = Cvt t1.i32; Dest = t2
2585       Variable *T_1 = nullptr;
2586       if (Src0RM->getType() == IceType_i64) {
2587         T_1 = makeReg(IceType_i64);
2588       } else {
2589         assert(Src0RM->getType() != IceType_i64);
2590         T_1 = makeReg(IceType_i32);
2591       }
2592       Variable *T_2 = makeReg(DestTy);
2593       if (Src0RM->getType() == T_1->getType())
2594         _mov(T_1, Src0RM);
2595       else
2596         _movsx(T_1, Src0RM);
2597       _cvt(T_2, T_1, Insts::Cvt::Si2ss);
2598       _mov(Dest, T_2);
2599     }
2600     break;
2601   case InstCast::Uitofp: {
2602     Operand *Src0 = Instr->getSrc(0);
2603     if (isVectorType(Src0->getType())) {
2604       llvm::report_fatal_error("Helper call was expected");
2605     } else if (Src0->getType() == IceType_i64) {
2606       llvm::report_fatal_error("Helper call was expected");
2607     } else {
2608       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2609       // Zero-extend the operand.
2610       // t1.i32 = movzx Src0RM; t2 = Cvt t1.i32; Dest = t2
2611       Variable *T_1 = nullptr;
2612       if (Src0RM->getType() == IceType_i32) {
2613         T_1 = makeReg(IceType_i64);
2614       } else {
2615         assert(Src0RM->getType() != IceType_i64);
2616         T_1 = makeReg(IceType_i32);
2617       }
2618       Variable *T_2 = makeReg(DestTy);
2619       if (Src0RM->getType() == T_1->getType())
2620         _mov(T_1, Src0RM);
2621       else
2622         _movzx(T_1, Src0RM)->setMustKeep();
2623       _cvt(T_2, T_1, Insts::Cvt::Si2ss);
2624       _mov(Dest, T_2);
2625     }
2626     break;
2627   }
2628   case InstCast::Bitcast: {
2629     Operand *Src0 = Instr->getSrc(0);
2630     if (DestTy == Src0->getType()) {
2631       auto *Assign = InstAssign::create(Func, Dest, Src0);
2632       lowerAssign(Assign);
2633       return;
2634     }
2635     switch (DestTy) {
2636     default:
2637       llvm_unreachable("Unexpected Bitcast dest type");
2638     case IceType_i8: {
2639       llvm::report_fatal_error("Helper call was expected");
2640     } break;
2641     case IceType_i16: {
2642       llvm::report_fatal_error("Helper call was expected");
2643     } break;
2644     case IceType_i32:
2645     case IceType_f32: {
2646       Variable *Src0R = legalizeToReg(Src0);
2647       Variable *T = makeReg(DestTy);
2648       _movd(T, Src0R);
2649       _mov(Dest, T);
2650     } break;
2651     case IceType_i64: {
2652       assert(Src0->getType() == IceType_f64);
2653       Variable *Src0R = legalizeToReg(Src0);
2654       Variable *T = makeReg(IceType_i64);
2655       _movd(T, Src0R);
2656       _mov(Dest, T);
2657     } break;
2658     case IceType_f64: {
2659       assert(Src0->getType() == IceType_i64);
2660       Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2661       Variable *T = makeReg(IceType_f64);
2662       _movd(T, Src0RM);
2663       _mov(Dest, T);
2664     } break;
2665     case IceType_v8i1: {
2666       llvm::report_fatal_error("Helper call was expected");
2667     } break;
2668     case IceType_v16i1: {
2669       llvm::report_fatal_error("Helper call was expected");
2670     } break;
2671     case IceType_v8i16:
2672     case IceType_v16i8:
2673     case IceType_v4i32:
2674     case IceType_v4f32: {
2675       if (Src0->getType() == IceType_i32) {
2676         // Bitcast requires equal type sizes, which isn't strictly the case
2677         // between scalars and vectors, but to emulate v4i8 vectors one has to
2678         // use v16i8 vectors.
2679         Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2680         Variable *T = makeReg(DestTy);
2681         _movd(T, Src0RM);
2682         _mov(Dest, T);
2683       } else {
2684         _movp(Dest, legalizeToReg(Src0));
2685       }
2686     } break;
2687     }
2688     break;
2689   }
2690   }
2691 }
2692 
2693 void TargetX8664::lowerExtractElement(const InstExtractElement *Instr) {
2694   Operand *SourceVectNotLegalized = Instr->getSrc(0);
2695   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(1));
2696   // Only constant indices are allowed in PNaCl IR.
2697   assert(ElementIndex);
2698 
2699   unsigned Index = ElementIndex->getValue();
2700   Type Ty = SourceVectNotLegalized->getType();
2701   Type ElementTy = typeElementType(Ty);
2702   Type InVectorElementTy = InstX86Base::getInVectorElementType(Ty);
2703 
2704   // TODO(wala): Determine the best lowering sequences for each type.
2705   bool CanUsePextr = Ty == IceType_v8i16 || Ty == IceType_v8i1 ||
2706                      (InstructionSet >= SSE4_1 && Ty != IceType_v4f32);
2707   Variable *ExtractedElementR =
2708       makeReg(CanUsePextr ? IceType_i32 : InVectorElementTy);
2709   if (CanUsePextr) {
2710     // Use pextrb, pextrw, or pextrd.  The "b" and "w" versions clear the upper
2711     // bits of the destination register, so we represent this by always
2712     // extracting into an i32 register.  The _mov into Dest below will do
2713     // truncation as necessary.
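    // For example (a sketch): extracting element 1 of a v8i16 becomes
    // "pextrw r32, xmm, 1", leaving the zero-extended element in a 32-bit
    // GPR that the final _mov below truncates into Dest as needed.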
2714     Constant *Mask = Ctx->getConstantInt32(Index);
2715     Variable *SourceVectR = legalizeToReg(SourceVectNotLegalized);
2716     _pextr(ExtractedElementR, SourceVectR, Mask);
2717   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
2718     // Use pshufd and movd/movss.
2719     Variable *T = nullptr;
2720     if (Index) {
2721       // The shuffle only needs to occur if the element to be extracted is not
2722       // at the lowest index.
2723       Constant *Mask = Ctx->getConstantInt32(Index);
2724       T = makeReg(Ty);
2725       _pshufd(T, legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem), Mask);
2726     } else {
2727       T = legalizeToReg(SourceVectNotLegalized);
2728     }
2729 
2730     if (InVectorElementTy == IceType_i32) {
2731       _movd(ExtractedElementR, T);
2732     } else { // Ty == IceType_f32
2733       // TODO(wala): _movss is only used here because _mov does not allow a
2734       // vector source and a scalar destination. It should be possible to use
2735       // _mov here.
2736       // _movss is a binary instruction, so the FakeDef is needed to keep the
2737       // live range analysis consistent.
2738       Context.insert<InstFakeDef>(ExtractedElementR);
2739       _movss(ExtractedElementR, T);
2740     }
2741   } else {
2742     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
2743     // Spill the value to a stack slot and do the extraction in memory.
2744     //
2745     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
2746     // for legalizing to mem is implemented.
2747     Variable *Slot = Func->makeVariable(Ty);
2748     Slot->setMustNotHaveReg();
2749     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
2750 
2751     // Compute the location of the element in memory.
2752     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
2753     X86OperandMem *Loc =
2754         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
2755     _mov(ExtractedElementR, Loc);
2756   }
2757 
2758   if (ElementTy == IceType_i1) {
2759     // Truncate extracted integers to i1s if necessary.
2760     Variable *T = makeReg(IceType_i1);
2761     InstCast *Cast =
2762         InstCast::create(Func, InstCast::Trunc, T, ExtractedElementR);
2763     lowerCast(Cast);
2764     ExtractedElementR = T;
2765   }
2766 
2767   // Copy the element to the destination.
2768   Variable *Dest = Instr->getDest();
2769   _mov(Dest, ExtractedElementR);
2770 }
2771 
2772 void TargetX8664::lowerFcmp(const InstFcmp *Fcmp) {
2773   Variable *Dest = Fcmp->getDest();
2774 
2775   if (isVectorType(Dest->getType())) {
2776     lowerFcmpVector(Fcmp);
2777   } else {
2778     constexpr Inst *Consumer = nullptr;
2779     lowerFcmpAndConsumer(Fcmp, Consumer);
2780   }
2781 }
2782 
2783 void TargetX8664::lowerFcmpAndConsumer(const InstFcmp *Fcmp,
2784                                        const Inst *Consumer) {
2785   Operand *Src0 = Fcmp->getSrc(0);
2786   Operand *Src1 = Fcmp->getSrc(1);
2787   Variable *Dest = Fcmp->getDest();
2788 
2789   if (Consumer != nullptr) {
2790     if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
2791       if (lowerOptimizeFcmpSelect(Fcmp, Select))
2792         return;
2793     }
2794   }
2795 
2796   if (isVectorType(Dest->getType())) {
2797     lowerFcmp(Fcmp);
2798     if (Consumer != nullptr)
2799       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
2800     return;
2801   }
2802 
2803   // Lowering a = fcmp cond, b, c
2804   //   ucomiss b, c       /* only if C1 != Br_None */
2805   //                      /* but swap b,c order if SwapOperands==true */
2806   //   mov a, <default>
2807   //   j<C1> label        /* only if C1 != Br_None */
2808   //   j<C2> label        /* only if C2 != Br_None */
2809   //   FakeUse(a)         /* only if C1 != Br_None */
2810   //   mov a, !<default>  /* only if C1 != Br_None */
2811   //   label:             /* only if C1 != Br_None */
2812   //
2813   // setcc lowering when C1 != Br_None && C2 == Br_None:
2814   //   ucomiss b, c       /* but swap b,c order if SwapOperands==true */
2815   //   setcc a, C1
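  //
  // For example (a sketch, assuming TableFcmp maps "ogt" to condition "a"
  // with no operand swap and no C2):
  //   a = fcmp ogt b, c   ==>   ucomiss b, c ; seta a
  // which is correct because "a" (CF == 0 and ZF == 0) is false whenever
  // ucomiss reports unordered.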
2816   InstFcmp::FCond Condition = Fcmp->getCondition();
2817   assert(static_cast<size_t>(Condition) < TableFcmpSize);
2818   if (TableFcmp[Condition].SwapScalarOperands)
2819     std::swap(Src0, Src1);
2820   const bool HasC1 = (TableFcmp[Condition].C1 != CondX86::Br_None);
2821   const bool HasC2 = (TableFcmp[Condition].C2 != CondX86::Br_None);
2822   if (HasC1) {
2823     Src0 = legalize(Src0);
2824     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
2825     Variable *T = nullptr;
2826     _mov(T, Src0);
2827     _ucomiss(T, Src1RM);
2828     if (!HasC2) {
2829       assert(TableFcmp[Condition].Default);
2830       setccOrConsumer(TableFcmp[Condition].C1, Dest, Consumer);
2831       return;
2832     }
2833   }
2834   int32_t IntDefault = TableFcmp[Condition].Default;
2835   if (Consumer == nullptr) {
2836     Constant *Default = Ctx->getConstantInt(Dest->getType(), IntDefault);
2837     _mov(Dest, Default);
2838     if (HasC1) {
2839       InstX86Label *Label = InstX86Label::create(Func, this);
2840       _br(TableFcmp[Condition].C1, Label);
2841       if (HasC2) {
2842         _br(TableFcmp[Condition].C2, Label);
2843       }
2844       Constant *NonDefault = Ctx->getConstantInt(Dest->getType(), !IntDefault);
2845       _redefined(_mov(Dest, NonDefault));
2846       Context.insert(Label);
2847     }
2848     return;
2849   }
2850   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
2851     CfgNode *TrueSucc = Br->getTargetTrue();
2852     CfgNode *FalseSucc = Br->getTargetFalse();
2853     if (IntDefault != 0)
2854       std::swap(TrueSucc, FalseSucc);
2855     if (HasC1) {
2856       _br(TableFcmp[Condition].C1, FalseSucc);
2857       if (HasC2) {
2858         _br(TableFcmp[Condition].C2, FalseSucc);
2859       }
2860       _br(TrueSucc);
2861       return;
2862     }
2863     _br(FalseSucc);
2864     return;
2865   }
2866   if (auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
2867     Operand *SrcT = Select->getTrueOperand();
2868     Operand *SrcF = Select->getFalseOperand();
2869     Variable *SelectDest = Select->getDest();
2870     if (IntDefault != 0)
2871       std::swap(SrcT, SrcF);
2872     lowerMove(SelectDest, SrcF, false);
2873     if (HasC1) {
2874       InstX86Label *Label = InstX86Label::create(Func, this);
2875       _br(TableFcmp[Condition].C1, Label);
2876       if (HasC2) {
2877         _br(TableFcmp[Condition].C2, Label);
2878       }
2879       static constexpr bool IsRedefinition = true;
2880       lowerMove(SelectDest, SrcT, IsRedefinition);
2881       Context.insert(Label);
2882     }
2883     return;
2884   }
2885   llvm::report_fatal_error("Unexpected consumer type");
2886 }
2887 
2888 void TargetX8664::lowerFcmpVector(const InstFcmp *Fcmp) {
2889   Operand *Src0 = Fcmp->getSrc(0);
2890   Operand *Src1 = Fcmp->getSrc(1);
2891   Variable *Dest = Fcmp->getDest();
2892 
2893   if (!isVectorType(Dest->getType()))
2894     llvm::report_fatal_error("Expected vector compare");
2895 
2896   InstFcmp::FCond Condition = Fcmp->getCondition();
2897   assert(static_cast<size_t>(Condition) < TableFcmpSize);
2898 
2899   if (TableFcmp[Condition].SwapVectorOperands)
2900     std::swap(Src0, Src1);
2901 
2902   Variable *T = nullptr;
2903 
2904   if (Condition == InstFcmp::True) {
2905     // makeVectorOfMinusOnes() requires an integer vector type.
2906     T = makeVectorOfMinusOnes(IceType_v4i32);
2907   } else if (Condition == InstFcmp::False) {
2908     T = makeVectorOfZeros(Dest->getType());
2909   } else {
2910     Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
2911     Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
2912     if (llvm::isa<X86OperandMem>(Src1RM))
2913       Src1RM = legalizeToReg(Src1RM);
2914 
2915     switch (Condition) {
2916     default: {
2917       const CmppsCond Predicate = TableFcmp[Condition].Predicate;
2918       assert(Predicate != CondX86::Cmpps_Invalid);
2919       T = makeReg(Src0RM->getType());
2920       _movp(T, Src0RM);
2921       _cmpps(T, Src1RM, Predicate);
2922     } break;
2923     case InstFcmp::One: {
2924       // Check both unequal and ordered.
2925       T = makeReg(Src0RM->getType());
2926       Variable *T2 = makeReg(Src0RM->getType());
2927       _movp(T, Src0RM);
2928       _cmpps(T, Src1RM, CondX86::Cmpps_neq);
2929       _movp(T2, Src0RM);
2930       _cmpps(T2, Src1RM, CondX86::Cmpps_ord);
2931       _pand(T, T2);
2932     } break;
2933     case InstFcmp::Ueq: {
2934       // Check both equal or unordered.
2935       T = makeReg(Src0RM->getType());
2936       Variable *T2 = makeReg(Src0RM->getType());
2937       _movp(T, Src0RM);
2938       _cmpps(T, Src1RM, CondX86::Cmpps_eq);
2939       _movp(T2, Src0RM);
2940       _cmpps(T2, Src1RM, CondX86::Cmpps_unord);
2941       _por(T, T2);
2942     } break;
2943     }
2944   }
2945 
2946   assert(T != nullptr);
2947   _movp(Dest, T);
2948   eliminateNextVectorSextInstruction(Dest);
2949 }
2950 
2951 inline bool isZero(const Operand *Opnd) {
2952   if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Opnd))
2953     return C64->getValue() == 0;
2954   if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(Opnd))
2955     return C32->getValue() == 0;
2956   return false;
2957 }
2958 
2959 void TargetX8664::lowerIcmpAndConsumer(const InstIcmp *Icmp,
2960                                        const Inst *Consumer) {
2961   Operand *Src0 = legalize(Icmp->getSrc(0));
2962   Operand *Src1 = legalize(Icmp->getSrc(1));
2963   Variable *Dest = Icmp->getDest();
2964 
2965   if (isVectorType(Dest->getType())) {
2966     lowerIcmp(Icmp);
2967     if (Consumer != nullptr)
2968       lowerSelectVector(llvm::cast<InstSelect>(Consumer));
2969     return;
2970   }
2971 
2972   // cmp b, c
2973   if (isZero(Src1)) {
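    // Unsigned comparisons against zero have constant answers: "x uge 0" is
    // always true and "x ult 0" is always false, so no cmp is needed.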
2974     switch (Icmp->getCondition()) {
2975     default:
2976       break;
2977     case InstIcmp::Uge:
2978       movOrConsumer(true, Dest, Consumer);
2979       return;
2980     case InstIcmp::Ult:
2981       movOrConsumer(false, Dest, Consumer);
2982       return;
2983     }
2984   }
2985   Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
2986   _cmp(Src0RM, Src1);
2987   setccOrConsumer(getIcmp32Mapping(Icmp->getCondition()), Dest, Consumer);
2988 }
2989 
2990 void TargetX8664::lowerIcmpVector(const InstIcmp *Icmp) {
2991   Operand *Src0 = legalize(Icmp->getSrc(0));
2992   Operand *Src1 = legalize(Icmp->getSrc(1));
2993   Variable *Dest = Icmp->getDest();
2994 
2995   if (!isVectorType(Dest->getType()))
2996     llvm::report_fatal_error("Expected a vector compare");
2997 
2998   Type Ty = Src0->getType();
2999   // Promote i1 vectors to 128-bit integer vector types.
3000   if (typeElementType(Ty) == IceType_i1) {
3001     Type NewTy = IceType_NUM;
3002     switch (Ty) {
3003     default:
3004       llvm::report_fatal_error("unexpected type");
3005       break;
3006     case IceType_v4i1:
3007       NewTy = IceType_v4i32;
3008       break;
3009     case IceType_v8i1:
3010       NewTy = IceType_v8i16;
3011       break;
3012     case IceType_v16i1:
3013       NewTy = IceType_v16i8;
3014       break;
3015     }
3016     Variable *NewSrc0 = Func->makeVariable(NewTy);
3017     Variable *NewSrc1 = Func->makeVariable(NewTy);
3018     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc0, Src0));
3019     lowerCast(InstCast::create(Func, InstCast::Sext, NewSrc1, Src1));
3020     Src0 = NewSrc0;
3021     Src1 = NewSrc1;
3022     Ty = NewTy;
3023   }
3024 
3025   InstIcmp::ICond Condition = Icmp->getCondition();
3026 
3027   Operand *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3028   Operand *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3029 
3030   // SSE2 only has signed comparison operations. Transform unsigned inputs in
3031   // a manner that allows for the use of signed comparison operations by
3032   // flipping the high order bits.
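  // For example, for v4i32 each lane gets XORed with 0x80000000, so unsigned
  // 0x00000000 maps to INT_MIN and 0xFFFFFFFF maps to INT_MAX; unsigned
  // order is thus preserved under the signed pcmpgt comparisons below.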
3033   if (Condition == InstIcmp::Ugt || Condition == InstIcmp::Uge ||
3034       Condition == InstIcmp::Ult || Condition == InstIcmp::Ule) {
3035     Variable *T0 = makeReg(Ty);
3036     Variable *T1 = makeReg(Ty);
3037     Variable *HighOrderBits = makeVectorOfHighOrderBits(Ty);
3038     _movp(T0, Src0RM);
3039     _pxor(T0, HighOrderBits);
3040     _movp(T1, Src1RM);
3041     _pxor(T1, HighOrderBits);
3042     Src0RM = T0;
3043     Src1RM = T1;
3044   }
3045 
3046   Variable *T = makeReg(Ty);
3047   switch (Condition) {
3048   default:
3049     llvm_unreachable("unexpected condition");
3050     break;
3051   case InstIcmp::Eq: {
3052     if (llvm::isa<X86OperandMem>(Src1RM))
3053       Src1RM = legalizeToReg(Src1RM);
3054     _movp(T, Src0RM);
3055     _pcmpeq(T, Src1RM);
3056   } break;
3057   case InstIcmp::Ne: {
3058     if (llvm::isa<X86OperandMem>(Src1RM))
3059       Src1RM = legalizeToReg(Src1RM);
3060     _movp(T, Src0RM);
3061     _pcmpeq(T, Src1RM);
3062     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3063     _pxor(T, MinusOne);
3064   } break;
3065   case InstIcmp::Ugt:
3066   case InstIcmp::Sgt: {
3067     if (llvm::isa<X86OperandMem>(Src1RM))
3068       Src1RM = legalizeToReg(Src1RM);
3069     _movp(T, Src0RM);
3070     _pcmpgt(T, Src1RM);
3071   } break;
3072   case InstIcmp::Uge:
3073   case InstIcmp::Sge: {
3074     // !(Src1RM > Src0RM)
3075     if (llvm::isa<X86OperandMem>(Src0RM))
3076       Src0RM = legalizeToReg(Src0RM);
3077     _movp(T, Src1RM);
3078     _pcmpgt(T, Src0RM);
3079     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3080     _pxor(T, MinusOne);
3081   } break;
3082   case InstIcmp::Ult:
3083   case InstIcmp::Slt: {
3084     if (llvm::isa<X86OperandMem>(Src0RM))
3085       Src0RM = legalizeToReg(Src0RM);
3086     _movp(T, Src1RM);
3087     _pcmpgt(T, Src0RM);
3088   } break;
3089   case InstIcmp::Ule:
3090   case InstIcmp::Sle: {
3091     // !(Src0RM > Src1RM)
3092     if (llvm::isa<X86OperandMem>(Src1RM))
3093       Src1RM = legalizeToReg(Src1RM);
3094     _movp(T, Src0RM);
3095     _pcmpgt(T, Src1RM);
3096     Variable *MinusOne = makeVectorOfMinusOnes(Ty);
3097     _pxor(T, MinusOne);
3098   } break;
3099   }
3100 
3101   _movp(Dest, T);
3102   eliminateNextVectorSextInstruction(Dest);
3103 }
3104 
3105 void TargetX8664::setccOrConsumer(BrCond Condition, Variable *Dest,
3106                                   const Inst *Consumer) {
3107   if (Consumer == nullptr) {
3108     _setcc(Dest, Condition);
3109     return;
3110   }
3111   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3112     _br(Condition, Br->getTargetTrue(), Br->getTargetFalse());
3113     return;
3114   }
3115   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3116     Operand *SrcT = Select->getTrueOperand();
3117     Operand *SrcF = Select->getFalseOperand();
3118     Variable *SelectDest = Select->getDest();
3119     lowerSelectMove(SelectDest, Condition, SrcT, SrcF);
3120     return;
3121   }
3122   llvm::report_fatal_error("Unexpected consumer type");
3123 }
3124 
3125 void TargetX8664::movOrConsumer(bool IcmpResult, Variable *Dest,
3126                                 const Inst *Consumer) {
3127   if (Consumer == nullptr) {
3128     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3129     return;
3130   }
3131   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3132     // TODO(sehr,stichnot): This could be done with a single unconditional
3133     // branch instruction, but Subzero doesn't yet know how to handle the
3134     // resulting CFG changes. Make it do so, to eliminate the mov and cmp.
3135     _mov(Dest, Ctx->getConstantInt(Dest->getType(), (IcmpResult ? 1 : 0)));
3136     _cmp(Dest, Ctx->getConstantInt(Dest->getType(), 0));
3137     _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3138     return;
3139   }
3140   if (const auto *Select = llvm::dyn_cast<InstSelect>(Consumer)) {
3141     Operand *Src = nullptr;
3142     if (IcmpResult) {
3143       Src = legalize(Select->getTrueOperand(), Legal_Reg | Legal_Imm);
3144     } else {
3145       Src = legalize(Select->getFalseOperand(), Legal_Reg | Legal_Imm);
3146     }
3147     Variable *SelectDest = Select->getDest();
3148     lowerMove(SelectDest, Src, false);
3149     return;
3150   }
3151   llvm::report_fatal_error("Unexpected consumer type");
3152 }
3153 
3154 void TargetX8664::lowerArithAndConsumer(const InstArithmetic *Arith,
3155                                         const Inst *Consumer) {
3156   Variable *T = nullptr;
3157   Operand *Src0 = legalize(Arith->getSrc(0));
3158   Operand *Src1 = legalize(Arith->getSrc(1));
3159   Variable *Dest = Arith->getDest();
3160   switch (Arith->getOp()) {
3161   default:
3162     llvm_unreachable("arithmetic operator not AND or OR");
3163     break;
3164   case InstArithmetic::And:
3165     _mov(T, Src0);
3166     // Test cannot have an address in the second position.  Since T is
3167     // guaranteed to be a register and Src1 could be a memory load, ensure
3168     // that the second argument is a register.
3169     if (llvm::isa<Constant>(Src1))
3170       _test(T, Src1);
3171     else
3172       _test(Src1, T);
3173     break;
3174   case InstArithmetic::Or:
3175     _mov(T, Src0);
3176     _or(T, Src1);
3177     break;
3178   }
3179 
3180   if (Consumer == nullptr) {
3181     llvm::report_fatal_error("Expected a consumer instruction");
3182   }
3183   if (const auto *Br = llvm::dyn_cast<InstBr>(Consumer)) {
3184     Context.insert<InstFakeUse>(T);
3185     Context.insert<InstFakeDef>(Dest);
3186     _br(CondX86::Br_ne, Br->getTargetTrue(), Br->getTargetFalse());
3187     return;
3188   }
3189   llvm::report_fatal_error("Unexpected consumer type");
3190 }
3191 
3192 void TargetX8664::lowerInsertElement(const InstInsertElement *Instr) {
3193   Operand *SourceVectNotLegalized = Instr->getSrc(0);
3194   Operand *ElementToInsertNotLegalized = Instr->getSrc(1);
3195   auto *ElementIndex = llvm::dyn_cast<ConstantInteger32>(Instr->getSrc(2));
3196   // Only constant indices are allowed in PNaCl IR.
3197   assert(ElementIndex);
3198   unsigned Index = ElementIndex->getValue();
3199   assert(Index < typeNumElements(SourceVectNotLegalized->getType()));
3200 
3201   Type Ty = SourceVectNotLegalized->getType();
3202   Type ElementTy = typeElementType(Ty);
3203   Type InVectorElementTy = InstX86Base::getInVectorElementType(Ty);
3204 
3205   if (ElementTy == IceType_i1) {
3206     // Expand the element to the appropriate size for it to be inserted in the
3207     // vector.
3208     Variable *Expanded = Func->makeVariable(InVectorElementTy);
3209     auto *Cast = InstCast::create(Func, InstCast::Zext, Expanded,
3210                                   ElementToInsertNotLegalized);
3211     lowerCast(Cast);
3212     ElementToInsertNotLegalized = Expanded;
3213   }
3214 
3215   if (Ty == IceType_v8i16 || Ty == IceType_v8i1 || InstructionSet >= SSE4_1) {
3216     // Use insertps, pinsrb, pinsrw, or pinsrd.
3217     Operand *ElementRM =
3218         legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3219     Operand *SourceVectRM =
3220         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3221     Variable *T = makeReg(Ty);
3222     _movp(T, SourceVectRM);
3223     if (Ty == IceType_v4f32) {
3224       _insertps(T, ElementRM, Ctx->getConstantInt32(Index << 4));
3225     } else {
3226       // For the pinsrb and pinsrw instructions, when the source operand is a
3227       // register, it must be a full r32 register like eax, and not ax/al/ah.
3228       // For filetype=asm, InstX86Pinsr::emit() compensates for the use of r16
3229       // and r8 by converting them through getBaseReg(), while emitIAS()
3230       // validates that the original and base register encodings are the same.
3232       if (ElementRM->getType() == IceType_i8 &&
3233           llvm::isa<Variable>(ElementRM)) {
3234         // Don't use ah/bh/ch/dh for pinsrb.
3235         ElementRM = copyToReg8(ElementRM);
3236       }
3237       _pinsr(T, ElementRM, Ctx->getConstantInt32(Index));
3238     }
3239     _movp(Instr->getDest(), T);
3240   } else if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
3241     // Use shufps or movss.
3242     Variable *ElementR = nullptr;
3243     Operand *SourceVectRM =
3244         legalize(SourceVectNotLegalized, Legal_Reg | Legal_Mem);
3245 
3246     if (InVectorElementTy == IceType_f32) {
3247       // ElementR will be in an XMM register since it is floating point.
3248       ElementR = legalizeToReg(ElementToInsertNotLegalized);
3249     } else {
3250       // Copy an integer to an XMM register.
3251       Operand *T = legalize(ElementToInsertNotLegalized, Legal_Reg | Legal_Mem);
3252       ElementR = makeReg(Ty);
3253       _movd(ElementR, T);
3254     }
3255 
3256     if (Index == 0) {
3257       Variable *T = makeReg(Ty);
3258       _movp(T, SourceVectRM);
3259       _movss(T, ElementR);
3260       _movp(Instr->getDest(), T);
3261       return;
3262     }
3263 
3264     // shufps treats the source and destination operands as vectors of four
3265     // doublewords. The destination's two high doublewords are selected from
3266     // the source operand and the two low doublewords are selected from the
3267     // (original value of) the destination operand. An insertelement operation
3268     // can be effected with a sequence of two shufps operations with
3269     // appropriate masks. In all cases below, Element[0] is being inserted into
3270     // SourceVectOperand. Indices are ordered from left to right.
3271     //
3272     // insertelement into index 1 (result is stored in ElementR):
3273     //   ElementR := ElementR[0, 0] SourceVectRM[0, 0]
3274     //   ElementR := ElementR[3, 0] SourceVectRM[2, 3]
3275     //
3276     // insertelement into index 2 (result is stored in T):
3277     //   T := SourceVectRM
3278     //   ElementR := ElementR[0, 0] T[0, 3]
3279     //   T := T[0, 1] ElementR[0, 3]
3280     //
3281     // insertelement into index 3 (result is stored in T):
3282     //   T := SourceVectRM
3283     //   ElementR := ElementR[0, 0] T[0, 2]
3284     //   T := T[0, 1] ElementR[3, 0]
3285     const unsigned char Mask1[3] = {0, 192, 128};
3286     const unsigned char Mask2[3] = {227, 196, 52};
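    // (Each mask byte is four 2-bit lane selectors, with the low bits
    // choosing destination lane 0 first; e.g. Mask2[0] == 227 ==
    // 0b11'10'00'11 yields {ElementR[3], ElementR[0], Src[2], Src[3]},
    // i.e. "ElementR[3, 0] SourceVectRM[2, 3]" in the notation above.)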
3287 
3288     Constant *Mask1Constant = Ctx->getConstantInt32(Mask1[Index - 1]);
3289     Constant *Mask2Constant = Ctx->getConstantInt32(Mask2[Index - 1]);
3290 
3291     if (Index == 1) {
3292       _shufps(ElementR, SourceVectRM, Mask1Constant);
3293       _shufps(ElementR, SourceVectRM, Mask2Constant);
3294       _movp(Instr->getDest(), ElementR);
3295     } else {
3296       Variable *T = makeReg(Ty);
3297       _movp(T, SourceVectRM);
3298       _shufps(ElementR, T, Mask1Constant);
3299       _shufps(T, ElementR, Mask2Constant);
3300       _movp(Instr->getDest(), T);
3301     }
3302   } else {
3303     assert(Ty == IceType_v16i8 || Ty == IceType_v16i1);
3304     // Spill the value to a stack slot and perform the insertion in memory.
3305     //
3306     // TODO(wala): use legalize(SourceVectNotLegalized, Legal_Mem) when support
3307     // for legalizing to mem is implemented.
3308     Variable *Slot = Func->makeVariable(Ty);
3309     Slot->setMustNotHaveReg();
3310     _movp(Slot, legalizeToReg(SourceVectNotLegalized));
3311 
3312     // Compute the location of the position to insert in memory.
3313     unsigned Offset = Index * typeWidthInBytes(InVectorElementTy);
3314     X86OperandMem *Loc =
3315         getMemoryOperandForStackSlot(InVectorElementTy, Slot, Offset);
3316     _store(legalizeToReg(ElementToInsertNotLegalized), Loc);
3317 
3318     Variable *T = makeReg(Ty);
3319     _movp(T, Slot);
3320     _movp(Instr->getDest(), T);
3321   }
3322 }
3323 
3324 void TargetX8664::lowerIntrinsic(const InstIntrinsic *Instr) {
3325   switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicID()) {
3326   case Intrinsics::AtomicCmpxchg: {
3327     if (!Intrinsics::isMemoryOrderValid(
3328             ID, getConstantMemoryOrder(Instr->getArg(3)),
3329             getConstantMemoryOrder(Instr->getArg(4)))) {
3330       Func->setError("Unexpected memory ordering for AtomicCmpxchg");
3331       return;
3332     }
3333     Variable *DestPrev = Instr->getDest();
3334     Operand *PtrToMem = legalize(Instr->getArg(0));
3335     Operand *Expected = legalize(Instr->getArg(1));
3336     Operand *Desired = legalize(Instr->getArg(2));
3337     if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
3338       return;
3339     lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
3340     return;
3341   }
3342   case Intrinsics::AtomicFence:
3343     if (!Intrinsics::isMemoryOrderValid(
3344             ID, getConstantMemoryOrder(Instr->getArg(0)))) {
3345       Func->setError("Unexpected memory ordering for AtomicFence");
3346       return;
3347     }
3348     _mfence();
3349     return;
3350   case Intrinsics::AtomicFenceAll:
3351     // NOTE: FenceAll should prevent any load/store from being moved across
3352     // the fence (both atomic and non-atomic). The mfence instruction is
3353     // currently marked coarsely as "HasSideEffects".
3354     _mfence();
3355     return;
3356   case Intrinsics::AtomicIsLockFree: {
3357     // X86 is always lock free for 8/16/32/64 bit accesses.
3358     // TODO(jvoung): Since the result is constant when given a constant byte
3359     // size, this opens up DCE opportunities.
3360     Operand *ByteSize = Instr->getArg(0);
3361     Variable *Dest = Instr->getDest();
3362     if (auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize)) {
3363       Constant *Result;
3364       switch (CI->getValue()) {
3365       default:
3366         // Some x86-64 processors support the cmpxchg16b instruction, which can
3367         // make 16-byte operations lock free (when used with the LOCK prefix).
3368         // However, this lowering doesn't make use of it, so just return 0
3369         // even for large sizes.
3370         Result = Ctx->getConstantZero(IceType_i32);
3371         break;
3372       case 1:
3373       case 2:
3374       case 4:
3375       case 8:
3376         Result = Ctx->getConstantInt32(1);
3377         break;
3378       }
3379       _mov(Dest, Result);
3380       return;
3381     }
3382     // The PNaCl ABI requires the byte size to be a compile-time constant.
3383     Func->setError("AtomicIsLockFree byte size should be compile-time const");
3384     return;
3385   }
3386   case Intrinsics::AtomicLoad: {
3387     // We require the memory address to be naturally aligned. Given that is the
3388     // case, then normal loads are atomic.
3389     if (!Intrinsics::isMemoryOrderValid(
3390             ID, getConstantMemoryOrder(Instr->getArg(1)))) {
3391       Func->setError("Unexpected memory ordering for AtomicLoad");
3392       return;
3393     }
3394     Variable *Dest = Instr->getDest();
3395     auto *Load = InstLoad::create(Func, Dest, Instr->getArg(0));
3396     lowerLoad(Load);
3397     // Make sure the atomic load isn't elided when unused, by adding a FakeUse.
3398     // Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
3399     // the FakeUse on the last-inserted instruction's dest.
3400     Context.insert<InstFakeUse>(Context.getLastInserted()->getDest());
3401     return;
3402   }
3403   case Intrinsics::AtomicRMW:
3404     if (!Intrinsics::isMemoryOrderValid(
3405             ID, getConstantMemoryOrder(Instr->getArg(3)))) {
3406       Func->setError("Unexpected memory ordering for AtomicRMW");
3407       return;
3408     }
3409     lowerAtomicRMW(
3410         Instr->getDest(),
3411         static_cast<uint32_t>(
3412             llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
3413         Instr->getArg(1), Instr->getArg(2));
3414     return;
3415   case Intrinsics::AtomicStore: {
3416     if (!Intrinsics::isMemoryOrderValid(
3417             ID, getConstantMemoryOrder(Instr->getArg(2)))) {
3418       Func->setError("Unexpected memory ordering for AtomicStore");
3419       return;
3420     }
3421     // We require the memory address to be naturally aligned. Given that is the
3422     // case, then normal stores are atomic. Add a fence after the store to make
3423     // it visible.
3424     Operand *Value = Instr->getArg(0);
3425     Operand *Ptr = Instr->getArg(1);
3426     auto *Store = InstStore::create(Func, Value, Ptr);
3427     lowerStore(Store);
3428     _mfence();
3429     return;
3430   }
3431   case Intrinsics::Bswap: {
3432     Variable *Dest = Instr->getDest();
3433     Operand *Val = Instr->getArg(0);
3434     // Use rotate left for 16-bit bswap.
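    // (E.g., rol(0x1234, 8) == 0x3412, the byte-swapped value.)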
3435     if (Val->getType() == IceType_i64 || Val->getType() == IceType_i32) {
3436       Variable *T = legalizeToReg(Val);
3437       _bswap(T);
3438       _mov(Dest, T);
3439     } else {
3440       assert(Val->getType() == IceType_i16);
3441       Constant *Eight = Ctx->getConstantInt16(8);
3442       Variable *T = nullptr;
3443       Val = legalize(Val);
3444       _mov(T, Val);
3445       _rol(T, Eight);
3446       _mov(Dest, T);
3447     }
3448     return;
3449   }
3450   case Intrinsics::Ctpop: {
3451     Variable *Dest = Instr->getDest();
3452     Variable *T = nullptr;
3453     Operand *Val = Instr->getArg(0);
3454     Type ValTy = Val->getType();
3455     assert(ValTy == IceType_i32 || ValTy == IceType_i64);
3456 
3457     T = makeReg(IceType_i64);
3458     if (ValTy == IceType_i32) {
3459       // In x86-64, __popcountsi2 is not defined, so we cheat a bit by
3460       // converting Val to a 64-bit value and using ctpop_i64. _movzx ensures
3461       // we will not have any bits set in Val's upper 32 bits.
3462       Variable *V = makeReg(IceType_i64);
3463       Operand *ValRM = legalize(Val, Legal_Reg | Legal_Mem);
3464       _movzx(V, ValRM);
3465       Val = V;
3466     }
3467     ValTy = IceType_i64;
3468 
3469     InstCall *Call =
3470         makeHelperCall(ValTy == IceType_i32 ? RuntimeHelper::H_call_ctpop_i32
3471                                             : RuntimeHelper::H_call_ctpop_i64,
3472                        T, 1);
3473     Call->addArg(Val);
3474     lowerCall(Call);
3475     // The popcount helpers always return 32-bit values, while the intrinsic's
3476     // signature matches the native POPCNT instruction and fills a 64-bit reg
3477     // (in 64-bit mode). Thus, clear the upper bits of the dest just in case
3478     // the user doesn't do that in the IR. If the user does that in the IR,
3479     // then this zeroing instruction is dead and gets optimized out.
3480     assert(Val->getType() == IceType_i64);
3481     // T is 64 bit. It needs to be copied to dest. We need to:
3482     //
3483     // T_1.32 = trunc T.64 to i32
3484     // T_2.64 = zext T_1.32 to i64
3485     // Dest.<<right_size>> = T_2.<<right_size>>
3486     //
3487     // which ensures the upper 32 bits will always be cleared. Just doing a
3488     //
3489     // mov Dest.32 = trunc T.32 to i32
3490     //
3491     // is dangerous because there's a chance the compiler will optimize this
3492     // copy out. To use _movzx we need two new registers (one 32-, and
3493     // another 64-bit wide.)
3494     Variable *T_1 = makeReg(IceType_i32);
3495     _mov(T_1, T);
3496     Variable *T_2 = makeReg(IceType_i64);
3497     _movzx(T_2, T_1);
3498     _mov(Dest, T_2);
3499     return;
3500   }
3501   case Intrinsics::Ctlz: {
3502     // The "is zero undef" parameter is ignored and we always return a
3503     // well-defined value.
3504     Operand *Val = legalize(Instr->getArg(0));
3505     Operand *FirstVal = Val;
3506     Operand *SecondVal = nullptr;
3507     constexpr bool IsCttz = false;
3508     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
3509                     SecondVal);
3510     return;
3511   }
3512   case Intrinsics::Cttz: {
3513     // The "is zero undef" parameter is ignored and we always return a
3514     // well-defined value.
3515     Operand *Val = legalize(Instr->getArg(0));
3516     Operand *FirstVal = Val;
3517     Operand *SecondVal = nullptr;
3518     constexpr bool IsCttz = true;
3519     lowerCountZeros(IsCttz, Val->getType(), Instr->getDest(), FirstVal,
3520                     SecondVal);
3521     return;
3522   }
3523   case Intrinsics::Fabs: {
3524     Operand *Src = legalize(Instr->getArg(0));
3525     Type Ty = Src->getType();
3526     Variable *Dest = Instr->getDest();
3527     Variable *T = makeVectorOfFabsMask(Ty);
3528     // The pand instruction operates on an m128 memory operand, so if Src is an
3529     // f32 or f64, we need to make sure it's in a register.
3530     if (isVectorType(Ty)) {
3531       if (llvm::isa<X86OperandMem>(Src))
3532         Src = legalizeToReg(Src);
3533     } else {
3534       Src = legalizeToReg(Src);
3535     }
3536     _pand(T, Src);
3537     if (isVectorType(Ty))
3538       _movp(Dest, T);
3539     else
3540       _mov(Dest, T);
3541     return;
3542   }
3543   case Intrinsics::Longjmp: {
3544     InstCall *Call = makeHelperCall(RuntimeHelper::H_call_longjmp, nullptr, 2);
3545     Call->addArg(Instr->getArg(0));
3546     Call->addArg(Instr->getArg(1));
3547     lowerCall(Call);
3548     return;
3549   }
3550   case Intrinsics::Memcpy: {
3551     lowerMemcpy(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3552     return;
3553   }
3554   case Intrinsics::Memmove: {
3555     lowerMemmove(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3556     return;
3557   }
3558   case Intrinsics::Memset: {
3559     lowerMemset(Instr->getArg(0), Instr->getArg(1), Instr->getArg(2));
3560     return;
3561   }
3562   case Intrinsics::Setjmp: {
3563     InstCall *Call =
3564         makeHelperCall(RuntimeHelper::H_call_setjmp, Instr->getDest(), 1);
3565     Call->addArg(Instr->getArg(0));
3566     lowerCall(Call);
3567     return;
3568   }
3569   case Intrinsics::Sqrt: {
3570     Operand *Src = legalize(Instr->getArg(0));
3571     Variable *Dest = Instr->getDest();
3572     Variable *T = makeReg(Dest->getType());
3573     _sqrt(T, Src);
3574     if (isVectorType(Dest->getType())) {
3575       _movp(Dest, T);
3576     } else {
3577       _mov(Dest, T);
3578     }
3579     return;
3580   }
3581   case Intrinsics::Stacksave: {
3582     Variable *rsp =
3583         Func->getTarget()->getPhysicalRegister(getStackReg(), WordType);
3584     Variable *Dest = Instr->getDest();
3585     _mov(Dest, rsp);
3586     return;
3587   }
3588   case Intrinsics::Stackrestore: {
3589     Operand *Src = Instr->getArg(0);
3590     _mov_sp(Src);
3591     return;
3592   }
3593 
3594   case Intrinsics::Trap:
3595     _ud2();
3596     return;
3597   case Intrinsics::LoadSubVector: {
3598     assert(llvm::isa<ConstantInteger32>(Instr->getArg(1)) &&
3599            "LoadSubVector second argument must be a constant");
3600     Variable *Dest = Instr->getDest();
3601     Type Ty = Dest->getType();
3602     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(1));
3603     Operand *Addr = Instr->getArg(0);
3604     X86OperandMem *Src = formMemoryOperand(Addr, Ty);
3605     doMockBoundsCheck(Src);
3606 
3607     if (Dest->isRematerializable()) {
3608       Context.insert<InstFakeDef>(Dest);
3609       return;
3610     }
3611 
3612     auto *T = makeReg(Ty);
3613     switch (SubVectorSize->getValue()) {
3614     case 4:
3615       _movd(T, Src);
3616       break;
3617     case 8:
3618       _movq(T, Src);
3619       break;
3620     default:
3621       Func->setError("Unexpected size for LoadSubVector");
3622       return;
3623     }
3624     _movp(Dest, T);
3625     return;
3626   }
3627   case Intrinsics::StoreSubVector: {
3628     assert(llvm::isa<ConstantInteger32>(Instr->getArg(2)) &&
3629            "StoreSubVector third argument must be a constant");
3630     auto *SubVectorSize = llvm::cast<ConstantInteger32>(Instr->getArg(2));
3631     Operand *Value = Instr->getArg(0);
3632     Operand *Addr = Instr->getArg(1);
3633     X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
3634     doMockBoundsCheck(NewAddr);
3635 
3636     Value = legalizeToReg(Value);
3637 
3638     switch (SubVectorSize->getValue()) {
3639     case 4:
3640       _stored(Value, NewAddr);
3641       break;
3642     case 8:
3643       _storeq(Value, NewAddr);
3644       break;
3645     default:
3646       Func->setError("Unexpected size for StoreSubVector");
3647       return;
3648     }
3649     return;
3650   }
3651   case Intrinsics::VectorPackSigned: {
3652     Operand *Src0 = Instr->getArg(0);
3653     Operand *Src1 = Instr->getArg(1);
3654     Variable *Dest = Instr->getDest();
3655     auto *T = makeReg(Src0->getType());
3656     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3657     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3658     _movp(T, Src0RM);
3659     _packss(T, Src1RM);
3660     _movp(Dest, T);
3661     return;
3662   }
3663   case Intrinsics::VectorPackUnsigned: {
3664     Operand *Src0 = Instr->getArg(0);
3665     Operand *Src1 = Instr->getArg(1);
3666     Variable *Dest = Instr->getDest();
3667     auto *T = makeReg(Src0->getType());
3668     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3669     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3670     _movp(T, Src0RM);
3671     _packus(T, Src1RM);
3672     _movp(Dest, T);
3673     return;
3674   }
3675   case Intrinsics::SignMask: {
3676     Operand *SrcReg = legalizeToReg(Instr->getArg(0));
3677     Variable *Dest = Instr->getDest();
3678     Variable *T = makeReg(IceType_i32);
3679     if (SrcReg->getType() == IceType_v4f32 ||
3680         SrcReg->getType() == IceType_v4i32 ||
3681         SrcReg->getType() == IceType_v16i8) {
3682       _movmsk(T, SrcReg);
3683     } else {
3684       // TODO(capn): We could implement v8i16 sign mask using packsswb/pmovmskb
3685       llvm::report_fatal_error("Invalid type for SignMask intrinsic");
3686     }
3687     _mov(Dest, T);
3688     return;
3689   }
3690   case Intrinsics::MultiplyHighSigned: {
3691     Operand *Src0 = Instr->getArg(0);
3692     Operand *Src1 = Instr->getArg(1);
3693     Variable *Dest = Instr->getDest();
3694     auto *T = makeReg(Dest->getType());
3695     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3696     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3697     _movp(T, Src0RM);
3698     _pmulhw(T, Src1RM);
3699     _movp(Dest, T);
3700     return;
3701   }
3702   case Intrinsics::MultiplyHighUnsigned: {
3703     Operand *Src0 = Instr->getArg(0);
3704     Operand *Src1 = Instr->getArg(1);
3705     Variable *Dest = Instr->getDest();
3706     auto *T = makeReg(Dest->getType());
3707     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3708     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3709     _movp(T, Src0RM);
3710     _pmulhuw(T, Src1RM);
3711     _movp(Dest, T);
3712     return;
3713   }
3714   case Intrinsics::MultiplyAddPairs: {
3715     Operand *Src0 = Instr->getArg(0);
3716     Operand *Src1 = Instr->getArg(1);
3717     Variable *Dest = Instr->getDest();
3718     auto *T = makeReg(Dest->getType());
3719     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3720     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3721     _movp(T, Src0RM);
3722     _pmaddwd(T, Src1RM);
3723     _movp(Dest, T);
3724     return;
3725   }
3726   case Intrinsics::AddSaturateSigned: {
3727     Operand *Src0 = Instr->getArg(0);
3728     Operand *Src1 = Instr->getArg(1);
3729     Variable *Dest = Instr->getDest();
3730     auto *T = makeReg(Dest->getType());
3731     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3732     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3733     _movp(T, Src0RM);
3734     _padds(T, Src1RM);
3735     _movp(Dest, T);
3736     return;
3737   }
3738   case Intrinsics::SubtractSaturateSigned: {
3739     Operand *Src0 = Instr->getArg(0);
3740     Operand *Src1 = Instr->getArg(1);
3741     Variable *Dest = Instr->getDest();
3742     auto *T = makeReg(Dest->getType());
3743     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3744     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3745     _movp(T, Src0RM);
3746     _psubs(T, Src1RM);
3747     _movp(Dest, T);
3748     return;
3749   }
3750   case Intrinsics::AddSaturateUnsigned: {
3751     Operand *Src0 = Instr->getArg(0);
3752     Operand *Src1 = Instr->getArg(1);
3753     Variable *Dest = Instr->getDest();
3754     auto *T = makeReg(Dest->getType());
3755     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3756     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3757     _movp(T, Src0RM);
3758     _paddus(T, Src1RM);
3759     _movp(Dest, T);
3760     return;
3761   }
3762   case Intrinsics::SubtractSaturateUnsigned: {
3763     Operand *Src0 = Instr->getArg(0);
3764     Operand *Src1 = Instr->getArg(1);
3765     Variable *Dest = Instr->getDest();
3766     auto *T = makeReg(Dest->getType());
3767     auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
3768     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
3769     _movp(T, Src0RM);
3770     _psubus(T, Src1RM);
3771     _movp(Dest, T);
3772     return;
3773   }
3774   case Intrinsics::Nearbyint: {
3775     Operand *Src = Instr->getArg(0);
3776     Variable *Dest = Instr->getDest();
3777     Type DestTy = Dest->getType();
3778     if (isVectorType(DestTy)) {
3779       assert(DestTy == IceType_v4i32);
3780       assert(Src->getType() == IceType_v4f32);
3781       Operand *Src0R = legalizeToReg(Src);
3782       Variable *T = makeReg(DestTy);
3783       _cvt(T, Src0R, Insts::Cvt::Ps2dq);
3784       _movp(Dest, T);
3785     } else {
3786       Operand *Src0RM = legalize(Src, Legal_Reg | Legal_Mem);
3787       // t1.i32 = cvt Src0RM; t2.dest_type = t1; Dest = t2.dest_type
3788       Variable *T_1 = nullptr;
3789       if (DestTy == IceType_i64) {
3790         T_1 = makeReg(IceType_i64);
3791       } else {
3792         assert(DestTy != IceType_i64);
3793         T_1 = makeReg(IceType_i32);
3794       }
3795       // cvt() requires its integer argument to be a GPR.
3796       Variable *T_2 = makeReg(DestTy);
3797       if (isByteSizedType(DestTy)) {
3798         assert(T_1->getType() == IceType_i32);
3799         T_1->setRegClass(RCX86_Is32To8);
3800         T_2->setRegClass(RCX86_IsTrunc8Rcvr);
3801       }
3802       _cvt(T_1, Src0RM, Insts::Cvt::Ss2si);
3803       _mov(T_2, T_1); // T_1 and T_2 may have different integer types
3804       if (DestTy == IceType_i1)
3805         _and(T_2, Ctx->getConstantInt1(1));
3806       _mov(Dest, T_2);
3807     }
3808     return;
3809   }
3810   case Intrinsics::Round: {
3811     assert(InstructionSet >= SSE4_1);
3812     Variable *Dest = Instr->getDest();
3813     Operand *Src = Instr->getArg(0);
3814     Operand *Mode = Instr->getArg(1);
3815     assert(llvm::isa<ConstantInteger32>(Mode) &&
3816            "Round last argument must be a constant");
3817     auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
3818     int32_t Imm = llvm::cast<ConstantInteger32>(Mode)->getValue();
3819     (void)Imm;
3820     assert(Imm >= 0 && Imm < 4 && "Invalid rounding mode");
3821     auto *T = makeReg(Dest->getType());
3822     _round(T, SrcRM, Mode);
3823     _movp(Dest, T);
3824     return;
3825   }
3826   default: // UnknownIntrinsic
3827     Func->setError("Unexpected intrinsic");
3828     return;
3829   }
3830   return;
3831 }
3832 
3833 void TargetX8664::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
3834                                      Operand *Expected, Operand *Desired) {
3835   Type Ty = Expected->getType();
3836   RegNumT Eax;
3837   switch (Ty) {
3838   default:
3839     llvm::report_fatal_error("Bad type for cmpxchg");
3840   case IceType_i64:
3841     Eax = RegX8664::Reg_rax;
3842     break;
3843   case IceType_i32:
3844     Eax = RegX8664::Reg_eax;
3845     break;
3846   case IceType_i16:
3847     Eax = RegX8664::Reg_ax;
3848     break;
3849   case IceType_i8:
3850     Eax = RegX8664::Reg_al;
3851     break;
3852   }
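  // What follows is the classic rAX-based compare-and-swap sequence:
  //   mov          <rAX-subreg>, Expected
  //   lock cmpxchg [Addr], DesiredReg
  //   mov          DestPrev, <rAX-subreg>
  // cmpxchg leaves the value previously at [Addr] in rAX whether or not the
  // swap succeeds, which is exactly the "previous value" result we need.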
3853   Variable *T_eax = makeReg(Ty, Eax);
3854   _mov(T_eax, Expected);
3855   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
3856   Variable *DesiredReg = legalizeToReg(Desired);
3857   constexpr bool Locked = true;
3858   _cmpxchg(Addr, T_eax, DesiredReg, Locked);
3859   _mov(DestPrev, T_eax);
3860 }
3861 
3862 bool TargetX8664::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
3863                                            Operand *Expected,
3864                                            Operand *Desired) {
3865   if (Func->getOptLevel() == Opt_m1)
3866     return false;
3867   // Peek ahead a few instructions and see how Dest is used.
3868   // It's very common to have:
3869   //
3870   // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
3871   // [%y_phi = ...] // list of phi stores
3872   // %p = icmp eq i32 %x, %expected
3873   // br i1 %p, label %l1, label %l2
3874   //
3875   // which we can optimize into:
3876   //
3877   // %x = <cmpxchg code>
3878   // [%y_phi = ...] // list of phi stores
3879   // br eq, %l1, %l2
3880   InstList::iterator I = Context.getCur();
3881   // I is currently the InstIntrinsic. Peek past that.
3882   // This assumes that the atomic cmpxchg has not been lowered yet,
3883   // so that the instructions seen in the scan from "Cur" are simple.
3884   assert(llvm::isa<InstIntrinsic>(*I));
3885   Inst *NextInst = Context.getNextInst(I);
3886   if (!NextInst)
3887     return false;
3888   // There might be phi assignments right before the compare+branch, since this
3889   // could be a backward branch for a loop. This placement of assignments is
3890   // determined by placePhiStores().
3891   CfgVector<InstAssign *> PhiAssigns;
3892   while (auto *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
3893     if (PhiAssign->getDest() == Dest)
3894       return false;
3895     PhiAssigns.push_back(PhiAssign);
3896     NextInst = Context.getNextInst(I);
3897     if (!NextInst)
3898       return false;
3899   }
3900   if (auto *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
3901     if (!(NextCmp->getCondition() == InstIcmp::Eq &&
3902           ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
3903            (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
3904       return false;
3905     }
3906     NextInst = Context.getNextInst(I);
3907     if (!NextInst)
3908       return false;
3909     if (auto *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
3910       if (!NextBr->isUnconditional() &&
3911           NextCmp->getDest() == NextBr->getCondition() &&
3912           NextBr->isLastUse(NextCmp->getDest())) {
3913         lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
3914         for (size_t i = 0; i < PhiAssigns.size(); ++i) {
3915           // Lower the phi assignments now, before the branch (same placement
3916           // as before).
3917           InstAssign *PhiAssign = PhiAssigns[i];
3918           PhiAssign->setDeleted();
3919           lowerAssign(PhiAssign);
3920           Context.advanceNext();
3921         }
3922         _br(CondX86::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
3923         // Skip over the old compare and branch, by deleting them.
3924         NextCmp->setDeleted();
3925         NextBr->setDeleted();
3926         Context.advanceNext();
3927         Context.advanceNext();
3928         return true;
3929       }
3930     }
3931   }
3932   return false;
3933 }
3934 
3935 void TargetX8664::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
3936                                  Operand *Ptr, Operand *Val) {
3937   bool NeedsCmpxchg = false;
3938   LowerBinOp Op_Lo = nullptr;
3939   LowerBinOp Op_Hi = nullptr;
3940   switch (Operation) {
3941   default:
3942     Func->setError("Unknown AtomicRMW operation");
3943     return;
3944   case Intrinsics::AtomicAdd: {
3945     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
3946     constexpr bool Locked = true;
3947     Variable *T = nullptr;
3948     _mov(T, Val);
3949     _xadd(Addr, T, Locked);
3950     _mov(Dest, T);
3951     return;
3952   }
3953   case Intrinsics::AtomicSub: {
3954     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
3955     constexpr bool Locked = true;
3956     Variable *T = nullptr;
3957     _mov(T, Val);
3958     _neg(T);
3959     _xadd(Addr, T, Locked);
3960     _mov(Dest, T);
3961     return;
3962   }
3963   case Intrinsics::AtomicOr:
3964     // TODO(jvoung): If Dest is null or dead, then some of these
3965     // operations do not need an "exchange", but just a locked op.
3966     // That appears to be "worth" it for sub, or, and, and xor.
3967     // xadd is probably fine vs lock add for add, and xchg is fine
3968     // vs an atomic store.
3969     NeedsCmpxchg = true;
3970     Op_Lo = &TargetX8664::_or;
3971     Op_Hi = &TargetX8664::_or;
3972     break;
3973   case Intrinsics::AtomicAnd:
3974     NeedsCmpxchg = true;
3975     Op_Lo = &TargetX8664::_and;
3976     Op_Hi = &TargetX8664::_and;
3977     break;
3978   case Intrinsics::AtomicXor:
3979     NeedsCmpxchg = true;
3980     Op_Lo = &TargetX8664::_xor;
3981     Op_Hi = &TargetX8664::_xor;
3982     break;
3983   case Intrinsics::AtomicExchange:
3984     X86OperandMem *Addr = formMemoryOperand(Ptr, Dest->getType());
3985     Variable *T = nullptr;
3986     _mov(T, Val);
3987     _xchg(Addr, T);
3988     _mov(Dest, T);
3989     return;
3990   }
3991   // Otherwise, we need a cmpxchg loop.
3992   (void)NeedsCmpxchg;
3993   assert(NeedsCmpxchg);
3994   expandAtomicRMWAsCmpxchg(Op_Lo, Op_Hi, Dest, Ptr, Val);
3995 }
3996 
3997 void TargetX8664::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, LowerBinOp Op_Hi,
3998                                            Variable *Dest, Operand *Ptr,
3999                                            Operand *Val) {
4000   // Expand a more complex RMW operation as a cmpxchg loop:
4001   //
4002   // On x86-64 the rAX-based cmpxchg covers every width up to i64, so the
4003   // 64-bit case needs no cmpxchg8b lo/hi pair (Op_Hi goes unused here).
4004   // The sequence, shown for 32-bit (i64 is identical with rax):
4015   //   mov     eax, [ptr]
4016   // .LABEL:
4017   //   mov     <reg>, eax
4018   //   op      <reg>, [desired_adj]
4019   //   lock cmpxchg [ptr], <reg>
4020   //   jne     .LABEL
4021   //   mov     <dest>, eax
4022   //
4023   // If Op_{Lo,Hi} are nullptr, then just copy the value.
4024   Val = legalize(Val);
4025   Type Ty = Val->getType();
4026   X86OperandMem *Addr = formMemoryOperand(Ptr, Ty);
4027   RegNumT Eax;
4028   switch (Ty) {
4029   default:
4030     llvm::report_fatal_error("Bad type for atomicRMW");
4031   case IceType_i64:
4032     Eax = RegX8664::Reg_rax;
4033     break;
4034   case IceType_i32:
4035     Eax = RegX8664::Reg_eax;
4036     break;
4037   case IceType_i16:
4038     Eax = RegX8664::Reg_ax;
4039     break;
4040   case IceType_i8:
4041     Eax = RegX8664::Reg_al;
4042     break;
4043   }
4044   Variable *T_eax = makeReg(Ty, Eax);
4045   _mov(T_eax, Addr);
4046   auto *Label = Context.insert<InstX86Label>(this);
4047   // We want to pick a different register for T than Eax, so don't use
4048   // _mov(T == nullptr, T_eax).
4049   Variable *T = makeReg(Ty);
4050   _mov(T, T_eax);
4051   (this->*Op_Lo)(T, Val);
4052   constexpr bool Locked = true;
4053   _cmpxchg(Addr, T_eax, T, Locked);
4054   _br(CondX86::Br_ne, Label);
4055   // If Val is a variable, model the extended live range of Val through
4056   // the end of the loop, since it will be re-used by the loop.
4057   if (auto *ValVar = llvm::dyn_cast<Variable>(Val)) {
4058     Context.insert<InstFakeUse>(ValVar);
4059   }
4060   // The address base (if any) is also reused in the loop.
4061   if (Variable *Base = Addr->getBase())
4062     Context.insert<InstFakeUse>(Base);
4063   _mov(Dest, T_eax);
4064 }
4065 
4066 /// Lowers count {trailing, leading} zeros intrinsic.
4067 ///
4068 /// We could do constant folding here, but that should have
4069 /// been done by the front-end/middle-end optimizations.
4070 
4071 void TargetX8664::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
4072                                   Operand *FirstVal, Operand *SecondVal) {
4073   // TODO(jvoung): Determine if the user CPU supports LZCNT (BMI).
4074   // Then the instructions will handle the Val == 0 case much more simply
4075   // and won't require conversion from bit position to number of zeros.
4076   //
4077   // Otherwise:
4078   //   bsr IF_NOT_ZERO, Val
4079   //   mov T_DEST, ((Ty == i32) ? 63 : 127)
4080   //   cmovne T_DEST, IF_NOT_ZERO
4081   //   xor T_DEST, ((Ty == i32) ? 31 : 63)
4082   //   mov DEST, T_DEST
4083   //
4084   // NOTE: T_DEST must be a register because cmov requires its dest to be a
4085   // register. Also, bsf and bsr require their dest to be a register.
4086   //
4087   // The xor DEST, C(31|63) converts a bit position to # of leading zeroes.
4088   // E.g., for 000... 00001100, bsr will say that the most significant bit
4089   // set is at position 3, while the number of leading zeros is 28. Xor is
4090   // like (M - N) for N <= M, and converts 63 to 32, and 127 to 64 (for the
4091   // all-zeros case).
4092   //
4093   // Cttz is similar, but uses bsf instead, doesn't require the xor bit
4094   // position conversion, and the speculation is reversed.
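  //
  // E.g., cttz(0b...01000): bsf yields 3 directly. For an all-zeros input,
  // bsf/bsr set ZF=1 and leave their destination undefined, so cmovne keeps
  // the preloaded constant (32/64 for cttz; 63/127, fixed up by the xor, for
  // ctlz).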
4095 
4096   // TODO(jpp): refactor this method.
4097   assert(Ty == IceType_i32 || Ty == IceType_i64);
4098   const Type DestTy = Dest->getType();
4099   Variable *T = makeReg(DestTy);
4100   Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
4101   if (Cttz) {
4102     _bsf(T, FirstValRM);
4103   } else {
4104     _bsr(T, FirstValRM);
4105   }
4106   Variable *T_Dest = makeReg(DestTy);
4107   Constant *_31 = Ctx->getConstantInt32(31);
4108   Constant *_32 = Ctx->getConstantInt(DestTy, 32);
4109   Constant *_63 = Ctx->getConstantInt(DestTy, 63);
4110   Constant *_64 = Ctx->getConstantInt(DestTy, 64);
4111   if (Cttz) {
4112     if (DestTy == IceType_i64) {
4113       _mov(T_Dest, _64);
4114     } else {
4115       _mov(T_Dest, _32);
4116     }
4117   } else {
4118     Constant *_127 = Ctx->getConstantInt(DestTy, 127);
4119     if (DestTy == IceType_i64) {
4120       _mov(T_Dest, _127);
4121     } else {
4122       _mov(T_Dest, _63);
4123     }
4124   }
4125   _cmov(T_Dest, T, CondX86::Br_ne);
4126   if (!Cttz) {
4127     if (DestTy == IceType_i64) {
4128       // Even though there's a _63 available at this point, that constant might
4129       // not be an i32, which will cause the xor emission to fail.
4130       Constant *_63 = Ctx->getConstantInt32(63);
4131       _xor(T_Dest, _63);
4132     } else {
4133       _xor(T_Dest, _31);
4134     }
4135   }
4136   _mov(Dest, T_Dest);
4137 }
4138 
4139 void TargetX8664::typedLoad(Type Ty, Variable *Dest, Variable *Base,
4140                             Constant *Offset) {
4141   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4142   // legalize Mem properly.
4143   if (Offset)
4144     assert(!llvm::isa<ConstantRelocatable>(Offset));
4145 
4146   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4147 
4148   if (isVectorType(Ty))
4149     _movp(Dest, Mem);
4150   else if (Ty == IceType_f64)
4151     _movq(Dest, Mem);
4152   else
4153     _mov(Dest, Mem);
4154 }
4155 
4156 void TargetX8664::typedStore(Type Ty, Variable *Value, Variable *Base,
4157                              Constant *Offset) {
4158   // If Offset is a ConstantRelocatable in Non-SFI mode, we will need to
4159   // legalize Mem properly.
4160   if (Offset)
4161     assert(!llvm::isa<ConstantRelocatable>(Offset));
4162 
4163   auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4164 
4165   if (isVectorType(Ty))
4166     _storep(Value, Mem);
4167   else if (Ty == IceType_f64)
4168     _storeq(Value, Mem);
4169   else
4170     _store(Value, Mem);
4171 }
4172 
4173 void TargetX8664::copyMemory(Type Ty, Variable *Dest, Variable *Src,
4174                              int32_t OffsetAmt) {
4175   Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4176   // TODO(ascull): this or add nullptr test to _movp, _movq
4177   Variable *Data = makeReg(Ty);
4178 
4179   typedLoad(Ty, Data, Src, Offset);
4180   typedStore(Ty, Data, Dest, Offset);
4181 }
4182 
4183 void TargetX8664::lowerMemcpy(Operand *Dest, Operand *Src, Operand *Count) {
4184   // There is a load and store for each chunk in the unroll
4185   constexpr uint32_t BytesPerStorep = 16;
4186 
4187   // Check if the operands are constants
4188   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4189   const bool IsCountConst = CountConst != nullptr;
4190   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4191 
4192   if (shouldOptimizeMemIntrins() && IsCountConst &&
4193       CountValue <= BytesPerStorep * MEMCPY_UNROLL_LIMIT) {
4194     // Unlikely, but nothing to do if it does happen
4195     if (CountValue == 0)
4196       return;
4197 
4198     Variable *SrcBase = legalizeToReg(Src);
4199     Variable *DestBase = legalizeToReg(Dest);
4200 
4201     // Find the largest type that can be used and use it as much as possible in
4202     // reverse order. Then handle any remainder with overlapping copies. Since
4203     // the remainder will be at the end, there will be reduced pressure on the
4204     // memory unit as the accesses to the same memory are far apart.
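    // For example, a constant 30-byte memcpy would emit a 16-byte copy at
    // offset 0 and a 16-byte copy at offset 14; the 2-byte overlap is safe
    // because memcpy operands may not alias, so the second load still reads
    // the original source bytes.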
4205     Type Ty = largestTypeInSize(CountValue);
4206     uint32_t TyWidth = typeWidthInBytes(Ty);
4207 
4208     uint32_t RemainingBytes = CountValue;
4209     int32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
4210     while (RemainingBytes >= TyWidth) {
4211       copyMemory(Ty, DestBase, SrcBase, Offset);
4212       RemainingBytes -= TyWidth;
4213       Offset -= TyWidth;
4214     }
4215 
4216     if (RemainingBytes == 0)
4217       return;
4218 
4219     // Lower the remaining bytes. Adjust to larger types in order to make use
4220     // of overlaps in the copies.
4221     Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
4222     Offset = CountValue - typeWidthInBytes(LeftOverTy);
4223     copyMemory(LeftOverTy, DestBase, SrcBase, Offset);
4224     return;
4225   }
4226 
4227   // Fall back on a function call
4228   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memcpy, nullptr, 3);
4229   Call->addArg(Dest);
4230   Call->addArg(Src);
4231   Call->addArg(Count);
4232   lowerCall(Call);
4233 }
4234 
4235 void TargetX8664::lowerMemmove(Operand *Dest, Operand *Src, Operand *Count) {
4236   // There is a load and store for each chunk in the unroll
4237   constexpr uint32_t BytesPerStorep = 16;
4238 
4239   // Check if the operands are constants
4240   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4241   const bool IsCountConst = CountConst != nullptr;
4242   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4243 
4244   if (shouldOptimizeMemIntrins() && IsCountConst &&
4245       CountValue <= BytesPerStorep * MEMMOVE_UNROLL_LIMIT) {
4246     // Unlikely, but nothing to do if it does happen
4247     if (CountValue == 0)
4248       return;
4249 
4250     Variable *SrcBase = legalizeToReg(Src);
4251     Variable *DestBase = legalizeToReg(Dest);
4252 
4253     std::tuple<Type, Constant *, Variable *> Moves[MEMMOVE_UNROLL_LIMIT];
4254     Constant *Offset;
4255     Variable *Reg;
4256 
4257     // Copy the data into registers as the source and destination could overlap
4258     // so make sure not to clobber the memory. This also means overlapping
4259     // moves can be used as we are taking a safe snapshot of the memory.
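    // E.g., for a constant 24-byte memmove, both loads (16 bytes at offset
    // 0, then 8 bytes at offset 16) are emitted before either store, so an
    // overlapping destination cannot clobber source bytes that are still
    // unread.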
4260     Type Ty = largestTypeInSize(CountValue);
4261     uint32_t TyWidth = typeWidthInBytes(Ty);
4262 
4263     uint32_t RemainingBytes = CountValue;
4264     int32_t OffsetAmt = (CountValue & ~(TyWidth - 1)) - TyWidth;
4265     size_t N = 0;
4266     while (RemainingBytes >= TyWidth) {
4267       assert(N <= MEMMOVE_UNROLL_LIMIT);
4268       Offset = Ctx->getConstantInt32(OffsetAmt);
4269       Reg = makeReg(Ty);
4270       typedLoad(Ty, Reg, SrcBase, Offset);
4271       RemainingBytes -= TyWidth;
4272       OffsetAmt -= TyWidth;
4273       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4274     }
4275 
4276     if (RemainingBytes != 0) {
4277       // Lower the remaining bytes. Adjust to larger types in order to make use
4278       // of overlaps in the copies.
4279       assert(N <= MEMMOVE_UNROLL_LIMIT);
4280       Ty = firstTypeThatFitsSize(RemainingBytes);
4281       Offset = Ctx->getConstantInt32(CountValue - typeWidthInBytes(Ty));
4282       Reg = makeReg(Ty);
4283       typedLoad(Ty, Reg, SrcBase, Offset);
4284       Moves[N++] = std::make_tuple(Ty, Offset, Reg);
4285     }
4286 
4287     // Copy the data out into the destination memory
4288     for (size_t i = 0; i < N; ++i) {
4289       std::tie(Ty, Offset, Reg) = Moves[i];
4290       typedStore(Ty, Reg, DestBase, Offset);
4291     }
4292 
4293     return;
4294   }
4295 
4296   // Fall back on a function call
4297   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memmove, nullptr, 3);
4298   Call->addArg(Dest);
4299   Call->addArg(Src);
4300   Call->addArg(Count);
4301   lowerCall(Call);
4302 }
4303 
4304 void TargetX8664::lowerMemset(Operand *Dest, Operand *Val, Operand *Count) {
4305   constexpr uint32_t BytesPerStorep = 16;
4306   constexpr uint32_t BytesPerStoreq = 8;
4307   constexpr uint32_t BytesPerStorei32 = 4;
4308   assert(Val->getType() == IceType_i8);
4309 
4310   // Check if the operands are constants
4311   const auto *CountConst = llvm::dyn_cast<const ConstantInteger32>(Count);
4312   const auto *ValConst = llvm::dyn_cast<const ConstantInteger32>(Val);
4313   const bool IsCountConst = CountConst != nullptr;
4314   const bool IsValConst = ValConst != nullptr;
4315   const uint32_t CountValue = IsCountConst ? CountConst->getValue() : 0;
4316   const uint32_t ValValue = IsValConst ? ValConst->getValue() : 0;
4317 
4318   // Unlikely, but there is nothing to do if the count is zero.
4319   if (IsCountConst && CountValue == 0)
4320     return;
4321 
4322   // TODO(ascull): if the count is constant but val is not it would be possible
4323   // to inline by spreading the value across 4 bytes and accessing subregs e.g.
4324   // eax, ax and al.
4325   if (shouldOptimizeMemIntrins() && IsCountConst && IsValConst) {
4326     Variable *Base = nullptr;
4327     Variable *VecReg = nullptr;
4328     const uint32_t MaskValue = (ValValue & 0xff);
4329     const uint32_t SpreadValue =
4330         (MaskValue << 24) | (MaskValue << 16) | (MaskValue << 8) | MaskValue;
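    // E.g. for Val == 0xAB, SpreadValue becomes 0xABABABAB; the narrower
    // stores below presumably truncate it via getConstantInt, so every store
    // width writes the same repeated byte pattern.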
4331 
4332     auto lowerSet = [this, &Base, SpreadValue, &VecReg](Type Ty,
4333                                                         uint32_t OffsetAmt) {
4334       assert(Base != nullptr);
4335       Constant *Offset = OffsetAmt ? Ctx->getConstantInt32(OffsetAmt) : nullptr;
4336 
4337       // TODO(ascull): is 64-bit better with vector or scalar movq?
4338       auto *Mem = X86OperandMem::create(Func, Ty, Base, Offset);
4339       if (isVectorType(Ty)) {
4340         assert(VecReg != nullptr);
4341         _storep(VecReg, Mem);
4342       } else if (Ty == IceType_f64) {
4343         assert(VecReg != nullptr);
4344         _storeq(VecReg, Mem);
4345       } else {
4346         assert(Ty != IceType_i64);
4347         _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
4348       }
4349     };
4350 
4351     // Find the largest type that can be used and use it as much as possible in
4352     // reverse order. Then handle any remainder with overlapping copies. Since
4353     // the remainder will be at the end, there will be reduced pressure on the
4354     // memory unit as the accesses to the same memory are far apart.
4355     Type Ty = IceType_void;
4356     if (ValValue == 0 && CountValue >= BytesPerStoreq &&
4357         CountValue <= BytesPerStorep * MEMSET_UNROLL_LIMIT) {
4358       // When the value is zero it can be loaded into a vector register cheaply
4359       // using the xor trick.
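      // (The "xor trick" is presumably pxor reg,reg via makeVectorOfZeros:
      // it yields 16 zero bytes without a constant-pool load.)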
4360       Base = legalizeToReg(Dest);
4361       VecReg = makeVectorOfZeros(IceType_v16i8);
4362       Ty = largestTypeInSize(CountValue);
4363     } else if (CountValue <= BytesPerStorei32 * MEMSET_UNROLL_LIMIT) {
4364       // When the value is non-zero or the count is small we can't use vector
4365       // instructions so are limited to 32-bit stores.
4366       Base = legalizeToReg(Dest);
4367       constexpr uint32_t MaxSize = 4;
4368       Ty = largestTypeInSize(CountValue, MaxSize);
4369     }
4370 
4371     if (Base) {
4372       uint32_t TyWidth = typeWidthInBytes(Ty);
4373 
4374       uint32_t RemainingBytes = CountValue;
4375       uint32_t Offset = (CountValue & ~(TyWidth - 1)) - TyWidth;
4376       while (RemainingBytes >= TyWidth) {
4377         lowerSet(Ty, Offset);
4378         RemainingBytes -= TyWidth;
4379         Offset -= TyWidth;
4380       }
4381 
4382       if (RemainingBytes == 0)
4383         return;
4384 
4385       // Lower the remaining bytes. Adjust to larger types in order to make use
4386       // of overlaps in the copies.
4387       Type LeftOverTy = firstTypeThatFitsSize(RemainingBytes);
4388       Offset = CountValue - typeWidthInBytes(LeftOverTy);
4389       lowerSet(LeftOverTy, Offset);
4390       return;
4391     }
4392   }
4393 
4394   // Fall back on calling the memset function. The value operand needs to be
4395   // extended to a stack slot size because the PNaCl ABI requires arguments to
4396   // be at least 32 bits wide.
4397   Operand *ValExt;
4398   if (IsValConst) {
4399     ValExt = Ctx->getConstantInt(stackSlotType(), ValValue);
4400   } else {
4401     Variable *ValExtVar = Func->makeVariable(stackSlotType());
4402     lowerCast(InstCast::create(Func, InstCast::Zext, ValExtVar, Val));
4403     ValExt = ValExtVar;
4404   }
4405   InstCall *Call = makeHelperCall(RuntimeHelper::H_call_memset, nullptr, 3);
4406   Call->addArg(Dest);
4407   Call->addArg(ValExt);
4408   Call->addArg(Count);
4409   lowerCall(Call);
4410 }
4411 
4412 class AddressOptimizer {
4413   AddressOptimizer() = delete;
4414   AddressOptimizer(const AddressOptimizer &) = delete;
4415   AddressOptimizer &operator=(const AddressOptimizer &) = delete;
4416 
4417 public:
4418   explicit AddressOptimizer(const Cfg *Func)
4419       : Func(Func), VMetadata(Func->getVMetadata()) {}
4420 
4421   inline void dumpAddressOpt(const ConstantRelocatable *const Relocatable,
4422                              int32_t Offset, const Variable *Base,
4423                              const Variable *Index, uint16_t Shift,
4424                              const Inst *Reason) const;
4425 
4426   inline const Inst *matchAssign(Variable **Var,
4427                                  ConstantRelocatable **Relocatable,
4428                                  int32_t *Offset);
4429 
4430   inline const Inst *matchCombinedBaseIndex(Variable **Base, Variable **Index,
4431                                             uint16_t *Shift);
4432 
4433   inline const Inst *matchShiftedIndex(Variable **Index, uint16_t *Shift);
4434 
4435   inline const Inst *matchOffsetIndexOrBase(Variable **IndexOrBase,
4436                                             const uint16_t Shift,
4437                                             ConstantRelocatable **Relocatable,
4438                                             int32_t *Offset);
4439 
4440 private:
4441   const Cfg *const Func;
4442   const VariablesMetadata *const VMetadata;
4443 
4444   static bool isAdd(const Inst *Instr) {
4445     if (auto *Arith = llvm::dyn_cast_or_null<const InstArithmetic>(Instr)) {
4446       return (Arith->getOp() == InstArithmetic::Add);
4447     }
4448     return false;
4449   }
4450 };
4451 
4452 void AddressOptimizer::dumpAddressOpt(
4453     const ConstantRelocatable *const Relocatable, int32_t Offset,
4454     const Variable *Base, const Variable *Index, uint16_t Shift,
4455     const Inst *Reason) const {
4456   if (!BuildDefs::dump())
4457     return;
4458   if (!Func->isVerbose(IceV_AddrOpt))
4459     return;
4460   OstreamLocker L(Func->getContext());
4461   Ostream &Str = Func->getContext()->getStrDump();
4462   Str << "Instruction: ";
4463   Reason->dumpDecorated(Func);
4464   Str << "  results in Base=";
4465   if (Base)
4466     Base->dump(Func);
4467   else
4468     Str << "<null>";
4469   Str << ", Index=";
4470   if (Index)
4471     Index->dump(Func);
4472   else
4473     Str << "<null>";
4474   Str << ", Shift=" << Shift << ", Offset=" << Offset
4475       << ", Relocatable=" << Relocatable << "\n";
4476 }
4477 
4478 const Inst *AddressOptimizer::matchAssign(Variable **Var,
4479                                           ConstantRelocatable **Relocatable,
4480                                           int32_t *Offset) {
4481   // Var originates from Var=SrcVar ==> set Var:=SrcVar
4482   if (*Var == nullptr)
4483     return nullptr;
4484   if (const Inst *VarAssign = VMetadata->getSingleDefinition(*Var)) {
4485     assert(!VMetadata->isMultiDef(*Var));
4486     if (llvm::isa<InstAssign>(VarAssign)) {
4487       Operand *SrcOp = VarAssign->getSrc(0);
4488       assert(SrcOp);
4489       if (auto *SrcVar = llvm::dyn_cast<Variable>(SrcOp)) {
4490         if (!VMetadata->isMultiDef(SrcVar) &&
4491             // TODO: ensure SrcVar stays single-BB
4492             true) {
4493           *Var = SrcVar;
4494           return VarAssign;
4495         }
4496       } else if (auto *Const = llvm::dyn_cast<ConstantInteger32>(SrcOp)) {
4497         int32_t MoreOffset = Const->getValue();
4498         if (Utils::WouldOverflowAdd(*Offset, MoreOffset))
4499           return nullptr;
4500         *Var = nullptr;
4501         *Offset += MoreOffset;
4502         return VarAssign;
4503       } else if (auto *AddReloc = llvm::dyn_cast<ConstantRelocatable>(SrcOp)) {
4504         if (*Relocatable == nullptr) {
4505           // It is always safe to fold a relocatable through assignment -- the
4506           // assignment frees a slot in the address operand that can be used to
4507           // hold the Sandbox Pointer -- if any.
4508           *Var = nullptr;
4509           *Relocatable = AddReloc;
4510           return VarAssign;
4511         }
4512       }
4513     }
4514   }
4515   return nullptr;
4516 }
4517 
4518 const Inst *AddressOptimizer::matchCombinedBaseIndex(Variable **Base,
4519                                                      Variable **Index,
4520                                                      uint16_t *Shift) {
4521   // Index==nullptr && Base is Base=Var1+Var2 ==>
4522   //   set Base=Var1, Index=Var2, Shift=0
4523   if (*Base == nullptr)
4524     return nullptr;
4525   if (*Index != nullptr)
4526     return nullptr;
4527   auto *BaseInst = VMetadata->getSingleDefinition(*Base);
4528   if (BaseInst == nullptr)
4529     return nullptr;
4530   assert(!VMetadata->isMultiDef(*Base));
4531   if (BaseInst->getSrcSize() < 2)
4532     return nullptr;
4533   if (auto *Var1 = llvm::dyn_cast<Variable>(BaseInst->getSrc(0))) {
4534     if (VMetadata->isMultiDef(Var1))
4535       return nullptr;
4536     if (auto *Var2 = llvm::dyn_cast<Variable>(BaseInst->getSrc(1))) {
4537       if (VMetadata->isMultiDef(Var2))
4538         return nullptr;
4539       if (isAdd(BaseInst) &&
4540           // TODO: ensure Var1 and Var2 stay single-BB
4541           true) {
4542         *Base = Var1;
4543         *Index = Var2;
4544         *Shift = 0; // should already have been 0
4545         return BaseInst;
4546       }
4547     }
4548   }
4549   return nullptr;
4550 }
4551 
4552 const Inst *AddressOptimizer::matchShiftedIndex(Variable **Index,
4553                                                 uint16_t *Shift) {
4554   // Index is Index=Var*Const && log2(Const)+Shift<=3 ==>
4555   //   Index=Var, Shift+=log2(Const)
4556   if (*Index == nullptr)
4557     return nullptr;
4558   auto *IndexInst = VMetadata->getSingleDefinition(*Index);
4559   if (IndexInst == nullptr)
4560     return nullptr;
4561   assert(!VMetadata->isMultiDef(*Index));
4562 
4563   // When using an unsigned 32-bit array index on x64, it gets zero-extended
4564   // before the shift & add. The explicit zero extension can be eliminated
4565   // because x86 32-bit operations automatically get zero-extended into the
4566   // corresponding 64-bit register.
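  // Illustrative case: with %v = shl i32 %x, 2 and Index defined by
  // zext i32 %v to i64, the matcher looks through the zext, and the Shl
  // match below can then fold the shift, yielding Index = %x and
  // Shift += 2; the 32-bit shl already wrote a zero-extended value into
  // the full 64-bit register.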
4567   if (auto *CastInst = llvm::dyn_cast<InstCast>(IndexInst)) {
4568     if (CastInst->getCastKind() == InstCast::Zext) {
4569       if (auto *Var = llvm::dyn_cast<Variable>(CastInst->getSrc(0))) {
4570         if (Var->getType() == IceType_i32 &&
4571             CastInst->getDest()->getType() == IceType_i64) {
4572           IndexInst = VMetadata->getSingleDefinition(Var);
4573         }
4574       }
4575     }
4576   }
4577 
4578   if (IndexInst->getSrcSize() < 2)
4579     return nullptr;
4580   if (auto *ArithInst = llvm::dyn_cast<InstArithmetic>(IndexInst)) {
4581     if (auto *Var = llvm::dyn_cast<Variable>(ArithInst->getSrc(0))) {
4582       if (auto *Const =
4583               llvm::dyn_cast<ConstantInteger32>(ArithInst->getSrc(1))) {
4584         if (VMetadata->isMultiDef(Var) || Const->getType() != IceType_i32)
4585           return nullptr;
4586         switch (ArithInst->getOp()) {
4587         default:
4588           return nullptr;
4589         case InstArithmetic::Mul: {
4590           uint32_t Mult = Const->getValue();
4591           uint32_t LogMult;
4592           switch (Mult) {
4593           case 1:
4594             LogMult = 0;
4595             break;
4596           case 2:
4597             LogMult = 1;
4598             break;
4599           case 4:
4600             LogMult = 2;
4601             break;
4602           case 8:
4603             LogMult = 3;
4604             break;
4605           default:
4606             return nullptr;
4607           }
4608           if (*Shift + LogMult <= 3) {
4609             *Index = Var;
4610             *Shift += LogMult;
4611             return IndexInst;
4612           }
4613         }
4614         case InstArithmetic::Shl: {
4615           uint32_t ShiftAmount = Const->getValue();
4616           switch (ShiftAmount) {
4617           case 0:
4618           case 1:
4619           case 2:
4620           case 3:
4621             break;
4622           default:
4623             return nullptr;
4624           }
4625           if (*Shift + ShiftAmount <= 3) {
4626             *Index = Var;
4627             *Shift += ShiftAmount;
4628             return IndexInst;
4629           }
4630         }
4631         }
4632       }
4633     }
4634   }
4635   return nullptr;
4636 }
4637 
4638 const Inst *AddressOptimizer::matchOffsetIndexOrBase(
4639     Variable **IndexOrBase, const uint16_t Shift,
4640     ConstantRelocatable **Relocatable, int32_t *Offset) {
4641   // Base is Base=Var+Const || Base is Base=Const+Var ==>
4642   //   set Base=Var, Offset+=Const
4643   // Base is Base=Var-Const ==>
4644   //   set Base=Var, Offset-=Const
4645   // Index is Index=Var+Const ==>
4646   //   set Index=Var, Offset+=(Const<<Shift)
4647   // Index is Index=Const+Var ==>
4648   //   set Index=Var, Offset+=(Const<<Shift)
4649   // Index is Index=Var-Const ==>
4650   //   set Index=Var, Offset-=(Const<<Shift)
4651   // Treat Index=Var Or Const as Index=Var + Const
4652   //    when Var = Var' << N and log2(Const) <= N
4653   // or when Var = (2^M) * (2^N) and log2(Const) <= (M+N)
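  // Example of the Or case: if %p = shl i32 %q, 4 and the address is
  // %p | 7, then ZeroesAvailable = 4 and ZeroesNeeded = log2(7) + 1 = 3,
  // so the Or is treated as an Add: the base stays %p and Offset += 7.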
4654 
4655   if (*IndexOrBase == nullptr) {
4656     return nullptr;
4657   }
4658   const Inst *Definition = VMetadata->getSingleDefinition(*IndexOrBase);
4659   if (Definition == nullptr) {
4660     return nullptr;
4661   }
4662   assert(!VMetadata->isMultiDef(*IndexOrBase));
4663   if (auto *ArithInst = llvm::dyn_cast<const InstArithmetic>(Definition)) {
4664     switch (ArithInst->getOp()) {
4665     case InstArithmetic::Add:
4666     case InstArithmetic::Sub:
4667     case InstArithmetic::Or:
4668       break;
4669     default:
4670       return nullptr;
4671     }
4672 
4673     Operand *Src0 = ArithInst->getSrc(0);
4674     Operand *Src1 = ArithInst->getSrc(1);
4675     auto *Var0 = llvm::dyn_cast<Variable>(Src0);
4676     auto *Var1 = llvm::dyn_cast<Variable>(Src1);
4677     auto *Const0 = llvm::dyn_cast<ConstantInteger32>(Src0);
4678     auto *Const1 = llvm::dyn_cast<ConstantInteger32>(Src1);
4679     auto *Reloc0 = llvm::dyn_cast<ConstantRelocatable>(Src0);
4680     auto *Reloc1 = llvm::dyn_cast<ConstantRelocatable>(Src1);
4681 
4682     bool IsAdd = false;
4683     if (ArithInst->getOp() == InstArithmetic::Or) {
4684       Variable *Var = nullptr;
4685       ConstantInteger32 *Const = nullptr;
4686       if (Var0 && Const1) {
4687         Var = Var0;
4688         Const = Const1;
4689       } else if (Const0 && Var1) {
4690         Var = Var1;
4691         Const = Const0;
4692       } else {
4693         return nullptr;
4694       }
4695       auto *VarDef =
4696           llvm::dyn_cast<InstArithmetic>(VMetadata->getSingleDefinition(Var));
4697       if (VarDef == nullptr)
4698         return nullptr;
4699 
4700       SizeT ZeroesAvailable = 0;
4701       if (VarDef->getOp() == InstArithmetic::Shl) {
4702         if (auto *ConstInt =
4703                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
4704           ZeroesAvailable = ConstInt->getValue();
4705         }
4706       } else if (VarDef->getOp() == InstArithmetic::Mul) {
4707         SizeT PowerOfTwo = 0;
4708         if (auto *MultConst =
4709                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(0))) {
4710           if (llvm::isPowerOf2_32(MultConst->getValue())) {
4711             PowerOfTwo += MultConst->getValue();
4712           }
4713         }
4714         if (auto *MultConst =
4715                 llvm::dyn_cast<ConstantInteger32>(VarDef->getSrc(1))) {
4716           if (llvm::isPowerOf2_32(MultConst->getValue())) {
4717             PowerOfTwo += MultConst->getValue();
4718           }
4719         }
4720         ZeroesAvailable = llvm::Log2_32(PowerOfTwo) + 1;
4721       }
4722       SizeT ZeroesNeeded = llvm::Log2_32(Const->getValue()) + 1;
4723       if (ZeroesNeeded == 0 || ZeroesNeeded > ZeroesAvailable)
4724         return nullptr;
4725       IsAdd = true; // treat it as an add if the above conditions hold
4726     } else {
4727       IsAdd = ArithInst->getOp() == InstArithmetic::Add;
4728     }
4729 
4730     Variable *NewIndexOrBase = nullptr;
4731     int32_t NewOffset = 0;
4732     ConstantRelocatable *NewRelocatable = *Relocatable;
4733     if (Var0 && Var1)
4734       // TODO(sehr): merge base/index splitting into here.
4735       return nullptr;
4736     if (!IsAdd && Var1)
4737       return nullptr;
4738     if (Var0)
4739       NewIndexOrBase = Var0;
4740     else if (Var1)
4741       NewIndexOrBase = Var1;
4742     // Don't know how to add/subtract two relocatables.
4743     if ((*Relocatable && (Reloc0 || Reloc1)) || (Reloc0 && Reloc1))
4744       return nullptr;
4745     // Don't know how to subtract a relocatable.
4746     if (!IsAdd && Reloc1)
4747       return nullptr;
4748     // Incorporate ConstantRelocatables.
4749     if (Reloc0)
4750       NewRelocatable = Reloc0;
4751     else if (Reloc1)
4752       NewRelocatable = Reloc1;
4753     // Compute the updated constant offset.
4754     if (Const0) {
4755       const int32_t MoreOffset =
4756           IsAdd ? Const0->getValue() : -Const0->getValue();
4757       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
4758         return nullptr;
4759       NewOffset += MoreOffset;
4760     }
4761     if (Const1) {
4762       const int32_t MoreOffset =
4763           IsAdd ? Const1->getValue() : -Const1->getValue();
4764       if (Utils::WouldOverflowAdd(*Offset + NewOffset, MoreOffset))
4765         return nullptr;
4766       NewOffset += MoreOffset;
4767     }
4768     if (Utils::WouldOverflowAdd(*Offset, NewOffset << Shift))
4769       return nullptr;
4770     *IndexOrBase = NewIndexOrBase;
4771     *Offset += (NewOffset << Shift);
4772     // Shift is always zero if this is called with the base
4773     *Relocatable = NewRelocatable;
4774     return Definition;
4775   }
4776   return nullptr;
4777 }
4778 
4779 X86OperandMem *TargetX8664::computeAddressOpt(const Inst *Instr, Type MemType,
4780                                               Operand *Addr) {
4781   Func->resetCurrentNode();
4782   if (Func->isVerbose(IceV_AddrOpt)) {
4783     OstreamLocker L(Func->getContext());
4784     Ostream &Str = Func->getContext()->getStrDump();
4785     Str << "\nStarting computeAddressOpt for instruction:\n  ";
4786     Instr->dumpDecorated(Func);
4787   }
4788 
4789   OptAddr NewAddr;
4790   NewAddr.Base = llvm::dyn_cast<Variable>(Addr);
4791   if (NewAddr.Base == nullptr)
4792     return nullptr;
4793 
4794   // If the Base has more than one use or is live across multiple blocks, then
4795   // don't go further. Alternatively (?), never consider a transformation that
4796   // would change a variable that is currently *not* live across basic block
4797   // boundaries into one that *is*.
4798   if (!getFlags().getLoopInvariantCodeMotion()) {
4799     // Need multi block address opt when licm is enabled.
4800     // Might make sense to restrict to current node and loop header.
4801     if (Func->getVMetadata()->isMultiBlock(
4802             NewAddr.Base) /* || Base->getUseCount() > 1*/)
4803       return nullptr;
4804   }
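  // The loop below repeatedly pattern-matches the operands feeding the
  // address until it reaches a fixed point. A sketch of a typical result:
  // for %t = shl i64 %i, 2 and %a = add i64 %base, %t, a load from %a folds
  // into the single operand [%base + %i*4] via matchCombinedBaseIndex
  // followed by matchShiftedIndex.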
4805   AddressOptimizer AddrOpt(Func);
4806   const bool MockBounds = getFlags().getMockBoundsCheck();
4807   const Inst *Reason = nullptr;
4808   bool AddressWasOptimized = false;
4809   // The following unnamed struct identifies the address mode formation steps
4810   // that could potentially create an invalid memory operand (i.e., no free
4811   // slots for RebasePtr.) We add all those variables to this struct so that we
4812   // can use memset() to reset all members to false.
4813   struct {
4814     bool AssignBase = false;
4815     bool AssignIndex = false;
4816     bool OffsetFromBase = false;
4817     bool OffsetFromIndex = false;
4818     bool CombinedBaseIndex = false;
4819   } Skip;
4820   // NewAddrCheckpoint is used to rollback the address being formed in case an
4821   // invalid address is formed.
4822   OptAddr NewAddrCheckpoint;
4823   Reason = Instr;
4824   do {
4825     if (Reason) {
4826       AddrOpt.dumpAddressOpt(NewAddr.Relocatable, NewAddr.Offset, NewAddr.Base,
4827                              NewAddr.Index, NewAddr.Shift, Reason);
4828       AddressWasOptimized = true;
4829       Reason = nullptr;
4830       memset(reinterpret_cast<void *>(&Skip), 0, sizeof(Skip));
4831     }
4832 
4833     NewAddrCheckpoint = NewAddr;
4834 
4835     // Update Base and Index to follow through assignments to definitions.
4836     if (!Skip.AssignBase &&
4837         (Reason = AddrOpt.matchAssign(&NewAddr.Base, &NewAddr.Relocatable,
4838                                       &NewAddr.Offset))) {
4839       // Assignments of Base from a Relocatable or ConstantInt32 can result
4840       // in Base becoming nullptr.  To avoid code duplication in this loop we
4841       // prefer that Base be non-nullptr if possible.
4842       if ((NewAddr.Base == nullptr) && (NewAddr.Index != nullptr) &&
4843           NewAddr.Shift == 0) {
4844         std::swap(NewAddr.Base, NewAddr.Index);
4845       }
4846       continue;
4847     }
4848     if (!Skip.AssignIndex &&
4849         (Reason = AddrOpt.matchAssign(&NewAddr.Index, &NewAddr.Relocatable,
4850                                       &NewAddr.Offset))) {
4851       continue;
4852     }
4853 
4854     if (!MockBounds) {
4855       // Transition from:
4856       //   <Relocatable + Offset>(Base) to
4857       //   <Relocatable + Offset>(Base, Index)
4858       if (!Skip.CombinedBaseIndex &&
4859           (Reason = AddrOpt.matchCombinedBaseIndex(
4860                &NewAddr.Base, &NewAddr.Index, &NewAddr.Shift))) {
4861         continue;
4862       }
4863 
4864       // Recognize multiply/shift and update Shift amount.
4865       // Index becomes Index=Var<<Const && Const+Shift<=3 ==>
4866       //   Index=Var, Shift+=Const
4867       // Index becomes Index=Const*Var && log2(Const)+Shift<=3 ==>
4868       //   Index=Var, Shift+=log2(Const)
4869       if ((Reason =
4870                AddrOpt.matchShiftedIndex(&NewAddr.Index, &NewAddr.Shift))) {
4871         continue;
4872       }
4873 
4874       // If Shift is zero, the choice of Base and Index was purely arbitrary.
4875       // Recognize multiply/shift and set Shift amount.
4876       // Shift==0 && Base is Base=Var*Const && log2(Const)+Shift<=3 ==>
4877       //   swap(Index,Base)
4878       // Similar for Base=Const*Var and Base=Var<<Const
4879       if (NewAddr.Shift == 0 &&
4880           (Reason = AddrOpt.matchShiftedIndex(&NewAddr.Base, &NewAddr.Shift))) {
4881         std::swap(NewAddr.Base, NewAddr.Index);
4882         continue;
4883       }
4884     }
4885 
4886     // Update Offset to reflect additions/subtractions with constants and
4887     // relocatables.
4888     // TODO: consider overflow issues with respect to Offset.
4889     if (!Skip.OffsetFromBase && (Reason = AddrOpt.matchOffsetIndexOrBase(
4890                                      &NewAddr.Base, /*Shift =*/0,
4891                                      &NewAddr.Relocatable, &NewAddr.Offset))) {
4892       continue;
4893     }
4894     if (!Skip.OffsetFromIndex && (Reason = AddrOpt.matchOffsetIndexOrBase(
4895                                       &NewAddr.Index, NewAddr.Shift,
4896                                       &NewAddr.Relocatable, &NewAddr.Offset))) {
4897       continue;
4898     }
4899 
4900     break;
4901   } while (Reason);
4902 
4903   if (!AddressWasOptimized) {
4904     return nullptr;
4905   }
4906 
4907   Constant *OffsetOp = nullptr;
4908   if (NewAddr.Relocatable == nullptr) {
4909     OffsetOp = Ctx->getConstantInt32(NewAddr.Offset);
4910   } else {
4911     OffsetOp =
4912         Ctx->getConstantSym(NewAddr.Relocatable->getOffset() + NewAddr.Offset,
4913                             NewAddr.Relocatable->getName());
4914   }
4915   // Vanilla ICE load instructions should not use the segment registers, and
4916   // computeAddressOpt only works at the level of Variables and Constants, not
4917   // other X86OperandMem, so there should be no mention of segment
4918   // registers there either.
4919   static constexpr auto SegmentReg =
4920       X86OperandMem::SegmentRegisters::DefaultSegment;
4921 
4922   return X86OperandMem::create(Func, MemType, NewAddr.Base, OffsetOp,
4923                                NewAddr.Index, NewAddr.Shift, SegmentReg);
4924 }
4925 
4926 /// Add a mock bounds check on the memory address before using it as a load or
4927 /// store operand.  The basic idea is that given a memory operand [reg], we
4928 /// would first add bounds-check code something like:
4929 ///
4930 ///   cmp reg, <lb>
4931 ///   jl out_of_line_error
4932 ///   cmp reg, <ub>
4933 ///   jg out_of_line_error
4934 ///
4935 /// In reality, the specific code will depend on how <lb> and <ub> are
4936 /// represented, e.g. an immediate, a global, or a function argument.
4937 ///
4938 /// As such, we need to enforce that the memory operand does not have the form
4939 /// [reg1+reg2], because then there is no simple cmp instruction that would
4940 /// suffice.  However, we consider [reg+offset] to be OK because the offset is
4941 /// usually small, and so <ub> could have a safety buffer built in and then we
4942 /// could instead branch to a custom out_of_line_error that does the precise
4943 /// check and jumps back if it turns out OK.
4944 ///
4945 /// For the purpose of mocking the bounds check, we'll do something like this:
4946 ///
4947 ///   cmp reg, 0
4948 ///   je label
4949 ///   cmp reg, 1
4950 ///   je label
4951 ///   label:
4952 ///
4953 /// Also note that we don't need to add a bounds check to a dereference of a
4954 /// simple global variable address.
4955 
4956 void TargetX8664::doMockBoundsCheck(Operand *Opnd) {
4957   if (!getFlags().getMockBoundsCheck())
4958     return;
4959   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd)) {
4960     if (Mem->getIndex()) {
4961       llvm::report_fatal_error("doMockBoundsCheck: Opnd contains index reg");
4962     }
4963     Opnd = Mem->getBase();
4964   }
4965   // At this point Opnd could be nullptr, a Variable, a Constant, or perhaps
4966   // something else.  We only care if it is a Variable.
4967   auto *Var = llvm::dyn_cast_or_null<Variable>(Opnd);
4968   if (Var == nullptr)
4969     return;
4970   // We use lowerStore() to copy out-args onto the stack.  This creates a memory
4971   // operand with the stack pointer as the base register.  Don't do bounds
4972   // checks on that.
4973   if (Var->getRegNum() == getStackReg())
4974     return;
4975 
4976   auto *Label = InstX86Label::create(Func, this);
4977   _cmp(Opnd, Ctx->getConstantZero(IceType_i32));
4978   _br(CondX86::Br_e, Label);
4979   _cmp(Opnd, Ctx->getConstantInt32(1));
4980   _br(CondX86::Br_e, Label);
4981   Context.insert(Label);
4982 }
4983 
4984 void TargetX8664::lowerLoad(const InstLoad *Load) {
4985   // A Load instruction can be treated the same as an Assign instruction, after
4986   // the source operand is transformed into an X86OperandMem operand.  Note that
4987   // the address mode optimization already creates an X86OperandMem operand, so
4988   // it doesn't need another level of transformation.
4989   Variable *DestLoad = Load->getDest();
4990   Type Ty = DestLoad->getType();
4991   Operand *Src0 = formMemoryOperand(Load->getLoadAddress(), Ty);
4992   doMockBoundsCheck(Src0);
4993   auto *Assign = InstAssign::create(Func, DestLoad, Src0);
4994   lowerAssign(Assign);
4995 }
4996 
4997 void TargetX8664::doAddressOptOther() {
4998   // Inverts some Icmp instructions, which helps doAddressOptLoad later.
4999   // TODO(manasijm): Refactor to unify the conditions for Var0 and Var1
5000   Inst *Instr = iteratorToInst(Context.getCur());
5001   auto *VMetadata = Func->getVMetadata();
5002   if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Instr)) {
5003     if (llvm::isa<Constant>(Icmp->getSrc(0)) ||
5004         llvm::isa<Constant>(Icmp->getSrc(1)))
5005       return;
5006     auto *Var0 = llvm::dyn_cast<Variable>(Icmp->getSrc(0));
5007     if (Var0 == nullptr)
5008       return;
5009     if (!VMetadata->isTracked(Var0))
5010       return;
5011     auto *Op0Def = VMetadata->getFirstDefinitionSingleBlock(Var0);
5012     if (Op0Def == nullptr || !llvm::isa<InstLoad>(Op0Def))
5013       return;
5014     if (VMetadata->getLocalUseNode(Var0) != Context.getNode())
5015       return;
5016 
5017     auto *Var1 = llvm::dyn_cast<Variable>(Icmp->getSrc(1));
5018     if (Var1 != nullptr && VMetadata->isTracked(Var1)) {
5019       auto *Op1Def = VMetadata->getFirstDefinitionSingleBlock(Var1);
5020       if (Op1Def != nullptr && !VMetadata->isMultiBlock(Var1) &&
5021           llvm::isa<InstLoad>(Op1Def)) {
5022         return; // Both are loads
5023       }
5024     }
5025     Icmp->reverseConditionAndOperands();
5026   }
5027 }
5028 
5029 void TargetX8664::doAddressOptLoad() {
5030   Inst *Instr = iteratorToInst(Context.getCur());
5031   Operand *Addr = Instr->getSrc(0);
5032   Variable *Dest = Instr->getDest();
5033   if (auto *OptAddr = computeAddressOpt(Instr, Dest->getType(), Addr)) {
5034     Instr->setDeleted();
5035     Context.insert<InstLoad>(Dest, OptAddr);
5036   }
5037 }
5038 
5039 void TargetX8664::doAddressOptLoadSubVector() {
5040   auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
5041   Operand *Addr = Intrinsic->getArg(0);
5042   Variable *Dest = Intrinsic->getDest();
5043   if (auto *OptAddr = computeAddressOpt(Intrinsic, Dest->getType(), Addr)) {
5044     Intrinsic->setDeleted();
5045     const Ice::Intrinsics::IntrinsicInfo Info = {
5046         Ice::Intrinsics::LoadSubVector, Ice::Intrinsics::SideEffects_F,
5047         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F};
5048     auto *NewLoad = Context.insert<InstIntrinsic>(2, Dest, Info);
5049     NewLoad->addArg(OptAddr);
5050     NewLoad->addArg(Intrinsic->getArg(1));
5051   }
5052 }
5053 
5054 void TargetX8664::lowerPhi(const InstPhi * /*Instr*/) {
5055   Func->setError("Phi found in regular instruction list");
5056 }
5057 
5058 void TargetX8664::lowerRet(const InstRet *Instr) {
5059   Variable *Reg = nullptr;
5060   if (Instr->hasRetValue()) {
5061     Operand *RetValue = legalize(Instr->getRetValue());
5062     const Type ReturnType = RetValue->getType();
5063     assert(isVectorType(ReturnType) || isScalarFloatingType(ReturnType) ||
5064            (ReturnType == IceType_i32) || (ReturnType == IceType_i64));
5065     Reg = moveReturnValueToRegister(RetValue, ReturnType);
5066   }
5067   // Add a ret instruction even if sandboxing is enabled, because addEpilog
5068   // explicitly looks for a ret instruction as a marker for where to insert the
5069   // frame removal instructions.
5070   _ret(Reg);
5071   // Add a fake use of esp to make sure esp stays alive for the entire
5072   // function. Otherwise post-call esp adjustments get dead-code eliminated.
5073   keepEspLiveAtExit();
5074 }
5075 
5076 inline uint32_t makePshufdMask(SizeT Index0, SizeT Index1, SizeT Index2,
5077                                SizeT Index3) {
5078   const SizeT Mask = (Index0 & 0x3) | ((Index1 & 0x3) << 2) |
5079                      ((Index2 & 0x3) << 4) | ((Index3 & 0x3) << 6);
5080   assert(Mask < 256);
5081   return Mask;
5082 }
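// Worked example for makePshufdMask above: makePshufdMask(2, 3, 2, 3) ==
// 0xEE, which as a pshufd immediate replicates the high two dwords of the
// source into both halves of the destination.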
5083 
5084 Variable *TargetX8664::lowerShuffleVector_AllFromSameSrc(
5085     Operand *Src, SizeT Index0, SizeT Index1, SizeT Index2, SizeT Index3) {
5086   constexpr SizeT SrcBit = 1 << 2;
5087   assert((Index0 & SrcBit) == (Index1 & SrcBit));
5088   assert((Index0 & SrcBit) == (Index2 & SrcBit));
5089   assert((Index0 & SrcBit) == (Index3 & SrcBit));
5090   (void)SrcBit;
5091 
5092   const Type SrcTy = Src->getType();
5093   auto *T = makeReg(SrcTy);
5094   auto *SrcRM = legalize(Src, Legal_Reg | Legal_Mem);
5095   auto *Mask =
5096       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5097   _pshufd(T, SrcRM, Mask);
5098   return T;
5099 }
5100 
5101 Variable *
5102 TargetX8664::lowerShuffleVector_TwoFromSameSrc(Operand *Src0, SizeT Index0,
5103                                                SizeT Index1, Operand *Src1,
5104                                                SizeT Index2, SizeT Index3) {
5105   constexpr SizeT SrcBit = 1 << 2;
5106   assert((Index0 & SrcBit) == (Index1 & SrcBit) || (Index1 == IGNORE_INDEX));
5107   assert((Index2 & SrcBit) == (Index3 & SrcBit) || (Index3 == IGNORE_INDEX));
5108   (void)SrcBit;
5109 
5110   const Type SrcTy = Src0->getType();
5111   assert(Src1->getType() == SrcTy);
5112   auto *T = makeReg(SrcTy);
5113   auto *Src0R = legalizeToReg(Src0);
5114   auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5115   auto *Mask =
5116       Ctx->getConstantInt32(makePshufdMask(Index0, Index1, Index2, Index3));
5117   _movp(T, Src0R);
5118   _shufps(T, Src1RM, Mask);
5119   return T;
5120 }
5121 
5122 Variable *TargetX8664::lowerShuffleVector_UnifyFromDifferentSrcs(Operand *Src0,
5123                                                                  SizeT Index0,
5124                                                                  Operand *Src1,
5125                                                                  SizeT Index1) {
5126   return lowerShuffleVector_TwoFromSameSrc(Src0, Index0, IGNORE_INDEX, Src1,
5127                                            Index1, IGNORE_INDEX);
5128 }
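// A note on the helper above: going through TwoFromSameSrc with IGNORE_INDEX
// leaves Src0's element in one selectable lane and Src1's element in another
// (the lanes named by UNIFIED_INDEX_0 and UNIFIED_INDEX_1), so later steps
// can shuffle both picks out of a single register.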
5129 
5130 inline SizeT makeSrcSwitchMask(SizeT Index0, SizeT Index1, SizeT Index2,
5131                                SizeT Index3) {
5132   constexpr SizeT SrcBit = 1 << 2;
5133   const SizeT Index0Bits = ((Index0 & SrcBit) == 0) ? 0 : (1 << 0);
5134   const SizeT Index1Bits = ((Index1 & SrcBit) == 0) ? 0 : (1 << 1);
5135   const SizeT Index2Bits = ((Index2 & SrcBit) == 0) ? 0 : (1 << 2);
5136   const SizeT Index3Bits = ((Index3 & SrcBit) == 0) ? 0 : (1 << 3);
5137   return Index0Bits | Index1Bits | Index2Bits | Index3Bits;
5138 }
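// Worked example for makeSrcSwitchMask above: for a 4-element shuffle with
// indexes (0, 5, 1, 4), elements 1 and 3 come from the second source (SrcBit
// set), so the mask is 0b1010 and the shuffle dispatches to
// CASE_SRCS_IN(0, 1, 0, 1).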
5139 
5140 GlobalString TargetX8664::lowerShuffleVector_NewMaskName() {
5141   GlobalString FuncName = Func->getFunctionName();
5142   const SizeT Id = PshufbMaskCount++;
5143   if (!BuildDefs::dump() || !FuncName.hasStdString()) {
5144     return GlobalString::createWithString(
5145         Ctx,
5146         "$PS" + std::to_string(FuncName.getID()) + "_" + std::to_string(Id));
5147   }
5148   return GlobalString::createWithString(
5149       Ctx, "Pshufb$" + Func->getFunctionName() + "$" + std::to_string(Id));
5150 }
5151 
5152 ConstantRelocatable *TargetX8664::lowerShuffleVector_CreatePshufbMask(
5153     int8_t Idx0, int8_t Idx1, int8_t Idx2, int8_t Idx3, int8_t Idx4,
5154     int8_t Idx5, int8_t Idx6, int8_t Idx7, int8_t Idx8, int8_t Idx9,
5155     int8_t Idx10, int8_t Idx11, int8_t Idx12, int8_t Idx13, int8_t Idx14,
5156     int8_t Idx15) {
5157   static constexpr uint8_t NumElements = 16;
5158   const char Initializer[NumElements] = {
5159       Idx0, Idx1, Idx2,  Idx3,  Idx4,  Idx5,  Idx6,  Idx7,
5160       Idx8, Idx9, Idx10, Idx11, Idx12, Idx13, Idx14, Idx15,
5161   };
5162 
5163   static constexpr Type V4VectorType = IceType_v4i32;
5164   const uint32_t MaskAlignment = typeWidthInBytesOnStack(V4VectorType);
5165   auto *Mask = VariableDeclaration::create(Func->getGlobalPool());
5166   GlobalString MaskName = lowerShuffleVector_NewMaskName();
5167   Mask->setIsConstant(true);
5168   Mask->addInitializer(VariableDeclaration::DataInitializer::create(
5169       Func->getGlobalPool(), Initializer, NumElements));
5170   Mask->setName(MaskName);
5171   // Mask needs to be 16-byte aligned, or pshufb will seg fault.
5172   Mask->setAlignment(MaskAlignment);
5173   Func->addGlobal(Mask);
5174 
5175   constexpr RelocOffsetT Offset = 0;
5176   return llvm::cast<ConstantRelocatable>(Ctx->getConstantSym(Offset, MaskName));
5177 }
5178 
5179 void TargetX8664::lowerShuffleVector_UsingPshufb(
5180     Variable *Dest, Operand *Src0, Operand *Src1, int8_t Idx0, int8_t Idx1,
5181     int8_t Idx2, int8_t Idx3, int8_t Idx4, int8_t Idx5, int8_t Idx6,
5182     int8_t Idx7, int8_t Idx8, int8_t Idx9, int8_t Idx10, int8_t Idx11,
5183     int8_t Idx12, int8_t Idx13, int8_t Idx14, int8_t Idx15) {
5184   const Type DestTy = Dest->getType();
5185   static constexpr bool NotRebased = false;
5186   static constexpr Variable *NoBase = nullptr;
5187   // We use void for the memory operand instead of DestTy because using the
5188   // latter causes a validation failure: the X86 Inst layer complains that
5189   // vector mem operands could be under-aligned. Thus, using void we avoid the
5190   // validation error. Note that the mask global declaration is aligned, so it
5191   // can be used as an XMM mem operand.
5192   static constexpr Type MaskType = IceType_void;
5193 #define IDX_IN_SRC(N, S)                                                       \
5194   ((((N) & (1 << 4)) == (S << 4)) ? ((N)&0xf) : CLEAR_ALL_BITS)
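  // IDX_IN_SRC keeps the low nibble of byte index N when bit 4 of N says the
  // byte comes from source S, and otherwise yields CLEAR_ALL_BITS --
  // presumably a mask byte with its high bit set, so pshufb zeroes that
  // lane. The complementary zeroed lanes of the two shuffled temporaries are
  // then merged with _por below.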
5195   auto *Mask0M = X86OperandMem::create(
5196       Func, MaskType, NoBase,
5197       lowerShuffleVector_CreatePshufbMask(
5198           IDX_IN_SRC(Idx0, 0), IDX_IN_SRC(Idx1, 0), IDX_IN_SRC(Idx2, 0),
5199           IDX_IN_SRC(Idx3, 0), IDX_IN_SRC(Idx4, 0), IDX_IN_SRC(Idx5, 0),
5200           IDX_IN_SRC(Idx6, 0), IDX_IN_SRC(Idx7, 0), IDX_IN_SRC(Idx8, 0),
5201           IDX_IN_SRC(Idx9, 0), IDX_IN_SRC(Idx10, 0), IDX_IN_SRC(Idx11, 0),
5202           IDX_IN_SRC(Idx12, 0), IDX_IN_SRC(Idx13, 0), IDX_IN_SRC(Idx14, 0),
5203           IDX_IN_SRC(Idx15, 0)),
5204       NotRebased);
5205 
5206   auto *T0 = makeReg(DestTy);
5207   auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5208   _movp(T0, Src0RM);
5209 
5210   _pshufb(T0, Mask0M);
5211 
5212   if (Idx0 >= 16 || Idx1 >= 16 || Idx2 >= 16 || Idx3 >= 16 || Idx4 >= 16 ||
5213       Idx5 >= 16 || Idx6 >= 16 || Idx7 >= 16 || Idx8 >= 16 || Idx9 >= 16 ||
5214       Idx10 >= 16 || Idx11 >= 16 || Idx12 >= 16 || Idx13 >= 16 || Idx14 >= 16 ||
5215       Idx15 >= 16) {
5216     auto *Mask1M = X86OperandMem::create(
5217         Func, MaskType, NoBase,
5218         lowerShuffleVector_CreatePshufbMask(
5219             IDX_IN_SRC(Idx0, 1), IDX_IN_SRC(Idx1, 1), IDX_IN_SRC(Idx2, 1),
5220             IDX_IN_SRC(Idx3, 1), IDX_IN_SRC(Idx4, 1), IDX_IN_SRC(Idx5, 1),
5221             IDX_IN_SRC(Idx6, 1), IDX_IN_SRC(Idx7, 1), IDX_IN_SRC(Idx8, 1),
5222             IDX_IN_SRC(Idx9, 1), IDX_IN_SRC(Idx10, 1), IDX_IN_SRC(Idx11, 1),
5223             IDX_IN_SRC(Idx12, 1), IDX_IN_SRC(Idx13, 1), IDX_IN_SRC(Idx14, 1),
5224             IDX_IN_SRC(Idx15, 1)),
5225         NotRebased);
5226 #undef IDX_IN_SRC
5227     auto *T1 = makeReg(DestTy);
5228     auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5229     _movp(T1, Src1RM);
5230     _pshufb(T1, Mask1M);
5231     _por(T0, T1);
5232   }
5233 
5234   _movp(Dest, T0);
5235 }
5236 
5237 void TargetX8664::lowerShuffleVector(const InstShuffleVector *Instr) {
5238   auto *Dest = Instr->getDest();
5239   const Type DestTy = Dest->getType();
5240   auto *Src0 = Instr->getSrc(0);
5241   auto *Src1 = Instr->getSrc(1);
5242   const SizeT NumElements = typeNumElements(DestTy);
5243 
5244   auto *T = makeReg(DestTy);
5245 
5246   switch (DestTy) {
5247   default:
5248     llvm::report_fatal_error("Unexpected vector type.");
5249   case IceType_v16i1:
5250   case IceType_v16i8: {
5251     static constexpr SizeT ExpectedNumElements = 16;
5252     assert(ExpectedNumElements == Instr->getNumIndexes());
5253     (void)ExpectedNumElements;
5254 
5255     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
5256       auto *T = makeReg(DestTy);
5257       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5258       _movp(T, Src0RM);
5259       _punpckl(T, Src0RM);
5260       _movp(Dest, T);
5261       return;
5262     }
5263 
5264     if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
5265                           23)) {
5266       auto *T = makeReg(DestTy);
5267       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5268       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5269       _movp(T, Src0RM);
5270       _punpckl(T, Src1RM);
5271       _movp(Dest, T);
5272       return;
5273     }
5274 
5275     if (Instr->indexesAre(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
5276                           15, 15)) {
5277       auto *T = makeReg(DestTy);
5278       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5279       _movp(T, Src0RM);
5280       _punpckh(T, Src0RM);
5281       _movp(Dest, T);
5282       return;
5283     }
5284 
5285     if (Instr->indexesAre(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30,
5286                           15, 31)) {
5287       auto *T = makeReg(DestTy);
5288       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5289       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5290       _movp(T, Src0RM);
5291       _punpckh(T, Src1RM);
5292       _movp(Dest, T);
5293       return;
5294     }
5295 
5296     if (InstructionSet < SSE4_1) {
5297       // TODO(jpp): figure out how to lower with sse2.
5298       break;
5299     }
5300 
5301     const SizeT Index0 = Instr->getIndexValue(0);
5302     const SizeT Index1 = Instr->getIndexValue(1);
5303     const SizeT Index2 = Instr->getIndexValue(2);
5304     const SizeT Index3 = Instr->getIndexValue(3);
5305     const SizeT Index4 = Instr->getIndexValue(4);
5306     const SizeT Index5 = Instr->getIndexValue(5);
5307     const SizeT Index6 = Instr->getIndexValue(6);
5308     const SizeT Index7 = Instr->getIndexValue(7);
5309     const SizeT Index8 = Instr->getIndexValue(8);
5310     const SizeT Index9 = Instr->getIndexValue(9);
5311     const SizeT Index10 = Instr->getIndexValue(10);
5312     const SizeT Index11 = Instr->getIndexValue(11);
5313     const SizeT Index12 = Instr->getIndexValue(12);
5314     const SizeT Index13 = Instr->getIndexValue(13);
5315     const SizeT Index14 = Instr->getIndexValue(14);
5316     const SizeT Index15 = Instr->getIndexValue(15);
5317 
5318     lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
5319                                    Index3, Index4, Index5, Index6, Index7,
5320                                    Index8, Index9, Index10, Index11, Index12,
5321                                    Index13, Index14, Index15);
5322     return;
5323   }
5324   case IceType_v8i1:
5325   case IceType_v8i16: {
5326     static constexpr SizeT ExpectedNumElements = 8;
5327     assert(ExpectedNumElements == Instr->getNumIndexes());
5328     (void)ExpectedNumElements;
5329 
5330     if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
5331       auto *T = makeReg(DestTy);
5332       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5333       _movp(T, Src0RM);
5334       _punpckl(T, Src0RM);
5335       _movp(Dest, T);
5336       return;
5337     }
5338 
5339     if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
5340       auto *T = makeReg(DestTy);
5341       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5342       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5343       _movp(T, Src0RM);
5344       _punpckl(T, Src1RM);
5345       _movp(Dest, T);
5346       return;
5347     }
5348 
5349     if (Instr->indexesAre(4, 4, 5, 5, 6, 6, 7, 7)) {
5350       auto *T = makeReg(DestTy);
5351       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5352       _movp(T, Src0RM);
5353       _punpckh(T, Src0RM);
5354       _movp(Dest, T);
5355       return;
5356     }
5357 
5358     if (Instr->indexesAre(4, 12, 5, 13, 6, 14, 7, 15)) {
5359       auto *T = makeReg(DestTy);
5360       auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
5361       auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5362       _movp(T, Src0RM);
5363       _punpckh(T, Src1RM);
5364       _movp(Dest, T);
5365       return;
5366     }
5367 
5368     if (InstructionSet < SSE4_1) {
5369       // TODO(jpp): figure out how to lower with sse2.
5370       break;
5371     }
5372 
5373     const SizeT Index0 = Instr->getIndexValue(0);
5374     const SizeT Index1 = Instr->getIndexValue(1);
5375     const SizeT Index2 = Instr->getIndexValue(2);
5376     const SizeT Index3 = Instr->getIndexValue(3);
5377     const SizeT Index4 = Instr->getIndexValue(4);
5378     const SizeT Index5 = Instr->getIndexValue(5);
5379     const SizeT Index6 = Instr->getIndexValue(6);
5380     const SizeT Index7 = Instr->getIndexValue(7);
5381 
5382 #define TO_BYTE_INDEX(I) ((I) << 1)
5383     lowerShuffleVector_UsingPshufb(
5384         Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
5385         TO_BYTE_INDEX(Index1), TO_BYTE_INDEX(Index1) + 1, TO_BYTE_INDEX(Index2),
5386         TO_BYTE_INDEX(Index2) + 1, TO_BYTE_INDEX(Index3),
5387         TO_BYTE_INDEX(Index3) + 1, TO_BYTE_INDEX(Index4),
5388         TO_BYTE_INDEX(Index4) + 1, TO_BYTE_INDEX(Index5),
5389         TO_BYTE_INDEX(Index5) + 1, TO_BYTE_INDEX(Index6),
5390         TO_BYTE_INDEX(Index6) + 1, TO_BYTE_INDEX(Index7),
5391         TO_BYTE_INDEX(Index7) + 1);
5392 #undef TO_BYTE_INDEX
5393     return;
5394   }
5395   case IceType_v4i1:
5396   case IceType_v4i32:
5397   case IceType_v4f32: {
5398     static constexpr SizeT ExpectedNumElements = 4;
5399     assert(ExpectedNumElements == Instr->getNumIndexes());
5400     const SizeT Index0 = Instr->getIndexValue(0);
5401     const SizeT Index1 = Instr->getIndexValue(1);
5402     const SizeT Index2 = Instr->getIndexValue(2);
5403     const SizeT Index3 = Instr->getIndexValue(3);
5404     Variable *T = nullptr;
5405     switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
5406 #define CASE_SRCS_IN(S0, S1, S2, S3)                                           \
5407   case (((S0) << 0) | ((S1) << 1) | ((S2) << 2) | ((S3) << 3))
5408       CASE_SRCS_IN(0, 0, 0, 0) : {
5409         T = lowerShuffleVector_AllFromSameSrc(Src0, Index0, Index1, Index2,
5410                                               Index3);
5411       }
5412       break;
5413       CASE_SRCS_IN(0, 0, 0, 1) : {
5414         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
5415                                                                   Src1, Index3);
5416         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
5417                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5418       }
5419       break;
5420       CASE_SRCS_IN(0, 0, 1, 0) : {
5421         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
5422                                                                   Src0, Index3);
5423         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Unified,
5424                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5425       }
5426       break;
5427       CASE_SRCS_IN(0, 0, 1, 1) : {
5428         T = lowerShuffleVector_TwoFromSameSrc(Src0, Index0, Index1, Src1,
5429                                               Index2, Index3);
5430       }
5431       break;
5432       CASE_SRCS_IN(0, 1, 0, 0) : {
5433         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
5434                                                                   Src1, Index1);
5435         T = lowerShuffleVector_TwoFromSameSrc(
5436             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
5437       }
5438       break;
5439       CASE_SRCS_IN(0, 1, 0, 1) : {
5440         if (Index0 == 0 && (Index1 - ExpectedNumElements) == 0 && Index2 == 1 &&
5441             (Index3 - ExpectedNumElements) == 1) {
5442           auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
5443           auto *Src0R = legalizeToReg(Src0);
5444           T = makeReg(DestTy);
5445           _movp(T, Src0R);
5446           _punpckl(T, Src1RM);
5447         } else if (Index0 == Index2 && Index1 == Index3) {
5448           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
5449               Src0, Index0, Src1, Index1);
5450           T = lowerShuffleVector_AllFromSameSrc(
5451               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
5452               UNIFIED_INDEX_1);
5453         } else {
5454           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
5455               Src0, Index0, Src1, Index1);
5456           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
5457               Src0, Index2, Src1, Index3);
5458           T = lowerShuffleVector_TwoFromSameSrc(
5459               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
5460               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5461         }
5462       }
5463       break;
5464       CASE_SRCS_IN(0, 1, 1, 0) : {
5465         if (Index0 == Index3 && Index1 == Index2) {
5466           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
5467               Src0, Index0, Src1, Index1);
5468           T = lowerShuffleVector_AllFromSameSrc(
5469               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
5470               UNIFIED_INDEX_0);
5471         } else {
5472           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
5473               Src0, Index0, Src1, Index1);
5474           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
5475               Src1, Index2, Src0, Index3);
5476           T = lowerShuffleVector_TwoFromSameSrc(
5477               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
5478               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5479         }
5480       }
5481       break;
5482       CASE_SRCS_IN(0, 1, 1, 1) : {
5483         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index0,
5484                                                                   Src1, Index1);
5485         T = lowerShuffleVector_TwoFromSameSrc(
5486             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
5487       }
5488       break;
5489       CASE_SRCS_IN(1, 0, 0, 0) : {
5490         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
5491                                                                   Src0, Index1);
5492         T = lowerShuffleVector_TwoFromSameSrc(
5493             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src0, Index2, Index3);
5494       }
5495       break;
5496       CASE_SRCS_IN(1, 0, 0, 1) : {
5497         if (Index0 == Index3 && Index1 == Index2) {
5498           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
5499               Src1, Index0, Src0, Index1);
5500           T = lowerShuffleVector_AllFromSameSrc(
5501               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_1,
5502               UNIFIED_INDEX_0);
5503         } else {
5504           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
5505               Src1, Index0, Src0, Index1);
5506           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
5507               Src0, Index2, Src1, Index3);
5508           T = lowerShuffleVector_TwoFromSameSrc(
5509               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
5510               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5511         }
5512       }
5513       break;
5514       CASE_SRCS_IN(1, 0, 1, 0) : {
5515         if ((Index0 - ExpectedNumElements) == 0 && Index1 == 0 &&
5516             (Index2 - ExpectedNumElements) == 1 && Index3 == 1) {
5517           auto *Src1RM = legalize(Src0, Legal_Reg | Legal_Mem);
5518           auto *Src0R = legalizeToReg(Src1);
5519           T = makeReg(DestTy);
5520           _movp(T, Src0R);
5521           _punpckl(T, Src1RM);
5522         } else if (Index0 == Index2 && Index1 == Index3) {
5523           auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(
5524               Src1, Index0, Src0, Index1);
5525           T = lowerShuffleVector_AllFromSameSrc(
5526               Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, UNIFIED_INDEX_0,
5527               UNIFIED_INDEX_1);
5528         } else {
5529           auto *Unified0 = lowerShuffleVector_UnifyFromDifferentSrcs(
5530               Src1, Index0, Src0, Index1);
5531           auto *Unified1 = lowerShuffleVector_UnifyFromDifferentSrcs(
5532               Src1, Index2, Src0, Index3);
5533           T = lowerShuffleVector_TwoFromSameSrc(
5534               Unified0, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Unified1,
5535               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5536         }
5537       }
5538       break;
5539       CASE_SRCS_IN(1, 0, 1, 1) : {
5540         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index0,
5541                                                                   Src0, Index1);
5542         T = lowerShuffleVector_TwoFromSameSrc(
5543             Unified, UNIFIED_INDEX_0, UNIFIED_INDEX_1, Src1, Index2, Index3);
5544       }
5545       break;
5546       CASE_SRCS_IN(1, 1, 0, 0) : {
5547         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Src0,
5548                                               Index2, Index3);
5549       }
5550       break;
5551       CASE_SRCS_IN(1, 1, 0, 1) : {
5552         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src0, Index2,
5553                                                                   Src1, Index3);
5554         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
5555                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5556       }
5557       break;
5558       CASE_SRCS_IN(1, 1, 1, 0) : {
5559         auto *Unified = lowerShuffleVector_UnifyFromDifferentSrcs(Src1, Index2,
5560                                                                   Src0, Index3);
5561         T = lowerShuffleVector_TwoFromSameSrc(Src1, Index0, Index1, Unified,
5562                                               UNIFIED_INDEX_0, UNIFIED_INDEX_1);
5563       }
5564       break;
5565       CASE_SRCS_IN(1, 1, 1, 1) : {
5566         T = lowerShuffleVector_AllFromSameSrc(Src1, Index0, Index1, Index2,
5567                                               Index3);
5568       }
5569       break;
5570 #undef CASE_SRCS_IN
5571     }
5572 
5573     assert(T != nullptr);
5574     assert(T->getType() == DestTy);
5575     _movp(Dest, T);
5576     return;
5577   } break;
5578   }
5579 
5580   // Unoptimized shuffle. Perform a series of inserts and extracts.
5581   Context.insert<InstFakeDef>(T);
5582   const Type ElementType = typeElementType(DestTy);
5583   for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
5584     auto *Index = Instr->getIndex(I);
5585     const SizeT Elem = Index->getValue();
5586     auto *ExtElmt = makeReg(ElementType);
5587     if (Elem < NumElements) {
5588       lowerExtractElement(
5589           InstExtractElement::create(Func, ExtElmt, Src0, Index));
5590     } else {
5591       lowerExtractElement(InstExtractElement::create(
5592           Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
5593     }
5594     auto *NewT = makeReg(DestTy);
5595     lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
5596                                                  Ctx->getConstantInt32(I)));
5597     T = NewT;
5598   }
5599   _movp(Dest, T);
5600 }
5601 
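// A plain-C++ reference model (illustration only; not called by the
// lowering) of the fallback loop above: destination lane I takes element
// Indexes[I] from the 2*N-element concatenation of Src0 and Src1.
static void shuffleVectorReference(const int *Src0, const int *Src1, SizeT N,
                                   const SizeT *Indexes, int *Dest) {
  for (SizeT I = 0; I < N; ++I) {
    const SizeT Elem = Indexes[I];
    // Indexes below N select from Src0; the rest select from Src1.
    Dest[I] = (Elem < N) ? Src0[Elem] : Src1[Elem - N];
  }
}
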
5602 void TargetX8664::lowerSelect(const InstSelect *Select) {
5603   Variable *Dest = Select->getDest();
5604 
5605   Operand *Condition = Select->getCondition();
5606   // Handle folding opportunities.
5607   if (const Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
5608     assert(Producer->isDeleted());
5609     switch (BoolFolding::getProducerKind(Producer)) {
5610     default:
5611       break;
5612     case BoolFolding::PK_Icmp32:
5613     case BoolFolding::PK_Icmp64: {
5614       lowerIcmpAndConsumer(llvm::cast<InstIcmp>(Producer), Select);
5615       return;
5616     }
5617     case BoolFolding::PK_Fcmp: {
5618       lowerFcmpAndConsumer(llvm::cast<InstFcmp>(Producer), Select);
5619       return;
5620     }
5621     }
5622   }
5623 
5624   if (isVectorType(Dest->getType())) {
5625     lowerSelectVector(Select);
5626     return;
5627   }
5628 
5629   Operand *CmpResult = legalize(Condition, Legal_Reg | Legal_Mem);
5630   Operand *Zero = Ctx->getConstantZero(IceType_i32);
5631   _cmp(CmpResult, Zero);
5632   Operand *SrcT = Select->getTrueOperand();
5633   Operand *SrcF = Select->getFalseOperand();
5634   const BrCond Cond = CondX86::Br_ne;
5635   lowerSelectMove(Dest, Cond, SrcT, SrcF);
5636 }
5637 
5638 void TargetX8664::lowerSelectMove(Variable *Dest, BrCond Cond, Operand *SrcT,
5639                                   Operand *SrcF) {
5640   Type DestTy = Dest->getType();
5641   if (typeWidthInBytes(DestTy) == 1 || isFloatingType(DestTy)) {
5642     // The cmov instruction doesn't allow 8-bit or FP operands, so we need
5643     // explicit control flow.
5644     // d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
5645     auto *Label = InstX86Label::create(Func, this);
5646     SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
5647     _mov(Dest, SrcT);
5648     _br(Cond, Label);
5649     SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
5650     _redefined(_mov(Dest, SrcF));
5651     Context.insert(Label);
5652     return;
5653   }
5654   // mov t, SrcF; cmov_cond t, SrcT; mov dest, t
5655   // But if SrcT is immediate, we might be able to do better, as the cmov
5656   // instruction doesn't allow an immediate operand:
5657   // mov t, SrcT; cmov_!cond t, SrcF; mov dest, t
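  // For example, "a = (x != 0) ? 7 : b" swaps the operands to SrcT=b,
  // SrcF=7 and inverts the condition, lowering to roughly:
  //   mov   t, $7     (the immediate is fine in a plain mov)
  //   cmove t, b      (overwrite with b only when x == 0)
  //   mov   a, t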
5658   if (llvm::isa<Constant>(SrcT) && !llvm::isa<Constant>(SrcF)) {
5659     std::swap(SrcT, SrcF);
5660     Cond = InstX86Base::getOppositeCondition(Cond);
5661   }
5662 
5663   assert(DestTy == IceType_i16 || DestTy == IceType_i32 ||
5664          DestTy == IceType_i64);
5665   lowerSelectIntMove(Dest, Cond, SrcT, SrcF);
5666 }
5667 
5668 void TargetX8664::lowerSelectIntMove(Variable *Dest, BrCond Cond, Operand *SrcT,
5669                                      Operand *SrcF) {
5670   Variable *T = nullptr;
5671   SrcF = legalize(SrcF);
5672   _mov(T, SrcF);
5673   SrcT = legalize(SrcT, Legal_Reg | Legal_Mem);
5674   _cmov(T, SrcT, Cond);
5675   _mov(Dest, T);
5676 }
5677 
5678 void TargetX8664::lowerMove(Variable *Dest, Operand *Src, bool IsRedefinition) {
5679   assert(Dest->getType() == Src->getType());
5680   assert(!Dest->isRematerializable());
5681   Operand *SrcLegal;
5682   if (Dest->hasReg()) {
5683     // If Dest already has a physical register, then only basic legalization
5684     // is needed, as the source operand can be a register, immediate, or
5685     // memory.
5686     SrcLegal = legalize(Src, Legal_Reg, Dest->getRegNum());
5687   } else {
5688     // If Dest could be a stack operand, then the legalized source must be
5689     // a physical register or a scalar integer immediate.
5690     SrcLegal = legalize(Src, Legal_Reg | Legal_Imm);
5691   }
5692   if (isVectorType(Dest->getType())) {
5693     _redefined(_movp(Dest, SrcLegal), IsRedefinition);
5694   } else {
5695     _redefined(_mov(Dest, SrcLegal), IsRedefinition);
5696   }
5697 }
5698 
5699 bool TargetX8664::lowerOptimizeFcmpSelect(const InstFcmp *Fcmp,
5700                                           const InstSelect *Select) {
5701   Operand *CmpSrc0 = Fcmp->getSrc(0);
5702   Operand *CmpSrc1 = Fcmp->getSrc(1);
5703   Operand *SelectSrcT = Select->getTrueOperand();
5704   Operand *SelectSrcF = Select->getFalseOperand();
5705   Variable *SelectDest = Select->getDest();
5706 
5707   // TODO(capn): also handle swapped compare/select operand order.
5708   if (CmpSrc0 != SelectSrcT || CmpSrc1 != SelectSrcF)
5709     return false;
5710 
5711   // TODO(sehr, stichnot): fcmp/select patterns (e.g., minsd/maxss) go here.
5712   InstFcmp::FCond Condition = Fcmp->getCondition();
5713   switch (Condition) {
5714   default:
5715     return false;
5716   case InstFcmp::True:
5717     break;
5718   case InstFcmp::False:
5719     break;
5720   case InstFcmp::Ogt: {
5721     Variable *T = makeReg(SelectDest->getType());
5722     if (isScalarFloatingType(SelectSrcT->getType())) {
5723       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
5724       _maxss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
5725       _mov(SelectDest, T);
5726     } else {
5727       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
5728       _maxps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
5729       _movp(SelectDest, T);
5730     }
5731     return true;
5732   } break;
5733   case InstFcmp::Olt: {
5734     Variable *T = makeReg(SelectSrcT->getType());
5735     if (isScalarFloatingType(SelectSrcT->getType())) {
5736       _mov(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
5737       _minss(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
5738       _mov(SelectDest, T);
5739     } else {
5740       _movp(T, legalize(SelectSrcT, Legal_Reg | Legal_Mem));
5741       _minps(T, legalize(SelectSrcF, Legal_Reg | Legal_Mem));
5742       _movp(SelectDest, T);
5743     }
5744     return true;
5745   } break;
5746   }
5747   return false;
5748 }
5749 
5750 void TargetX8664::lowerIcmp(const InstIcmp *Icmp) {
5751   Variable *Dest = Icmp->getDest();
5752   if (isVectorType(Dest->getType())) {
5753     lowerIcmpVector(Icmp);
5754   } else {
5755     constexpr Inst *Consumer = nullptr;
5756     lowerIcmpAndConsumer(Icmp, Consumer);
5757   }
5758 }
5759 
5760 void TargetX8664::lowerSelectVector(const InstSelect *Instr) {
5761   Variable *Dest = Instr->getDest();
5762   Type DestTy = Dest->getType();
5763   Operand *SrcT = Instr->getTrueOperand();
5764   Operand *SrcF = Instr->getFalseOperand();
5765   Operand *Condition = Instr->getCondition();
5766 
5767   if (!isVectorType(DestTy))
5768     llvm::report_fatal_error("Expected a vector select");
5769 
5770   Type SrcTy = SrcT->getType();
5771   Variable *T = makeReg(SrcTy);
5772   Operand *SrcTRM = legalize(SrcT, Legal_Reg | Legal_Mem);
5773   Operand *SrcFRM = legalize(SrcF, Legal_Reg | Legal_Mem);
5774 
5775   if (InstructionSet >= SSE4_1) {
5776     // TODO(wala): If the condition operand is a constant, use blendps or
5777     // pblendw.
5778     //
5779     // Use blendvps or pblendvb to implement select.
5780     if (SrcTy == IceType_v4i1 || SrcTy == IceType_v4i32 ||
5781         SrcTy == IceType_v4f32) {
5782       Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
5783       Variable *xmm0 = makeReg(IceType_v4i32, RegX8664::Reg_xmm0);
5784       _movp(xmm0, ConditionRM);
5785       _psll(xmm0, Ctx->getConstantInt8(31));
5786       _movp(T, SrcFRM);
5787       _blendvps(T, SrcTRM, xmm0);
5788       _movp(Dest, T);
5789     } else {
5790       assert(typeNumElements(SrcTy) == 8 || typeNumElements(SrcTy) == 16);
5791       Type SignExtTy =
5792           Condition->getType() == IceType_v8i1 ? IceType_v8i16 : IceType_v16i8;
5793       Variable *xmm0 = makeReg(SignExtTy, RegX8664::Reg_xmm0);
5794       lowerCast(InstCast::create(Func, InstCast::Sext, xmm0, Condition));
5795       _movp(T, SrcFRM);
5796       _pblendvb(T, SrcTRM, xmm0);
5797       _movp(Dest, T);
5798     }
5799     return;
5800   }
5801   // Lower select without SSE4.1:
5802   // a=d?b:c ==>
5803   //   if elementtype(d) != i1:
5804   //      d=sext(d);
5805   //   a=(b&d)|(c&~d);
5806   Variable *T2 = makeReg(SrcTy);
5807   // Sign extend the condition operand if applicable.
5808   if (SrcTy == IceType_v4f32) {
5809     // The sext operation takes only integer arguments.
5810     Variable *T3 = Func->makeVariable(IceType_v4i32);
5811     lowerCast(InstCast::create(Func, InstCast::Sext, T3, Condition));
5812     _movp(T, T3);
5813   } else if (typeElementType(SrcTy) != IceType_i1) {
5814     lowerCast(InstCast::create(Func, InstCast::Sext, T, Condition));
5815   } else {
5816     Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
5817     _movp(T, ConditionRM);
5818   }
5819   _movp(T2, T);
5820   _pand(T, SrcTRM);
5821   _pandn(T2, SrcFRM);
5822   _por(T, T2);
5823   _movp(Dest, T);
5824 
5825   return;
5826 }
5827 
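// A scalar model (illustration only) of the SSE2 fallback above: once the
// condition has been sign extended to all-ones/all-zeros lanes, select is
// pure bitwise arithmetic.
static uint32_t selectLaneBits(uint32_t Mask, uint32_t TrueBits,
                               uint32_t FalseBits) {
  // Mask is ~0u where the condition lane is true and 0u where it is false.
  return (TrueBits & Mask) | (FalseBits & ~Mask);
}
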
5828 void TargetX8664::lowerStore(const InstStore *Instr) {
5829   Operand *Value = Instr->getData();
5830   Operand *Addr = Instr->getStoreAddress();
5831   X86OperandMem *NewAddr = formMemoryOperand(Addr, Value->getType());
5832   doMockBoundsCheck(NewAddr);
5833   Type Ty = NewAddr->getType();
5834 
5835   if (isVectorType(Ty)) {
5836     _storep(legalizeToReg(Value), NewAddr);
5837   } else {
5838     Value = legalize(Value, Legal_Reg | Legal_Imm);
5839     _store(Value, NewAddr);
5840   }
5841 }
5842 
5843 void TargetX8664::doAddressOptStore() {
5844   auto *Instr = llvm::cast<InstStore>(Context.getCur());
5845   Operand *Addr = Instr->getStoreAddress();
5846   Operand *Data = Instr->getData();
5847   if (auto *OptAddr = computeAddressOpt(Instr, Data->getType(), Addr)) {
5848     Instr->setDeleted();
5849     auto *NewStore = Context.insert<InstStore>(Data, OptAddr);
5850     if (Instr->getDest())
5851       NewStore->setRmwBeacon(Instr->getRmwBeacon());
5852   }
5853 }
5854 
5855 void TargetX8664::doAddressOptStoreSubVector() {
5856   auto *Intrinsic = llvm::cast<InstIntrinsic>(Context.getCur());
5857   Operand *Addr = Intrinsic->getArg(1);
5858   Operand *Data = Intrinsic->getArg(0);
5859   if (auto *OptAddr = computeAddressOpt(Intrinsic, Data->getType(), Addr)) {
5860     Intrinsic->setDeleted();
5861     const Ice::Intrinsics::IntrinsicInfo Info = {
5862         Ice::Intrinsics::StoreSubVector, Ice::Intrinsics::SideEffects_T,
5863         Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_T};
5864     auto *NewStore = Context.insert<InstIntrinsic>(3, nullptr, Info);
5865     NewStore->addArg(Data);
5866     NewStore->addArg(OptAddr);
5867     NewStore->addArg(Intrinsic->getArg(2));
5868   }
5869 }
5870 
5871 Operand *TargetX8664::lowerCmpRange(Operand *Comparison, uint64_t Min,
5872                                     uint64_t Max) {
5873   // Subtracting 0 is a no-op, so don't do it.
5874   if (Min != 0) {
5875     // Avoid clobbering the comparison by copying it
5876     Variable *T = nullptr;
5877     _mov(T, Comparison);
5878     _sub(T, Ctx->getConstantInt32(Min));
5879     Comparison = T;
5880   }
5881 
5882   _cmp(Comparison, Ctx->getConstantInt32(Max - Min));
5883 
5884   return Comparison;
5885 }
5886 
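// The same trick in plain C++ (illustration only): one subtraction plus one
// unsigned compare covers both range bounds, because any value below Min
// wraps around to a huge unsigned value.
static bool isInRangeSingleCmp(uint64_t Value, uint64_t Min, uint64_t Max) {
  return (Value - Min) <= (Max - Min); // maps to sub; cmp; jbe
}
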
5887 void TargetX8664::lowerCaseCluster(const CaseCluster &Case, Operand *Comparison,
5888                                    bool DoneCmp, CfgNode *DefaultTarget) {
5889   switch (Case.getKind()) {
5890   case CaseCluster::JumpTable: {
5891     InstX86Label *SkipJumpTable;
5892 
5893     Operand *RangeIndex =
5894         lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
5895     if (DefaultTarget == nullptr) {
5896       // Skip over jump table logic if comparison not in range and no default
5897       SkipJumpTable = InstX86Label::create(Func, this);
5898       _br(CondX86::Br_a, SkipJumpTable);
5899     } else {
5900       _br(CondX86::Br_a, DefaultTarget);
5901     }
5902 
5903     InstJumpTable *JumpTable = Case.getJumpTable();
5904     Context.insert(JumpTable);
5905 
5906     // Make sure the index is a register of the same width as the base
5907     Variable *Index;
5908     const Type PointerType = getPointerType();
5909     if (RangeIndex->getType() != PointerType) {
5910       Index = makeReg(PointerType);
5911       if (RangeIndex->getType() == IceType_i64) {
5912         _mov(Index, RangeIndex); // trunc
5913       } else {
5914         Operand *RangeIndexRM = legalize(RangeIndex, Legal_Reg | Legal_Mem);
5915         _movzx(Index, RangeIndexRM);
5916       }
5917     } else {
5918       Index = legalizeToReg(RangeIndex);
5919     }
5920 
5921     constexpr RelocOffsetT RelocOffset = 0;
5922     constexpr Variable *NoBase = nullptr;
5923     constexpr Constant *NoOffset = nullptr;
5924     auto JTName = GlobalString::createWithString(Ctx, JumpTable->getName());
5925     Constant *Offset = Ctx->getConstantSym(RelocOffset, JTName);
5926     uint16_t Shift = typeWidthInBytesLog2(PointerType);
5927     constexpr auto Segment = X86OperandMem::SegmentRegisters::DefaultSegment;
5928 
5929     Variable *Target = nullptr;
5930     if (PointerType == IceType_i32) {
5931       _mov(Target, X86OperandMem::create(Func, PointerType, NoBase, Offset,
5932                                          Index, Shift, Segment));
5933     } else {
5934       auto *Base = makeReg(IceType_i64);
5935       _lea(Base, X86OperandMem::create(Func, IceType_void, NoBase, Offset));
5936       _mov(Target, X86OperandMem::create(Func, PointerType, Base, NoOffset,
5937                                          Index, Shift, Segment));
5938     }
5939 
5940     lowerIndirectJump(Target);
5941 
5942     if (DefaultTarget == nullptr)
5943       Context.insert(SkipJumpTable);
5944     return;
5945   }
5946   case CaseCluster::Range: {
5947     if (Case.isUnitRange()) {
5948       // Single item
5949       if (!DoneCmp) {
5950         Constant *Value = Ctx->getConstantInt32(Case.getLow());
5951         _cmp(Comparison, Value);
5952       }
5953       _br(CondX86::Br_e, Case.getTarget());
5954     } else if (DoneCmp && Case.isPairRange()) {
5955       // Range of two items with the first item already compared against.
5956       _br(CondX86::Br_e, Case.getTarget());
5957       Constant *Value = Ctx->getConstantInt32(Case.getHigh());
5958       _cmp(Comparison, Value);
5959       _br(CondX86::Br_e, Case.getTarget());
5960     } else {
5961       // Range
5962       lowerCmpRange(Comparison, Case.getLow(), Case.getHigh());
5963       _br(CondX86::Br_be, Case.getTarget());
5964     }
5965     if (DefaultTarget != nullptr)
5966       _br(DefaultTarget);
5967     return;
5968   }
5969   }
5970 }
5971 
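// For the pointer-sized (i64) jump table case above, the emitted sequence
// is, in effect:
//   lea  JT(%rip), %base
//   mov  (%base, %index, 8), %target   (Shift == log2 of the 8-byte entry)
//   jmp  *%target
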
5972 void TargetX8664::lowerSwitch(const InstSwitch *Instr) {
5973   // Group cases together and navigate through them with a binary search
5974   CaseClusterArray CaseClusters = CaseCluster::clusterizeSwitch(Func, Instr);
5975   Operand *Src0 = Instr->getComparison();
5976   CfgNode *DefaultTarget = Instr->getLabelDefault();
5977 
5978   assert(CaseClusters.size() != 0); // Should always be at least one
5979 
5980   if (CaseClusters.size() == 1) {
5981     // Jump straight to default if needed. Currently a common case as jump
5982     // tables occur on their own.
5983     constexpr bool DoneCmp = false;
5984     lowerCaseCluster(CaseClusters.front(), Src0, DoneCmp, DefaultTarget);
5985     return;
5986   }
5987 
5988   // Going to be using multiple times so get it in a register early
5989   Variable *Comparison = legalizeToReg(Src0);
5990 
5991   // A span covers a contiguous subrange of the clusters.
5992   struct SearchSpan {
5993     SearchSpan(SizeT Begin, SizeT Size, InstX86Label *Label)
5994         : Begin(Begin), Size(Size), Label(Label) {}
5995 
5996     SizeT Begin;
5997     SizeT Size;
5998     InstX86Label *Label;
5999   };
6000   // The stack will only grow to the height of the tree so 12 should be plenty
6001   std::stack<SearchSpan, llvm::SmallVector<SearchSpan, 12>> SearchSpanStack;
6002   SearchSpanStack.emplace(0, CaseClusters.size(), nullptr);
6003   bool DoneCmp = false;
6004 
6005   while (!SearchSpanStack.empty()) {
6006     SearchSpan Span = SearchSpanStack.top();
6007     SearchSpanStack.pop();
6008 
6009     if (Span.Label != nullptr)
6010       Context.insert(Span.Label);
6011 
6012     switch (Span.Size) {
6013     case 0:
6014       llvm::report_fatal_error("Invalid SearchSpan size");
6015       break;
6016 
6017     case 1:
6018       lowerCaseCluster(CaseClusters[Span.Begin], Comparison, DoneCmp,
6019                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
6020       DoneCmp = false;
6021       break;
6022 
6023     case 2: {
6024       const CaseCluster *CaseA = &CaseClusters[Span.Begin];
6025       const CaseCluster *CaseB = &CaseClusters[Span.Begin + 1];
6026 
6027       // Placing a range last may allow register clobbering during the range
6028       // test. That means there is no need to clone the register. If it is a
6029       // unit range the comparison may have already been done in the binary
6030       // search (DoneCmp) and so it should be placed first. If this is a range
6031       // of two items and the comparison with the low value has already been
6032       // done, comparing with the other element is cheaper than a range test.
6033       // If the low end of the range is zero then there is no subtraction and
6034       // nothing to be gained.
6035       if (!CaseA->isUnitRange() &&
6036           !(CaseA->getLow() == 0 || (DoneCmp && CaseA->isPairRange()))) {
6037         std::swap(CaseA, CaseB);
6038         DoneCmp = false;
6039       }
6040 
6041       lowerCaseCluster(*CaseA, Comparison, DoneCmp);
6042       DoneCmp = false;
6043       lowerCaseCluster(*CaseB, Comparison, DoneCmp,
6044                        SearchSpanStack.empty() ? nullptr : DefaultTarget);
6045     } break;
6046 
6047     default:
6048       // Pick the middle item and branch b or ae
6049       SizeT PivotIndex = Span.Begin + (Span.Size / 2);
6050       const CaseCluster &Pivot = CaseClusters[PivotIndex];
6051       Constant *Value = Ctx->getConstantInt32(Pivot.getLow());
6052       InstX86Label *Label = InstX86Label::create(Func, this);
6053       _cmp(Comparison, Value);
6054       // TODO(ascull): does it always have to be far?
6055       _br(CondX86::Br_b, Label, InstX86Br::Far);
6056       // Lower the left and (pivot+right) sides, falling through to the right
6057       SearchSpanStack.emplace(Span.Begin, Span.Size / 2, Label);
6058       SearchSpanStack.emplace(PivotIndex, Span.Size - (Span.Size / 2), nullptr);
6059       DoneCmp = true;
6060       break;
6061     }
6062   }
6063 
6064   _br(DefaultTarget);
6065 }
6066 
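// A standalone sketch (plain C++, illustration only) of the span-splitting
// search lowerSwitch() performs over the sorted clusters: spans of one or
// two clusters are scanned directly, larger spans split at a pivot.
static int findClusterIndex(const uint64_t *SortedLows, SizeT NumClusters,
                            uint64_t Value) {
  SizeT Begin = 0;
  SizeT Size = NumClusters;
  while (Size > 2) {
    const SizeT Half = Size / 2;
    if (Value < SortedLows[Begin + Half]) {
      Size = Half; // take the left span
    } else {
      Begin += Half; // take the pivot-plus-right span
      Size -= Half;
    }
  }
  for (SizeT I = 0; I < Size; ++I)
    if (SortedLows[Begin + I] == Value)
      return static_cast<int>(Begin + I);
  return -1; // corresponds to falling through to the default target
}
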
6067 /// The following pattern occurs often in lowered C and C++ code:
6068 ///
6069 ///   %cmp     = fcmp/icmp pred <n x ty> %src0, %src1
6070 ///   %cmp.ext = sext <n x i1> %cmp to <n x ty>
6071 ///
6072 /// We can eliminate the sext operation by copying the result of pcmpeqd,
6073 /// pcmpgtd, or cmpps (which produce sign extended results) to the result of the
6074 /// sext operation.
6075 
6076 void TargetX8664::eliminateNextVectorSextInstruction(
6077     Variable *SignExtendedResult) {
6078   if (auto *NextCast =
6079           llvm::dyn_cast_or_null<InstCast>(Context.getNextInst())) {
6080     if (NextCast->getCastKind() == InstCast::Sext &&
6081         NextCast->getSrc(0) == SignExtendedResult) {
6082       NextCast->setDeleted();
6083       _movp(NextCast->getDest(), legalizeToReg(SignExtendedResult));
6084       // Skip over the instruction.
6085       Context.advanceNext();
6086     }
6087   }
6088 }
6089 
6090 void TargetX8664::lowerUnreachable(const InstUnreachable * /*Instr*/) {
6091   _ud2();
6092   // Add a fake use of esp to make sure esp adjustments after the unreachable
6093   // do not get dead-code eliminated.
6094   keepEspLiveAtExit();
6095 }
6096 
6097 void TargetX8664::lowerBreakpoint(const InstBreakpoint * /*Instr*/) { _int3(); }
6098 
6099 void TargetX8664::lowerRMW(const InstX86FakeRMW *RMW) {
6100   // If the beacon variable's live range does not end in this instruction, then
6101   // it must end in the modified Store instruction that follows. This means
6102   // that the original Store instruction is still there, either because the
6103   // value being stored is used beyond the Store instruction, or because dead
6104   // code elimination did not happen. In either case, we cancel RMW lowering
6105   // (and the caller deletes the RMW instruction).
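  // When lowering does proceed, a pattern like "*p = *p + 5" folds into a
  // single read-modify-write instruction such as "addl $5, (%rax)" instead
  // of a separate load, add, and store.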
6106   if (!RMW->isLastUse(RMW->getBeacon()))
6107     return;
6108   Operand *Src = RMW->getData();
6109   Type Ty = Src->getType();
6110   X86OperandMem *Addr = formMemoryOperand(RMW->getAddr(), Ty);
6111   doMockBoundsCheck(Addr);
6112   switch (RMW->getOp()) {
6113   default:
6114     // TODO(stichnot): Implement other arithmetic operators.
6115     break;
6116   case InstArithmetic::Add:
6117     Src = legalize(Src, Legal_Reg | Legal_Imm);
6118     _add_rmw(Addr, Src);
6119     return;
6120   case InstArithmetic::Sub:
6121     Src = legalize(Src, Legal_Reg | Legal_Imm);
6122     _sub_rmw(Addr, Src);
6123     return;
6124   case InstArithmetic::And:
6125     Src = legalize(Src, Legal_Reg | Legal_Imm);
6126     _and_rmw(Addr, Src);
6127     return;
6128   case InstArithmetic::Or:
6129     Src = legalize(Src, Legal_Reg | Legal_Imm);
6130     _or_rmw(Addr, Src);
6131     return;
6132   case InstArithmetic::Xor:
6133     Src = legalize(Src, Legal_Reg | Legal_Imm);
6134     _xor_rmw(Addr, Src);
6135     return;
6136   }
6137   llvm::report_fatal_error("Couldn't lower RMW instruction");
6138 }
6139 
6140 void TargetX8664::lowerOther(const Inst *Instr) {
6141   if (const auto *RMW = llvm::dyn_cast<InstX86FakeRMW>(Instr)) {
6142     lowerRMW(RMW);
6143   } else {
6144     TargetLowering::lowerOther(Instr);
6145   }
6146 }
6147 
6148 void TargetX8664::prelowerPhis() {
6149   // On x86-64 we don't need to prelower phis -- the architecture can handle
6150   // 64-bit integers natively.
6151   return;
6152 }
6153 
6154 void TargetX8664::genTargetHelperCallFor(Inst *Instr) {
6155   uint32_t StackArgumentsSize = 0;
6156   if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr)) {
6157     RuntimeHelper HelperID = RuntimeHelper::H_Num;
6158     Variable *Dest = Arith->getDest();
6159     Type DestTy = Dest->getType();
6160     if (isVectorType(DestTy)) {
6162       Operand *Src0 = Arith->getSrc(0);
6163       Operand *Src1 = Arith->getSrc(1);
6164       switch (Arith->getOp()) {
6165       default:
6166         return;
6167       case InstArithmetic::Mul:
6168         if (DestTy == IceType_v16i8) {
6169           scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6170           Arith->setDeleted();
6171         }
6172         return;
6173       case InstArithmetic::Shl:
6174       case InstArithmetic::Lshr:
6175       case InstArithmetic::Ashr:
6176         if (llvm::isa<Constant>(Src1)) {
6177           return;
6178         }
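        // Non-constant shift amounts fall through and are scalarized.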
6179       case InstArithmetic::Udiv:
6180       case InstArithmetic::Urem:
6181       case InstArithmetic::Sdiv:
6182       case InstArithmetic::Srem:
6183       case InstArithmetic::Frem:
6184         scalarizeArithmetic(Arith->getOp(), Dest, Src0, Src1);
6185         Arith->setDeleted();
6186         return;
6187       }
6188     } else {
6189       switch (Arith->getOp()) {
6190       default:
6191         return;
6192       case InstArithmetic::Frem:
6193         if (isFloat32Asserting32Or64(DestTy))
6194           HelperID = RuntimeHelper::H_frem_f32;
6195         else
6196           HelperID = RuntimeHelper::H_frem_f64;
6197       }
6198     }
6199     constexpr SizeT MaxSrcs = 2;
6200     InstCall *Call = makeHelperCall(HelperID, Dest, MaxSrcs);
6201     Call->addArg(Arith->getSrc(0));
6202     Call->addArg(Arith->getSrc(1));
6203     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
6204     Context.insert(Call);
6205     Arith->setDeleted();
6206   } else if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
6207     InstCast::OpKind CastKind = Cast->getCastKind();
6208     Operand *Src0 = Cast->getSrc(0);
6209     const Type SrcType = Src0->getType();
6210     Variable *Dest = Cast->getDest();
6211     const Type DestTy = Dest->getType();
6212     RuntimeHelper HelperID = RuntimeHelper::H_Num;
6213     Variable *CallDest = Dest;
6214     switch (CastKind) {
6215     default:
6216       return;
6217     case InstCast::Fptoui:
6218       if (isVectorType(DestTy)) {
6219         assert(DestTy == IceType_v4i32);
6220         assert(SrcType == IceType_v4f32);
6221         HelperID = RuntimeHelper::H_fptoui_4xi32_f32;
6222       } else if (DestTy == IceType_i64) {
6223         HelperID = isFloat32Asserting32Or64(SrcType)
6224                        ? RuntimeHelper::H_fptoui_f32_i64
6225                        : RuntimeHelper::H_fptoui_f64_i64;
6226       } else {
6227         return;
6228       }
6229       break;
6230     case InstCast::Uitofp:
6231       if (isVectorType(SrcType)) {
6232         assert(DestTy == IceType_v4f32);
6233         assert(SrcType == IceType_v4i32);
6234         HelperID = RuntimeHelper::H_uitofp_4xi32_4xf32;
6235       } else if (SrcType == IceType_i64) {
6236         if (isInt32Asserting32Or64(SrcType)) {
6237           HelperID = isFloat32Asserting32Or64(DestTy)
6238                          ? RuntimeHelper::H_uitofp_i32_f32
6239                          : RuntimeHelper::H_uitofp_i32_f64;
6240         } else {
6241           HelperID = isFloat32Asserting32Or64(DestTy)
6242                          ? RuntimeHelper::H_uitofp_i64_f32
6243                          : RuntimeHelper::H_uitofp_i64_f64;
6244         }
6245       } else {
6246         return;
6247       }
6248       break;
6249     case InstCast::Bitcast: {
6250       if (DestTy == Src0->getType())
6251         return;
6252       switch (DestTy) {
6253       default:
6254         return;
6255       case IceType_i8:
6256         assert(Src0->getType() == IceType_v8i1);
6257         HelperID = RuntimeHelper::H_bitcast_8xi1_i8;
6258         CallDest = Func->makeVariable(IceType_i32);
6259         break;
6260       case IceType_i16:
6261         assert(Src0->getType() == IceType_v16i1);
6262         HelperID = RuntimeHelper::H_bitcast_16xi1_i16;
6263         CallDest = Func->makeVariable(IceType_i32);
6264         break;
6265       case IceType_v8i1: {
6266         assert(Src0->getType() == IceType_i8);
6267         HelperID = RuntimeHelper::H_bitcast_i8_8xi1;
6268         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
6269         // Arguments to functions are required to be at least 32 bits wide.
6270         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
6271         Src0 = Src0AsI32;
6272       } break;
6273       case IceType_v16i1: {
6274         assert(Src0->getType() == IceType_i16);
6275         HelperID = RuntimeHelper::H_bitcast_i16_16xi1;
6276         Variable *Src0AsI32 = Func->makeVariable(stackSlotType());
6277         // Arguments to functions are required to be at least 32 bits wide.
6278         Context.insert<InstCast>(InstCast::Zext, Src0AsI32, Src0);
6279         Src0 = Src0AsI32;
6280       } break;
6281       }
6282     } break;
6283     }
6284     constexpr SizeT MaxSrcs = 1;
6285     InstCall *Call = makeHelperCall(HelperID, CallDest, MaxSrcs);
6286     Call->addArg(Src0);
6287     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
6288     Context.insert(Call);
6289     // The PNaCl ABI disallows i8/i16 return types, so truncate the helper call
6290     // result to the appropriate type as necessary.
6291     if (CallDest->getType() != Dest->getType())
6292       Context.insert<InstCast>(InstCast::Trunc, Dest, CallDest);
6293     Cast->setDeleted();
6294   } else if (auto *Intrinsic = llvm::dyn_cast<InstIntrinsic>(Instr)) {
6295     CfgVector<Type> ArgTypes;
6296     Type ReturnType = IceType_void;
6297     switch (Intrinsic->getIntrinsicID()) {
6298     default:
6299       return;
6300     case Intrinsics::Ctpop: {
6301       Operand *Val = Intrinsic->getArg(0);
6302       Type ValTy = Val->getType();
6303       if (ValTy == IceType_i64)
6304         ArgTypes = {IceType_i64};
6305       else
6306         ArgTypes = {IceType_i32};
6307       ReturnType = IceType_i32;
6308     } break;
6309     case Intrinsics::Longjmp:
6310       ArgTypes = {IceType_i32, IceType_i32};
6311       ReturnType = IceType_void;
6312       break;
6313     case Intrinsics::Memcpy:
6314       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
6315       ReturnType = IceType_void;
6316       break;
6317     case Intrinsics::Memmove:
6318       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
6319       ReturnType = IceType_void;
6320       break;
6321     case Intrinsics::Memset:
6322       ArgTypes = {IceType_i32, IceType_i32, IceType_i32};
6323       ReturnType = IceType_void;
6324       break;
6325     case Intrinsics::Setjmp:
6326       ArgTypes = {IceType_i32};
6327       ReturnType = IceType_i32;
6328       break;
6329     }
6330     StackArgumentsSize = getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
6331   } else if (auto *Call = llvm::dyn_cast<InstCall>(Instr)) {
6332     StackArgumentsSize = getCallStackArgumentsSizeBytes(Call);
6333   } else if (auto *Ret = llvm::dyn_cast<InstRet>(Instr)) {
6334     if (!Ret->hasRetValue())
6335       return;
6336     Operand *RetValue = Ret->getRetValue();
6337     Type ReturnType = RetValue->getType();
6338     if (!isScalarFloatingType(ReturnType))
6339       return;
6340     StackArgumentsSize = typeWidthInBytes(ReturnType);
6341   } else {
6342     return;
6343   }
6344   StackArgumentsSize = applyStackAlignment(StackArgumentsSize);
6345   updateMaxOutArgsSizeBytes(StackArgumentsSize);
6346 }
6347 
6348 uint32_t
6349 TargetX8664::getCallStackArgumentsSizeBytes(const CfgVector<Type> &ArgTypes,
6350                                             Type ReturnType) {
6351   uint32_t OutArgumentsSizeBytes = 0;
6352   uint32_t XmmArgCount = 0;
6353   uint32_t GprArgCount = 0;
6354   for (SizeT i = 0, NumArgTypes = ArgTypes.size(); i < NumArgTypes; ++i) {
6355     Type Ty = ArgTypes[i];
6356     // The PNaCl ABI requires the width of arguments to be at least 32 bits.
6357     assert(typeWidthInBytes(Ty) >= 4);
6358     if (isVectorType(Ty) &&
6359         RegX8664::getRegisterForXmmArgNum(RegX8664::getArgIndex(i, XmmArgCount))
6360             .hasValue()) {
6361       ++XmmArgCount;
6362     } else if (isScalarFloatingType(Ty) &&
6363                RegX8664::getRegisterForXmmArgNum(
6364                    RegX8664::getArgIndex(i, XmmArgCount))
6365                    .hasValue()) {
6366       ++XmmArgCount;
6367     } else if (isScalarIntegerType(Ty) &&
6368                RegX8664::getRegisterForGprArgNum(
6369                    Ty, RegX8664::getArgIndex(i, GprArgCount))
6370                    .hasValue()) {
6371       // The 64 bit ABI allows some integers to be passed in GPRs.
6372       ++GprArgCount;
6373     } else {
6374       if (isVectorType(Ty)) {
6375         OutArgumentsSizeBytes = applyStackAlignment(OutArgumentsSizeBytes);
6376       }
6377       OutArgumentsSizeBytes += typeWidthInBytesOnStack(Ty);
6378     }
6379   }
6380   return OutArgumentsSizeBytes;
6381 }
6382 
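// For example, under the register layout this target models (scalar integer
// arguments in GPRs, FP/vector arguments in XMM registers), a call whose
// arguments all land in registers contributes 0 out-args bytes; once the
// XMM registers are exhausted, each extra v4f32 argument first rounds the
// running size up to the stack alignment and then adds its 16 on-stack
// bytes.
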
6383 uint32_t TargetX8664::getCallStackArgumentsSizeBytes(const InstCall *Instr) {
6384   // Build a vector of the arguments' types.
6385   const SizeT NumArgs = Instr->getNumArgs();
6386   CfgVector<Type> ArgTypes;
6387   ArgTypes.reserve(NumArgs);
6388   for (SizeT i = 0; i < NumArgs; ++i) {
6389     Operand *Arg = Instr->getArg(i);
6390     ArgTypes.emplace_back(Arg->getType());
6391   }
6392   // Compute the return type (if any).
6393   Type ReturnType = IceType_void;
6394   Variable *Dest = Instr->getDest();
6395   if (Dest != nullptr)
6396     ReturnType = Dest->getType();
6397   return getShadowStoreSize() +
6398          getCallStackArgumentsSizeBytes(ArgTypes, ReturnType);
6399 }
6400 
6401 Variable *TargetX8664::makeZeroedRegister(Type Ty, RegNumT RegNum) {
6402   Variable *Reg = makeReg(Ty, RegNum);
6403   switch (Ty) {
6404   case IceType_i1:
6405   case IceType_i8:
6406   case IceType_i16:
6407   case IceType_i32:
6408   case IceType_i64:
6409     // Conservatively do "mov reg, 0" to avoid modifying FLAGS.
6410     _mov(Reg, Ctx->getConstantZero(Ty));
6411     break;
6412   case IceType_f32:
6413   case IceType_f64:
6414     Context.insert<InstFakeDef>(Reg);
6415     _xorps(Reg, Reg);
6416     break;
6417   default:
6418     // All vector types use the same pxor instruction.
6419     assert(isVectorType(Ty));
6420     Context.insert<InstFakeDef>(Reg);
6421     _pxor(Reg, Reg);
6422     break;
6423   }
6424   return Reg;
6425 }
6426 
6427 // There is no support for loading or emitting vector constants, so the vector
6428 // values returned from makeVectorOfZeros, makeVectorOfOnes, etc. are
6429 // initialized with register operations.
6430 //
6431 // TODO(wala): Add limited support for vector constants so that complex
6432 // initialization in registers is unnecessary.
6433 
6434 Variable *TargetX8664::makeVectorOfZeros(Type Ty, RegNumT RegNum) {
6435   return makeZeroedRegister(Ty, RegNum);
6436 }
6437 
6438 Variable *TargetX8664::makeVectorOfMinusOnes(Type Ty, RegNumT RegNum) {
6439   Variable *MinusOnes = makeReg(Ty, RegNum);
6440   // Insert a FakeDef so the live range of MinusOnes is not overestimated.
6441   Context.insert<InstFakeDef>(MinusOnes);
6442   if (Ty == IceType_f64)
6443     // Making a vector of minus ones of type f64 is currently only used for the
6444     // fabs intrinsic.  To use the f64 type to create this mask with pcmpeqq
6445     // requires SSE 4.1.  Since we're just creating a mask, pcmpeqd does the
6446     // same job and only requires SSE2.
6447     _pcmpeq(MinusOnes, MinusOnes, IceType_f32);
6448   else
6449     _pcmpeq(MinusOnes, MinusOnes);
6450   return MinusOnes;
6451 }
6452 
6453 Variable *TargetX8664::makeVectorOfOnes(Type Ty, RegNumT RegNum) {
6454   Variable *Dest = makeVectorOfZeros(Ty, RegNum);
6455   Variable *MinusOne = makeVectorOfMinusOnes(Ty);
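  // In every lane, 0 - (-1) == 1, so the psub below yields a vector of 1s.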
6456   _psub(Dest, MinusOne);
6457   return Dest;
6458 }
6459 
6460 Variable *TargetX8664::makeVectorOfHighOrderBits(Type Ty, RegNumT RegNum) {
6461   assert(Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v8i16 ||
6462          Ty == IceType_v16i8);
6463   if (Ty == IceType_v4f32 || Ty == IceType_v4i32 || Ty == IceType_v8i16) {
6464     Variable *Reg = makeVectorOfOnes(Ty, RegNum);
6465     SizeT Shift = typeWidthInBytes(typeElementType(Ty)) * X86_CHAR_BIT - 1;
6466     _psll(Reg, Ctx->getConstantInt8(Shift));
6467     return Reg;
6468   } else {
6469     // SSE has no left shift operation for vectors of 8 bit integers.
6470     constexpr uint32_t HIGH_ORDER_BITS_MASK = 0x80808080;
6471     Constant *ConstantMask = Ctx->getConstantInt32(HIGH_ORDER_BITS_MASK);
6472     Variable *Reg = makeReg(Ty, RegNum);
6473     _movd(Reg, legalize(ConstantMask, Legal_Reg | Legal_Mem));
6474     _pshufd(Reg, Reg, Ctx->getConstantZero(IceType_i8));
6475     return Reg;
6476   }
6477 }
6478 
6479 /// Construct a mask in a register that can be and'ed with a floating-point
6480 /// value to mask off its sign bit. The value will be <4 x 0x7fffffff> for f32
6481 /// and v4f32, and <2 x 0x7fffffffffffffff> for f64. Construct it as vector of
6482 /// ones logically right shifted one bit.
6483 // TODO(stichnot): Fix the wala TODO above, to represent vector
6484 // constants in memory.
6485 
6486 Variable *TargetX8664::makeVectorOfFabsMask(Type Ty, RegNumT RegNum) {
6487   Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum);
6488   _psrl(Reg, Ctx->getConstantInt8(1));
6489   return Reg;
6490 }
6491 
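// A scalar illustration of the mask built above: all-ones shifted right by
// one bit clears exactly the sign bit, so and'ing a double's raw bits with
// it computes fabs.
static uint64_t fabsBitsF64(uint64_t DoubleBits) {
  const uint64_t FabsMask = ~static_cast<uint64_t>(0) >> 1; // 0x7fff...ffff
  return DoubleBits & FabsMask;
}
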
6492 X86OperandMem *TargetX8664::getMemoryOperandForStackSlot(Type Ty,
6493                                                          Variable *Slot,
6494                                                          uint32_t Offset) {
6495   // Ensure that Loc is a stack slot.
6496   assert(Slot->mustNotHaveReg());
6497   assert(Slot->getRegNum().hasNoValue());
6498   // Compute the location of Loc in memory.
6499   // TODO(wala,stichnot): lea should not
6500   // be required. The address of the stack slot is known at compile time
6501   // (although not until after addProlog()).
6502   const Type PointerType = getPointerType();
6503   Variable *Loc = makeReg(PointerType);
6504   _lea(Loc, Slot);
6505   Constant *ConstantOffset = Ctx->getConstantInt32(Offset);
6506   return X86OperandMem::create(Func, Ty, Loc, ConstantOffset);
6507 }
6508 
6509 /// Lowering helper to copy a scalar integer source operand into some 8-bit GPR.
6510 /// Src is assumed to already be legalized.  If the source operand is known to
6511 /// be a memory or immediate operand, a simple mov will suffice.  But if the
6512 /// source operand can be a physical register, then it must first be copied into
6513 /// a physical register that is truncable to 8-bit, then truncated into a
6514 /// physical register that can receive a truncation, and finally copied into the
6515 /// result 8-bit register (which in general can be any 8-bit register).  For
6516 /// example, moving %ebp into %ah may be accomplished as:
6517 ///   movl %ebp, %edx
6518 ///   mov_trunc %edx, %dl  // this redundant assignment is ultimately elided
6519 ///   movb %dl, %ah
6520 /// On the other hand, moving a memory or immediate operand into ah:
6521 ///   movb 4(%ebp), %ah
6522 ///   movb $my_imm, %ah
6523 ///
6524 /// Note #1.  On a 64-bit target, the "movb 4(%ebp), %ah" is likely not
6525 /// encodable, so RegNum=Reg_ah should NOT be given as an argument.  Instead,
6526 /// use RegNum=RegNumT() and then let the caller do a separate copy into
6527 /// Reg_ah.
6528 ///
6529 /// Note #2.  ConstantRelocatable operands are also put through this process
6530 /// (not truncated directly) because our ELF emitter does R_386_32 relocations
6531 /// but not R_386_8 relocations.
6532 ///
6533 /// Note #3.  If Src is a Variable, the result will be an infinite-weight i8
6534 /// Variable with the RCX86_IsTrunc8Rcvr register class.  As such, this helper
6535 /// is a convenient way to prevent ah/bh/ch/dh from being an (invalid) argument
6536 /// to the pinsrb instruction.
6537 
6538 Variable *TargetX8664::copyToReg8(Operand *Src, RegNumT RegNum) {
6539   Type Ty = Src->getType();
6540   assert(isScalarIntegerType(Ty));
6541   assert(Ty != IceType_i1);
6542   Variable *Reg = makeReg(IceType_i8, RegNum);
6543   Reg->setRegClass(RCX86_IsTrunc8Rcvr);
6544   if (llvm::isa<Variable>(Src) || llvm::isa<ConstantRelocatable>(Src)) {
6545     Variable *SrcTruncable = makeReg(Ty);
6546     switch (Ty) {
6547     case IceType_i64:
6548       SrcTruncable->setRegClass(RCX86_Is64To8);
6549       break;
6550     case IceType_i32:
6551       SrcTruncable->setRegClass(RCX86_Is32To8);
6552       break;
6553     case IceType_i16:
6554       SrcTruncable->setRegClass(RCX86_Is16To8);
6555       break;
6556     default:
6557       // i8 - just use default register class
6558       break;
6559     }
6560     Variable *SrcRcvr = makeReg(IceType_i8);
6561     SrcRcvr->setRegClass(RCX86_IsTrunc8Rcvr);
6562     _mov(SrcTruncable, Src);
6563     _mov(SrcRcvr, SrcTruncable);
6564     Src = SrcRcvr;
6565   }
6566   _mov(Reg, Src);
6567   return Reg;
6568 }
6569 
6570 /// Helper for legalize() to emit the right code to lower an operand to a
6571 /// register of the appropriate type.
6572 
6573 Variable *TargetX8664::copyToReg(Operand *Src, RegNumT RegNum) {
6574   Type Ty = Src->getType();
6575   Variable *Reg = makeReg(Ty, RegNum);
6576   if (isVectorType(Ty)) {
6577     _movp(Reg, Src);
6578   } else {
6579     _mov(Reg, Src);
6580   }
6581   return Reg;
6582 }
6583 
6584 Operand *TargetX8664::legalize(Operand *From, LegalMask Allowed,
6585                                RegNumT RegNum) {
6586   const Type Ty = From->getType();
6587   // Assert that a physical register is allowed. To date, all calls to
6588   // legalize() allow a physical register. If a physical register needs to be
6589   // explicitly disallowed, then new code will need to be written to force a
6590   // spill.
6591   assert(Allowed & Legal_Reg);
6592   // If we're asking for a specific physical register, make sure we're not
6593   // allowing any other operand kinds. (This could be future work, e.g. allow
6594   // the shl shift amount to be either an immediate or in ecx.)
6595   assert(RegNum.hasNoValue() || Allowed == Legal_Reg);
6596 
6597   // Substitute with an available infinite-weight variable if possible.  Only do
6598   // this when we are not asking for a specific register, and when the
6599   // substitution is not locked to a specific register, and when the types
6600   // match, in order to capture the vast majority of opportunities and avoid
6601   // corner cases in the lowering.
6602   if (RegNum.hasNoValue()) {
6603     if (Variable *Subst = getContext().availabilityGet(From)) {
6604       // At this point we know there is a potential substitution available.
6605       if (Subst->mustHaveReg() && !Subst->hasReg()) {
6606         // At this point we know the substitution will have a register.
6607         if (From->getType() == Subst->getType()) {
6608           // At this point we know the substitution's register is compatible.
6609           return Subst;
6610         }
6611       }
6612     }
6613   }
6614 
6615   if (auto *Mem = llvm::dyn_cast<X86OperandMem>(From)) {
6616     // Before doing anything with a Mem operand, we need to ensure that the
6617     // Base and Index components are in physical registers.
6618     Variable *Base = Mem->getBase();
6619     Variable *Index = Mem->getIndex();
6620     Constant *Offset = Mem->getOffset();
6621     Variable *RegBase = nullptr;
6622     Variable *RegIndex = nullptr;
6623     uint16_t Shift = Mem->getShift();
6624     if (Base) {
6625       RegBase = llvm::cast<Variable>(
6626           legalize(Base, Legal_Reg | Legal_Rematerializable));
6627     }
6628     if (Index) {
6629       // TODO(jpp): perhaps we should only allow Legal_Reg if
6630       // Base->isRematerializable.
6631       RegIndex = llvm::cast<Variable>(
6632           legalize(Index, Legal_Reg | Legal_Rematerializable));
6633     }
6634 
6635     if (Base != RegBase || Index != RegIndex) {
6636       Mem = X86OperandMem::create(Func, Ty, RegBase, Offset, RegIndex, Shift,
6637                                   Mem->getSegmentRegister());
6638     }
6639 
6640     From = Mem;
6641 
6642     if (!(Allowed & Legal_Mem)) {
6643       From = copyToReg(From, RegNum);
6644     }
6645     return From;
6646   }
6647 
6648   if (auto *Const = llvm::dyn_cast<Constant>(From)) {
6649     if (llvm::isa<ConstantUndef>(Const)) {
6650       From = legalizeUndef(Const, RegNum);
6651       if (isVectorType(Ty))
6652         return From;
6653       Const = llvm::cast<Constant>(From);
6654     }
6655     // There should be no constants of vector type (other than undef).
6656     assert(!isVectorType(Ty));
6657 
6658     // If the operand is a 64 bit constant integer we need to legalize it to a
6659     // register in x86-64.
6660     if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Const)) {
6661       if (!Utils::IsInt(32, C64->getValue())) {
6662         if (RegNum.hasValue()) {
6663           assert(RegX8664::getGprForType(IceType_i64, RegNum) == RegNum);
6664         }
6665         return copyToReg(Const, RegNum);
6666       }
6667     }
6668 
6669     if (!llvm::isa<ConstantRelocatable>(Const)) {
6670       if (isScalarFloatingType(Ty)) {
6671         // Convert a scalar floating point constant into an explicit memory
6672         // operand.
6673         if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
6674           if (Utils::isPositiveZero(ConstFloat->getValue()))
6675             return makeZeroedRegister(Ty, RegNum);
6676         } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
6677           if (Utils::isPositiveZero(ConstDouble->getValue()))
6678             return makeZeroedRegister(Ty, RegNum);
6679         }
6680 
6681         auto *CFrom = llvm::cast<Constant>(From);
6682         assert(CFrom->getShouldBePooled());
6683         Constant *Offset = Ctx->getConstantSym(0, CFrom->getLabelName());
6684         auto *Mem = X86OperandMem::create(Func, Ty, nullptr, Offset);
6685         From = Mem;
6686       }
6687     }
6688 
6689     bool NeedsReg = false;
6690     if (!(Allowed & Legal_Imm) && !isScalarFloatingType(Ty))
6691       // Immediate specifically not allowed.
6692       NeedsReg = true;
6693     if (!(Allowed & Legal_Mem) && isScalarFloatingType(Ty))
6694       // On x86, FP constants are lowered to mem operands.
6695       NeedsReg = true;
6696     if (NeedsReg) {
6697       From = copyToReg(From, RegNum);
6698     }
6699     return From;
6700   }
6701 
6702   if (auto *Var = llvm::dyn_cast<Variable>(From)) {
6703     // Check if the variable is guaranteed a physical register. This can happen
6704     // either when the variable is pre-colored or when it is assigned infinite
6705     // weight.
6706     bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
6707     bool MustRematerialize =
6708         (Var->isRematerializable() && !(Allowed & Legal_Rematerializable));
6709     // We need a new physical register for the operand if:
6710     // - Mem is not allowed and Var isn't guaranteed a physical register, or
6711     // - RegNum is required and Var->getRegNum() doesn't match, or
6712     // - Var is a rematerializable variable and rematerializable pass-through is
6713     //   not allowed (in which case we need a lea instruction).
6714     if (MustRematerialize) {
6715       Variable *NewVar = makeReg(Ty, RegNum);
6716       // Since Var is rematerializable, the offset will be added when the lea is
6717       // emitted.
6718       constexpr Constant *NoOffset = nullptr;
6719       auto *Mem = X86OperandMem::create(Func, Ty, Var, NoOffset);
6720       _lea(NewVar, Mem);
6721       From = NewVar;
6722     } else if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
6723                (RegNum.hasValue() && RegNum != Var->getRegNum())) {
6724       From = copyToReg(From, RegNum);
6725     }
6726     return From;
6727   }
6728 
6729   llvm::report_fatal_error("Unhandled operand kind in legalize()");
6730   return From;
6731 }
6732 
6733 /// Provide a trivial wrapper to legalize() for this common usage.
6734 
6735 Variable *TargetX8664::legalizeToReg(Operand *From, RegNumT RegNum) {
6736   return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
6737 }
6738 
6739 /// Legalize undef values to concrete values.
6740 
6741 Operand *TargetX8664::legalizeUndef(Operand *From, RegNumT RegNum) {
6742   Type Ty = From->getType();
6743   if (llvm::isa<ConstantUndef>(From)) {
6744     // Lower undefs to zero.  Another option is to lower undefs to an
6745     // uninitialized register; however, using an uninitialized register results
6746     // in less predictable code.
6747     //
6748     // If in the future the implementation is changed to lower undef values to
6749     // uninitialized registers, a FakeDef will be needed:
6750     //     Context.insert<InstFakeDef>(Reg);
6751     // This is in order to ensure that the live range of Reg is not
6752     // overestimated.  If the constant being lowered is a 64 bit value, then
6753     // the result should be split and the lo and hi components will need to go
6754     // in uninitialized registers.
6755     if (isVectorType(Ty))
6756       return makeVectorOfZeros(Ty, RegNum);
6757     return Ctx->getConstantZero(Ty);
6758   }
6759   return From;
6760 }
6761 
6762 /// For the cmp instruction, if Src1 is an immediate, or known to be a physical
6763 /// register, we can allow Src0 to be a memory operand. Otherwise, Src0 must be
6764 /// copied into a physical register. (Actually, either Src0 or Src1 can be
6765 /// chosen for the physical register, but unfortunately we have to commit to one
6766 /// or the other before register allocation.)
6767 
6768 Operand *TargetX8664::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
6769   bool IsSrc1ImmOrReg = false;
6770   if (llvm::isa<Constant>(Src1)) {
6771     IsSrc1ImmOrReg = true;
6772   } else if (auto *Var = llvm::dyn_cast<Variable>(Src1)) {
6773     if (Var->hasReg())
6774       IsSrc1ImmOrReg = true;
6775   }
6776   return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
6777 }
6778 
6779 X86OperandMem *TargetX8664::formMemoryOperand(Operand *Opnd, Type Ty,
6780                                               bool DoLegalize) {
6781   auto *Mem = llvm::dyn_cast<X86OperandMem>(Opnd);
6782   // It may be the case that address mode optimization already creates an
6783   // X86OperandMem, so in that case it wouldn't need another level of
6784   // transformation.
6785   if (!Mem) {
6786     auto *Base = llvm::dyn_cast<Variable>(Opnd);
6787     auto *Offset = llvm::dyn_cast<Constant>(Opnd);
6788     assert(Base || Offset);
6789     if (Offset) {
6790       if (!llvm::isa<ConstantRelocatable>(Offset)) {
6791         if (llvm::isa<ConstantInteger64>(Offset)) {
6792           // Memory operands cannot have 64-bit immediates, so they must be
6793           // legalized into a register only.
6794           Base = llvm::cast<Variable>(legalize(Offset, Legal_Reg));
6795           Offset = nullptr;
6796         } else {
6797           Offset = llvm::cast<Constant>(legalize(Offset));
6798 
6799           assert(llvm::isa<ConstantInteger32>(Offset) ||
6800                  llvm::isa<ConstantRelocatable>(Offset));
6801         }
6802       }
6803     }
6804     Mem = X86OperandMem::create(Func, Ty, Base, Offset);
6805   }
6806   return llvm::cast<X86OperandMem>(DoLegalize ? legalize(Mem) : Mem);
6807 }
6808 
6809 Variable *TargetX8664::makeReg(Type Type, RegNumT RegNum) {
6810   Variable *Reg = Func->makeVariable(Type);
6811   if (RegNum.hasValue())
6812     Reg->setRegNum(RegNum);
6813   else
6814     Reg->setMustHaveReg();
6815   return Reg;
6816 }
6817 
6818 const Type TypeForSize[] = {IceType_i8, IceType_i16, IceType_i32, IceType_f64,
6819                             IceType_v16i8};
6820 
6821 Type TargetX8664::largestTypeInSize(uint32_t Size, uint32_t MaxSize) {
6822   assert(Size != 0);
6823   uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
6824   uint32_t MaxIndex = MaxSize == NoSizeLimit
6825                           ? llvm::array_lengthof(TypeForSize) - 1
6826                           : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
6827   return TypeForSize[std::min(TyIndex, MaxIndex)];
6828 }
6829 
6830 Type TargetX8664::firstTypeThatFitsSize(uint32_t Size, uint32_t MaxSize) {
6831   assert(Size != 0);
6832   uint32_t TyIndex = llvm::findLastSet(Size, llvm::ZB_Undefined);
6833   if (!llvm::isPowerOf2_32(Size))
6834     ++TyIndex;
6835   uint32_t MaxIndex = MaxSize == NoSizeLimit
6836                           ? llvm::array_lengthof(TypeForSize) - 1
6837                           : llvm::findLastSet(MaxSize, llvm::ZB_Undefined);
6838   return TypeForSize[std::min(TyIndex, MaxIndex)];
6839 }
6840 
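// For example, with TypeForSize == {i8, i16, i32, f64, v16i8}:
//   largestTypeInSize(11)     == IceType_f64   (8 is the largest power of
//                                               two that fits in 11)
//   firstTypeThatFitsSize(11) == IceType_v16i8 (11 rounds up to 16)
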
6841 void TargetX8664::postLower() {
6842   if (Func->getOptLevel() == Opt_m1)
6843     return;
6844   markRedefinitions();
6845   Context.availabilityUpdate();
6846 }
6847 
6848 void TargetX8664::emit(const ConstantInteger32 *C) const {
6849   if (!BuildDefs::dump())
6850     return;
6851   Ostream &Str = Ctx->getStrEmit();
6852   Str << "$" << C->getValue();
6853 }
6854 
6855 void TargetX8664::emit(const ConstantInteger64 *C) const {
6856   if (!BuildDefs::dump())
6857     return;
6858   Ostream &Str = Ctx->getStrEmit();
6859   Str << "$" << C->getValue();
6860 }
6861 
6862 void TargetX8664::emit(const ConstantFloat *C) const {
6863   if (!BuildDefs::dump())
6864     return;
6865   Ostream &Str = Ctx->getStrEmit();
6866   Str << C->getLabelName();
6867 }
6868 
6869 void TargetX8664::emit(const ConstantDouble *C) const {
6870   if (!BuildDefs::dump())
6871     return;
6872   Ostream &Str = Ctx->getStrEmit();
6873   Str << C->getLabelName();
6874 }
6875 
6876 void TargetX8664::emit(const ConstantUndef *) const {
6877   llvm::report_fatal_error("undef value encountered by emitter.");
6878 }
6879 
6880 void TargetX8664::emit(const ConstantRelocatable *C) const {
6881   if (!BuildDefs::dump())
6882     return;
6883   Ostream &Str = Ctx->getStrEmit();
6884   Str << "$";
6885   emitWithoutPrefix(C);
6886 }
6887 
6888 void TargetX8664::emitJumpTable(const Cfg *,
6889                                 const InstJumpTable *JumpTable) const {
6890   if (!BuildDefs::dump())
6891     return;
6892   Ostream &Str = Ctx->getStrEmit();
6893   Str << "\t.section\t.rodata." << JumpTable->getSectionName()
6894       << ",\"a\",@progbits\n"
6895          "\t.align\t"
6896       << typeWidthInBytes(getPointerType()) << "\n"
6897       << JumpTable->getName() << ":";
6898 
6899   for (SizeT I = 0; I < JumpTable->getNumTargets(); ++I)
6900     Str << "\n\t.var\t" << JumpTable->getTarget(I)->getAsmName();
6901   Str << "\n";
6902 }

const TargetX8664::TableFcmpType TargetX8664::TableFcmp[] = {
#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
  {dflt, swapS, CondX86::C1, CondX86::C2, swapV, CondX86::pred},
    FCMPX8664_TABLE
#undef X
};

const size_t TargetX8664::TableFcmpSize = llvm::array_lengthof(TableFcmp);

const TargetX8664::TableIcmp32Type TargetX8664::TableIcmp32[] = {
#define X(val, C_32, C1_64, C2_64, C3_64) {CondX86::C_32},
    ICMPX8664_TABLE
#undef X
};

const size_t TargetX8664::TableIcmp32Size = llvm::array_lengthof(TableIcmp32);

std::array<SmallBitVector, RCX86_NUM> TargetX8664::TypeToRegisterSet = {{}};

std::array<SmallBitVector, RCX86_NUM> TargetX8664::TypeToRegisterSetUnfiltered =
    {{}};

std::array<SmallBitVector, RegX8664::Reg_NUM> TargetX8664::RegisterAliases = {
    {}};

template <typename T>
void TargetDataX8664::emitConstantPool(GlobalContext *Ctx) {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Type Ty = T::Ty;
  SizeT Align = typeAlignInBytes(Ty);
  ConstantList Pool = Ctx->getConstantPool(Ty);

  Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",@progbits," << Align
      << "\n";
  Str << "\t.align\t" << Align << "\n";

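  // Illustrative output: pooling the f32 constant 1.0 produces a line of the
  // form "<label>: .long 0x3f800000 /* float 1 */", with the directive,
  // printf format, and type name taken from the PoolTypeConverter traits.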
  for (Constant *C : Pool) {
    if (!C->getShouldBePooled())
      continue;
    auto *Const = llvm::cast<typename T::IceType>(C);
    typename T::IceType::PrimType Value = Const->getValue();
    // Use memcpy() to copy bits from Value into RawValue in a way that avoids
    // breaking strict-aliasing rules.
    typename T::PrimitiveIntType RawValue;
    memcpy(&RawValue, &Value, sizeof(Value));
    char buf[30];
    int CharsPrinted =
        snprintf(buf, llvm::array_lengthof(buf), T::PrintfString, RawValue);
    assert(CharsPrinted >= 0);
    assert((size_t)CharsPrinted < llvm::array_lengthof(buf));
    (void)CharsPrinted; // avoid warnings if asserts are disabled
    Str << Const->getLabelName();
    Str << ":\n\t" << T::AsmTag << "\t" << buf << "\t/* " << T::TypeName << " "
        << Value << " */\n";
  }
}

void TargetDataX8664::lowerConstants() {
  if (getFlags().getDisableTranslation())
    return;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();

    Writer->writeConstantPool<ConstantInteger32>(IceType_i8);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i16);
    Writer->writeConstantPool<ConstantInteger32>(IceType_i32);

    Writer->writeConstantPool<ConstantFloat>(IceType_f32);
    Writer->writeConstantPool<ConstantDouble>(IceType_f64);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);

    emitConstantPool<PoolTypeConverter<uint8_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint16_t>>(Ctx);
    emitConstantPool<PoolTypeConverter<uint32_t>>(Ctx);

    emitConstantPool<PoolTypeConverter<float>>(Ctx);
    emitConstantPool<PoolTypeConverter<double>>(Ctx);
  } break;
  }
}

void TargetDataX8664::lowerJumpTables() {
  const bool IsPIC = false;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    constexpr FixupKind FK_Abs64 = llvm::ELF::R_X86_64_64;
    const FixupKind RelocationKind =
        (getPointerType() == IceType_i32) ? FK_Abs : FK_Abs64;
    for (const JumpTableData &JT : Ctx->getJumpTables())
      Writer->writeJumpTable(JT, RelocationKind, IsPIC);
  } break;
  case FT_Asm:
    // Already emitted from the Cfg.
    break;
  case FT_Iasm: {
    if (!BuildDefs::dump())
      return;
    Ostream &Str = Ctx->getStrEmit();
    const char *Prefix = IsPIC ? ".data.rel.ro." : ".rodata.";
    for (const JumpTableData &JT : Ctx->getJumpTables()) {
      Str << "\t.section\t" << Prefix << JT.getSectionName()
          << ",\"a\",@progbits\n"
             "\t.align\t"
          << typeWidthInBytes(getPointerType()) << "\n"
          << JT.getName().toString() << ":";

      // On x86-64 ILP32 (x32), pointers are 32-bit, hence the use of .long.
      for (intptr_t TargetOffset : JT.getTargetOffsets())
        Str << "\n\t.long\t" << JT.getFunctionName() << "+" << TargetOffset;
      Str << "\n";
    }
  } break;
  }
}

void TargetDataX8664::lowerGlobals(const VariableDeclarationList &Vars,
                                   const std::string &SectionSuffix) {
  const bool IsPIC = false;
  switch (getFlags().getOutFileType()) {
  case FT_Elf: {
    ELFObjectWriter *Writer = Ctx->getObjectWriter();
    Writer->writeDataSection(Vars, FK_Abs, SectionSuffix, IsPIC);
  } break;
  case FT_Asm:
  case FT_Iasm: {
    OstreamLocker L(Ctx);
    for (const VariableDeclaration *Var : Vars) {
      if (getFlags().matchTranslateOnly(Var->getName(), 0)) {
        emitGlobal(*Var, SectionSuffix);
      }
    }
  } break;
  }
}

//------------------------------------------------------------------------------
//     __      ______  __     __  ______  ______  __  __   __  ______
//    /\ \    /\  __ \/\ \  _ \ \/\  ___\/\  == \/\ \/\ "-.\ \/\  ___\
//    \ \ \___\ \ \/\ \ \ \/ ".\ \ \  __\\ \  __<\ \ \ \ \-.  \ \ \__ \
//     \ \_____\ \_____\ \__/".~\_\ \_____\ \_\ \_\ \_\ \_\\"\_\ \_____\
//      \/_____/\/_____/\/_/   \/_/\/_____/\/_/ /_/\/_/\/_/ \/_/\/_____/
//
//------------------------------------------------------------------------------
void TargetX8664::_add_sp(Operand *Adjustment) {
  Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, IceType_i64);
  _add(rsp, Adjustment);
}

void TargetX8664::_mov_sp(Operand *NewValue) {
  Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, IceType_i64);
  _redefined(_mov(rsp, NewValue));
}

void TargetX8664::_link_bp() {
  Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, WordType);
  Variable *rbp = getPhysicalRegister(RegX8664::Reg_rbp, WordType);

  _push(rbp);
  _mov(rbp, rsp);
  // Keep rbp live for late-stage liveness analysis (e.g. asm-verbose mode).
  Context.insert<InstFakeUse>(rbp);
}

void TargetX8664::_unlink_bp() {
  Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, IceType_i64);
  Variable *rbp = getPhysicalRegister(RegX8664::Reg_rbp, IceType_i64);
  // For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
  // use of rsp before the assignment of rsp=rbp keeps previous rsp
  // adjustments from being dead-code eliminated.
  Context.insert<InstFakeUse>(rsp);

  _mov(rsp, rbp);
  _pop(rbp);
}

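// There is no push instruction for XMM registers, so they are saved by
// reserving a 16-byte slot below rsp and storing through it; roughly
// (illustrative):
//   sub    $16, %rsp
//   movups %xmm5, (%rsp)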
void TargetX8664::_push_reg(RegNumT RegNum) {
  if (RegX8664::isXmm(RegNum)) {
    Variable *reg = getPhysicalRegister(RegNum, IceType_v4f32);
    Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, WordType);
    auto *address = X86OperandMem::create(Func, reg->getType(), rsp, nullptr);
    _sub_sp(
        Ctx->getConstantInt32(16)); // TODO(capn): accumulate all the offsets
                                    // and adjust the stack pointer once.
    _storep(reg, address);
  } else {
    _push(getPhysicalRegister(RegNum, WordType));
  }
}

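// Mirror image of _push_reg: reload the XMM register through rsp, then
// release the 16-byte slot.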
void TargetX8664::_pop_reg(RegNumT RegNum) {
  if (RegX8664::isXmm(RegNum)) {
    Variable *reg = getPhysicalRegister(RegNum, IceType_v4f32);
    Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, WordType);
    auto *address = X86OperandMem::create(Func, reg->getType(), rsp, nullptr);
    _movp(reg, address);
    _add_sp(
        Ctx->getConstantInt32(16)); // TODO(capn): accumulate all the offsets
                                    // and adjust the stack pointer once.
  } else {
    _pop(getPhysicalRegister(RegNum, WordType));
  }
}

void TargetX8664::_sub_sp(Operand *Adjustment) {
  Variable *rsp = getPhysicalRegister(RegX8664::Reg_rsp, WordType);

  _sub(rsp, Adjustment);

  // Add a fake use of the stack pointer, to prevent the stack pointer
  // adjustment from being dead-code eliminated in a function that doesn't
  // return.
  Context.insert<InstFakeUse>(rsp);
}

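// Jump targets narrower than i64 are zero-extended first, since the 64-bit
// indirect jmp consumes a full register.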
void TargetX8664::lowerIndirectJump(Variable *JumpTarget) {
  if (JumpTarget->getType() != IceType_i64) {
    Variable *T = makeReg(IceType_i64);
    _movzx(T, JumpTarget);
    JumpTarget = T;
  }

  _jmp(JumpTarget);
}

Inst *TargetX8664::emitCallToTarget(Operand *CallTarget, Variable *ReturnReg,
                                    size_t NumVariadicFpArgs) {
  if (CallTarget->getType() == IceType_i64) {
    // x86-64 does not support 64-bit direct calls, so write the value to a
    // register and make an indirect call for Constant call targets.
    RegNumT TargetReg = {};

    // System V: force r11 when calling a variadic function so that rax isn't
    // used, since rax stores the number of FP args (see NumVariadicFpArgs
    // usage below).
#if !defined(SUBZERO_USE_MICROSOFT_ABI)
    if (NumVariadicFpArgs > 0)
      TargetReg = RegX8664::Reg_r11;
#endif

    if (llvm::isa<Constant>(CallTarget)) {
      Variable *T = makeReg(IceType_i64, TargetReg);
      _mov(T, CallTarget);
      CallTarget = T;
    } else if (llvm::isa<Variable>(CallTarget)) {
      Operand *T = legalizeToReg(CallTarget, TargetReg);
      CallTarget = T;
    }
  }

  // System V: store the number of FP args in rax for variadic calls.
#if !defined(SUBZERO_USE_MICROSOFT_ABI)
  if (NumVariadicFpArgs > 0) {
    // Store the number of FP args (passed in XMM registers) in rax for
    // variadic calls.
    auto *NumFpArgs = Ctx->getConstantInt64(NumVariadicFpArgs);
    Variable *NumFpArgsReg = legalizeToReg(NumFpArgs, RegX8664::Reg_rax);
    Context.insert<InstFakeUse>(NumFpArgsReg);
  }
#endif
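  // Illustrative System V sequence for a variadic call with two FP args:
  //   mov  $2, %rax     // number of vector-register args
  //   call *%r11        // target was forced into r11 above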

  return Context.insert<Insts::Call>(ReturnReg, CallTarget);
}

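// Per the x86-64 calling conventions used here, vector and scalar FP results
// are returned in xmm0, and integer results in rax (or its width-appropriate
// alias).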
Variable *TargetX8664::moveReturnValueToRegister(Operand *Value,
                                                 Type ReturnType) {
  if (isVectorType(ReturnType) || isScalarFloatingType(ReturnType)) {
    return legalizeToReg(Value, RegX8664::Reg_xmm0);
  } else {
    assert(ReturnType == IceType_i32 || ReturnType == IceType_i64);
    Variable *Reg = nullptr;
    _mov(Reg, Value, RegX8664::getGprForType(ReturnType, RegX8664::Reg_rax));
    return Reg;
  }
}

void TargetX8664::emitStackProbe(size_t StackSizeBytes) {
#if defined(_WIN64)
  // Mirroring the behavior of MSVC here, which emits a __chkstk call when
  // locals are >= 4KB, rather than the 8KB claimed by the docs.
  if (StackSizeBytes >= 4096) {
    // __chkstk on Win64 probes the stack up to RSP - EAX, but does not clobber
    // RSP, so we don't need to save and restore it.

    Variable *EAX = makeReg(IceType_i32, RegX8664::Reg_eax);
    _mov(EAX, Ctx->getConstantInt32(StackSizeBytes));

    auto *CallTarget =
        Ctx->getConstantInt64(reinterpret_cast<int64_t>(&__chkstk));
    Operand *CallTargetReg = legalizeToReg(CallTarget, RegX8664::Reg_r11);
    emitCallToTarget(CallTargetReg, nullptr);
  }
#endif
}

// In some cases, there are x-macro tables for both high-level and low-level
// instructions/operands that use the same enum key value. The tables are kept
// separate to maintain a proper separation between abstraction layers. There
// is a risk that the tables could get out of sync if enum values are reordered
// or if entries are added or deleted. The following dummy namespaces use
// static_asserts to ensure everything is kept in sync.

namespace {
// Validate the enum values in FCMPX8664_TABLE.
namespace dummy1 {
// Define a temporary set of enum values based on low-level table entries.
enum _tmp_enum {
#define X(val, dflt, swapS, C1, C2, swapV, pred) _tmp_##val,
  FCMPX8664_TABLE
#undef X
      _num
};
// Define a set of constants based on high-level table entries.
#define X(tag, str) static const int _table1_##tag = InstFcmp::tag;
ICEINSTFCMP_TABLE
#undef X
// Define a set of constants based on low-level table entries, and ensure the
// table entry keys are consistent.
#define X(val, dflt, swapS, C1, C2, swapV, pred)                               \
  static const int _table2_##val = _tmp_##val;                                 \
  static_assert(                                                               \
      _table1_##val == _table2_##val,                                          \
      "Inconsistency between FCMPX8664_TABLE and ICEINSTFCMP_TABLE");
FCMPX8664_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table entries in
// case the high-level table has extra entries.
#define X(tag, str)                                                            \
  static_assert(                                                               \
      _table1_##tag == _table2_##tag,                                          \
      "Inconsistency between FCMPX8664_TABLE and ICEINSTFCMP_TABLE");
ICEINSTFCMP_TABLE
#undef X
} // end of namespace dummy1

// Validate the enum values in ICMPX8664_TABLE.
namespace dummy2 {
// Define a temporary set of enum values based on low-level table entries.
enum _tmp_enum {
#define X(val, C_32, C1_64, C2_64, C3_64) _tmp_##val,
  ICMPX8664_TABLE
#undef X
      _num
};
// Define a set of constants based on high-level table entries.
#define X(tag, reverse, str) static const int _table1_##tag = InstIcmp::tag;
ICEINSTICMP_TABLE
#undef X
// Define a set of constants based on low-level table entries, and ensure the
// table entry keys are consistent.
#define X(val, C_32, C1_64, C2_64, C3_64)                                      \
  static const int _table2_##val = _tmp_##val;                                 \
  static_assert(                                                               \
      _table1_##val == _table2_##val,                                          \
      "Inconsistency between ICMPX8664_TABLE and ICEINSTICMP_TABLE");
ICMPX8664_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table entries in
// case the high-level table has extra entries.
#define X(tag, reverse, str)                                                   \
  static_assert(                                                               \
      _table1_##tag == _table2_##tag,                                          \
      "Inconsistency between ICMPX8664_TABLE and ICEINSTICMP_TABLE");
ICEINSTICMP_TABLE
#undef X
} // end of namespace dummy2

// Validate the enum values in ICETYPEX86_TABLE.
namespace dummy3 {
// Define a temporary set of enum values based on low-level table entries.
enum _tmp_enum {
#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
  _tmp_##tag,
  ICETYPEX86_TABLE
#undef X
      _num
};
// Define a set of constants based on high-level table entries.
#define X(tag, sizeLog2, align, elts, elty, str, rcstr)                        \
  static const int _table1_##tag = IceType_##tag;
ICETYPE_TABLE
#undef X
// Define a set of constants based on low-level table entries, and ensure the
// table entry keys are consistent.
#define X(tag, elty, cvt, sdss, pdps, spsd, int_, unpack, pack, width, fld)    \
  static const int _table2_##tag = _tmp_##tag;                                 \
  static_assert(_table1_##tag == _table2_##tag,                                \
                "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
ICETYPEX86_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table entries in
// case the high-level table has extra entries.
#define X(tag, sizeLog2, align, elts, elty, str, rcstr)                        \
  static_assert(_table1_##tag == _table2_##tag,                                \
                "Inconsistency between ICETYPEX86_TABLE and ICETYPE_TABLE");
ICETYPE_TABLE
#undef X

} // end of namespace dummy3
} // end of anonymous namespace

} // end of namespace X8664
} // end of namespace Ice