//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }
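  // Typical usage (a minimal sketch, not taken from this file): a concrete
  // target TTI implementation derives from this base via CRTP and supplies
  // the getST()/getTLI() hooks described above. The names MyTTIImpl,
  // MyTargetMachine, MySubtarget and MyTargetLowering below are placeholders
  // for illustration only:
  //
  //   class MyTTIImpl : public BasicTTIImplBase<MyTTIImpl> {
  //     const MySubtarget *ST;
  //     const MyTargetLowering *TLI;
  //
  //   public:
  //     MyTTIImpl(const MyTargetMachine *TM, const Function &F)
  //         : BasicTTIImplBase(TM, F.getParent()->getDataLayout()),
  //           ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
  //
  //     const MySubtarget *getST() const { return ST; }
  //     const MyTargetLowering *getTLI() const { return TLI; }
  //   };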
  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
                                              TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zero'th element
    // plus the cost of inserting it into every element of the result vector.
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                        CostKind, 0, nullptr, nullptr);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
                                            TTI::TargetCostKind CostKind) {
    InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting the elements from the
    // arguments plus the cost of inserting them into the result vector.

    // E.g. a <4 x float> shuffle with a mask of <0,5,2,7> needs to extract
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector and finally index 3 of the second vector, and insert
    // them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          CostKind, i, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
                                              TTI::TargetCostKind CostKind,
                                              int Index,
                                              FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting the
    // elements from the source type plus the cost of inserting them into the
    // result vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                      CostKind, i + Index, nullptr, nullptr);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
    }
    return Cost;
  }
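  // Rough illustration of the two estimates above (assuming, purely for the
  // sake of the example, a cost of 1 per insert/extract): broadcasting lane 0
  // of a <4 x float> is modeled as 1 extract + 4 inserts = 5, while a general
  // permute of a <4 x float> is modeled as 4 extracts + 4 inserts = 8.
  // Targets with real shuffle instructions are expected to override
  // getShuffleCost with cheaper numbers.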
  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
                                             TTI::TargetCostKind CostKind,
                                             int Index,
                                             FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting the
    // elements from the source type plus the cost of inserting them into the
    // result vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
                                          CostKind, i, nullptr, nullptr);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
                                      i + Index, nullptr, nullptr);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
    case TTI::MIM_Unindexed:
      return ISD::UNINDEXED;
    case TTI::MIM_PreInc:
      return ISD::PRE_INC;
    case TTI::MIM_PreDec:
      return ISD::PRE_DEC;
    case TTI::MIM_PostInc:
      return ISD::POST_INC;
    case TTI::MIM_PostDec:
      return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind) {
    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(DataTy))
      return InstructionCost::getInvalid();

    auto *VT = cast<FixedVectorType>(DataTy);
    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of the individual memory operations.
    InstructionCost AddrExtractCost =
        IsGatherScatter
            ? getVectorInstrCost(Instruction::ExtractElement,
                                 FixedVectorType::get(
                                     PointerType::get(VT->getElementType(), 0),
                                     VT->getNumElements()),
                                 CostKind, -1, nullptr, nullptr)
            : 0;
    InstructionCost LoadCost =
        VT->getNumElements() *
        (AddrExtractCost +
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));

    // Next, compute the cost of packing the result in a vector.
    InstructionCost PackingCost =
        getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                 Opcode == Instruction::Store, CostKind);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // the branches, and the PHIs to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
      ConditionalCost =
          VT->getNumElements() *
          (getVectorInstrCost(
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
               CostKind, -1, nullptr, nullptr) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
    }

    return LoadCost + PackingCost + ConditionalCost;
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, Align Alignment,
                                      unsigned *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence(const Function *F = nullptr) { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return false;
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return true;
  }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
296 return -1; 297 } 298 collectFlatAddressOperands(SmallVectorImpl<int> & OpIndexes,Intrinsic::ID IID)299 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, 300 Intrinsic::ID IID) const { 301 return false; 302 } 303 isNoopAddrSpaceCast(unsigned FromAS,unsigned ToAS)304 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 305 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); 306 } 307 getAssumedAddrSpace(const Value * V)308 unsigned getAssumedAddrSpace(const Value *V) const { 309 return getTLI()->getTargetMachine().getAssumedAddrSpace(V); 310 } 311 isSingleThreaded()312 bool isSingleThreaded() const { 313 return getTLI()->getTargetMachine().Options.ThreadModel == 314 ThreadModel::Single; 315 } 316 317 std::pair<const Value *, unsigned> getPredicatedAddrSpace(const Value * V)318 getPredicatedAddrSpace(const Value *V) const { 319 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); 320 } 321 rewriteIntrinsicWithAddressSpace(IntrinsicInst * II,Value * OldV,Value * NewV)322 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, 323 Value *NewV) const { 324 return nullptr; 325 } 326 isLegalAddImmediate(int64_t imm)327 bool isLegalAddImmediate(int64_t imm) { 328 return getTLI()->isLegalAddImmediate(imm); 329 } 330 isLegalICmpImmediate(int64_t imm)331 bool isLegalICmpImmediate(int64_t imm) { 332 return getTLI()->isLegalICmpImmediate(imm); 333 } 334 335 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, 336 bool HasBaseReg, int64_t Scale, 337 unsigned AddrSpace, Instruction *I = nullptr) { 338 TargetLoweringBase::AddrMode AM; 339 AM.BaseGV = BaseGV; 340 AM.BaseOffs = BaseOffset; 341 AM.HasBaseReg = HasBaseReg; 342 AM.Scale = Scale; 343 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); 344 } 345 getPreferredLargeGEPBaseOffset(int64_t MinOffset,int64_t MaxOffset)346 int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) { 347 return getTLI()->getPreferredLargeGEPBaseOffset(MinOffset, MaxOffset); 348 } 349 getStoreMinimumVF(unsigned VF,Type * ScalarMemTy,Type * ScalarValTy)350 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, 351 Type *ScalarValTy) const { 352 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { 353 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2); 354 EVT VT = getTLI()->getValueType(DL, SrcTy); 355 if (getTLI()->isOperationLegal(ISD::STORE, VT) || 356 getTLI()->isOperationCustom(ISD::STORE, VT)) 357 return true; 358 359 EVT ValVT = 360 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2)); 361 EVT LegalizedVT = 362 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT); 363 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT); 364 }; 365 while (VF > 2 && IsSupportedByTarget(VF)) 366 VF /= 2; 367 return VF; 368 } 369 isIndexedLoadLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)370 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, 371 const DataLayout &DL) const { 372 EVT VT = getTLI()->getValueType(DL, Ty); 373 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); 374 } 375 isIndexedStoreLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)376 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, 377 const DataLayout &DL) const { 378 EVT VT = getTLI()->getValueType(DL, Ty); 379 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); 380 } 381 isLSRCostLess(TTI::LSRCost C1,TTI::LSRCost C2)382 bool isLSRCostLess(TTI::LSRCost C1, 
                     TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool shouldFoldTerminatingConditionAfterLSR() const {
    return TargetTransformInfoImplBase::
        shouldFoldTerminatingConditionAfterLSR();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       int64_t BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
      return 0;
    return -1;
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  unsigned getRegUsageForType(Type *Ty) {
    EVT ETy = getTLI()->getValueType(DL, Ty);
    return getTLI()->getNumRegisters(Ty->getContext(), ETy);
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands, Type *AccessType,
                             TTI::TargetCostKind CostKind) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model to be used in
    /// other places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
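    // For instance (a rough illustration): with a 64-bit pointer index type,
    // a switch with 100 cases in a function where jump tables are disallowed
    // cannot be covered by a single bit test either (100 > 64), so the check
    // below simply reports N = 100 clusters.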
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test.
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether the range of clusters is dense enough for a jump table.
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool shouldBuildRelLookupTables() const {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    // If non-PIC mode, do not generate a relative lookup table.
    if (!TM.isPositionIndependent())
      return false;

    /// Relative lookup table entries consist of 32-bit offsets.
    /// Do not generate relative lookup tables for large code models
    /// in 64-bit architectures where 32-bit offsets might not be enough.
    if (TM.getCodeModel() == CodeModel::Medium ||
        TM.getCodeModel() == CodeModel::Large)
      return false;

    Triple TargetTriple = TM.getTargetTriple();
    if (!TargetTriple.isArch64Bit())
      return false;

    // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
    // there.
    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
      return false;

    return true;
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  InstructionCost getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
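    // For example (an assumption for illustration, not a statement about any
    // particular target): if f32 and f64 FADD are Legal but fp128 FADD has to
    // be expanded to a library call, f32/f64 operations are costed TCC_Basic
    // below while fp128 operations get TCC_Expensive.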
542 const TargetLoweringBase *TLI = getTLI(); 543 EVT VT = TLI->getValueType(DL, Ty); 544 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) 545 return TargetTransformInfo::TCC_Basic; 546 return TargetTransformInfo::TCC_Expensive; 547 } 548 preferToKeepConstantsAttached(const Instruction & Inst,const Function & Fn)549 bool preferToKeepConstantsAttached(const Instruction &Inst, 550 const Function &Fn) const { 551 switch (Inst.getOpcode()) { 552 default: 553 break; 554 case Instruction::SDiv: 555 case Instruction::SRem: 556 case Instruction::UDiv: 557 case Instruction::URem: { 558 if (!isa<ConstantInt>(Inst.getOperand(1))) 559 return false; 560 EVT VT = getTLI()->getValueType(DL, Inst.getType()); 561 return !getTLI()->isIntDivCheap(VT, Fn.getAttributes()); 562 } 563 }; 564 565 return false; 566 } 567 getInliningThresholdMultiplier()568 unsigned getInliningThresholdMultiplier() const { return 1; } adjustInliningThreshold(const CallBase * CB)569 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } getCallerAllocaCost(const CallBase * CB,const AllocaInst * AI)570 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const { 571 return 0; 572 } 573 getInlinerVectorBonusPercent()574 int getInlinerVectorBonusPercent() const { return 150; } 575 getUnrollingPreferences(Loop * L,ScalarEvolution & SE,TTI::UnrollingPreferences & UP,OptimizationRemarkEmitter * ORE)576 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 577 TTI::UnrollingPreferences &UP, 578 OptimizationRemarkEmitter *ORE) { 579 // This unrolling functionality is target independent, but to provide some 580 // motivation for its intended use, for x86: 581 582 // According to the Intel 64 and IA-32 Architectures Optimization Reference 583 // Manual, Intel Core models and later have a loop stream detector (and 584 // associated uop queue) that can benefit from partial unrolling. 585 // The relevant requirements are: 586 // - The loop must have no more than 4 (8 for Nehalem and later) branches 587 // taken, and none of them may be calls. 588 // - The loop can have no more than 18 (28 for Nehalem and later) uops. 589 590 // According to the Software Optimization Guide for AMD Family 15h 591 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor 592 // and loop buffer which can benefit from partial unrolling. 593 // The relevant requirements are: 594 // - The loop must have fewer than 16 branches 595 // - The loop must have less than 40 uops in all executed loop branches 596 597 // The number of taken branches in a loop is hard to estimate here, and 598 // benchmarking has revealed that it is better not to be conservative when 599 // estimating the branch count. As a result, we'll ignore the branch limits 600 // until someone finds a case where it matters in practice. 601 602 unsigned MaxOps; 603 const TargetSubtargetInfo *ST = getST(); 604 if (PartialUnrollingThreshold.getNumOccurrences() > 0) 605 MaxOps = PartialUnrollingThreshold; 606 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) 607 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; 608 else 609 return; 610 611 // Scan the loop: don't unroll loops with calls. 
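    // (A loop body that calls, say, an external function will not fit in the
    // loop buffer this heuristic models, so we give up rather than partially
    // unroll it; intrinsics the target knows are not lowered to real calls
    // are skipped by the isLoweredToCall() check below.)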
612 for (BasicBlock *BB : L->blocks()) { 613 for (Instruction &I : *BB) { 614 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 615 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 616 if (!thisT()->isLoweredToCall(F)) 617 continue; 618 } 619 620 if (ORE) { 621 ORE->emit([&]() { 622 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(), 623 L->getHeader()) 624 << "advising against unrolling the loop because it " 625 "contains a " 626 << ore::NV("Call", &I); 627 }); 628 } 629 return; 630 } 631 } 632 } 633 634 // Enable runtime and partial unrolling up to the specified size. 635 // Enable using trip count upper bound to unroll loops. 636 UP.Partial = UP.Runtime = UP.UpperBound = true; 637 UP.PartialThreshold = MaxOps; 638 639 // Avoid unrolling when optimizing for size. 640 UP.OptSizeThreshold = 0; 641 UP.PartialOptSizeThreshold = 0; 642 643 // Set number of instructions optimized when "back edge" 644 // becomes "fall through" to default value of 2. 645 UP.BEInsns = 2; 646 } 647 getPeelingPreferences(Loop * L,ScalarEvolution & SE,TTI::PeelingPreferences & PP)648 void getPeelingPreferences(Loop *L, ScalarEvolution &SE, 649 TTI::PeelingPreferences &PP) { 650 PP.PeelCount = 0; 651 PP.AllowPeeling = true; 652 PP.AllowLoopNestsPeeling = false; 653 PP.PeelProfiledIterations = true; 654 } 655 isHardwareLoopProfitable(Loop * L,ScalarEvolution & SE,AssumptionCache & AC,TargetLibraryInfo * LibInfo,HardwareLoopInfo & HWLoopInfo)656 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 657 AssumptionCache &AC, 658 TargetLibraryInfo *LibInfo, 659 HardwareLoopInfo &HWLoopInfo) { 660 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); 661 } 662 preferPredicateOverEpilogue(TailFoldingInfo * TFI)663 bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) { 664 return BaseT::preferPredicateOverEpilogue(TFI); 665 } 666 667 TailFoldingStyle 668 getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) { 669 return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow); 670 } 671 instCombineIntrinsic(InstCombiner & IC,IntrinsicInst & II)672 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, 673 IntrinsicInst &II) { 674 return BaseT::instCombineIntrinsic(IC, II); 675 } 676 677 std::optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedMask,KnownBits & Known,bool & KnownBitsComputed)678 simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, 679 APInt DemandedMask, KnownBits &Known, 680 bool &KnownBitsComputed) { 681 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, 682 KnownBitsComputed); 683 } 684 simplifyDemandedVectorEltsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedElts,APInt & UndefElts,APInt & UndefElts2,APInt & UndefElts3,std::function<void (Instruction *,unsigned,APInt,APInt &)> SimplifyAndSetOp)685 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic( 686 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 687 APInt &UndefElts2, APInt &UndefElts3, 688 std::function<void(Instruction *, unsigned, APInt, APInt &)> 689 SimplifyAndSetOp) { 690 return BaseT::simplifyDemandedVectorEltsIntrinsic( 691 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, 692 SimplifyAndSetOp); 693 } 694 695 virtual std::optional<unsigned> getCacheSize(TargetTransformInfo::CacheLevel Level)696 getCacheSize(TargetTransformInfo::CacheLevel Level) const { 697 return std::optional<unsigned>( 698 
getST()->getCacheSize(static_cast<unsigned>(Level))); 699 } 700 701 virtual std::optional<unsigned> getCacheAssociativity(TargetTransformInfo::CacheLevel Level)702 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { 703 std::optional<unsigned> TargetResult = 704 getST()->getCacheAssociativity(static_cast<unsigned>(Level)); 705 706 if (TargetResult) 707 return TargetResult; 708 709 return BaseT::getCacheAssociativity(Level); 710 } 711 getCacheLineSize()712 virtual unsigned getCacheLineSize() const { 713 return getST()->getCacheLineSize(); 714 } 715 getPrefetchDistance()716 virtual unsigned getPrefetchDistance() const { 717 return getST()->getPrefetchDistance(); 718 } 719 getMinPrefetchStride(unsigned NumMemAccesses,unsigned NumStridedMemAccesses,unsigned NumPrefetches,bool HasCall)720 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, 721 unsigned NumStridedMemAccesses, 722 unsigned NumPrefetches, 723 bool HasCall) const { 724 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, 725 NumPrefetches, HasCall); 726 } 727 getMaxPrefetchIterationsAhead()728 virtual unsigned getMaxPrefetchIterationsAhead() const { 729 return getST()->getMaxPrefetchIterationsAhead(); 730 } 731 enableWritePrefetching()732 virtual bool enableWritePrefetching() const { 733 return getST()->enableWritePrefetching(); 734 } 735 shouldPrefetchAddressSpace(unsigned AS)736 virtual bool shouldPrefetchAddressSpace(unsigned AS) const { 737 return getST()->shouldPrefetchAddressSpace(AS); 738 } 739 740 /// @} 741 742 /// \name Vector TTI Implementations 743 /// @{ 744 getRegisterBitWidth(TargetTransformInfo::RegisterKind K)745 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 746 return TypeSize::getFixed(32); 747 } 748 getMaxVScale()749 std::optional<unsigned> getMaxVScale() const { return std::nullopt; } getVScaleForTuning()750 std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; } isVScaleKnownToBeAPowerOfTwo()751 bool isVScaleKnownToBeAPowerOfTwo() const { return false; } 752 753 /// Estimate the overhead of scalarizing an instruction. Insert and Extract 754 /// are set if the demanded result elements need to be inserted and/or 755 /// extracted from vectors. getScalarizationOverhead(VectorType * InTy,const APInt & DemandedElts,bool Insert,bool Extract,TTI::TargetCostKind CostKind)756 InstructionCost getScalarizationOverhead(VectorType *InTy, 757 const APInt &DemandedElts, 758 bool Insert, bool Extract, 759 TTI::TargetCostKind CostKind) { 760 /// FIXME: a bitfield is not a reasonable abstraction for talking about 761 /// which elements are needed from a scalable vector 762 if (isa<ScalableVectorType>(InTy)) 763 return InstructionCost::getInvalid(); 764 auto *Ty = cast<FixedVectorType>(InTy); 765 766 assert(DemandedElts.getBitWidth() == Ty->getNumElements() && 767 "Vector size mismatch"); 768 769 InstructionCost Cost = 0; 770 771 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { 772 if (!DemandedElts[i]) 773 continue; 774 if (Insert) 775 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, 776 CostKind, i, nullptr, nullptr); 777 if (Extract) 778 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 779 CostKind, i, nullptr, nullptr); 780 } 781 782 return Cost; 783 } 784 785 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. 
getScalarizationOverhead(VectorType * InTy,bool Insert,bool Extract,TTI::TargetCostKind CostKind)786 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, 787 bool Extract, 788 TTI::TargetCostKind CostKind) { 789 if (isa<ScalableVectorType>(InTy)) 790 return InstructionCost::getInvalid(); 791 auto *Ty = cast<FixedVectorType>(InTy); 792 793 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements()); 794 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract, 795 CostKind); 796 } 797 798 /// Estimate the overhead of scalarizing an instructions unique 799 /// non-constant operands. The (potentially vector) types to use for each of 800 /// argument are passes via Tys. 801 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value * > Args,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)802 getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, 803 ArrayRef<Type *> Tys, 804 TTI::TargetCostKind CostKind) { 805 assert(Args.size() == Tys.size() && "Expected matching Args and Tys"); 806 807 InstructionCost Cost = 0; 808 SmallPtrSet<const Value*, 4> UniqueOperands; 809 for (int I = 0, E = Args.size(); I != E; I++) { 810 // Disregard things like metadata arguments. 811 const Value *A = Args[I]; 812 Type *Ty = Tys[I]; 813 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && 814 !Ty->isPtrOrPtrVectorTy()) 815 continue; 816 817 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { 818 if (auto *VecTy = dyn_cast<VectorType>(Ty)) 819 Cost += getScalarizationOverhead(VecTy, /*Insert*/ false, 820 /*Extract*/ true, CostKind); 821 } 822 } 823 824 return Cost; 825 } 826 827 /// Estimate the overhead of scalarizing the inputs and outputs of an 828 /// instruction, with return type RetTy and arguments Args of type Tys. If 829 /// Args are unknown (empty), then the cost associated with one argument is 830 /// added as a heuristic. getScalarizationOverhead(VectorType * RetTy,ArrayRef<const Value * > Args,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)831 InstructionCost getScalarizationOverhead(VectorType *RetTy, 832 ArrayRef<const Value *> Args, 833 ArrayRef<Type *> Tys, 834 TTI::TargetCostKind CostKind) { 835 InstructionCost Cost = getScalarizationOverhead( 836 RetTy, /*Insert*/ true, /*Extract*/ false, CostKind); 837 if (!Args.empty()) 838 Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind); 839 else 840 // When no information on arguments is provided, we add the cost 841 // associated with one argument as a heuristic. 842 Cost += getScalarizationOverhead(RetTy, /*Insert*/ false, 843 /*Extract*/ true, CostKind); 844 845 return Cost; 846 } 847 848 /// Estimate the cost of type-legalization and the legalized type. getTypeLegalizationCost(Type * Ty)849 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const { 850 LLVMContext &C = Ty->getContext(); 851 EVT MTy = getTLI()->getValueType(DL, Ty); 852 853 InstructionCost Cost = 1; 854 // We keep legalizing the type until we find a legal kind. We assume that 855 // the only operation that costs anything is the split. After splitting 856 // we need to handle two types. 857 while (true) { 858 TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy); 859 860 if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) { 861 // Ensure we return a sensible simple VT here, since many callers of 862 // this function require it. 863 MVT VT = MTy.isSimple() ? 
MTy.getSimpleVT() : MVT::i64; 864 return std::make_pair(InstructionCost::getInvalid(), VT); 865 } 866 867 if (LK.first == TargetLoweringBase::TypeLegal) 868 return std::make_pair(Cost, MTy.getSimpleVT()); 869 870 if (LK.first == TargetLoweringBase::TypeSplitVector || 871 LK.first == TargetLoweringBase::TypeExpandInteger) 872 Cost *= 2; 873 874 // Do not loop with f128 type. 875 if (MTy == LK.second) 876 return std::make_pair(Cost, MTy.getSimpleVT()); 877 878 // Keep legalizing the type. 879 MTy = LK.second; 880 } 881 } 882 getMaxInterleaveFactor(ElementCount VF)883 unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; } 884 885 InstructionCost getArithmeticInstrCost( 886 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 887 TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None}, 888 TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None}, 889 ArrayRef<const Value *> Args = ArrayRef<const Value *>(), 890 const Instruction *CxtI = nullptr) { 891 // Check if any of the operands are vector operands. 892 const TargetLoweringBase *TLI = getTLI(); 893 int ISD = TLI->InstructionOpcodeToISD(Opcode); 894 assert(ISD && "Invalid opcode"); 895 896 // TODO: Handle more cost kinds. 897 if (CostKind != TTI::TCK_RecipThroughput) 898 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, 899 Opd1Info, Opd2Info, 900 Args, CxtI); 901 902 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty); 903 904 bool IsFloat = Ty->isFPOrFPVectorTy(); 905 // Assume that floating point arithmetic operations cost twice as much as 906 // integer operations. 907 InstructionCost OpCost = (IsFloat ? 2 : 1); 908 909 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 910 // The operation is legal. Assume it costs 1. 911 // TODO: Once we have extract/insert subvector cost we need to use them. 912 return LT.first * OpCost; 913 } 914 915 if (!TLI->isOperationExpand(ISD, LT.second)) { 916 // If the operation is custom lowered, then assume that the code is twice 917 // as expensive. 918 return LT.first * 2 * OpCost; 919 } 920 921 // An 'Expand' of URem and SRem is special because it may default 922 // to expanding the operation into a sequence of sub-operations 923 // i.e. X % Y -> X-(X/Y)*Y. 924 if (ISD == ISD::UREM || ISD == ISD::SREM) { 925 bool IsSigned = ISD == ISD::SREM; 926 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, 927 LT.second) || 928 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV, 929 LT.second)) { 930 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; 931 InstructionCost DivCost = thisT()->getArithmeticInstrCost( 932 DivOpc, Ty, CostKind, Opd1Info, Opd2Info); 933 InstructionCost MulCost = 934 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); 935 InstructionCost SubCost = 936 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); 937 return DivCost + MulCost + SubCost; 938 } 939 } 940 941 // We cannot scalarize scalable vectors, so return Invalid. 942 if (isa<ScalableVectorType>(Ty)) 943 return InstructionCost::getInvalid(); 944 945 // Else, assume that we need to scalarize this op. 946 // TODO: If one of the types get legalized by splitting, handle this 947 // similarly to what getCastInstrCost() does. 
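    // Rough illustration (the numbers are assumptions, not target data): a
    // <4 x i32> udiv on a target with no vector divide is priced as
    // 4 * (scalar udiv cost) plus the insert/extract overhead reported by
    // getScalarizationOverhead() for the result lanes and operands.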
948 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 949 InstructionCost Cost = thisT()->getArithmeticInstrCost( 950 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, 951 Args, CxtI); 952 // Return the cost of multiple scalar invocation plus the cost of 953 // inserting and extracting the values. 954 SmallVector<Type *> Tys(Args.size(), Ty); 955 return getScalarizationOverhead(VTy, Args, Tys, CostKind) + 956 VTy->getNumElements() * Cost; 957 } 958 959 // We don't know anything about this scalar instruction. 960 return OpCost; 961 } 962 improveShuffleKindFromMask(TTI::ShuffleKind Kind,ArrayRef<int> Mask,VectorType * Ty,int & Index,VectorType * & SubTy)963 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, 964 ArrayRef<int> Mask, 965 VectorType *Ty, int &Index, 966 VectorType *&SubTy) const { 967 if (Mask.empty()) 968 return Kind; 969 int NumSrcElts = Ty->getElementCount().getKnownMinValue(); 970 switch (Kind) { 971 case TTI::SK_PermuteSingleSrc: 972 if (ShuffleVectorInst::isReverseMask(Mask, NumSrcElts)) 973 return TTI::SK_Reverse; 974 if (ShuffleVectorInst::isZeroEltSplatMask(Mask, NumSrcElts)) 975 return TTI::SK_Broadcast; 976 if (ShuffleVectorInst::isExtractSubvectorMask(Mask, NumSrcElts, Index) && 977 (Index + Mask.size()) <= (size_t)NumSrcElts) { 978 SubTy = FixedVectorType::get(Ty->getElementType(), Mask.size()); 979 return TTI::SK_ExtractSubvector; 980 } 981 break; 982 case TTI::SK_PermuteTwoSrc: { 983 int NumSubElts; 984 if (Mask.size() > 2 && ShuffleVectorInst::isInsertSubvectorMask( 985 Mask, NumSrcElts, NumSubElts, Index)) { 986 if (Index + NumSubElts > NumSrcElts) 987 return Kind; 988 SubTy = FixedVectorType::get(Ty->getElementType(), NumSubElts); 989 return TTI::SK_InsertSubvector; 990 } 991 if (ShuffleVectorInst::isSelectMask(Mask, NumSrcElts)) 992 return TTI::SK_Select; 993 if (ShuffleVectorInst::isTransposeMask(Mask, NumSrcElts)) 994 return TTI::SK_Transpose; 995 if (ShuffleVectorInst::isSpliceMask(Mask, NumSrcElts, Index)) 996 return TTI::SK_Splice; 997 break; 998 } 999 case TTI::SK_Select: 1000 case TTI::SK_Reverse: 1001 case TTI::SK_Broadcast: 1002 case TTI::SK_Transpose: 1003 case TTI::SK_InsertSubvector: 1004 case TTI::SK_ExtractSubvector: 1005 case TTI::SK_Splice: 1006 break; 1007 } 1008 return Kind; 1009 } 1010 1011 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 1012 ArrayRef<int> Mask, 1013 TTI::TargetCostKind CostKind, int Index, 1014 VectorType *SubTp, 1015 ArrayRef<const Value *> Args = std::nullopt) { 1016 switch (improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp)) { 1017 case TTI::SK_Broadcast: 1018 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 1019 return getBroadcastShuffleOverhead(FVT, CostKind); 1020 return InstructionCost::getInvalid(); 1021 case TTI::SK_Select: 1022 case TTI::SK_Splice: 1023 case TTI::SK_Reverse: 1024 case TTI::SK_Transpose: 1025 case TTI::SK_PermuteSingleSrc: 1026 case TTI::SK_PermuteTwoSrc: 1027 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 1028 return getPermuteShuffleOverhead(FVT, CostKind); 1029 return InstructionCost::getInvalid(); 1030 case TTI::SK_ExtractSubvector: 1031 return getExtractSubvectorOverhead(Tp, CostKind, Index, 1032 cast<FixedVectorType>(SubTp)); 1033 case TTI::SK_InsertSubvector: 1034 return getInsertSubvectorOverhead(Tp, CostKind, Index, 1035 cast<FixedVectorType>(SubTp)); 1036 } 1037 llvm_unreachable("Unknown TTI::ShuffleKind"); 1038 } 1039 1040 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, 1041 TTI::CastContextHint CCH, 1042 
TTI::TargetCostKind CostKind, 1043 const Instruction *I = nullptr) { 1044 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) 1045 return 0; 1046 1047 const TargetLoweringBase *TLI = getTLI(); 1048 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1049 assert(ISD && "Invalid opcode"); 1050 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src); 1051 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst); 1052 1053 TypeSize SrcSize = SrcLT.second.getSizeInBits(); 1054 TypeSize DstSize = DstLT.second.getSizeInBits(); 1055 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); 1056 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); 1057 1058 switch (Opcode) { 1059 default: 1060 break; 1061 case Instruction::Trunc: 1062 // Check for NOOP conversions. 1063 if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) 1064 return 0; 1065 [[fallthrough]]; 1066 case Instruction::BitCast: 1067 // Bitcast between types that are legalized to the same type are free and 1068 // assume int to/from ptr of the same size is also free. 1069 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && 1070 SrcSize == DstSize) 1071 return 0; 1072 break; 1073 case Instruction::FPExt: 1074 if (I && getTLI()->isExtFree(I)) 1075 return 0; 1076 break; 1077 case Instruction::ZExt: 1078 if (TLI->isZExtFree(SrcLT.second, DstLT.second)) 1079 return 0; 1080 [[fallthrough]]; 1081 case Instruction::SExt: 1082 if (I && getTLI()->isExtFree(I)) 1083 return 0; 1084 1085 // If this is a zext/sext of a load, return 0 if the corresponding 1086 // extending load exists on target and the result type is legal. 1087 if (CCH == TTI::CastContextHint::Normal) { 1088 EVT ExtVT = EVT::getEVT(Dst); 1089 EVT LoadVT = EVT::getEVT(Src); 1090 unsigned LType = 1091 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); 1092 if (DstLT.first == SrcLT.first && 1093 TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) 1094 return 0; 1095 } 1096 break; 1097 case Instruction::AddrSpaceCast: 1098 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), 1099 Dst->getPointerAddressSpace())) 1100 return 0; 1101 break; 1102 } 1103 1104 auto *SrcVTy = dyn_cast<VectorType>(Src); 1105 auto *DstVTy = dyn_cast<VectorType>(Dst); 1106 1107 // If the cast is marked as legal (or promote) then assume low cost. 1108 if (SrcLT.first == DstLT.first && 1109 TLI->isOperationLegalOrPromote(ISD, DstLT.second)) 1110 return SrcLT.first; 1111 1112 // Handle scalar conversions. 1113 if (!SrcVTy && !DstVTy) { 1114 // Just check the op cost. If the operation is legal then assume it costs 1115 // 1. 1116 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1117 return 1; 1118 1119 // Assume that illegal scalar instruction are expensive. 1120 return 4; 1121 } 1122 1123 // Check vector-to-vector casts. 1124 if (DstVTy && SrcVTy) { 1125 // If the cast is between same-sized registers, then the check is simple. 1126 if (SrcLT.first == DstLT.first && SrcSize == DstSize) { 1127 1128 // Assume that Zext is done using AND. 1129 if (Opcode == Instruction::ZExt) 1130 return SrcLT.first; 1131 1132 // Assume that sext is done using SHL and SRA. 1133 if (Opcode == Instruction::SExt) 1134 return SrcLT.first * 2; 1135 1136 // Just check the op cost. If the operation is legal then assume it 1137 // costs 1138 // 1 and multiply by the type-legalization overhead. 
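        // E.g. (illustrative only): for a sext of <4 x i16> to <4 x i32> on a
        // hypothetical target where <4 x i16> is promoted to <4 x i32>, both
        // legalized types have the same size, so the SHL+SRA model above
        // yields 2 * SrcLT.first, while a zext of the same shape, modeled as
        // an AND, yields SrcLT.first.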
1139 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1140 return SrcLT.first * 1; 1141 } 1142 1143 // If we are legalizing by splitting, query the concrete TTI for the cost 1144 // of casting the original vector twice. We also need to factor in the 1145 // cost of the split itself. Count that as 1, to be consistent with 1146 // getTypeLegalizationCost(). 1147 bool SplitSrc = 1148 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == 1149 TargetLowering::TypeSplitVector; 1150 bool SplitDst = 1151 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == 1152 TargetLowering::TypeSplitVector; 1153 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && 1154 DstVTy->getElementCount().isVector()) { 1155 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); 1156 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); 1157 T *TTI = static_cast<T *>(this); 1158 // If both types need to be split then the split is free. 1159 InstructionCost SplitCost = 1160 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; 1161 return SplitCost + 1162 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, 1163 CostKind, I)); 1164 } 1165 1166 // Scalarization cost is Invalid, can't assume any num elements. 1167 if (isa<ScalableVectorType>(DstVTy)) 1168 return InstructionCost::getInvalid(); 1169 1170 // In other cases where the source or destination are illegal, assume 1171 // the operation will get scalarized. 1172 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); 1173 InstructionCost Cost = thisT()->getCastInstrCost( 1174 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); 1175 1176 // Return the cost of multiple scalar invocation plus the cost of 1177 // inserting and extracting the values. 1178 return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true, 1179 CostKind) + 1180 Num * Cost; 1181 } 1182 1183 // We already handled vector-to-vector and scalar-to-scalar conversions. 1184 // This 1185 // is where we handle bitcast between vectors and scalars. We need to assume 1186 // that the conversion is scalarized in one way or another. 1187 if (Opcode == Instruction::BitCast) { 1188 // Illegal bitcasts are done by storing and loading from a stack slot. 1189 return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false, 1190 /*Extract*/ true, CostKind) 1191 : 0) + 1192 (DstVTy ? 
getScalarizationOverhead(DstVTy, /*Insert*/ true, 1193 /*Extract*/ false, CostKind) 1194 : 0); 1195 } 1196 1197 llvm_unreachable("Unhandled cast"); 1198 } 1199 getExtractWithExtendCost(unsigned Opcode,Type * Dst,VectorType * VecTy,unsigned Index)1200 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, 1201 VectorType *VecTy, unsigned Index) { 1202 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; 1203 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1204 CostKind, Index, nullptr, nullptr) + 1205 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), 1206 TTI::CastContextHint::None, CostKind); 1207 } 1208 1209 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, 1210 const Instruction *I = nullptr) { 1211 return BaseT::getCFInstrCost(Opcode, CostKind, I); 1212 } 1213 1214 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 1215 CmpInst::Predicate VecPred, 1216 TTI::TargetCostKind CostKind, 1217 const Instruction *I = nullptr) { 1218 const TargetLoweringBase *TLI = getTLI(); 1219 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1220 assert(ISD && "Invalid opcode"); 1221 1222 // TODO: Handle other cost kinds. 1223 if (CostKind != TTI::TCK_RecipThroughput) 1224 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1225 I); 1226 1227 // Selects on vectors are actually vector selects. 1228 if (ISD == ISD::SELECT) { 1229 assert(CondTy && "CondTy must exist"); 1230 if (CondTy->isVectorTy()) 1231 ISD = ISD::VSELECT; 1232 } 1233 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy); 1234 1235 if (!(ValTy->isVectorTy() && !LT.second.isVector()) && 1236 !TLI->isOperationExpand(ISD, LT.second)) { 1237 // The operation is legal. Assume it costs 1. Multiply 1238 // by the type-legalization overhead. 1239 return LT.first * 1; 1240 } 1241 1242 // Otherwise, assume that the cast is scalarized. 1243 // TODO: If one of the types get legalized by splitting, handle this 1244 // similarly to what getCastInstrCost() does. 1245 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { 1246 if (isa<ScalableVectorType>(ValTy)) 1247 return InstructionCost::getInvalid(); 1248 1249 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); 1250 if (CondTy) 1251 CondTy = CondTy->getScalarType(); 1252 InstructionCost Cost = thisT()->getCmpSelInstrCost( 1253 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); 1254 1255 // Return the cost of multiple scalar invocation plus the cost of 1256 // inserting and extracting the values. 1257 return getScalarizationOverhead(ValVTy, /*Insert*/ true, 1258 /*Extract*/ false, CostKind) + 1259 Num * Cost; 1260 } 1261 1262 // Unknown scalar opcode. 
1263 return 1; 1264 } 1265 getVectorInstrCost(unsigned Opcode,Type * Val,TTI::TargetCostKind CostKind,unsigned Index,Value * Op0,Value * Op1)1266 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, 1267 TTI::TargetCostKind CostKind, 1268 unsigned Index, Value *Op0, Value *Op1) { 1269 return getRegUsageForType(Val->getScalarType()); 1270 } 1271 getVectorInstrCost(const Instruction & I,Type * Val,TTI::TargetCostKind CostKind,unsigned Index)1272 InstructionCost getVectorInstrCost(const Instruction &I, Type *Val, 1273 TTI::TargetCostKind CostKind, 1274 unsigned Index) { 1275 Value *Op0 = nullptr; 1276 Value *Op1 = nullptr; 1277 if (auto *IE = dyn_cast<InsertElementInst>(&I)) { 1278 Op0 = IE->getOperand(0); 1279 Op1 = IE->getOperand(1); 1280 } 1281 return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0, 1282 Op1); 1283 } 1284 getReplicationShuffleCost(Type * EltTy,int ReplicationFactor,int VF,const APInt & DemandedDstElts,TTI::TargetCostKind CostKind)1285 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, 1286 int VF, 1287 const APInt &DemandedDstElts, 1288 TTI::TargetCostKind CostKind) { 1289 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && 1290 "Unexpected size of DemandedDstElts."); 1291 1292 InstructionCost Cost; 1293 1294 auto *SrcVT = FixedVectorType::get(EltTy, VF); 1295 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor); 1296 1297 // The Mask shuffling cost is extract all the elements of the Mask 1298 // and insert each of them Factor times into the wide vector: 1299 // 1300 // E.g. an interleaved group with factor 3: 1301 // %mask = icmp ult <8 x i32> %vec1, %vec2 1302 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, 1303 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> 1304 // The cost is estimated as extract all mask elements from the <8xi1> mask 1305 // vector and insert them factor times into the <24xi1> shuffled mask 1306 // vector. 1307 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF); 1308 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, 1309 /*Insert*/ false, 1310 /*Extract*/ true, CostKind); 1311 Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, 1312 /*Insert*/ true, 1313 /*Extract*/ false, CostKind); 1314 1315 return Cost; 1316 } 1317 1318 InstructionCost 1319 getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, 1320 unsigned AddressSpace, TTI::TargetCostKind CostKind, 1321 TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, 1322 const Instruction *I = nullptr) { 1323 assert(!Src->isVoidTy() && "Invalid type"); 1324 // Assume types, such as structs, are expensive. 1325 if (getTLI()->getValueType(DL, Src, true) == MVT::Other) 1326 return 4; 1327 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src); 1328 1329 // Assuming that all loads of legal types cost 1. 1330 InstructionCost Cost = LT.first; 1331 if (CostKind != TTI::TCK_RecipThroughput) 1332 return Cost; 1333 1334 const DataLayout &DL = this->getDataLayout(); 1335 if (Src->isVectorTy() && 1336 // In practice it's not currently possible to have a change in lane 1337 // length for extending loads or truncating stores so both types should 1338 // have the same scalable property. 1339 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src), 1340 LT.second.getSizeInBits())) { 1341 // This is a vector load that legalizes to a larger type than the vector 1342 // itself. 
Unless the corresponding extending load or truncating store is 1343 // legal, then this will scalarize. 1344 TargetLowering::LegalizeAction LA = TargetLowering::Expand; 1345 EVT MemVT = getTLI()->getValueType(DL, Src); 1346 if (Opcode == Instruction::Store) 1347 LA = getTLI()->getTruncStoreAction(LT.second, MemVT); 1348 else 1349 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); 1350 1351 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { 1352 // This is a vector load/store for some illegal type that is scalarized. 1353 // We must account for the cost of building or decomposing the vector. 1354 Cost += getScalarizationOverhead( 1355 cast<VectorType>(Src), Opcode != Instruction::Store, 1356 Opcode == Instruction::Store, CostKind); 1357 } 1358 } 1359 1360 return Cost; 1361 } 1362 getMaskedMemoryOpCost(unsigned Opcode,Type * DataTy,Align Alignment,unsigned AddressSpace,TTI::TargetCostKind CostKind)1363 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 1364 Align Alignment, unsigned AddressSpace, 1365 TTI::TargetCostKind CostKind) { 1366 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, 1367 CostKind); 1368 } 1369 1370 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, 1371 const Value *Ptr, bool VariableMask, 1372 Align Alignment, 1373 TTI::TargetCostKind CostKind, 1374 const Instruction *I = nullptr) { 1375 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, 1376 true, CostKind); 1377 } 1378 getStridedMemoryOpCost(unsigned Opcode,Type * DataTy,const Value * Ptr,bool VariableMask,Align Alignment,TTI::TargetCostKind CostKind,const Instruction * I)1379 InstructionCost getStridedMemoryOpCost(unsigned Opcode, Type *DataTy, 1380 const Value *Ptr, bool VariableMask, 1381 Align Alignment, 1382 TTI::TargetCostKind CostKind, 1383 const Instruction *I) { 1384 // For a target without strided memory operations (or for an illegal 1385 // operation type on one which does), assume we lower to a gather/scatter 1386 // operation. (Which may in turn be scalarized.) 1387 return thisT()->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask, 1388 Alignment, CostKind, I); 1389 } 1390 1391 InstructionCost getInterleavedMemoryOpCost( 1392 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1393 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1394 bool UseMaskForCond = false, bool UseMaskForGaps = false) { 1395 1396 // We cannot scalarize scalable vectors, so return Invalid. 1397 if (isa<ScalableVectorType>(VecTy)) 1398 return InstructionCost::getInvalid(); 1399 1400 auto *VT = cast<FixedVectorType>(VecTy); 1401 1402 unsigned NumElts = VT->getNumElements(); 1403 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); 1404 1405 unsigned NumSubElts = NumElts / Factor; 1406 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); 1407 1408 // Firstly, the cost of load/store operation. 1409 InstructionCost Cost; 1410 if (UseMaskForCond || UseMaskForGaps) 1411 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, 1412 AddressSpace, CostKind); 1413 else 1414 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, 1415 CostKind); 1416 1417 // Legalize the vector type, and get the legalized and unlegalized type 1418 // sizes. 
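    // (Illustration, assuming a target whose widest legal vector register is
    // 128 bits: for the factor-8 <16 x i64> example below, VecTySize is 128
    // bytes and VecTyLTSize is 16 bytes, so the load legalizes to 8 pieces
    // and the scaling below keeps only the pieces whose lanes are used.)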
1419 MVT VecTyLT = getTypeLegalizationCost(VecTy).second; 1420 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); 1421 unsigned VecTyLTSize = VecTyLT.getStoreSize(); 1422 1423 // Scale the cost of the memory operation by the fraction of legalized 1424 // instructions that will actually be used. We shouldn't account for the 1425 // cost of dead instructions since they will be removed. 1426 // 1427 // E.g., An interleaved load of factor 8: 1428 // %vec = load <16 x i64>, <16 x i64>* %ptr 1429 // %v0 = shufflevector %vec, undef, <0, 8> 1430 // 1431 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be 1432 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized 1433 // type). The other loads are unused. 1434 // 1435 // TODO: Note that legalization can turn masked loads/stores into unmasked 1436 // (legalized) loads/stores. This can be reflected in the cost. 1437 if (Cost.isValid() && VecTySize > VecTyLTSize) { 1438 // The number of loads of a legal type it will take to represent a load 1439 // of the unlegalized vector type. 1440 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); 1441 1442 // The number of elements of the unlegalized type that correspond to a 1443 // single legal instruction. 1444 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts); 1445 1446 // Determine which legal instructions will be used. 1447 BitVector UsedInsts(NumLegalInsts, false); 1448 for (unsigned Index : Indices) 1449 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) 1450 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); 1451 1452 // Scale the cost of the load by the fraction of legal instructions that 1453 // will be used. 1454 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts); 1455 } 1456 1457 // Then plus the cost of interleave operation. 1458 assert(Indices.size() <= Factor && 1459 "Interleaved memory op has too many members"); 1460 1461 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts); 1462 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts); 1463 1464 APInt DemandedLoadStoreElts = APInt::getZero(NumElts); 1465 for (unsigned Index : Indices) { 1466 assert(Index < Factor && "Invalid index for interleaved memory op"); 1467 for (unsigned Elm = 0; Elm < NumSubElts; Elm++) 1468 DemandedLoadStoreElts.setBit(Index + Elm * Factor); 1469 } 1470 1471 if (Opcode == Instruction::Load) { 1472 // The interleave cost is similar to extract sub vectors' elements 1473 // from the wide vector, and insert them into sub vectors. 1474 // 1475 // E.g. An interleaved load of factor 2 (with one member of index 0): 1476 // %vec = load <8 x i32>, <8 x i32>* %ptr 1477 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 1478 // The cost is estimated as extract elements at 0, 2, 4, 6 from the 1479 // <8 x i32> vector and insert them into a <4 x i32> vector. 1480 InstructionCost InsSubCost = thisT()->getScalarizationOverhead( 1481 SubVT, DemandedAllSubElts, 1482 /*Insert*/ true, /*Extract*/ false, CostKind); 1483 Cost += Indices.size() * InsSubCost; 1484 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1485 /*Insert*/ false, 1486 /*Extract*/ true, CostKind); 1487 } else { 1488 // The interleave cost is extract elements from sub vectors, and 1489 // insert them into the wide vector. 1490 // 1491 // E.g. 
An interleaved store of factor 3 with 2 members at indices 0,1: 1492 // (using VF=4): 1493 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> 1494 // %gaps.mask = <true, true, false, true, true, false, 1495 // true, true, false, true, true, false> 1496 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, 1497 // i32 Align, <12 x i1> %gaps.mask 1498 // The cost is estimated as extract all elements (of actual members, 1499 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x 1500 // i32> vector. 1501 InstructionCost ExtSubCost = thisT()->getScalarizationOverhead( 1502 SubVT, DemandedAllSubElts, 1503 /*Insert*/ false, /*Extract*/ true, CostKind); 1504 Cost += ExtSubCost * Indices.size(); 1505 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1506 /*Insert*/ true, 1507 /*Extract*/ false, CostKind); 1508 } 1509 1510 if (!UseMaskForCond) 1511 return Cost; 1512 1513 Type *I8Type = Type::getInt8Ty(VT->getContext()); 1514 1515 Cost += thisT()->getReplicationShuffleCost( 1516 I8Type, Factor, NumSubElts, 1517 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, 1518 CostKind); 1519 1520 // The Gaps mask is invariant and created outside the loop, therefore the 1521 // cost of creating it is not accounted for here. However if we have both 1522 // a MaskForGaps and some other mask that guards the execution of the 1523 // memory access, we need to account for the cost of And-ing the two masks 1524 // inside the loop. 1525 if (UseMaskForGaps) { 1526 auto *MaskVT = FixedVectorType::get(I8Type, NumElts); 1527 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, 1528 CostKind); 1529 } 1530 1531 return Cost; 1532 } 1533 1534 /// Get intrinsic cost based on arguments. getIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1535 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1536 TTI::TargetCostKind CostKind) { 1537 // Check for generically free intrinsics. 1538 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) 1539 return 0; 1540 1541 // Assume that target intrinsics are cheap. 1542 Intrinsic::ID IID = ICA.getID(); 1543 if (Function::isTargetIntrinsic(IID)) 1544 return TargetTransformInfo::TCC_Basic; 1545 1546 if (ICA.isTypeBasedOnly()) 1547 return getTypeBasedIntrinsicInstrCost(ICA, CostKind); 1548 1549 Type *RetTy = ICA.getReturnType(); 1550 1551 ElementCount RetVF = 1552 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() 1553 : ElementCount::getFixed(1)); 1554 const IntrinsicInst *I = ICA.getInst(); 1555 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 1556 FastMathFlags FMF = ICA.getFlags(); 1557 switch (IID) { 1558 default: 1559 break; 1560 1561 case Intrinsic::powi: 1562 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) { 1563 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); 1564 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), 1565 ShouldOptForSize)) { 1566 // The cost is modeled on the expansion performed by ExpandPowI in 1567 // SelectionDAGBuilder. 
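          // For example, powi(x, 11): |11| = 0b1011, so ActiveBits is 4 and
          // PopCount is 3, giving (4 + 3 - 2) = 5 fmuls for the
          // square-and-multiply chain (3 squarings plus 2 extra products);
          // a negative exponent adds one trailing fdiv for the reciprocal.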
1568 APInt Exponent = RHSC->getValue().abs(); 1569 unsigned ActiveBits = Exponent.getActiveBits(); 1570 unsigned PopCount = Exponent.popcount(); 1571 InstructionCost Cost = (ActiveBits + PopCount - 2) * 1572 thisT()->getArithmeticInstrCost( 1573 Instruction::FMul, RetTy, CostKind); 1574 if (RHSC->isNegative()) 1575 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, 1576 CostKind); 1577 return Cost; 1578 } 1579 } 1580 break; 1581 case Intrinsic::cttz: 1582 // FIXME: If necessary, this should go in target-specific overrides. 1583 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy)) 1584 return TargetTransformInfo::TCC_Basic; 1585 break; 1586 1587 case Intrinsic::ctlz: 1588 // FIXME: If necessary, this should go in target-specific overrides. 1589 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy)) 1590 return TargetTransformInfo::TCC_Basic; 1591 break; 1592 1593 case Intrinsic::memcpy: 1594 return thisT()->getMemcpyCost(ICA.getInst()); 1595 1596 case Intrinsic::masked_scatter: { 1597 const Value *Mask = Args[3]; 1598 bool VarMask = !isa<Constant>(Mask); 1599 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); 1600 return thisT()->getGatherScatterOpCost(Instruction::Store, 1601 ICA.getArgTypes()[0], Args[1], 1602 VarMask, Alignment, CostKind, I); 1603 } 1604 case Intrinsic::masked_gather: { 1605 const Value *Mask = Args[2]; 1606 bool VarMask = !isa<Constant>(Mask); 1607 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); 1608 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], 1609 VarMask, Alignment, CostKind, I); 1610 } 1611 case Intrinsic::experimental_vp_strided_store: { 1612 const Value *Data = Args[0]; 1613 const Value *Ptr = Args[1]; 1614 const Value *Mask = Args[3]; 1615 const Value *EVL = Args[4]; 1616 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL); 1617 Align Alignment = I->getParamAlign(1).valueOrOne(); 1618 return thisT()->getStridedMemoryOpCost(Instruction::Store, 1619 Data->getType(), Ptr, VarMask, 1620 Alignment, CostKind, I); 1621 } 1622 case Intrinsic::experimental_vp_strided_load: { 1623 const Value *Ptr = Args[0]; 1624 const Value *Mask = Args[2]; 1625 const Value *EVL = Args[3]; 1626 bool VarMask = !isa<Constant>(Mask) || !isa<Constant>(EVL); 1627 Align Alignment = I->getParamAlign(0).valueOrOne(); 1628 return thisT()->getStridedMemoryOpCost(Instruction::Load, RetTy, Ptr, 1629 VarMask, Alignment, CostKind, I); 1630 } 1631 case Intrinsic::experimental_stepvector: { 1632 if (isa<ScalableVectorType>(RetTy)) 1633 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1634 // The cost of materialising a constant integer vector. 
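      // For a fixed-length result such as <4 x i32>, the step vector is just
      // the constant <0, 1, 2, 3>, so assume it costs TCC_Basic.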
1635 return TargetTransformInfo::TCC_Basic; 1636 } 1637 case Intrinsic::vector_extract: { 1638 // FIXME: Handle case where a scalable vector is extracted from a scalable 1639 // vector 1640 if (isa<ScalableVectorType>(RetTy)) 1641 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1642 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); 1643 return thisT()->getShuffleCost( 1644 TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()), 1645 std::nullopt, CostKind, Index, cast<VectorType>(RetTy)); 1646 } 1647 case Intrinsic::vector_insert: { 1648 // FIXME: Handle case where a scalable vector is inserted into a scalable 1649 // vector 1650 if (isa<ScalableVectorType>(Args[1]->getType())) 1651 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1652 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1653 return thisT()->getShuffleCost( 1654 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), 1655 std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType())); 1656 } 1657 case Intrinsic::experimental_vector_reverse: { 1658 return thisT()->getShuffleCost( 1659 TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt, 1660 CostKind, 0, cast<VectorType>(RetTy)); 1661 } 1662 case Intrinsic::experimental_vector_splice: { 1663 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1664 return thisT()->getShuffleCost( 1665 TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt, 1666 CostKind, Index, cast<VectorType>(RetTy)); 1667 } 1668 case Intrinsic::vector_reduce_add: 1669 case Intrinsic::vector_reduce_mul: 1670 case Intrinsic::vector_reduce_and: 1671 case Intrinsic::vector_reduce_or: 1672 case Intrinsic::vector_reduce_xor: 1673 case Intrinsic::vector_reduce_smax: 1674 case Intrinsic::vector_reduce_smin: 1675 case Intrinsic::vector_reduce_fmax: 1676 case Intrinsic::vector_reduce_fmin: 1677 case Intrinsic::vector_reduce_fmaximum: 1678 case Intrinsic::vector_reduce_fminimum: 1679 case Intrinsic::vector_reduce_umax: 1680 case Intrinsic::vector_reduce_umin: { 1681 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); 1682 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1683 } 1684 case Intrinsic::vector_reduce_fadd: 1685 case Intrinsic::vector_reduce_fmul: { 1686 IntrinsicCostAttributes Attrs( 1687 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); 1688 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1689 } 1690 case Intrinsic::fshl: 1691 case Intrinsic::fshr: { 1692 const Value *X = Args[0]; 1693 const Value *Y = Args[1]; 1694 const Value *Z = Args[2]; 1695 const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X); 1696 const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y); 1697 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z); 1698 const TTI::OperandValueInfo OpInfoBW = 1699 {TTI::OK_UniformConstantValue, 1700 isPowerOf2_32(RetTy->getScalarSizeInBits()) ? 
TTI::OP_PowerOf2 1701 : TTI::OP_None}; 1702 1703 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) 1704 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 1705 InstructionCost Cost = 0; 1706 Cost += 1707 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); 1708 Cost += 1709 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); 1710 Cost += thisT()->getArithmeticInstrCost( 1711 BinaryOperator::Shl, RetTy, CostKind, OpInfoX, 1712 {OpInfoZ.Kind, TTI::OP_None}); 1713 Cost += thisT()->getArithmeticInstrCost( 1714 BinaryOperator::LShr, RetTy, CostKind, OpInfoY, 1715 {OpInfoZ.Kind, TTI::OP_None}); 1716 // Non-constant shift amounts requires a modulo. 1717 if (!OpInfoZ.isConstant()) 1718 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, 1719 CostKind, OpInfoZ, OpInfoBW); 1720 // For non-rotates (X != Y) we must add shift-by-zero handling costs. 1721 if (X != Y) { 1722 Type *CondTy = RetTy->getWithNewBitWidth(1); 1723 Cost += 1724 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1725 CmpInst::ICMP_EQ, CostKind); 1726 Cost += 1727 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1728 CmpInst::ICMP_EQ, CostKind); 1729 } 1730 return Cost; 1731 } 1732 case Intrinsic::get_active_lane_mask: { 1733 EVT ResVT = getTLI()->getValueType(DL, RetTy, true); 1734 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); 1735 1736 // If we're not expanding the intrinsic then we assume this is cheap 1737 // to implement. 1738 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { 1739 return getTypeLegalizationCost(RetTy).first; 1740 } 1741 1742 // Create the expanded types that will be used to calculate the uadd_sat 1743 // operation. 1744 Type *ExpRetTy = VectorType::get( 1745 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount()); 1746 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); 1747 InstructionCost Cost = 1748 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1749 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, 1750 CmpInst::ICMP_ULT, CostKind); 1751 return Cost; 1752 } 1753 } 1754 1755 // VP Intrinsics should have the same cost as their non-vp counterpart. 1756 // TODO: Adjust the cost to make the vp intrinsic cheaper than its non-vp 1757 // counterpart when the vector length argument is smaller than the maximum 1758 // vector length. 
    // TODO: Support other kinds of VPIntrinsics
    if (VPIntrinsic::isVPIntrinsic(ICA.getID())) {
      std::optional<unsigned> FOp =
          VPIntrinsic::getFunctionalOpcodeForVP(ICA.getID());
      if (FOp) {
        if (ICA.getID() == Intrinsic::vp_load) {
          Align Alignment;
          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
            Alignment = VPI->getPointerAlignment().valueOrOne();
          unsigned AS = 0;
          if (ICA.getArgs().size() > 1)
            if (auto *PtrTy =
                    dyn_cast<PointerType>(ICA.getArgs()[0]->getType()))
              AS = PtrTy->getAddressSpace();
          return thisT()->getMemoryOpCost(*FOp, ICA.getReturnType(), Alignment,
                                          AS, CostKind);
        }
        if (ICA.getID() == Intrinsic::vp_store) {
          Align Alignment;
          if (auto *VPI = dyn_cast_or_null<VPIntrinsic>(ICA.getInst()))
            Alignment = VPI->getPointerAlignment().valueOrOne();
          unsigned AS = 0;
          if (ICA.getArgs().size() >= 2)
            if (auto *PtrTy =
                    dyn_cast<PointerType>(ICA.getArgs()[1]->getType()))
              AS = PtrTy->getAddressSpace();
          return thisT()->getMemoryOpCost(*FOp, Args[0]->getType(), Alignment,
                                          AS, CostKind);
        }
        if (VPBinOpIntrinsic::isVPBinOp(ICA.getID())) {
          return thisT()->getArithmeticInstrCost(*FOp, ICA.getReturnType(),
                                                 CostKind);
        }
      }

      std::optional<Intrinsic::ID> FID =
          VPIntrinsic::getFunctionalIntrinsicIDForVP(ICA.getID());
      if (FID) {
        // The non-vp version will have the same Args/Tys except for the mask
        // and vector length.
        assert(ICA.getArgs().size() >= 2 && ICA.getArgTypes().size() >= 2 &&
               "Expected VPIntrinsic to have Mask and Vector Length args and "
               "types");
        ArrayRef<Type *> NewTys = ArrayRef(ICA.getArgTypes()).drop_back(2);

        // VPReduction intrinsics have a start value argument that their non-vp
        // counterparts do not have, except for the fadd and fmul non-vp
        // counterparts.
        if (VPReductionIntrinsic::isVPReduction(ICA.getID()) &&
            *FID != Intrinsic::vector_reduce_fadd &&
            *FID != Intrinsic::vector_reduce_fmul)
          NewTys = NewTys.drop_front();

        IntrinsicCostAttributes NewICA(*FID, ICA.getReturnType(), NewTys,
                                       ICA.getFlags());
        return thisT()->getIntrinsicInstrCost(NewICA, CostKind);
      }
    }

    // Assume that we need to scalarize this intrinsic. Compute the
    // scalarization overhead based on Args for a vector intrinsic.
    InstructionCost ScalarizationCost = InstructionCost::getInvalid();
    if (RetVF.isVector() && !RetVF.isScalable()) {
      ScalarizationCost = 0;
      if (!RetTy->isVoidTy())
        ScalarizationCost += getScalarizationOverhead(
            cast<VectorType>(RetTy),
            /*Insert*/ true, /*Extract*/ false, CostKind);
      ScalarizationCost +=
          getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
    }

    IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
                                  ScalarizationCost);
    return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
  }

  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
  /// cost of scalarizing the arguments and the return value will be computed
  /// based on types.
1840 InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1841 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1842 TTI::TargetCostKind CostKind) { 1843 Intrinsic::ID IID = ICA.getID(); 1844 Type *RetTy = ICA.getReturnType(); 1845 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); 1846 FastMathFlags FMF = ICA.getFlags(); 1847 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); 1848 bool SkipScalarizationCost = ICA.skipScalarizationCost(); 1849 1850 VectorType *VecOpTy = nullptr; 1851 if (!Tys.empty()) { 1852 // The vector reduction operand is operand 0 except for fadd/fmul. 1853 // Their operand 0 is a scalar start value, so the vector op is operand 1. 1854 unsigned VecTyIndex = 0; 1855 if (IID == Intrinsic::vector_reduce_fadd || 1856 IID == Intrinsic::vector_reduce_fmul) 1857 VecTyIndex = 1; 1858 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes"); 1859 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); 1860 } 1861 1862 // Library call cost - other than size, make it expensive. 1863 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; 1864 unsigned ISD = 0; 1865 switch (IID) { 1866 default: { 1867 // Scalable vectors cannot be scalarized, so return Invalid. 1868 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 1869 return isa<ScalableVectorType>(Ty); 1870 })) 1871 return InstructionCost::getInvalid(); 1872 1873 // Assume that we need to scalarize this intrinsic. 1874 InstructionCost ScalarizationCost = 1875 SkipScalarizationCost ? ScalarizationCostPassed : 0; 1876 unsigned ScalarCalls = 1; 1877 Type *ScalarRetTy = RetTy; 1878 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 1879 if (!SkipScalarizationCost) 1880 ScalarizationCost = getScalarizationOverhead( 1881 RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind); 1882 ScalarCalls = std::max(ScalarCalls, 1883 cast<FixedVectorType>(RetVTy)->getNumElements()); 1884 ScalarRetTy = RetTy->getScalarType(); 1885 } 1886 SmallVector<Type *, 4> ScalarTys; 1887 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1888 Type *Ty = Tys[i]; 1889 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 1890 if (!SkipScalarizationCost) 1891 ScalarizationCost += getScalarizationOverhead( 1892 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 1893 ScalarCalls = std::max(ScalarCalls, 1894 cast<FixedVectorType>(VTy)->getNumElements()); 1895 Ty = Ty->getScalarType(); 1896 } 1897 ScalarTys.push_back(Ty); 1898 } 1899 if (ScalarCalls == 1) 1900 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. 1901 1902 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); 1903 InstructionCost ScalarCost = 1904 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); 1905 1906 return ScalarCalls * ScalarCost + ScalarizationCost; 1907 } 1908 // Look for intrinsics that can be lowered directly or turned into a scalar 1909 // intrinsic call. 
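    // Each case below records the ISD opcode the intrinsic maps to; after the
    // switch, the legality of that opcode for the legalized return type
    // decides between a cheap per-register estimate and the expensive
    // scalarization / library-call fallback.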
1910 case Intrinsic::sqrt: 1911 ISD = ISD::FSQRT; 1912 break; 1913 case Intrinsic::sin: 1914 ISD = ISD::FSIN; 1915 break; 1916 case Intrinsic::cos: 1917 ISD = ISD::FCOS; 1918 break; 1919 case Intrinsic::exp: 1920 ISD = ISD::FEXP; 1921 break; 1922 case Intrinsic::exp2: 1923 ISD = ISD::FEXP2; 1924 break; 1925 case Intrinsic::exp10: 1926 ISD = ISD::FEXP10; 1927 break; 1928 case Intrinsic::log: 1929 ISD = ISD::FLOG; 1930 break; 1931 case Intrinsic::log10: 1932 ISD = ISD::FLOG10; 1933 break; 1934 case Intrinsic::log2: 1935 ISD = ISD::FLOG2; 1936 break; 1937 case Intrinsic::fabs: 1938 ISD = ISD::FABS; 1939 break; 1940 case Intrinsic::canonicalize: 1941 ISD = ISD::FCANONICALIZE; 1942 break; 1943 case Intrinsic::minnum: 1944 ISD = ISD::FMINNUM; 1945 break; 1946 case Intrinsic::maxnum: 1947 ISD = ISD::FMAXNUM; 1948 break; 1949 case Intrinsic::minimum: 1950 ISD = ISD::FMINIMUM; 1951 break; 1952 case Intrinsic::maximum: 1953 ISD = ISD::FMAXIMUM; 1954 break; 1955 case Intrinsic::copysign: 1956 ISD = ISD::FCOPYSIGN; 1957 break; 1958 case Intrinsic::floor: 1959 ISD = ISD::FFLOOR; 1960 break; 1961 case Intrinsic::ceil: 1962 ISD = ISD::FCEIL; 1963 break; 1964 case Intrinsic::trunc: 1965 ISD = ISD::FTRUNC; 1966 break; 1967 case Intrinsic::nearbyint: 1968 ISD = ISD::FNEARBYINT; 1969 break; 1970 case Intrinsic::rint: 1971 ISD = ISD::FRINT; 1972 break; 1973 case Intrinsic::lrint: 1974 ISD = ISD::LRINT; 1975 break; 1976 case Intrinsic::llrint: 1977 ISD = ISD::LLRINT; 1978 break; 1979 case Intrinsic::round: 1980 ISD = ISD::FROUND; 1981 break; 1982 case Intrinsic::roundeven: 1983 ISD = ISD::FROUNDEVEN; 1984 break; 1985 case Intrinsic::pow: 1986 ISD = ISD::FPOW; 1987 break; 1988 case Intrinsic::fma: 1989 ISD = ISD::FMA; 1990 break; 1991 case Intrinsic::fmuladd: 1992 ISD = ISD::FMA; 1993 break; 1994 case Intrinsic::experimental_constrained_fmuladd: 1995 ISD = ISD::STRICT_FMA; 1996 break; 1997 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 
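    // These intrinsics only convey information to the optimizer and are
    // expected to lower to no machine instructions, so model them as free.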
1998 case Intrinsic::lifetime_start: 1999 case Intrinsic::lifetime_end: 2000 case Intrinsic::sideeffect: 2001 case Intrinsic::pseudoprobe: 2002 case Intrinsic::arithmetic_fence: 2003 return 0; 2004 case Intrinsic::masked_store: { 2005 Type *Ty = Tys[0]; 2006 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 2007 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, 2008 CostKind); 2009 } 2010 case Intrinsic::masked_load: { 2011 Type *Ty = RetTy; 2012 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 2013 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, 2014 CostKind); 2015 } 2016 case Intrinsic::vector_reduce_add: 2017 case Intrinsic::vector_reduce_mul: 2018 case Intrinsic::vector_reduce_and: 2019 case Intrinsic::vector_reduce_or: 2020 case Intrinsic::vector_reduce_xor: 2021 return thisT()->getArithmeticReductionCost( 2022 getArithmeticReductionInstruction(IID), VecOpTy, std::nullopt, 2023 CostKind); 2024 case Intrinsic::vector_reduce_fadd: 2025 case Intrinsic::vector_reduce_fmul: 2026 return thisT()->getArithmeticReductionCost( 2027 getArithmeticReductionInstruction(IID), VecOpTy, FMF, CostKind); 2028 case Intrinsic::vector_reduce_smax: 2029 case Intrinsic::vector_reduce_smin: 2030 case Intrinsic::vector_reduce_umax: 2031 case Intrinsic::vector_reduce_umin: 2032 case Intrinsic::vector_reduce_fmax: 2033 case Intrinsic::vector_reduce_fmin: 2034 case Intrinsic::vector_reduce_fmaximum: 2035 case Intrinsic::vector_reduce_fminimum: 2036 return thisT()->getMinMaxReductionCost(getMinMaxReductionIntrinsicOp(IID), 2037 VecOpTy, ICA.getFlags(), CostKind); 2038 case Intrinsic::abs: { 2039 // abs(X) = select(icmp(X,0),X,sub(0,X)) 2040 Type *CondTy = RetTy->getWithNewBitWidth(1); 2041 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2042 InstructionCost Cost = 0; 2043 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2044 Pred, CostKind); 2045 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2046 Pred, CostKind); 2047 // TODO: Should we add an OperandValueProperties::OP_Zero property? 2048 Cost += thisT()->getArithmeticInstrCost( 2049 BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None}); 2050 return Cost; 2051 } 2052 case Intrinsic::smax: 2053 case Intrinsic::smin: 2054 case Intrinsic::umax: 2055 case Intrinsic::umin: { 2056 // minmax(X,Y) = select(icmp(X,Y),X,Y) 2057 Type *CondTy = RetTy->getWithNewBitWidth(1); 2058 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; 2059 CmpInst::Predicate Pred = 2060 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; 2061 InstructionCost Cost = 0; 2062 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2063 Pred, CostKind); 2064 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2065 Pred, CostKind); 2066 return Cost; 2067 } 2068 case Intrinsic::sadd_sat: 2069 case Intrinsic::ssub_sat: { 2070 Type *CondTy = RetTy->getWithNewBitWidth(1); 2071 2072 Type *OpTy = StructType::create({RetTy, CondTy}); 2073 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat 2074 ? 
Intrinsic::sadd_with_overflow 2075 : Intrinsic::ssub_with_overflow; 2076 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 2077 2078 // SatMax -> Overflow && SumDiff < 0 2079 // SatMin -> Overflow && SumDiff >= 0 2080 InstructionCost Cost = 0; 2081 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2082 nullptr, ScalarizationCostPassed); 2083 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2084 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 2085 Pred, CostKind); 2086 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, 2087 CondTy, Pred, CostKind); 2088 return Cost; 2089 } 2090 case Intrinsic::uadd_sat: 2091 case Intrinsic::usub_sat: { 2092 Type *CondTy = RetTy->getWithNewBitWidth(1); 2093 2094 Type *OpTy = StructType::create({RetTy, CondTy}); 2095 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat 2096 ? Intrinsic::uadd_with_overflow 2097 : Intrinsic::usub_with_overflow; 2098 2099 InstructionCost Cost = 0; 2100 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 2101 nullptr, ScalarizationCostPassed); 2102 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2103 Cost += 2104 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 2105 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2106 return Cost; 2107 } 2108 case Intrinsic::smul_fix: 2109 case Intrinsic::umul_fix: { 2110 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; 2111 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); 2112 2113 unsigned ExtOp = 2114 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; 2115 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2116 2117 InstructionCost Cost = 0; 2118 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); 2119 Cost += 2120 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2121 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, 2122 CCH, CostKind); 2123 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, 2124 CostKind, 2125 {TTI::OK_AnyValue, TTI::OP_None}, 2126 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2127 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, 2128 {TTI::OK_AnyValue, TTI::OP_None}, 2129 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2130 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); 2131 return Cost; 2132 } 2133 case Intrinsic::sadd_with_overflow: 2134 case Intrinsic::ssub_with_overflow: { 2135 Type *SumTy = RetTy->getContainedType(0); 2136 Type *OverflowTy = RetTy->getContainedType(1); 2137 unsigned Opcode = IID == Intrinsic::sadd_with_overflow 2138 ? BinaryOperator::Add 2139 : BinaryOperator::Sub; 2140 2141 // Add: 2142 // Overflow -> (Result < LHS) ^ (RHS < 0) 2143 // Sub: 2144 // Overflow -> (Result < LHS) ^ (RHS > 0) 2145 InstructionCost Cost = 0; 2146 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2147 Cost += 2 * thisT()->getCmpSelInstrCost( 2148 Instruction::ICmp, SumTy, OverflowTy, 2149 CmpInst::ICMP_SGT, CostKind); 2150 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, 2151 CostKind); 2152 return Cost; 2153 } 2154 case Intrinsic::uadd_with_overflow: 2155 case Intrinsic::usub_with_overflow: { 2156 Type *SumTy = RetTy->getContainedType(0); 2157 Type *OverflowTy = RetTy->getContainedType(1); 2158 unsigned Opcode = IID == Intrinsic::uadd_with_overflow 2159 ? BinaryOperator::Add 2160 : BinaryOperator::Sub; 2161 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow 2162 ? 
CmpInst::ICMP_ULT 2163 : CmpInst::ICMP_UGT; 2164 2165 InstructionCost Cost = 0; 2166 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 2167 Cost += 2168 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, 2169 Pred, CostKind); 2170 return Cost; 2171 } 2172 case Intrinsic::smul_with_overflow: 2173 case Intrinsic::umul_with_overflow: { 2174 Type *MulTy = RetTy->getContainedType(0); 2175 Type *OverflowTy = RetTy->getContainedType(1); 2176 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; 2177 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); 2178 bool IsSigned = IID == Intrinsic::smul_with_overflow; 2179 2180 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; 2181 TTI::CastContextHint CCH = TTI::CastContextHint::None; 2182 2183 InstructionCost Cost = 0; 2184 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); 2185 Cost += 2186 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2187 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, 2188 CCH, CostKind); 2189 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, 2190 CostKind, 2191 {TTI::OK_AnyValue, TTI::OP_None}, 2192 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2193 2194 if (IsSigned) 2195 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, 2196 CostKind, 2197 {TTI::OK_AnyValue, TTI::OP_None}, 2198 {TTI::OK_UniformConstantValue, TTI::OP_None}); 2199 2200 Cost += thisT()->getCmpSelInstrCost( 2201 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); 2202 return Cost; 2203 } 2204 case Intrinsic::fptosi_sat: 2205 case Intrinsic::fptoui_sat: { 2206 if (Tys.empty()) 2207 break; 2208 Type *FromTy = Tys[0]; 2209 bool IsSigned = IID == Intrinsic::fptosi_sat; 2210 2211 InstructionCost Cost = 0; 2212 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, 2213 {FromTy, FromTy}); 2214 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); 2215 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, 2216 {FromTy, FromTy}); 2217 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); 2218 Cost += thisT()->getCastInstrCost( 2219 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, 2220 TTI::CastContextHint::None, CostKind); 2221 if (IsSigned) { 2222 Type *CondTy = RetTy->getWithNewBitWidth(1); 2223 Cost += thisT()->getCmpSelInstrCost( 2224 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2225 Cost += thisT()->getCmpSelInstrCost( 2226 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2227 } 2228 return Cost; 2229 } 2230 case Intrinsic::ctpop: 2231 ISD = ISD::CTPOP; 2232 // In case of legalization use TCC_Expensive. This is cheaper than a 2233 // library call but still not a cheap instruction. 2234 SingleCallCost = TargetTransformInfo::TCC_Expensive; 2235 break; 2236 case Intrinsic::ctlz: 2237 ISD = ISD::CTLZ; 2238 break; 2239 case Intrinsic::cttz: 2240 ISD = ISD::CTTZ; 2241 break; 2242 case Intrinsic::bswap: 2243 ISD = ISD::BSWAP; 2244 break; 2245 case Intrinsic::bitreverse: 2246 ISD = ISD::BITREVERSE; 2247 break; 2248 } 2249 2250 const TargetLoweringBase *TLI = getTLI(); 2251 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy); 2252 2253 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 2254 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && 2255 TLI->isFAbsFree(LT.second)) { 2256 return 0; 2257 } 2258 2259 // The operation is legal. Assume it costs 1. 
2260 // If the type is split to multiple registers, assume that there is some 2261 // overhead to this. 2262 // TODO: Once we have extract/insert subvector cost we need to use them. 2263 if (LT.first > 1) 2264 return (LT.first * 2); 2265 else 2266 return (LT.first * 1); 2267 } else if (!TLI->isOperationExpand(ISD, LT.second)) { 2268 // If the operation is custom lowered then assume 2269 // that the code is twice as expensive. 2270 return (LT.first * 2); 2271 } 2272 2273 // If we can't lower fmuladd into an FMA estimate the cost as a floating 2274 // point mul followed by an add. 2275 if (IID == Intrinsic::fmuladd) 2276 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, 2277 CostKind) + 2278 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, 2279 CostKind); 2280 if (IID == Intrinsic::experimental_constrained_fmuladd) { 2281 IntrinsicCostAttributes FMulAttrs( 2282 Intrinsic::experimental_constrained_fmul, RetTy, Tys); 2283 IntrinsicCostAttributes FAddAttrs( 2284 Intrinsic::experimental_constrained_fadd, RetTy, Tys); 2285 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + 2286 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); 2287 } 2288 2289 // Else, assume that we need to scalarize this intrinsic. For math builtins 2290 // this will emit a costly libcall, adding call overhead and spills. Make it 2291 // very expensive. 2292 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 2293 // Scalable vectors cannot be scalarized, so return Invalid. 2294 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 2295 return isa<ScalableVectorType>(Ty); 2296 })) 2297 return InstructionCost::getInvalid(); 2298 2299 InstructionCost ScalarizationCost = 2300 SkipScalarizationCost 2301 ? ScalarizationCostPassed 2302 : getScalarizationOverhead(RetVTy, /*Insert*/ true, 2303 /*Extract*/ false, CostKind); 2304 2305 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); 2306 SmallVector<Type *, 4> ScalarTys; 2307 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 2308 Type *Ty = Tys[i]; 2309 if (Ty->isVectorTy()) 2310 Ty = Ty->getScalarType(); 2311 ScalarTys.push_back(Ty); 2312 } 2313 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); 2314 InstructionCost ScalarCost = 2315 thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2316 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 2317 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { 2318 if (!ICA.skipScalarizationCost()) 2319 ScalarizationCost += getScalarizationOverhead( 2320 VTy, /*Insert*/ false, /*Extract*/ true, CostKind); 2321 ScalarCalls = std::max(ScalarCalls, 2322 cast<FixedVectorType>(VTy)->getNumElements()); 2323 } 2324 } 2325 return ScalarCalls * ScalarCost + ScalarizationCost; 2326 } 2327 2328 // This is going to be turned into a library call, make it expensive. 2329 return SingleCallCost; 2330 } 2331 2332 /// Compute a cost of the given call instruction. 2333 /// 2334 /// Compute the cost of calling function F with return type RetTy and 2335 /// argument types Tys. F might be nullptr, in this case the cost of an 2336 /// arbitrary call with the specified signature will be returned. 2337 /// This is used, for instance, when we estimate call of a vector 2338 /// counterpart of the given function. 2339 /// \param F Called function, might be nullptr. 2340 /// \param RetTy Return value types. 2341 /// \param Tys Argument types. 2342 /// \returns The cost of Call instruction. 
getCallInstrCost(Function * F,Type * RetTy,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)2343 InstructionCost getCallInstrCost(Function *F, Type *RetTy, 2344 ArrayRef<Type *> Tys, 2345 TTI::TargetCostKind CostKind) { 2346 return 10; 2347 } 2348 getNumberOfParts(Type * Tp)2349 unsigned getNumberOfParts(Type *Tp) { 2350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp); 2351 return LT.first.isValid() ? *LT.first.getValue() : 0; 2352 } 2353 getAddressComputationCost(Type * Ty,ScalarEvolution *,const SCEV *)2354 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, 2355 const SCEV *) { 2356 return 0; 2357 } 2358 2359 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. 2360 /// We're assuming that reduction operation are performing the following way: 2361 /// 2362 /// %val1 = shufflevector<n x t> %val, <n x t> %undef, 2363 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> 2364 /// \----------------v-------------/ \----------v------------/ 2365 /// n/2 elements n/2 elements 2366 /// %red1 = op <n x t> %val, <n x t> val1 2367 /// After this operation we have a vector %red1 where only the first n/2 2368 /// elements are meaningful, the second n/2 elements are undefined and can be 2369 /// dropped. All other operations are actually working with the vector of 2370 /// length n/2, not n, though the real vector length is still n. 2371 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, 2372 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> 2373 /// \----------------v-------------/ \----------v------------/ 2374 /// n/4 elements 3*n/4 elements 2375 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of 2376 /// length n/2, the resulting vector has length n/4 etc. 2377 /// 2378 /// The cost model should take into account that the actual length of the 2379 /// vector is reduced on each iteration. getTreeReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2380 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, 2381 TTI::TargetCostKind CostKind) { 2382 // Targets must implement a default value for the scalable case, since 2383 // we don't know how many lanes the vector has. 2384 if (isa<ScalableVectorType>(Ty)) 2385 return InstructionCost::getInvalid(); 2386 2387 Type *ScalarTy = Ty->getElementType(); 2388 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2389 if ((Opcode == Instruction::Or || Opcode == Instruction::And) && 2390 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && 2391 NumVecElts >= 2) { 2392 // Or reduction for i1 is represented as: 2393 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2394 // %res = cmp ne iReduxWidth %val, 0 2395 // And reduction for i1 is represented as: 2396 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2397 // %res = cmp eq iReduxWidth %val, 11111 2398 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); 2399 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, 2400 TTI::CastContextHint::None, CostKind) + 2401 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, 2402 CmpInst::makeCmpResultType(ValTy), 2403 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2404 } 2405 unsigned NumReduxLevels = Log2_32(NumVecElts); 2406 InstructionCost ArithCost = 0; 2407 InstructionCost ShuffleCost = 0; 2408 std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty); 2409 unsigned LongVectorCount = 0; 2410 unsigned MVTLen = 2411 LT.second.isVector() ? 
LT.second.getVectorNumElements() : 1; 2412 while (NumVecElts > MVTLen) { 2413 NumVecElts /= 2; 2414 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2415 ShuffleCost += 2416 thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt, 2417 CostKind, NumVecElts, SubTy); 2418 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); 2419 Ty = SubTy; 2420 ++LongVectorCount; 2421 } 2422 2423 NumReduxLevels -= LongVectorCount; 2424 2425 // The minimal length of the vector is limited by the real length of vector 2426 // operations performed on the current platform. That's why several final 2427 // reduction operations are performed on the vectors with the same 2428 // architecture-dependent length. 2429 2430 // By default reductions need one shuffle per reduction level. 2431 ShuffleCost += 2432 NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 2433 std::nullopt, CostKind, 0, Ty); 2434 ArithCost += 2435 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); 2436 return ShuffleCost + ArithCost + 2437 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 2438 CostKind, 0, nullptr, nullptr); 2439 } 2440 2441 /// Try to calculate the cost of performing strict (in-order) reductions, 2442 /// which involves doing a sequence of floating point additions in lane 2443 /// order, starting with an initial value. For example, consider a scalar 2444 /// initial value 'InitVal' of type float and a vector of type <4 x float>: 2445 /// 2446 /// Vector = <float %v0, float %v1, float %v2, float %v3> 2447 /// 2448 /// %add1 = %InitVal + %v0 2449 /// %add2 = %add1 + %v1 2450 /// %add3 = %add2 + %v2 2451 /// %add4 = %add3 + %v3 2452 /// 2453 /// As a simple estimate we can say the cost of such a reduction is 4 times 2454 /// the cost of a scalar FP addition. We can only estimate the costs for 2455 /// fixed-width vectors here because for scalable vectors we do not know the 2456 /// runtime number of operations. getOrderedReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2457 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, 2458 TTI::TargetCostKind CostKind) { 2459 // Targets must implement a default value for the scalable case, since 2460 // we don't know how many lanes the vector has. 2461 if (isa<ScalableVectorType>(Ty)) 2462 return InstructionCost::getInvalid(); 2463 2464 auto *VTy = cast<FixedVectorType>(Ty); 2465 InstructionCost ExtractCost = getScalarizationOverhead( 2466 VTy, /*Insert=*/false, /*Extract=*/true, CostKind); 2467 InstructionCost ArithCost = thisT()->getArithmeticInstrCost( 2468 Opcode, VTy->getElementType(), CostKind); 2469 ArithCost *= VTy->getNumElements(); 2470 2471 return ExtractCost + ArithCost; 2472 } 2473 getArithmeticReductionCost(unsigned Opcode,VectorType * Ty,std::optional<FastMathFlags> FMF,TTI::TargetCostKind CostKind)2474 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, 2475 std::optional<FastMathFlags> FMF, 2476 TTI::TargetCostKind CostKind) { 2477 assert(Ty && "Unknown reduction vector type"); 2478 if (TTI::requiresOrderedReduction(FMF)) 2479 return getOrderedReductionCost(Opcode, Ty, CostKind); 2480 return getTreeReductionCost(Opcode, Ty, CostKind); 2481 } 2482 2483 /// Try to calculate op costs for min/max reduction operations. 2484 /// \param CondTy Conditional type for the Select instruction. 
  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind) {
    // Targets must implement a default value for the scalable case, since
    // we don't know how many lanes the vector has.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    Type *ScalarTy = Ty->getElementType();
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    InstructionCost MinMaxCost = 0;
    InstructionCost ShuffleCost = 0;
    std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);

      ShuffleCost +=
          thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
                                  CostKind, NumVecElts, SubTy);

      IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
      MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
    // architecture-dependent length.
    ShuffleCost +=
        NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                                 std::nullopt, CostKind, 0, Ty);
    IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
    MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
    // The last min/max should be in vector registers and we counted it above.
    // So we just need a single extractelement.
    return ShuffleCost + MinMaxCost +
           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
                                       CostKind, 0, nullptr, nullptr);
  }

  InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
                                           Type *ResTy, VectorType *Ty,
                                           FastMathFlags FMF,
                                           TTI::TargetCostKind CostKind) {
    // Without any native support, this is equivalent to the cost of
    // vecreduce.opcode(ext(Ty A)).
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
    InstructionCost RedCost =
        thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
    InstructionCost ExtCost = thisT()->getCastInstrCost(
        IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
        TTI::CastContextHint::None, CostKind);

    return RedCost + ExtCost;
  }

  InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
                                         VectorType *Ty,
                                         TTI::TargetCostKind CostKind) {
    // Without any native support, this is equivalent to the cost of
    // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
    // vecreduce.add(mul(A, B)).
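    // For example, with Ty = <8 x i16> and ResTy = i32, ExtTy is <8 x i32>:
    // the estimate below is two <8 x i16> -> <8 x i32> extends, one
    // <8 x i32> multiply, and an add reduction of the <8 x i32> product.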
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
    InstructionCost RedCost = thisT()->getArithmeticReductionCost(
        Instruction::Add, ExtTy, std::nullopt, CostKind);
    InstructionCost ExtCost = thisT()->getCastInstrCost(
        IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
        TTI::CastContextHint::None, CostKind);

    InstructionCost MulCost =
        thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);

    return RedCost + MulCost + 2 * ExtCost;
  }

  InstructionCost getVectorSplitCost() { return 1; }

  /// @}
};

/// Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  using BaseT = BasicTTIImplBase<BasicTTIImpl>;

  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};

} // end namespace llvm

#endif // LLVM_CODEGEN_BASICTTIIMPL_H