xref: /aosp_15_r20/external/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp (revision 9880d6810fe72a1726cb53787c6711e909410d58)
1*9880d681SAndroid Build Coastguard Worker //===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
2*9880d681SAndroid Build Coastguard Worker //
3*9880d681SAndroid Build Coastguard Worker //                     The LLVM Compiler Infrastructure
4*9880d681SAndroid Build Coastguard Worker //
5*9880d681SAndroid Build Coastguard Worker // This file is distributed under the University of Illinois Open Source
6*9880d681SAndroid Build Coastguard Worker // License. See LICENSE.TXT for details.
7*9880d681SAndroid Build Coastguard Worker //
8*9880d681SAndroid Build Coastguard Worker //===----------------------------------------------------------------------===//
9*9880d681SAndroid Build Coastguard Worker //
10*9880d681SAndroid Build Coastguard Worker // This pass tries to fuse DS instructions with close by immediate offsets.
11*9880d681SAndroid Build Coastguard Worker // This will fuse operations such as
12*9880d681SAndroid Build Coastguard Worker //  ds_read_b32 v0, v2 offset:16
13*9880d681SAndroid Build Coastguard Worker //  ds_read_b32 v1, v2 offset:32
14*9880d681SAndroid Build Coastguard Worker // ==>
15*9880d681SAndroid Build Coastguard Worker //   ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
16*9880d681SAndroid Build Coastguard Worker //
17*9880d681SAndroid Build Coastguard Worker //
18*9880d681SAndroid Build Coastguard Worker // Future improvements:
19*9880d681SAndroid Build Coastguard Worker //
20*9880d681SAndroid Build Coastguard Worker // - This currently relies on the scheduler to place loads and stores next to
21*9880d681SAndroid Build Coastguard Worker //   each other, and then only merges adjacent pairs of instructions. It would
22*9880d681SAndroid Build Coastguard Worker //   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It currently misses stores of constants because loading
24*9880d681SAndroid Build Coastguard Worker //   the constant into the data register is placed between the stores, although
25*9880d681SAndroid Build Coastguard Worker //   this is arguably a scheduling problem.
26*9880d681SAndroid Build Coastguard Worker //
27*9880d681SAndroid Build Coastguard Worker // - Live interval recomputing seems inefficient. This currently only matches
28*9880d681SAndroid Build Coastguard Worker //   one pair, and recomputes live intervals and moves on to the next pair. It
29*9880d681SAndroid Build Coastguard Worker //   would be better to compute a list of all merges that need to occur.
30*9880d681SAndroid Build Coastguard Worker //
31*9880d681SAndroid Build Coastguard Worker // - With a list of instructions to process, we can also merge more. If a
//   cluster of loads has offsets that are too large to fit in the 8-bit
//   offset fields, but are close enough together that their differences fit
//   in 8 bits, we can add to the base pointer and use the new reduced offsets.
35*9880d681SAndroid Build Coastguard Worker //
36*9880d681SAndroid Build Coastguard Worker //===----------------------------------------------------------------------===//
37*9880d681SAndroid Build Coastguard Worker 
38*9880d681SAndroid Build Coastguard Worker #include "AMDGPU.h"
39*9880d681SAndroid Build Coastguard Worker #include "AMDGPUSubtarget.h"
40*9880d681SAndroid Build Coastguard Worker #include "SIInstrInfo.h"
41*9880d681SAndroid Build Coastguard Worker #include "SIRegisterInfo.h"
42*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/LiveIntervalAnalysis.h"
43*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/LiveVariables.h"
44*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/MachineFunction.h"
45*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/MachineFunctionPass.h"
46*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/MachineInstrBuilder.h"
47*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/MachineRegisterInfo.h"
48*9880d681SAndroid Build Coastguard Worker #include "llvm/Support/Debug.h"
49*9880d681SAndroid Build Coastguard Worker #include "llvm/Support/raw_ostream.h"
50*9880d681SAndroid Build Coastguard Worker #include "llvm/Target/TargetMachine.h"
51*9880d681SAndroid Build Coastguard Worker 
52*9880d681SAndroid Build Coastguard Worker using namespace llvm;
53*9880d681SAndroid Build Coastguard Worker 
54*9880d681SAndroid Build Coastguard Worker #define DEBUG_TYPE "si-load-store-opt"
55*9880d681SAndroid Build Coastguard Worker 
56*9880d681SAndroid Build Coastguard Worker namespace {
57*9880d681SAndroid Build Coastguard Worker 
58*9880d681SAndroid Build Coastguard Worker class SILoadStoreOptimizer : public MachineFunctionPass {
59*9880d681SAndroid Build Coastguard Worker private:
60*9880d681SAndroid Build Coastguard Worker   const SIInstrInfo *TII;
61*9880d681SAndroid Build Coastguard Worker   const SIRegisterInfo *TRI;
62*9880d681SAndroid Build Coastguard Worker   MachineRegisterInfo *MRI;
63*9880d681SAndroid Build Coastguard Worker   LiveIntervals *LIS;
64*9880d681SAndroid Build Coastguard Worker 
65*9880d681SAndroid Build Coastguard Worker   static bool offsetsCanBeCombined(unsigned Offset0,
66*9880d681SAndroid Build Coastguard Worker                                    unsigned Offset1,
67*9880d681SAndroid Build Coastguard Worker                                    unsigned EltSize);
68*9880d681SAndroid Build Coastguard Worker 
69*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
70*9880d681SAndroid Build Coastguard Worker                                                  unsigned EltSize);
71*9880d681SAndroid Build Coastguard Worker 
72*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock::iterator mergeRead2Pair(
73*9880d681SAndroid Build Coastguard Worker     MachineBasicBlock::iterator I,
74*9880d681SAndroid Build Coastguard Worker     MachineBasicBlock::iterator Paired,
75*9880d681SAndroid Build Coastguard Worker     unsigned EltSize);
76*9880d681SAndroid Build Coastguard Worker 
77*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock::iterator mergeWrite2Pair(
78*9880d681SAndroid Build Coastguard Worker     MachineBasicBlock::iterator I,
79*9880d681SAndroid Build Coastguard Worker     MachineBasicBlock::iterator Paired,
80*9880d681SAndroid Build Coastguard Worker     unsigned EltSize);
81*9880d681SAndroid Build Coastguard Worker 
82*9880d681SAndroid Build Coastguard Worker public:
83*9880d681SAndroid Build Coastguard Worker   static char ID;
84*9880d681SAndroid Build Coastguard Worker 
SILoadStoreOptimizer()85*9880d681SAndroid Build Coastguard Worker   SILoadStoreOptimizer()
86*9880d681SAndroid Build Coastguard Worker       : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
87*9880d681SAndroid Build Coastguard Worker         LIS(nullptr) {}
88*9880d681SAndroid Build Coastguard Worker 
SILoadStoreOptimizer(const TargetMachine & TM_)89*9880d681SAndroid Build Coastguard Worker   SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
90*9880d681SAndroid Build Coastguard Worker     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
91*9880d681SAndroid Build Coastguard Worker   }
92*9880d681SAndroid Build Coastguard Worker 
93*9880d681SAndroid Build Coastguard Worker   bool optimizeBlock(MachineBasicBlock &MBB);
94*9880d681SAndroid Build Coastguard Worker 
95*9880d681SAndroid Build Coastguard Worker   bool runOnMachineFunction(MachineFunction &MF) override;
96*9880d681SAndroid Build Coastguard Worker 
getPassName() const97*9880d681SAndroid Build Coastguard Worker   const char *getPassName() const override {
98*9880d681SAndroid Build Coastguard Worker     return "SI Load / Store Optimizer";
99*9880d681SAndroid Build Coastguard Worker   }
100*9880d681SAndroid Build Coastguard Worker 
getAnalysisUsage(AnalysisUsage & AU) const101*9880d681SAndroid Build Coastguard Worker   void getAnalysisUsage(AnalysisUsage &AU) const override {
102*9880d681SAndroid Build Coastguard Worker     AU.setPreservesCFG();
103*9880d681SAndroid Build Coastguard Worker     AU.addPreserved<SlotIndexes>();
104*9880d681SAndroid Build Coastguard Worker     AU.addPreserved<LiveIntervals>();
105*9880d681SAndroid Build Coastguard Worker     AU.addPreserved<LiveVariables>();
106*9880d681SAndroid Build Coastguard Worker     AU.addRequired<LiveIntervals>();
107*9880d681SAndroid Build Coastguard Worker 
108*9880d681SAndroid Build Coastguard Worker     MachineFunctionPass::getAnalysisUsage(AU);
109*9880d681SAndroid Build Coastguard Worker   }
110*9880d681SAndroid Build Coastguard Worker };
111*9880d681SAndroid Build Coastguard Worker 
112*9880d681SAndroid Build Coastguard Worker } // End anonymous namespace.
113*9880d681SAndroid Build Coastguard Worker 
// Pass registration under the DEBUG_TYPE name ("si-load-store-opt"), listing
// LiveIntervals, LiveVariables and SlotIndexes as pass dependencies.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

// Storage for the pass identifier handed to the MachineFunctionPass
// constructor.
char SILoadStoreOptimizer::ID = 0;

// Reference to the identifier above, exposed in the llvm namespace because
// the class itself lives in an anonymous namespace.
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
125*9880d681SAndroid Build Coastguard Worker 
createSILoadStoreOptimizerPass(TargetMachine & TM)126*9880d681SAndroid Build Coastguard Worker FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
127*9880d681SAndroid Build Coastguard Worker   return new SILoadStoreOptimizer(TM);
128*9880d681SAndroid Build Coastguard Worker }
129*9880d681SAndroid Build Coastguard Worker 
offsetsCanBeCombined(unsigned Offset0,unsigned Offset1,unsigned Size)130*9880d681SAndroid Build Coastguard Worker bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
131*9880d681SAndroid Build Coastguard Worker                                                 unsigned Offset1,
132*9880d681SAndroid Build Coastguard Worker                                                 unsigned Size) {
133*9880d681SAndroid Build Coastguard Worker   // XXX - Would the same offset be OK? Is there any reason this would happen or
134*9880d681SAndroid Build Coastguard Worker   // be useful?
135*9880d681SAndroid Build Coastguard Worker   if (Offset0 == Offset1)
136*9880d681SAndroid Build Coastguard Worker     return false;
137*9880d681SAndroid Build Coastguard Worker 
138*9880d681SAndroid Build Coastguard Worker   // This won't be valid if the offset isn't aligned.
139*9880d681SAndroid Build Coastguard Worker   if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
140*9880d681SAndroid Build Coastguard Worker     return false;
141*9880d681SAndroid Build Coastguard Worker 
142*9880d681SAndroid Build Coastguard Worker   unsigned EltOffset0 = Offset0 / Size;
143*9880d681SAndroid Build Coastguard Worker   unsigned EltOffset1 = Offset1 / Size;
144*9880d681SAndroid Build Coastguard Worker 
145*9880d681SAndroid Build Coastguard Worker   // Check if the new offsets fit in the reduced 8-bit range.
146*9880d681SAndroid Build Coastguard Worker   if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
147*9880d681SAndroid Build Coastguard Worker     return true;
148*9880d681SAndroid Build Coastguard Worker 
149*9880d681SAndroid Build Coastguard Worker   // If the offset in elements doesn't fit in 8-bits, we might be able to use
150*9880d681SAndroid Build Coastguard Worker   // the stride 64 versions.
151*9880d681SAndroid Build Coastguard Worker   if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0)
152*9880d681SAndroid Build Coastguard Worker     return false;
153*9880d681SAndroid Build Coastguard Worker 
154*9880d681SAndroid Build Coastguard Worker   return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
155*9880d681SAndroid Build Coastguard Worker }
156*9880d681SAndroid Build Coastguard Worker 
157*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator
findMatchingDSInst(MachineBasicBlock::iterator I,unsigned EltSize)158*9880d681SAndroid Build Coastguard Worker SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
159*9880d681SAndroid Build Coastguard Worker                                          unsigned EltSize){
160*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock::iterator E = I->getParent()->end();
161*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock::iterator MBBI = I;
162*9880d681SAndroid Build Coastguard Worker   ++MBBI;
163*9880d681SAndroid Build Coastguard Worker 
164*9880d681SAndroid Build Coastguard Worker   if (MBBI->getOpcode() != I->getOpcode())
165*9880d681SAndroid Build Coastguard Worker     return E;
166*9880d681SAndroid Build Coastguard Worker 
167*9880d681SAndroid Build Coastguard Worker   // Don't merge volatiles.
168*9880d681SAndroid Build Coastguard Worker   if (MBBI->hasOrderedMemoryRef())
169*9880d681SAndroid Build Coastguard Worker     return E;
170*9880d681SAndroid Build Coastguard Worker 
171*9880d681SAndroid Build Coastguard Worker   int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
172*9880d681SAndroid Build Coastguard Worker   const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
173*9880d681SAndroid Build Coastguard Worker   const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
174*9880d681SAndroid Build Coastguard Worker 
175*9880d681SAndroid Build Coastguard Worker   // Check same base pointer. Be careful of subregisters, which can occur with
176*9880d681SAndroid Build Coastguard Worker   // vectors of pointers.
177*9880d681SAndroid Build Coastguard Worker   if (AddrReg0.getReg() == AddrReg1.getReg() &&
178*9880d681SAndroid Build Coastguard Worker       AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
179*9880d681SAndroid Build Coastguard Worker     int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
180*9880d681SAndroid Build Coastguard Worker                                                AMDGPU::OpName::offset);
181*9880d681SAndroid Build Coastguard Worker     unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
182*9880d681SAndroid Build Coastguard Worker     unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
183*9880d681SAndroid Build Coastguard Worker 
184*9880d681SAndroid Build Coastguard Worker     // Check both offsets fit in the reduced range.
185*9880d681SAndroid Build Coastguard Worker     if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
186*9880d681SAndroid Build Coastguard Worker       return MBBI;
187*9880d681SAndroid Build Coastguard Worker   }
188*9880d681SAndroid Build Coastguard Worker 
189*9880d681SAndroid Build Coastguard Worker   return E;
190*9880d681SAndroid Build Coastguard Worker }
191*9880d681SAndroid Build Coastguard Worker 
mergeRead2Pair(MachineBasicBlock::iterator I,MachineBasicBlock::iterator Paired,unsigned EltSize)192*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator  SILoadStoreOptimizer::mergeRead2Pair(
193*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock::iterator I,
194*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock::iterator Paired,
195*9880d681SAndroid Build Coastguard Worker   unsigned EltSize) {
196*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock *MBB = I->getParent();
197*9880d681SAndroid Build Coastguard Worker 
198*9880d681SAndroid Build Coastguard Worker   // Be careful, since the addresses could be subregisters themselves in weird
199*9880d681SAndroid Build Coastguard Worker   // cases, like vectors of pointers.
200*9880d681SAndroid Build Coastguard Worker   const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
201*9880d681SAndroid Build Coastguard Worker 
202*9880d681SAndroid Build Coastguard Worker   const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
203*9880d681SAndroid Build Coastguard Worker   const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);
204*9880d681SAndroid Build Coastguard Worker 
205*9880d681SAndroid Build Coastguard Worker   unsigned Offset0
206*9880d681SAndroid Build Coastguard Worker     = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
207*9880d681SAndroid Build Coastguard Worker   unsigned Offset1
208*9880d681SAndroid Build Coastguard Worker     = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
209*9880d681SAndroid Build Coastguard Worker 
210*9880d681SAndroid Build Coastguard Worker   unsigned NewOffset0 = Offset0 / EltSize;
211*9880d681SAndroid Build Coastguard Worker   unsigned NewOffset1 = Offset1 / EltSize;
212*9880d681SAndroid Build Coastguard Worker   unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
213*9880d681SAndroid Build Coastguard Worker 
214*9880d681SAndroid Build Coastguard Worker   // Prefer the st64 form if we can use it, even if we can fit the offset in the
215*9880d681SAndroid Build Coastguard Worker   // non st64 version. I'm not sure if there's any real reason to do this.
216*9880d681SAndroid Build Coastguard Worker   bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
217*9880d681SAndroid Build Coastguard Worker   if (UseST64) {
218*9880d681SAndroid Build Coastguard Worker     NewOffset0 /= 64;
219*9880d681SAndroid Build Coastguard Worker     NewOffset1 /= 64;
220*9880d681SAndroid Build Coastguard Worker     Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
221*9880d681SAndroid Build Coastguard Worker   }
222*9880d681SAndroid Build Coastguard Worker 
223*9880d681SAndroid Build Coastguard Worker   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
224*9880d681SAndroid Build Coastguard Worker          (NewOffset0 != NewOffset1) &&
225*9880d681SAndroid Build Coastguard Worker          "Computed offset doesn't fit");
226*9880d681SAndroid Build Coastguard Worker 
227*9880d681SAndroid Build Coastguard Worker   const MCInstrDesc &Read2Desc = TII->get(Opc);
228*9880d681SAndroid Build Coastguard Worker 
229*9880d681SAndroid Build Coastguard Worker   const TargetRegisterClass *SuperRC
230*9880d681SAndroid Build Coastguard Worker     = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
231*9880d681SAndroid Build Coastguard Worker   unsigned DestReg = MRI->createVirtualRegister(SuperRC);
232*9880d681SAndroid Build Coastguard Worker 
233*9880d681SAndroid Build Coastguard Worker   DebugLoc DL = I->getDebugLoc();
234*9880d681SAndroid Build Coastguard Worker   MachineInstrBuilder Read2
235*9880d681SAndroid Build Coastguard Worker     = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
236*9880d681SAndroid Build Coastguard Worker     .addOperand(*AddrReg) // addr
237*9880d681SAndroid Build Coastguard Worker     .addImm(NewOffset0) // offset0
238*9880d681SAndroid Build Coastguard Worker     .addImm(NewOffset1) // offset1
239*9880d681SAndroid Build Coastguard Worker     .addImm(0) // gds
240*9880d681SAndroid Build Coastguard Worker     .addMemOperand(*I->memoperands_begin())
241*9880d681SAndroid Build Coastguard Worker     .addMemOperand(*Paired->memoperands_begin());
242*9880d681SAndroid Build Coastguard Worker 
243*9880d681SAndroid Build Coastguard Worker   unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
244*9880d681SAndroid Build Coastguard Worker   unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
245*9880d681SAndroid Build Coastguard Worker 
246*9880d681SAndroid Build Coastguard Worker   const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
247*9880d681SAndroid Build Coastguard Worker 
248*9880d681SAndroid Build Coastguard Worker   // Copy to the old destination registers.
249*9880d681SAndroid Build Coastguard Worker   MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
250*9880d681SAndroid Build Coastguard Worker     .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
251*9880d681SAndroid Build Coastguard Worker     .addReg(DestReg, 0, SubRegIdx0);
252*9880d681SAndroid Build Coastguard Worker   MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
253*9880d681SAndroid Build Coastguard Worker     .addOperand(*Dest1)
254*9880d681SAndroid Build Coastguard Worker     .addReg(DestReg, RegState::Kill, SubRegIdx1);
255*9880d681SAndroid Build Coastguard Worker 
256*9880d681SAndroid Build Coastguard Worker   LIS->InsertMachineInstrInMaps(*Read2);
257*9880d681SAndroid Build Coastguard Worker 
258*9880d681SAndroid Build Coastguard Worker   // repairLiveintervalsInRange() doesn't handle physical register, so we have
259*9880d681SAndroid Build Coastguard Worker   // to update the M0 range manually.
260*9880d681SAndroid Build Coastguard Worker   SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
261*9880d681SAndroid Build Coastguard Worker   LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
262*9880d681SAndroid Build Coastguard Worker   LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
263*9880d681SAndroid Build Coastguard Worker   bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
264*9880d681SAndroid Build Coastguard Worker 
265*9880d681SAndroid Build Coastguard Worker   // The new write to the original destination register is now the copy. Steal
266*9880d681SAndroid Build Coastguard Worker   // the old SlotIndex.
267*9880d681SAndroid Build Coastguard Worker   LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
268*9880d681SAndroid Build Coastguard Worker   LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
269*9880d681SAndroid Build Coastguard Worker 
270*9880d681SAndroid Build Coastguard Worker   I->eraseFromParent();
271*9880d681SAndroid Build Coastguard Worker   Paired->eraseFromParent();
272*9880d681SAndroid Build Coastguard Worker 
273*9880d681SAndroid Build Coastguard Worker   LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
274*9880d681SAndroid Build Coastguard Worker   LIS->shrinkToUses(&AddrRegLI);
275*9880d681SAndroid Build Coastguard Worker 
276*9880d681SAndroid Build Coastguard Worker   LIS->createAndComputeVirtRegInterval(DestReg);
277*9880d681SAndroid Build Coastguard Worker 
278*9880d681SAndroid Build Coastguard Worker   if (UpdateM0Range) {
279*9880d681SAndroid Build Coastguard Worker     SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
280*9880d681SAndroid Build Coastguard Worker     M0Segment->end = Read2Index.getRegSlot();
281*9880d681SAndroid Build Coastguard Worker   }
282*9880d681SAndroid Build Coastguard Worker 
283*9880d681SAndroid Build Coastguard Worker   DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
284*9880d681SAndroid Build Coastguard Worker   return Read2.getInstr();
285*9880d681SAndroid Build Coastguard Worker }
286*9880d681SAndroid Build Coastguard Worker 
mergeWrite2Pair(MachineBasicBlock::iterator I,MachineBasicBlock::iterator Paired,unsigned EltSize)287*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
288*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock::iterator I,
289*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock::iterator Paired,
290*9880d681SAndroid Build Coastguard Worker   unsigned EltSize) {
291*9880d681SAndroid Build Coastguard Worker   MachineBasicBlock *MBB = I->getParent();
292*9880d681SAndroid Build Coastguard Worker 
293*9880d681SAndroid Build Coastguard Worker   // Be sure to use .addOperand(), and not .addReg() with these. We want to be
294*9880d681SAndroid Build Coastguard Worker   // sure we preserve the subregister index and any register flags set on them.
295*9880d681SAndroid Build Coastguard Worker   const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
296*9880d681SAndroid Build Coastguard Worker   const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
297*9880d681SAndroid Build Coastguard Worker   const MachineOperand *Data1
298*9880d681SAndroid Build Coastguard Worker     = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
299*9880d681SAndroid Build Coastguard Worker 
300*9880d681SAndroid Build Coastguard Worker 
301*9880d681SAndroid Build Coastguard Worker   unsigned Offset0
302*9880d681SAndroid Build Coastguard Worker     = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
303*9880d681SAndroid Build Coastguard Worker   unsigned Offset1
304*9880d681SAndroid Build Coastguard Worker     = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
305*9880d681SAndroid Build Coastguard Worker 
306*9880d681SAndroid Build Coastguard Worker   unsigned NewOffset0 = Offset0 / EltSize;
307*9880d681SAndroid Build Coastguard Worker   unsigned NewOffset1 = Offset1 / EltSize;
308*9880d681SAndroid Build Coastguard Worker   unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
309*9880d681SAndroid Build Coastguard Worker 
310*9880d681SAndroid Build Coastguard Worker   // Prefer the st64 form if we can use it, even if we can fit the offset in the
311*9880d681SAndroid Build Coastguard Worker   // non st64 version. I'm not sure if there's any real reason to do this.
312*9880d681SAndroid Build Coastguard Worker   bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
313*9880d681SAndroid Build Coastguard Worker   if (UseST64) {
314*9880d681SAndroid Build Coastguard Worker     NewOffset0 /= 64;
315*9880d681SAndroid Build Coastguard Worker     NewOffset1 /= 64;
316*9880d681SAndroid Build Coastguard Worker     Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
317*9880d681SAndroid Build Coastguard Worker   }
318*9880d681SAndroid Build Coastguard Worker 
319*9880d681SAndroid Build Coastguard Worker   assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
320*9880d681SAndroid Build Coastguard Worker          (NewOffset0 != NewOffset1) &&
321*9880d681SAndroid Build Coastguard Worker          "Computed offset doesn't fit");
322*9880d681SAndroid Build Coastguard Worker 
323*9880d681SAndroid Build Coastguard Worker   const MCInstrDesc &Write2Desc = TII->get(Opc);
324*9880d681SAndroid Build Coastguard Worker   DebugLoc DL = I->getDebugLoc();
325*9880d681SAndroid Build Coastguard Worker 
326*9880d681SAndroid Build Coastguard Worker   // repairLiveintervalsInRange() doesn't handle physical register, so we have
327*9880d681SAndroid Build Coastguard Worker   // to update the M0 range manually.
328*9880d681SAndroid Build Coastguard Worker   SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
329*9880d681SAndroid Build Coastguard Worker   LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
330*9880d681SAndroid Build Coastguard Worker   LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
331*9880d681SAndroid Build Coastguard Worker   bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
332*9880d681SAndroid Build Coastguard Worker 
333*9880d681SAndroid Build Coastguard Worker   MachineInstrBuilder Write2
334*9880d681SAndroid Build Coastguard Worker     = BuildMI(*MBB, I, DL, Write2Desc)
335*9880d681SAndroid Build Coastguard Worker     .addOperand(*Addr) // addr
336*9880d681SAndroid Build Coastguard Worker     .addOperand(*Data0) // data0
337*9880d681SAndroid Build Coastguard Worker     .addOperand(*Data1) // data1
338*9880d681SAndroid Build Coastguard Worker     .addImm(NewOffset0) // offset0
339*9880d681SAndroid Build Coastguard Worker     .addImm(NewOffset1) // offset1
340*9880d681SAndroid Build Coastguard Worker     .addImm(0) // gds
341*9880d681SAndroid Build Coastguard Worker     .addMemOperand(*I->memoperands_begin())
342*9880d681SAndroid Build Coastguard Worker     .addMemOperand(*Paired->memoperands_begin());
343*9880d681SAndroid Build Coastguard Worker 
344*9880d681SAndroid Build Coastguard Worker   // XXX - How do we express subregisters here?
345*9880d681SAndroid Build Coastguard Worker   unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
346*9880d681SAndroid Build Coastguard Worker 
347*9880d681SAndroid Build Coastguard Worker   LIS->RemoveMachineInstrFromMaps(*I);
348*9880d681SAndroid Build Coastguard Worker   LIS->RemoveMachineInstrFromMaps(*Paired);
349*9880d681SAndroid Build Coastguard Worker   I->eraseFromParent();
350*9880d681SAndroid Build Coastguard Worker   Paired->eraseFromParent();
351*9880d681SAndroid Build Coastguard Worker 
352*9880d681SAndroid Build Coastguard Worker   // This doesn't handle physical registers like M0
353*9880d681SAndroid Build Coastguard Worker   LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
354*9880d681SAndroid Build Coastguard Worker 
355*9880d681SAndroid Build Coastguard Worker   if (UpdateM0Range) {
356*9880d681SAndroid Build Coastguard Worker     SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
357*9880d681SAndroid Build Coastguard Worker     M0Segment->end = Write2Index.getRegSlot();
358*9880d681SAndroid Build Coastguard Worker   }
359*9880d681SAndroid Build Coastguard Worker 
360*9880d681SAndroid Build Coastguard Worker   DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
361*9880d681SAndroid Build Coastguard Worker   return Write2.getInstr();
362*9880d681SAndroid Build Coastguard Worker }
363*9880d681SAndroid Build Coastguard Worker 
364*9880d681SAndroid Build Coastguard Worker // Scan through looking for adjacent LDS operations with constant offsets from
365*9880d681SAndroid Build Coastguard Worker // the same base register. We rely on the scheduler to do the hard work of
366*9880d681SAndroid Build Coastguard Worker // clustering nearby loads, and assume these are all adjacent.
optimizeBlock(MachineBasicBlock & MBB)367*9880d681SAndroid Build Coastguard Worker bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
368*9880d681SAndroid Build Coastguard Worker   bool Modified = false;
369*9880d681SAndroid Build Coastguard Worker 
370*9880d681SAndroid Build Coastguard Worker   for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
371*9880d681SAndroid Build Coastguard Worker     MachineInstr &MI = *I;
372*9880d681SAndroid Build Coastguard Worker 
373*9880d681SAndroid Build Coastguard Worker     // Don't combine if volatile.
374*9880d681SAndroid Build Coastguard Worker     if (MI.hasOrderedMemoryRef()) {
375*9880d681SAndroid Build Coastguard Worker       ++I;
376*9880d681SAndroid Build Coastguard Worker       continue;
377*9880d681SAndroid Build Coastguard Worker     }
378*9880d681SAndroid Build Coastguard Worker 
379*9880d681SAndroid Build Coastguard Worker     unsigned Opc = MI.getOpcode();
380*9880d681SAndroid Build Coastguard Worker     if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
381*9880d681SAndroid Build Coastguard Worker       unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
382*9880d681SAndroid Build Coastguard Worker       MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
383*9880d681SAndroid Build Coastguard Worker       if (Match != E) {
384*9880d681SAndroid Build Coastguard Worker         Modified = true;
385*9880d681SAndroid Build Coastguard Worker         I = mergeRead2Pair(I, Match, Size);
386*9880d681SAndroid Build Coastguard Worker       } else {
387*9880d681SAndroid Build Coastguard Worker         ++I;
388*9880d681SAndroid Build Coastguard Worker       }
389*9880d681SAndroid Build Coastguard Worker 
390*9880d681SAndroid Build Coastguard Worker       continue;
391*9880d681SAndroid Build Coastguard Worker     } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
392*9880d681SAndroid Build Coastguard Worker       unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
393*9880d681SAndroid Build Coastguard Worker       MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
394*9880d681SAndroid Build Coastguard Worker       if (Match != E) {
395*9880d681SAndroid Build Coastguard Worker         Modified = true;
396*9880d681SAndroid Build Coastguard Worker         I = mergeWrite2Pair(I, Match, Size);
397*9880d681SAndroid Build Coastguard Worker       } else {
398*9880d681SAndroid Build Coastguard Worker         ++I;
399*9880d681SAndroid Build Coastguard Worker       }
400*9880d681SAndroid Build Coastguard Worker 
401*9880d681SAndroid Build Coastguard Worker       continue;
402*9880d681SAndroid Build Coastguard Worker     }
403*9880d681SAndroid Build Coastguard Worker 
404*9880d681SAndroid Build Coastguard Worker     ++I;
405*9880d681SAndroid Build Coastguard Worker   }
406*9880d681SAndroid Build Coastguard Worker 
407*9880d681SAndroid Build Coastguard Worker   return Modified;
408*9880d681SAndroid Build Coastguard Worker }
409*9880d681SAndroid Build Coastguard Worker 
runOnMachineFunction(MachineFunction & MF)410*9880d681SAndroid Build Coastguard Worker bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
411*9880d681SAndroid Build Coastguard Worker   if (skipFunction(*MF.getFunction()))
412*9880d681SAndroid Build Coastguard Worker     return false;
413*9880d681SAndroid Build Coastguard Worker 
414*9880d681SAndroid Build Coastguard Worker   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
415*9880d681SAndroid Build Coastguard Worker   if (!STM.loadStoreOptEnabled())
416*9880d681SAndroid Build Coastguard Worker     return false;
417*9880d681SAndroid Build Coastguard Worker 
418*9880d681SAndroid Build Coastguard Worker   TII = STM.getInstrInfo();
419*9880d681SAndroid Build Coastguard Worker   TRI = &TII->getRegisterInfo();
420*9880d681SAndroid Build Coastguard Worker 
421*9880d681SAndroid Build Coastguard Worker   MRI = &MF.getRegInfo();
422*9880d681SAndroid Build Coastguard Worker 
423*9880d681SAndroid Build Coastguard Worker   LIS = &getAnalysis<LiveIntervals>();
424*9880d681SAndroid Build Coastguard Worker 
425*9880d681SAndroid Build Coastguard Worker   DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
426*9880d681SAndroid Build Coastguard Worker 
427*9880d681SAndroid Build Coastguard Worker   assert(!MRI->isSSA());
428*9880d681SAndroid Build Coastguard Worker 
429*9880d681SAndroid Build Coastguard Worker   bool Modified = false;
430*9880d681SAndroid Build Coastguard Worker 
431*9880d681SAndroid Build Coastguard Worker   for (MachineBasicBlock &MBB : MF)
432*9880d681SAndroid Build Coastguard Worker     Modified |= optimizeBlock(MBB);
433*9880d681SAndroid Build Coastguard Worker 
434*9880d681SAndroid Build Coastguard Worker   return Modified;
435*9880d681SAndroid Build Coastguard Worker }
436