1*9880d681SAndroid Build Coastguard Worker //===-- SILoadStoreOptimizer.cpp ------------------------------------------===//
2*9880d681SAndroid Build Coastguard Worker //
3*9880d681SAndroid Build Coastguard Worker // The LLVM Compiler Infrastructure
4*9880d681SAndroid Build Coastguard Worker //
5*9880d681SAndroid Build Coastguard Worker // This file is distributed under the University of Illinois Open Source
6*9880d681SAndroid Build Coastguard Worker // License. See LICENSE.TXT for details.
7*9880d681SAndroid Build Coastguard Worker //
8*9880d681SAndroid Build Coastguard Worker //===----------------------------------------------------------------------===//
9*9880d681SAndroid Build Coastguard Worker //
10*9880d681SAndroid Build Coastguard Worker // This pass tries to fuse DS instructions with close by immediate offsets.
11*9880d681SAndroid Build Coastguard Worker // This will fuse operations such as
12*9880d681SAndroid Build Coastguard Worker // ds_read_b32 v0, v2 offset:16
13*9880d681SAndroid Build Coastguard Worker // ds_read_b32 v1, v2 offset:32
14*9880d681SAndroid Build Coastguard Worker // ==>
15*9880d681SAndroid Build Coastguard Worker // ds_read2_b32 v[0:1], v2, offset0:4 offset1:8
16*9880d681SAndroid Build Coastguard Worker //
17*9880d681SAndroid Build Coastguard Worker //
18*9880d681SAndroid Build Coastguard Worker // Future improvements:
19*9880d681SAndroid Build Coastguard Worker //
// - This currently relies on the scheduler to place loads and stores next to
//   each other, and then only merges adjacent pairs of instructions. It would
//   be good to be more flexible with interleaved instructions, and possibly run
//   before scheduling. It is currently missing stores of constants because
//   loading the constant into the data register is placed between the stores,
//   although this is arguably a scheduling problem.
26*9880d681SAndroid Build Coastguard Worker //
27*9880d681SAndroid Build Coastguard Worker // - Live interval recomputing seems inefficient. This currently only matches
28*9880d681SAndroid Build Coastguard Worker // one pair, and recomputes live intervals and moves on to the next pair. It
29*9880d681SAndroid Build Coastguard Worker // would be better to compute a list of all merges that need to occur.
30*9880d681SAndroid Build Coastguard Worker //
31*9880d681SAndroid Build Coastguard Worker // - With a list of instructions to process, we can also merge more. If a
32*9880d681SAndroid Build Coastguard Worker // cluster of loads have offsets that are too large to fit in the 8-bit
33*9880d681SAndroid Build Coastguard Worker // offsets, but are close enough to fit in the 8 bits, we can add to the base
34*9880d681SAndroid Build Coastguard Worker // pointer and use the new reduced offsets.
35*9880d681SAndroid Build Coastguard Worker //
36*9880d681SAndroid Build Coastguard Worker //===----------------------------------------------------------------------===//
37*9880d681SAndroid Build Coastguard Worker
38*9880d681SAndroid Build Coastguard Worker #include "AMDGPU.h"
39*9880d681SAndroid Build Coastguard Worker #include "AMDGPUSubtarget.h"
40*9880d681SAndroid Build Coastguard Worker #include "SIInstrInfo.h"
41*9880d681SAndroid Build Coastguard Worker #include "SIRegisterInfo.h"
42*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/LiveIntervalAnalysis.h"
43*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/LiveVariables.h"
44*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/MachineFunction.h"
45*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/MachineFunctionPass.h"
46*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/MachineInstrBuilder.h"
47*9880d681SAndroid Build Coastguard Worker #include "llvm/CodeGen/MachineRegisterInfo.h"
48*9880d681SAndroid Build Coastguard Worker #include "llvm/Support/Debug.h"
49*9880d681SAndroid Build Coastguard Worker #include "llvm/Support/raw_ostream.h"
50*9880d681SAndroid Build Coastguard Worker #include "llvm/Target/TargetMachine.h"
51*9880d681SAndroid Build Coastguard Worker
52*9880d681SAndroid Build Coastguard Worker using namespace llvm;
53*9880d681SAndroid Build Coastguard Worker
54*9880d681SAndroid Build Coastguard Worker #define DEBUG_TYPE "si-load-store-opt"
55*9880d681SAndroid Build Coastguard Worker
56*9880d681SAndroid Build Coastguard Worker namespace {
57*9880d681SAndroid Build Coastguard Worker
58*9880d681SAndroid Build Coastguard Worker class SILoadStoreOptimizer : public MachineFunctionPass {
59*9880d681SAndroid Build Coastguard Worker private:
60*9880d681SAndroid Build Coastguard Worker const SIInstrInfo *TII;
61*9880d681SAndroid Build Coastguard Worker const SIRegisterInfo *TRI;
62*9880d681SAndroid Build Coastguard Worker MachineRegisterInfo *MRI;
63*9880d681SAndroid Build Coastguard Worker LiveIntervals *LIS;
64*9880d681SAndroid Build Coastguard Worker
65*9880d681SAndroid Build Coastguard Worker static bool offsetsCanBeCombined(unsigned Offset0,
66*9880d681SAndroid Build Coastguard Worker unsigned Offset1,
67*9880d681SAndroid Build Coastguard Worker unsigned EltSize);
68*9880d681SAndroid Build Coastguard Worker
69*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
70*9880d681SAndroid Build Coastguard Worker unsigned EltSize);
71*9880d681SAndroid Build Coastguard Worker
72*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator mergeRead2Pair(
73*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator I,
74*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator Paired,
75*9880d681SAndroid Build Coastguard Worker unsigned EltSize);
76*9880d681SAndroid Build Coastguard Worker
77*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator mergeWrite2Pair(
78*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator I,
79*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator Paired,
80*9880d681SAndroid Build Coastguard Worker unsigned EltSize);
81*9880d681SAndroid Build Coastguard Worker
82*9880d681SAndroid Build Coastguard Worker public:
83*9880d681SAndroid Build Coastguard Worker static char ID;
84*9880d681SAndroid Build Coastguard Worker
SILoadStoreOptimizer()85*9880d681SAndroid Build Coastguard Worker SILoadStoreOptimizer()
86*9880d681SAndroid Build Coastguard Worker : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
87*9880d681SAndroid Build Coastguard Worker LIS(nullptr) {}
88*9880d681SAndroid Build Coastguard Worker
SILoadStoreOptimizer(const TargetMachine & TM_)89*9880d681SAndroid Build Coastguard Worker SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
90*9880d681SAndroid Build Coastguard Worker initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
91*9880d681SAndroid Build Coastguard Worker }
92*9880d681SAndroid Build Coastguard Worker
93*9880d681SAndroid Build Coastguard Worker bool optimizeBlock(MachineBasicBlock &MBB);
94*9880d681SAndroid Build Coastguard Worker
95*9880d681SAndroid Build Coastguard Worker bool runOnMachineFunction(MachineFunction &MF) override;
96*9880d681SAndroid Build Coastguard Worker
getPassName() const97*9880d681SAndroid Build Coastguard Worker const char *getPassName() const override {
98*9880d681SAndroid Build Coastguard Worker return "SI Load / Store Optimizer";
99*9880d681SAndroid Build Coastguard Worker }
100*9880d681SAndroid Build Coastguard Worker
getAnalysisUsage(AnalysisUsage & AU) const101*9880d681SAndroid Build Coastguard Worker void getAnalysisUsage(AnalysisUsage &AU) const override {
102*9880d681SAndroid Build Coastguard Worker AU.setPreservesCFG();
103*9880d681SAndroid Build Coastguard Worker AU.addPreserved<SlotIndexes>();
104*9880d681SAndroid Build Coastguard Worker AU.addPreserved<LiveIntervals>();
105*9880d681SAndroid Build Coastguard Worker AU.addPreserved<LiveVariables>();
106*9880d681SAndroid Build Coastguard Worker AU.addRequired<LiveIntervals>();
107*9880d681SAndroid Build Coastguard Worker
108*9880d681SAndroid Build Coastguard Worker MachineFunctionPass::getAnalysisUsage(AU);
109*9880d681SAndroid Build Coastguard Worker }
110*9880d681SAndroid Build Coastguard Worker };
111*9880d681SAndroid Build Coastguard Worker
112*9880d681SAndroid Build Coastguard Worker } // End anonymous namespace.
113*9880d681SAndroid Build Coastguard Worker
// Pass registration: declares the pass under the DEBUG_TYPE name with the
// description string below, and lists the analyses whose initialization it
// depends on.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
                      "SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
                    "SI Load / Store Optimizer", false, false)

// Definition of the static pass ID that both constructors hand to
// MachineFunctionPass.
char SILoadStoreOptimizer::ID = 0;

// Reference to the pass ID made available in the llvm namespace, since the
// class itself lives in an anonymous namespace.
char &llvm::SILoadStoreOptimizerID = SILoadStoreOptimizer::ID;
125*9880d681SAndroid Build Coastguard Worker
createSILoadStoreOptimizerPass(TargetMachine & TM)126*9880d681SAndroid Build Coastguard Worker FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
127*9880d681SAndroid Build Coastguard Worker return new SILoadStoreOptimizer(TM);
128*9880d681SAndroid Build Coastguard Worker }
129*9880d681SAndroid Build Coastguard Worker
offsetsCanBeCombined(unsigned Offset0,unsigned Offset1,unsigned Size)130*9880d681SAndroid Build Coastguard Worker bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
131*9880d681SAndroid Build Coastguard Worker unsigned Offset1,
132*9880d681SAndroid Build Coastguard Worker unsigned Size) {
133*9880d681SAndroid Build Coastguard Worker // XXX - Would the same offset be OK? Is there any reason this would happen or
134*9880d681SAndroid Build Coastguard Worker // be useful?
135*9880d681SAndroid Build Coastguard Worker if (Offset0 == Offset1)
136*9880d681SAndroid Build Coastguard Worker return false;
137*9880d681SAndroid Build Coastguard Worker
138*9880d681SAndroid Build Coastguard Worker // This won't be valid if the offset isn't aligned.
139*9880d681SAndroid Build Coastguard Worker if ((Offset0 % Size != 0) || (Offset1 % Size != 0))
140*9880d681SAndroid Build Coastguard Worker return false;
141*9880d681SAndroid Build Coastguard Worker
142*9880d681SAndroid Build Coastguard Worker unsigned EltOffset0 = Offset0 / Size;
143*9880d681SAndroid Build Coastguard Worker unsigned EltOffset1 = Offset1 / Size;
144*9880d681SAndroid Build Coastguard Worker
145*9880d681SAndroid Build Coastguard Worker // Check if the new offsets fit in the reduced 8-bit range.
146*9880d681SAndroid Build Coastguard Worker if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1))
147*9880d681SAndroid Build Coastguard Worker return true;
148*9880d681SAndroid Build Coastguard Worker
149*9880d681SAndroid Build Coastguard Worker // If the offset in elements doesn't fit in 8-bits, we might be able to use
150*9880d681SAndroid Build Coastguard Worker // the stride 64 versions.
151*9880d681SAndroid Build Coastguard Worker if ((EltOffset0 % 64 != 0) || (EltOffset1 % 64) != 0)
152*9880d681SAndroid Build Coastguard Worker return false;
153*9880d681SAndroid Build Coastguard Worker
154*9880d681SAndroid Build Coastguard Worker return isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64);
155*9880d681SAndroid Build Coastguard Worker }
156*9880d681SAndroid Build Coastguard Worker
157*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator
findMatchingDSInst(MachineBasicBlock::iterator I,unsigned EltSize)158*9880d681SAndroid Build Coastguard Worker SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
159*9880d681SAndroid Build Coastguard Worker unsigned EltSize){
160*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator E = I->getParent()->end();
161*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator MBBI = I;
162*9880d681SAndroid Build Coastguard Worker ++MBBI;
163*9880d681SAndroid Build Coastguard Worker
164*9880d681SAndroid Build Coastguard Worker if (MBBI->getOpcode() != I->getOpcode())
165*9880d681SAndroid Build Coastguard Worker return E;
166*9880d681SAndroid Build Coastguard Worker
167*9880d681SAndroid Build Coastguard Worker // Don't merge volatiles.
168*9880d681SAndroid Build Coastguard Worker if (MBBI->hasOrderedMemoryRef())
169*9880d681SAndroid Build Coastguard Worker return E;
170*9880d681SAndroid Build Coastguard Worker
171*9880d681SAndroid Build Coastguard Worker int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
172*9880d681SAndroid Build Coastguard Worker const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
173*9880d681SAndroid Build Coastguard Worker const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
174*9880d681SAndroid Build Coastguard Worker
175*9880d681SAndroid Build Coastguard Worker // Check same base pointer. Be careful of subregisters, which can occur with
176*9880d681SAndroid Build Coastguard Worker // vectors of pointers.
177*9880d681SAndroid Build Coastguard Worker if (AddrReg0.getReg() == AddrReg1.getReg() &&
178*9880d681SAndroid Build Coastguard Worker AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
179*9880d681SAndroid Build Coastguard Worker int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
180*9880d681SAndroid Build Coastguard Worker AMDGPU::OpName::offset);
181*9880d681SAndroid Build Coastguard Worker unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
182*9880d681SAndroid Build Coastguard Worker unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
183*9880d681SAndroid Build Coastguard Worker
184*9880d681SAndroid Build Coastguard Worker // Check both offsets fit in the reduced range.
185*9880d681SAndroid Build Coastguard Worker if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
186*9880d681SAndroid Build Coastguard Worker return MBBI;
187*9880d681SAndroid Build Coastguard Worker }
188*9880d681SAndroid Build Coastguard Worker
189*9880d681SAndroid Build Coastguard Worker return E;
190*9880d681SAndroid Build Coastguard Worker }
191*9880d681SAndroid Build Coastguard Worker
mergeRead2Pair(MachineBasicBlock::iterator I,MachineBasicBlock::iterator Paired,unsigned EltSize)192*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
193*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator I,
194*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator Paired,
195*9880d681SAndroid Build Coastguard Worker unsigned EltSize) {
196*9880d681SAndroid Build Coastguard Worker MachineBasicBlock *MBB = I->getParent();
197*9880d681SAndroid Build Coastguard Worker
198*9880d681SAndroid Build Coastguard Worker // Be careful, since the addresses could be subregisters themselves in weird
199*9880d681SAndroid Build Coastguard Worker // cases, like vectors of pointers.
200*9880d681SAndroid Build Coastguard Worker const MachineOperand *AddrReg = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
201*9880d681SAndroid Build Coastguard Worker
202*9880d681SAndroid Build Coastguard Worker const MachineOperand *Dest0 = TII->getNamedOperand(*I, AMDGPU::OpName::vdst);
203*9880d681SAndroid Build Coastguard Worker const MachineOperand *Dest1 = TII->getNamedOperand(*Paired, AMDGPU::OpName::vdst);
204*9880d681SAndroid Build Coastguard Worker
205*9880d681SAndroid Build Coastguard Worker unsigned Offset0
206*9880d681SAndroid Build Coastguard Worker = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
207*9880d681SAndroid Build Coastguard Worker unsigned Offset1
208*9880d681SAndroid Build Coastguard Worker = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
209*9880d681SAndroid Build Coastguard Worker
210*9880d681SAndroid Build Coastguard Worker unsigned NewOffset0 = Offset0 / EltSize;
211*9880d681SAndroid Build Coastguard Worker unsigned NewOffset1 = Offset1 / EltSize;
212*9880d681SAndroid Build Coastguard Worker unsigned Opc = (EltSize == 4) ? AMDGPU::DS_READ2_B32 : AMDGPU::DS_READ2_B64;
213*9880d681SAndroid Build Coastguard Worker
214*9880d681SAndroid Build Coastguard Worker // Prefer the st64 form if we can use it, even if we can fit the offset in the
215*9880d681SAndroid Build Coastguard Worker // non st64 version. I'm not sure if there's any real reason to do this.
216*9880d681SAndroid Build Coastguard Worker bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
217*9880d681SAndroid Build Coastguard Worker if (UseST64) {
218*9880d681SAndroid Build Coastguard Worker NewOffset0 /= 64;
219*9880d681SAndroid Build Coastguard Worker NewOffset1 /= 64;
220*9880d681SAndroid Build Coastguard Worker Opc = (EltSize == 4) ? AMDGPU::DS_READ2ST64_B32 : AMDGPU::DS_READ2ST64_B64;
221*9880d681SAndroid Build Coastguard Worker }
222*9880d681SAndroid Build Coastguard Worker
223*9880d681SAndroid Build Coastguard Worker assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
224*9880d681SAndroid Build Coastguard Worker (NewOffset0 != NewOffset1) &&
225*9880d681SAndroid Build Coastguard Worker "Computed offset doesn't fit");
226*9880d681SAndroid Build Coastguard Worker
227*9880d681SAndroid Build Coastguard Worker const MCInstrDesc &Read2Desc = TII->get(Opc);
228*9880d681SAndroid Build Coastguard Worker
229*9880d681SAndroid Build Coastguard Worker const TargetRegisterClass *SuperRC
230*9880d681SAndroid Build Coastguard Worker = (EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
231*9880d681SAndroid Build Coastguard Worker unsigned DestReg = MRI->createVirtualRegister(SuperRC);
232*9880d681SAndroid Build Coastguard Worker
233*9880d681SAndroid Build Coastguard Worker DebugLoc DL = I->getDebugLoc();
234*9880d681SAndroid Build Coastguard Worker MachineInstrBuilder Read2
235*9880d681SAndroid Build Coastguard Worker = BuildMI(*MBB, I, DL, Read2Desc, DestReg)
236*9880d681SAndroid Build Coastguard Worker .addOperand(*AddrReg) // addr
237*9880d681SAndroid Build Coastguard Worker .addImm(NewOffset0) // offset0
238*9880d681SAndroid Build Coastguard Worker .addImm(NewOffset1) // offset1
239*9880d681SAndroid Build Coastguard Worker .addImm(0) // gds
240*9880d681SAndroid Build Coastguard Worker .addMemOperand(*I->memoperands_begin())
241*9880d681SAndroid Build Coastguard Worker .addMemOperand(*Paired->memoperands_begin());
242*9880d681SAndroid Build Coastguard Worker
243*9880d681SAndroid Build Coastguard Worker unsigned SubRegIdx0 = (EltSize == 4) ? AMDGPU::sub0 : AMDGPU::sub0_sub1;
244*9880d681SAndroid Build Coastguard Worker unsigned SubRegIdx1 = (EltSize == 4) ? AMDGPU::sub1 : AMDGPU::sub2_sub3;
245*9880d681SAndroid Build Coastguard Worker
246*9880d681SAndroid Build Coastguard Worker const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
247*9880d681SAndroid Build Coastguard Worker
248*9880d681SAndroid Build Coastguard Worker // Copy to the old destination registers.
249*9880d681SAndroid Build Coastguard Worker MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
250*9880d681SAndroid Build Coastguard Worker .addOperand(*Dest0) // Copy to same destination including flags and sub reg.
251*9880d681SAndroid Build Coastguard Worker .addReg(DestReg, 0, SubRegIdx0);
252*9880d681SAndroid Build Coastguard Worker MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
253*9880d681SAndroid Build Coastguard Worker .addOperand(*Dest1)
254*9880d681SAndroid Build Coastguard Worker .addReg(DestReg, RegState::Kill, SubRegIdx1);
255*9880d681SAndroid Build Coastguard Worker
256*9880d681SAndroid Build Coastguard Worker LIS->InsertMachineInstrInMaps(*Read2);
257*9880d681SAndroid Build Coastguard Worker
258*9880d681SAndroid Build Coastguard Worker // repairLiveintervalsInRange() doesn't handle physical register, so we have
259*9880d681SAndroid Build Coastguard Worker // to update the M0 range manually.
260*9880d681SAndroid Build Coastguard Worker SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
261*9880d681SAndroid Build Coastguard Worker LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
262*9880d681SAndroid Build Coastguard Worker LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
263*9880d681SAndroid Build Coastguard Worker bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
264*9880d681SAndroid Build Coastguard Worker
265*9880d681SAndroid Build Coastguard Worker // The new write to the original destination register is now the copy. Steal
266*9880d681SAndroid Build Coastguard Worker // the old SlotIndex.
267*9880d681SAndroid Build Coastguard Worker LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
268*9880d681SAndroid Build Coastguard Worker LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
269*9880d681SAndroid Build Coastguard Worker
270*9880d681SAndroid Build Coastguard Worker I->eraseFromParent();
271*9880d681SAndroid Build Coastguard Worker Paired->eraseFromParent();
272*9880d681SAndroid Build Coastguard Worker
273*9880d681SAndroid Build Coastguard Worker LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
274*9880d681SAndroid Build Coastguard Worker LIS->shrinkToUses(&AddrRegLI);
275*9880d681SAndroid Build Coastguard Worker
276*9880d681SAndroid Build Coastguard Worker LIS->createAndComputeVirtRegInterval(DestReg);
277*9880d681SAndroid Build Coastguard Worker
278*9880d681SAndroid Build Coastguard Worker if (UpdateM0Range) {
279*9880d681SAndroid Build Coastguard Worker SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
280*9880d681SAndroid Build Coastguard Worker M0Segment->end = Read2Index.getRegSlot();
281*9880d681SAndroid Build Coastguard Worker }
282*9880d681SAndroid Build Coastguard Worker
283*9880d681SAndroid Build Coastguard Worker DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
284*9880d681SAndroid Build Coastguard Worker return Read2.getInstr();
285*9880d681SAndroid Build Coastguard Worker }
286*9880d681SAndroid Build Coastguard Worker
mergeWrite2Pair(MachineBasicBlock::iterator I,MachineBasicBlock::iterator Paired,unsigned EltSize)287*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
288*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator I,
289*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator Paired,
290*9880d681SAndroid Build Coastguard Worker unsigned EltSize) {
291*9880d681SAndroid Build Coastguard Worker MachineBasicBlock *MBB = I->getParent();
292*9880d681SAndroid Build Coastguard Worker
293*9880d681SAndroid Build Coastguard Worker // Be sure to use .addOperand(), and not .addReg() with these. We want to be
294*9880d681SAndroid Build Coastguard Worker // sure we preserve the subregister index and any register flags set on them.
295*9880d681SAndroid Build Coastguard Worker const MachineOperand *Addr = TII->getNamedOperand(*I, AMDGPU::OpName::addr);
296*9880d681SAndroid Build Coastguard Worker const MachineOperand *Data0 = TII->getNamedOperand(*I, AMDGPU::OpName::data0);
297*9880d681SAndroid Build Coastguard Worker const MachineOperand *Data1
298*9880d681SAndroid Build Coastguard Worker = TII->getNamedOperand(*Paired, AMDGPU::OpName::data0);
299*9880d681SAndroid Build Coastguard Worker
300*9880d681SAndroid Build Coastguard Worker
301*9880d681SAndroid Build Coastguard Worker unsigned Offset0
302*9880d681SAndroid Build Coastguard Worker = TII->getNamedOperand(*I, AMDGPU::OpName::offset)->getImm() & 0xffff;
303*9880d681SAndroid Build Coastguard Worker unsigned Offset1
304*9880d681SAndroid Build Coastguard Worker = TII->getNamedOperand(*Paired, AMDGPU::OpName::offset)->getImm() & 0xffff;
305*9880d681SAndroid Build Coastguard Worker
306*9880d681SAndroid Build Coastguard Worker unsigned NewOffset0 = Offset0 / EltSize;
307*9880d681SAndroid Build Coastguard Worker unsigned NewOffset1 = Offset1 / EltSize;
308*9880d681SAndroid Build Coastguard Worker unsigned Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2_B32 : AMDGPU::DS_WRITE2_B64;
309*9880d681SAndroid Build Coastguard Worker
310*9880d681SAndroid Build Coastguard Worker // Prefer the st64 form if we can use it, even if we can fit the offset in the
311*9880d681SAndroid Build Coastguard Worker // non st64 version. I'm not sure if there's any real reason to do this.
312*9880d681SAndroid Build Coastguard Worker bool UseST64 = (NewOffset0 % 64 == 0) && (NewOffset1 % 64 == 0);
313*9880d681SAndroid Build Coastguard Worker if (UseST64) {
314*9880d681SAndroid Build Coastguard Worker NewOffset0 /= 64;
315*9880d681SAndroid Build Coastguard Worker NewOffset1 /= 64;
316*9880d681SAndroid Build Coastguard Worker Opc = (EltSize == 4) ? AMDGPU::DS_WRITE2ST64_B32 : AMDGPU::DS_WRITE2ST64_B64;
317*9880d681SAndroid Build Coastguard Worker }
318*9880d681SAndroid Build Coastguard Worker
319*9880d681SAndroid Build Coastguard Worker assert((isUInt<8>(NewOffset0) && isUInt<8>(NewOffset1)) &&
320*9880d681SAndroid Build Coastguard Worker (NewOffset0 != NewOffset1) &&
321*9880d681SAndroid Build Coastguard Worker "Computed offset doesn't fit");
322*9880d681SAndroid Build Coastguard Worker
323*9880d681SAndroid Build Coastguard Worker const MCInstrDesc &Write2Desc = TII->get(Opc);
324*9880d681SAndroid Build Coastguard Worker DebugLoc DL = I->getDebugLoc();
325*9880d681SAndroid Build Coastguard Worker
326*9880d681SAndroid Build Coastguard Worker // repairLiveintervalsInRange() doesn't handle physical register, so we have
327*9880d681SAndroid Build Coastguard Worker // to update the M0 range manually.
328*9880d681SAndroid Build Coastguard Worker SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
329*9880d681SAndroid Build Coastguard Worker LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
330*9880d681SAndroid Build Coastguard Worker LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
331*9880d681SAndroid Build Coastguard Worker bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
332*9880d681SAndroid Build Coastguard Worker
333*9880d681SAndroid Build Coastguard Worker MachineInstrBuilder Write2
334*9880d681SAndroid Build Coastguard Worker = BuildMI(*MBB, I, DL, Write2Desc)
335*9880d681SAndroid Build Coastguard Worker .addOperand(*Addr) // addr
336*9880d681SAndroid Build Coastguard Worker .addOperand(*Data0) // data0
337*9880d681SAndroid Build Coastguard Worker .addOperand(*Data1) // data1
338*9880d681SAndroid Build Coastguard Worker .addImm(NewOffset0) // offset0
339*9880d681SAndroid Build Coastguard Worker .addImm(NewOffset1) // offset1
340*9880d681SAndroid Build Coastguard Worker .addImm(0) // gds
341*9880d681SAndroid Build Coastguard Worker .addMemOperand(*I->memoperands_begin())
342*9880d681SAndroid Build Coastguard Worker .addMemOperand(*Paired->memoperands_begin());
343*9880d681SAndroid Build Coastguard Worker
344*9880d681SAndroid Build Coastguard Worker // XXX - How do we express subregisters here?
345*9880d681SAndroid Build Coastguard Worker unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
346*9880d681SAndroid Build Coastguard Worker
347*9880d681SAndroid Build Coastguard Worker LIS->RemoveMachineInstrFromMaps(*I);
348*9880d681SAndroid Build Coastguard Worker LIS->RemoveMachineInstrFromMaps(*Paired);
349*9880d681SAndroid Build Coastguard Worker I->eraseFromParent();
350*9880d681SAndroid Build Coastguard Worker Paired->eraseFromParent();
351*9880d681SAndroid Build Coastguard Worker
352*9880d681SAndroid Build Coastguard Worker // This doesn't handle physical registers like M0
353*9880d681SAndroid Build Coastguard Worker LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
354*9880d681SAndroid Build Coastguard Worker
355*9880d681SAndroid Build Coastguard Worker if (UpdateM0Range) {
356*9880d681SAndroid Build Coastguard Worker SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
357*9880d681SAndroid Build Coastguard Worker M0Segment->end = Write2Index.getRegSlot();
358*9880d681SAndroid Build Coastguard Worker }
359*9880d681SAndroid Build Coastguard Worker
360*9880d681SAndroid Build Coastguard Worker DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
361*9880d681SAndroid Build Coastguard Worker return Write2.getInstr();
362*9880d681SAndroid Build Coastguard Worker }
363*9880d681SAndroid Build Coastguard Worker
364*9880d681SAndroid Build Coastguard Worker // Scan through looking for adjacent LDS operations with constant offsets from
365*9880d681SAndroid Build Coastguard Worker // the same base register. We rely on the scheduler to do the hard work of
366*9880d681SAndroid Build Coastguard Worker // clustering nearby loads, and assume these are all adjacent.
optimizeBlock(MachineBasicBlock & MBB)367*9880d681SAndroid Build Coastguard Worker bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
368*9880d681SAndroid Build Coastguard Worker bool Modified = false;
369*9880d681SAndroid Build Coastguard Worker
370*9880d681SAndroid Build Coastguard Worker for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
371*9880d681SAndroid Build Coastguard Worker MachineInstr &MI = *I;
372*9880d681SAndroid Build Coastguard Worker
373*9880d681SAndroid Build Coastguard Worker // Don't combine if volatile.
374*9880d681SAndroid Build Coastguard Worker if (MI.hasOrderedMemoryRef()) {
375*9880d681SAndroid Build Coastguard Worker ++I;
376*9880d681SAndroid Build Coastguard Worker continue;
377*9880d681SAndroid Build Coastguard Worker }
378*9880d681SAndroid Build Coastguard Worker
379*9880d681SAndroid Build Coastguard Worker unsigned Opc = MI.getOpcode();
380*9880d681SAndroid Build Coastguard Worker if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
381*9880d681SAndroid Build Coastguard Worker unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
382*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
383*9880d681SAndroid Build Coastguard Worker if (Match != E) {
384*9880d681SAndroid Build Coastguard Worker Modified = true;
385*9880d681SAndroid Build Coastguard Worker I = mergeRead2Pair(I, Match, Size);
386*9880d681SAndroid Build Coastguard Worker } else {
387*9880d681SAndroid Build Coastguard Worker ++I;
388*9880d681SAndroid Build Coastguard Worker }
389*9880d681SAndroid Build Coastguard Worker
390*9880d681SAndroid Build Coastguard Worker continue;
391*9880d681SAndroid Build Coastguard Worker } else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
392*9880d681SAndroid Build Coastguard Worker unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
393*9880d681SAndroid Build Coastguard Worker MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
394*9880d681SAndroid Build Coastguard Worker if (Match != E) {
395*9880d681SAndroid Build Coastguard Worker Modified = true;
396*9880d681SAndroid Build Coastguard Worker I = mergeWrite2Pair(I, Match, Size);
397*9880d681SAndroid Build Coastguard Worker } else {
398*9880d681SAndroid Build Coastguard Worker ++I;
399*9880d681SAndroid Build Coastguard Worker }
400*9880d681SAndroid Build Coastguard Worker
401*9880d681SAndroid Build Coastguard Worker continue;
402*9880d681SAndroid Build Coastguard Worker }
403*9880d681SAndroid Build Coastguard Worker
404*9880d681SAndroid Build Coastguard Worker ++I;
405*9880d681SAndroid Build Coastguard Worker }
406*9880d681SAndroid Build Coastguard Worker
407*9880d681SAndroid Build Coastguard Worker return Modified;
408*9880d681SAndroid Build Coastguard Worker }
409*9880d681SAndroid Build Coastguard Worker
runOnMachineFunction(MachineFunction & MF)410*9880d681SAndroid Build Coastguard Worker bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
411*9880d681SAndroid Build Coastguard Worker if (skipFunction(*MF.getFunction()))
412*9880d681SAndroid Build Coastguard Worker return false;
413*9880d681SAndroid Build Coastguard Worker
414*9880d681SAndroid Build Coastguard Worker const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
415*9880d681SAndroid Build Coastguard Worker if (!STM.loadStoreOptEnabled())
416*9880d681SAndroid Build Coastguard Worker return false;
417*9880d681SAndroid Build Coastguard Worker
418*9880d681SAndroid Build Coastguard Worker TII = STM.getInstrInfo();
419*9880d681SAndroid Build Coastguard Worker TRI = &TII->getRegisterInfo();
420*9880d681SAndroid Build Coastguard Worker
421*9880d681SAndroid Build Coastguard Worker MRI = &MF.getRegInfo();
422*9880d681SAndroid Build Coastguard Worker
423*9880d681SAndroid Build Coastguard Worker LIS = &getAnalysis<LiveIntervals>();
424*9880d681SAndroid Build Coastguard Worker
425*9880d681SAndroid Build Coastguard Worker DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
426*9880d681SAndroid Build Coastguard Worker
427*9880d681SAndroid Build Coastguard Worker assert(!MRI->isSSA());
428*9880d681SAndroid Build Coastguard Worker
429*9880d681SAndroid Build Coastguard Worker bool Modified = false;
430*9880d681SAndroid Build Coastguard Worker
431*9880d681SAndroid Build Coastguard Worker for (MachineBasicBlock &MBB : MF)
432*9880d681SAndroid Build Coastguard Worker Modified |= optimizeBlock(MBB);
433*9880d681SAndroid Build Coastguard Worker
434*9880d681SAndroid Build Coastguard Worker return Modified;
435*9880d681SAndroid Build Coastguard Worker }
436