]>
Commit | Line | Data |
---|---|---|
1a4d82fc JJ |
1 | //=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=// |
2 | // | |
3 | // The LLVM Compiler Infrastructure | |
4 | // | |
5 | // This file is distributed under the University of Illinois Open Source | |
6 | // License. See LICENSE.TXT for details. | |
7 | // | |
8 | //===----------------------------------------------------------------------===// | |
9 | // | |
10 | // This file contains a pass that performs load / store related peephole | |
11 | // optimizations. This pass should be run after register allocation. | |
12 | // | |
13 | //===----------------------------------------------------------------------===// | |
14 | ||
15 | #include "AArch64InstrInfo.h" | |
16 | #include "AArch64Subtarget.h" | |
17 | #include "MCTargetDesc/AArch64AddressingModes.h" | |
18 | #include "llvm/ADT/BitVector.h" | |
19 | #include "llvm/ADT/Statistic.h" | |
20 | #include "llvm/CodeGen/MachineBasicBlock.h" | |
21 | #include "llvm/CodeGen/MachineFunctionPass.h" | |
22 | #include "llvm/CodeGen/MachineInstr.h" | |
23 | #include "llvm/CodeGen/MachineInstrBuilder.h" | |
24 | #include "llvm/Support/CommandLine.h" | |
25 | #include "llvm/Support/Debug.h" | |
26 | #include "llvm/Support/ErrorHandling.h" | |
27 | #include "llvm/Support/raw_ostream.h" | |
28 | #include "llvm/Target/TargetInstrInfo.h" | |
29 | #include "llvm/Target/TargetMachine.h" | |
30 | #include "llvm/Target/TargetRegisterInfo.h" | |
31 | using namespace llvm; | |
32 | ||
33 | #define DEBUG_TYPE "aarch64-ldst-opt" | |
34 | ||
35 | /// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine | |
36 | /// load / store instructions to form ldp / stp instructions. | |
37 | ||
// Pass statistics (reported with -stats).
STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
STATISTIC(NumPostFolded, "Number of post-index updates folded");
STATISTIC(NumPreFolded, "Number of pre-index updates folded");
STATISTIC(NumUnscaledPairCreated,
          "Number of load/store from unscaled generated");

// Maximum number of real (non-debug) instructions to scan when searching for
// a pairable memory access or a mergeable base-register update.
static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
                                   cl::init(20), cl::Hidden);

// Place holder while testing unscaled load/store combining
static cl::opt<bool> EnableAArch64UnscaledMemOp(
    "aarch64-unscaled-mem-op", cl::Hidden,
    cl::desc("Allow AArch64 unscaled load/store combining"), cl::init(true));
51 | ||
namespace {
/// Post-register-allocation pass that (1) combines adjacent loads/stores into
/// ldp/stp pair instructions and (2) folds base-register add/sub updates into
/// pre/post-indexed (writeback) load/store forms.
struct AArch64LoadStoreOpt : public MachineFunctionPass {
  static char ID;
  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}

  // Cached target hooks; presumably initialized in runOnMachineFunction
  // before any use (its body is not visible in this chunk) — TODO confirm.
  const AArch64InstrInfo *TII;
  const TargetRegisterInfo *TRI;

  // Scan the instructions looking for a load/store that can be combined
  // with the current instruction into a load/store pair.
  // Return the matching instruction if one is found, else MBB->end().
  // If a matching instruction is found, MergeForward is set to true if the
  // merge is to remove the first instruction and replace the second with
  // a pair-wise insn, and false if the reverse is true.
  MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
                                               bool &MergeForward,
                                               unsigned Limit);
  // Merge the two instructions indicated into a single pair-wise instruction.
  // If MergeForward is true, erase the first instruction and fold its
  // operation into the second. If false, the reverse. Return the instruction
  // following the first instruction (which may change during processing).
  MachineBasicBlock::iterator
  mergePairedInsns(MachineBasicBlock::iterator I,
                   MachineBasicBlock::iterator Paired, bool MergeForward);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan forwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnForward(MachineBasicBlock::iterator I, unsigned Limit,
                                int Value);

  // Scan the instruction list to find a base register update that can
  // be combined with the current instruction (a load or store) using
  // pre or post indexed addressing with writeback. Scan backwards.
  MachineBasicBlock::iterator
  findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);

  // Merge a pre-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                        MachineBasicBlock::iterator Update);

  // Merge a post-index base register update into a ld/st instruction.
  MachineBasicBlock::iterator
  mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
                         MachineBasicBlock::iterator Update);

  // Run both transformations (pairing and writeback folding) over one block.
  bool optimizeBlock(MachineBasicBlock &MBB);

  bool runOnMachineFunction(MachineFunction &Fn) override;

  const char *getPassName() const override {
    return "AArch64 load / store optimization pass";
  }

private:
  // Size in bytes of the memory access performed by MemMI.
  int getMemSize(MachineInstr *MemMI);
};
char AArch64LoadStoreOpt::ID = 0;
} // namespace
113 | ||
114 | static bool isUnscaledLdst(unsigned Opc) { | |
115 | switch (Opc) { | |
116 | default: | |
117 | return false; | |
118 | case AArch64::STURSi: | |
119 | return true; | |
120 | case AArch64::STURDi: | |
121 | return true; | |
122 | case AArch64::STURQi: | |
123 | return true; | |
124 | case AArch64::STURWi: | |
125 | return true; | |
126 | case AArch64::STURXi: | |
127 | return true; | |
128 | case AArch64::LDURSi: | |
129 | return true; | |
130 | case AArch64::LDURDi: | |
131 | return true; | |
132 | case AArch64::LDURQi: | |
133 | return true; | |
134 | case AArch64::LDURWi: | |
135 | return true; | |
136 | case AArch64::LDURXi: | |
137 | return true; | |
138 | } | |
139 | } | |
140 | ||
141 | // Size in bytes of the data moved by an unscaled load or store | |
142 | int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) { | |
143 | switch (MemMI->getOpcode()) { | |
144 | default: | |
145 | llvm_unreachable("Opcode has unknown size!"); | |
146 | case AArch64::STRSui: | |
147 | case AArch64::STURSi: | |
148 | return 4; | |
149 | case AArch64::STRDui: | |
150 | case AArch64::STURDi: | |
151 | return 8; | |
152 | case AArch64::STRQui: | |
153 | case AArch64::STURQi: | |
154 | return 16; | |
155 | case AArch64::STRWui: | |
156 | case AArch64::STURWi: | |
157 | return 4; | |
158 | case AArch64::STRXui: | |
159 | case AArch64::STURXi: | |
160 | return 8; | |
161 | case AArch64::LDRSui: | |
162 | case AArch64::LDURSi: | |
163 | return 4; | |
164 | case AArch64::LDRDui: | |
165 | case AArch64::LDURDi: | |
166 | return 8; | |
167 | case AArch64::LDRQui: | |
168 | case AArch64::LDURQi: | |
169 | return 16; | |
170 | case AArch64::LDRWui: | |
171 | case AArch64::LDURWi: | |
172 | return 4; | |
173 | case AArch64::LDRXui: | |
174 | case AArch64::LDURXi: | |
175 | return 8; | |
176 | } | |
177 | } | |
178 | ||
179 | static unsigned getMatchingPairOpcode(unsigned Opc) { | |
180 | switch (Opc) { | |
181 | default: | |
182 | llvm_unreachable("Opcode has no pairwise equivalent!"); | |
183 | case AArch64::STRSui: | |
184 | case AArch64::STURSi: | |
185 | return AArch64::STPSi; | |
186 | case AArch64::STRDui: | |
187 | case AArch64::STURDi: | |
188 | return AArch64::STPDi; | |
189 | case AArch64::STRQui: | |
190 | case AArch64::STURQi: | |
191 | return AArch64::STPQi; | |
192 | case AArch64::STRWui: | |
193 | case AArch64::STURWi: | |
194 | return AArch64::STPWi; | |
195 | case AArch64::STRXui: | |
196 | case AArch64::STURXi: | |
197 | return AArch64::STPXi; | |
198 | case AArch64::LDRSui: | |
199 | case AArch64::LDURSi: | |
200 | return AArch64::LDPSi; | |
201 | case AArch64::LDRDui: | |
202 | case AArch64::LDURDi: | |
203 | return AArch64::LDPDi; | |
204 | case AArch64::LDRQui: | |
205 | case AArch64::LDURQi: | |
206 | return AArch64::LDPQi; | |
207 | case AArch64::LDRWui: | |
208 | case AArch64::LDURWi: | |
209 | return AArch64::LDPWi; | |
210 | case AArch64::LDRXui: | |
211 | case AArch64::LDURXi: | |
212 | return AArch64::LDPXi; | |
213 | } | |
214 | } | |
215 | ||
216 | static unsigned getPreIndexedOpcode(unsigned Opc) { | |
217 | switch (Opc) { | |
218 | default: | |
219 | llvm_unreachable("Opcode has no pre-indexed equivalent!"); | |
220 | case AArch64::STRSui: | |
221 | return AArch64::STRSpre; | |
222 | case AArch64::STRDui: | |
223 | return AArch64::STRDpre; | |
224 | case AArch64::STRQui: | |
225 | return AArch64::STRQpre; | |
226 | case AArch64::STRWui: | |
227 | return AArch64::STRWpre; | |
228 | case AArch64::STRXui: | |
229 | return AArch64::STRXpre; | |
230 | case AArch64::LDRSui: | |
231 | return AArch64::LDRSpre; | |
232 | case AArch64::LDRDui: | |
233 | return AArch64::LDRDpre; | |
234 | case AArch64::LDRQui: | |
235 | return AArch64::LDRQpre; | |
236 | case AArch64::LDRWui: | |
237 | return AArch64::LDRWpre; | |
238 | case AArch64::LDRXui: | |
239 | return AArch64::LDRXpre; | |
240 | } | |
241 | } | |
242 | ||
243 | static unsigned getPostIndexedOpcode(unsigned Opc) { | |
244 | switch (Opc) { | |
245 | default: | |
246 | llvm_unreachable("Opcode has no post-indexed wise equivalent!"); | |
247 | case AArch64::STRSui: | |
248 | return AArch64::STRSpost; | |
249 | case AArch64::STRDui: | |
250 | return AArch64::STRDpost; | |
251 | case AArch64::STRQui: | |
252 | return AArch64::STRQpost; | |
253 | case AArch64::STRWui: | |
254 | return AArch64::STRWpost; | |
255 | case AArch64::STRXui: | |
256 | return AArch64::STRXpost; | |
257 | case AArch64::LDRSui: | |
258 | return AArch64::LDRSpost; | |
259 | case AArch64::LDRDui: | |
260 | return AArch64::LDRDpost; | |
261 | case AArch64::LDRQui: | |
262 | return AArch64::LDRQpost; | |
263 | case AArch64::LDRWui: | |
264 | return AArch64::LDRWpost; | |
265 | case AArch64::LDRXui: | |
266 | return AArch64::LDRXpost; | |
267 | } | |
268 | } | |
269 | ||
/// mergePairedInsns - Replace the load/store at \p I and the matching one at
/// \p Paired with a single pair instruction (ldp/stp). Both originals are
/// erased; returns the iterator following \p I (skipping \p Paired) so the
/// caller can resume scanning safely after the erasures.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator Paired,
                                      bool MergeForward) {
  MachineBasicBlock::iterator NextI = I;
  ++NextI;
  // If NextI is the second of the two instructions to be merged, we need
  // to skip one further. Either way we merge will invalidate the iterator,
  // and we don't need to scan the new instruction, as it's a pairwise
  // instruction, which we're not considering for further action anyway.
  if (NextI == Paired)
    ++NextI;

  bool IsUnscaled = isUnscaledLdst(I->getOpcode());
  // For unscaled inputs the immediates are byte offsets; the stride is the
  // access size so they can be converted to element offsets below.
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;

  unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
  // Insert our new paired instruction after whichever of the paired
  // instructions MergeForward indicates.
  MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
  // Also based on MergeForward is from where we copy the base register operand
  // so we get the flags compatible with the input code.
  MachineOperand &BaseRegOp =
      MergeForward ? Paired->getOperand(1) : I->getOperand(1);

  // Which register is Rt and which is Rt2 depends on the offset order:
  // the instruction with the lower offset supplies Rt.
  MachineInstr *RtMI, *Rt2MI;
  if (I->getOperand(2).getImm() ==
      Paired->getOperand(2).getImm() + OffsetStride) {
    RtMI = Paired;
    Rt2MI = I;
  } else {
    RtMI = I;
    Rt2MI = Paired;
  }
  // Handle Unscaled: convert the byte offset to the pair's element offset.
  int OffsetImm = RtMI->getOperand(2).getImm();
  if (IsUnscaled && EnableAArch64UnscaledMemOp)
    OffsetImm /= OffsetStride;

  // Construct the new instruction: NewOpc Rt, Rt2, [Base, #OffsetImm].
  MachineInstrBuilder MIB = BuildMI(*I->getParent(), InsertionPoint,
                                    I->getDebugLoc(), TII->get(NewOpc))
                                .addOperand(RtMI->getOperand(0))
                                .addOperand(Rt2MI->getOperand(0))
                                .addOperand(BaseRegOp)
                                .addImm(OffsetImm);
  (void)MIB;

  // FIXME: Do we need/want to copy the mem operands from the source
  // instructions? Probably. What uses them after this?

  DEBUG(dbgs() << "Creating pair load/store. Replacing instructions:\n ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << " ");
  DEBUG(Paired->print(dbgs()));
  DEBUG(dbgs() << " with instruction:\n ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions.
  I->eraseFromParent();
  Paired->eraseFromParent();

  return NextI;
}
337 | ||
338 | /// trackRegDefsUses - Remember what registers the specified instruction uses | |
339 | /// and modifies. | |
340 | static void trackRegDefsUses(MachineInstr *MI, BitVector &ModifiedRegs, | |
341 | BitVector &UsedRegs, | |
342 | const TargetRegisterInfo *TRI) { | |
343 | for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { | |
344 | MachineOperand &MO = MI->getOperand(i); | |
345 | if (MO.isRegMask()) | |
346 | ModifiedRegs.setBitsNotInMask(MO.getRegMask()); | |
347 | ||
348 | if (!MO.isReg()) | |
349 | continue; | |
350 | unsigned Reg = MO.getReg(); | |
351 | if (MO.isDef()) { | |
352 | for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) | |
353 | ModifiedRegs.set(*AI); | |
354 | } else { | |
355 | assert(MO.isUse() && "Reg operand not a def and not a use?!?"); | |
356 | for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) | |
357 | UsedRegs.set(*AI); | |
358 | } | |
359 | } | |
360 | } | |
361 | ||
/// Return true if \p Offset fits in the 7-bit signed, scaled immediate field
/// of a pair instruction ([-64, 63] elements). For an unscaled access,
/// \p Offset is in bytes and is first divided by \p OffsetStride to get the
/// element offset used by the scaled pair form.
static bool inBoundsForPair(bool IsUnscaled, int Offset, int OffsetStride) {
  const int ElemOffset = IsUnscaled ? Offset / OffsetStride : Offset;
  return ElemOffset >= -64 && ElemOffset <= 63;
}
374 | ||
// Round Num up to the next multiple of PowOf2 (which must be a power of two).
// Specialized for signed ints so callers avoid a C-style cast from uint64_t
// to int that using RoundUpToAlignment from
// include/llvm/Support/MathExtras.h would require.
// FIXME: Move this function to include/MathExtras.h?
static int alignTo(int Num, int PowOf2) {
  const int Mask = PowOf2 - 1;
  return (Num + Mask) & ~Mask;
}
382 | ||
/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
/// Scans up to \p Limit real (non-debug) instructions forward from \p I and
/// returns the match, or the block's end() if none is found. On success,
/// \p MergeForward is set to true when the first instruction should be folded
/// into the second (pair emitted at the second's position), false otherwise.
MachineBasicBlock::iterator
AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
                                      bool &MergeForward, unsigned Limit) {
  MachineBasicBlock::iterator E = I->getParent()->end();
  MachineBasicBlock::iterator MBBI = I;
  MachineInstr *FirstMI = I;
  ++MBBI;

  int Opc = FirstMI->getOpcode();
  bool MayLoad = FirstMI->mayLoad();
  bool IsUnscaled = isUnscaledLdst(Opc);
  unsigned Reg = FirstMI->getOperand(0).getReg();     // Rt of the first insn.
  unsigned BaseReg = FirstMI->getOperand(1).getReg(); // Base (Rn).
  int Offset = FirstMI->getOperand(2).getImm();

  // Early exit if the first instruction modifies the base register.
  // e.g., ldr x0, [x0]
  // Early exit if the offset is not possible to match. (6 bits of positive
  // range, plus allow an extra one in case we find a later insn that matches
  // with Offset-1.)
  if (FirstMI->modifiesRegister(BaseReg, TRI))
    return E;
  // Unscaled accesses use byte offsets; the stride (access size) converts
  // them to the element offsets that the scaled pair instruction encodes.
  int OffsetStride =
      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
  if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
    return E;

  // Track which registers have been modified and used between the first insn
  // (inclusive) and the second insn.
  BitVector ModifiedRegs, UsedRegs;
  ModifiedRegs.resize(TRI->getNumRegs());
  UsedRegs.resize(TRI->getNumRegs());
  for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
    MachineInstr *MI = MBBI;
    // Skip DBG_VALUE instructions. Otherwise debug info can affect the
    // optimization by changing how far we scan.
    if (MI->isDebugValue())
      continue;

    // Now that we know this is a real instruction, count it.
    ++Count;

    if (Opc == MI->getOpcode() && MI->getOperand(2).isImm()) {
      // If we've found another instruction with the same opcode, check to see
      // if the base and offset are compatible with our starting instruction.
      // These instructions all have scaled immediate operands, so we just
      // check for +1/-1. Make sure to check the new instruction offset is
      // actually an immediate and not a symbolic reference destined for
      // a relocation.
      //
      // Pairwise instructions have a 7-bit signed offset field. Single insns
      // have a 12-bit unsigned offset field. To be a valid combine, the
      // final offset must be in range.
      unsigned MIBaseReg = MI->getOperand(1).getReg();
      int MIOffset = MI->getOperand(2).getImm();
      if (BaseReg == MIBaseReg && ((Offset == MIOffset + OffsetStride) ||
                                   (Offset + OffsetStride == MIOffset))) {
        int MinOffset = Offset < MIOffset ? Offset : MIOffset;
        // If this is a volatile load/store that otherwise matched, stop looking
        // as something is going on that we don't have enough information to
        // safely transform. Similarly, stop if we see a hint to avoid pairs.
        if (MI->hasOrderedMemoryRef() || TII->isLdStPairSuppressed(MI))
          return E;
        // If the resultant immediate offset of merging these instructions
        // is out of range for a pairwise instruction, bail and keep looking.
        bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
        if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the alignment requirements of the paired (scaled) instruction
        // can't express the offset of the unscaled input, bail and keep
        // looking.
        if (IsUnscaled && EnableAArch64UnscaledMemOp &&
            (alignTo(MinOffset, OffsetStride) != MinOffset)) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }
        // If the destination register of the loads is the same register, bail
        // and keep looking. A load-pair instruction with both destination
        // registers the same is UNPREDICTABLE and will result in an exception.
        if (MayLoad && Reg == MI->getOperand(0).getReg()) {
          trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
          continue;
        }

        // If the Rt of the second instruction was not modified or used between
        // the two instructions, we can combine the second into the first.
        if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
            !UsedRegs[MI->getOperand(0).getReg()]) {
          MergeForward = false;
          return MBBI;
        }

        // Likewise, if the Rt of the first instruction is not modified or used
        // between the two instructions, we can combine the first into the
        // second.
        if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
            !UsedRegs[FirstMI->getOperand(0).getReg()]) {
          MergeForward = true;
          return MBBI;
        }
        // Unable to combine these instructions due to interference in between.
        // Keep looking.
      }
    }

    // If the instruction wasn't a matching load or store, but does (or can)
    // modify memory, stop searching, as we don't have alias analysis or
    // anything like that to tell us whether the access is tromping on the
    // locations we care about. The big one we want to catch is calls.
    //
    // FIXME: Theoretically, we can do better than that for SP and FP based
    // references since we can effectively know where those are touching. It's
    // unclear if it's worth the extra code, though. Most paired instructions
    // will be sequential, perhaps with a few intervening non-memory related
    // instructions.
    if (MI->mayStore() || MI->isCall())
      return E;
    // Likewise, if we're matching a store instruction, we don't want to
    // move across a load, as it may be reading the same location.
    if (FirstMI->mayStore() && MI->mayLoad())
      return E;

    // Update modified / uses register lists.
    trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);

    // Otherwise, if the base register is modified, we have no match, so
    // return early.
    if (ModifiedRegs[BaseReg])
      return E;
  }
  return E;
}
519 | ||
/// mergePreIdxUpdateInsn - Fold the base-register add/sub at \p Update into
/// the load/store at \p I, forming the pre-indexed (writeback-before-access)
/// variant. Both original instructions are erased; returns the iterator
/// following the erased load/store (skipping \p Update if adjacent).
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
                                           MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into pre-indexed load / store");
  // A SUB writes back base - imm, so fold it as a negative offset.
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0)) // Written-back base (def).
          .addOperand(I->getOperand(0))      // Rt.
          .addOperand(I->getOperand(1))      // Base register.
          .addImm(Value);                    // Writeback amount.
  (void)MIB;

  DEBUG(dbgs() << "Creating pre-indexed load/store.");
  DEBUG(dbgs() << " Replacing instructions:\n ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << " ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << " with instruction:\n ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
563 | ||
/// mergePostIdxUpdateInsn - Fold the base-register add/sub at \p Update into
/// the load/store at \p I, forming the post-indexed (writeback-after-access)
/// variant. Both original instructions are erased; returns the iterator
/// following the erased load/store (skipping \p Update if adjacent).
MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
  assert((Update->getOpcode() == AArch64::ADDXri ||
          Update->getOpcode() == AArch64::SUBXri) &&
         "Unexpected base register update instruction to merge!");
  MachineBasicBlock::iterator NextI = I;
  // Return the instruction following the merged instruction, which is
  // the instruction following our unmerged load. Unless that's the add/sub
  // instruction we're merging, in which case it's the one after that.
  if (++NextI == Update)
    ++NextI;

  int Value = Update->getOperand(2).getImm();
  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
         "Can't merge 1 << 12 offset into post-indexed load / store");
  // A SUB writes back base - imm, so fold it as a negative offset.
  if (Update->getOpcode() == AArch64::SUBXri)
    Value = -Value;

  unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
  MachineInstrBuilder MIB =
      BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
          .addOperand(Update->getOperand(0)) // Written-back base (def).
          .addOperand(I->getOperand(0))      // Rt.
          .addOperand(I->getOperand(1))      // Base register.
          .addImm(Value);                    // Writeback amount.
  (void)MIB;

  DEBUG(dbgs() << "Creating post-indexed load/store.");
  DEBUG(dbgs() << " Replacing instructions:\n ");
  DEBUG(I->print(dbgs()));
  DEBUG(dbgs() << " ");
  DEBUG(Update->print(dbgs()));
  DEBUG(dbgs() << " with instruction:\n ");
  DEBUG(((MachineInstr *)MIB)->print(dbgs()));
  DEBUG(dbgs() << "\n");

  // Erase the old instructions for the block.
  I->eraseFromParent();
  Update->eraseFromParent();

  return NextI;
}
606 | ||
/// isMatchingUpdateInsn - Return true if \p MI is an ADDXri/SUBXri of
/// \p BaseReg into itself whose immediate can be folded into a pre/post
/// indexed load/store (signed 9-bit range). If \p Offset is non-zero the
/// update amount must equal it exactly; a zero \p Offset accepts any
/// representable amount.
static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
                                 int Offset) {
  switch (MI->getOpcode()) {
  default:
    break;
  case AArch64::SUBXri:
    // Negate the offset for a SUB instruction.
    Offset *= -1;
    // FALLTHROUGH
  case AArch64::ADDXri:
    // Make sure it's a vanilla immediate operand, not a relocation or
    // anything else we can't handle.
    if (!MI->getOperand(2).isImm())
      break;
    // Watch out for 1 << 12 shifted value.
    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
      break;
    // If the instruction has the base register as source and dest and the
    // immediate will fit in a signed 9-bit integer, then we have a match.
    if (MI->getOperand(0).getReg() == BaseReg &&
        MI->getOperand(1).getReg() == BaseReg &&
        MI->getOperand(2).getImm() <= 255 &&
        MI->getOperand(2).getImm() >= -256) {
      // If we have a non-zero Offset, we check that it matches the amount
      // we're adding to the register.
      if (!Offset || Offset == MI->getOperand(2).getImm())
        return true;
    }
    break;
  }
  return false;
}
639 | ||
640 | MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( | |
641 | MachineBasicBlock::iterator I, unsigned Limit, int Value) { | |
642 | MachineBasicBlock::iterator E = I->getParent()->end(); | |
643 | MachineInstr *MemMI = I; | |
644 | MachineBasicBlock::iterator MBBI = I; | |
645 | const MachineFunction &MF = *MemMI->getParent()->getParent(); | |
646 | ||
647 | unsigned DestReg = MemMI->getOperand(0).getReg(); | |
648 | unsigned BaseReg = MemMI->getOperand(1).getReg(); | |
649 | int Offset = MemMI->getOperand(2).getImm() * | |
650 | TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); | |
651 | ||
652 | // If the base register overlaps the destination register, we can't | |
653 | // merge the update. | |
654 | if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) | |
655 | return E; | |
656 | ||
657 | // Scan forward looking for post-index opportunities. | |
658 | // Updating instructions can't be formed if the memory insn already | |
659 | // has an offset other than the value we're looking for. | |
660 | if (Offset != Value) | |
661 | return E; | |
662 | ||
663 | // Track which registers have been modified and used between the first insn | |
664 | // (inclusive) and the second insn. | |
665 | BitVector ModifiedRegs, UsedRegs; | |
666 | ModifiedRegs.resize(TRI->getNumRegs()); | |
667 | UsedRegs.resize(TRI->getNumRegs()); | |
668 | ++MBBI; | |
669 | for (unsigned Count = 0; MBBI != E; ++MBBI) { | |
670 | MachineInstr *MI = MBBI; | |
671 | // Skip DBG_VALUE instructions. Otherwise debug info can affect the | |
672 | // optimization by changing how far we scan. | |
673 | if (MI->isDebugValue()) | |
674 | continue; | |
675 | ||
676 | // Now that we know this is a real instruction, count it. | |
677 | ++Count; | |
678 | ||
679 | // If we found a match, return it. | |
680 | if (isMatchingUpdateInsn(MI, BaseReg, Value)) | |
681 | return MBBI; | |
682 | ||
683 | // Update the status of what the instruction clobbered and used. | |
684 | trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); | |
685 | ||
686 | // Otherwise, if the base register is used or modified, we have no match, so | |
687 | // return early. | |
688 | if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) | |
689 | return E; | |
690 | } | |
691 | return E; | |
692 | } | |
693 | ||
694 | MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward( | |
695 | MachineBasicBlock::iterator I, unsigned Limit) { | |
696 | MachineBasicBlock::iterator B = I->getParent()->begin(); | |
697 | MachineBasicBlock::iterator E = I->getParent()->end(); | |
698 | MachineInstr *MemMI = I; | |
699 | MachineBasicBlock::iterator MBBI = I; | |
700 | const MachineFunction &MF = *MemMI->getParent()->getParent(); | |
701 | ||
702 | unsigned DestReg = MemMI->getOperand(0).getReg(); | |
703 | unsigned BaseReg = MemMI->getOperand(1).getReg(); | |
704 | int Offset = MemMI->getOperand(2).getImm(); | |
705 | unsigned RegSize = TII->getRegClass(MemMI->getDesc(), 0, TRI, MF)->getSize(); | |
706 | ||
707 | // If the load/store is the first instruction in the block, there's obviously | |
708 | // not any matching update. Ditto if the memory offset isn't zero. | |
709 | if (MBBI == B || Offset != 0) | |
710 | return E; | |
711 | // If the base register overlaps the destination register, we can't | |
712 | // merge the update. | |
713 | if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg)) | |
714 | return E; | |
715 | ||
716 | // Track which registers have been modified and used between the first insn | |
717 | // (inclusive) and the second insn. | |
718 | BitVector ModifiedRegs, UsedRegs; | |
719 | ModifiedRegs.resize(TRI->getNumRegs()); | |
720 | UsedRegs.resize(TRI->getNumRegs()); | |
721 | --MBBI; | |
722 | for (unsigned Count = 0; MBBI != B; --MBBI) { | |
723 | MachineInstr *MI = MBBI; | |
724 | // Skip DBG_VALUE instructions. Otherwise debug info can affect the | |
725 | // optimization by changing how far we scan. | |
726 | if (MI->isDebugValue()) | |
727 | continue; | |
728 | ||
729 | // Now that we know this is a real instruction, count it. | |
730 | ++Count; | |
731 | ||
732 | // If we found a match, return it. | |
733 | if (isMatchingUpdateInsn(MI, BaseReg, RegSize)) | |
734 | return MBBI; | |
735 | ||
736 | // Update the status of what the instruction clobbered and used. | |
737 | trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI); | |
738 | ||
739 | // Otherwise, if the base register is used or modified, we have no match, so | |
740 | // return early. | |
741 | if (ModifiedRegs[BaseReg] || UsedRegs[BaseReg]) | |
742 | return E; | |
743 | } | |
744 | return E; | |
745 | } | |
746 | ||
/// optimizeBlock - Run both peephole transformations over a single basic
/// block: (1) merge adjacent loads/stores into ldp/stp pair instructions and
/// (2) fold base-register add/sub updates into pre-/post-indexed load/store
/// forms. Returns true if any instruction in the block was changed.
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
  bool Modified = false;
  // Two transformations to do here:
  // 1) Find loads and stores that can be merged into a single load or store
  //    pair instruction.
  //      e.g.,
  //        ldr x0, [x2]
  //        ldr x1, [x2, #8]
  //        ; becomes
  //        ldp x0, x1, [x2]
  // 2) Find base register updates that can be merged into the load or store
  //    as a base-reg writeback.
  //      e.g.,
  //        ldr x0, [x2]
  //        add x2, x2, #4
  //        ; becomes
  //        ldr x0, [x2], #4

  // First pass: pair formation. Note that on the merge path MBBI is set by
  // mergePairedInsns() rather than incremented here, since the merge may
  // remove instructions around the current iterator.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    switch (MI->getOpcode()) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    // do the unscaled versions as well
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi: {
      // If this is a volatile load/store, don't mess with it.
      if (MI->hasOrderedMemoryRef()) {
        ++MBBI;
        break;
      }
      // Make sure this is a reg+imm (as opposed to an address reloc).
      // For these opcodes, operand 2 is the immediate offset.
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // Check if this load/store has a hint to avoid pair formation.
      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
      if (TII->isLdStPairSuppressed(MI)) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a pairable instruction.
      bool MergeForward = false;
      MachineBasicBlock::iterator Paired =
          findMatchingInsn(MBBI, MergeForward, ScanLimit);
      if (Paired != E) {
        // Merge the loads into a pair. Keeping the iterator straight is a
        // pain, so we let the merge routine tell us what the next instruction
        // is after it's done mucking about.
        MBBI = mergePairedInsns(MBBI, Paired, MergeForward);

        Modified = true;
        ++NumPairCreated;
        if (isUnscaledLdst(MI->getOpcode()))
          ++NumUnscaledPairCreated;
        break;
      }
      ++MBBI;
      break;
    }
    // FIXME: Do the other instructions.
    }
  }

  // Second pass: writeback (pre-/post-index) folding.
  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
       MBBI != E;) {
    MachineInstr *MI = MBBI;
    // Do update merging. It's simpler to keep this separate from the above
    // switch, though not strictly necessary.
    int Opc = MI->getOpcode();
    switch (Opc) {
    default:
      // Just move on to the next instruction.
      ++MBBI;
      break;
    case AArch64::STRSui:
    case AArch64::STRDui:
    case AArch64::STRQui:
    case AArch64::STRXui:
    case AArch64::STRWui:
    case AArch64::LDRSui:
    case AArch64::LDRDui:
    case AArch64::LDRQui:
    case AArch64::LDRXui:
    case AArch64::LDRWui:
    // do the unscaled versions as well
    case AArch64::STURSi:
    case AArch64::STURDi:
    case AArch64::STURQi:
    case AArch64::STURWi:
    case AArch64::STURXi:
    case AArch64::LDURSi:
    case AArch64::LDURDi:
    case AArch64::LDURQi:
    case AArch64::LDURWi:
    case AArch64::LDURXi: {
      // Make sure this is a reg+imm (as opposed to an address reloc).
      if (!MI->getOperand(2).isImm()) {
        ++MBBI;
        break;
      }
      // Look ahead up to ScanLimit instructions for a mergable instruction.
      // An update matching offset 0 can become a post-index form, e.g.:
      //   ldr x0, [x2]
      //   add x2, x2, #4
      //   ; becomes
      //   ldr x0, [x2], #4
      MachineBasicBlock::iterator Update =
          findMatchingUpdateInsnForward(MBBI, ScanLimit, 0);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePostIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPostFolded;
        break;
      }
      // Don't know how to handle pre/post-index versions of the unscaled
      // opcodes, so move to the next instruction.
      if (isUnscaledLdst(Opc)) {
        ++MBBI;
        break;
      }

      // Look back to try to find a pre-index instruction. For example,
      // add x0, x0, #8
      // ldr x1, [x0]
      // merged into:
      // ldr x1, [x0, #8]!
      Update = findMatchingUpdateInsnBackward(MBBI, ScanLimit);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Look forward to try to find a post-index instruction. For example,
      // ldr x1, [x0, #64]
      // add x0, x0, #64
      // merged into:
      // ldr x1, [x0, #64]!

      // The immediate in the load/store is scaled by the size of the register
      // being loaded. The immediate in the add we're looking for,
      // however, is not, so adjust here.
      int Value = MI->getOperand(2).getImm() *
                  TII->getRegClass(MI->getDesc(), 0, TRI, *(MBB.getParent()))
                      ->getSize();
      Update = findMatchingUpdateInsnForward(MBBI, ScanLimit, Value);
      if (Update != E) {
        // Merge the update into the ld/st.
        MBBI = mergePreIdxUpdateInsn(MBBI, Update);
        Modified = true;
        ++NumPreFolded;
        break;
      }

      // Nothing found. Just move to the next instruction.
      ++MBBI;
      break;
    }
    // FIXME: Do the other instructions.
    }
  }

  return Modified;
}
932 | ||
933 | bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { | |
934 | const TargetMachine &TM = Fn.getTarget(); | |
935 | TII = static_cast<const AArch64InstrInfo *>( | |
936 | TM.getSubtargetImpl()->getInstrInfo()); | |
937 | TRI = TM.getSubtargetImpl()->getRegisterInfo(); | |
938 | ||
939 | bool Modified = false; | |
940 | for (auto &MBB : Fn) | |
941 | Modified |= optimizeBlock(MBB); | |
942 | ||
943 | return Modified; | |
944 | } | |
945 | ||
946 | // FIXME: Do we need/want a pre-alloc pass like ARM has to try to keep | |
947 | // loads and stores near one another? | |
948 | ||
949 | /// createARMLoadStoreOptimizationPass - returns an instance of the load / store | |
950 | /// optimization pass. | |
951 | FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() { | |
952 | return new AArch64LoadStoreOpt(); | |
953 | } |