src/llvm/lib/CodeGen/MachineCombiner.cpp

   1 //===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
   2 //
   3 //                     The LLVM Compiler Infrastructure
   4 //
   5 // This file is distributed under the University of Illinois Open Source
   6 // License. See LICENSE.TXT for details.
   7 //
   8 //===----------------------------------------------------------------------===//
   9 //
  10 // The machine combiner pass uses machine trace metrics to ensure the combined
  11 // instructions does not lengthen the critical path or the resource depth.
  12 //===----------------------------------------------------------------------===//
  13 #define DEBUG_TYPE "machine-combiner"
  14
  15 #include "llvm/ADT/Statistic.h"
  16 #include "llvm/ADT/DenseMap.h"
  17 #include "llvm/CodeGen/MachineDominators.h"
  18 #include "llvm/CodeGen/MachineFunction.h"
  19 #include "llvm/CodeGen/MachineFunctionPass.h"
  20 #include "llvm/CodeGen/MachineInstrBuilder.h"
  21 #include "llvm/CodeGen/MachineLoopInfo.h"
  22 #include "llvm/CodeGen/MachineRegisterInfo.h"
  23 #include "llvm/CodeGen/MachineTraceMetrics.h"
  24 #include "llvm/CodeGen/Passes.h"
  25 #include "llvm/CodeGen/TargetSchedule.h"
  26 #include "llvm/Support/CommandLine.h"
  27 #include "llvm/Support/Debug.h"
  28 #include "llvm/Support/raw_ostream.h"
  29 #include "llvm/Target/TargetInstrInfo.h"
  30 #include "llvm/Target/TargetRegisterInfo.h"
  31 #include "llvm/Target/TargetSubtargetInfo.h"
  32
  33 using namespace llvm;
  34
  35 STATISTIC(NumInstCombined, "Number of machineinst combined");
  36
  37 namespace {
  38 class MachineCombiner : public MachineFunctionPass {
  39   const TargetInstrInfo *TII;
  40   const TargetRegisterInfo *TRI;
  41   MCSchedModel SchedModel;
  42   MachineRegisterInfo *MRI;
  43   MachineTraceMetrics *Traces;
  44   MachineTraceMetrics::Ensemble *MinInstr;
  45
  46   TargetSchedModel TSchedModel;
  47
  48   /// OptSize - True if optimizing for code size.
  49   bool OptSize;
  50
  51 public:
  52   static char ID;
  53   MachineCombiner() : MachineFunctionPass(ID) {
  54     initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
  55   }
  56   void getAnalysisUsage(AnalysisUsage &AU) const override;
  57   bool runOnMachineFunction(MachineFunction &MF) override;
  58   const char *getPassName() const override { return "Machine InstCombiner"; }
  59
  60 private:
  61   bool doSubstitute(unsigned NewSize, unsigned OldSize);
  62   bool combineInstructions(MachineBasicBlock *);
  63   MachineInstr *getOperandDef(const MachineOperand &MO);
  64   unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
  65                     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
  66                     MachineTraceMetrics::Trace BlockTrace);
  67   unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
  68                       MachineTraceMetrics::Trace BlockTrace);
  69   bool
  70   preservesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
  71                            MachineTraceMetrics::Trace BlockTrace,
  72                            SmallVectorImpl<MachineInstr *> &InsInstrs,
  73                            DenseMap<unsigned, unsigned> &InstrIdxForVirtReg);
  74   bool preservesResourceLen(MachineBasicBlock *MBB,
  75                             MachineTraceMetrics::Trace BlockTrace,
  76                             SmallVectorImpl<MachineInstr *> &InsInstrs,
  77                             SmallVectorImpl<MachineInstr *> &DelInstrs);
  78   void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
  79                      SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
  80 };
  81 }
  82
  83 char MachineCombiner::ID = 0;
  84 char &llvm::MachineCombinerID = MachineCombiner::ID;
  85
  86 INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
  87                       "Machine InstCombiner", false, false)
  88 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
  89 INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
  90                     false, false)
  91
  92 void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  93   AU.setPreservesCFG();
  94   AU.addPreserved<MachineDominatorTree>();
  95   AU.addPreserved<MachineLoopInfo>();
  96   AU.addRequired<MachineTraceMetrics>();
  97   AU.addPreserved<MachineTraceMetrics>();
  98   MachineFunctionPass::getAnalysisUsage(AU);
  99 }
 100
 101 MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
 102   MachineInstr *DefInstr = nullptr;
 103   // We need a virtual register definition.
 104   if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
 105     DefInstr = MRI->getUniqueVRegDef(MO.getReg());
 106   // PHI's have no depth etc.
 107   if (DefInstr && DefInstr->isPHI())
 108     DefInstr = nullptr;
 109   return DefInstr;
 110 }
 111
 112 /// getDepth - Computes depth of instructions in vector \InsInstr.
 113 ///
 114 /// \param InsInstrs is a vector of machine instructions
 115 /// \param InstrIdxForVirtReg is a dense map of virtual register to index
 116 /// of defining machine instruction in \p InsInstrs
 117 /// \param BlockTrace is a trace of machine instructions
 118 ///
 119 /// \returns Depth of last instruction in \InsInstrs ("NewRoot")
 120 unsigned
 121 MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
 122                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
 123                           MachineTraceMetrics::Trace BlockTrace) {
 124
 125   SmallVector<unsigned, 16> InstrDepth;
 126   assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
 127
 128   // Foreach instruction in in the new sequence compute the depth based on the
 129   // operands. Use the trace information when possible. For new operands which
 130   // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
 131   for (auto *InstrPtr : InsInstrs) { // for each Use
 132     unsigned IDepth = 0;
 133     DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
 134     for (unsigned i = 0, e = InstrPtr->getNumOperands(); i != e; ++i) {
 135       const MachineOperand &MO = InstrPtr->getOperand(i);
 136       // Check for virtual register operand.
 137       if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
 138         continue;
 139       if (!MO.isUse())
 140         continue;
 141       unsigned DepthOp = 0;
 142       unsigned LatencyOp = 0;
 143       DenseMap<unsigned, unsigned>::iterator II =
 144           InstrIdxForVirtReg.find(MO.getReg());
 145       if (II != InstrIdxForVirtReg.end()) {
 146         // Operand is new virtual register not in trace
 147         assert(II->second < InstrDepth.size() && "Bad Index");
 148         MachineInstr *DefInstr = InsInstrs[II->second];
 149         assert(DefInstr &&
 150                "There must be a definition for a new virtual register");
 151         DepthOp = InstrDepth[II->second];
 152         LatencyOp = TSchedModel.computeOperandLatency(
 153             DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
 154             InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
 155       } else {
 156         MachineInstr *DefInstr = getOperandDef(MO);
 157         if (DefInstr) {
 158           DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth;
 159           LatencyOp = TSchedModel.computeOperandLatency(
 160               DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
 161               InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
 162         }
 163       }
 164       IDepth = std::max(IDepth, DepthOp + LatencyOp);
 165     }
 166     InstrDepth.push_back(IDepth);
 167   }
 168   unsigned NewRootIdx = InsInstrs.size() - 1;
 169   return InstrDepth[NewRootIdx];
 170 }
 171
 172 /// getLatency - Computes instruction latency as max of latency of defined
 173 /// operands
 174 ///
 175 /// \param Root is a machine instruction that could be replaced by NewRoot.
 176 /// It is used to compute a more accurate latency information for NewRoot in
 177 /// case there is a dependent instruction in the same trace (\p BlockTrace)
 178 /// \param NewRoot is the instruction for which the latency is computed
 179 /// \param BlockTrace is a trace of machine instructions
 180 ///
 181 /// \returns Latency of \p NewRoot
 182 unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
 183                                      MachineTraceMetrics::Trace BlockTrace) {
 184
 185   assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
 186
 187   // Check each definition in NewRoot and compute the latency
 188   unsigned NewRootLatency = 0;
 189
 190   for (unsigned i = 0, e = NewRoot->getNumOperands(); i != e; ++i) {
 191     const MachineOperand &MO = NewRoot->getOperand(i);
 192     // Check for virtual register operand.
 193     if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
 194       continue;
 195     if (!MO.isDef())
 196       continue;
 197     // Get the first instruction that uses MO
 198     MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
 199     RI++;
 200     MachineInstr *UseMO = RI->getParent();
 201     unsigned LatencyOp = 0;
 202     if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) {
 203       LatencyOp = TSchedModel.computeOperandLatency(
 204           NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
 205           UseMO->findRegisterUseOperandIdx(MO.getReg()));
 206     } else {
 207       LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode());
 208     }
 209     NewRootLatency = std::max(NewRootLatency, LatencyOp);
 210   }
 211   return NewRootLatency;
 212 }
 213
 214 /// preservesCriticalPathlen - True when the new instruction sequence does not
 215 /// lengthen the critical path. The DAGCombine code sequence ends in MI
 216 /// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. A
 217 /// necessary condition for the new sequence to replace the old sequence is that
 218 /// is cannot lengthen the critical path. This is decided by the formula
 219 /// (NewRootDepth + NewRootLatency) <=  (RootDepth + RootLatency + RootSlack)).
 220 /// The slack is the number of cycles Root can be delayed before the critical
 221 /// patch becomes longer.
 222 bool MachineCombiner::preservesCriticalPathLen(
 223     MachineBasicBlock *MBB, MachineInstr *Root,
 224     MachineTraceMetrics::Trace BlockTrace,
 225     SmallVectorImpl<MachineInstr *> &InsInstrs,
 226     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
 227
 228   assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
 229   // NewRoot is the last instruction in the \p InsInstrs vector
 230   // Get depth and latency of NewRoot
 231   unsigned NewRootIdx = InsInstrs.size() - 1;
 232   MachineInstr *NewRoot = InsInstrs[NewRootIdx];
 233   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
 234   unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
 235
 236   // Get depth, latency and slack of Root
 237   unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
 238   unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
 239   unsigned RootSlack = BlockTrace.getInstrSlack(Root);
 240
 241   DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
 242         dbgs() << " NewRootDepth: " << NewRootDepth
 243                << " NewRootLatency: " << NewRootLatency << "\n";
 244         dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency
 245                << " RootSlack: " << RootSlack << "\n";
 246         dbgs() << " NewRootDepth + NewRootLatency "
 247                << NewRootDepth + NewRootLatency << "\n";
 248         dbgs() << " RootDepth + RootLatency + RootSlack "
 249                << RootDepth + RootLatency + RootSlack << "\n";);
 250
 251   /// True when the new sequence does not lenghten the critical path.
 252   return ((NewRootDepth + NewRootLatency) <=
 253           (RootDepth + RootLatency + RootSlack));
 254 }
 255
 256 /// helper routine to convert instructions into SC
 257 void MachineCombiner::instr2instrSC(
 258     SmallVectorImpl<MachineInstr *> &Instrs,
 259     SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
 260   for (auto *InstrPtr : Instrs) {
 261     unsigned Opc = InstrPtr->getOpcode();
 262     unsigned Idx = TII->get(Opc).getSchedClass();
 263     const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
 264     InstrsSC.push_back(SC);
 265   }
 266 }
 267 /// preservesResourceLen - True when the new instructions do not increase
 268 /// resource length
 269 bool MachineCombiner::preservesResourceLen(
 270     MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
 271     SmallVectorImpl<MachineInstr *> &InsInstrs,
 272     SmallVectorImpl<MachineInstr *> &DelInstrs) {
 273
 274   // Compute current resource length
 275
 276   //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
 277   SmallVector <const MachineBasicBlock *, 1> MBBarr;
 278   MBBarr.push_back(MBB);
 279   unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
 280
 281   // Deal with SC rather than Instructions.
 282   SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
 283   SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;
 284
 285   instr2instrSC(InsInstrs, InsInstrsSC);
 286   instr2instrSC(DelInstrs, DelInstrsSC);
 287
 288   ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
 289   ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);
 290
 291   // Compute new resource length
 292   unsigned ResLenAfterCombine =
 293       BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
 294
 295   DEBUG(dbgs() << "RESOURCE DATA: \n";
 296         dbgs() << " resource len before: " << ResLenBeforeCombine
 297                << " after: " << ResLenAfterCombine << "\n";);
 298
 299   return ResLenAfterCombine <= ResLenBeforeCombine;
 300 }
 301
 302 /// \returns true when new instruction sequence should be generated
 303 /// independent if it lenghtens critical path or not
 304 bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
 305   if (OptSize && (NewSize < OldSize))
 306     return true;
 307   if (!TSchedModel.hasInstrSchedModel())
 308     return true;
 309   return false;
 310 }
 311
 312 /// combineInstructions - substitute a slow code sequence with a faster one by
 313 /// evaluating instruction combining pattern.
 314 /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
 315 /// combining based on machine trace metrics. Only combine a sequence of
 316 /// instructions  when this neither lengthens the critical path nor increases
 317 /// resource pressure. When optimizing for codesize always combine when the new
 318 /// sequence is shorter.
 319 bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
 320   bool Changed = false;
 321   DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
 322
 323   auto BlockIter = MBB->begin();
 324
 325   while (BlockIter != MBB->end()) {
 326     auto &MI = *BlockIter++;
 327
 328     DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
 329     SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Pattern;
 330     // The motivating example is:
 331     //
 332     //     MUL  Other        MUL_op1 MUL_op2  Other
 333     //      \    /               \      |    /
 334     //      ADD/SUB      =>        MADD/MSUB
 335     //      (=Root)                (=NewRoot)
 336
 337     // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
 338     // usually beneficial for code size it unfortunately can hurt performance
 339     // when the ADD is on the critical path, but the MUL is not. With the
 340     // substitution the MUL becomes part of the critical path (in form of the
 341     // MADD) and can lengthen it on architectures where the MADD latency is
 342     // longer than the ADD latency.
 343     //
 344     // For each instruction we check if it can be the root of a combiner
 345     // pattern. Then for each pattern the new code sequence in form of MI is
 346     // generated and evaluated. When the efficiency criteria (don't lengthen
 347     // critical path, don't use more resources) is met the new sequence gets
 348     // hooked up into the basic block before the old sequence is removed.
 349     //
 350     // The algorithm does not try to evaluate all patterns and pick the best.
 351     // This is only an artificial restriction though. In practice there is
 352     // mostly one pattern and hasPattern() can order patterns based on an
 353     // internal cost heuristic.
 354
 355     if (TII->hasPattern(MI, Pattern)) {
 356       for (auto P : Pattern) {
 357         SmallVector<MachineInstr *, 16> InsInstrs;
 358         SmallVector<MachineInstr *, 16> DelInstrs;
 359         DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
 360         if (!MinInstr)
 361           MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
 362         MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
 363         Traces->verifyAnalysis();
 364         TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
 365                                         InstrIdxForVirtReg);
 366         // Found pattern, but did not generate alternative sequence.
 367         // This can happen e.g. when an immediate could not be materialized
 368         // in a single instruction.
 369         if (!InsInstrs.size())
 370           continue;
 371         // Substitute when we optimize for codesize and the new sequence has
 372         // fewer instructions OR
 373         // the new sequence neither lenghten the critical path nor increases
 374         // resource pressure.
 375         if (doSubstitute(InsInstrs.size(), DelInstrs.size()) ||
 376             (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
 377                                       InstrIdxForVirtReg) &&
 378              preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
 379           for (auto *InstrPtr : InsInstrs)
 380             MBB->insert((MachineBasicBlock::iterator) & MI,
 381                         (MachineInstr *)InstrPtr);
 382           for (auto *InstrPtr : DelInstrs)
 383             InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
 384
 385           Changed = true;
 386           ++NumInstCombined;
 387
 388           Traces->invalidate(MBB);
 389           Traces->verifyAnalysis();
 390           // Eagerly stop after the first pattern fired
 391           break;
 392         } else {
 393           // Cleanup instructions of the alternative code sequence. There is no
 394           // use for them.
 395           for (auto *InstrPtr : InsInstrs) {
 396             MachineFunction *MF = MBB->getParent();
 397             MF->DeleteMachineInstr((MachineInstr *)InstrPtr);
 398           }
 399         }
 400         InstrIdxForVirtReg.clear();
 401       }
 402     }
 403   }
 404
 405   return Changed;
 406 }
 407
 408 bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
 409   const TargetSubtargetInfo &STI =
 410       MF.getTarget().getSubtarget<TargetSubtargetInfo>();
 411   TII = STI.getInstrInfo();
 412   TRI = STI.getRegisterInfo();
 413   SchedModel = STI.getSchedModel();
 414   TSchedModel.init(SchedModel, &STI, TII);
 415   MRI = &MF.getRegInfo();
 416   Traces = &getAnalysis<MachineTraceMetrics>();
 417   MinInstr = 0;
 418
 419   OptSize = MF.getFunction()->getAttributes().hasAttribute(
 420       AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
 421
 422   DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
 423   if (!TII->useMachineCombiner()) {
 424     DEBUG(dbgs() << "  Skipping pass: Target does not support machine combiner\n");
 425     return false;
 426   }
 427
 428   bool Changed = false;
 429
 430   // Try to combine instructions.
 431   for (auto &MBB : MF)
 432     Changed |= combineInstructions(&MBB);
 433
 434   return Changed;
 435 }