//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single Vector ALU). Typically, for predicates, a Vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC  // Restore the exec mask for the ELSE block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
50 | ||
51 | #include "AMDGPU.h" | |
1a4d82fc | 52 | #include "AMDGPUSubtarget.h" |
970d7e83 LB |
53 | #include "SIInstrInfo.h" |
54 | #include "SIMachineFunctionInfo.h" | |
1a4d82fc | 55 | #include "llvm/CodeGen/MachineFrameInfo.h" |
970d7e83 LB |
56 | #include "llvm/CodeGen/MachineFunction.h" |
57 | #include "llvm/CodeGen/MachineFunctionPass.h" | |
58 | #include "llvm/CodeGen/MachineInstrBuilder.h" | |
59 | #include "llvm/CodeGen/MachineRegisterInfo.h" | |
1a4d82fc | 60 | #include "llvm/IR/Constants.h" |
970d7e83 LB |
61 | |
62 | using namespace llvm; | |
63 | ||
namespace {

class SILowerControlFlowPass : public MachineFunctionPass {

private:
  static const unsigned SkipThreshold = 12;

  static char ID;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void InitM0ForLDS(MachineBasicBlock::iterator MI);
  void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
  void IndirectSrc(MachineInstr &MI);
  void IndirectDst(MachineInstr &MI);

public:
  SILowerControlFlowPass(TargetMachine &tm) :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow instructions";
  }

};

} // End anonymous namespace

char SILowerControlFlowPass::ID = 0;

FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
  return new SILowerControlFlowPass(tm);
}

bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
                                        MachineBasicBlock *To) {

  unsigned NumInstr = 0;

  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
       MBB = *MBB->succ_begin()) {

    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
         NumInstr < SkipThreshold && I != E; ++I) {

      if (I->isBundle() || !I->isBundled())
        if (++NumInstr >= SkipThreshold)
          return true;
    }
  }

  return false;
}

void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To)
    .addReg(AMDGPU::EXEC);
}

void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
      ShaderType::PIXEL ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3)
    .addReg(AMDGPU::EXEC);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

void SILowerControlFlowPass::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
    .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Else(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    .addReg(Src); // Saved EXEC

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Vcc)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Saved)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1))
    .addReg(AMDGPU::EXEC);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Branch(MachineInstr &MI) {
  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

void SILowerControlFlowPass::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  const SIMachineFunctionInfo *MFI
    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
  // Kill is only allowed in pixel / geometry shaders.
  assert(MFI->getShaderType() == ShaderType::PIXEL ||
         MFI->getShaderType() == ShaderType::GEOMETRY);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if ((Op.isImm() || Op.isFPImm())) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.isImm() ? (Op.getImm() & 0x80000000) :
        Op.getFPImm()->isNegative()) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
      .addImm(0)
      .addOperand(Op);
  }

  MI.eraseFromParent();
}

/// The m0 register stores the maximum allowable address for LDS reads and
/// writes. Its value must be at least the size in bytes of LDS allocated by
/// the shader. For simplicity, we set it to the maximum possible value.
void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
          AMDGPU::M0).addImm(0xffffffff);
}

void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I = MI;

  unsigned Save = MI.getOperand(1).getReg();
  unsigned Idx = MI.getOperand(3).getReg();

  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addReg(Idx);
    MBB.insert(I, MovRel);
  } else {

    assert(AMDGPU::SReg_64RegClass.contains(Save));
    assert(AMDGPU::VReg_32RegClass.contains(Idx));

    // Save the EXEC mask
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
      .addReg(AMDGPU::EXEC);

    // Read the next variant into VCC (lower 32 bits) <- also loop target
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
            AMDGPU::VCC_LO)
      .addReg(Idx);

    // Move index from VCC into M0
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addReg(AMDGPU::VCC_LO);

    // Compare the just read M0 value to all possible Idx values
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
      .addReg(AMDGPU::M0)
      .addReg(Idx);

    // Update EXEC, save the original EXEC value to VCC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
      .addReg(AMDGPU::VCC);

    // Do the actual move
    MBB.insert(I, MovRel);

    // Update EXEC, switch all done bits to 0 and all todo bits to 1
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC)
      .addReg(AMDGPU::VCC);

    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
      .addImm(-7)
      .addReg(AMDGPU::EXEC);

    // Restore EXEC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
      .addReg(Save);

  }
  // FIXME: Are there any values other than the LDS address clamp that need to
  // be stored in the m0 register and may be live for more than a few
  // instructions? If so, we should save the m0 register at the beginning
  // of this function and restore it here.
  // FIXME: Add support for LDS direct loads.
  InitM0ForLDS(&MI);
  MI.eraseFromParent();
}

void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = MI.getOperand(2).getReg();
  unsigned Off = MI.getOperand(4).getImm();
  unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
  if (!SubReg)
    SubReg = Vec;

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(SubReg + Off)
      .addReg(AMDGPU::M0, RegState::Implicit)
      .addReg(Vec, RegState::Implicit);

  LoadM0(MI, MovRel);
}

void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Off = MI.getOperand(4).getImm();
  unsigned Val = MI.getOperand(5).getReg();
  unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
  if (!SubReg)
    SubReg = Dst;

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
      .addReg(SubReg + Off, RegState::Define)
      .addReg(Val)
      .addReg(AMDGPU::M0, RegState::Implicit)
      .addReg(Dst, RegState::Implicit);

  LoadM0(MI, MovRel);
}

bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedM0 = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isDS(MI.getOpcode())) {
        NeedM0 = true;
        NeedWQM = true;
      }

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI.getOpcode())) {
        NeedM0 = true;
        NeedFlat = true;
      }

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;

        case AMDGPU::V_INTERP_P1_F32:
        case AMDGPU::V_INTERP_P2_F32:
        case AMDGPU::V_INTERP_MOV_F32:
          NeedWQM = true;
          break;
      }
    }
  }

  if (NeedM0) {
    MachineBasicBlock &MBB = MF.front();
    // Initialize M0 to a value that won't cause LDS access to be discarded
    // due to offset clamping
    InitM0ForLDS(MBB.getFirstNonPHI());
  }

  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }

  // FIXME: This seems inappropriate to do here.
  if (NeedFlat && MFI->IsKernel) {
    // Insert the prologue initializing the SGPRs pointing to the scratch space
    // for flat accesses.
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

    // TODO: What to use with function calls?

    // FIXME: This is reporting stack size that is used in a scratch buffer
    // rather than registers as well.
    uint64_t StackSizeBytes = FrameInfo->getStackSize();

    int IndirectBegin
      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
    // Convert register index to 256-byte unit.
    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);

    assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
           "Stack limits should be smaller than 16-bits");

    // Initialize the flat scratch register pair.
    // TODO: Can we use one s_mov_b64 here?

    // Offset is in units of 256-bytes.
    MachineBasicBlock &MBB = MF.front();
    DebugLoc NoDL;
    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
      .addImm(StackOffset);

    // Documentation says size is "per-thread scratch size in bytes"
    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
      .addImm(StackSizeBytes);
  }

  return true;
}