//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single Vector ALU). Typically, for predicates, a Vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC  // Restore the exec mask for the ELSE block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
50 | ||
51 | #include "AMDGPU.h" | |
1a4d82fc | 52 | #include "AMDGPUSubtarget.h" |
970d7e83 LB |
53 | #include "SIInstrInfo.h" |
54 | #include "SIMachineFunctionInfo.h" | |
1a4d82fc | 55 | #include "llvm/CodeGen/MachineFrameInfo.h" |
970d7e83 LB |
56 | #include "llvm/CodeGen/MachineFunction.h" |
57 | #include "llvm/CodeGen/MachineFunctionPass.h" | |
58 | #include "llvm/CodeGen/MachineInstrBuilder.h" | |
59 | #include "llvm/CodeGen/MachineRegisterInfo.h" | |
1a4d82fc | 60 | #include "llvm/IR/Constants.h" |
970d7e83 LB |
61 | |
62 | using namespace llvm; | |
63 | ||
namespace {

class SILowerControlFlowPass : public MachineFunctionPass {

private:
  static const unsigned SkipThreshold = 12;

  static char ID;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void InitM0ForLDS(MachineBasicBlock::iterator MI);
  void LoadM0(MachineInstr &MI, MachineInstr *MovRel);
  void IndirectSrc(MachineInstr &MI);
  void IndirectDst(MachineInstr &MI);

public:
  SILowerControlFlowPass(TargetMachine &tm) :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow instructions";
  }

};

} // End anonymous namespace

char SILowerControlFlowPass::ID = 0;

FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
  return new SILowerControlFlowPass(tm);
}

bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
                                        MachineBasicBlock *To) {

  unsigned NumInstr = 0;

  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
       MBB = *MBB->succ_begin()) {

    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
         NumInstr < SkipThreshold && I != E; ++I) {

      if (I->isBundle() || !I->isBundled())
        if (++NumInstr >= SkipThreshold)
          return true;
    }
  }

  return false;
}

void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {

  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To)
    .addReg(AMDGPU::EXEC);
}

void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
      ShaderType::PIXEL ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3)
    .addReg(AMDGPU::EXEC);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}

void SILowerControlFlowPass::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
    .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Else(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    .addReg(Src); // Saved EXEC

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Vcc)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Saved)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1))
    .addReg(AMDGPU::EXEC);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Branch(MachineInstr &MI) {
  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}

void SILowerControlFlowPass::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  const SIMachineFunctionInfo *MFI
    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
  // Kill is only allowed in pixel / geometry shaders.
  assert(MFI->getShaderType() == ShaderType::PIXEL ||
         MFI->getShaderType() == ShaderType::GEOMETRY);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if ((Op.isImm() || Op.isFPImm())) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.isImm() ? (Op.getImm() & 0x80000000) :
        Op.getFPImm()->isNegative()) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
      .addImm(0)
      .addOperand(Op);
  }

  MI.eraseFromParent();
}

/// The m0 register stores the maximum allowable address for LDS reads and
/// writes. Its value must be at least the size in bytes of LDS allocated by
/// the shader. For simplicity, we set it to the maximum possible value.
void SILowerControlFlowPass::InitM0ForLDS(MachineBasicBlock::iterator MI) {
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
          AMDGPU::M0).addImm(0xffffffff);
}

void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I = MI;

  unsigned Save = MI.getOperand(1).getReg();
  unsigned Idx = MI.getOperand(3).getReg();

  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addReg(Idx);
    MBB.insert(I, MovRel);
  } else {

    assert(AMDGPU::SReg_64RegClass.contains(Save));
    assert(AMDGPU::VReg_32RegClass.contains(Idx));

    // Save the EXEC mask
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
      .addReg(AMDGPU::EXEC);

    // Read the next variant into VCC (lower 32 bits) <- also loop target
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
            AMDGPU::VCC_LO)
      .addReg(Idx);

    // Move index from VCC into M0
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addReg(AMDGPU::VCC_LO);

    // Compare the just read M0 value to all possible Idx values
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
      .addReg(AMDGPU::M0)
      .addReg(Idx);

    // Update EXEC, save the original EXEC value to VCC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
      .addReg(AMDGPU::VCC);

    // Do the actual move
    MBB.insert(I, MovRel);

    // Update EXEC, switch all done bits to 0 and all todo bits to 1
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC)
      .addReg(AMDGPU::VCC);

    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
      .addImm(-7)
      .addReg(AMDGPU::EXEC);

    // Restore EXEC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
      .addReg(Save);

  }
  // FIXME: Are there any values other than the LDS address clamp that need to
  // be stored in the m0 register and may be live for more than a few
  // instructions? If so, we should save the m0 register at the beginning
  // of this function and restore it here.
  // FIXME: Add support for LDS direct loads.
  InitM0ForLDS(&MI);
  MI.eraseFromParent();
}

void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = MI.getOperand(2).getReg();
  unsigned Off = MI.getOperand(4).getImm();
  unsigned SubReg = TRI->getSubReg(Vec, AMDGPU::sub0);
  if (!SubReg)
    SubReg = Vec;

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(SubReg + Off)
      .addReg(AMDGPU::M0, RegState::Implicit)
      .addReg(Vec, RegState::Implicit);

  LoadM0(MI, MovRel);
}

void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {

  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Off = MI.getOperand(4).getImm();
  unsigned Val = MI.getOperand(5).getReg();
  unsigned SubReg = TRI->getSubReg(Dst, AMDGPU::sub0);
  if (!SubReg)
    SubReg = Dst;

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
      .addReg(SubReg + Off, RegState::Define)
      .addReg(Val)
      .addReg(AMDGPU::M0, RegState::Implicit)
      .addReg(Dst, RegState::Implicit);

  LoadM0(MI, MovRel);
}

bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedM0 = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isDS(MI.getOpcode())) {
        NeedM0 = true;
        NeedWQM = true;
      }

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI.getOpcode())) {
        NeedM0 = true;
        NeedFlat = true;
      }

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;

        case AMDGPU::V_INTERP_P1_F32:
        case AMDGPU::V_INTERP_P2_F32:
        case AMDGPU::V_INTERP_MOV_F32:
          NeedWQM = true;
          break;
      }
    }
  }

  if (NeedM0) {
    MachineBasicBlock &MBB = MF.front();
    // Initialize M0 to a value that won't cause LDS access to be discarded
    // due to offset clamping
    InitM0ForLDS(MBB.getFirstNonPHI());
  }

  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }

  // FIXME: This seems inappropriate to do here.
  if (NeedFlat && MFI->IsKernel) {
    // Insert the prologue initializing the SGPRs pointing to the scratch space
    // for flat accesses.
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

    // TODO: What to use with function calls?

    // FIXME: This is reporting stack size that is used in a scratch buffer
    // rather than registers as well.
    uint64_t StackSizeBytes = FrameInfo->getStackSize();

    int IndirectBegin
      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
    // Convert register index to 256-byte unit.
    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);

    assert((StackSizeBytes < 0xffff) && StackOffset < 0xffff &&
           "Stack limits should be smaller than 16-bits");

    // Initialize the flat scratch register pair.
    // TODO: Can we use one s_mov_b64 here?

    // Offset is in units of 256-bytes.
    MachineBasicBlock &MBB = MF.front();
    DebugLoc NoDL;
    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
      .addImm(StackOffset);

    // Documentation says size is "per-thread scratch size in bytes"
    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
      .addImm(StackSizeBytes);
  }

  return true;
}