]>
Commit | Line | Data |
---|---|---|
970d7e83 LB |
1 | //===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===// |
2 | // | |
3 | // The LLVM Compiler Infrastructure | |
4 | // | |
5 | // This file is distributed under the University of Illinois Open Source | |
6 | // License. See LICENSE.TXT for details. | |
7 | // | |
8 | //===----------------------------------------------------------------------===// | |
9 | // | |
10 | /// \file | |
11 | /// Vector, Reduction, and Cube instructions need to fill the entire instruction | |
12 | /// group to work correctly. This pass expands these individual instructions | |
13 | /// into several instructions that will completely fill the instruction group. | |
14 | // | |
15 | //===----------------------------------------------------------------------===// | |
16 | ||
17 | #include "AMDGPU.h" | |
18 | #include "R600Defines.h" | |
19 | #include "R600InstrInfo.h" | |
20 | #include "R600MachineFunctionInfo.h" | |
21 | #include "R600RegisterInfo.h" | |
1a4d82fc | 22 | #include "AMDGPUSubtarget.h" |
970d7e83 LB |
23 | #include "llvm/CodeGen/MachineFunctionPass.h" |
24 | #include "llvm/CodeGen/MachineInstrBuilder.h" | |
25 | #include "llvm/CodeGen/MachineRegisterInfo.h" | |
26 | ||
27 | using namespace llvm; | |
28 | ||
29 | namespace { | |
30 | ||
31 | class R600ExpandSpecialInstrsPass : public MachineFunctionPass { | |
32 | ||
33 | private: | |
34 | static char ID; | |
35 | const R600InstrInfo *TII; | |
36 | ||
1a4d82fc JJ |
37 | void SetFlagInNewMI(MachineInstr *NewMI, const MachineInstr *OldMI, |
38 | unsigned Op); | |
970d7e83 LB |
39 | |
40 | public: | |
41 | R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), | |
1a4d82fc | 42 | TII(nullptr) { } |
970d7e83 | 43 | |
1a4d82fc | 44 | bool runOnMachineFunction(MachineFunction &MF) override; |
970d7e83 | 45 | |
1a4d82fc | 46 | const char *getPassName() const override { |
970d7e83 LB |
47 | return "R600 Expand special instructions pass"; |
48 | } | |
49 | }; | |
50 | ||
51 | } // End anonymous namespace | |
52 | ||
53 | char R600ExpandSpecialInstrsPass::ID = 0; | |
54 | ||
55 | FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { | |
56 | return new R600ExpandSpecialInstrsPass(TM); | |
57 | } | |
58 | ||
1a4d82fc JJ |
59 | void R600ExpandSpecialInstrsPass::SetFlagInNewMI(MachineInstr *NewMI, |
60 | const MachineInstr *OldMI, unsigned Op) { | |
61 | int OpIdx = TII->getOperandIdx(*OldMI, Op); | |
62 | if (OpIdx > -1) { | |
63 | uint64_t Val = OldMI->getOperand(OpIdx).getImm(); | |
64 | TII->setImmOperand(NewMI, Op, Val); | |
65 | } | |
66 | } | |
67 | ||
970d7e83 | 68 | bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { |
1a4d82fc | 69 | TII = static_cast<const R600InstrInfo *>(MF.getSubtarget().getInstrInfo()); |
970d7e83 LB |
70 | |
71 | const R600RegisterInfo &TRI = TII->getRegisterInfo(); | |
72 | ||
73 | for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); | |
74 | BB != BB_E; ++BB) { | |
75 | MachineBasicBlock &MBB = *BB; | |
76 | MachineBasicBlock::iterator I = MBB.begin(); | |
77 | while (I != MBB.end()) { | |
78 | MachineInstr &MI = *I; | |
1a4d82fc JJ |
79 | I = std::next(I); |
80 | ||
81 | // Expand LDS_*_RET instructions | |
82 | if (TII->isLDSRetInstr(MI.getOpcode())) { | |
83 | int DstIdx = TII->getOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst); | |
84 | assert(DstIdx != -1); | |
85 | MachineOperand &DstOp = MI.getOperand(DstIdx); | |
86 | MachineInstr *Mov = TII->buildMovInstr(&MBB, I, | |
87 | DstOp.getReg(), AMDGPU::OQAP); | |
88 | DstOp.setReg(AMDGPU::OQAP); | |
89 | int LDSPredSelIdx = TII->getOperandIdx(MI.getOpcode(), | |
90 | AMDGPU::OpName::pred_sel); | |
91 | int MovPredSelIdx = TII->getOperandIdx(Mov->getOpcode(), | |
92 | AMDGPU::OpName::pred_sel); | |
93 | // Copy the pred_sel bit | |
94 | Mov->getOperand(MovPredSelIdx).setReg( | |
95 | MI.getOperand(LDSPredSelIdx).getReg()); | |
96 | } | |
970d7e83 LB |
97 | |
98 | switch (MI.getOpcode()) { | |
99 | default: break; | |
100 | // Expand PRED_X to one of the PRED_SET instructions. | |
101 | case AMDGPU::PRED_X: { | |
102 | uint64_t Flags = MI.getOperand(3).getImm(); | |
103 | // The native opcode used by PRED_X is stored as an immediate in the | |
104 | // third operand. | |
105 | MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, | |
106 | MI.getOperand(2).getImm(), // opcode | |
107 | MI.getOperand(0).getReg(), // dst | |
108 | MI.getOperand(1).getReg(), // src0 | |
109 | AMDGPU::ZERO); // src1 | |
110 | TII->addFlag(PredSet, 0, MO_FLAG_MASK); | |
111 | if (Flags & MO_FLAG_PUSH) { | |
1a4d82fc | 112 | TII->setImmOperand(PredSet, AMDGPU::OpName::update_exec_mask, 1); |
970d7e83 | 113 | } else { |
1a4d82fc | 114 | TII->setImmOperand(PredSet, AMDGPU::OpName::update_pred, 1); |
970d7e83 LB |
115 | } |
116 | MI.eraseFromParent(); | |
117 | continue; | |
118 | } | |
970d7e83 LB |
119 | |
120 | case AMDGPU::INTERP_PAIR_XY: { | |
121 | MachineInstr *BMI; | |
122 | unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( | |
123 | MI.getOperand(2).getImm()); | |
124 | ||
125 | for (unsigned Chan = 0; Chan < 4; ++Chan) { | |
126 | unsigned DstReg; | |
127 | ||
128 | if (Chan < 2) | |
129 | DstReg = MI.getOperand(Chan).getReg(); | |
130 | else | |
131 | DstReg = Chan == 2 ? AMDGPU::T0_Z : AMDGPU::T0_W; | |
132 | ||
133 | BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_XY, | |
134 | DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); | |
135 | ||
136 | if (Chan > 0) { | |
137 | BMI->bundleWithPred(); | |
138 | } | |
139 | if (Chan >= 2) | |
140 | TII->addFlag(BMI, 0, MO_FLAG_MASK); | |
141 | if (Chan != 3) | |
142 | TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); | |
143 | } | |
144 | ||
145 | MI.eraseFromParent(); | |
146 | continue; | |
147 | } | |
148 | ||
149 | case AMDGPU::INTERP_PAIR_ZW: { | |
150 | MachineInstr *BMI; | |
151 | unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( | |
152 | MI.getOperand(2).getImm()); | |
153 | ||
154 | for (unsigned Chan = 0; Chan < 4; ++Chan) { | |
155 | unsigned DstReg; | |
156 | ||
157 | if (Chan < 2) | |
158 | DstReg = Chan == 0 ? AMDGPU::T0_X : AMDGPU::T0_Y; | |
159 | else | |
160 | DstReg = MI.getOperand(Chan-2).getReg(); | |
161 | ||
162 | BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_ZW, | |
163 | DstReg, MI.getOperand(3 + (Chan % 2)).getReg(), PReg); | |
164 | ||
165 | if (Chan > 0) { | |
166 | BMI->bundleWithPred(); | |
167 | } | |
168 | if (Chan < 2) | |
169 | TII->addFlag(BMI, 0, MO_FLAG_MASK); | |
170 | if (Chan != 3) | |
171 | TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); | |
172 | } | |
173 | ||
174 | MI.eraseFromParent(); | |
175 | continue; | |
176 | } | |
177 | ||
178 | case AMDGPU::INTERP_VEC_LOAD: { | |
179 | const R600RegisterInfo &TRI = TII->getRegisterInfo(); | |
180 | MachineInstr *BMI; | |
181 | unsigned PReg = AMDGPU::R600_ArrayBaseRegClass.getRegister( | |
182 | MI.getOperand(1).getImm()); | |
183 | unsigned DstReg = MI.getOperand(0).getReg(); | |
184 | ||
185 | for (unsigned Chan = 0; Chan < 4; ++Chan) { | |
186 | BMI = TII->buildDefaultInstruction(MBB, I, AMDGPU::INTERP_LOAD_P0, | |
187 | TRI.getSubReg(DstReg, TRI.getSubRegFromChannel(Chan)), PReg); | |
188 | if (Chan > 0) { | |
189 | BMI->bundleWithPred(); | |
190 | } | |
191 | if (Chan != 3) | |
192 | TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); | |
193 | } | |
194 | ||
195 | MI.eraseFromParent(); | |
196 | continue; | |
197 | } | |
1a4d82fc JJ |
198 | case AMDGPU::DOT_4: { |
199 | ||
200 | const R600RegisterInfo &TRI = TII->getRegisterInfo(); | |
201 | ||
202 | unsigned DstReg = MI.getOperand(0).getReg(); | |
203 | unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; | |
204 | ||
205 | for (unsigned Chan = 0; Chan < 4; ++Chan) { | |
206 | bool Mask = (Chan != TRI.getHWRegChan(DstReg)); | |
207 | unsigned SubDstReg = | |
208 | AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); | |
209 | MachineInstr *BMI = | |
210 | TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); | |
211 | if (Chan > 0) { | |
212 | BMI->bundleWithPred(); | |
213 | } | |
214 | if (Mask) { | |
215 | TII->addFlag(BMI, 0, MO_FLAG_MASK); | |
216 | } | |
217 | if (Chan != 3) | |
218 | TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); | |
219 | unsigned Opcode = BMI->getOpcode(); | |
220 | // While not strictly necessary from hw point of view, we force | |
221 | // all src operands of a dot4 inst to belong to the same slot. | |
222 | unsigned Src0 = BMI->getOperand( | |
223 | TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) | |
224 | .getReg(); | |
225 | unsigned Src1 = BMI->getOperand( | |
226 | TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) | |
227 | .getReg(); | |
228 | (void) Src0; | |
229 | (void) Src1; | |
230 | if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && | |
231 | (TRI.getEncodingValue(Src1) & 0xff) < 127) | |
232 | assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); | |
233 | } | |
234 | MI.eraseFromParent(); | |
235 | continue; | |
236 | } | |
970d7e83 LB |
237 | } |
238 | ||
239 | bool IsReduction = TII->isReductionOp(MI.getOpcode()); | |
240 | bool IsVector = TII->isVector(MI); | |
241 | bool IsCube = TII->isCubeOp(MI.getOpcode()); | |
242 | if (!IsReduction && !IsVector && !IsCube) { | |
243 | continue; | |
244 | } | |
245 | ||
246 | // Expand the instruction | |
247 | // | |
248 | // Reduction instructions: | |
249 | // T0_X = DP4 T1_XYZW, T2_XYZW | |
250 | // becomes: | |
251 | // TO_X = DP4 T1_X, T2_X | |
252 | // TO_Y (write masked) = DP4 T1_Y, T2_Y | |
253 | // TO_Z (write masked) = DP4 T1_Z, T2_Z | |
254 | // TO_W (write masked) = DP4 T1_W, T2_W | |
255 | // | |
256 | // Vector instructions: | |
257 | // T0_X = MULLO_INT T1_X, T2_X | |
258 | // becomes: | |
259 | // T0_X = MULLO_INT T1_X, T2_X | |
260 | // T0_Y (write masked) = MULLO_INT T1_X, T2_X | |
261 | // T0_Z (write masked) = MULLO_INT T1_X, T2_X | |
262 | // T0_W (write masked) = MULLO_INT T1_X, T2_X | |
263 | // | |
264 | // Cube instructions: | |
265 | // T0_XYZW = CUBE T1_XYZW | |
266 | // becomes: | |
267 | // TO_X = CUBE T1_Z, T1_Y | |
268 | // T0_Y = CUBE T1_Z, T1_X | |
269 | // T0_Z = CUBE T1_X, T1_Z | |
270 | // T0_W = CUBE T1_Y, T1_Z | |
271 | for (unsigned Chan = 0; Chan < 4; Chan++) { | |
272 | unsigned DstReg = MI.getOperand( | |
1a4d82fc | 273 | TII->getOperandIdx(MI, AMDGPU::OpName::dst)).getReg(); |
970d7e83 | 274 | unsigned Src0 = MI.getOperand( |
1a4d82fc | 275 | TII->getOperandIdx(MI, AMDGPU::OpName::src0)).getReg(); |
970d7e83 LB |
276 | unsigned Src1 = 0; |
277 | ||
278 | // Determine the correct source registers | |
279 | if (!IsCube) { | |
1a4d82fc | 280 | int Src1Idx = TII->getOperandIdx(MI, AMDGPU::OpName::src1); |
970d7e83 LB |
281 | if (Src1Idx != -1) { |
282 | Src1 = MI.getOperand(Src1Idx).getReg(); | |
283 | } | |
284 | } | |
285 | if (IsReduction) { | |
286 | unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); | |
287 | Src0 = TRI.getSubReg(Src0, SubRegIndex); | |
288 | Src1 = TRI.getSubReg(Src1, SubRegIndex); | |
289 | } else if (IsCube) { | |
290 | static const int CubeSrcSwz[] = {2, 2, 0, 1}; | |
291 | unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); | |
292 | unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); | |
293 | Src1 = TRI.getSubReg(Src0, SubRegIndex1); | |
294 | Src0 = TRI.getSubReg(Src0, SubRegIndex0); | |
295 | } | |
296 | ||
297 | // Determine the correct destination registers; | |
298 | bool Mask = false; | |
299 | bool NotLast = true; | |
300 | if (IsCube) { | |
301 | unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); | |
302 | DstReg = TRI.getSubReg(DstReg, SubRegIndex); | |
303 | } else { | |
304 | // Mask the write if the original instruction does not write to | |
305 | // the current Channel. | |
306 | Mask = (Chan != TRI.getHWRegChan(DstReg)); | |
307 | unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; | |
308 | DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); | |
309 | } | |
310 | ||
311 | // Set the IsLast bit | |
312 | NotLast = (Chan != 3 ); | |
313 | ||
314 | // Add the new instruction | |
315 | unsigned Opcode = MI.getOpcode(); | |
316 | switch (Opcode) { | |
317 | case AMDGPU::CUBE_r600_pseudo: | |
318 | Opcode = AMDGPU::CUBE_r600_real; | |
319 | break; | |
320 | case AMDGPU::CUBE_eg_pseudo: | |
321 | Opcode = AMDGPU::CUBE_eg_real; | |
322 | break; | |
970d7e83 LB |
323 | default: |
324 | break; | |
325 | } | |
326 | ||
327 | MachineInstr *NewMI = | |
328 | TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); | |
329 | ||
330 | if (Chan != 0) | |
331 | NewMI->bundleWithPred(); | |
332 | if (Mask) { | |
333 | TII->addFlag(NewMI, 0, MO_FLAG_MASK); | |
334 | } | |
335 | if (NotLast) { | |
336 | TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); | |
337 | } | |
1a4d82fc JJ |
338 | SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::clamp); |
339 | SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::literal); | |
340 | SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_abs); | |
341 | SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_abs); | |
342 | SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src0_neg); | |
343 | SetFlagInNewMI(NewMI, &MI, AMDGPU::OpName::src1_neg); | |
970d7e83 LB |
344 | } |
345 | MI.eraseFromParent(); | |
346 | } | |
347 | } | |
348 | return false; | |
349 | } |