1 //===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// \brief Custom DAG lowering for R600
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "R600ISelLowering.h"
16 #include "AMDGPUFrameLowering.h"
17 #include "AMDGPUIntrinsicInfo.h"
18 #include "AMDGPUSubtarget.h"
19 #include "R600Defines.h"
20 #include "R600InstrInfo.h"
21 #include "R600MachineFunctionInfo.h"
22 #include "llvm/Analysis/ValueTracking.h"
23 #include "llvm/CodeGen/CallingConvLower.h"
24 #include "llvm/CodeGen/MachineFrameInfo.h"
25 #include "llvm/CodeGen/MachineInstrBuilder.h"
26 #include "llvm/CodeGen/MachineRegisterInfo.h"
27 #include "llvm/CodeGen/SelectionDAG.h"
28 #include "llvm/IR/Argument.h"
29 #include "llvm/IR/Function.h"
30
31 using namespace llvm;
32
33 R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
34 AMDGPUTargetLowering(TM),
35 Gen(TM.getSubtarget<AMDGPUSubtarget>().getGeneration()) {
36 addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass);
37 addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass);
38 addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass);
39 addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass);
40 addRegisterClass(MVT::v2f32, &AMDGPU::R600_Reg64RegClass);
41 addRegisterClass(MVT::v2i32, &AMDGPU::R600_Reg64RegClass);
42
43 computeRegisterProperties();
44
45 // Set condition code actions
46 setCondCodeAction(ISD::SETO, MVT::f32, Expand);
47 setCondCodeAction(ISD::SETUO, MVT::f32, Expand);
48 setCondCodeAction(ISD::SETLT, MVT::f32, Expand);
49 setCondCodeAction(ISD::SETLE, MVT::f32, Expand);
50 setCondCodeAction(ISD::SETOLT, MVT::f32, Expand);
51 setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
52 setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
53 setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
54 setCondCodeAction(ISD::SETUGE, MVT::f32, Expand);
55 setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
56 setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
57 setCondCodeAction(ISD::SETULE, MVT::f32, Expand);
58
59 setCondCodeAction(ISD::SETLE, MVT::i32, Expand);
60 setCondCodeAction(ISD::SETLT, MVT::i32, Expand);
61 setCondCodeAction(ISD::SETULE, MVT::i32, Expand);
62 setCondCodeAction(ISD::SETULT, MVT::i32, Expand);
63
64 setOperationAction(ISD::FCOS, MVT::f32, Custom);
65 setOperationAction(ISD::FSIN, MVT::f32, Custom);
66
67 setOperationAction(ISD::SETCC, MVT::v4i32, Expand);
68 setOperationAction(ISD::SETCC, MVT::v2i32, Expand);
69
70 setOperationAction(ISD::BR_CC, MVT::i32, Expand);
71 setOperationAction(ISD::BR_CC, MVT::f32, Expand);
72 setOperationAction(ISD::BRCOND, MVT::Other, Custom);
73
74 setOperationAction(ISD::FSUB, MVT::f32, Expand);
75
76 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
77 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
78 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom);
79
80 setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
81 setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
82
83 setOperationAction(ISD::SETCC, MVT::i32, Expand);
84 setOperationAction(ISD::SETCC, MVT::f32, Expand);
85 setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom);
86 setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
87 setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
88
89 setOperationAction(ISD::SELECT, MVT::i32, Expand);
90 setOperationAction(ISD::SELECT, MVT::f32, Expand);
91 setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
92 setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
93
94 // Expand sign extension of vectors
95 if (!Subtarget->hasBFE())
96 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
97
98 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
99 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
100
101 if (!Subtarget->hasBFE())
102 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
103 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
104 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
105
106 if (!Subtarget->hasBFE())
107 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
108 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
109 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
110
111 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
112 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
113 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
114
115 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
116
117
118 // Legalize loads and stores to the private address space.
119 setOperationAction(ISD::LOAD, MVT::i32, Custom);
120 setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
121 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
122
123 // EXTLOAD should be the same as ZEXTLOAD. It is legal for some address
124 // spaces, so it is custom lowered to handle those where it isn't.
125 for (MVT VT : MVT::integer_valuetypes()) {
126 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
127 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Custom);
128 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i16, Custom);
129
130 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
131 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i8, Custom);
132 setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i16, Custom);
133
134 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
135 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i8, Custom);
136 setLoadExtAction(ISD::EXTLOAD, VT, MVT::i16, Custom);
137 }
138
139 setOperationAction(ISD::STORE, MVT::i8, Custom);
140 setOperationAction(ISD::STORE, MVT::i32, Custom);
141 setOperationAction(ISD::STORE, MVT::v2i32, Custom);
142 setOperationAction(ISD::STORE, MVT::v4i32, Custom);
143 setTruncStoreAction(MVT::i32, MVT::i8, Custom);
144 setTruncStoreAction(MVT::i32, MVT::i16, Custom);
145
146 setOperationAction(ISD::LOAD, MVT::i32, Custom);
147 setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
148 setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
149
150 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
151 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
152 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
153 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
154
155 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
156 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
157 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
158 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
159
160 setTargetDAGCombine(ISD::FP_ROUND);
161 setTargetDAGCombine(ISD::FP_TO_SINT);
162 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
163 setTargetDAGCombine(ISD::SELECT_CC);
164 setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
165
166 setOperationAction(ISD::SUB, MVT::i64, Expand);
167
168 // These should be replaced by UDIVREM, but the replacement does not happen
169 // automatically during type legalization.
170 setOperationAction(ISD::UDIV, MVT::i64, Custom);
171 setOperationAction(ISD::UREM, MVT::i64, Custom);
172 setOperationAction(ISD::SDIV, MVT::i64, Custom);
173 setOperationAction(ISD::SREM, MVT::i64, Custom);
174
175 // We don't have 64-bit shifts. Thus we need either SHX i64 or SHX_PARTS i32
176 // to be Legal/Custom in order to avoid library calls.
177 setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
178 setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
179 setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
180
181 setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
182
183 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
184 for (MVT VT : ScalarIntVTs) {
185 setOperationAction(ISD::ADDC, VT, Expand);
186 setOperationAction(ISD::SUBC, VT, Expand);
187 setOperationAction(ISD::ADDE, VT, Expand);
188 setOperationAction(ISD::SUBE, VT, Expand);
189 }
190
191 setSchedulingPreference(Sched::Source);
192 }
193
194 MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
195 MachineInstr * MI, MachineBasicBlock * BB) const {
196 MachineFunction * MF = BB->getParent();
197 MachineRegisterInfo &MRI = MF->getRegInfo();
198 MachineBasicBlock::iterator I = *MI;
199 const R600InstrInfo *TII =
200 static_cast<const R600InstrInfo *>(MF->getSubtarget().getInstrInfo());
201
202 switch (MI->getOpcode()) {
203 default:
204 // Replace LDS_*_RET instructions that don't have any uses with the
205 // equivalent LDS_*_NORET instruction.
206 if (TII->isLDSRetInstr(MI->getOpcode())) {
207 int DstIdx = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst);
208 assert(DstIdx != -1);
209 MachineInstrBuilder NewMI;
210 // FIXME: getLDSNoRetOp method only handles LDS_1A1D LDS ops. Add
211 // LDS_1A2D support and remove this special case.
212 if (!MRI.use_empty(MI->getOperand(DstIdx).getReg()) ||
213 MI->getOpcode() == AMDGPU::LDS_CMPST_RET)
214 return BB;
215
216 NewMI = BuildMI(*BB, I, BB->findDebugLoc(I),
217 TII->get(AMDGPU::getLDSNoRetOp(MI->getOpcode())));
218 for (unsigned i = 1, e = MI->getNumOperands(); i < e; ++i) {
219 NewMI.addOperand(MI->getOperand(i));
220 }
221 } else {
222 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
223 }
224 break;
225 case AMDGPU::CLAMP_R600: {
226 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
227 AMDGPU::MOV,
228 MI->getOperand(0).getReg(),
229 MI->getOperand(1).getReg());
230 TII->addFlag(NewMI, 0, MO_FLAG_CLAMP);
231 break;
232 }
233
234 case AMDGPU::FABS_R600: {
235 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
236 AMDGPU::MOV,
237 MI->getOperand(0).getReg(),
238 MI->getOperand(1).getReg());
239 TII->addFlag(NewMI, 0, MO_FLAG_ABS);
240 break;
241 }
242
243 case AMDGPU::FNEG_R600: {
244 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I,
245 AMDGPU::MOV,
246 MI->getOperand(0).getReg(),
247 MI->getOperand(1).getReg());
248 TII->addFlag(NewMI, 0, MO_FLAG_NEG);
249 break;
250 }
251
252 case AMDGPU::MASK_WRITE: {
253 unsigned maskedRegister = MI->getOperand(0).getReg();
254 assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
255 MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
256 TII->addFlag(defInstr, 0, MO_FLAG_MASK);
257 break;
258 }
259
260 case AMDGPU::MOV_IMM_F32:
261 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
262 MI->getOperand(1).getFPImm()->getValueAPF()
263 .bitcastToAPInt().getZExtValue());
264 break;
265 case AMDGPU::MOV_IMM_I32:
266 TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(),
267 MI->getOperand(1).getImm());
268 break;
269 case AMDGPU::CONST_COPY: {
270 MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, MI, AMDGPU::MOV,
271 MI->getOperand(0).getReg(), AMDGPU::ALU_CONST);
272 TII->setImmOperand(NewMI, AMDGPU::OpName::src0_sel,
273 MI->getOperand(1).getImm());
274 break;
275 }
276
277 case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
278 case AMDGPU::RAT_WRITE_CACHELESS_64_eg:
279 case AMDGPU::RAT_WRITE_CACHELESS_128_eg: {
280 unsigned EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
281
282 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
283 .addOperand(MI->getOperand(0))
284 .addOperand(MI->getOperand(1))
285 .addImm(EOP); // Set End of program bit
286 break;
287 }
288
289 case AMDGPU::TXD: {
290 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
291 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
292 MachineOperand &RID = MI->getOperand(4);
293 MachineOperand &SID = MI->getOperand(5);
294 unsigned TextureId = MI->getOperand(6).getImm();
295 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
296 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
297
298 switch (TextureId) {
299 case 5: // Rect
300 CTX = CTY = 0;
301 break;
302 case 6: // Shadow1D
303 SrcW = SrcZ;
304 break;
305 case 7: // Shadow2D
306 SrcW = SrcZ;
307 break;
308 case 8: // ShadowRect
309 CTX = CTY = 0;
310 SrcW = SrcZ;
311 break;
312 case 9: // 1DArray
313 SrcZ = SrcY;
314 CTZ = 0;
315 break;
316 case 10: // 2DArray
317 CTZ = 0;
318 break;
319 case 11: // Shadow1DArray
320 SrcZ = SrcY;
321 CTZ = 0;
322 break;
323 case 12: // Shadow2DArray
324 CTZ = 0;
325 break;
326 }
327 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
328 .addOperand(MI->getOperand(3))
329 .addImm(SrcX)
330 .addImm(SrcY)
331 .addImm(SrcZ)
332 .addImm(SrcW)
333 .addImm(0)
334 .addImm(0)
335 .addImm(0)
336 .addImm(0)
337 .addImm(1)
338 .addImm(2)
339 .addImm(3)
340 .addOperand(RID)
341 .addOperand(SID)
342 .addImm(CTX)
343 .addImm(CTY)
344 .addImm(CTZ)
345 .addImm(CTW);
346 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
347 .addOperand(MI->getOperand(2))
348 .addImm(SrcX)
349 .addImm(SrcY)
350 .addImm(SrcZ)
351 .addImm(SrcW)
352 .addImm(0)
353 .addImm(0)
354 .addImm(0)
355 .addImm(0)
356 .addImm(1)
357 .addImm(2)
358 .addImm(3)
359 .addOperand(RID)
360 .addOperand(SID)
361 .addImm(CTX)
362 .addImm(CTY)
363 .addImm(CTZ)
364 .addImm(CTW);
365 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G))
366 .addOperand(MI->getOperand(0))
367 .addOperand(MI->getOperand(1))
368 .addImm(SrcX)
369 .addImm(SrcY)
370 .addImm(SrcZ)
371 .addImm(SrcW)
372 .addImm(0)
373 .addImm(0)
374 .addImm(0)
375 .addImm(0)
376 .addImm(1)
377 .addImm(2)
378 .addImm(3)
379 .addOperand(RID)
380 .addOperand(SID)
381 .addImm(CTX)
382 .addImm(CTY)
383 .addImm(CTZ)
384 .addImm(CTW)
385 .addReg(T0, RegState::Implicit)
386 .addReg(T1, RegState::Implicit);
387 break;
388 }
389
390 case AMDGPU::TXD_SHADOW: {
391 unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
392 unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass);
393 MachineOperand &RID = MI->getOperand(4);
394 MachineOperand &SID = MI->getOperand(5);
395 unsigned TextureId = MI->getOperand(6).getImm();
396 unsigned SrcX = 0, SrcY = 1, SrcZ = 2, SrcW = 3;
397 unsigned CTX = 1, CTY = 1, CTZ = 1, CTW = 1;
398
399 switch (TextureId) {
400 case 5: // Rect
401 CTX = CTY = 0;
402 break;
403 case 6: // Shadow1D
404 SrcW = SrcZ;
405 break;
406 case 7: // Shadow2D
407 SrcW = SrcZ;
408 break;
409 case 8: // ShadowRect
410 CTX = CTY = 0;
411 SrcW = SrcZ;
412 break;
413 case 9: // 1DArray
414 SrcZ = SrcY;
415 CTZ = 0;
416 break;
417 case 10: // 2DArray
418 CTZ = 0;
419 break;
420 case 11: // Shadow1DArray
421 SrcZ = SrcY;
422 CTZ = 0;
423 break;
424 case 12: // Shadow2DArray
425 CTZ = 0;
426 break;
427 }
428
429 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0)
430 .addOperand(MI->getOperand(3))
431 .addImm(SrcX)
432 .addImm(SrcY)
433 .addImm(SrcZ)
434 .addImm(SrcW)
435 .addImm(0)
436 .addImm(0)
437 .addImm(0)
438 .addImm(0)
439 .addImm(1)
440 .addImm(2)
441 .addImm(3)
442 .addOperand(RID)
443 .addOperand(SID)
444 .addImm(CTX)
445 .addImm(CTY)
446 .addImm(CTZ)
447 .addImm(CTW);
448 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1)
449 .addOperand(MI->getOperand(2))
450 .addImm(SrcX)
451 .addImm(SrcY)
452 .addImm(SrcZ)
453 .addImm(SrcW)
454 .addImm(0)
455 .addImm(0)
456 .addImm(0)
457 .addImm(0)
458 .addImm(1)
459 .addImm(2)
460 .addImm(3)
461 .addOperand(RID)
462 .addOperand(SID)
463 .addImm(CTX)
464 .addImm(CTY)
465 .addImm(CTZ)
466 .addImm(CTW);
467 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G))
468 .addOperand(MI->getOperand(0))
469 .addOperand(MI->getOperand(1))
470 .addImm(SrcX)
471 .addImm(SrcY)
472 .addImm(SrcZ)
473 .addImm(SrcW)
474 .addImm(0)
475 .addImm(0)
476 .addImm(0)
477 .addImm(0)
478 .addImm(1)
479 .addImm(2)
480 .addImm(3)
481 .addOperand(RID)
482 .addOperand(SID)
483 .addImm(CTX)
484 .addImm(CTY)
485 .addImm(CTZ)
486 .addImm(CTW)
487 .addReg(T0, RegState::Implicit)
488 .addReg(T1, RegState::Implicit);
489 break;
490 }
491
492 case AMDGPU::BRANCH:
493 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP))
494 .addOperand(MI->getOperand(0));
495 break;
496
497 case AMDGPU::BRANCH_COND_f32: {
498 MachineInstr *NewMI =
499 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
500 AMDGPU::PREDICATE_BIT)
501 .addOperand(MI->getOperand(1))
502 .addImm(OPCODE_IS_NOT_ZERO)
503 .addImm(0); // Flags
504 TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
505 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
506 .addOperand(MI->getOperand(0))
507 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
508 break;
509 }
510
511 case AMDGPU::BRANCH_COND_i32: {
512 MachineInstr *NewMI =
513 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X),
514 AMDGPU::PREDICATE_BIT)
515 .addOperand(MI->getOperand(1))
516 .addImm(OPCODE_IS_NOT_ZERO_INT)
517 .addImm(0); // Flags
518 TII->addFlag(NewMI, 0, MO_FLAG_PUSH);
519 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP_COND))
520 .addOperand(MI->getOperand(0))
521 .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
522 break;
523 }
524
525 case AMDGPU::EG_ExportSwz:
526 case AMDGPU::R600_ExportSwz: {
527 // The instruction is left unmodified if it's not the last one of its type.
528 bool isLastInstructionOfItsType = true;
529 unsigned InstExportType = MI->getOperand(1).getImm();
530 for (MachineBasicBlock::iterator NextExportInst = std::next(I),
531 EndBlock = BB->end(); NextExportInst != EndBlock;
532 NextExportInst = std::next(NextExportInst)) {
533 if (NextExportInst->getOpcode() == AMDGPU::EG_ExportSwz ||
534 NextExportInst->getOpcode() == AMDGPU::R600_ExportSwz) {
535 unsigned CurrentInstExportType = NextExportInst->getOperand(1)
536 .getImm();
537 if (CurrentInstExportType == InstExportType) {
538 isLastInstructionOfItsType = false;
539 break;
540 }
541 }
542 }
543 bool EOP = (std::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
544 if (!EOP && !isLastInstructionOfItsType)
545 return BB;
546 unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz)? 84 : 40;
547 BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
548 .addOperand(MI->getOperand(0))
549 .addOperand(MI->getOperand(1))
550 .addOperand(MI->getOperand(2))
551 .addOperand(MI->getOperand(3))
552 .addOperand(MI->getOperand(4))
553 .addOperand(MI->getOperand(5))
554 .addOperand(MI->getOperand(6))
555 .addImm(CfInst)
556 .addImm(EOP);
557 break;
558 }
559 case AMDGPU::RETURN: {
560 // RETURN instructions must have the live-out registers as implicit uses,
561 // otherwise they appear dead.
562 R600MachineFunctionInfo *MFI = MF->getInfo<R600MachineFunctionInfo>();
563 MachineInstrBuilder MIB(*MF, MI);
564 for (unsigned i = 0, e = MFI->LiveOuts.size(); i != e; ++i)
565 MIB.addReg(MFI->LiveOuts[i], RegState::Implicit);
566 return BB;
567 }
568 }
569
570 MI->eraseFromParent();
571 return BB;
572 }
573
574 //===----------------------------------------------------------------------===//
575 // Custom DAG Lowering Operations
576 //===----------------------------------------------------------------------===//
577
578 SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
579 MachineFunction &MF = DAG.getMachineFunction();
580 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
581 switch (Op.getOpcode()) {
582 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
583 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
584 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
585 case ISD::SHL_PARTS: return LowerSHLParts(Op, DAG);
586 case ISD::SRA_PARTS:
587 case ISD::SRL_PARTS: return LowerSRXParts(Op, DAG);
588 case ISD::FCOS:
589 case ISD::FSIN: return LowerTrig(Op, DAG);
590 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
591 case ISD::STORE: return LowerSTORE(Op, DAG);
592 case ISD::LOAD: {
593 SDValue Result = LowerLOAD(Op, DAG);
594 assert((!Result.getNode() ||
595 Result.getNode()->getNumValues() == 2) &&
596 "Load should return a value and a chain");
597 return Result;
598 }
599
600 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
601 case ISD::GlobalAddress: return LowerGlobalAddress(MFI, Op, DAG);
602 case ISD::INTRINSIC_VOID: {
603 SDValue Chain = Op.getOperand(0);
604 unsigned IntrinsicID =
605 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
606 switch (IntrinsicID) {
607 case AMDGPUIntrinsic::AMDGPU_store_output: {
608 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
609 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
610 MFI->LiveOuts.push_back(Reg);
611 return DAG.getCopyToReg(Chain, SDLoc(Op), Reg, Op.getOperand(2));
612 }
613 case AMDGPUIntrinsic::R600_store_swizzle: {
614 const SDValue Args[8] = {
615 Chain,
616 Op.getOperand(2), // Export Value
617 Op.getOperand(3), // ArrayBase
618 Op.getOperand(4), // Type
619 DAG.getConstant(0, MVT::i32), // SWZ_X
620 DAG.getConstant(1, MVT::i32), // SWZ_Y
621 DAG.getConstant(2, MVT::i32), // SWZ_Z
622 DAG.getConstant(3, MVT::i32) // SWZ_W
623 };
624 return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
625 }
626
627 // default for switch(IntrinsicID)
628 default: break;
629 }
630 // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
631 break;
632 }
633 case ISD::INTRINSIC_WO_CHAIN: {
634 unsigned IntrinsicID =
635 cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
636 EVT VT = Op.getValueType();
637 SDLoc DL(Op);
638 switch(IntrinsicID) {
639 default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
640 case AMDGPUIntrinsic::R600_load_input: {
641 int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
642 unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
643 MachineFunction &MF = DAG.getMachineFunction();
644 MachineRegisterInfo &MRI = MF.getRegInfo();
645 MRI.addLiveIn(Reg);
646 return DAG.getCopyFromReg(DAG.getEntryNode(),
647 SDLoc(DAG.getEntryNode()), Reg, VT);
648 }
649
650 case AMDGPUIntrinsic::R600_interp_input: {
651 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
652 int ijb = cast<ConstantSDNode>(Op.getOperand(2))->getSExtValue();
653 MachineSDNode *interp;
654 if (ijb < 0) {
655 const MachineFunction &MF = DAG.getMachineFunction();
656 const R600InstrInfo *TII = static_cast<const R600InstrInfo *>(
657 MF.getSubtarget().getInstrInfo());
658 interp = DAG.getMachineNode(AMDGPU::INTERP_VEC_LOAD, DL,
659 MVT::v4f32, DAG.getTargetConstant(slot / 4 , MVT::i32));
660 return DAG.getTargetExtractSubreg(
661 TII->getRegisterInfo().getSubRegFromChannel(slot % 4),
662 DL, MVT::f32, SDValue(interp, 0));
663 }
664 MachineFunction &MF = DAG.getMachineFunction();
665 MachineRegisterInfo &MRI = MF.getRegInfo();
666 unsigned RegisterI = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb);
667 unsigned RegisterJ = AMDGPU::R600_TReg32RegClass.getRegister(2 * ijb + 1);
668 MRI.addLiveIn(RegisterI);
669 MRI.addLiveIn(RegisterJ);
670 SDValue RegisterINode = DAG.getCopyFromReg(DAG.getEntryNode(),
671 SDLoc(DAG.getEntryNode()), RegisterI, MVT::f32);
672 SDValue RegisterJNode = DAG.getCopyFromReg(DAG.getEntryNode(),
673 SDLoc(DAG.getEntryNode()), RegisterJ, MVT::f32);
674
675 if (slot % 4 < 2)
676 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
677 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
678 RegisterJNode, RegisterINode);
679 else
680 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
681 MVT::f32, MVT::f32, DAG.getTargetConstant(slot / 4 , MVT::i32),
682 RegisterJNode, RegisterINode);
683 return SDValue(interp, slot % 2);
684 }
685 case AMDGPUIntrinsic::R600_interp_xy:
686 case AMDGPUIntrinsic::R600_interp_zw: {
687 int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
688 MachineSDNode *interp;
689 SDValue RegisterINode = Op.getOperand(2);
690 SDValue RegisterJNode = Op.getOperand(3);
691
692 if (IntrinsicID == AMDGPUIntrinsic::R600_interp_xy)
693 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_XY, DL,
694 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
695 RegisterJNode, RegisterINode);
696 else
697 interp = DAG.getMachineNode(AMDGPU::INTERP_PAIR_ZW, DL,
698 MVT::f32, MVT::f32, DAG.getTargetConstant(slot, MVT::i32),
699 RegisterJNode, RegisterINode);
700 return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32,
701 SDValue(interp, 0), SDValue(interp, 1));
702 }
703 case AMDGPUIntrinsic::R600_tex:
704 case AMDGPUIntrinsic::R600_texc:
705 case AMDGPUIntrinsic::R600_txl:
706 case AMDGPUIntrinsic::R600_txlc:
707 case AMDGPUIntrinsic::R600_txb:
708 case AMDGPUIntrinsic::R600_txbc:
709 case AMDGPUIntrinsic::R600_txf:
710 case AMDGPUIntrinsic::R600_txq:
711 case AMDGPUIntrinsic::R600_ddx:
712 case AMDGPUIntrinsic::R600_ddy:
713 case AMDGPUIntrinsic::R600_ldptr: {
714 unsigned TextureOp;
715 switch (IntrinsicID) {
716 case AMDGPUIntrinsic::R600_tex:
717 TextureOp = 0;
718 break;
719 case AMDGPUIntrinsic::R600_texc:
720 TextureOp = 1;
721 break;
722 case AMDGPUIntrinsic::R600_txl:
723 TextureOp = 2;
724 break;
725 case AMDGPUIntrinsic::R600_txlc:
726 TextureOp = 3;
727 break;
728 case AMDGPUIntrinsic::R600_txb:
729 TextureOp = 4;
730 break;
731 case AMDGPUIntrinsic::R600_txbc:
732 TextureOp = 5;
733 break;
734 case AMDGPUIntrinsic::R600_txf:
735 TextureOp = 6;
736 break;
737 case AMDGPUIntrinsic::R600_txq:
738 TextureOp = 7;
739 break;
740 case AMDGPUIntrinsic::R600_ddx:
741 TextureOp = 8;
742 break;
743 case AMDGPUIntrinsic::R600_ddy:
744 TextureOp = 9;
745 break;
746 case AMDGPUIntrinsic::R600_ldptr:
747 TextureOp = 10;
748 break;
749 default:
750 llvm_unreachable("Unknown Texture Operation");
751 }
752
753 SDValue TexArgs[19] = {
754 DAG.getConstant(TextureOp, MVT::i32),
755 Op.getOperand(1),
756 DAG.getConstant(0, MVT::i32),
757 DAG.getConstant(1, MVT::i32),
758 DAG.getConstant(2, MVT::i32),
759 DAG.getConstant(3, MVT::i32),
760 Op.getOperand(2),
761 Op.getOperand(3),
762 Op.getOperand(4),
763 DAG.getConstant(0, MVT::i32),
764 DAG.getConstant(1, MVT::i32),
765 DAG.getConstant(2, MVT::i32),
766 DAG.getConstant(3, MVT::i32),
767 Op.getOperand(5),
768 Op.getOperand(6),
769 Op.getOperand(7),
770 Op.getOperand(8),
771 Op.getOperand(9),
772 Op.getOperand(10)
773 };
774 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
775 }
776 case AMDGPUIntrinsic::AMDGPU_dp4: {
777 SDValue Args[8] = {
778 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
779 DAG.getConstant(0, MVT::i32)),
780 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
781 DAG.getConstant(0, MVT::i32)),
782 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
783 DAG.getConstant(1, MVT::i32)),
784 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
785 DAG.getConstant(1, MVT::i32)),
786 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
787 DAG.getConstant(2, MVT::i32)),
788 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
789 DAG.getConstant(2, MVT::i32)),
790 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
791 DAG.getConstant(3, MVT::i32)),
792 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
793 DAG.getConstant(3, MVT::i32))
794 };
795 return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
796 }
797
798 case Intrinsic::r600_read_ngroups_x:
799 return LowerImplicitParameter(DAG, VT, DL, 0);
800 case Intrinsic::r600_read_ngroups_y:
801 return LowerImplicitParameter(DAG, VT, DL, 1);
802 case Intrinsic::r600_read_ngroups_z:
803 return LowerImplicitParameter(DAG, VT, DL, 2);
804 case Intrinsic::r600_read_global_size_x:
805 return LowerImplicitParameter(DAG, VT, DL, 3);
806 case Intrinsic::r600_read_global_size_y:
807 return LowerImplicitParameter(DAG, VT, DL, 4);
808 case Intrinsic::r600_read_global_size_z:
809 return LowerImplicitParameter(DAG, VT, DL, 5);
810 case Intrinsic::r600_read_local_size_x:
811 return LowerImplicitParameter(DAG, VT, DL, 6);
812 case Intrinsic::r600_read_local_size_y:
813 return LowerImplicitParameter(DAG, VT, DL, 7);
814 case Intrinsic::r600_read_local_size_z:
815 return LowerImplicitParameter(DAG, VT, DL, 8);
816
817 case Intrinsic::AMDGPU_read_workdim:
818 return LowerImplicitParameter(DAG, VT, DL, MFI->ABIArgOffset / 4);
819
820 case Intrinsic::r600_read_tgid_x:
821 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
822 AMDGPU::T1_X, VT);
823 case Intrinsic::r600_read_tgid_y:
824 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
825 AMDGPU::T1_Y, VT);
826 case Intrinsic::r600_read_tgid_z:
827 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
828 AMDGPU::T1_Z, VT);
829 case Intrinsic::r600_read_tidig_x:
830 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
831 AMDGPU::T0_X, VT);
832 case Intrinsic::r600_read_tidig_y:
833 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
834 AMDGPU::T0_Y, VT);
835 case Intrinsic::r600_read_tidig_z:
836 return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass,
837 AMDGPU::T0_Z, VT);
838 case Intrinsic::AMDGPU_rsq:
839 // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior.
840 return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
841 }
842 // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
843 break;
844 }
845 } // end switch(Op.getOpcode())
846 return SDValue();
847 }
848
849 void R600TargetLowering::ReplaceNodeResults(SDNode *N,
850 SmallVectorImpl<SDValue> &Results,
851 SelectionDAG &DAG) const {
852 switch (N->getOpcode()) {
853 default:
854 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
855 return;
856 case ISD::FP_TO_UINT:
857 if (N->getValueType(0) == MVT::i1) {
858 Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG));
859 return;
860 }
861 // Fall-through. Since we don't care about out of bounds values
862 // we can use FP_TO_SINT for uints too. The DAGLegalizer code for uint
863 // considers some extra cases which are not necessary here.
864 case ISD::FP_TO_SINT: {
865 SDValue Result;
866 if (expandFP_TO_SINT(N, Result, DAG))
867 Results.push_back(Result);
868 return;
869 }
870 case ISD::UDIV: {
871 SDValue Op = SDValue(N, 0);
872 SDLoc DL(Op);
873 EVT VT = Op.getValueType();
874 SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
875 N->getOperand(0), N->getOperand(1));
876 Results.push_back(UDIVREM);
877 break;
878 }
879 case ISD::UREM: {
880 SDValue Op = SDValue(N, 0);
881 SDLoc DL(Op);
882 EVT VT = Op.getValueType();
883 SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
884 N->getOperand(0), N->getOperand(1));
885 Results.push_back(UDIVREM.getValue(1));
886 break;
887 }
888 case ISD::SDIV: {
889 SDValue Op = SDValue(N, 0);
890 SDLoc DL(Op);
891 EVT VT = Op.getValueType();
892 SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
893 N->getOperand(0), N->getOperand(1));
894 Results.push_back(SDIVREM);
895 break;
896 }
897 case ISD::SREM: {
898 SDValue Op = SDValue(N, 0);
899 SDLoc DL(Op);
900 EVT VT = Op.getValueType();
901 SDValue SDIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(VT, VT),
902 N->getOperand(0), N->getOperand(1));
903 Results.push_back(SDIVREM.getValue(1));
904 break;
905 }
906 case ISD::SDIVREM: {
907 SDValue Op = SDValue(N, 1);
908 SDValue RES = LowerSDIVREM(Op, DAG);
909 Results.push_back(RES);
910 Results.push_back(RES.getValue(1));
911 break;
912 }
913 case ISD::UDIVREM: {
914 SDValue Op = SDValue(N, 0);
915 LowerUDIVREM64(Op, DAG, Results);
916 break;
917 }
918 }
919 }
920
921 SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
922 SDValue Vector) const {
923
924 SDLoc DL(Vector);
925 EVT VecVT = Vector.getValueType();
926 EVT EltVT = VecVT.getVectorElementType();
927 SmallVector<SDValue, 8> Args;
928
929 for (unsigned i = 0, e = VecVT.getVectorNumElements();
930 i != e; ++i) {
931 Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
932 Vector, DAG.getConstant(i, getVectorIdxTy())));
933 }
934
935 return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
936 }
937
938 SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
939 SelectionDAG &DAG) const {
940
941 SDLoc DL(Op);
942 SDValue Vector = Op.getOperand(0);
943 SDValue Index = Op.getOperand(1);
944
945 if (isa<ConstantSDNode>(Index) ||
946 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
947 return Op;
948
949 Vector = vectorToVerticalVector(DAG, Vector);
950 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
951 Vector, Index);
952 }
953
954 SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
955 SelectionDAG &DAG) const {
956 SDLoc DL(Op);
957 SDValue Vector = Op.getOperand(0);
958 SDValue Value = Op.getOperand(1);
959 SDValue Index = Op.getOperand(2);
960
961 if (isa<ConstantSDNode>(Index) ||
962 Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
963 return Op;
964
965 Vector = vectorToVerticalVector(DAG, Vector);
966 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
967 Vector, Value, Index);
968 return vectorToVerticalVector(DAG, Insert);
969 }
970
971 SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
972 // On hw >= R700, the COS/SIN input must be between -1.0 and 1.0.
973 // Thus we lower them to TRIG(FRACT(x / (2 * Pi) + 0.5) - 0.5).
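// The constant 0.15915494309 below approximates 1 / (2 * Pi); adding 0.5 before
// FRACT and subtracting it afterwards keeps the wrapped result centered in
// [-0.5, 0.5).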
974 EVT VT = Op.getValueType();
975 SDValue Arg = Op.getOperand(0);
976 SDValue FractPart = DAG.getNode(AMDGPUISD::FRACT, SDLoc(Op), VT,
977 DAG.getNode(ISD::FADD, SDLoc(Op), VT,
978 DAG.getNode(ISD::FMUL, SDLoc(Op), VT, Arg,
979 DAG.getConstantFP(0.15915494309, MVT::f32)),
980 DAG.getConstantFP(0.5, MVT::f32)));
981 unsigned TrigNode;
982 switch (Op.getOpcode()) {
983 case ISD::FCOS:
984 TrigNode = AMDGPUISD::COS_HW;
985 break;
986 case ISD::FSIN:
987 TrigNode = AMDGPUISD::SIN_HW;
988 break;
989 default:
990 llvm_unreachable("Wrong trig opcode");
991 }
992 SDValue TrigVal = DAG.getNode(TrigNode, SDLoc(Op), VT,
993 DAG.getNode(ISD::FADD, SDLoc(Op), VT, FractPart,
994 DAG.getConstantFP(-0.5, MVT::f32)));
995 if (Gen >= AMDGPUSubtarget::R700)
996 return TrigVal;
997 // On R600 hw, COS/SIN input must be between -Pi and Pi.
998 return DAG.getNode(ISD::FMUL, SDLoc(Op), VT, TrigVal,
999 DAG.getConstantFP(3.14159265359, MVT::f32));
1000 }
1001
1002 SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
1003 SDLoc DL(Op);
1004 EVT VT = Op.getValueType();
1005
1006 SDValue Lo = Op.getOperand(0);
1007 SDValue Hi = Op.getOperand(1);
1008 SDValue Shift = Op.getOperand(2);
1009 SDValue Zero = DAG.getConstant(0, VT);
1010 SDValue One = DAG.getConstant(1, VT);
1011
1012 SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
1013 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1014 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1015 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1016
1017 // The dance around Width1 is necessary for the Shift == 0 special case.
1018 // Without it, CompShift could be 32, producing incorrect results in
1019 // Overflow. So we do the shift in two steps; the alternative would be to
1020 // add a conditional to filter out the special case.
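// For example, with Shift == 0: CompShift == 31, so Overflow becomes
// (Lo >> 31) >> 1 == 0, whereas a single right shift by 32 would be undefined.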
1021
1022 SDValue Overflow = DAG.getNode(ISD::SRL, DL, VT, Lo, CompShift);
1023 Overflow = DAG.getNode(ISD::SRL, DL, VT, Overflow, One);
1024
1025 SDValue HiSmall = DAG.getNode(ISD::SHL, DL, VT, Hi, Shift);
1026 HiSmall = DAG.getNode(ISD::OR, DL, VT, HiSmall, Overflow);
1027 SDValue LoSmall = DAG.getNode(ISD::SHL, DL, VT, Lo, Shift);
1028
1029 SDValue HiBig = DAG.getNode(ISD::SHL, DL, VT, Lo, BigShift);
1030 SDValue LoBig = Zero;
1031
1032 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1033 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1034
1035 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1036 }
1037
1038 SDValue R600TargetLowering::LowerSRXParts(SDValue Op, SelectionDAG &DAG) const {
1039 SDLoc DL(Op);
1040 EVT VT = Op.getValueType();
1041
1042 SDValue Lo = Op.getOperand(0);
1043 SDValue Hi = Op.getOperand(1);
1044 SDValue Shift = Op.getOperand(2);
1045 SDValue Zero = DAG.getConstant(0, VT);
1046 SDValue One = DAG.getConstant(1, VT);
1047
1048 const bool SRA = Op.getOpcode() == ISD::SRA_PARTS;
1049
1050 SDValue Width = DAG.getConstant(VT.getSizeInBits(), VT);
1051 SDValue Width1 = DAG.getConstant(VT.getSizeInBits() - 1, VT);
1052 SDValue BigShift = DAG.getNode(ISD::SUB, DL, VT, Shift, Width);
1053 SDValue CompShift = DAG.getNode(ISD::SUB, DL, VT, Width1, Shift);
1054
1055 // The dance around Width1 is necessary for the Shift == 0 special case.
1056 // Without it, CompShift could be 32, producing incorrect results in
1057 // Overflow. So we do the shift in two steps; the alternative would be to
1058 // add a conditional to filter out the special case.
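// For example, with Shift == 0: CompShift == 31, so Overflow becomes
// (Hi << 31) << 1 == 0 and LoSmall reduces to just Lo, as expected.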
1059
1060 SDValue Overflow = DAG.getNode(ISD::SHL, DL, VT, Hi, CompShift);
1061 Overflow = DAG.getNode(ISD::SHL, DL, VT, Overflow, One);
1062
1063 SDValue HiSmall = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, Shift);
1064 SDValue LoSmall = DAG.getNode(ISD::SRL, DL, VT, Lo, Shift);
1065 LoSmall = DAG.getNode(ISD::OR, DL, VT, LoSmall, Overflow);
1066
1067 SDValue LoBig = DAG.getNode(SRA ? ISD::SRA : ISD::SRL, DL, VT, Hi, BigShift);
1068 SDValue HiBig = SRA ? DAG.getNode(ISD::SRA, DL, VT, Hi, Width1) : Zero;
1069
1070 Hi = DAG.getSelectCC(DL, Shift, Width, HiSmall, HiBig, ISD::SETULT);
1071 Lo = DAG.getSelectCC(DL, Shift, Width, LoSmall, LoBig, ISD::SETULT);
1072
1073 return DAG.getNode(ISD::MERGE_VALUES, DL, DAG.getVTList(VT,VT), Lo, Hi);
1074 }
1075
1076 SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const {
1077 return DAG.getNode(
1078 ISD::SETCC,
1079 SDLoc(Op),
1080 MVT::i1,
1081 Op, DAG.getConstantFP(0.0f, MVT::f32),
1082 DAG.getCondCode(ISD::SETNE)
1083 );
1084 }
1085
1086 SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
1087 SDLoc DL,
1088 unsigned DwordOffset) const {
1089 unsigned ByteOffset = DwordOffset * 4;
1090 PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1091 AMDGPUAS::CONSTANT_BUFFER_0);
1092
1093 // We shouldn't be using an offset wider than 16 bits for implicit parameters.
1094 assert(isInt<16>(ByteOffset));
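// For example, Intrinsic::r600_read_ngroups_y is lowered with DwordOffset 1,
// i.e. a load from byte offset 4 of CONSTANT_BUFFER_0.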
1095
1096 return DAG.getLoad(VT, DL, DAG.getEntryNode(),
1097 DAG.getConstant(ByteOffset, MVT::i32), // PTR
1098 MachinePointerInfo(ConstantPointerNull::get(PtrType)),
1099 false, false, false, 0);
1100 }
1101
1102 bool R600TargetLowering::isZero(SDValue Op) const {
1103 if(ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
1104 return Cst->isNullValue();
1105 } else if(ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)){
1106 return CstFP->isZero();
1107 } else {
1108 return false;
1109 }
1110 }
1111
1112 SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
1113 SDLoc DL(Op);
1114 EVT VT = Op.getValueType();
1115
1116 SDValue LHS = Op.getOperand(0);
1117 SDValue RHS = Op.getOperand(1);
1118 SDValue True = Op.getOperand(2);
1119 SDValue False = Op.getOperand(3);
1120 SDValue CC = Op.getOperand(4);
1121 SDValue Temp;
1122
1123 if (VT == MVT::f32) {
1124 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
1125 SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
1126 if (MinMax)
1127 return MinMax;
1128 }
1129
1130 // LHS and RHS are guaranteed to be the same value type
1131 EVT CompareVT = LHS.getValueType();
1132
1133 // Check if we can lower this to a native operation.
1134
1135 // Try to lower to a SET* instruction:
1136 //
1137 // SET* can match the following patterns:
1138 //
1139 // select_cc f32, f32, -1, 0, cc_supported
1140 // select_cc f32, f32, 1.0f, 0.0f, cc_supported
1141 // select_cc i32, i32, -1, 0, cc_supported
1142 //
1143
1144 // Move hardware True/False values to the correct operand.
1145 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1146 ISD::CondCode InverseCC =
1147 ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1148 if (isHWTrueValue(False) && isHWFalseValue(True)) {
1149 if (isCondCodeLegal(InverseCC, CompareVT.getSimpleVT())) {
1150 std::swap(False, True);
1151 CC = DAG.getCondCode(InverseCC);
1152 } else {
1153 ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InverseCC);
1154 if (isCondCodeLegal(SwapInvCC, CompareVT.getSimpleVT())) {
1155 std::swap(False, True);
1156 std::swap(LHS, RHS);
1157 CC = DAG.getCondCode(SwapInvCC);
1158 }
1159 }
1160 }
1161
1162 if (isHWTrueValue(True) && isHWFalseValue(False) &&
1163 (CompareVT == VT || VT == MVT::i32)) {
1164 // This can be matched by a SET* instruction.
1165 return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
1166 }
1167
1168 // Try to lower to a CND* instruction:
1169 //
1170 // CND* can match the following patterns:
1171 //
1172 // select_cc f32, 0.0, f32, f32, cc_supported
1173 // select_cc f32, 0.0, i32, i32, cc_supported
1174 // select_cc i32, 0, f32, f32, cc_supported
1175 // select_cc i32, 0, i32, i32, cc_supported
1176 //
1177
1178 // Try to move the zero value to the RHS
1179 if (isZero(LHS)) {
1180 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1181 // Try swapping the operands
1182 ISD::CondCode CCSwapped = ISD::getSetCCSwappedOperands(CCOpcode);
1183 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1184 std::swap(LHS, RHS);
1185 CC = DAG.getCondCode(CCSwapped);
1186 } else {
1187 // Try inverting the condition and then swapping the operands
1188 ISD::CondCode CCInv = ISD::getSetCCInverse(CCOpcode, CompareVT.isInteger());
1189 CCSwapped = ISD::getSetCCSwappedOperands(CCInv);
1190 if (isCondCodeLegal(CCSwapped, CompareVT.getSimpleVT())) {
1191 std::swap(True, False);
1192 std::swap(LHS, RHS);
1193 CC = DAG.getCondCode(CCSwapped);
1194 }
1195 }
1196 }
1197 if (isZero(RHS)) {
1198 SDValue Cond = LHS;
1199 SDValue Zero = RHS;
1200 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1201 if (CompareVT != VT) {
1202 // Bitcast True / False to the correct types. This will end up being
1203 // a nop, but it allows us to define only a single pattern in the
1204 // .TD files for each CND* instruction rather than having to have
1205 // one pattern for integer True/False and one for fp True/False
1206 True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
1207 False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
1208 }
1209
1210 switch (CCOpcode) {
1211 case ISD::SETONE:
1212 case ISD::SETUNE:
1213 case ISD::SETNE:
1214 CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
1215 Temp = True;
1216 True = False;
1217 False = Temp;
1218 break;
1219 default:
1220 break;
1221 }
1222 SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
1223 Cond, Zero,
1224 True, False,
1225 DAG.getCondCode(CCOpcode));
1226 return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
1227 }
1228
1229 // If we make it this far, it means we have no native instructions to handle
1230 // this SELECT_CC, so we must lower it.
1231 SDValue HWTrue, HWFalse;
1232
1233 if (CompareVT == MVT::f32) {
1234 HWTrue = DAG.getConstantFP(1.0f, CompareVT);
1235 HWFalse = DAG.getConstantFP(0.0f, CompareVT);
1236 } else if (CompareVT == MVT::i32) {
1237 HWTrue = DAG.getConstant(-1, CompareVT);
1238 HWFalse = DAG.getConstant(0, CompareVT);
1239 }
1240 else {
1241 llvm_unreachable("Unhandled value type in LowerSELECT_CC");
1242 }
1243
1244 // Lower this unsupported SELECT_CC into a combination of two supported
1245 // SELECT_CC operations.
1246 SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC);
1247
1248 return DAG.getNode(ISD::SELECT_CC, DL, VT,
1249 Cond, HWFalse,
1250 True, False,
1251 DAG.getCondCode(ISD::SETNE));
1252 }
1253
1254 /// LLVM generates byte-addressed pointers. For indirect addressing, we need to
1255 /// convert these pointers to a register index. Each register holds
1256 /// 16 bytes (4 x 32-bit sub-registers), but we need to take into account the
1257 /// \p StackWidth, which tells us how many of the 4 sub-registers will be used
1258 /// for indirect addressing.
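/// For example, with a StackWidth of 1 each register index covers 4 bytes of
/// stack (one 32-bit channel), so the byte pointer is shifted right by 2; a
/// StackWidth of 4 uses all 16 bytes and gives a shift of 4.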
1259 SDValue R600TargetLowering::stackPtrToRegIndex(SDValue Ptr,
1260 unsigned StackWidth,
1261 SelectionDAG &DAG) const {
1262 unsigned SRLPad;
1263 switch(StackWidth) {
1264 case 1:
1265 SRLPad = 2;
1266 break;
1267 case 2:
1268 SRLPad = 3;
1269 break;
1270 case 4:
1271 SRLPad = 4;
1272 break;
1273 default: llvm_unreachable("Invalid stack width");
1274 }
1275
1276 return DAG.getNode(ISD::SRL, SDLoc(Ptr), Ptr.getValueType(), Ptr,
1277 DAG.getConstant(SRLPad, MVT::i32));
1278 }
1279
1280 void R600TargetLowering::getStackAddress(unsigned StackWidth,
1281 unsigned ElemIdx,
1282 unsigned &Channel,
1283 unsigned &PtrIncr) const {
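// Note that PtrIncr is relative to the previous element: the caller adds it to
// the running register index, so it is 1 only for the element that starts a
// new register. E.g. with StackWidth == 2, ElemIdx 2 yields Channel 0 with
// PtrIncr 1, while ElemIdx 3 yields Channel 1 with PtrIncr 0.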
1284 switch (StackWidth) {
1285 default:
1286 case 1:
1287 Channel = 0;
1288 if (ElemIdx > 0) {
1289 PtrIncr = 1;
1290 } else {
1291 PtrIncr = 0;
1292 }
1293 break;
1294 case 2:
1295 Channel = ElemIdx % 2;
1296 if (ElemIdx == 2) {
1297 PtrIncr = 1;
1298 } else {
1299 PtrIncr = 0;
1300 }
1301 break;
1302 case 4:
1303 Channel = ElemIdx;
1304 PtrIncr = 0;
1305 break;
1306 }
1307 }
1308
1309 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
1310 SDLoc DL(Op);
1311 StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
1312 SDValue Chain = Op.getOperand(0);
1313 SDValue Value = Op.getOperand(1);
1314 SDValue Ptr = Op.getOperand(2);
1315
1316 SDValue Result = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1317 if (Result.getNode()) {
1318 return Result;
1319 }
1320
1321 if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS) {
1322 if (StoreNode->isTruncatingStore()) {
1323 EVT VT = Value.getValueType();
1324 assert(VT.bitsLE(MVT::i32));
1325 EVT MemVT = StoreNode->getMemoryVT();
1326 SDValue MaskConstant;
1327 if (MemVT == MVT::i8) {
1328 MaskConstant = DAG.getConstant(0xFF, MVT::i32);
1329 } else {
1330 assert(MemVT == MVT::i16);
1331 MaskConstant = DAG.getConstant(0xFFFF, MVT::i32);
1332 }
1333 SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, VT, Ptr,
1334 DAG.getConstant(2, MVT::i32));
1335 SDValue ByteIndex = DAG.getNode(ISD::AND, DL, Ptr.getValueType(), Ptr,
1336 DAG.getConstant(0x00000003, VT));
1337 SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
1338 SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, ByteIndex,
1339 DAG.getConstant(3, VT));
1340 SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, Shift);
1341 SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, Shift);
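// For example, an i8 store to byte address 7: DWordAddr == 1, ByteIndex == 3,
// Shift == 24, so ShiftedValue and Mask select the most significant byte of
// that dword for the masked-or (STORE_MSKOR) below.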
1342 // XXX: If we add a 64-bit ZW register class, then we could use a 2 x i32
1343 // vector instead.
1344 SDValue Src[4] = {
1345 ShiftedValue,
1346 DAG.getConstant(0, MVT::i32),
1347 DAG.getConstant(0, MVT::i32),
1348 Mask
1349 };
1350 SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
1351 SDValue Args[3] = { Chain, Input, DWordAddr };
1352 return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
1353 Op->getVTList(), Args, MemVT,
1354 StoreNode->getMemOperand());
1355 } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
1356 Value.getValueType().bitsGE(MVT::i32)) {
1357 // Convert pointer from byte address to dword address.
1358 Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(),
1359 DAG.getNode(ISD::SRL, DL, Ptr.getValueType(),
1360 Ptr, DAG.getConstant(2, MVT::i32)));
1361
1362 if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) {
1363 llvm_unreachable("Truncated and indexed stores not supported yet");
1364 } else {
1365 Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand());
1366 }
1367 return Chain;
1368 }
1369 }
1370
1371 EVT ValueVT = Value.getValueType();
1372
1373 if (StoreNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1374 return SDValue();
1375 }
1376
1377 SDValue Ret = AMDGPUTargetLowering::LowerSTORE(Op, DAG);
1378 if (Ret.getNode()) {
1379 return Ret;
1380 }
1381 // Lowering for indirect addressing
1382
1383 const MachineFunction &MF = DAG.getMachineFunction();
1384 const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1385 getTargetMachine().getSubtargetImpl()->getFrameLowering());
1386 unsigned StackWidth = TFL->getStackWidth(MF);
1387
1388 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1389
1390 if (ValueVT.isVector()) {
1391 unsigned NumElemVT = ValueVT.getVectorNumElements();
1392 EVT ElemVT = ValueVT.getVectorElementType();
1393 SmallVector<SDValue, 4> Stores(NumElemVT);
1394
1395 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1396 "vector width in load");
1397
1398 for (unsigned i = 0; i < NumElemVT; ++i) {
1399 unsigned Channel, PtrIncr;
1400 getStackAddress(StackWidth, i, Channel, PtrIncr);
1401 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1402 DAG.getConstant(PtrIncr, MVT::i32));
1403 SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT,
1404 Value, DAG.getConstant(i, MVT::i32));
1405
1406 Stores[i] = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other,
1407 Chain, Elem, Ptr,
1408 DAG.getTargetConstant(Channel, MVT::i32));
1409 }
1410 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
1411 } else {
1412 if (ValueVT == MVT::i8) {
1413 Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
1414 }
1415 Chain = DAG.getNode(AMDGPUISD::REGISTER_STORE, DL, MVT::Other, Chain, Value, Ptr,
1416 DAG.getTargetConstant(0, MVT::i32)); // Channel
1417 }
1418
1419 return Chain;
1420 }
1421
1422 // Return 512 + (kc_bank << 12) for the given constant buffer address space.
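// e.g. CONSTANT_BUFFER_0 -> 512, CONSTANT_BUFFER_2 -> 512 + 4096 * 2.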
1423 static int
1424 ConstantAddressBlock(unsigned AddressSpace) {
1425 switch (AddressSpace) {
1426 case AMDGPUAS::CONSTANT_BUFFER_0:
1427 return 512;
1428 case AMDGPUAS::CONSTANT_BUFFER_1:
1429 return 512 + 4096;
1430 case AMDGPUAS::CONSTANT_BUFFER_2:
1431 return 512 + 4096 * 2;
1432 case AMDGPUAS::CONSTANT_BUFFER_3:
1433 return 512 + 4096 * 3;
1434 case AMDGPUAS::CONSTANT_BUFFER_4:
1435 return 512 + 4096 * 4;
1436 case AMDGPUAS::CONSTANT_BUFFER_5:
1437 return 512 + 4096 * 5;
1438 case AMDGPUAS::CONSTANT_BUFFER_6:
1439 return 512 + 4096 * 6;
1440 case AMDGPUAS::CONSTANT_BUFFER_7:
1441 return 512 + 4096 * 7;
1442 case AMDGPUAS::CONSTANT_BUFFER_8:
1443 return 512 + 4096 * 8;
1444 case AMDGPUAS::CONSTANT_BUFFER_9:
1445 return 512 + 4096 * 9;
1446 case AMDGPUAS::CONSTANT_BUFFER_10:
1447 return 512 + 4096 * 10;
1448 case AMDGPUAS::CONSTANT_BUFFER_11:
1449 return 512 + 4096 * 11;
1450 case AMDGPUAS::CONSTANT_BUFFER_12:
1451 return 512 + 4096 * 12;
1452 case AMDGPUAS::CONSTANT_BUFFER_13:
1453 return 512 + 4096 * 13;
1454 case AMDGPUAS::CONSTANT_BUFFER_14:
1455 return 512 + 4096 * 14;
1456 case AMDGPUAS::CONSTANT_BUFFER_15:
1457 return 512 + 4096 * 15;
1458 default:
1459 return -1;
1460 }
1461 }
1462
1463 SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
1464 {
1465 EVT VT = Op.getValueType();
1466 SDLoc DL(Op);
1467 LoadSDNode *LoadNode = cast<LoadSDNode>(Op);
1468 SDValue Chain = Op.getOperand(0);
1469 SDValue Ptr = Op.getOperand(1);
1470 SDValue LoweredLoad;
1471
1472 SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
1473 if (Ret.getNode()) {
1474 SDValue Ops[2] = {
1475 Ret,
1476 Chain
1477 };
1478 return DAG.getMergeValues(Ops, DL);
1479 }
1480
1481 // Lower constant address space loads of global variables.
1482 if (LoadNode->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
1483 isa<GlobalVariable>(
1484 GetUnderlyingObject(LoadNode->getMemOperand()->getValue()))) {
1485
1486 SDValue Ptr = DAG.getZExtOrTrunc(LoadNode->getBasePtr(), DL,
1487 getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
1488 Ptr = DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
1489 DAG.getConstant(2, MVT::i32));
1490 return DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, Op->getVTList(),
1491 LoadNode->getChain(), Ptr,
1492 DAG.getTargetConstant(0, MVT::i32), Op.getOperand(2));
1493 }
1494
1495 if (LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && VT.isVector()) {
1496 SDValue MergedValues[2] = {
1497 ScalarizeVectorLoad(Op, DAG),
1498 Chain
1499 };
1500 return DAG.getMergeValues(MergedValues, DL);
1501 }
1502
1503 int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
1504 if (ConstantBlock > -1 &&
1505 ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
1506 (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
1507 SDValue Result;
1508 if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
1509 isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
1510 isa<ConstantSDNode>(Ptr)) {
1511 SDValue Slots[4];
1512 for (unsigned i = 0; i < 4; i++) {
1513 // We want the Const position encoded with the following formula:
1514 // (((512 + (kc_bank << 12) + const_index) << 2) + chan)
1515 // const_index corresponds to Ptr, which llvm computes with an alignment
1516 // of 16. Thus we add ((512 + (kc_bank << 12)) * 4 + chan) * 4 here and
1517 // then divide by 4 at the ISel step.
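// For example, with kc_bank == 0, Ptr == 16 (const_index == 1) and chan == 2,
// NewPtr == 16 + 4 * 2 + 512 * 16 == 8216 == (((512 + 1) << 2) + 2) * 4.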
1518 SDValue NewPtr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
1519 DAG.getConstant(4 * i + ConstantBlock * 16, MVT::i32));
1520 Slots[i] = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::i32, NewPtr);
1521 }
1522 EVT NewVT = MVT::v4i32;
1523 unsigned NumElements = 4;
1524 if (VT.isVector()) {
1525 NewVT = VT;
1526 NumElements = VT.getVectorNumElements();
1527 }
1528 Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
1529 makeArrayRef(Slots, NumElements));
1530 } else {
1531 // A non-constant Ptr can't be folded, so keep it as a full vec4 constant load.
1532 Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
1533 DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr, DAG.getConstant(4, MVT::i32)),
1534 DAG.getConstant(LoadNode->getAddressSpace() -
1535 AMDGPUAS::CONSTANT_BUFFER_0, MVT::i32)
1536 );
1537 }
1538
1539 if (!VT.isVector()) {
1540 Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Result,
1541 DAG.getConstant(0, MVT::i32));
1542 }
1543
1544 SDValue MergedValues[2] = {
1545 Result,
1546 Chain
1547 };
1548 return DAG.getMergeValues(MergedValues, DL);
1549 }
1550
1551 // For most operations returning SDValue() will result in the node being
1552 // expanded by the DAG Legalizer. This is not the case for ISD::LOAD, so we
1553 // need to manually expand loads that may be legal in some address spaces and
1554 // illegal in others. SEXT loads from CONSTANT_BUFFER_0 are supported for
1555 // compute shaders, since the data is sign extended when it is uploaded to the
1556 // buffer. However SEXT loads from other address spaces are not supported, so
1557 // we need to expand them here.
1558 if (LoadNode->getExtensionType() == ISD::SEXTLOAD) {
1559 EVT MemVT = LoadNode->getMemoryVT();
1560 assert(!MemVT.isVector() && (MemVT == MVT::i16 || MemVT == MVT::i8));
1561 SDValue ShiftAmount =
1562 DAG.getConstant(VT.getSizeInBits() - MemVT.getSizeInBits(), MVT::i32);
1563 SDValue NewLoad = DAG.getExtLoad(ISD::EXTLOAD, DL, VT, Chain, Ptr,
1564 LoadNode->getPointerInfo(), MemVT,
1565 LoadNode->isVolatile(),
1566 LoadNode->isNonTemporal(),
1567 LoadNode->isInvariant(),
1568 LoadNode->getAlignment());
1569 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, NewLoad, ShiftAmount);
1570 SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
1571
1572 SDValue MergedValues[2] = { Sra, Chain };
1573 return DAG.getMergeValues(MergedValues, DL);
1574 }
1575
1576 if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
1577 return SDValue();
1578 }
1579
1580 // Lowering for indirect addressing
1581 const MachineFunction &MF = DAG.getMachineFunction();
1582 const AMDGPUFrameLowering *TFL = static_cast<const AMDGPUFrameLowering *>(
1583 getTargetMachine().getSubtargetImpl()->getFrameLowering());
1584 unsigned StackWidth = TFL->getStackWidth(MF);
1585
1586 Ptr = stackPtrToRegIndex(Ptr, StackWidth, DAG);
1587
1588 if (VT.isVector()) {
1589 unsigned NumElemVT = VT.getVectorNumElements();
1590 EVT ElemVT = VT.getVectorElementType();
1591 SDValue Loads[4];
1592
1593 assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
1594 "vector width in load");
1595
1596 for (unsigned i = 0; i < NumElemVT; ++i) {
1597 unsigned Channel, PtrIncr;
1598 getStackAddress(StackWidth, i, Channel, PtrIncr);
1599 Ptr = DAG.getNode(ISD::ADD, DL, MVT::i32, Ptr,
1600 DAG.getConstant(PtrIncr, MVT::i32));
1601 Loads[i] = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, ElemVT,
1602 Chain, Ptr,
1603 DAG.getTargetConstant(Channel, MVT::i32),
1604 Op.getOperand(2));
1605 }
1606 for (unsigned i = NumElemVT; i < 4; ++i) {
1607 Loads[i] = DAG.getUNDEF(ElemVT);
1608 }
1609 EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
1610 LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
1611 } else {
1612 LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
1613 Chain, Ptr,
1614 DAG.getTargetConstant(0, MVT::i32), // Channel
1615 Op.getOperand(2));
1616 }
1617
1618 SDValue Ops[2] = {
1619 LoweredLoad,
1620 Chain
1621 };
1622
1623 return DAG.getMergeValues(Ops, DL);
1624 }
1625
1626 SDValue R600TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
1627 SDValue Chain = Op.getOperand(0);
1628 SDValue Cond = Op.getOperand(1);
1629 SDValue Jump = Op.getOperand(2);
1630
1631 return DAG.getNode(AMDGPUISD::BRANCH_COND, SDLoc(Op), Op.getValueType(),
1632 Chain, Jump, Cond);
1633 }
1634
1635 /// XXX Only kernel functions are supported, so we can assume for now that
1636 /// every function is a kernel function, but in the future we should use
1637 /// separate calling conventions for kernel and non-kernel functions.
1638 SDValue R600TargetLowering::LowerFormalArguments(
1639 SDValue Chain,
1640 CallingConv::ID CallConv,
1641 bool isVarArg,
1642 const SmallVectorImpl<ISD::InputArg> &Ins,
1643 SDLoc DL, SelectionDAG &DAG,
1644 SmallVectorImpl<SDValue> &InVals) const {
1645 SmallVector<CCValAssign, 16> ArgLocs;
1646 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
1647 *DAG.getContext());
1648 MachineFunction &MF = DAG.getMachineFunction();
1649 R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
1650
1651 SmallVector<ISD::InputArg, 8> LocalIns;
1652
1653 getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
1654
1655 AnalyzeFormalArguments(CCInfo, LocalIns);
1656
1657 for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
1658 CCValAssign &VA = ArgLocs[i];
1659 const ISD::InputArg &In = Ins[i];
1660 EVT VT = In.VT;
1661 EVT MemVT = VA.getLocVT();
1662 if (!VT.isVector() && MemVT.isVector()) {
1663 // Get load source type if scalarized.
1664 MemVT = MemVT.getVectorElementType();
1665 }
1666
1667 if (MFI->getShaderType() != ShaderType::COMPUTE) {
1668 unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass);
1669 SDValue Register = DAG.getCopyFromReg(Chain, DL, Reg, VT);
1670 InVals.push_back(Register);
1671 continue;
1672 }
1673
1674 PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
1675 AMDGPUAS::CONSTANT_BUFFER_0);
1676
1677 // i64 isn't a legal type, so the register type used ends up as i32, which
1678 // isn't expected here. The lowering then tries to create a sextload, which
1679 // ends up being invalid. Somehow this seems to work for i64 arguments, but
1680 // breaks for <1 x i64>.
1681
1682 // The first 36 bytes of the input buffer contain information about the
1683 // thread group and global sizes.
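// Consequently, the explicit kernel arguments below are loaded from
// CONSTANT_BUFFER_0 starting at byte offset 36 (9 dwords); see the "36 +"
// in the offset computation further down.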
1684 ISD::LoadExtType Ext = ISD::NON_EXTLOAD;
1685 if (MemVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) {
1686 // FIXME: This should really check the extload type, but the handling of
1687 // extload vector parameters seems to be broken.
1688
1689 // Ext = In.Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
1690 Ext = ISD::SEXTLOAD;
1691 }
1692
1693 // Compute the offset of this part within the original argument value.
1694 // XXX - I think PartOffset should give you this, but it seems to give the
1695 // size of the register, which isn't useful.
1696
1697 unsigned ValBase = ArgLocs[In.OrigArgIndex].getLocMemOffset();
1698 unsigned PartOffset = VA.getLocMemOffset();
1699 unsigned Offset = 36 + VA.getLocMemOffset();
1700
1701 MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
1702 SDValue Arg = DAG.getLoad(ISD::UNINDEXED, Ext, VT, DL, Chain,
1703 DAG.getConstant(Offset, MVT::i32),
1704 DAG.getUNDEF(MVT::i32),
1705 PtrInfo,
1706 MemVT, false, true, true, 4);
1707
1708 // 4 is the preferred alignment for the CONSTANT memory space.
1709 InVals.push_back(Arg);
1710 MFI->ABIArgOffset = Offset + MemVT.getStoreSize();
1711 }
1712 return Chain;
1713 }
1714
1715 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
1716 if (!VT.isVector())
1717 return MVT::i32;
1718 return VT.changeVectorElementTypeToInteger();
1719 }
1720
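// The swizzle helpers below remap per-channel source selectors. Judging from
// the selector constants used in the code, values 0-3 select the matching
// vector component, 4 is SEL_0 (constant 0.0), 5 is SEL_1 (constant 1.0), and
// 7 is SEL_MASK_WRITE (the channel's write is masked).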
1721 static SDValue CompactSwizzlableVector(
1722 SelectionDAG &DAG, SDValue VectorEntry,
1723 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1724 assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1725 assert(RemapSwizzle.empty());
1726 SDValue NewBldVec[4] = {
1727 VectorEntry.getOperand(0),
1728 VectorEntry.getOperand(1),
1729 VectorEntry.getOperand(2),
1730 VectorEntry.getOperand(3)
1731 };
1732
1733 for (unsigned i = 0; i < 4; i++) {
1734 if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1735 // We mask the write here to tell later passes that the i-th element of
1736 // this vector is undef. That lets them reduce 128-bit register usage and
1737 // break false dependencies, and it also makes the assembly easier to read.
1738 RemapSwizzle[i] = 7; // SEL_MASK_WRITE
1739 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(NewBldVec[i])) {
1740 if (C->isZero()) {
1741 RemapSwizzle[i] = 4; // SEL_0
1742 NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1743 } else if (C->isExactlyValue(1.0)) {
1744 RemapSwizzle[i] = 5; // SEL_1
1745 NewBldVec[i] = DAG.getUNDEF(MVT::f32);
1746 }
1747 }
1748
1749 if (NewBldVec[i].getOpcode() == ISD::UNDEF)
1750 continue;
1751 for (unsigned j = 0; j < i; j++) {
1752 if (NewBldVec[i] == NewBldVec[j]) {
1753 NewBldVec[i] = DAG.getUNDEF(NewBldVec[i].getValueType());
1754 RemapSwizzle[i] = j;
1755 break;
1756 }
1757 }
1758 }
1759
1760 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1761 VectorEntry.getValueType(), NewBldVec);
1762 }
1763
1764 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
1765 DenseMap<unsigned, unsigned> &RemapSwizzle) {
1766 assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
1767 assert(RemapSwizzle.empty());
1768 SDValue NewBldVec[4] = {
1769 VectorEntry.getOperand(0),
1770 VectorEntry.getOperand(1),
1771 VectorEntry.getOperand(2),
1772 VectorEntry.getOperand(3)
1773 };
1774 bool isUnmovable[4] = { false, false, false, false };
1775 for (unsigned i = 0; i < 4; i++) {
1776 RemapSwizzle[i] = i;
1777 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1778 unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1779 ->getZExtValue();
1780 if (i == Idx)
1781 isUnmovable[Idx] = true;
1782 }
1783 }
1784
1785 for (unsigned i = 0; i < 4; i++) {
1786 if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
1787 unsigned Idx = dyn_cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
1788 ->getZExtValue();
1789 if (isUnmovable[Idx])
1790 continue;
1791 // Swap i and Idx
1792 std::swap(NewBldVec[Idx], NewBldVec[i]);
1793 std::swap(RemapSwizzle[i], RemapSwizzle[Idx]);
1794 break;
1795 }
1796 }
1797
1798 return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
1799 VectorEntry.getValueType(), NewBldVec);
1800 }
1801
1802
1803 SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector,
1804 SDValue Swz[4], SelectionDAG &DAG) const {
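// A short overview of what follows: CompactSwizzlableVector first folds
// constant 0.0/1.0 elements and duplicated elements into selector values,
// then ReorganizeVector tries to move extract_vector_elt elements back to
// the channel they were extracted from. After each pass the swizzle operands
// in Swz are remapped accordingly.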
1805 assert(BuildVector.getOpcode() == ISD::BUILD_VECTOR);
1806 // Old -> New swizzle values
1807 DenseMap<unsigned, unsigned> SwizzleRemap;
1808
1809 BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
1810 for (unsigned i = 0; i < 4; i++) {
1811 unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1812 if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1813 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1814 }
1815
1816 SwizzleRemap.clear();
1817 BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
1818 for (unsigned i = 0; i < 4; i++) {
1819 unsigned Idx = dyn_cast<ConstantSDNode>(Swz[i])->getZExtValue();
1820 if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
1821 Swz[i] = DAG.getConstant(SwizzleRemap[Idx], MVT::i32);
1822 }
1823
1824 return BuildVector;
1825 }
1826
1827
1828 //===----------------------------------------------------------------------===//
1829 // Custom DAG Optimizations
1830 //===----------------------------------------------------------------------===//
1831
1832 SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
1833 DAGCombinerInfo &DCI) const {
1834 SelectionDAG &DAG = DCI.DAG;
1835
1836 switch (N->getOpcode()) {
1837 default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1838 // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
1839 case ISD::FP_ROUND: {
1840 SDValue Arg = N->getOperand(0);
1841 if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
1842 return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), N->getValueType(0),
1843 Arg.getOperand(0));
1844 }
1845 break;
1846 }
1847
1848 // (i32 fp_to_sint (fneg (select_cc f32, f32, 1.0, 0.0 cc))) ->
1849 // (i32 select_cc f32, f32, -1, 0 cc)
1850 //
1851 // Mesa's GLSL frontend generates the above pattern a lot and we can lower
1852 // this to one of the SET*_DX10 instructions.
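// (The SET*_DX10 instructions produce an all-ones integer, i.e. -1, for true
// and 0 for false, which is why the combined node below selects -1/0.)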
1853 case ISD::FP_TO_SINT: {
1854 SDValue FNeg = N->getOperand(0);
1855 if (FNeg.getOpcode() != ISD::FNEG) {
1856 return SDValue();
1857 }
1858 SDValue SelectCC = FNeg.getOperand(0);
1859 if (SelectCC.getOpcode() != ISD::SELECT_CC ||
1860 SelectCC.getOperand(0).getValueType() != MVT::f32 || // LHS
1861 SelectCC.getOperand(2).getValueType() != MVT::f32 || // True
1862 !isHWTrueValue(SelectCC.getOperand(2)) ||
1863 !isHWFalseValue(SelectCC.getOperand(3))) {
1864 return SDValue();
1865 }
1866
1867 return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
1868 SelectCC.getOperand(0), // LHS
1869 SelectCC.getOperand(1), // RHS
1870 DAG.getConstant(-1, MVT::i32), // True
1871 DAG.getConstant(0, MVT::i32), // False
1872 SelectCC.getOperand(4)); // CC
1873
1874 break;
1875 }
1876
1877 // insert_vector_elt (build_vector elt0, ..., eltN), NewElt, idx
1878 // => build_vector elt0, ..., NewElt, ..., eltN
1879 case ISD::INSERT_VECTOR_ELT: {
1880 SDValue InVec = N->getOperand(0);
1881 SDValue InVal = N->getOperand(1);
1882 SDValue EltNo = N->getOperand(2);
1883 SDLoc dl(N);
1884
1885 // If the inserted element is an UNDEF, just use the input vector.
1886 if (InVal.getOpcode() == ISD::UNDEF)
1887 return InVec;
1888
1889 EVT VT = InVec.getValueType();
1890
1891 // If we can't generate a legal BUILD_VECTOR, exit
1892 if (!isOperationLegal(ISD::BUILD_VECTOR, VT))
1893 return SDValue();
1894
1895 // Check that we know which element is being inserted
1896 if (!isa<ConstantSDNode>(EltNo))
1897 return SDValue();
1898 unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
1899
1900 // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
1901 // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
1902 // vector elements.
1903 SmallVector<SDValue, 8> Ops;
1904 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
1905 Ops.append(InVec.getNode()->op_begin(),
1906 InVec.getNode()->op_end());
1907 } else if (InVec.getOpcode() == ISD::UNDEF) {
1908 unsigned NElts = VT.getVectorNumElements();
1909 Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
1910 } else {
1911 return SDValue();
1912 }
1913
1914 // Insert the element
1915 if (Elt < Ops.size()) {
1916 // All the operands of BUILD_VECTOR must have the same type;
1917 // we enforce that here.
1918 EVT OpVT = Ops[0].getValueType();
1919 if (InVal.getValueType() != OpVT)
1920 InVal = OpVT.bitsGT(InVal.getValueType()) ?
1921 DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
1922 DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
1923 Ops[Elt] = InVal;
1924 }
1925
1926 // Return the new vector
1927 return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
1928 }
1929
1930 // An extract_vector_elt of a build_vector generated by custom lowering
1931 // also needs to be combined here.
1932 case ISD::EXTRACT_VECTOR_ELT: {
1933 SDValue Arg = N->getOperand(0);
1934 if (Arg.getOpcode() == ISD::BUILD_VECTOR) {
1935 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1936 unsigned Element = Const->getZExtValue();
1937 return Arg->getOperand(Element);
1938 }
1939 }
1940 if (Arg.getOpcode() == ISD::BITCAST &&
1941 Arg.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
1942 if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
1943 unsigned Element = Const->getZExtValue();
1944 return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getVTList(),
1945 Arg->getOperand(0).getOperand(Element));
1946 }
1947 }
1948 }
1949
1950 case ISD::SELECT_CC: {
1951 // Try common optimizations
1952 SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
1953 if (Ret.getNode())
1954 return Ret;
1955
1956 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
1957 // selectcc x, y, a, b, inv(cc)
1958 //
1959 // fold selectcc (selectcc x, y, a, b, cc), b, a, b, setne ->
1960 // selectcc x, y, a, b, cc
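// For example, if the inner node's cc is setlt, the seteq form folds to
//   selectcc x, y, a, b, setge
// since setge is the inverse of setlt.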
1961 SDValue LHS = N->getOperand(0);
1962 if (LHS.getOpcode() != ISD::SELECT_CC) {
1963 return SDValue();
1964 }
1965
1966 SDValue RHS = N->getOperand(1);
1967 SDValue True = N->getOperand(2);
1968 SDValue False = N->getOperand(3);
1969 ISD::CondCode NCC = cast<CondCodeSDNode>(N->getOperand(4))->get();
1970
1971 if (LHS.getOperand(2).getNode() != True.getNode() ||
1972 LHS.getOperand(3).getNode() != False.getNode() ||
1973 RHS.getNode() != False.getNode()) {
1974 return SDValue();
1975 }
1976
1977 switch (NCC) {
1978 default: return SDValue();
1979 case ISD::SETNE: return LHS;
1980 case ISD::SETEQ: {
1981 ISD::CondCode LHSCC = cast<CondCodeSDNode>(LHS.getOperand(4))->get();
1982 LHSCC = ISD::getSetCCInverse(LHSCC,
1983 LHS.getOperand(0).getValueType().isInteger());
1984 if (DCI.isBeforeLegalizeOps() ||
1985 isCondCodeLegal(LHSCC, LHS.getOperand(0).getSimpleValueType()))
1986 return DAG.getSelectCC(SDLoc(N),
1987 LHS.getOperand(0),
1988 LHS.getOperand(1),
1989 LHS.getOperand(2),
1990 LHS.getOperand(3),
1991 LHSCC);
1992 break;
1993 }
1994 }
1995 return SDValue();
1996 }
1997
1998 case AMDGPUISD::EXPORT: {
1999 SDValue Arg = N->getOperand(1);
2000 if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2001 break;
2002
2003 SDValue NewArgs[8] = {
2004 N->getOperand(0), // Chain
2005 SDValue(),
2006 N->getOperand(2), // ArrayBase
2007 N->getOperand(3), // Type
2008 N->getOperand(4), // SWZ_X
2009 N->getOperand(5), // SWZ_Y
2010 N->getOperand(6), // SWZ_Z
2011 N->getOperand(7) // SWZ_W
2012 };
2013 SDLoc DL(N);
2014 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
2015 return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
2016 }
2017 case AMDGPUISD::TEXTURE_FETCH: {
2018 SDValue Arg = N->getOperand(1);
2019 if (Arg.getOpcode() != ISD::BUILD_VECTOR)
2020 break;
2021
2022 SDValue NewArgs[19] = {
2023 N->getOperand(0),
2024 N->getOperand(1),
2025 N->getOperand(2),
2026 N->getOperand(3),
2027 N->getOperand(4),
2028 N->getOperand(5),
2029 N->getOperand(6),
2030 N->getOperand(7),
2031 N->getOperand(8),
2032 N->getOperand(9),
2033 N->getOperand(10),
2034 N->getOperand(11),
2035 N->getOperand(12),
2036 N->getOperand(13),
2037 N->getOperand(14),
2038 N->getOperand(15),
2039 N->getOperand(16),
2040 N->getOperand(17),
2041 N->getOperand(18),
2042 };
2043 NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
2044 return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
2045 NewArgs);
2046 }
2047 }
2048
2049 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
2050 }
2051
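// FoldOperand tries to fold a source node into the parent machine node's
// operand modifiers: FNEG_R600/FABS_R600 become the neg/abs bits, CONST_COPY
// becomes an ALU_CONST source register plus a sel value (subject to the
// constant-read limitations), and MOV_IMM_I32/MOV_IMM_F32 become either a
// dedicated inline constant register or the ALU_LITERAL_X literal operand.
// It returns true when Src and its modifier operands were rewritten, so the
// caller can rebuild the machine node with the updated operand list.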
2052 static bool
2053 FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
2054 SDValue &Abs, SDValue &Sel, SDValue &Imm, SelectionDAG &DAG) {
2055 const R600InstrInfo *TII =
2056 static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2057 if (!Src.isMachineOpcode())
2058 return false;
2059 switch (Src.getMachineOpcode()) {
2060 case AMDGPU::FNEG_R600:
2061 if (!Neg.getNode())
2062 return false;
2063 Src = Src.getOperand(0);
2064 Neg = DAG.getTargetConstant(1, MVT::i32);
2065 return true;
2066 case AMDGPU::FABS_R600:
2067 if (!Abs.getNode())
2068 return false;
2069 Src = Src.getOperand(0);
2070 Abs = DAG.getTargetConstant(1, MVT::i32);
2071 return true;
2072 case AMDGPU::CONST_COPY: {
2073 unsigned Opcode = ParentNode->getMachineOpcode();
2074 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2075
2076 if (!Sel.getNode())
2077 return false;
2078
2079 SDValue CstOffset = Src.getOperand(0);
2080 if (ParentNode->getValueType(0).isVector())
2081 return false;
2082
2083 // Gather constant values
2084 int SrcIndices[] = {
2085 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2086 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2087 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2),
2088 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2089 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2090 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2091 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2092 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2093 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2094 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2095 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2096 };
2097 std::vector<unsigned> Consts;
2098 for (int OtherSrcIdx : SrcIndices) {
2099 int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
2100 if (OtherSrcIdx < 0 || OtherSelIdx < 0)
2101 continue;
2102 if (HasDst) {
2103 OtherSrcIdx--;
2104 OtherSelIdx--;
2105 }
2106 if (RegisterSDNode *Reg =
2107 dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
2108 if (Reg->getReg() == AMDGPU::ALU_CONST) {
2109 ConstantSDNode *Cst
2110 = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
2111 Consts.push_back(Cst->getZExtValue());
2112 }
2113 }
2114 }
2115
2116 ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
2117 Consts.push_back(Cst->getZExtValue());
2118 if (!TII->fitsConstReadLimitations(Consts)) {
2119 return false;
2120 }
2121
2122 Sel = CstOffset;
2123 Src = DAG.getRegister(AMDGPU::ALU_CONST, MVT::f32);
2124 return true;
2125 }
2126 case AMDGPU::MOV_IMM_I32:
2127 case AMDGPU::MOV_IMM_F32: {
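// Common immediates can be encoded with dedicated inline constant registers
// (ZERO, HALF, ONE and ONE_INT below); anything else has to go through
// ALU_LITERAL_X and occupy the instruction's literal operand, of which only
// one is supported here (see the check further down).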
2128 unsigned ImmReg = AMDGPU::ALU_LITERAL_X;
2129 uint64_t ImmValue = 0;
2130
2131
2132 if (Src.getMachineOpcode() == AMDGPU::MOV_IMM_F32) {
2133 ConstantFPSDNode *FPC = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
2134 float FloatValue = FPC->getValueAPF().convertToFloat();
2135 if (FloatValue == 0.0) {
2136 ImmReg = AMDGPU::ZERO;
2137 } else if (FloatValue == 0.5) {
2138 ImmReg = AMDGPU::HALF;
2139 } else if (FloatValue == 1.0) {
2140 ImmReg = AMDGPU::ONE;
2141 } else {
2142 ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
2143 }
2144 } else {
2145 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src.getOperand(0));
2146 uint64_t Value = C->getZExtValue();
2147 if (Value == 0) {
2148 ImmReg = AMDGPU::ZERO;
2149 } else if (Value == 1) {
2150 ImmReg = AMDGPU::ONE_INT;
2151 } else {
2152 ImmValue = Value;
2153 }
2154 }
2155
2156 // Check that we aren't already using an immediate.
2157 // XXX: It's possible for an instruction to have more than one
2158 // immediate operand, but this is not supported yet.
2159 if (ImmReg == AMDGPU::ALU_LITERAL_X) {
2160 if (!Imm.getNode())
2161 return false;
2162 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Imm);
2163 assert(C);
2164 if (C->getZExtValue())
2165 return false;
2166 Imm = DAG.getTargetConstant(ImmValue, MVT::i32);
2167 }
2168 Src = DAG.getRegister(ImmReg, MVT::i32);
2169 return true;
2170 }
2171 default:
2172 return false;
2173 }
2174 }
2175
2176
2177 /// \brief Fold the instructions after selecting them
2178 SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
2179 SelectionDAG &DAG) const {
2180 const R600InstrInfo *TII =
2181 static_cast<const R600InstrInfo *>(DAG.getSubtarget().getInstrInfo());
2182 if (!Node->isMachineOpcode())
2183 return Node;
2184 unsigned Opcode = Node->getMachineOpcode();
2185 SDValue FakeOp;
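// FakeOp is an empty SDValue passed to FoldOperand for modifier slots that a
// given instruction does not have; FoldOperand checks getNode() and refuses
// to fold into a slot that is not available.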
2186
2187 std::vector<SDValue> Ops;
2188 for (const SDUse &I : Node->ops())
2189 Ops.push_back(I);
2190
2191 if (Opcode == AMDGPU::DOT_4) {
2192 int OperandIdx[] = {
2193 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
2194 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
2195 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Z),
2196 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_W),
2197 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_X),
2198 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Y),
2199 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_Z),
2200 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
2201 };
2202 int NegIdx[] = {
2203 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_X),
2204 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Y),
2205 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_Z),
2206 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg_W),
2207 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_X),
2208 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Y),
2209 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_Z),
2210 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg_W)
2211 };
2212 int AbsIdx[] = {
2213 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_X),
2214 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Y),
2215 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_Z),
2216 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs_W),
2217 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_X),
2218 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Y),
2219 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_Z),
2220 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs_W)
2221 };
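// Note: getOperandIdx returns MachineInstr operand indices, which include the
// dst definition, while the machine SDNode's operand list does not; the index
// adjustments below (the "- 1" and the SelIdx decrement when the instruction
// has a dst) appear to account for that.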
2222 for (unsigned i = 0; i < 8; i++) {
2223 if (OperandIdx[i] < 0)
2224 return Node;
2225 SDValue &Src = Ops[OperandIdx[i] - 1];
2226 SDValue &Neg = Ops[NegIdx[i] - 1];
2227 SDValue &Abs = Ops[AbsIdx[i] - 1];
2228 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2229 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2230 if (HasDst)
2231 SelIdx--;
2232 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2233 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, FakeOp, DAG))
2234 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2235 }
2236 } else if (Opcode == AMDGPU::REG_SEQUENCE) {
2237 for (unsigned i = 1, e = Node->getNumOperands(); i < e; i += 2) {
2238 SDValue &Src = Ops[i];
2239 if (FoldOperand(Node, i, Src, FakeOp, FakeOp, FakeOp, FakeOp, DAG))
2240 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2241 }
2242 } else if (Opcode == AMDGPU::CLAMP_R600) {
2243 SDValue Src = Node->getOperand(0);
2244 if (!Src.isMachineOpcode() ||
2245 !TII->hasInstrModifiers(Src.getMachineOpcode()))
2246 return Node;
2247 int ClampIdx = TII->getOperandIdx(Src.getMachineOpcode(),
2248 AMDGPU::OpName::clamp);
2249 if (ClampIdx < 0)
2250 return Node;
2251 std::vector<SDValue> Ops;
2252 unsigned NumOp = Src.getNumOperands();
2253 for(unsigned i = 0; i < NumOp; ++i)
2254 Ops.push_back(Src.getOperand(i));
2255 Ops[ClampIdx - 1] = DAG.getTargetConstant(1, MVT::i32);
2256 return DAG.getMachineNode(Src.getMachineOpcode(), SDLoc(Node),
2257 Node->getVTList(), Ops);
2258 } else {
2259 if (!TII->hasInstrModifiers(Opcode))
2260 return Node;
2261 int OperandIdx[] = {
2262 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0),
2263 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1),
2264 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2)
2265 };
2266 int NegIdx[] = {
2267 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_neg),
2268 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_neg),
2269 TII->getOperandIdx(Opcode, AMDGPU::OpName::src2_neg)
2270 };
2271 int AbsIdx[] = {
2272 TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_abs),
2273 TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_abs),
2274 -1
2275 };
2276 for (unsigned i = 0; i < 3; i++) {
2277 if (OperandIdx[i] < 0)
2278 return Node;
2279 SDValue &Src = Ops[OperandIdx[i] - 1];
2280 SDValue &Neg = Ops[NegIdx[i] - 1];
2281 SDValue FakeAbs;
2282 SDValue &Abs = (AbsIdx[i] > -1) ? Ops[AbsIdx[i] - 1] : FakeAbs;
2283 bool HasDst = TII->getOperandIdx(Opcode, AMDGPU::OpName::dst) > -1;
2284 int SelIdx = TII->getSelIdx(Opcode, OperandIdx[i]);
2285 int ImmIdx = TII->getOperandIdx(Opcode, AMDGPU::OpName::literal);
2286 if (HasDst) {
2287 SelIdx--;
2288 ImmIdx--;
2289 }
2290 SDValue &Sel = (SelIdx > -1) ? Ops[SelIdx] : FakeOp;
2291 SDValue &Imm = Ops[ImmIdx];
2292 if (FoldOperand(Node, i, Src, Neg, Abs, Sel, Imm, DAG))
2293 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
2294 }
2295 }
2296
2297 return Node;
2298 }