1 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
11 /// \brief Custom DAG lowering for SI
13 //===----------------------------------------------------------------------===//
17 #define _USE_MATH_DEFINES
21 #include "SIISelLowering.h"
23 #include "AMDGPUIntrinsicInfo.h"
24 #include "AMDGPUSubtarget.h"
25 #include "SIInstrInfo.h"
26 #include "SIMachineFunctionInfo.h"
27 #include "SIRegisterInfo.h"
28 #include "llvm/ADT/BitVector.h"
29 #include "llvm/CodeGen/CallingConvLower.h"
30 #include "llvm/CodeGen/MachineInstrBuilder.h"
31 #include "llvm/CodeGen/MachineRegisterInfo.h"
32 #include "llvm/CodeGen/SelectionDAG.h"
33 #include "llvm/IR/Function.h"
34 #include "llvm/ADT/SmallString.h"
38 SITargetLowering::SITargetLowering(TargetMachine
&TM
) :
39 AMDGPUTargetLowering(TM
) {
40 addRegisterClass(MVT::i1
, &AMDGPU::VReg_1RegClass
);
41 addRegisterClass(MVT::i64
, &AMDGPU::SReg_64RegClass
);
43 addRegisterClass(MVT::v32i8
, &AMDGPU::SReg_256RegClass
);
44 addRegisterClass(MVT::v64i8
, &AMDGPU::SReg_512RegClass
);
46 addRegisterClass(MVT::i32
, &AMDGPU::SReg_32RegClass
);
47 addRegisterClass(MVT::f32
, &AMDGPU::VGPR_32RegClass
);
49 addRegisterClass(MVT::f64
, &AMDGPU::VReg_64RegClass
);
50 addRegisterClass(MVT::v2i32
, &AMDGPU::SReg_64RegClass
);
51 addRegisterClass(MVT::v2f32
, &AMDGPU::VReg_64RegClass
);
53 addRegisterClass(MVT::v4i32
, &AMDGPU::SReg_128RegClass
);
54 addRegisterClass(MVT::v4f32
, &AMDGPU::VReg_128RegClass
);
56 addRegisterClass(MVT::v8i32
, &AMDGPU::SReg_256RegClass
);
57 addRegisterClass(MVT::v8f32
, &AMDGPU::VReg_256RegClass
);
59 addRegisterClass(MVT::v16i32
, &AMDGPU::SReg_512RegClass
);
60 addRegisterClass(MVT::v16f32
, &AMDGPU::VReg_512RegClass
);
62 computeRegisterProperties();
64 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v8i32
, Expand
);
65 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v8f32
, Expand
);
66 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v16i32
, Expand
);
67 setOperationAction(ISD::VECTOR_SHUFFLE
, MVT::v16f32
, Expand
);
69 setOperationAction(ISD::ADD
, MVT::i32
, Legal
);
70 setOperationAction(ISD::ADDC
, MVT::i32
, Legal
);
71 setOperationAction(ISD::ADDE
, MVT::i32
, Legal
);
72 setOperationAction(ISD::SUBC
, MVT::i32
, Legal
);
73 setOperationAction(ISD::SUBE
, MVT::i32
, Legal
);
75 setOperationAction(ISD::FSIN
, MVT::f32
, Custom
);
76 setOperationAction(ISD::FCOS
, MVT::f32
, Custom
);
78 setOperationAction(ISD::FMINNUM
, MVT::f32
, Legal
);
79 setOperationAction(ISD::FMAXNUM
, MVT::f32
, Legal
);
80 setOperationAction(ISD::FMINNUM
, MVT::f64
, Legal
);
81 setOperationAction(ISD::FMAXNUM
, MVT::f64
, Legal
);
83 // We need to custom lower vector stores from local memory
84 setOperationAction(ISD::LOAD
, MVT::v4i32
, Custom
);
85 setOperationAction(ISD::LOAD
, MVT::v8i32
, Custom
);
86 setOperationAction(ISD::LOAD
, MVT::v16i32
, Custom
);
88 setOperationAction(ISD::STORE
, MVT::v8i32
, Custom
);
89 setOperationAction(ISD::STORE
, MVT::v16i32
, Custom
);
91 setOperationAction(ISD::STORE
, MVT::i1
, Custom
);
92 setOperationAction(ISD::STORE
, MVT::i32
, Custom
);
93 setOperationAction(ISD::STORE
, MVT::v2i32
, Custom
);
94 setOperationAction(ISD::STORE
, MVT::v4i32
, Custom
);
96 setOperationAction(ISD::SELECT
, MVT::i64
, Custom
);
97 setOperationAction(ISD::SELECT
, MVT::f64
, Promote
);
98 AddPromotedToType(ISD::SELECT
, MVT::f64
, MVT::i64
);
100 setOperationAction(ISD::SELECT_CC
, MVT::f32
, Expand
);
101 setOperationAction(ISD::SELECT_CC
, MVT::i32
, Expand
);
102 setOperationAction(ISD::SELECT_CC
, MVT::i64
, Expand
);
103 setOperationAction(ISD::SELECT_CC
, MVT::f64
, Expand
);
105 setOperationAction(ISD::SETCC
, MVT::v2i1
, Expand
);
106 setOperationAction(ISD::SETCC
, MVT::v4i1
, Expand
);
108 setOperationAction(ISD::BSWAP
, MVT::i32
, Legal
);
110 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::i1
, Legal
);
111 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v2i1
, Custom
);
112 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v4i1
, Custom
);
114 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::i8
, Legal
);
115 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v2i8
, Custom
);
116 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v4i8
, Custom
);
118 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::i16
, Legal
);
119 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v2i16
, Custom
);
120 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::v4i16
, Custom
);
122 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::i32
, Legal
);
123 setOperationAction(ISD::SIGN_EXTEND_INREG
, MVT::Other
, Custom
);
125 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::Other
, Custom
);
126 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::f32
, Custom
);
127 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::v16i8
, Custom
);
128 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::v4f32
, Custom
);
130 setOperationAction(ISD::INTRINSIC_VOID
, MVT::Other
, Custom
);
131 setOperationAction(ISD::BRCOND
, MVT::Other
, Custom
);
133 for (MVT VT
: MVT::integer_valuetypes()) {
137 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::i1
, Promote
);
138 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::i8
, Legal
);
139 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::i16
, Legal
);
140 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::i32
, Expand
);
142 setLoadExtAction(ISD::ZEXTLOAD
, VT
, MVT::i1
, Promote
);
143 setLoadExtAction(ISD::ZEXTLOAD
, VT
, MVT::i8
, Legal
);
144 setLoadExtAction(ISD::ZEXTLOAD
, VT
, MVT::i16
, Legal
);
145 setLoadExtAction(ISD::ZEXTLOAD
, VT
, MVT::i32
, Expand
);
147 setLoadExtAction(ISD::EXTLOAD
, VT
, MVT::i1
, Promote
);
148 setLoadExtAction(ISD::EXTLOAD
, VT
, MVT::i8
, Legal
);
149 setLoadExtAction(ISD::EXTLOAD
, VT
, MVT::i16
, Legal
);
150 setLoadExtAction(ISD::EXTLOAD
, VT
, MVT::i32
, Expand
);
153 for (MVT VT
: MVT::integer_vector_valuetypes()) {
154 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::v8i16
, Expand
);
155 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::v16i16
, Expand
);
158 for (MVT VT
: MVT::fp_valuetypes())
159 setLoadExtAction(ISD::EXTLOAD
, VT
, MVT::f32
, Expand
);
161 setTruncStoreAction(MVT::i32
, MVT::i8
, Custom
);
162 setTruncStoreAction(MVT::i32
, MVT::i16
, Custom
);
163 setTruncStoreAction(MVT::f64
, MVT::f32
, Expand
);
164 setTruncStoreAction(MVT::i64
, MVT::i32
, Expand
);
165 setTruncStoreAction(MVT::v8i32
, MVT::v8i16
, Expand
);
166 setTruncStoreAction(MVT::v16i32
, MVT::v16i16
, Expand
);
168 setOperationAction(ISD::LOAD
, MVT::i1
, Custom
);
170 setOperationAction(ISD::GlobalAddress
, MVT::i32
, Custom
);
171 setOperationAction(ISD::GlobalAddress
, MVT::i64
, Custom
);
172 setOperationAction(ISD::FrameIndex
, MVT::i32
, Custom
);
174 // These should use UDIVREM, so set them to expand
175 setOperationAction(ISD::UDIV
, MVT::i64
, Expand
);
176 setOperationAction(ISD::UREM
, MVT::i64
, Expand
);
178 // We only support LOAD/STORE and vector manipulation ops for vectors
179 // with > 4 elements.
181 MVT::v8i32
, MVT::v8f32
, MVT::v16i32
, MVT::v16f32
184 setOperationAction(ISD::SELECT_CC
, MVT::i1
, Expand
);
185 setOperationAction(ISD::SELECT
, MVT::i1
, Promote
);
187 for (MVT VT
: VecTypes
) {
188 for (unsigned Op
= 0; Op
< ISD::BUILTIN_OP_END
; ++Op
) {
192 case ISD::BUILD_VECTOR
:
194 case ISD::EXTRACT_VECTOR_ELT
:
195 case ISD::INSERT_VECTOR_ELT
:
196 case ISD::INSERT_SUBVECTOR
:
197 case ISD::EXTRACT_SUBVECTOR
:
199 case ISD::CONCAT_VECTORS
:
200 setOperationAction(Op
, VT
, Custom
);
203 setOperationAction(Op
, VT
, Expand
);
209 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS
) {
210 setOperationAction(ISD::FTRUNC
, MVT::f64
, Legal
);
211 setOperationAction(ISD::FCEIL
, MVT::f64
, Legal
);
212 setOperationAction(ISD::FFLOOR
, MVT::f64
, Legal
);
213 setOperationAction(ISD::FRINT
, MVT::f64
, Legal
);
216 setOperationAction(ISD::FDIV
, MVT::f32
, Custom
);
218 setTargetDAGCombine(ISD::FADD
);
219 setTargetDAGCombine(ISD::FSUB
);
220 setTargetDAGCombine(ISD::FMINNUM
);
221 setTargetDAGCombine(ISD::FMAXNUM
);
222 setTargetDAGCombine(ISD::SELECT_CC
);
223 setTargetDAGCombine(ISD::SETCC
);
224 setTargetDAGCombine(ISD::AND
);
225 setTargetDAGCombine(ISD::OR
);
226 setTargetDAGCombine(ISD::UINT_TO_FP
);
228 // All memory operations. Some folding on the pointer operand is done to help
229 // matching the constant offsets in the addressing modes.
230 setTargetDAGCombine(ISD::LOAD
);
231 setTargetDAGCombine(ISD::STORE
);
232 setTargetDAGCombine(ISD::ATOMIC_LOAD
);
233 setTargetDAGCombine(ISD::ATOMIC_STORE
);
234 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP
);
235 setTargetDAGCombine(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS
);
236 setTargetDAGCombine(ISD::ATOMIC_SWAP
);
237 setTargetDAGCombine(ISD::ATOMIC_LOAD_ADD
);
238 setTargetDAGCombine(ISD::ATOMIC_LOAD_SUB
);
239 setTargetDAGCombine(ISD::ATOMIC_LOAD_AND
);
240 setTargetDAGCombine(ISD::ATOMIC_LOAD_OR
);
241 setTargetDAGCombine(ISD::ATOMIC_LOAD_XOR
);
242 setTargetDAGCombine(ISD::ATOMIC_LOAD_NAND
);
243 setTargetDAGCombine(ISD::ATOMIC_LOAD_MIN
);
244 setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX
);
245 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN
);
246 setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX
);
248 setSchedulingPreference(Sched::RegPressure
);
251 //===----------------------------------------------------------------------===//
252 // TargetLowering queries
253 //===----------------------------------------------------------------------===//
255 bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl
<int> &,
257 // SI has some legal vector types, but no legal vector operations. Say no
258 // shuffles are legal in order to prefer scalarizing some vector operations.
262 // FIXME: This really needs an address space argument. The immediate offset
263 // size is different for different sets of memory instruction sets.
265 // The single offset DS instructions have a 16-bit unsigned byte offset.
267 // MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
268 // r + i with addr64. 32-bit has more addressing mode options. Depending on the
269 // resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
271 // SMRD instructions have an 8-bit, dword offset.
273 bool SITargetLowering::isLegalAddressingMode(const AddrMode
&AM
,
275 // No global is ever allowed as a base.
279 // Allow a 16-bit unsigned immediate field, since this is what DS instructions
281 if (!isUInt
<16>(AM
.BaseOffs
))
286 case 0: // "r+i" or just "i", depending on HasBaseReg.
289 if (AM
.HasBaseReg
&& AM
.BaseOffs
) // "r+r+i" is not allowed.
291 // Otherwise we have r+r or r+i.
294 if (AM
.HasBaseReg
|| AM
.BaseOffs
) // 2*r+r or 2*r+i is not allowed.
298 default: // Don't allow n * r
305 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT
,
308 bool *IsFast
) const {
312 // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
313 // which isn't a simple VT.
314 if (!VT
.isSimple() || VT
== MVT::Other
)
317 // XXX - CI changes say "Support for unaligned memory accesses" but I don't
318 // see what for specifically. The wording everywhere else seems to be the
321 // XXX - The only mention I see of this in the ISA manual is for LDS direct
322 // reads the "byte address and must be dword aligned". Is it also true for the
323 // normal loads and stores?
324 if (AddrSpace
== AMDGPUAS::LOCAL_ADDRESS
) {
325 // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
326 // aligned, 8 byte access in a single operation using ds_read2/write2_b32
327 // with adjacent offsets.
328 return Align
% 4 == 0;
331 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
332 // byte-address are ignored, thus forcing Dword alignment.
333 // This applies to private, global, and constant memory.
336 return VT
.bitsGT(MVT::i32
);
339 EVT
SITargetLowering::getOptimalMemOpType(uint64_t Size
, unsigned DstAlign
,
340 unsigned SrcAlign
, bool IsMemset
,
343 MachineFunction
&MF
) const {
344 // FIXME: Should account for address space here.
346 // The default fallback uses the private pointer size as a guess for a type to
347 // use. Make sure we switch these to 64-bit accesses.
349 if (Size
>= 16 && DstAlign
>= 4) // XXX: Should only do for global
352 if (Size
>= 8 && DstAlign
>= 4)
359 TargetLoweringBase::LegalizeTypeAction
360 SITargetLowering::getPreferredVectorAction(EVT VT
) const {
361 if (VT
.getVectorNumElements() != 1 && VT
.getScalarType().bitsLE(MVT::i16
))
362 return TypeSplitVector
;
364 return TargetLoweringBase::getPreferredVectorAction(VT
);
367 bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt
&Imm
,
369 const SIInstrInfo
*TII
= static_cast<const SIInstrInfo
*>(
370 getTargetMachine().getSubtargetImpl()->getInstrInfo());
371 return TII
->isInlineConstant(Imm
);
374 SDValue
SITargetLowering::LowerParameter(SelectionDAG
&DAG
, EVT VT
, EVT MemVT
,
375 SDLoc SL
, SDValue Chain
,
376 unsigned Offset
, bool Signed
) const {
377 const DataLayout
*DL
= getDataLayout();
378 MachineFunction
&MF
= DAG
.getMachineFunction();
379 const SIRegisterInfo
*TRI
=
380 static_cast<const SIRegisterInfo
*>(Subtarget
->getRegisterInfo());
381 unsigned InputPtrReg
= TRI
->getPreloadedValue(MF
, SIRegisterInfo::INPUT_PTR
);
383 Type
*Ty
= VT
.getTypeForEVT(*DAG
.getContext());
385 MachineRegisterInfo
&MRI
= DAG
.getMachineFunction().getRegInfo();
386 PointerType
*PtrTy
= PointerType::get(Ty
, AMDGPUAS::CONSTANT_ADDRESS
);
387 SDValue BasePtr
= DAG
.getCopyFromReg(Chain
, SL
,
388 MRI
.getLiveInVirtReg(InputPtrReg
), MVT::i64
);
389 SDValue Ptr
= DAG
.getNode(ISD::ADD
, SL
, MVT::i64
, BasePtr
,
390 DAG
.getConstant(Offset
, MVT::i64
));
391 SDValue PtrOffset
= DAG
.getUNDEF(getPointerTy(AMDGPUAS::CONSTANT_ADDRESS
));
392 MachinePointerInfo
PtrInfo(UndefValue::get(PtrTy
));
394 return DAG
.getLoad(ISD::UNINDEXED
, Signed
? ISD::SEXTLOAD
: ISD::ZEXTLOAD
,
395 VT
, SL
, Chain
, Ptr
, PtrOffset
, PtrInfo
, MemVT
,
397 true, // isNonTemporal
399 DL
->getABITypeAlignment(Ty
)); // Alignment
402 SDValue
SITargetLowering::LowerFormalArguments(
404 CallingConv::ID CallConv
,
406 const SmallVectorImpl
<ISD::InputArg
> &Ins
,
407 SDLoc DL
, SelectionDAG
&DAG
,
408 SmallVectorImpl
<SDValue
> &InVals
) const {
410 const TargetMachine
&TM
= getTargetMachine();
411 const SIRegisterInfo
*TRI
=
412 static_cast<const SIRegisterInfo
*>(TM
.getSubtargetImpl()->getRegisterInfo());
414 MachineFunction
&MF
= DAG
.getMachineFunction();
415 FunctionType
*FType
= MF
.getFunction()->getFunctionType();
416 SIMachineFunctionInfo
*Info
= MF
.getInfo
<SIMachineFunctionInfo
>();
418 assert(CallConv
== CallingConv::C
);
420 SmallVector
<ISD::InputArg
, 16> Splits
;
421 BitVector
Skipped(Ins
.size());
423 for (unsigned i
= 0, e
= Ins
.size(), PSInputNum
= 0; i
!= e
; ++i
) {
424 const ISD::InputArg
&Arg
= Ins
[i
];
426 // First check if it's a PS input addr
427 if (Info
->getShaderType() == ShaderType::PIXEL
&& !Arg
.Flags
.isInReg() &&
428 !Arg
.Flags
.isByVal()) {
430 assert((PSInputNum
<= 15) && "Too many PS inputs!");
433 // We can savely skip PS inputs
439 Info
->PSInputAddr
|= 1 << PSInputNum
++;
442 // Second split vertices into their elements
443 if (Info
->getShaderType() != ShaderType::COMPUTE
&& Arg
.VT
.isVector()) {
444 ISD::InputArg NewArg
= Arg
;
445 NewArg
.Flags
.setSplit();
446 NewArg
.VT
= Arg
.VT
.getVectorElementType();
448 // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
449 // three or five element vertex only needs three or five registers,
450 // NOT four or eigth.
451 Type
*ParamType
= FType
->getParamType(Arg
.OrigArgIndex
);
452 unsigned NumElements
= ParamType
->getVectorNumElements();
454 for (unsigned j
= 0; j
!= NumElements
; ++j
) {
455 Splits
.push_back(NewArg
);
456 NewArg
.PartOffset
+= NewArg
.VT
.getStoreSize();
459 } else if (Info
->getShaderType() != ShaderType::COMPUTE
) {
460 Splits
.push_back(Arg
);
464 SmallVector
<CCValAssign
, 16> ArgLocs
;
465 CCState
CCInfo(CallConv
, isVarArg
, DAG
.getMachineFunction(), ArgLocs
,
468 // At least one interpolation mode must be enabled or else the GPU will hang.
469 if (Info
->getShaderType() == ShaderType::PIXEL
&&
470 (Info
->PSInputAddr
& 0x7F) == 0) {
471 Info
->PSInputAddr
|= 1;
472 CCInfo
.AllocateReg(AMDGPU::VGPR0
);
473 CCInfo
.AllocateReg(AMDGPU::VGPR1
);
476 // The pointer to the list of arguments is stored in SGPR0, SGPR1
477 // The pointer to the scratch buffer is stored in SGPR2, SGPR3
478 if (Info
->getShaderType() == ShaderType::COMPUTE
) {
479 if (Subtarget
->isAmdHsaOS())
480 Info
->NumUserSGPRs
= 2; // FIXME: Need to support scratch buffers.
482 Info
->NumUserSGPRs
= 4;
484 unsigned InputPtrReg
=
485 TRI
->getPreloadedValue(MF
, SIRegisterInfo::INPUT_PTR
);
486 unsigned InputPtrRegLo
=
487 TRI
->getPhysRegSubReg(InputPtrReg
, &AMDGPU::SReg_32RegClass
, 0);
488 unsigned InputPtrRegHi
=
489 TRI
->getPhysRegSubReg(InputPtrReg
, &AMDGPU::SReg_32RegClass
, 1);
491 unsigned ScratchPtrReg
=
492 TRI
->getPreloadedValue(MF
, SIRegisterInfo::SCRATCH_PTR
);
493 unsigned ScratchPtrRegLo
=
494 TRI
->getPhysRegSubReg(ScratchPtrReg
, &AMDGPU::SReg_32RegClass
, 0);
495 unsigned ScratchPtrRegHi
=
496 TRI
->getPhysRegSubReg(ScratchPtrReg
, &AMDGPU::SReg_32RegClass
, 1);
498 CCInfo
.AllocateReg(InputPtrRegLo
);
499 CCInfo
.AllocateReg(InputPtrRegHi
);
500 CCInfo
.AllocateReg(ScratchPtrRegLo
);
501 CCInfo
.AllocateReg(ScratchPtrRegHi
);
502 MF
.addLiveIn(InputPtrReg
, &AMDGPU::SReg_64RegClass
);
503 MF
.addLiveIn(ScratchPtrReg
, &AMDGPU::SReg_64RegClass
);
506 if (Info
->getShaderType() == ShaderType::COMPUTE
) {
507 getOriginalFunctionArgs(DAG
, DAG
.getMachineFunction().getFunction(), Ins
,
511 AnalyzeFormalArguments(CCInfo
, Splits
);
513 for (unsigned i
= 0, e
= Ins
.size(), ArgIdx
= 0; i
!= e
; ++i
) {
515 const ISD::InputArg
&Arg
= Ins
[i
];
517 InVals
.push_back(DAG
.getUNDEF(Arg
.VT
));
521 CCValAssign
&VA
= ArgLocs
[ArgIdx
++];
522 MVT VT
= VA
.getLocVT();
526 EVT MemVT
= Splits
[i
].VT
;
527 const unsigned Offset
= 36 + VA
.getLocMemOffset();
528 // The first 36 bytes of the input buffer contains information about
529 // thread group and global sizes.
530 SDValue Arg
= LowerParameter(DAG
, VT
, MemVT
, DL
, DAG
.getRoot(),
531 Offset
, Ins
[i
].Flags
.isSExt());
533 const PointerType
*ParamTy
=
534 dyn_cast
<PointerType
>(FType
->getParamType(Ins
[i
].OrigArgIndex
));
535 if (Subtarget
->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS
&&
536 ParamTy
&& ParamTy
->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
) {
537 // On SI local pointers are just offsets into LDS, so they are always
538 // less than 16-bits. On CI and newer they could potentially be
539 // real pointers, so we can't guarantee their size.
540 Arg
= DAG
.getNode(ISD::AssertZext
, DL
, Arg
.getValueType(), Arg
,
541 DAG
.getValueType(MVT::i16
));
544 InVals
.push_back(Arg
);
545 Info
->ABIArgOffset
= Offset
+ MemVT
.getStoreSize();
548 assert(VA
.isRegLoc() && "Parameter must be in a register!");
550 unsigned Reg
= VA
.getLocReg();
552 if (VT
== MVT::i64
) {
553 // For now assume it is a pointer
554 Reg
= TRI
->getMatchingSuperReg(Reg
, AMDGPU::sub0
,
555 &AMDGPU::SReg_64RegClass
);
556 Reg
= MF
.addLiveIn(Reg
, &AMDGPU::SReg_64RegClass
);
557 InVals
.push_back(DAG
.getCopyFromReg(Chain
, DL
, Reg
, VT
));
561 const TargetRegisterClass
*RC
= TRI
->getMinimalPhysRegClass(Reg
, VT
);
563 Reg
= MF
.addLiveIn(Reg
, RC
);
564 SDValue Val
= DAG
.getCopyFromReg(Chain
, DL
, Reg
, VT
);
566 if (Arg
.VT
.isVector()) {
568 // Build a vector from the registers
569 Type
*ParamType
= FType
->getParamType(Arg
.OrigArgIndex
);
570 unsigned NumElements
= ParamType
->getVectorNumElements();
572 SmallVector
<SDValue
, 4> Regs
;
574 for (unsigned j
= 1; j
!= NumElements
; ++j
) {
575 Reg
= ArgLocs
[ArgIdx
++].getLocReg();
576 Reg
= MF
.addLiveIn(Reg
, RC
);
577 Regs
.push_back(DAG
.getCopyFromReg(Chain
, DL
, Reg
, VT
));
580 // Fill up the missing vector elements
581 NumElements
= Arg
.VT
.getVectorNumElements() - NumElements
;
582 for (unsigned j
= 0; j
!= NumElements
; ++j
)
583 Regs
.push_back(DAG
.getUNDEF(VT
));
585 InVals
.push_back(DAG
.getNode(ISD::BUILD_VECTOR
, DL
, Arg
.VT
, Regs
));
589 InVals
.push_back(Val
);
592 if (Info
->getShaderType() != ShaderType::COMPUTE
) {
593 unsigned ScratchIdx
= CCInfo
.getFirstUnallocated(
594 AMDGPU::SGPR_32RegClass
.begin(), AMDGPU::SGPR_32RegClass
.getNumRegs());
595 Info
->ScratchOffsetReg
= AMDGPU::SGPR_32RegClass
.getRegister(ScratchIdx
);
600 MachineBasicBlock
* SITargetLowering::EmitInstrWithCustomInserter(
601 MachineInstr
* MI
, MachineBasicBlock
* BB
) const {
603 MachineBasicBlock::iterator I
= *MI
;
604 const SIInstrInfo
*TII
= static_cast<const SIInstrInfo
*>(
605 getTargetMachine().getSubtargetImpl()->getInstrInfo());
607 switch (MI
->getOpcode()) {
609 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI
, BB
);
610 case AMDGPU::BRANCH
: return BB
;
611 case AMDGPU::V_SUB_F64
: {
612 unsigned DestReg
= MI
->getOperand(0).getReg();
613 BuildMI(*BB
, I
, MI
->getDebugLoc(), TII
->get(AMDGPU::V_ADD_F64
), DestReg
)
614 .addImm(0) // SRC0 modifiers
615 .addReg(MI
->getOperand(1).getReg())
616 .addImm(1) // SRC1 modifiers
617 .addReg(MI
->getOperand(2).getReg())
620 MI
->eraseFromParent();
623 case AMDGPU::SI_RegisterStorePseudo
: {
624 MachineRegisterInfo
&MRI
= BB
->getParent()->getRegInfo();
625 unsigned Reg
= MRI
.createVirtualRegister(&AMDGPU::SReg_64RegClass
);
626 MachineInstrBuilder MIB
=
627 BuildMI(*BB
, I
, MI
->getDebugLoc(), TII
->get(AMDGPU::SI_RegisterStore
),
629 for (unsigned i
= 0, e
= MI
->getNumOperands(); i
!= e
; ++i
)
630 MIB
.addOperand(MI
->getOperand(i
));
632 MI
->eraseFromParent();
639 EVT
SITargetLowering::getSetCCResultType(LLVMContext
&Ctx
, EVT VT
) const {
640 if (!VT
.isVector()) {
643 return EVT::getVectorVT(Ctx
, MVT::i1
, VT
.getVectorNumElements());
646 MVT
SITargetLowering::getScalarShiftAmountTy(EVT VT
) const {
650 bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT
) const {
651 VT
= VT
.getScalarType();
656 switch (VT
.getSimpleVT().SimpleTy
) {
658 return false; /* There is V_MAD_F32 for f32 */
668 //===----------------------------------------------------------------------===//
669 // Custom DAG Lowering Operations
670 //===----------------------------------------------------------------------===//
672 SDValue
SITargetLowering::LowerOperation(SDValue Op
, SelectionDAG
&DAG
) const {
673 switch (Op
.getOpcode()) {
674 default: return AMDGPUTargetLowering::LowerOperation(Op
, DAG
);
675 case ISD::FrameIndex
: return LowerFrameIndex(Op
, DAG
);
676 case ISD::BRCOND
: return LowerBRCOND(Op
, DAG
);
678 SDValue Result
= LowerLOAD(Op
, DAG
);
679 assert((!Result
.getNode() ||
680 Result
.getNode()->getNumValues() == 2) &&
681 "Load should return a value and a chain");
687 return LowerTrig(Op
, DAG
);
688 case ISD::SELECT
: return LowerSELECT(Op
, DAG
);
689 case ISD::FDIV
: return LowerFDIV(Op
, DAG
);
690 case ISD::STORE
: return LowerSTORE(Op
, DAG
);
691 case ISD::GlobalAddress
: {
692 MachineFunction
&MF
= DAG
.getMachineFunction();
693 SIMachineFunctionInfo
*MFI
= MF
.getInfo
<SIMachineFunctionInfo
>();
694 return LowerGlobalAddress(MFI
, Op
, DAG
);
696 case ISD::INTRINSIC_WO_CHAIN
: return LowerINTRINSIC_WO_CHAIN(Op
, DAG
);
697 case ISD::INTRINSIC_VOID
: return LowerINTRINSIC_VOID(Op
, DAG
);
702 /// \brief Helper function for LowerBRCOND
703 static SDNode
*findUser(SDValue Value
, unsigned Opcode
) {
705 SDNode
*Parent
= Value
.getNode();
706 for (SDNode::use_iterator I
= Parent
->use_begin(), E
= Parent
->use_end();
709 if (I
.getUse().get() != Value
)
712 if (I
->getOpcode() == Opcode
)
718 SDValue
SITargetLowering::LowerFrameIndex(SDValue Op
, SelectionDAG
&DAG
) const {
720 FrameIndexSDNode
*FINode
= cast
<FrameIndexSDNode
>(Op
);
721 unsigned FrameIndex
= FINode
->getIndex();
723 return DAG
.getTargetFrameIndex(FrameIndex
, MVT::i32
);
726 /// This transforms the control flow intrinsics to get the branch destination as
727 /// last parameter, also switches branch target with BR if the need arise
728 SDValue
SITargetLowering::LowerBRCOND(SDValue BRCOND
,
729 SelectionDAG
&DAG
) const {
733 SDNode
*Intr
= BRCOND
.getOperand(1).getNode();
734 SDValue Target
= BRCOND
.getOperand(2);
735 SDNode
*BR
= nullptr;
737 if (Intr
->getOpcode() == ISD::SETCC
) {
738 // As long as we negate the condition everything is fine
739 SDNode
*SetCC
= Intr
;
740 assert(SetCC
->getConstantOperandVal(1) == 1);
741 assert(cast
<CondCodeSDNode
>(SetCC
->getOperand(2).getNode())->get() ==
743 Intr
= SetCC
->getOperand(0).getNode();
746 // Get the target from BR if we don't negate the condition
747 BR
= findUser(BRCOND
, ISD::BR
);
748 Target
= BR
->getOperand(1);
751 assert(Intr
->getOpcode() == ISD::INTRINSIC_W_CHAIN
);
753 // Build the result and
754 SmallVector
<EVT
, 4> Res
;
755 for (unsigned i
= 1, e
= Intr
->getNumValues(); i
!= e
; ++i
)
756 Res
.push_back(Intr
->getValueType(i
));
758 // operands of the new intrinsic call
759 SmallVector
<SDValue
, 4> Ops
;
760 Ops
.push_back(BRCOND
.getOperand(0));
761 for (unsigned i
= 1, e
= Intr
->getNumOperands(); i
!= e
; ++i
)
762 Ops
.push_back(Intr
->getOperand(i
));
763 Ops
.push_back(Target
);
765 // build the new intrinsic call
766 SDNode
*Result
= DAG
.getNode(
767 Res
.size() > 1 ? ISD::INTRINSIC_W_CHAIN
: ISD::INTRINSIC_VOID
, DL
,
768 DAG
.getVTList(Res
), Ops
).getNode();
771 // Give the branch instruction our target
776 SDValue NewBR
= DAG
.getNode(ISD::BR
, DL
, BR
->getVTList(), Ops
);
777 DAG
.ReplaceAllUsesWith(BR
, NewBR
.getNode());
778 BR
= NewBR
.getNode();
781 SDValue Chain
= SDValue(Result
, Result
->getNumValues() - 1);
783 // Copy the intrinsic results to registers
784 for (unsigned i
= 1, e
= Intr
->getNumValues() - 1; i
!= e
; ++i
) {
785 SDNode
*CopyToReg
= findUser(SDValue(Intr
, i
), ISD::CopyToReg
);
789 Chain
= DAG
.getCopyToReg(
791 CopyToReg
->getOperand(1),
792 SDValue(Result
, i
- 1),
795 DAG
.ReplaceAllUsesWith(SDValue(CopyToReg
, 0), CopyToReg
->getOperand(0));
798 // Remove the old intrinsic from the chain
799 DAG
.ReplaceAllUsesOfValueWith(
800 SDValue(Intr
, Intr
->getNumValues() - 1),
801 Intr
->getOperand(0));
806 SDValue
SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction
*MFI
,
808 SelectionDAG
&DAG
) const {
809 GlobalAddressSDNode
*GSD
= cast
<GlobalAddressSDNode
>(Op
);
811 if (GSD
->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS
)
812 return AMDGPUTargetLowering::LowerGlobalAddress(MFI
, Op
, DAG
);
815 const GlobalValue
*GV
= GSD
->getGlobal();
816 MVT PtrVT
= getPointerTy(GSD
->getAddressSpace());
818 SDValue Ptr
= DAG
.getNode(AMDGPUISD::CONST_DATA_PTR
, DL
, PtrVT
);
819 SDValue GA
= DAG
.getTargetGlobalAddress(GV
, DL
, MVT::i32
);
821 SDValue PtrLo
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, MVT::i32
, Ptr
,
822 DAG
.getConstant(0, MVT::i32
));
823 SDValue PtrHi
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, MVT::i32
, Ptr
,
824 DAG
.getConstant(1, MVT::i32
));
826 SDValue Lo
= DAG
.getNode(ISD::ADDC
, DL
, DAG
.getVTList(MVT::i32
, MVT::Glue
),
828 SDValue Hi
= DAG
.getNode(ISD::ADDE
, DL
, DAG
.getVTList(MVT::i32
, MVT::Glue
),
829 PtrHi
, DAG
.getConstant(0, MVT::i32
),
830 SDValue(Lo
.getNode(), 1));
831 return DAG
.getNode(ISD::BUILD_PAIR
, DL
, MVT::i64
, Lo
, Hi
);
834 SDValue
SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op
,
835 SelectionDAG
&DAG
) const {
836 MachineFunction
&MF
= DAG
.getMachineFunction();
837 const SIRegisterInfo
*TRI
=
838 static_cast<const SIRegisterInfo
*>(MF
.getSubtarget().getRegisterInfo());
840 EVT VT
= Op
.getValueType();
842 unsigned IntrinsicID
= cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue();
844 switch (IntrinsicID
) {
845 case Intrinsic::r600_read_ngroups_x
:
846 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
847 SI::KernelInputOffsets::NGROUPS_X
, false);
848 case Intrinsic::r600_read_ngroups_y
:
849 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
850 SI::KernelInputOffsets::NGROUPS_Y
, false);
851 case Intrinsic::r600_read_ngroups_z
:
852 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
853 SI::KernelInputOffsets::NGROUPS_Z
, false);
854 case Intrinsic::r600_read_global_size_x
:
855 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
856 SI::KernelInputOffsets::GLOBAL_SIZE_X
, false);
857 case Intrinsic::r600_read_global_size_y
:
858 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
859 SI::KernelInputOffsets::GLOBAL_SIZE_Y
, false);
860 case Intrinsic::r600_read_global_size_z
:
861 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
862 SI::KernelInputOffsets::GLOBAL_SIZE_Z
, false);
863 case Intrinsic::r600_read_local_size_x
:
864 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
865 SI::KernelInputOffsets::LOCAL_SIZE_X
, false);
866 case Intrinsic::r600_read_local_size_y
:
867 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
868 SI::KernelInputOffsets::LOCAL_SIZE_Y
, false);
869 case Intrinsic::r600_read_local_size_z
:
870 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
871 SI::KernelInputOffsets::LOCAL_SIZE_Z
, false);
873 case Intrinsic::AMDGPU_read_workdim
:
874 return LowerParameter(DAG
, VT
, VT
, DL
, DAG
.getEntryNode(),
875 MF
.getInfo
<SIMachineFunctionInfo
>()->ABIArgOffset
,
878 case Intrinsic::r600_read_tgid_x
:
879 return CreateLiveInRegister(DAG
, &AMDGPU::SReg_32RegClass
,
880 TRI
->getPreloadedValue(MF
, SIRegisterInfo::TGID_X
), VT
);
881 case Intrinsic::r600_read_tgid_y
:
882 return CreateLiveInRegister(DAG
, &AMDGPU::SReg_32RegClass
,
883 TRI
->getPreloadedValue(MF
, SIRegisterInfo::TGID_Y
), VT
);
884 case Intrinsic::r600_read_tgid_z
:
885 return CreateLiveInRegister(DAG
, &AMDGPU::SReg_32RegClass
,
886 TRI
->getPreloadedValue(MF
, SIRegisterInfo::TGID_Z
), VT
);
887 case Intrinsic::r600_read_tidig_x
:
888 return CreateLiveInRegister(DAG
, &AMDGPU::VGPR_32RegClass
,
889 TRI
->getPreloadedValue(MF
, SIRegisterInfo::TIDIG_X
), VT
);
890 case Intrinsic::r600_read_tidig_y
:
891 return CreateLiveInRegister(DAG
, &AMDGPU::VGPR_32RegClass
,
892 TRI
->getPreloadedValue(MF
, SIRegisterInfo::TIDIG_Y
), VT
);
893 case Intrinsic::r600_read_tidig_z
:
894 return CreateLiveInRegister(DAG
, &AMDGPU::VGPR_32RegClass
,
895 TRI
->getPreloadedValue(MF
, SIRegisterInfo::TIDIG_Z
), VT
);
896 case AMDGPUIntrinsic::SI_load_const
: {
902 MachineMemOperand
*MMO
= MF
.getMachineMemOperand(
903 MachinePointerInfo(),
904 MachineMemOperand::MOLoad
| MachineMemOperand::MOInvariant
,
905 VT
.getStoreSize(), 4);
906 return DAG
.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT
, DL
,
907 Op
->getVTList(), Ops
, VT
, MMO
);
909 case AMDGPUIntrinsic::SI_sample
:
910 return LowerSampleIntrinsic(AMDGPUISD::SAMPLE
, Op
, DAG
);
911 case AMDGPUIntrinsic::SI_sampleb
:
912 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEB
, Op
, DAG
);
913 case AMDGPUIntrinsic::SI_sampled
:
914 return LowerSampleIntrinsic(AMDGPUISD::SAMPLED
, Op
, DAG
);
915 case AMDGPUIntrinsic::SI_samplel
:
916 return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL
, Op
, DAG
);
917 case AMDGPUIntrinsic::SI_vs_load_input
:
918 return DAG
.getNode(AMDGPUISD::LOAD_INPUT
, DL
, VT
,
923 return AMDGPUTargetLowering::LowerOperation(Op
, DAG
);
927 SDValue
SITargetLowering::LowerINTRINSIC_VOID(SDValue Op
,
928 SelectionDAG
&DAG
) const {
929 MachineFunction
&MF
= DAG
.getMachineFunction();
930 SDValue Chain
= Op
.getOperand(0);
931 unsigned IntrinsicID
= cast
<ConstantSDNode
>(Op
.getOperand(1))->getZExtValue();
933 switch (IntrinsicID
) {
934 case AMDGPUIntrinsic::SI_tbuffer_store
: {
953 EVT VT
= Op
.getOperand(3).getValueType();
955 MachineMemOperand
*MMO
= MF
.getMachineMemOperand(
956 MachinePointerInfo(),
957 MachineMemOperand::MOStore
,
958 VT
.getStoreSize(), 4);
959 return DAG
.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT
, DL
,
960 Op
->getVTList(), Ops
, VT
, MMO
);
967 SDValue
SITargetLowering::LowerLOAD(SDValue Op
, SelectionDAG
&DAG
) const {
969 LoadSDNode
*Load
= cast
<LoadSDNode
>(Op
);
971 if (Op
.getValueType().isVector()) {
972 assert(Op
.getValueType().getVectorElementType() == MVT::i32
&&
973 "Custom lowering for non-i32 vectors hasn't been implemented.");
974 unsigned NumElements
= Op
.getValueType().getVectorNumElements();
975 assert(NumElements
!= 2 && "v2 loads are supported for all address spaces.");
976 switch (Load
->getAddressSpace()) {
978 case AMDGPUAS::GLOBAL_ADDRESS
:
979 case AMDGPUAS::PRIVATE_ADDRESS
:
980 // v4 loads are supported for private and global memory.
981 if (NumElements
<= 4)
984 case AMDGPUAS::LOCAL_ADDRESS
:
985 return ScalarizeVectorLoad(Op
, DAG
);
989 return AMDGPUTargetLowering::LowerLOAD(Op
, DAG
);
992 SDValue
SITargetLowering::LowerSampleIntrinsic(unsigned Opcode
,
994 SelectionDAG
&DAG
) const {
995 return DAG
.getNode(Opcode
, SDLoc(Op
), Op
.getValueType(), Op
.getOperand(1),
1001 SDValue
SITargetLowering::LowerSELECT(SDValue Op
, SelectionDAG
&DAG
) const {
1002 if (Op
.getValueType() != MVT::i64
)
1006 SDValue Cond
= Op
.getOperand(0);
1008 SDValue Zero
= DAG
.getConstant(0, MVT::i32
);
1009 SDValue One
= DAG
.getConstant(1, MVT::i32
);
1011 SDValue LHS
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2i32
, Op
.getOperand(1));
1012 SDValue RHS
= DAG
.getNode(ISD::BITCAST
, DL
, MVT::v2i32
, Op
.getOperand(2));
1014 SDValue Lo0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, LHS
, Zero
);
1015 SDValue Lo1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, RHS
, Zero
);
1017 SDValue Lo
= DAG
.getSelect(DL
, MVT::i32
, Cond
, Lo0
, Lo1
);
1019 SDValue Hi0
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, LHS
, One
);
1020 SDValue Hi1
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, MVT::i32
, RHS
, One
);
1022 SDValue Hi
= DAG
.getSelect(DL
, MVT::i32
, Cond
, Hi0
, Hi1
);
1024 SDValue Res
= DAG
.getNode(ISD::BUILD_VECTOR
, DL
, MVT::v2i32
, Lo
, Hi
);
1025 return DAG
.getNode(ISD::BITCAST
, DL
, MVT::i64
, Res
);
1028 // Catch division cases where we can use shortcuts with rcp and rsq
1030 SDValue
SITargetLowering::LowerFastFDIV(SDValue Op
, SelectionDAG
&DAG
) const {
1032 SDValue LHS
= Op
.getOperand(0);
1033 SDValue RHS
= Op
.getOperand(1);
1034 EVT VT
= Op
.getValueType();
1035 bool Unsafe
= DAG
.getTarget().Options
.UnsafeFPMath
;
1037 if (const ConstantFPSDNode
*CLHS
= dyn_cast
<ConstantFPSDNode
>(LHS
)) {
1038 if ((Unsafe
|| (VT
== MVT::f32
&& !Subtarget
->hasFP32Denormals())) &&
1039 CLHS
->isExactlyValue(1.0)) {
1040 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
1041 // the CI documentation has a worst case error of 1 ulp.
1042 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
1043 // use it as long as we aren't trying to use denormals.
1045 // 1.0 / sqrt(x) -> rsq(x)
1047 // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
1048 // error seems really high at 2^29 ULP.
1049 if (RHS
.getOpcode() == ISD::FSQRT
)
1050 return DAG
.getNode(AMDGPUISD::RSQ
, SL
, VT
, RHS
.getOperand(0));
1052 // 1.0 / x -> rcp(x)
1053 return DAG
.getNode(AMDGPUISD::RCP
, SL
, VT
, RHS
);
1058 // Turn into multiply by the reciprocal.
1059 // x / y -> x * (1.0 / y)
1060 SDValue Recip
= DAG
.getNode(AMDGPUISD::RCP
, SL
, VT
, RHS
);
1061 return DAG
.getNode(ISD::FMUL
, SL
, VT
, LHS
, Recip
);
1067 SDValue
SITargetLowering::LowerFDIV32(SDValue Op
, SelectionDAG
&DAG
) const {
1068 SDValue FastLowered
= LowerFastFDIV(Op
, DAG
);
1069 if (FastLowered
.getNode())
1072 // This uses v_rcp_f32 which does not handle denormals. Let this hit a
1073 // selection error for now rather than do something incorrect.
1074 if (Subtarget
->hasFP32Denormals())
1078 SDValue LHS
= Op
.getOperand(0);
1079 SDValue RHS
= Op
.getOperand(1);
1081 SDValue r1
= DAG
.getNode(ISD::FABS
, SL
, MVT::f32
, RHS
);
1083 const APFloat
K0Val(BitsToFloat(0x6f800000));
1084 const SDValue K0
= DAG
.getConstantFP(K0Val
, MVT::f32
);
1086 const APFloat
K1Val(BitsToFloat(0x2f800000));
1087 const SDValue K1
= DAG
.getConstantFP(K1Val
, MVT::f32
);
1089 const SDValue One
= DAG
.getConstantFP(1.0, MVT::f32
);
1091 EVT SetCCVT
= getSetCCResultType(*DAG
.getContext(), MVT::f32
);
1093 SDValue r2
= DAG
.getSetCC(SL
, SetCCVT
, r1
, K0
, ISD::SETOGT
);
1095 SDValue r3
= DAG
.getNode(ISD::SELECT
, SL
, MVT::f32
, r2
, K1
, One
);
1097 r1
= DAG
.getNode(ISD::FMUL
, SL
, MVT::f32
, RHS
, r3
);
1099 SDValue r0
= DAG
.getNode(AMDGPUISD::RCP
, SL
, MVT::f32
, r1
);
1101 SDValue Mul
= DAG
.getNode(ISD::FMUL
, SL
, MVT::f32
, LHS
, r0
);
1103 return DAG
.getNode(ISD::FMUL
, SL
, MVT::f32
, r3
, Mul
);
1106 SDValue
SITargetLowering::LowerFDIV64(SDValue Op
, SelectionDAG
&DAG
) const {
1110 SDValue
SITargetLowering::LowerFDIV(SDValue Op
, SelectionDAG
&DAG
) const {
1111 EVT VT
= Op
.getValueType();
1114 return LowerFDIV32(Op
, DAG
);
1117 return LowerFDIV64(Op
, DAG
);
1119 llvm_unreachable("Unexpected type for fdiv");
1122 SDValue
SITargetLowering::LowerSTORE(SDValue Op
, SelectionDAG
&DAG
) const {
1124 StoreSDNode
*Store
= cast
<StoreSDNode
>(Op
);
1125 EVT VT
= Store
->getMemoryVT();
1127 // These stores are legal.
1128 if (Store
->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
&&
1129 VT
.isVector() && VT
.getVectorNumElements() == 2 &&
1130 VT
.getVectorElementType() == MVT::i32
)
1133 if (Store
->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
) {
1134 if (VT
.isVector() && VT
.getVectorNumElements() > 4)
1135 return ScalarizeVectorStore(Op
, DAG
);
1139 SDValue Ret
= AMDGPUTargetLowering::LowerSTORE(Op
, DAG
);
1143 if (VT
.isVector() && VT
.getVectorNumElements() >= 8)
1144 return ScalarizeVectorStore(Op
, DAG
);
1147 return DAG
.getTruncStore(Store
->getChain(), DL
,
1148 DAG
.getSExtOrTrunc(Store
->getValue(), DL
, MVT::i32
),
1149 Store
->getBasePtr(), MVT::i1
, Store
->getMemOperand());
1154 SDValue
SITargetLowering::LowerTrig(SDValue Op
, SelectionDAG
&DAG
) const {
1155 EVT VT
= Op
.getValueType();
1156 SDValue Arg
= Op
.getOperand(0);
1157 SDValue FractPart
= DAG
.getNode(AMDGPUISD::FRACT
, SDLoc(Op
), VT
,
1158 DAG
.getNode(ISD::FMUL
, SDLoc(Op
), VT
, Arg
,
1159 DAG
.getConstantFP(0.5 / M_PI
, VT
)));
1161 switch (Op
.getOpcode()) {
1163 return DAG
.getNode(AMDGPUISD::COS_HW
, SDLoc(Op
), VT
, FractPart
);
1165 return DAG
.getNode(AMDGPUISD::SIN_HW
, SDLoc(Op
), VT
, FractPart
);
1167 llvm_unreachable("Wrong trig opcode");
1171 //===----------------------------------------------------------------------===//
1172 // Custom DAG optimizations
1173 //===----------------------------------------------------------------------===//
1175 SDValue
SITargetLowering::performUCharToFloatCombine(SDNode
*N
,
1176 DAGCombinerInfo
&DCI
) const {
1177 EVT VT
= N
->getValueType(0);
1178 EVT ScalarVT
= VT
.getScalarType();
1179 if (ScalarVT
!= MVT::f32
)
1182 SelectionDAG
&DAG
= DCI
.DAG
;
1185 SDValue Src
= N
->getOperand(0);
1186 EVT SrcVT
= Src
.getValueType();
1188 // TODO: We could try to match extracting the higher bytes, which would be
1189 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
1190 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
1191 // about in practice.
1192 if (DCI
.isAfterLegalizeVectorOps() && SrcVT
== MVT::i32
) {
1193 if (DAG
.MaskedValueIsZero(Src
, APInt::getHighBitsSet(32, 24))) {
1194 SDValue Cvt
= DAG
.getNode(AMDGPUISD::CVT_F32_UBYTE0
, DL
, VT
, Src
);
1195 DCI
.AddToWorklist(Cvt
.getNode());
1200 // We are primarily trying to catch operations on illegal vector types
1201 // before they are expanded.
1202 // For scalars, we can use the more flexible method of checking masked bits
1203 // after legalization.
1204 if (!DCI
.isBeforeLegalize() ||
1205 !SrcVT
.isVector() ||
1206 SrcVT
.getVectorElementType() != MVT::i8
) {
1210 assert(DCI
.isBeforeLegalize() && "Unexpected legal type");
1212 // Weird sized vectors are a pain to handle, but we know 3 is really the same
1214 unsigned NElts
= SrcVT
.getVectorNumElements();
1215 if (!SrcVT
.isSimple() && NElts
!= 3)
1218 // Handle v4i8 -> v4f32 extload. Replace the v4i8 with a legal i32 load to
1219 // prevent a mess from expanding to v4i32 and repacking.
1220 if (ISD::isNormalLoad(Src
.getNode()) && Src
.hasOneUse()) {
1221 EVT LoadVT
= getEquivalentMemType(*DAG
.getContext(), SrcVT
);
1222 EVT RegVT
= getEquivalentLoadRegType(*DAG
.getContext(), SrcVT
);
1223 EVT FloatVT
= EVT::getVectorVT(*DAG
.getContext(), MVT::f32
, NElts
);
1224 LoadSDNode
*Load
= cast
<LoadSDNode
>(Src
);
1226 unsigned AS
= Load
->getAddressSpace();
1227 unsigned Align
= Load
->getAlignment();
1228 Type
*Ty
= LoadVT
.getTypeForEVT(*DAG
.getContext());
1229 unsigned ABIAlignment
= getDataLayout()->getABITypeAlignment(Ty
);
1231 // Don't try to replace the load if we have to expand it due to alignment
1232 // problems. Otherwise we will end up scalarizing the load, and trying to
1233 // repack into the vector for no real reason.
1234 if (Align
< ABIAlignment
&&
1235 !allowsMisalignedMemoryAccesses(LoadVT
, AS
, Align
, nullptr)) {
1239 SDValue NewLoad
= DAG
.getExtLoad(ISD::ZEXTLOAD
, DL
, RegVT
,
1243 Load
->getMemOperand());
1245 // Make sure successors of the original load stay after it by updating
1246 // them to use the new Chain.
1247 DAG
.ReplaceAllUsesOfValueWith(SDValue(Load
, 1), NewLoad
.getValue(1));
1249 SmallVector
<SDValue
, 4> Elts
;
1250 if (RegVT
.isVector())
1251 DAG
.ExtractVectorElements(NewLoad
, Elts
);
1253 Elts
.push_back(NewLoad
);
1255 SmallVector
<SDValue
, 4> Ops
;
1257 unsigned EltIdx
= 0;
1258 for (SDValue Elt
: Elts
) {
1259 unsigned ComponentsInElt
= std::min(4u, NElts
- 4 * EltIdx
);
1260 for (unsigned I
= 0; I
< ComponentsInElt
; ++I
) {
1261 unsigned Opc
= AMDGPUISD::CVT_F32_UBYTE0
+ I
;
1262 SDValue Cvt
= DAG
.getNode(Opc
, DL
, MVT::f32
, Elt
);
1263 DCI
.AddToWorklist(Cvt
.getNode());
1270 assert(Ops
.size() == NElts
);
1272 return DAG
.getNode(ISD::BUILD_VECTOR
, DL
, FloatVT
, Ops
);
1278 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
1280 // This is a variant of
1281 // (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
1283 // The normal DAG combiner will do this, but only if the add has one use since
1284 // that would increase the number of instructions.
1286 // This prevents us from seeing a constant offset that can be folded into a
1287 // memory instruction's addressing mode. If we know the resulting add offset of
1288 // a pointer can be folded into an addressing offset, we can replace the pointer
1289 // operand with the add of new constant offset. This eliminates one of the uses,
1290 // and may allow the remaining use to also be simplified.
1292 SDValue
SITargetLowering::performSHLPtrCombine(SDNode
*N
,
1294 DAGCombinerInfo
&DCI
) const {
1295 SDValue N0
= N
->getOperand(0);
1296 SDValue N1
= N
->getOperand(1);
1298 if (N0
.getOpcode() != ISD::ADD
)
1301 const ConstantSDNode
*CN1
= dyn_cast
<ConstantSDNode
>(N1
);
1305 const ConstantSDNode
*CAdd
= dyn_cast
<ConstantSDNode
>(N0
.getOperand(1));
1309 const SIInstrInfo
*TII
= static_cast<const SIInstrInfo
*>(
1310 getTargetMachine().getSubtargetImpl()->getInstrInfo());
1312 // If the resulting offset is too large, we can't fold it into the addressing
1314 APInt Offset
= CAdd
->getAPIntValue() << CN1
->getAPIntValue();
1315 if (!TII
->canFoldOffset(Offset
.getZExtValue(), AddrSpace
))
1318 SelectionDAG
&DAG
= DCI
.DAG
;
1320 EVT VT
= N
->getValueType(0);
1322 SDValue ShlX
= DAG
.getNode(ISD::SHL
, SL
, VT
, N0
.getOperand(0), N1
);
1323 SDValue COffset
= DAG
.getConstant(Offset
, MVT::i32
);
1325 return DAG
.getNode(ISD::ADD
, SL
, VT
, ShlX
, COffset
);
1328 SDValue
SITargetLowering::performAndCombine(SDNode
*N
,
1329 DAGCombinerInfo
&DCI
) const {
1330 if (DCI
.isBeforeLegalize())
1333 SelectionDAG
&DAG
= DCI
.DAG
;
1335 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
1336 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
1337 SDValue LHS
= N
->getOperand(0);
1338 SDValue RHS
= N
->getOperand(1);
1340 if (LHS
.getOpcode() == ISD::SETCC
&&
1341 RHS
.getOpcode() == ISD::SETCC
) {
1342 ISD::CondCode LCC
= cast
<CondCodeSDNode
>(LHS
.getOperand(2))->get();
1343 ISD::CondCode RCC
= cast
<CondCodeSDNode
>(RHS
.getOperand(2))->get();
1345 SDValue X
= LHS
.getOperand(0);
1346 SDValue Y
= RHS
.getOperand(0);
1347 if (Y
.getOpcode() != ISD::FABS
|| Y
.getOperand(0) != X
)
1350 if (LCC
== ISD::SETO
) {
1351 if (X
!= LHS
.getOperand(1))
1354 if (RCC
== ISD::SETUNE
) {
1355 const ConstantFPSDNode
*C1
= dyn_cast
<ConstantFPSDNode
>(RHS
.getOperand(1));
1356 if (!C1
|| !C1
->isInfinity() || C1
->isNegative())
1359 const uint32_t Mask
= SIInstrFlags::N_NORMAL
|
1360 SIInstrFlags::N_SUBNORMAL
|
1361 SIInstrFlags::N_ZERO
|
1362 SIInstrFlags::P_ZERO
|
1363 SIInstrFlags::P_SUBNORMAL
|
1364 SIInstrFlags::P_NORMAL
;
1366 static_assert(((~(SIInstrFlags::S_NAN
|
1367 SIInstrFlags::Q_NAN
|
1368 SIInstrFlags::N_INFINITY
|
1369 SIInstrFlags::P_INFINITY
)) & 0x3ff) == Mask
,
1372 return DAG
.getNode(AMDGPUISD::FP_CLASS
, SDLoc(N
), MVT::i1
,
1373 X
, DAG
.getConstant(Mask
, MVT::i32
));
1381 SDValue
SITargetLowering::performOrCombine(SDNode
*N
,
1382 DAGCombinerInfo
&DCI
) const {
1383 SelectionDAG
&DAG
= DCI
.DAG
;
1384 SDValue LHS
= N
->getOperand(0);
1385 SDValue RHS
= N
->getOperand(1);
1387 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
1388 if (LHS
.getOpcode() == AMDGPUISD::FP_CLASS
&&
1389 RHS
.getOpcode() == AMDGPUISD::FP_CLASS
) {
1390 SDValue Src
= LHS
.getOperand(0);
1391 if (Src
!= RHS
.getOperand(0))
1394 const ConstantSDNode
*CLHS
= dyn_cast
<ConstantSDNode
>(LHS
.getOperand(1));
1395 const ConstantSDNode
*CRHS
= dyn_cast
<ConstantSDNode
>(RHS
.getOperand(1));
1399 // Only 10 bits are used.
1400 static const uint32_t MaxMask
= 0x3ff;
1402 uint32_t NewMask
= (CLHS
->getZExtValue() | CRHS
->getZExtValue()) & MaxMask
;
1403 return DAG
.getNode(AMDGPUISD::FP_CLASS
, SDLoc(N
), MVT::i1
,
1404 Src
, DAG
.getConstant(NewMask
, MVT::i32
));
1410 SDValue
SITargetLowering::performClassCombine(SDNode
*N
,
1411 DAGCombinerInfo
&DCI
) const {
1412 SelectionDAG
&DAG
= DCI
.DAG
;
1413 SDValue Mask
= N
->getOperand(1);
1415 // fp_class x, 0 -> false
1416 if (const ConstantSDNode
*CMask
= dyn_cast
<ConstantSDNode
>(Mask
)) {
1417 if (CMask
->isNullValue())
1418 return DAG
.getConstant(0, MVT::i1
);
1424 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc
) {
1427 return AMDGPUISD::FMAX3
;
1428 case AMDGPUISD::SMAX
:
1429 return AMDGPUISD::SMAX3
;
1430 case AMDGPUISD::UMAX
:
1431 return AMDGPUISD::UMAX3
;
1433 return AMDGPUISD::FMIN3
;
1434 case AMDGPUISD::SMIN
:
1435 return AMDGPUISD::SMIN3
;
1436 case AMDGPUISD::UMIN
:
1437 return AMDGPUISD::UMIN3
;
1439 llvm_unreachable("Not a min/max opcode");
1443 SDValue
SITargetLowering::performMin3Max3Combine(SDNode
*N
,
1444 DAGCombinerInfo
&DCI
) const {
1445 SelectionDAG
&DAG
= DCI
.DAG
;
1447 unsigned Opc
= N
->getOpcode();
1448 SDValue Op0
= N
->getOperand(0);
1449 SDValue Op1
= N
->getOperand(1);
1451 // Only do this if the inner op has one use since this will just increases
1452 // register pressure for no benefit.
1454 // max(max(a, b), c)
1455 if (Op0
.getOpcode() == Opc
&& Op0
.hasOneUse()) {
1457 return DAG
.getNode(minMaxOpcToMin3Max3Opc(Opc
),
1465 // max(a, max(b, c))
1466 if (Op1
.getOpcode() == Opc
&& Op1
.hasOneUse()) {
1468 return DAG
.getNode(minMaxOpcToMin3Max3Opc(Opc
),
1479 SDValue
SITargetLowering::performSetCCCombine(SDNode
*N
,
1480 DAGCombinerInfo
&DCI
) const {
1481 SelectionDAG
&DAG
= DCI
.DAG
;
1484 SDValue LHS
= N
->getOperand(0);
1485 SDValue RHS
= N
->getOperand(1);
1486 EVT VT
= LHS
.getValueType();
1488 if (VT
!= MVT::f32
&& VT
!= MVT::f64
)
1491 // Match isinf pattern
1492 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
1493 ISD::CondCode CC
= cast
<CondCodeSDNode
>(N
->getOperand(2))->get();
1494 if (CC
== ISD::SETOEQ
&& LHS
.getOpcode() == ISD::FABS
) {
1495 const ConstantFPSDNode
*CRHS
= dyn_cast
<ConstantFPSDNode
>(RHS
);
1499 const APFloat
&APF
= CRHS
->getValueAPF();
1500 if (APF
.isInfinity() && !APF
.isNegative()) {
1501 unsigned Mask
= SIInstrFlags::P_INFINITY
| SIInstrFlags::N_INFINITY
;
1502 return DAG
.getNode(AMDGPUISD::FP_CLASS
, SL
, MVT::i1
,
1503 LHS
.getOperand(0), DAG
.getConstant(Mask
, MVT::i32
));
1510 SDValue
SITargetLowering::PerformDAGCombine(SDNode
*N
,
1511 DAGCombinerInfo
&DCI
) const {
1512 SelectionDAG
&DAG
= DCI
.DAG
;
1515 switch (N
->getOpcode()) {
1517 return AMDGPUTargetLowering::PerformDAGCombine(N
, DCI
);
1519 return performSetCCCombine(N
, DCI
);
1520 case ISD::FMAXNUM
: // TODO: What about fmax_legacy?
1522 case AMDGPUISD::SMAX
:
1523 case AMDGPUISD::SMIN
:
1524 case AMDGPUISD::UMAX
:
1525 case AMDGPUISD::UMIN
: {
1526 if (DCI
.getDAGCombineLevel() >= AfterLegalizeDAG
&&
1527 getTargetMachine().getOptLevel() > CodeGenOpt::None
)
1528 return performMin3Max3Combine(N
, DCI
);
1532 case AMDGPUISD::CVT_F32_UBYTE0
:
1533 case AMDGPUISD::CVT_F32_UBYTE1
:
1534 case AMDGPUISD::CVT_F32_UBYTE2
:
1535 case AMDGPUISD::CVT_F32_UBYTE3
: {
1536 unsigned Offset
= N
->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0
;
1538 SDValue Src
= N
->getOperand(0);
1539 APInt Demanded
= APInt::getBitsSet(32, 8 * Offset
, 8 * Offset
+ 8);
1541 APInt KnownZero
, KnownOne
;
1542 TargetLowering::TargetLoweringOpt
TLO(DAG
, !DCI
.isBeforeLegalize(),
1543 !DCI
.isBeforeLegalizeOps());
1544 const TargetLowering
&TLI
= DAG
.getTargetLoweringInfo();
1545 if (TLO
.ShrinkDemandedConstant(Src
, Demanded
) ||
1546 TLI
.SimplifyDemandedBits(Src
, Demanded
, KnownZero
, KnownOne
, TLO
)) {
1547 DCI
.CommitTargetLoweringOpt(TLO
);
1553 case ISD::UINT_TO_FP
: {
1554 return performUCharToFloatCombine(N
, DCI
);
1557 if (DCI
.getDAGCombineLevel() < AfterLegalizeDAG
)
1560 EVT VT
= N
->getValueType(0);
1564 SDValue LHS
= N
->getOperand(0);
1565 SDValue RHS
= N
->getOperand(1);
1567 // These should really be instruction patterns, but writing patterns with
1568 // source modiifiers is a pain.
1570 // fadd (fadd (a, a), b) -> mad 2.0, a, b
1571 if (LHS
.getOpcode() == ISD::FADD
) {
1572 SDValue A
= LHS
.getOperand(0);
1573 if (A
== LHS
.getOperand(1)) {
1574 const SDValue Two
= DAG
.getConstantFP(2.0, MVT::f32
);
1575 return DAG
.getNode(AMDGPUISD::MAD
, DL
, VT
, Two
, A
, RHS
);
1579 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
1580 if (RHS
.getOpcode() == ISD::FADD
) {
1581 SDValue A
= RHS
.getOperand(0);
1582 if (A
== RHS
.getOperand(1)) {
1583 const SDValue Two
= DAG
.getConstantFP(2.0, MVT::f32
);
1584 return DAG
.getNode(AMDGPUISD::MAD
, DL
, VT
, Two
, A
, LHS
);
1591 if (DCI
.getDAGCombineLevel() < AfterLegalizeDAG
)
1594 EVT VT
= N
->getValueType(0);
1596 // Try to get the fneg to fold into the source modifier. This undoes generic
1597 // DAG combines and folds them into the mad.
1598 if (VT
== MVT::f32
) {
1599 SDValue LHS
= N
->getOperand(0);
1600 SDValue RHS
= N
->getOperand(1);
1602 if (LHS
.getOpcode() == ISD::FMUL
) {
1603 // (fsub (fmul a, b), c) -> mad a, b, (fneg c)
1605 SDValue A
= LHS
.getOperand(0);
1606 SDValue B
= LHS
.getOperand(1);
1607 SDValue C
= DAG
.getNode(ISD::FNEG
, DL
, VT
, RHS
);
1609 return DAG
.getNode(AMDGPUISD::MAD
, DL
, VT
, A
, B
, C
);
1612 if (RHS
.getOpcode() == ISD::FMUL
) {
1613 // (fsub c, (fmul a, b)) -> mad (fneg a), b, c
1615 SDValue A
= DAG
.getNode(ISD::FNEG
, DL
, VT
, RHS
.getOperand(0));
1616 SDValue B
= RHS
.getOperand(1);
1619 return DAG
.getNode(AMDGPUISD::MAD
, DL
, VT
, A
, B
, C
);
1622 if (LHS
.getOpcode() == ISD::FADD
) {
1623 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
1625 SDValue A
= LHS
.getOperand(0);
1626 if (A
== LHS
.getOperand(1)) {
1627 const SDValue Two
= DAG
.getConstantFP(2.0, MVT::f32
);
1628 SDValue NegRHS
= DAG
.getNode(ISD::FNEG
, DL
, VT
, RHS
);
1630 return DAG
.getNode(AMDGPUISD::MAD
, DL
, VT
, Two
, A
, NegRHS
);
1634 if (RHS
.getOpcode() == ISD::FADD
) {
1635 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
1637 SDValue A
= RHS
.getOperand(0);
1638 if (A
== RHS
.getOperand(1)) {
1639 const SDValue NegTwo
= DAG
.getConstantFP(-2.0, MVT::f32
);
1640 return DAG
.getNode(AMDGPUISD::MAD
, DL
, VT
, NegTwo
, A
, LHS
);
1650 case ISD::ATOMIC_LOAD
:
1651 case ISD::ATOMIC_STORE
:
1652 case ISD::ATOMIC_CMP_SWAP
:
1653 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS
:
1654 case ISD::ATOMIC_SWAP
:
1655 case ISD::ATOMIC_LOAD_ADD
:
1656 case ISD::ATOMIC_LOAD_SUB
:
1657 case ISD::ATOMIC_LOAD_AND
:
1658 case ISD::ATOMIC_LOAD_OR
:
1659 case ISD::ATOMIC_LOAD_XOR
:
1660 case ISD::ATOMIC_LOAD_NAND
:
1661 case ISD::ATOMIC_LOAD_MIN
:
1662 case ISD::ATOMIC_LOAD_MAX
:
1663 case ISD::ATOMIC_LOAD_UMIN
:
1664 case ISD::ATOMIC_LOAD_UMAX
: { // TODO: Target mem intrinsics.
1665 if (DCI
.isBeforeLegalize())
1668 MemSDNode
*MemNode
= cast
<MemSDNode
>(N
);
1669 SDValue Ptr
= MemNode
->getBasePtr();
1671 // TODO: We could also do this for multiplies.
1672 unsigned AS
= MemNode
->getAddressSpace();
1673 if (Ptr
.getOpcode() == ISD::SHL
&& AS
!= AMDGPUAS::PRIVATE_ADDRESS
) {
1674 SDValue NewPtr
= performSHLPtrCombine(Ptr
.getNode(), AS
, DCI
);
1676 SmallVector
<SDValue
, 8> NewOps
;
1677 for (unsigned I
= 0, E
= MemNode
->getNumOperands(); I
!= E
; ++I
)
1678 NewOps
.push_back(MemNode
->getOperand(I
));
1680 NewOps
[N
->getOpcode() == ISD::STORE
? 2 : 1] = NewPtr
;
1681 return SDValue(DAG
.UpdateNodeOperands(MemNode
, NewOps
), 0);
1687 return performAndCombine(N
, DCI
);
1689 return performOrCombine(N
, DCI
);
1690 case AMDGPUISD::FP_CLASS
:
1691 return performClassCombine(N
, DCI
);
1693 return AMDGPUTargetLowering::PerformDAGCombine(N
, DCI
);
1696 /// \brief Test if RegClass is one of the VSrc classes
1697 static bool isVSrc(unsigned RegClass
) {
1699 default: return false;
1700 case AMDGPU::VS_32RegClassID
:
1701 case AMDGPU::VS_64RegClassID
:
1706 /// \brief Analyze the possible immediate value Op
1708 /// Returns -1 if it isn't an immediate, 0 if it's and inline immediate
1709 /// and the immediate value if it's a literal immediate
1710 int32_t SITargetLowering::analyzeImmediate(const SDNode
*N
) const {
1712 const SIInstrInfo
*TII
= static_cast<const SIInstrInfo
*>(
1713 getTargetMachine().getSubtargetImpl()->getInstrInfo());
1715 if (const ConstantSDNode
*Node
= dyn_cast
<ConstantSDNode
>(N
)) {
1716 if (Node
->getZExtValue() >> 32)
1719 if (TII
->isInlineConstant(Node
->getAPIntValue()))
1722 return Node
->getZExtValue();
1725 if (const ConstantFPSDNode
*Node
= dyn_cast
<ConstantFPSDNode
>(N
)) {
1726 if (TII
->isInlineConstant(Node
->getValueAPF().bitcastToAPInt()))
1729 if (Node
->getValueType(0) == MVT::f32
)
1730 return FloatToBits(Node
->getValueAPF().convertToFloat());
1738 const TargetRegisterClass
*SITargetLowering::getRegClassForNode(
1739 SelectionDAG
&DAG
, const SDValue
&Op
) const {
1740 const SIInstrInfo
*TII
= static_cast<const SIInstrInfo
*>(
1741 getTargetMachine().getSubtargetImpl()->getInstrInfo());
1742 const SIRegisterInfo
&TRI
= TII
->getRegisterInfo();
1744 if (!Op
->isMachineOpcode()) {
1745 switch(Op
->getOpcode()) {
1746 case ISD::CopyFromReg
: {
1747 MachineRegisterInfo
&MRI
= DAG
.getMachineFunction().getRegInfo();
1748 unsigned Reg
= cast
<RegisterSDNode
>(Op
->getOperand(1))->getReg();
1749 if (TargetRegisterInfo::isVirtualRegister(Reg
)) {
1750 return MRI
.getRegClass(Reg
);
1752 return TRI
.getPhysRegClass(Reg
);
1754 default: return nullptr;
1757 const MCInstrDesc
&Desc
= TII
->get(Op
->getMachineOpcode());
1758 int OpClassID
= Desc
.OpInfo
[Op
.getResNo()].RegClass
;
1759 if (OpClassID
!= -1) {
1760 return TRI
.getRegClass(OpClassID
);
1762 switch(Op
.getMachineOpcode()) {
1763 case AMDGPU::COPY_TO_REGCLASS
:
1764 // Operand 1 is the register class id for COPY_TO_REGCLASS instructions.
1765 OpClassID
= cast
<ConstantSDNode
>(Op
->getOperand(1))->getZExtValue();
1767 // If the COPY_TO_REGCLASS instruction is copying to a VSrc register
1768 // class, then the register class for the value could be either a
1769 // VReg or and SReg. In order to get a more accurate
1770 if (isVSrc(OpClassID
))
1771 return getRegClassForNode(DAG
, Op
.getOperand(0));
1773 return TRI
.getRegClass(OpClassID
);
1774 case AMDGPU::EXTRACT_SUBREG
: {
1775 int SubIdx
= cast
<ConstantSDNode
>(Op
.getOperand(1))->getZExtValue();
1776 const TargetRegisterClass
*SuperClass
=
1777 getRegClassForNode(DAG
, Op
.getOperand(0));
1778 return TRI
.getSubClassWithSubReg(SuperClass
, SubIdx
);
1780 case AMDGPU::REG_SEQUENCE
:
1781 // Operand 0 is the register class id for REG_SEQUENCE instructions.
1782 return TRI
.getRegClass(
1783 cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue());
1785 return getRegClassFor(Op
.getSimpleValueType());
/// \brief Does "Op" fit into register class "RegClass" ?
bool SITargetLowering::fitsRegClass(SelectionDAG &DAG, const SDValue &Op,
                                    unsigned RegClass) const {
  const TargetRegisterInfo *TRI =
      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
  const TargetRegisterClass *RC = getRegClassForNode(DAG, Op);
  if (!RC) {
    return false;
  }
  return TRI->getRegClass(RegClass)->hasSubClassEq(RC);
}
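
// hasSubClassEq makes this an "is-a" test: a value already known to live in
// SGPR_64 fits SReg_64 because SGPR_64 is a subclass of SReg_64, while a
// VReg_64 value does not.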
/// \brief Helper function for adjustWritemask
static unsigned SubIdx2Lane(unsigned Idx) {
  switch (Idx) {
  default: return 0;
  case AMDGPU::sub0: return 0;
  case AMDGPU::sub1: return 1;
  case AMDGPU::sub2: return 2;
  case AMDGPU::sub3: return 3;
  }
}
/// \brief Adjust the writemask of MIMG instructions
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
                                       SelectionDAG &DAG) const {
  SDNode *Users[4] = { };
  unsigned Lane = 0;
  unsigned OldDmask = Node->getConstantOperandVal(0);
  unsigned NewDmask = 0;

  // Try to figure out the used register components
  for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
       I != E; ++I) {

    // Abort if we can't understand the usage
    if (!I->isMachineOpcode() ||
        I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
      return;

    // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
    // Note that subregs are packed, i.e. Lane==0 is the first bit set
    // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
    // set, etc.
    Lane = SubIdx2Lane(I->getConstantOperandVal(1));

    // Set which texture component corresponds to the lane.
    unsigned Comp;
    for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
      assert(Dmask);
      Comp = countTrailingZeros(Dmask);
      Dmask &= ~(1 << Comp);
    }

    // Abort if we have more than one user per component
    if (Users[Lane])
      return;

    Users[Lane] = *I;
    NewDmask |= 1 << Comp;
  }

  // Abort if there's no change
  if (NewDmask == OldDmask)
    return;

  // Adjust the writemask in the node
  std::vector<SDValue> Ops;
  Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
  for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
    Ops.push_back(Node->getOperand(i));
  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);

  // If we only got one lane, replace it with a copy
  // (if NewDmask has only one bit set...)
  if (NewDmask && (NewDmask & (NewDmask - 1)) == 0) {
    SDValue RC = DAG.getTargetConstant(AMDGPU::VGPR_32RegClassID, MVT::i32);
    SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                      SDLoc(), Users[Lane]->getValueType(0),
                                      SDValue(Node, 0), RC);
    DAG.ReplaceAllUsesWith(Users[Lane], Copy);
    return;
  }

  // Update the users of the node with the new indices
  for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
    SDNode *User = Users[i];
    if (!User)
      continue;

    SDValue Op = DAG.getTargetConstant(Idx, MVT::i32);
    DAG.UpdateNodeOperands(User, User->getOperand(0), Op);

    switch (Idx) {
    default: break;
    case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
    case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
    case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
    }
  }
}
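
// Worked example (illustrative): with OldDmask = 0b1101 the instruction
// writes components X, Z and W into a packed three-register result. If the
// only EXTRACT_SUBREG user reads sub1, Lane is 1, the second set bit of
// OldDmask gives Comp = 2, and NewDmask becomes 0b0100. NewDmask has a
// single bit set, so the whole result is forwarded through a single
// COPY_TO_REGCLASS instead of re-indexing the users.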
/// \brief Legalize target independent instructions (e.g. INSERT_SUBREG)
/// with frame index operands.
/// LLVM assumes that inputs to these instructions are registers.
void SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
                                                     SelectionDAG &DAG) const {

  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
    if (!isa<FrameIndexSDNode>(Node->getOperand(i))) {
      Ops.push_back(Node->getOperand(i));
      continue;
    }

    SDLoc DL(Node);
    Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
                                       Node->getOperand(i).getValueType(),
                                       Node->getOperand(i)), 0));
  }

  DAG.UpdateNodeOperands(Node, Ops);
}
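
// For example, a REG_SEQUENCE whose operand is a raw FrameIndex is rewritten
// so the frame index is first materialized into a register by S_MOV_B32,
// satisfying the register-operand assumption noted above.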
/// \brief Fold the instructions after selecting them.
SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
                                          SelectionDAG &DAG) const {
  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
      getTargetMachine().getSubtargetImpl()->getInstrInfo());
  Node = AdjustRegClass(Node, DAG);

  if (TII->isMIMG(Node->getMachineOpcode()))
    adjustWritemask(Node, DAG);

  if (Node->getMachineOpcode() == AMDGPU::INSERT_SUBREG ||
      Node->getMachineOpcode() == AMDGPU::REG_SEQUENCE) {
    legalizeTargetIndependentNode(Node, DAG);
    return Node;
  }

  return Node;
}
/// \brief Assign the register class depending on the number of
/// bits set in the writemask
void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr *MI,
                                                     SDNode *Node) const {
  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
      getTargetMachine().getSubtargetImpl()->getInstrInfo());

  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
  TII->legalizeOperands(MI);

  if (TII->isMIMG(MI->getOpcode())) {
    unsigned VReg = MI->getOperand(0).getReg();
    unsigned Writemask = MI->getOperand(1).getImm();
    unsigned BitsSet = 0;
    for (unsigned i = 0; i < 4; ++i)
      BitsSet += Writemask & (1 << i) ? 1 : 0;

    const TargetRegisterClass *RC;
    switch (BitsSet) {
    default: return;
    case 1: RC = &AMDGPU::VGPR_32RegClass; break;
    case 2: RC = &AMDGPU::VReg_64RegClass; break;
    case 3: RC = &AMDGPU::VReg_96RegClass; break;
    }

    unsigned NewOpcode = TII->getMaskedMIMGOp(MI->getOpcode(), BitsSet);
    MI->setDesc(TII->get(NewOpcode));
    MRI.setRegClass(VReg, RC);
    return;
  }

  // Replace unused atomics with the no return version.
  int NoRetAtomicOp = AMDGPU::getAtomicNoRetOp(MI->getOpcode());
  if (NoRetAtomicOp != -1) {
    if (!Node->hasAnyUseOfValue(0)) {
      MI->setDesc(TII->get(NoRetAtomicOp));
      MI->RemoveOperand(0);
    }

    return;
  }
}
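
// Worked example (illustrative): an image sample with writemask 0b0101
// produces two components, so BitsSet == 2, the destination is re-classed
// to VReg_64 and getMaskedMIMGOp selects the two-component variant of the
// opcode. A full 0b1111 writemask hits the default case and leaves the
// four-register result untouched.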
static SDValue buildSMovImm32(SelectionDAG &DAG, SDLoc DL, uint64_t Val) {
  SDValue K = DAG.getTargetConstant(Val, MVT::i32);
  return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
}
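
// buildSMovImm32 is the building block for the resource descriptors below:
// every compile-time-constant 32-bit word of a descriptor is materialized
// into an SGPR with a single S_MOV_B32.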
MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
                                                SDLoc DL,
                                                SDValue Ptr) const {
  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
      getTargetMachine().getSubtargetImpl()->getInstrInfo());
#if 1
  // XXX - Workaround for moveToVALU not handling different register class
  // inserts for REG_SEQUENCE.

  // Build the half of the subregister with the constants.
  const SDValue Ops0[] = {
    DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub1, MVT::i32)
  };

  SDValue SubRegHi = SDValue(DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL,
                                                MVT::v2i32, Ops0), 0);

  // Combine the constants and the pointer.
  const SDValue Ops1[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
    SubRegHi,
    DAG.getTargetConstant(AMDGPU::sub2_sub3, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
#else
  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
    Ptr,
    DAG.getTargetConstant(AMDGPU::sub0_sub1, MVT::i32),
    buildSMovImm32(DAG, DL, 0),
    DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
    buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
    DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
#endif
}
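
// The resulting v4i32 is a buffer resource descriptor with sub0_sub1
// holding the 64-bit base pointer and sub2_sub3 holding the constant upper
// words (a zero dword 2 and the default data format in dword 3), which is
// the shape the ADDR64 buffer loads created in AdjustRegClass below expect.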
/// \brief Return a resource descriptor with the 'Add TID' bit enabled
///        The TID (Thread ID) is multiplied by the stride value (bits [61:48]
///        of the resource descriptor) to create an offset, which is added to
///        the resource pointer.
MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG,
                                           SDLoc DL,
                                           SDValue Ptr,
                                           uint32_t RsrcDword1,
                                           uint64_t RsrcDword2And3) const {
  SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
  SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
  if (RsrcDword1) {
    PtrHi = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
                                    DAG.getConstant(RsrcDword1, MVT::i32)), 0);
  }

  SDValue DataLo = buildSMovImm32(DAG, DL,
                                  RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
  SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);

  const SDValue Ops[] = {
    DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, MVT::i32),
    PtrLo,
    DAG.getTargetConstant(AMDGPU::sub0, MVT::i32),
    PtrHi,
    DAG.getTargetConstant(AMDGPU::sub1, MVT::i32),
    DataLo,
    DAG.getTargetConstant(AMDGPU::sub2, MVT::i32),
    DataHi,
    DAG.getTargetConstant(AMDGPU::sub3, MVT::i32)
  };

  return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
}
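
// Layout recap: the descriptor's four dwords end up as { PtrLo,
// PtrHi | RsrcDword1, RsrcDword2And3[31:0], RsrcDword2And3[63:32] }, so a
// caller like buildScratchRSRC below packs its flag bits into the 64-bit
// RsrcDword2And3 argument.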
MachineSDNode *SITargetLowering::buildScratchRSRC(SelectionDAG &DAG,
                                                  SDLoc DL,
                                                  SDValue Ptr) const {
  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
      getTargetMachine().getSubtargetImpl()->getInstrInfo());
  uint64_t Rsrc = TII->getDefaultRsrcDataFormat() | AMDGPU::RSRC_TID_ENABLE |
                  0xffffffff; // Size

  return buildRSRC(DAG, DL, Ptr, 0, Rsrc);
}
MachineSDNode *SITargetLowering::AdjustRegClass(MachineSDNode *N,
                                                SelectionDAG &DAG) const {
  SDLoc DL(N);

  unsigned NewOpcode = N->getMachineOpcode();

  switch (N->getMachineOpcode()) {
  default: return N;
  case AMDGPU::S_LOAD_DWORD_IMM:
    NewOpcode = AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX2_SGPR:
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
    }
    // Fall-through
  case AMDGPU::S_LOAD_DWORDX4_IMM:
  case AMDGPU::S_LOAD_DWORDX4_SGPR: {
    if (NewOpcode == N->getMachineOpcode()) {
      NewOpcode = AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
    }
    if (fitsRegClass(DAG, N->getOperand(0), AMDGPU::SReg_64RegClassID)) {
      return N;
    }
    ConstantSDNode *Offset = cast<ConstantSDNode>(N->getOperand(1));

    const SDValue Zero64 = DAG.getTargetConstant(0, MVT::i64);
    SDValue Ptr(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Zero64), 0);
    MachineSDNode *RSrc = wrapAddr64Rsrc(DAG, DL, Ptr);

    SmallVector<SDValue, 8> Ops;
    Ops.push_back(SDValue(RSrc, 0));
    Ops.push_back(N->getOperand(0));

    // The immediate offset is in dwords on SI and in bytes on VI.
    if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue(), MVT::i32));
    else
      Ops.push_back(DAG.getTargetConstant(Offset->getSExtValue() << 2,
                                          MVT::i32));

    // Copy remaining operands so we keep any chain and glue nodes that follow
    // the normal operands.
    for (unsigned I = 2, E = N->getNumOperands(); I != E; ++I)
      Ops.push_back(N->getOperand(I));

    return DAG.getMachineNode(NewOpcode, DL, N->getVTList(), Ops);
  }
  }
}
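
// Worked example (illustrative): on SI, an S_LOAD_DWORDX4_IMM whose base
// pointer is not provably in an SGPR pair is rewritten to
// BUFFER_LOAD_DWORDX4_ADDR64 using the zero-based rsrc from wrapAddr64Rsrc,
// and an SMRD immediate offset of 4 (dwords) becomes 16, since MUBUF
// offsets are byte-based; on VI the offset is already in bytes and is
// passed through unchanged.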
SDValue SITargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                               const TargetRegisterClass *RC,
                                               unsigned Reg, EVT VT) const {
  SDValue VReg = AMDGPUTargetLowering::CreateLiveInRegister(DAG, RC, Reg, VT);

  return DAG.getCopyFromReg(DAG.getEntryNode(), SDLoc(DAG.getEntryNode()),
                            cast<RegisterSDNode>(VReg)->getReg(), VT);
}