//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This is the parent TargetLowering class for hardware code gen
/// targets.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUISelLowering.h"
#include "AMDGPUFrameLowering.h"
#include "AMDGPUIntrinsicInfo.h"
#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "R600MachineFunctionInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"

using namespace llvm;
37 /// Diagnostic information for unimplemented or unsupported feature reporting.
38 class DiagnosticInfoUnsupported
: public DiagnosticInfo
{
40 const Twine
&Description
;
45 static int getKindID() {
47 KindID
= llvm::getNextAvailablePluginDiagnosticKind();
52 DiagnosticInfoUnsupported(const Function
&Fn
, const Twine
&Desc
,
53 DiagnosticSeverity Severity
= DS_Error
)
54 : DiagnosticInfo(getKindID(), Severity
),
58 const Function
&getFunction() const { return Fn
; }
59 const Twine
&getDescription() const { return Description
; }
61 void print(DiagnosticPrinter
&DP
) const override
{
62 DP
<< "unsupported " << getDescription() << " in " << Fn
.getName();
65 static bool classof(const DiagnosticInfo
*DI
) {
66 return DI
->getKind() == getKindID();
70 int DiagnosticInfoUnsupported::KindID
= 0;
74 static bool allocateStack(unsigned ValNo
, MVT ValVT
, MVT LocVT
,
75 CCValAssign::LocInfo LocInfo
,
76 ISD::ArgFlagsTy ArgFlags
, CCState
&State
) {
77 unsigned Offset
= State
.AllocateStack(ValVT
.getStoreSize(),
78 ArgFlags
.getOrigAlign());
79 State
.addLoc(CCValAssign::getMem(ValNo
, ValVT
, Offset
, LocVT
, LocInfo
));
84 #include "AMDGPUGenCallingConv.inc"
86 // Find a larger type to do a load / store of a vector with.
87 EVT
AMDGPUTargetLowering::getEquivalentMemType(LLVMContext
&Ctx
, EVT VT
) {
88 unsigned StoreSize
= VT
.getStoreSizeInBits();
90 return EVT::getIntegerVT(Ctx
, StoreSize
);
92 assert(StoreSize
% 32 == 0 && "Store size not a multiple of 32");
93 return EVT::getVectorVT(Ctx
, MVT::i32
, StoreSize
/ 32);
96 // Type for a vector that will be loaded to.
97 EVT
AMDGPUTargetLowering::getEquivalentLoadRegType(LLVMContext
&Ctx
, EVT VT
) {
98 unsigned StoreSize
= VT
.getStoreSizeInBits();
100 return EVT::getIntegerVT(Ctx
, 32);
102 return EVT::getVectorVT(Ctx
, MVT::i32
, StoreSize
/ 32);
105 AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine
&TM
) :
108 Subtarget
= &TM
.getSubtarget
<AMDGPUSubtarget
>();
110 setOperationAction(ISD::Constant
, MVT::i32
, Legal
);
111 setOperationAction(ISD::Constant
, MVT::i64
, Legal
);
112 setOperationAction(ISD::ConstantFP
, MVT::f32
, Legal
);
113 setOperationAction(ISD::ConstantFP
, MVT::f64
, Legal
);
115 setOperationAction(ISD::BR_JT
, MVT::Other
, Expand
);
116 setOperationAction(ISD::BRIND
, MVT::Other
, Expand
);
118 // We need to custom lower some of the intrinsics
119 setOperationAction(ISD::INTRINSIC_WO_CHAIN
, MVT::Other
, Custom
);
121 // Library functions. These default to Expand, but we have instructions
123 setOperationAction(ISD::FCEIL
, MVT::f32
, Legal
);
124 setOperationAction(ISD::FEXP2
, MVT::f32
, Legal
);
125 setOperationAction(ISD::FPOW
, MVT::f32
, Legal
);
126 setOperationAction(ISD::FLOG2
, MVT::f32
, Legal
);
127 setOperationAction(ISD::FABS
, MVT::f32
, Legal
);
128 setOperationAction(ISD::FFLOOR
, MVT::f32
, Legal
);
129 setOperationAction(ISD::FRINT
, MVT::f32
, Legal
);
130 setOperationAction(ISD::FROUND
, MVT::f32
, Legal
);
131 setOperationAction(ISD::FTRUNC
, MVT::f32
, Legal
);
133 setOperationAction(ISD::FREM
, MVT::f32
, Custom
);
134 setOperationAction(ISD::FREM
, MVT::f64
, Custom
);
136 // Lower floating point store/load to integer store/load to reduce the number
137 // of patterns in tablegen.
138 setOperationAction(ISD::STORE
, MVT::f32
, Promote
);
139 AddPromotedToType(ISD::STORE
, MVT::f32
, MVT::i32
);
141 setOperationAction(ISD::STORE
, MVT::v2f32
, Promote
);
142 AddPromotedToType(ISD::STORE
, MVT::v2f32
, MVT::v2i32
);
144 setOperationAction(ISD::STORE
, MVT::i64
, Promote
);
145 AddPromotedToType(ISD::STORE
, MVT::i64
, MVT::v2i32
);
147 setOperationAction(ISD::STORE
, MVT::v4f32
, Promote
);
148 AddPromotedToType(ISD::STORE
, MVT::v4f32
, MVT::v4i32
);
150 setOperationAction(ISD::STORE
, MVT::v8f32
, Promote
);
151 AddPromotedToType(ISD::STORE
, MVT::v8f32
, MVT::v8i32
);
153 setOperationAction(ISD::STORE
, MVT::v16f32
, Promote
);
154 AddPromotedToType(ISD::STORE
, MVT::v16f32
, MVT::v16i32
);
156 setOperationAction(ISD::STORE
, MVT::f64
, Promote
);
157 AddPromotedToType(ISD::STORE
, MVT::f64
, MVT::i64
);
159 setOperationAction(ISD::STORE
, MVT::v2f64
, Promote
);
160 AddPromotedToType(ISD::STORE
, MVT::v2f64
, MVT::v2i64
);
162 // Custom lowering of vector stores is required for local address space
164 setOperationAction(ISD::STORE
, MVT::v4i32
, Custom
);
165 // XXX: Native v2i32 local address space stores are possible, but not
166 // currently implemented.
167 setOperationAction(ISD::STORE
, MVT::v2i32
, Custom
);
169 setTruncStoreAction(MVT::v2i32
, MVT::v2i16
, Custom
);
170 setTruncStoreAction(MVT::v2i32
, MVT::v2i8
, Custom
);
171 setTruncStoreAction(MVT::v4i32
, MVT::v4i8
, Custom
);
173 // XXX: This can be change to Custom, once ExpandVectorStores can
174 // handle 64-bit stores.
175 setTruncStoreAction(MVT::v4i32
, MVT::v4i16
, Expand
);
177 setTruncStoreAction(MVT::i64
, MVT::i16
, Expand
);
178 setTruncStoreAction(MVT::i64
, MVT::i8
, Expand
);
179 setTruncStoreAction(MVT::i64
, MVT::i1
, Expand
);
180 setTruncStoreAction(MVT::v2i64
, MVT::v2i1
, Expand
);
181 setTruncStoreAction(MVT::v4i64
, MVT::v4i1
, Expand
);
184 setOperationAction(ISD::LOAD
, MVT::f32
, Promote
);
185 AddPromotedToType(ISD::LOAD
, MVT::f32
, MVT::i32
);
187 setOperationAction(ISD::LOAD
, MVT::v2f32
, Promote
);
188 AddPromotedToType(ISD::LOAD
, MVT::v2f32
, MVT::v2i32
);
190 setOperationAction(ISD::LOAD
, MVT::v4f32
, Promote
);
191 AddPromotedToType(ISD::LOAD
, MVT::v4f32
, MVT::v4i32
);
193 setOperationAction(ISD::LOAD
, MVT::v8f32
, Promote
);
194 AddPromotedToType(ISD::LOAD
, MVT::v8f32
, MVT::v8i32
);
196 setOperationAction(ISD::LOAD
, MVT::v16f32
, Promote
);
197 AddPromotedToType(ISD::LOAD
, MVT::v16f32
, MVT::v16i32
);
199 setOperationAction(ISD::LOAD
, MVT::f64
, Promote
);
200 AddPromotedToType(ISD::LOAD
, MVT::f64
, MVT::i64
);
202 setOperationAction(ISD::LOAD
, MVT::v2f64
, Promote
);
203 AddPromotedToType(ISD::LOAD
, MVT::v2f64
, MVT::v2i64
);
205 setOperationAction(ISD::CONCAT_VECTORS
, MVT::v4i32
, Custom
);
206 setOperationAction(ISD::CONCAT_VECTORS
, MVT::v4f32
, Custom
);
207 setOperationAction(ISD::CONCAT_VECTORS
, MVT::v8i32
, Custom
);
208 setOperationAction(ISD::CONCAT_VECTORS
, MVT::v8f32
, Custom
);
209 setOperationAction(ISD::EXTRACT_SUBVECTOR
, MVT::v2f32
, Custom
);
210 setOperationAction(ISD::EXTRACT_SUBVECTOR
, MVT::v2i32
, Custom
);
211 setOperationAction(ISD::EXTRACT_SUBVECTOR
, MVT::v4f32
, Custom
);
212 setOperationAction(ISD::EXTRACT_SUBVECTOR
, MVT::v4i32
, Custom
);
213 setOperationAction(ISD::EXTRACT_SUBVECTOR
, MVT::v8f32
, Custom
);
214 setOperationAction(ISD::EXTRACT_SUBVECTOR
, MVT::v8i32
, Custom
);
216 // There are no 64-bit extloads. These should be done as a 32-bit extload and
217 // an extension to 64-bit.
218 for (MVT VT
: MVT::integer_valuetypes()) {
219 setLoadExtAction(ISD::EXTLOAD
, MVT::i64
, VT
, Expand
);
220 setLoadExtAction(ISD::SEXTLOAD
, MVT::i64
, VT
, Expand
);
221 setLoadExtAction(ISD::ZEXTLOAD
, MVT::i64
, VT
, Expand
);
224 for (MVT VT
: MVT::integer_vector_valuetypes()) {
225 setLoadExtAction(ISD::EXTLOAD
, VT
, MVT::v2i8
, Expand
);
226 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::v2i8
, Expand
);
227 setLoadExtAction(ISD::ZEXTLOAD
, VT
, MVT::v2i8
, Expand
);
228 setLoadExtAction(ISD::EXTLOAD
, VT
, MVT::v4i8
, Expand
);
229 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::v4i8
, Expand
);
230 setLoadExtAction(ISD::ZEXTLOAD
, VT
, MVT::v4i8
, Expand
);
231 setLoadExtAction(ISD::EXTLOAD
, VT
, MVT::v2i16
, Expand
);
232 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::v2i16
, Expand
);
233 setLoadExtAction(ISD::ZEXTLOAD
, VT
, MVT::v2i16
, Expand
);
234 setLoadExtAction(ISD::EXTLOAD
, VT
, MVT::v4i16
, Expand
);
235 setLoadExtAction(ISD::SEXTLOAD
, VT
, MVT::v4i16
, Expand
);
236 setLoadExtAction(ISD::ZEXTLOAD
, VT
, MVT::v4i16
, Expand
);
239 setOperationAction(ISD::BR_CC
, MVT::i1
, Expand
);
241 if (Subtarget
->getGeneration() < AMDGPUSubtarget::SEA_ISLANDS
) {
242 setOperationAction(ISD::FCEIL
, MVT::f64
, Custom
);
243 setOperationAction(ISD::FTRUNC
, MVT::f64
, Custom
);
244 setOperationAction(ISD::FRINT
, MVT::f64
, Custom
);
245 setOperationAction(ISD::FFLOOR
, MVT::f64
, Custom
);
248 if (!Subtarget
->hasBFI()) {
249 // fcopysign can be done in a single instruction with BFI.
250 setOperationAction(ISD::FCOPYSIGN
, MVT::f32
, Expand
);
251 setOperationAction(ISD::FCOPYSIGN
, MVT::f64
, Expand
);
254 setOperationAction(ISD::FP16_TO_FP
, MVT::f64
, Expand
);
256 setLoadExtAction(ISD::EXTLOAD
, MVT::f32
, MVT::f16
, Expand
);
257 setLoadExtAction(ISD::EXTLOAD
, MVT::f64
, MVT::f16
, Expand
);
258 setTruncStoreAction(MVT::f32
, MVT::f16
, Expand
);
259 setTruncStoreAction(MVT::f64
, MVT::f16
, Expand
);
261 const MVT ScalarIntVTs
[] = { MVT::i32
, MVT::i64
};
262 for (MVT VT
: ScalarIntVTs
) {
263 setOperationAction(ISD::SREM
, VT
, Expand
);
264 setOperationAction(ISD::SDIV
, VT
, Expand
);
266 // GPU does not have divrem function for signed or unsigned.
267 setOperationAction(ISD::SDIVREM
, VT
, Custom
);
268 setOperationAction(ISD::UDIVREM
, VT
, Custom
);
270 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
271 setOperationAction(ISD::SMUL_LOHI
, VT
, Expand
);
272 setOperationAction(ISD::UMUL_LOHI
, VT
, Expand
);
274 setOperationAction(ISD::BSWAP
, VT
, Expand
);
275 setOperationAction(ISD::CTTZ
, VT
, Expand
);
276 setOperationAction(ISD::CTLZ
, VT
, Expand
);
279 if (!Subtarget
->hasBCNT(32))
280 setOperationAction(ISD::CTPOP
, MVT::i32
, Expand
);
282 if (!Subtarget
->hasBCNT(64))
283 setOperationAction(ISD::CTPOP
, MVT::i64
, Expand
);
285 // The hardware supports 32-bit ROTR, but not ROTL.
286 setOperationAction(ISD::ROTL
, MVT::i32
, Expand
);
287 setOperationAction(ISD::ROTL
, MVT::i64
, Expand
);
288 setOperationAction(ISD::ROTR
, MVT::i64
, Expand
);
290 setOperationAction(ISD::MUL
, MVT::i64
, Expand
);
291 setOperationAction(ISD::MULHU
, MVT::i64
, Expand
);
292 setOperationAction(ISD::MULHS
, MVT::i64
, Expand
);
293 setOperationAction(ISD::UDIV
, MVT::i32
, Expand
);
294 setOperationAction(ISD::UREM
, MVT::i32
, Expand
);
295 setOperationAction(ISD::UINT_TO_FP
, MVT::i64
, Custom
);
296 setOperationAction(ISD::SINT_TO_FP
, MVT::i64
, Custom
);
297 setOperationAction(ISD::FP_TO_SINT
, MVT::i64
, Custom
);
298 setOperationAction(ISD::FP_TO_UINT
, MVT::i64
, Custom
);
299 setOperationAction(ISD::SELECT_CC
, MVT::i64
, Expand
);
301 if (!Subtarget
->hasFFBH())
302 setOperationAction(ISD::CTLZ_ZERO_UNDEF
, MVT::i32
, Expand
);
304 if (!Subtarget
->hasFFBL())
305 setOperationAction(ISD::CTTZ_ZERO_UNDEF
, MVT::i32
, Expand
);
307 static const MVT::SimpleValueType VectorIntTypes
[] = {
308 MVT::v2i32
, MVT::v4i32
311 for (MVT VT
: VectorIntTypes
) {
312 // Expand the following operations for the current type by default.
313 setOperationAction(ISD::ADD
, VT
, Expand
);
314 setOperationAction(ISD::AND
, VT
, Expand
);
315 setOperationAction(ISD::FP_TO_SINT
, VT
, Expand
);
316 setOperationAction(ISD::FP_TO_UINT
, VT
, Expand
);
317 setOperationAction(ISD::MUL
, VT
, Expand
);
318 setOperationAction(ISD::OR
, VT
, Expand
);
319 setOperationAction(ISD::SHL
, VT
, Expand
);
320 setOperationAction(ISD::SRA
, VT
, Expand
);
321 setOperationAction(ISD::SRL
, VT
, Expand
);
322 setOperationAction(ISD::ROTL
, VT
, Expand
);
323 setOperationAction(ISD::ROTR
, VT
, Expand
);
324 setOperationAction(ISD::SUB
, VT
, Expand
);
325 setOperationAction(ISD::SINT_TO_FP
, VT
, Expand
);
326 setOperationAction(ISD::UINT_TO_FP
, VT
, Expand
);
327 setOperationAction(ISD::SDIV
, VT
, Expand
);
328 setOperationAction(ISD::UDIV
, VT
, Expand
);
329 setOperationAction(ISD::SREM
, VT
, Expand
);
330 setOperationAction(ISD::UREM
, VT
, Expand
);
331 setOperationAction(ISD::SMUL_LOHI
, VT
, Expand
);
332 setOperationAction(ISD::UMUL_LOHI
, VT
, Expand
);
333 setOperationAction(ISD::SDIVREM
, VT
, Custom
);
334 setOperationAction(ISD::UDIVREM
, VT
, Custom
);
335 setOperationAction(ISD::ADDC
, VT
, Expand
);
336 setOperationAction(ISD::SUBC
, VT
, Expand
);
337 setOperationAction(ISD::ADDE
, VT
, Expand
);
338 setOperationAction(ISD::SUBE
, VT
, Expand
);
339 setOperationAction(ISD::SELECT
, VT
, Expand
);
340 setOperationAction(ISD::VSELECT
, VT
, Expand
);
341 setOperationAction(ISD::SELECT_CC
, VT
, Expand
);
342 setOperationAction(ISD::XOR
, VT
, Expand
);
343 setOperationAction(ISD::BSWAP
, VT
, Expand
);
344 setOperationAction(ISD::CTPOP
, VT
, Expand
);
345 setOperationAction(ISD::CTTZ
, VT
, Expand
);
346 setOperationAction(ISD::CTTZ_ZERO_UNDEF
, VT
, Expand
);
347 setOperationAction(ISD::CTLZ
, VT
, Expand
);
348 setOperationAction(ISD::CTLZ_ZERO_UNDEF
, VT
, Expand
);
349 setOperationAction(ISD::VECTOR_SHUFFLE
, VT
, Expand
);
352 static const MVT::SimpleValueType FloatVectorTypes
[] = {
353 MVT::v2f32
, MVT::v4f32
356 for (MVT VT
: FloatVectorTypes
) {
357 setOperationAction(ISD::FABS
, VT
, Expand
);
358 setOperationAction(ISD::FMINNUM
, VT
, Expand
);
359 setOperationAction(ISD::FMAXNUM
, VT
, Expand
);
360 setOperationAction(ISD::FADD
, VT
, Expand
);
361 setOperationAction(ISD::FCEIL
, VT
, Expand
);
362 setOperationAction(ISD::FCOS
, VT
, Expand
);
363 setOperationAction(ISD::FDIV
, VT
, Expand
);
364 setOperationAction(ISD::FEXP2
, VT
, Expand
);
365 setOperationAction(ISD::FLOG2
, VT
, Expand
);
366 setOperationAction(ISD::FREM
, VT
, Expand
);
367 setOperationAction(ISD::FPOW
, VT
, Expand
);
368 setOperationAction(ISD::FFLOOR
, VT
, Expand
);
369 setOperationAction(ISD::FTRUNC
, VT
, Expand
);
370 setOperationAction(ISD::FMUL
, VT
, Expand
);
371 setOperationAction(ISD::FMA
, VT
, Expand
);
372 setOperationAction(ISD::FRINT
, VT
, Expand
);
373 setOperationAction(ISD::FNEARBYINT
, VT
, Expand
);
374 setOperationAction(ISD::FSQRT
, VT
, Expand
);
375 setOperationAction(ISD::FSIN
, VT
, Expand
);
376 setOperationAction(ISD::FSUB
, VT
, Expand
);
377 setOperationAction(ISD::FNEG
, VT
, Expand
);
378 setOperationAction(ISD::SELECT
, VT
, Expand
);
379 setOperationAction(ISD::VSELECT
, VT
, Expand
);
380 setOperationAction(ISD::SELECT_CC
, VT
, Expand
);
381 setOperationAction(ISD::FCOPYSIGN
, VT
, Expand
);
382 setOperationAction(ISD::VECTOR_SHUFFLE
, VT
, Expand
);
385 setOperationAction(ISD::FNEARBYINT
, MVT::f32
, Custom
);
386 setOperationAction(ISD::FNEARBYINT
, MVT::f64
, Custom
);
388 setTargetDAGCombine(ISD::MUL
);
389 setTargetDAGCombine(ISD::SELECT
);
390 setTargetDAGCombine(ISD::SELECT_CC
);
391 setTargetDAGCombine(ISD::STORE
);
393 setBooleanContents(ZeroOrNegativeOneBooleanContent
);
394 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent
);
396 setSchedulingPreference(Sched::RegPressure
);
397 setJumpIsExpensive(true);
399 // SI at least has hardware support for floating point exceptions, but no way
400 // of using or handling them is implemented. They are also optional in OpenCL
402 setHasFloatingPointExceptions(false);
404 setSelectIsExpensive(false);
405 PredictableSelectIsExpensive
= false;
407 // There are no integer divide instructions, and these expand to a pretty
408 // large sequence of instructions.
409 setIntDivIsCheap(false);
410 setPow2SDivIsCheap(false);
411 setFsqrtIsCheap(true);
413 // FIXME: Need to really handle these.
414 MaxStoresPerMemcpy
= 4096;
415 MaxStoresPerMemmove
= 4096;
416 MaxStoresPerMemset
= 4096;
//===----------------------------------------------------------------------===//
// Target Information
//===----------------------------------------------------------------------===//
423 MVT
AMDGPUTargetLowering::getVectorIdxTy() const {
427 bool AMDGPUTargetLowering::isSelectSupported(SelectSupportKind SelType
) const {
431 // The backend supports 32 and 64 bit floating point immediates.
432 // FIXME: Why are we reporting vectors of FP immediates as legal?
433 bool AMDGPUTargetLowering::isFPImmLegal(const APFloat
&Imm
, EVT VT
) const {
434 EVT ScalarVT
= VT
.getScalarType();
435 return (ScalarVT
== MVT::f32
|| ScalarVT
== MVT::f64
);
438 // We don't want to shrink f64 / f32 constants.
439 bool AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT
) const {
440 EVT ScalarVT
= VT
.getScalarType();
441 return (ScalarVT
!= MVT::f32
&& ScalarVT
!= MVT::f64
);
444 bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode
*N
,
448 unsigned NewSize
= NewVT
.getStoreSizeInBits();
450 // If we are reducing to a 32-bit load, this is always better.
454 EVT OldVT
= N
->getValueType(0);
455 unsigned OldSize
= OldVT
.getStoreSizeInBits();
457 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
458 // extloads, so doing one requires using a buffer_load. In cases where we
459 // still couldn't use a scalar load, using the wider load shouldn't really
462 // If the old size already had to be an extload, there's no harm in continuing
463 // to reduce the width.
464 return (OldSize
< 32);
467 bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy
,
469 if (LoadTy
.getSizeInBits() != CastTy
.getSizeInBits())
472 unsigned LScalarSize
= LoadTy
.getScalarType().getSizeInBits();
473 unsigned CastScalarSize
= CastTy
.getScalarType().getSizeInBits();
475 return ((LScalarSize
<= CastScalarSize
) ||
476 (CastScalarSize
>= 32) ||
480 // SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
481 // profitable with the expansion for 64-bit since it's generally good to
483 // FIXME: These should really have the size as a parameter.
484 bool AMDGPUTargetLowering::isCheapToSpeculateCttz() const {
488 bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
496 bool AMDGPUTargetLowering::isFAbsFree(EVT VT
) const {
497 assert(VT
.isFloatingPoint());
498 return VT
== MVT::f32
|| VT
== MVT::f64
;
501 bool AMDGPUTargetLowering::isFNegFree(EVT VT
) const {
502 assert(VT
.isFloatingPoint());
503 return VT
== MVT::f32
|| VT
== MVT::f64
;
506 bool AMDGPUTargetLowering::isTruncateFree(EVT Source
, EVT Dest
) const {
507 // Truncate is just accessing a subregister.
508 return Dest
.bitsLT(Source
) && (Dest
.getSizeInBits() % 32 == 0);
511 bool AMDGPUTargetLowering::isTruncateFree(Type
*Source
, Type
*Dest
) const {
512 // Truncate is just accessing a subregister.
513 return Dest
->getPrimitiveSizeInBits() < Source
->getPrimitiveSizeInBits() &&
514 (Dest
->getPrimitiveSizeInBits() % 32 == 0);
517 bool AMDGPUTargetLowering::isZExtFree(Type
*Src
, Type
*Dest
) const {
518 const DataLayout
*DL
= getDataLayout();
519 unsigned SrcSize
= DL
->getTypeSizeInBits(Src
->getScalarType());
520 unsigned DestSize
= DL
->getTypeSizeInBits(Dest
->getScalarType());
522 return SrcSize
== 32 && DestSize
== 64;
525 bool AMDGPUTargetLowering::isZExtFree(EVT Src
, EVT Dest
) const {
526 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
527 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
528 // this will enable reducing 64-bit operations the 32-bit, which is always
530 return Src
== MVT::i32
&& Dest
== MVT::i64
;
533 bool AMDGPUTargetLowering::isZExtFree(SDValue Val
, EVT VT2
) const {
534 return isZExtFree(Val
.getValueType(), VT2
);
537 bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT
, EVT DestVT
) const {
538 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
539 // limited number of native 64-bit operations. Shrinking an operation to fit
540 // in a single 32-bit register should always be helpful. As currently used,
541 // this is much less general than the name suggests, and is only used in
542 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
543 // not profitable, and may actually be harmful.
544 return SrcVT
.getSizeInBits() > 32 && DestVT
.getSizeInBits() == 32;
//===---------------------------------------------------------------------===//
// TargetLowering Callbacks
//===---------------------------------------------------------------------===//
551 void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState
&State
,
552 const SmallVectorImpl
<ISD::InputArg
> &Ins
) const {
554 State
.AnalyzeFormalArguments(Ins
, CC_AMDGPU
);
557 SDValue
AMDGPUTargetLowering::LowerReturn(
559 CallingConv::ID CallConv
,
561 const SmallVectorImpl
<ISD::OutputArg
> &Outs
,
562 const SmallVectorImpl
<SDValue
> &OutVals
,
563 SDLoc DL
, SelectionDAG
&DAG
) const {
564 return DAG
.getNode(AMDGPUISD::RET_FLAG
, DL
, MVT::Other
, Chain
);
//===---------------------------------------------------------------------===//
// Target specific lowering
//===---------------------------------------------------------------------===//
571 SDValue
AMDGPUTargetLowering::LowerCall(CallLoweringInfo
&CLI
,
572 SmallVectorImpl
<SDValue
> &InVals
) const {
573 SDValue Callee
= CLI
.Callee
;
574 SelectionDAG
&DAG
= CLI
.DAG
;
576 const Function
&Fn
= *DAG
.getMachineFunction().getFunction();
578 StringRef
FuncName("<unknown>");
580 if (const ExternalSymbolSDNode
*G
= dyn_cast
<ExternalSymbolSDNode
>(Callee
))
581 FuncName
= G
->getSymbol();
582 else if (const GlobalAddressSDNode
*G
= dyn_cast
<GlobalAddressSDNode
>(Callee
))
583 FuncName
= G
->getGlobal()->getName();
585 DiagnosticInfoUnsupported
NoCalls(Fn
, "call to function " + FuncName
);
586 DAG
.getContext()->diagnose(NoCalls
);
590 SDValue
AMDGPUTargetLowering::LowerOperation(SDValue Op
,
591 SelectionDAG
&DAG
) const {
592 switch (Op
.getOpcode()) {
594 Op
.getNode()->dump();
595 llvm_unreachable("Custom lowering code for this"
596 "instruction is not implemented yet!");
598 case ISD::SIGN_EXTEND_INREG
: return LowerSIGN_EXTEND_INREG(Op
, DAG
);
599 case ISD::CONCAT_VECTORS
: return LowerCONCAT_VECTORS(Op
, DAG
);
600 case ISD::EXTRACT_SUBVECTOR
: return LowerEXTRACT_SUBVECTOR(Op
, DAG
);
601 case ISD::FrameIndex
: return LowerFrameIndex(Op
, DAG
);
602 case ISD::INTRINSIC_WO_CHAIN
: return LowerINTRINSIC_WO_CHAIN(Op
, DAG
);
603 case ISD::UDIVREM
: return LowerUDIVREM(Op
, DAG
);
604 case ISD::SDIVREM
: return LowerSDIVREM(Op
, DAG
);
605 case ISD::FREM
: return LowerFREM(Op
, DAG
);
606 case ISD::FCEIL
: return LowerFCEIL(Op
, DAG
);
607 case ISD::FTRUNC
: return LowerFTRUNC(Op
, DAG
);
608 case ISD::FRINT
: return LowerFRINT(Op
, DAG
);
609 case ISD::FNEARBYINT
: return LowerFNEARBYINT(Op
, DAG
);
610 case ISD::FFLOOR
: return LowerFFLOOR(Op
, DAG
);
611 case ISD::SINT_TO_FP
: return LowerSINT_TO_FP(Op
, DAG
);
612 case ISD::UINT_TO_FP
: return LowerUINT_TO_FP(Op
, DAG
);
613 case ISD::FP_TO_SINT
: return LowerFP_TO_SINT(Op
, DAG
);
614 case ISD::FP_TO_UINT
: return LowerFP_TO_UINT(Op
, DAG
);
619 void AMDGPUTargetLowering::ReplaceNodeResults(SDNode
*N
,
620 SmallVectorImpl
<SDValue
> &Results
,
621 SelectionDAG
&DAG
) const {
622 switch (N
->getOpcode()) {
623 case ISD::SIGN_EXTEND_INREG
:
624 // Different parts of legalization seem to interpret which type of
625 // sign_extend_inreg is the one to check for custom lowering. The extended
626 // from type is what really matters, but some places check for custom
627 // lowering of the result type. This results in trying to use
628 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
629 // nothing here and let the illegal result integer be handled normally.
632 SDNode
*Node
= LowerLOAD(SDValue(N
, 0), DAG
).getNode();
636 Results
.push_back(SDValue(Node
, 0));
637 Results
.push_back(SDValue(Node
, 1));
638 // XXX: LLVM seems not to replace Chain Value inside CustomWidenLowerNode
640 DAG
.ReplaceAllUsesOfValueWith(SDValue(N
,1), SDValue(Node
, 1));
644 SDValue Lowered
= LowerSTORE(SDValue(N
, 0), DAG
);
645 if (Lowered
.getNode())
646 Results
.push_back(Lowered
);
654 // FIXME: This implements accesses to initialized globals in the constant
655 // address space by copying them to private and accessing that. It does not
656 // properly handle illegal types or vectors. The private vector loads are not
657 // scalarized, and the illegal scalars hit an assertion. This technique will not
658 // work well with large initializers, and this should eventually be
659 // removed. Initialized globals should be placed into a data section that the
660 // runtime will load into a buffer before the kernel is executed. Uses of the
661 // global need to be replaced with a pointer loaded from an implicit kernel
662 // argument into this buffer holding the copy of the data, which will remove the
663 // need for any of this.
664 SDValue
AMDGPUTargetLowering::LowerConstantInitializer(const Constant
* Init
,
665 const GlobalValue
*GV
,
666 const SDValue
&InitPtr
,
668 SelectionDAG
&DAG
) const {
669 const DataLayout
*TD
= getTargetMachine().getSubtargetImpl()->getDataLayout();
671 Type
*InitTy
= Init
->getType();
673 if (const ConstantInt
*CI
= dyn_cast
<ConstantInt
>(Init
)) {
674 EVT VT
= EVT::getEVT(InitTy
);
675 PointerType
*PtrTy
= PointerType::get(InitTy
, AMDGPUAS::PRIVATE_ADDRESS
);
676 return DAG
.getStore(Chain
, DL
, DAG
.getConstant(*CI
, VT
), InitPtr
,
677 MachinePointerInfo(UndefValue::get(PtrTy
)), false, false,
678 TD
->getPrefTypeAlignment(InitTy
));
681 if (const ConstantFP
*CFP
= dyn_cast
<ConstantFP
>(Init
)) {
682 EVT VT
= EVT::getEVT(CFP
->getType());
683 PointerType
*PtrTy
= PointerType::get(CFP
->getType(), 0);
684 return DAG
.getStore(Chain
, DL
, DAG
.getConstantFP(*CFP
, VT
), InitPtr
,
685 MachinePointerInfo(UndefValue::get(PtrTy
)), false, false,
686 TD
->getPrefTypeAlignment(CFP
->getType()));
689 if (StructType
*ST
= dyn_cast
<StructType
>(InitTy
)) {
690 const StructLayout
*SL
= TD
->getStructLayout(ST
);
692 EVT PtrVT
= InitPtr
.getValueType();
693 SmallVector
<SDValue
, 8> Chains
;
695 for (unsigned I
= 0, N
= ST
->getNumElements(); I
!= N
; ++I
) {
696 SDValue Offset
= DAG
.getConstant(SL
->getElementOffset(I
), PtrVT
);
697 SDValue Ptr
= DAG
.getNode(ISD::ADD
, DL
, PtrVT
, InitPtr
, Offset
);
699 Constant
*Elt
= Init
->getAggregateElement(I
);
700 Chains
.push_back(LowerConstantInitializer(Elt
, GV
, Ptr
, Chain
, DAG
));
703 return DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, Chains
);
706 if (SequentialType
*SeqTy
= dyn_cast
<SequentialType
>(InitTy
)) {
707 EVT PtrVT
= InitPtr
.getValueType();
709 unsigned NumElements
;
710 if (ArrayType
*AT
= dyn_cast
<ArrayType
>(SeqTy
))
711 NumElements
= AT
->getNumElements();
712 else if (VectorType
*VT
= dyn_cast
<VectorType
>(SeqTy
))
713 NumElements
= VT
->getNumElements();
715 llvm_unreachable("Unexpected type");
717 unsigned EltSize
= TD
->getTypeAllocSize(SeqTy
->getElementType());
718 SmallVector
<SDValue
, 8> Chains
;
719 for (unsigned i
= 0; i
< NumElements
; ++i
) {
720 SDValue Offset
= DAG
.getConstant(i
* EltSize
, PtrVT
);
721 SDValue Ptr
= DAG
.getNode(ISD::ADD
, DL
, PtrVT
, InitPtr
, Offset
);
723 Constant
*Elt
= Init
->getAggregateElement(i
);
724 Chains
.push_back(LowerConstantInitializer(Elt
, GV
, Ptr
, Chain
, DAG
));
727 return DAG
.getNode(ISD::TokenFactor
, DL
, MVT::Other
, Chains
);
730 if (isa
<UndefValue
>(Init
)) {
731 EVT VT
= EVT::getEVT(InitTy
);
732 PointerType
*PtrTy
= PointerType::get(InitTy
, AMDGPUAS::PRIVATE_ADDRESS
);
733 return DAG
.getStore(Chain
, DL
, DAG
.getUNDEF(VT
), InitPtr
,
734 MachinePointerInfo(UndefValue::get(PtrTy
)), false, false,
735 TD
->getPrefTypeAlignment(InitTy
));
739 llvm_unreachable("Unhandled constant initializer");
742 static bool hasDefinedInitializer(const GlobalValue
*GV
) {
743 const GlobalVariable
*GVar
= dyn_cast
<GlobalVariable
>(GV
);
744 if (!GVar
|| !GVar
->hasInitializer())
747 if (isa
<UndefValue
>(GVar
->getInitializer()))
753 SDValue
AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction
* MFI
,
755 SelectionDAG
&DAG
) const {
757 const DataLayout
*TD
= getTargetMachine().getSubtargetImpl()->getDataLayout();
758 GlobalAddressSDNode
*G
= cast
<GlobalAddressSDNode
>(Op
);
759 const GlobalValue
*GV
= G
->getGlobal();
761 switch (G
->getAddressSpace()) {
762 case AMDGPUAS::LOCAL_ADDRESS
: {
763 // XXX: What does the value of G->getOffset() mean?
764 assert(G
->getOffset() == 0 &&
765 "Do not know what to do with an non-zero offset");
767 // TODO: We could emit code to handle the initialization somewhere.
768 if (hasDefinedInitializer(GV
))
772 if (MFI
->LocalMemoryObjects
.count(GV
) == 0) {
773 uint64_t Size
= TD
->getTypeAllocSize(GV
->getType()->getElementType());
774 Offset
= MFI
->LDSSize
;
775 MFI
->LocalMemoryObjects
[GV
] = Offset
;
776 // XXX: Account for alignment?
777 MFI
->LDSSize
+= Size
;
779 Offset
= MFI
->LocalMemoryObjects
[GV
];
782 return DAG
.getConstant(Offset
, getPointerTy(AMDGPUAS::LOCAL_ADDRESS
));
784 case AMDGPUAS::CONSTANT_ADDRESS
: {
785 MachineFrameInfo
*FrameInfo
= DAG
.getMachineFunction().getFrameInfo();
786 Type
*EltType
= GV
->getType()->getElementType();
787 unsigned Size
= TD
->getTypeAllocSize(EltType
);
788 unsigned Alignment
= TD
->getPrefTypeAlignment(EltType
);
790 MVT PrivPtrVT
= getPointerTy(AMDGPUAS::PRIVATE_ADDRESS
);
791 MVT ConstPtrVT
= getPointerTy(AMDGPUAS::CONSTANT_ADDRESS
);
793 int FI
= FrameInfo
->CreateStackObject(Size
, Alignment
, false);
794 SDValue InitPtr
= DAG
.getFrameIndex(FI
, PrivPtrVT
);
796 const GlobalVariable
*Var
= cast
<GlobalVariable
>(GV
);
797 if (!Var
->hasInitializer()) {
798 // This has no use, but bugpoint will hit it.
799 return DAG
.getZExtOrTrunc(InitPtr
, SDLoc(Op
), ConstPtrVT
);
802 const Constant
*Init
= Var
->getInitializer();
803 SmallVector
<SDNode
*, 8> WorkList
;
805 for (SDNode::use_iterator I
= DAG
.getEntryNode()->use_begin(),
806 E
= DAG
.getEntryNode()->use_end(); I
!= E
; ++I
) {
807 if (I
->getOpcode() != AMDGPUISD::REGISTER_LOAD
&& I
->getOpcode() != ISD::LOAD
)
809 WorkList
.push_back(*I
);
811 SDValue Chain
= LowerConstantInitializer(Init
, GV
, InitPtr
, DAG
.getEntryNode(), DAG
);
812 for (SmallVector
<SDNode
*, 8>::iterator I
= WorkList
.begin(),
813 E
= WorkList
.end(); I
!= E
; ++I
) {
814 SmallVector
<SDValue
, 8> Ops
;
815 Ops
.push_back(Chain
);
816 for (unsigned i
= 1; i
< (*I
)->getNumOperands(); ++i
) {
817 Ops
.push_back((*I
)->getOperand(i
));
819 DAG
.UpdateNodeOperands(*I
, Ops
);
821 return DAG
.getZExtOrTrunc(InitPtr
, SDLoc(Op
), ConstPtrVT
);
825 const Function
&Fn
= *DAG
.getMachineFunction().getFunction();
826 DiagnosticInfoUnsupported
BadInit(Fn
,
827 "initializer for address space");
828 DAG
.getContext()->diagnose(BadInit
);
832 SDValue
AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op
,
833 SelectionDAG
&DAG
) const {
834 SmallVector
<SDValue
, 8> Args
;
835 SDValue A
= Op
.getOperand(0);
836 SDValue B
= Op
.getOperand(1);
838 DAG
.ExtractVectorElements(A
, Args
);
839 DAG
.ExtractVectorElements(B
, Args
);
841 return DAG
.getNode(ISD::BUILD_VECTOR
, SDLoc(Op
), Op
.getValueType(), Args
);
844 SDValue
AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op
,
845 SelectionDAG
&DAG
) const {
847 SmallVector
<SDValue
, 8> Args
;
848 unsigned Start
= cast
<ConstantSDNode
>(Op
.getOperand(1))->getZExtValue();
849 EVT VT
= Op
.getValueType();
850 DAG
.ExtractVectorElements(Op
.getOperand(0), Args
, Start
,
851 VT
.getVectorNumElements());
853 return DAG
.getNode(ISD::BUILD_VECTOR
, SDLoc(Op
), Op
.getValueType(), Args
);
856 SDValue
AMDGPUTargetLowering::LowerFrameIndex(SDValue Op
,
857 SelectionDAG
&DAG
) const {
859 MachineFunction
&MF
= DAG
.getMachineFunction();
860 const AMDGPUFrameLowering
*TFL
= static_cast<const AMDGPUFrameLowering
*>(
861 getTargetMachine().getSubtargetImpl()->getFrameLowering());
863 FrameIndexSDNode
*FIN
= cast
<FrameIndexSDNode
>(Op
);
865 unsigned FrameIndex
= FIN
->getIndex();
866 unsigned Offset
= TFL
->getFrameIndexOffset(MF
, FrameIndex
);
867 return DAG
.getConstant(Offset
* 4 * TFL
->getStackWidth(MF
),
871 SDValue
AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op
,
872 SelectionDAG
&DAG
) const {
873 unsigned IntrinsicID
= cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue();
875 EVT VT
= Op
.getValueType();
877 switch (IntrinsicID
) {
879 case AMDGPUIntrinsic::AMDGPU_abs
:
880 case AMDGPUIntrinsic::AMDIL_abs
: // Legacy name.
881 return LowerIntrinsicIABS(Op
, DAG
);
882 case AMDGPUIntrinsic::AMDGPU_lrp
:
883 return LowerIntrinsicLRP(Op
, DAG
);
884 case AMDGPUIntrinsic::AMDGPU_fract
:
885 case AMDGPUIntrinsic::AMDIL_fraction
: // Legacy name.
886 return DAG
.getNode(AMDGPUISD::FRACT
, DL
, VT
, Op
.getOperand(1));
888 case AMDGPUIntrinsic::AMDGPU_clamp
:
889 case AMDGPUIntrinsic::AMDIL_clamp
: // Legacy name.
890 return DAG
.getNode(AMDGPUISD::CLAMP
, DL
, VT
,
891 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
893 case Intrinsic::AMDGPU_div_scale
: {
894 // 3rd parameter required to be a constant.
895 const ConstantSDNode
*Param
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(3));
897 return DAG
.getUNDEF(VT
);
899 // Translate to the operands expected by the machine instruction. The
900 // first parameter must be the same as the first instruction.
901 SDValue Numerator
= Op
.getOperand(1);
902 SDValue Denominator
= Op
.getOperand(2);
904 // Note this order is opposite of the machine instruction's operations,
905 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
906 // intrinsic has the numerator as the first operand to match a normal
907 // division operation.
909 SDValue Src0
= Param
->isAllOnesValue() ? Numerator
: Denominator
;
911 return DAG
.getNode(AMDGPUISD::DIV_SCALE
, DL
, Op
->getVTList(), Src0
,
912 Denominator
, Numerator
);
915 case Intrinsic::AMDGPU_div_fmas
:
916 // FIXME: Dropping bool parameter. Work is needed to support the implicit
918 return DAG
.getNode(AMDGPUISD::DIV_FMAS
, DL
, VT
,
919 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
921 case Intrinsic::AMDGPU_div_fixup
:
922 return DAG
.getNode(AMDGPUISD::DIV_FIXUP
, DL
, VT
,
923 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
925 case Intrinsic::AMDGPU_trig_preop
:
926 return DAG
.getNode(AMDGPUISD::TRIG_PREOP
, DL
, VT
,
927 Op
.getOperand(1), Op
.getOperand(2));
929 case Intrinsic::AMDGPU_rcp
:
930 return DAG
.getNode(AMDGPUISD::RCP
, DL
, VT
, Op
.getOperand(1));
932 case Intrinsic::AMDGPU_rsq
:
933 return DAG
.getNode(AMDGPUISD::RSQ
, DL
, VT
, Op
.getOperand(1));
935 case AMDGPUIntrinsic::AMDGPU_legacy_rsq
:
936 return DAG
.getNode(AMDGPUISD::RSQ_LEGACY
, DL
, VT
, Op
.getOperand(1));
938 case Intrinsic::AMDGPU_rsq_clamped
:
939 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
) {
940 Type
*Type
= VT
.getTypeForEVT(*DAG
.getContext());
941 APFloat Max
= APFloat::getLargest(Type
->getFltSemantics());
942 APFloat Min
= APFloat::getLargest(Type
->getFltSemantics(), true);
944 SDValue Rsq
= DAG
.getNode(AMDGPUISD::RSQ
, DL
, VT
, Op
.getOperand(1));
945 SDValue Tmp
= DAG
.getNode(ISD::FMINNUM
, DL
, VT
, Rsq
,
946 DAG
.getConstantFP(Max
, VT
));
947 return DAG
.getNode(ISD::FMAXNUM
, DL
, VT
, Tmp
,
948 DAG
.getConstantFP(Min
, VT
));
950 return DAG
.getNode(AMDGPUISD::RSQ_CLAMPED
, DL
, VT
, Op
.getOperand(1));
953 case Intrinsic::AMDGPU_ldexp
:
954 return DAG
.getNode(AMDGPUISD::LDEXP
, DL
, VT
, Op
.getOperand(1),
957 case AMDGPUIntrinsic::AMDGPU_imax
:
958 return DAG
.getNode(AMDGPUISD::SMAX
, DL
, VT
, Op
.getOperand(1),
960 case AMDGPUIntrinsic::AMDGPU_umax
:
961 return DAG
.getNode(AMDGPUISD::UMAX
, DL
, VT
, Op
.getOperand(1),
963 case AMDGPUIntrinsic::AMDGPU_imin
:
964 return DAG
.getNode(AMDGPUISD::SMIN
, DL
, VT
, Op
.getOperand(1),
966 case AMDGPUIntrinsic::AMDGPU_umin
:
967 return DAG
.getNode(AMDGPUISD::UMIN
, DL
, VT
, Op
.getOperand(1),
970 case AMDGPUIntrinsic::AMDGPU_umul24
:
971 return DAG
.getNode(AMDGPUISD::MUL_U24
, DL
, VT
,
972 Op
.getOperand(1), Op
.getOperand(2));
974 case AMDGPUIntrinsic::AMDGPU_imul24
:
975 return DAG
.getNode(AMDGPUISD::MUL_I24
, DL
, VT
,
976 Op
.getOperand(1), Op
.getOperand(2));
978 case AMDGPUIntrinsic::AMDGPU_umad24
:
979 return DAG
.getNode(AMDGPUISD::MAD_U24
, DL
, VT
,
980 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
982 case AMDGPUIntrinsic::AMDGPU_imad24
:
983 return DAG
.getNode(AMDGPUISD::MAD_I24
, DL
, VT
,
984 Op
.getOperand(1), Op
.getOperand(2), Op
.getOperand(3));
986 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte0
:
987 return DAG
.getNode(AMDGPUISD::CVT_F32_UBYTE0
, DL
, VT
, Op
.getOperand(1));
989 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte1
:
990 return DAG
.getNode(AMDGPUISD::CVT_F32_UBYTE1
, DL
, VT
, Op
.getOperand(1));
992 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte2
:
993 return DAG
.getNode(AMDGPUISD::CVT_F32_UBYTE2
, DL
, VT
, Op
.getOperand(1));
995 case AMDGPUIntrinsic::AMDGPU_cvt_f32_ubyte3
:
996 return DAG
.getNode(AMDGPUISD::CVT_F32_UBYTE3
, DL
, VT
, Op
.getOperand(1));
998 case AMDGPUIntrinsic::AMDGPU_bfe_i32
:
999 return DAG
.getNode(AMDGPUISD::BFE_I32
, DL
, VT
,
1004 case AMDGPUIntrinsic::AMDGPU_bfe_u32
:
1005 return DAG
.getNode(AMDGPUISD::BFE_U32
, DL
, VT
,
1010 case AMDGPUIntrinsic::AMDGPU_bfi
:
1011 return DAG
.getNode(AMDGPUISD::BFI
, DL
, VT
,
1016 case AMDGPUIntrinsic::AMDGPU_bfm
:
1017 return DAG
.getNode(AMDGPUISD::BFM
, DL
, VT
,
1021 case AMDGPUIntrinsic::AMDGPU_brev
:
1022 return DAG
.getNode(AMDGPUISD::BREV
, DL
, VT
, Op
.getOperand(1));
1024 case Intrinsic::AMDGPU_class
:
1025 return DAG
.getNode(AMDGPUISD::FP_CLASS
, DL
, VT
,
1026 Op
.getOperand(1), Op
.getOperand(2));
1028 case AMDGPUIntrinsic::AMDIL_exp
: // Legacy name.
1029 return DAG
.getNode(ISD::FEXP2
, DL
, VT
, Op
.getOperand(1));
1031 case AMDGPUIntrinsic::AMDIL_round_nearest
: // Legacy name.
1032 return DAG
.getNode(ISD::FRINT
, DL
, VT
, Op
.getOperand(1));
1033 case AMDGPUIntrinsic::AMDGPU_trunc
: // Legacy name.
1034 return DAG
.getNode(ISD::FTRUNC
, DL
, VT
, Op
.getOperand(1));
1038 ///IABS(a) = SMAX(sub(0, a), a)
1039 SDValue
AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op
,
1040 SelectionDAG
&DAG
) const {
1042 EVT VT
= Op
.getValueType();
1043 SDValue Neg
= DAG
.getNode(ISD::SUB
, DL
, VT
, DAG
.getConstant(0, VT
),
1046 return DAG
.getNode(AMDGPUISD::SMAX
, DL
, VT
, Neg
, Op
.getOperand(1));
1049 /// Linear Interpolation
1050 /// LRP(a, b, c) = muladd(a, b, (1 - a) * c)
1051 SDValue
AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op
,
1052 SelectionDAG
&DAG
) const {
1054 EVT VT
= Op
.getValueType();
1055 SDValue OneSubA
= DAG
.getNode(ISD::FSUB
, DL
, VT
,
1056 DAG
.getConstantFP(1.0f
, MVT::f32
),
1058 SDValue OneSubAC
= DAG
.getNode(ISD::FMUL
, DL
, VT
, OneSubA
,
1060 return DAG
.getNode(ISD::FADD
, DL
, VT
,
1061 DAG
.getNode(ISD::FMUL
, DL
, VT
, Op
.getOperand(1), Op
.getOperand(2)),
1065 /// \brief Generate Min/Max node
1066 SDValue
AMDGPUTargetLowering::CombineFMinMaxLegacy(SDLoc DL
,
1073 DAGCombinerInfo
&DCI
) const {
1074 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS
)
1077 if (!(LHS
== True
&& RHS
== False
) && !(LHS
== False
&& RHS
== True
))
1080 SelectionDAG
&DAG
= DCI
.DAG
;
1081 ISD::CondCode CCOpcode
= cast
<CondCodeSDNode
>(CC
)->get();
1090 case ISD::SETFALSE2
:
1099 return DAG
.getNode(AMDGPUISD::FMIN_LEGACY
, DL
, VT
, RHS
, LHS
);
1100 return DAG
.getNode(AMDGPUISD::FMAX_LEGACY
, DL
, VT
, LHS
, RHS
);
1106 // Ordered. Assume ordered for undefined.
1108 // Only do this after legalization to avoid interfering with other combines
1109 // which might occur.
1110 if (DCI
.getDAGCombineLevel() < AfterLegalizeDAG
&&
1111 !DCI
.isCalledByLegalizer())
1114 // We need to permute the operands to get the correct NaN behavior. The
1115 // selected operand is the second one based on the failing compare with NaN,
1116 // so permute it based on the compare type the hardware uses.
1118 return DAG
.getNode(AMDGPUISD::FMIN_LEGACY
, DL
, VT
, LHS
, RHS
);
1119 return DAG
.getNode(AMDGPUISD::FMAX_LEGACY
, DL
, VT
, RHS
, LHS
);
1124 return DAG
.getNode(AMDGPUISD::FMAX_LEGACY
, DL
, VT
, RHS
, LHS
);
1125 return DAG
.getNode(AMDGPUISD::FMIN_LEGACY
, DL
, VT
, LHS
, RHS
);
1131 if (DCI
.getDAGCombineLevel() < AfterLegalizeDAG
&&
1132 !DCI
.isCalledByLegalizer())
1136 return DAG
.getNode(AMDGPUISD::FMAX_LEGACY
, DL
, VT
, LHS
, RHS
);
1137 return DAG
.getNode(AMDGPUISD::FMIN_LEGACY
, DL
, VT
, RHS
, LHS
);
1139 case ISD::SETCC_INVALID
:
1140 llvm_unreachable("Invalid setcc condcode!");
1145 /// \brief Generate Min/Max node
1146 SDValue
AMDGPUTargetLowering::CombineIMinMax(SDLoc DL
,
1153 SelectionDAG
&DAG
) const {
1154 if (!(LHS
== True
&& RHS
== False
) && !(LHS
== False
&& RHS
== True
))
1157 ISD::CondCode CCOpcode
= cast
<CondCodeSDNode
>(CC
)->get();
1161 unsigned Opc
= (LHS
== True
) ? AMDGPUISD::UMIN
: AMDGPUISD::UMAX
;
1162 return DAG
.getNode(Opc
, DL
, VT
, LHS
, RHS
);
1166 unsigned Opc
= (LHS
== True
) ? AMDGPUISD::SMIN
: AMDGPUISD::SMAX
;
1167 return DAG
.getNode(Opc
, DL
, VT
, LHS
, RHS
);
1171 unsigned Opc
= (LHS
== True
) ? AMDGPUISD::SMAX
: AMDGPUISD::SMIN
;
1172 return DAG
.getNode(Opc
, DL
, VT
, LHS
, RHS
);
1176 unsigned Opc
= (LHS
== True
) ? AMDGPUISD::UMAX
: AMDGPUISD::UMIN
;
1177 return DAG
.getNode(Opc
, DL
, VT
, LHS
, RHS
);
1184 SDValue
AMDGPUTargetLowering::ScalarizeVectorLoad(const SDValue Op
,
1185 SelectionDAG
&DAG
) const {
1186 LoadSDNode
*Load
= cast
<LoadSDNode
>(Op
);
1187 EVT MemVT
= Load
->getMemoryVT();
1188 EVT MemEltVT
= MemVT
.getVectorElementType();
1190 EVT LoadVT
= Op
.getValueType();
1191 EVT EltVT
= LoadVT
.getVectorElementType();
1192 EVT PtrVT
= Load
->getBasePtr().getValueType();
1194 unsigned NumElts
= Load
->getMemoryVT().getVectorNumElements();
1195 SmallVector
<SDValue
, 8> Loads
;
1196 SmallVector
<SDValue
, 8> Chains
;
1199 unsigned MemEltSize
= MemEltVT
.getStoreSize();
1200 MachinePointerInfo
SrcValue(Load
->getMemOperand()->getValue());
1202 for (unsigned i
= 0; i
< NumElts
; ++i
) {
1203 SDValue Ptr
= DAG
.getNode(ISD::ADD
, SL
, PtrVT
, Load
->getBasePtr(),
1204 DAG
.getConstant(i
* MemEltSize
, PtrVT
));
1207 = DAG
.getExtLoad(Load
->getExtensionType(), SL
, EltVT
,
1208 Load
->getChain(), Ptr
,
1209 SrcValue
.getWithOffset(i
* MemEltSize
),
1210 MemEltVT
, Load
->isVolatile(), Load
->isNonTemporal(),
1211 Load
->isInvariant(), Load
->getAlignment());
1212 Loads
.push_back(NewLoad
.getValue(0));
1213 Chains
.push_back(NewLoad
.getValue(1));
1217 DAG
.getNode(ISD::BUILD_VECTOR
, SL
, LoadVT
, Loads
),
1218 DAG
.getNode(ISD::TokenFactor
, SL
, MVT::Other
, Chains
)
1221 return DAG
.getMergeValues(Ops
, SL
);
1224 SDValue
AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op
,
1225 SelectionDAG
&DAG
) const {
1226 EVT VT
= Op
.getValueType();
1228 // If this is a 2 element vector, we really want to scalarize and not create
1229 // weird 1 element vectors.
1230 if (VT
.getVectorNumElements() == 2)
1231 return ScalarizeVectorLoad(Op
, DAG
);
1233 LoadSDNode
*Load
= cast
<LoadSDNode
>(Op
);
1234 SDValue BasePtr
= Load
->getBasePtr();
1235 EVT PtrVT
= BasePtr
.getValueType();
1236 EVT MemVT
= Load
->getMemoryVT();
1238 MachinePointerInfo
SrcValue(Load
->getMemOperand()->getValue());
1241 EVT LoMemVT
, HiMemVT
;
1244 std::tie(LoVT
, HiVT
) = DAG
.GetSplitDestVTs(VT
);
1245 std::tie(LoMemVT
, HiMemVT
) = DAG
.GetSplitDestVTs(MemVT
);
1246 std::tie(Lo
, Hi
) = DAG
.SplitVector(Op
, SL
, LoVT
, HiVT
);
1248 = DAG
.getExtLoad(Load
->getExtensionType(), SL
, LoVT
,
1249 Load
->getChain(), BasePtr
,
1251 LoMemVT
, Load
->isVolatile(), Load
->isNonTemporal(),
1252 Load
->isInvariant(), Load
->getAlignment());
1254 SDValue HiPtr
= DAG
.getNode(ISD::ADD
, SL
, PtrVT
, BasePtr
,
1255 DAG
.getConstant(LoMemVT
.getStoreSize(), PtrVT
));
1258 = DAG
.getExtLoad(Load
->getExtensionType(), SL
, HiVT
,
1259 Load
->getChain(), HiPtr
,
1260 SrcValue
.getWithOffset(LoMemVT
.getStoreSize()),
1261 HiMemVT
, Load
->isVolatile(), Load
->isNonTemporal(),
1262 Load
->isInvariant(), Load
->getAlignment());
1265 DAG
.getNode(ISD::CONCAT_VECTORS
, SL
, VT
, LoLoad
, HiLoad
),
1266 DAG
.getNode(ISD::TokenFactor
, SL
, MVT::Other
,
1267 LoLoad
.getValue(1), HiLoad
.getValue(1))
1270 return DAG
.getMergeValues(Ops
, SL
);
1273 SDValue
AMDGPUTargetLowering::MergeVectorStore(const SDValue
&Op
,
1274 SelectionDAG
&DAG
) const {
1275 StoreSDNode
*Store
= cast
<StoreSDNode
>(Op
);
1276 EVT MemVT
= Store
->getMemoryVT();
1277 unsigned MemBits
= MemVT
.getSizeInBits();
1279 // Byte stores are really expensive, so if possible, try to pack 32-bit vector
1280 // truncating store into an i32 store.
1281 // XXX: We could also handle optimize other vector bitwidths.
1282 if (!MemVT
.isVector() || MemBits
> 32) {
1287 SDValue Value
= Store
->getValue();
1288 EVT VT
= Value
.getValueType();
1289 EVT ElemVT
= VT
.getVectorElementType();
1290 SDValue Ptr
= Store
->getBasePtr();
1291 EVT MemEltVT
= MemVT
.getVectorElementType();
1292 unsigned MemEltBits
= MemEltVT
.getSizeInBits();
1293 unsigned MemNumElements
= MemVT
.getVectorNumElements();
1294 unsigned PackedSize
= MemVT
.getStoreSizeInBits();
1295 SDValue Mask
= DAG
.getConstant((1 << MemEltBits
) - 1, MVT::i32
);
1297 assert(Value
.getValueType().getScalarSizeInBits() >= 32);
1299 SDValue PackedValue
;
1300 for (unsigned i
= 0; i
< MemNumElements
; ++i
) {
1301 SDValue Elt
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, DL
, ElemVT
, Value
,
1302 DAG
.getConstant(i
, MVT::i32
));
1303 Elt
= DAG
.getZExtOrTrunc(Elt
, DL
, MVT::i32
);
1304 Elt
= DAG
.getNode(ISD::AND
, DL
, MVT::i32
, Elt
, Mask
); // getZeroExtendInReg
1306 SDValue Shift
= DAG
.getConstant(MemEltBits
* i
, MVT::i32
);
1307 Elt
= DAG
.getNode(ISD::SHL
, DL
, MVT::i32
, Elt
, Shift
);
1312 PackedValue
= DAG
.getNode(ISD::OR
, DL
, MVT::i32
, PackedValue
, Elt
);
1316 if (PackedSize
< 32) {
1317 EVT PackedVT
= EVT::getIntegerVT(*DAG
.getContext(), PackedSize
);
1318 return DAG
.getTruncStore(Store
->getChain(), DL
, PackedValue
, Ptr
,
1319 Store
->getMemOperand()->getPointerInfo(),
1321 Store
->isNonTemporal(), Store
->isVolatile(),
1322 Store
->getAlignment());
1325 return DAG
.getStore(Store
->getChain(), DL
, PackedValue
, Ptr
,
1326 Store
->getMemOperand()->getPointerInfo(),
1327 Store
->isVolatile(), Store
->isNonTemporal(),
1328 Store
->getAlignment());
1331 SDValue
AMDGPUTargetLowering::ScalarizeVectorStore(SDValue Op
,
1332 SelectionDAG
&DAG
) const {
1333 StoreSDNode
*Store
= cast
<StoreSDNode
>(Op
);
1334 EVT MemEltVT
= Store
->getMemoryVT().getVectorElementType();
1335 EVT EltVT
= Store
->getValue().getValueType().getVectorElementType();
1336 EVT PtrVT
= Store
->getBasePtr().getValueType();
1337 unsigned NumElts
= Store
->getMemoryVT().getVectorNumElements();
1340 SmallVector
<SDValue
, 8> Chains
;
1342 unsigned EltSize
= MemEltVT
.getStoreSize();
1343 MachinePointerInfo
SrcValue(Store
->getMemOperand()->getValue());
1345 for (unsigned i
= 0, e
= NumElts
; i
!= e
; ++i
) {
1346 SDValue Val
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, EltVT
,
1348 DAG
.getConstant(i
, MVT::i32
));
1350 SDValue Offset
= DAG
.getConstant(i
* MemEltVT
.getStoreSize(), PtrVT
);
1351 SDValue Ptr
= DAG
.getNode(ISD::ADD
, SL
, PtrVT
, Store
->getBasePtr(), Offset
);
1353 DAG
.getTruncStore(Store
->getChain(), SL
, Val
, Ptr
,
1354 SrcValue
.getWithOffset(i
* EltSize
),
1355 MemEltVT
, Store
->isNonTemporal(), Store
->isVolatile(),
1356 Store
->getAlignment());
1357 Chains
.push_back(NewStore
);
1360 return DAG
.getNode(ISD::TokenFactor
, SL
, MVT::Other
, Chains
);
1363 SDValue
AMDGPUTargetLowering::SplitVectorStore(SDValue Op
,
1364 SelectionDAG
&DAG
) const {
1365 StoreSDNode
*Store
= cast
<StoreSDNode
>(Op
);
1366 SDValue Val
= Store
->getValue();
1367 EVT VT
= Val
.getValueType();
1369 // If this is a 2 element vector, we really want to scalarize and not create
1370 // weird 1 element vectors.
1371 if (VT
.getVectorNumElements() == 2)
1372 return ScalarizeVectorStore(Op
, DAG
);
1374 EVT MemVT
= Store
->getMemoryVT();
1375 SDValue Chain
= Store
->getChain();
1376 SDValue BasePtr
= Store
->getBasePtr();
1380 EVT LoMemVT
, HiMemVT
;
1383 std::tie(LoVT
, HiVT
) = DAG
.GetSplitDestVTs(VT
);
1384 std::tie(LoMemVT
, HiMemVT
) = DAG
.GetSplitDestVTs(MemVT
);
1385 std::tie(Lo
, Hi
) = DAG
.SplitVector(Val
, SL
, LoVT
, HiVT
);
1387 EVT PtrVT
= BasePtr
.getValueType();
1388 SDValue HiPtr
= DAG
.getNode(ISD::ADD
, SL
, PtrVT
, BasePtr
,
1389 DAG
.getConstant(LoMemVT
.getStoreSize(), PtrVT
));
1391 MachinePointerInfo
SrcValue(Store
->getMemOperand()->getValue());
1393 = DAG
.getTruncStore(Chain
, SL
, Lo
,
1397 Store
->isNonTemporal(),
1398 Store
->isVolatile(),
1399 Store
->getAlignment());
1401 = DAG
.getTruncStore(Chain
, SL
, Hi
,
1403 SrcValue
.getWithOffset(LoMemVT
.getStoreSize()),
1405 Store
->isNonTemporal(),
1406 Store
->isVolatile(),
1407 Store
->getAlignment());
1409 return DAG
.getNode(ISD::TokenFactor
, SL
, MVT::Other
, LoStore
, HiStore
);
1413 SDValue
AMDGPUTargetLowering::LowerLOAD(SDValue Op
, SelectionDAG
&DAG
) const {
1415 LoadSDNode
*Load
= cast
<LoadSDNode
>(Op
);
1416 ISD::LoadExtType ExtType
= Load
->getExtensionType();
1417 EVT VT
= Op
.getValueType();
1418 EVT MemVT
= Load
->getMemoryVT();
1420 if (ExtType
== ISD::NON_EXTLOAD
&& VT
.getSizeInBits() < 32) {
1421 assert(VT
== MVT::i1
&& "Only i1 non-extloads expected");
1422 // FIXME: Copied from PPC
1423 // First, load into 32 bits, then truncate to 1 bit.
1425 SDValue Chain
= Load
->getChain();
1426 SDValue BasePtr
= Load
->getBasePtr();
1427 MachineMemOperand
*MMO
= Load
->getMemOperand();
1429 SDValue NewLD
= DAG
.getExtLoad(ISD::EXTLOAD
, DL
, MVT::i32
, Chain
,
1430 BasePtr
, MVT::i8
, MMO
);
1433 DAG
.getNode(ISD::TRUNCATE
, DL
, VT
, NewLD
),
1437 return DAG
.getMergeValues(Ops
, DL
);
1440 if (Subtarget
->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS
||
1441 Load
->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS
||
1442 ExtType
== ISD::NON_EXTLOAD
|| Load
->getMemoryVT().bitsGE(MVT::i32
))
1446 SDValue Ptr
= DAG
.getNode(ISD::SRL
, DL
, MVT::i32
, Load
->getBasePtr(),
1447 DAG
.getConstant(2, MVT::i32
));
1448 SDValue Ret
= DAG
.getNode(AMDGPUISD::REGISTER_LOAD
, DL
, Op
.getValueType(),
1449 Load
->getChain(), Ptr
,
1450 DAG
.getTargetConstant(0, MVT::i32
),
1452 SDValue ByteIdx
= DAG
.getNode(ISD::AND
, DL
, MVT::i32
,
1454 DAG
.getConstant(0x3, MVT::i32
));
1455 SDValue ShiftAmt
= DAG
.getNode(ISD::SHL
, DL
, MVT::i32
, ByteIdx
,
1456 DAG
.getConstant(3, MVT::i32
));
1458 Ret
= DAG
.getNode(ISD::SRL
, DL
, MVT::i32
, Ret
, ShiftAmt
);
1460 EVT MemEltVT
= MemVT
.getScalarType();
1461 if (ExtType
== ISD::SEXTLOAD
) {
1462 SDValue MemEltVTNode
= DAG
.getValueType(MemEltVT
);
1465 DAG
.getNode(ISD::SIGN_EXTEND_INREG
, DL
, MVT::i32
, Ret
, MemEltVTNode
),
1469 return DAG
.getMergeValues(Ops
, DL
);
1473 DAG
.getZeroExtendInReg(Ret
, DL
, MemEltVT
),
1477 return DAG
.getMergeValues(Ops
, DL
);
1480 SDValue
AMDGPUTargetLowering::LowerSTORE(SDValue Op
, SelectionDAG
&DAG
) const {
1482 SDValue Result
= AMDGPUTargetLowering::MergeVectorStore(Op
, DAG
);
1483 if (Result
.getNode()) {
1487 StoreSDNode
*Store
= cast
<StoreSDNode
>(Op
);
1488 SDValue Chain
= Store
->getChain();
1489 if ((Store
->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS
||
1490 Store
->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
) &&
1491 Store
->getValue().getValueType().isVector()) {
1492 return ScalarizeVectorStore(Op
, DAG
);
1495 EVT MemVT
= Store
->getMemoryVT();
1496 if (Store
->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS
&&
1497 MemVT
.bitsLT(MVT::i32
)) {
1499 if (Store
->getMemoryVT() == MVT::i8
) {
1501 } else if (Store
->getMemoryVT() == MVT::i16
) {
1504 SDValue BasePtr
= Store
->getBasePtr();
1505 SDValue Ptr
= DAG
.getNode(ISD::SRL
, DL
, MVT::i32
, BasePtr
,
1506 DAG
.getConstant(2, MVT::i32
));
1507 SDValue Dst
= DAG
.getNode(AMDGPUISD::REGISTER_LOAD
, DL
, MVT::i32
,
1508 Chain
, Ptr
, DAG
.getTargetConstant(0, MVT::i32
));
1510 SDValue ByteIdx
= DAG
.getNode(ISD::AND
, DL
, MVT::i32
, BasePtr
,
1511 DAG
.getConstant(0x3, MVT::i32
));
1513 SDValue ShiftAmt
= DAG
.getNode(ISD::SHL
, DL
, MVT::i32
, ByteIdx
,
1514 DAG
.getConstant(3, MVT::i32
));
1516 SDValue SExtValue
= DAG
.getNode(ISD::SIGN_EXTEND
, DL
, MVT::i32
,
1519 SDValue MaskedValue
= DAG
.getZeroExtendInReg(SExtValue
, DL
, MemVT
);
1521 SDValue ShiftedValue
= DAG
.getNode(ISD::SHL
, DL
, MVT::i32
,
1522 MaskedValue
, ShiftAmt
);
1524 SDValue DstMask
= DAG
.getNode(ISD::SHL
, DL
, MVT::i32
, DAG
.getConstant(Mask
, MVT::i32
),
1526 DstMask
= DAG
.getNode(ISD::XOR
, DL
, MVT::i32
, DstMask
,
1527 DAG
.getConstant(0xffffffff, MVT::i32
));
1528 Dst
= DAG
.getNode(ISD::AND
, DL
, MVT::i32
, Dst
, DstMask
);
1530 SDValue Value
= DAG
.getNode(ISD::OR
, DL
, MVT::i32
, Dst
, ShiftedValue
);
1531 return DAG
.getNode(AMDGPUISD::REGISTER_STORE
, DL
, MVT::Other
,
1532 Chain
, Value
, Ptr
, DAG
.getTargetConstant(0, MVT::i32
));
1537 // This is a shortcut for integer division because we have fast i32<->f32
1538 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1539 // float is enough to accurately represent up to a 24-bit integer.
1540 SDValue
AMDGPUTargetLowering::LowerDIVREM24(SDValue Op
, SelectionDAG
&DAG
, bool sign
) const {
1542 EVT VT
= Op
.getValueType();
1543 SDValue LHS
= Op
.getOperand(0);
1544 SDValue RHS
= Op
.getOperand(1);
1545 MVT IntVT
= MVT::i32
;
1546 MVT FltVT
= MVT::f32
;
1548 ISD::NodeType ToFp
= sign
? ISD::SINT_TO_FP
: ISD::UINT_TO_FP
;
1549 ISD::NodeType ToInt
= sign
? ISD::FP_TO_SINT
: ISD::FP_TO_UINT
;
1551 if (VT
.isVector()) {
1552 unsigned NElts
= VT
.getVectorNumElements();
1553 IntVT
= MVT::getVectorVT(MVT::i32
, NElts
);
1554 FltVT
= MVT::getVectorVT(MVT::f32
, NElts
);
1557 unsigned BitSize
= VT
.getScalarType().getSizeInBits();
1559 SDValue jq
= DAG
.getConstant(1, IntVT
);
1562 // char|short jq = ia ^ ib;
1563 jq
= DAG
.getNode(ISD::XOR
, DL
, VT
, LHS
, RHS
);
1565 // jq = jq >> (bitsize - 2)
1566 jq
= DAG
.getNode(ISD::SRA
, DL
, VT
, jq
, DAG
.getConstant(BitSize
- 2, VT
));
1569 jq
= DAG
.getNode(ISD::OR
, DL
, VT
, jq
, DAG
.getConstant(1, VT
));
1572 jq
= DAG
.getSExtOrTrunc(jq
, DL
, IntVT
);
1575 // int ia = (int)LHS;
1577 DAG
.getSExtOrTrunc(LHS
, DL
, IntVT
) : DAG
.getZExtOrTrunc(LHS
, DL
, IntVT
);
1579 // int ib, (int)RHS;
1581 DAG
.getSExtOrTrunc(RHS
, DL
, IntVT
) : DAG
.getZExtOrTrunc(RHS
, DL
, IntVT
);
1583 // float fa = (float)ia;
1584 SDValue fa
= DAG
.getNode(ToFp
, DL
, FltVT
, ia
);
1586 // float fb = (float)ib;
1587 SDValue fb
= DAG
.getNode(ToFp
, DL
, FltVT
, ib
);
1589 // float fq = native_divide(fa, fb);
1590 SDValue fq
= DAG
.getNode(ISD::FMUL
, DL
, FltVT
,
1591 fa
, DAG
.getNode(AMDGPUISD::RCP
, DL
, FltVT
, fb
));
1594 fq
= DAG
.getNode(ISD::FTRUNC
, DL
, FltVT
, fq
);
1596 // float fqneg = -fq;
1597 SDValue fqneg
= DAG
.getNode(ISD::FNEG
, DL
, FltVT
, fq
);
1599 // float fr = mad(fqneg, fb, fa);
1600 SDValue fr
= DAG
.getNode(ISD::FADD
, DL
, FltVT
,
1601 DAG
.getNode(ISD::FMUL
, DL
, FltVT
, fqneg
, fb
), fa
);
1603 // int iq = (int)fq;
1604 SDValue iq
= DAG
.getNode(ToInt
, DL
, IntVT
, fq
);
1607 fr
= DAG
.getNode(ISD::FABS
, DL
, FltVT
, fr
);
1610 fb
= DAG
.getNode(ISD::FABS
, DL
, FltVT
, fb
);
1612 EVT SetCCVT
= getSetCCResultType(*DAG
.getContext(), VT
);
1614 // int cv = fr >= fb;
1615 SDValue cv
= DAG
.getSetCC(DL
, SetCCVT
, fr
, fb
, ISD::SETOGE
);
1617 // jq = (cv ? jq : 0);
1618 jq
= DAG
.getNode(ISD::SELECT
, DL
, VT
, cv
, jq
, DAG
.getConstant(0, VT
));
1620 // dst = trunc/extend to legal type
1621 iq
= sign
? DAG
.getSExtOrTrunc(iq
, DL
, VT
) : DAG
.getZExtOrTrunc(iq
, DL
, VT
);
1624 SDValue Div
= DAG
.getNode(ISD::ADD
, DL
, VT
, iq
, jq
);
1626 // Rem needs compensation, it's easier to recompute it
1627 SDValue Rem
= DAG
.getNode(ISD::MUL
, DL
, VT
, Div
, RHS
);
1628 Rem
= DAG
.getNode(ISD::SUB
, DL
, VT
, LHS
, Rem
);
1634 return DAG
.getMergeValues(Res
, DL
);
1637 void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op
,
1639 SmallVectorImpl
<SDValue
> &Results
) const {
1640 assert(Op
.getValueType() == MVT::i64
);
1643 EVT VT
= Op
.getValueType();
1644 EVT HalfVT
= VT
.getHalfSizedIntegerVT(*DAG
.getContext());
1646 SDValue one
= DAG
.getConstant(1, HalfVT
);
1647 SDValue zero
= DAG
.getConstant(0, HalfVT
);
1650 SDValue LHS
= Op
.getOperand(0);
1651 SDValue LHS_Lo
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, HalfVT
, LHS
, zero
);
1652 SDValue LHS_Hi
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, HalfVT
, LHS
, one
);
1654 SDValue RHS
= Op
.getOperand(1);
1655 SDValue RHS_Lo
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, HalfVT
, RHS
, zero
);
1656 SDValue RHS_Hi
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, HalfVT
, RHS
, one
);
1658 // Get Speculative values
1659 SDValue DIV_Part
= DAG
.getNode(ISD::UDIV
, DL
, HalfVT
, LHS_Hi
, RHS_Lo
);
1660 SDValue REM_Part
= DAG
.getNode(ISD::UREM
, DL
, HalfVT
, LHS_Hi
, RHS_Lo
);
1662 SDValue REM_Hi
= zero
;
1663 SDValue REM_Lo
= DAG
.getSelectCC(DL
, RHS_Hi
, zero
, REM_Part
, LHS_Hi
, ISD::SETEQ
);
1665 SDValue DIV_Hi
= DAG
.getSelectCC(DL
, RHS_Hi
, zero
, DIV_Part
, zero
, ISD::SETEQ
);
1666 SDValue DIV_Lo
= zero
;
1668 const unsigned halfBitWidth
= HalfVT
.getSizeInBits();
1670 for (unsigned i
= 0; i
< halfBitWidth
; ++i
) {
1671 SDValue POS
= DAG
.getConstant(halfBitWidth
- i
- 1, HalfVT
);
1672 // Get Value of high bit
1674 if (halfBitWidth
== 32 && Subtarget
->hasBFE()) {
1675 HBit
= DAG
.getNode(AMDGPUISD::BFE_U32
, DL
, HalfVT
, LHS_Lo
, POS
, one
);
1677 HBit
= DAG
.getNode(ISD::SRL
, DL
, HalfVT
, LHS_Lo
, POS
);
1678 HBit
= DAG
.getNode(ISD::AND
, DL
, HalfVT
, HBit
, one
);
1681 SDValue Carry
= DAG
.getNode(ISD::SRL
, DL
, HalfVT
, REM_Lo
,
1682 DAG
.getConstant(halfBitWidth
- 1, HalfVT
));
1683 REM_Hi
= DAG
.getNode(ISD::SHL
, DL
, HalfVT
, REM_Hi
, one
);
1684 REM_Hi
= DAG
.getNode(ISD::OR
, DL
, HalfVT
, REM_Hi
, Carry
);
1686 REM_Lo
= DAG
.getNode(ISD::SHL
, DL
, HalfVT
, REM_Lo
, one
);
1687 REM_Lo
= DAG
.getNode(ISD::OR
, DL
, HalfVT
, REM_Lo
, HBit
);
1690 SDValue REM
= DAG
.getNode(ISD::BUILD_PAIR
, DL
, VT
, REM_Lo
, REM_Hi
);
1692 SDValue BIT
= DAG
.getConstant(1 << (halfBitWidth
- i
- 1), HalfVT
);
1693 SDValue realBIT
= DAG
.getSelectCC(DL
, REM
, RHS
, BIT
, zero
, ISD::SETUGE
);
1695 DIV_Lo
= DAG
.getNode(ISD::OR
, DL
, HalfVT
, DIV_Lo
, realBIT
);
1699 SDValue REM_sub
= DAG
.getNode(ISD::SUB
, DL
, VT
, REM
, RHS
);
1701 REM
= DAG
.getSelectCC(DL
, REM
, RHS
, REM_sub
, REM
, ISD::SETUGE
);
1702 REM_Lo
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, HalfVT
, REM
, zero
);
1703 REM_Hi
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, HalfVT
, REM
, one
);
1706 SDValue REM
= DAG
.getNode(ISD::BUILD_PAIR
, DL
, VT
, REM_Lo
, REM_Hi
);
1707 SDValue DIV
= DAG
.getNode(ISD::BUILD_PAIR
, DL
, VT
, DIV_Lo
, DIV_Hi
);
1708 Results
.push_back(DIV
);
1709 Results
.push_back(REM
);
1712 SDValue
AMDGPUTargetLowering::LowerUDIVREM(SDValue Op
,
1713 SelectionDAG
&DAG
) const {
1715 EVT VT
= Op
.getValueType();
1717 if (VT
== MVT::i64
) {
1718 SmallVector
<SDValue
, 2> Results
;
1719 LowerUDIVREM64(Op
, DAG
, Results
);
1720 return DAG
.getMergeValues(Results
, DL
);
1723 SDValue Num
= Op
.getOperand(0);
1724 SDValue Den
= Op
.getOperand(1);
1726 if (VT
== MVT::i32
) {
1727 if (DAG
.MaskedValueIsZero(Op
.getOperand(0), APInt(32, 0xff << 24)) &&
1728 DAG
.MaskedValueIsZero(Op
.getOperand(1), APInt(32, 0xff << 24))) {
1729 // TODO: We technically could do this for i64, but shouldn't that just be
1730 // handled by something generally reducing 64-bit division on 32-bit
1731 // values to 32-bit?
1732 return LowerDIVREM24(Op
, DAG
, false);
1736 // RCP = URECIP(Den) = 2^32 / Den + e
1737 // e is rounding error.
1738 SDValue RCP
= DAG
.getNode(AMDGPUISD::URECIP
, DL
, VT
, Den
);
1740 // RCP_LO = mul(RCP, Den) */
1741 SDValue RCP_LO
= DAG
.getNode(ISD::MUL
, DL
, VT
, RCP
, Den
);
1743 // RCP_HI = mulhu (RCP, Den) */
1744 SDValue RCP_HI
= DAG
.getNode(ISD::MULHU
, DL
, VT
, RCP
, Den
);
1746 // NEG_RCP_LO = -RCP_LO
1747 SDValue NEG_RCP_LO
= DAG
.getNode(ISD::SUB
, DL
, VT
, DAG
.getConstant(0, VT
),
1750 // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
1751 SDValue ABS_RCP_LO
= DAG
.getSelectCC(DL
, RCP_HI
, DAG
.getConstant(0, VT
),
1754 // Calculate the rounding error from the URECIP instruction
1755 // E = mulhu(ABS_RCP_LO, RCP)
1756 SDValue E
= DAG
.getNode(ISD::MULHU
, DL
, VT
, ABS_RCP_LO
, RCP
);
1758 // RCP_A_E = RCP + E
1759 SDValue RCP_A_E
= DAG
.getNode(ISD::ADD
, DL
, VT
, RCP
, E
);
1761 // RCP_S_E = RCP - E
1762 SDValue RCP_S_E
= DAG
.getNode(ISD::SUB
, DL
, VT
, RCP
, E
);
1764 // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
1765 SDValue Tmp0
= DAG
.getSelectCC(DL
, RCP_HI
, DAG
.getConstant(0, VT
),
1768 // Quotient = mulhu(Tmp0, Num)
1769 SDValue Quotient
= DAG
.getNode(ISD::MULHU
, DL
, VT
, Tmp0
, Num
);
1771 // Num_S_Remainder = Quotient * Den
1772 SDValue Num_S_Remainder
= DAG
.getNode(ISD::MUL
, DL
, VT
, Quotient
, Den
);
1774 // Remainder = Num - Num_S_Remainder
1775 SDValue Remainder
= DAG
.getNode(ISD::SUB
, DL
, VT
, Num
, Num_S_Remainder
);
1777 // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
1778 SDValue Remainder_GE_Den
= DAG
.getSelectCC(DL
, Remainder
, Den
,
1779 DAG
.getConstant(-1, VT
),
1780 DAG
.getConstant(0, VT
),
1782 // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
1783 SDValue Remainder_GE_Zero
= DAG
.getSelectCC(DL
, Num
,
1785 DAG
.getConstant(-1, VT
),
1786 DAG
.getConstant(0, VT
),
1788 // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
1789 SDValue Tmp1
= DAG
.getNode(ISD::AND
, DL
, VT
, Remainder_GE_Den
,
1792 // Calculate Division result:
1794 // Quotient_A_One = Quotient + 1
1795 SDValue Quotient_A_One
= DAG
.getNode(ISD::ADD
, DL
, VT
, Quotient
,
1796 DAG
.getConstant(1, VT
));
1798 // Quotient_S_One = Quotient - 1
1799 SDValue Quotient_S_One
= DAG
.getNode(ISD::SUB
, DL
, VT
, Quotient
,
1800 DAG
.getConstant(1, VT
));
1802 // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
1803 SDValue Div
= DAG
.getSelectCC(DL
, Tmp1
, DAG
.getConstant(0, VT
),
1804 Quotient
, Quotient_A_One
, ISD::SETEQ
);
1806 // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
1807 Div
= DAG
.getSelectCC(DL
, Remainder_GE_Zero
, DAG
.getConstant(0, VT
),
1808 Quotient_S_One
, Div
, ISD::SETEQ
);
1810 // Calculate Rem result:
1812 // Remainder_S_Den = Remainder - Den
1813 SDValue Remainder_S_Den
= DAG
.getNode(ISD::SUB
, DL
, VT
, Remainder
, Den
);
1815 // Remainder_A_Den = Remainder + Den
1816 SDValue Remainder_A_Den
= DAG
.getNode(ISD::ADD
, DL
, VT
, Remainder
, Den
);
1818 // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
1819 SDValue Rem
= DAG
.getSelectCC(DL
, Tmp1
, DAG
.getConstant(0, VT
),
1820 Remainder
, Remainder_S_Den
, ISD::SETEQ
);
1822 // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
1823 Rem
= DAG
.getSelectCC(DL
, Remainder_GE_Zero
, DAG
.getConstant(0, VT
),
1824 Remainder_A_Den
, Rem
, ISD::SETEQ
);
1829 return DAG
.getMergeValues(Ops
, DL
);
1832 SDValue
AMDGPUTargetLowering::LowerSDIVREM(SDValue Op
,
1833 SelectionDAG
&DAG
) const {
1835 EVT VT
= Op
.getValueType();
1837 SDValue LHS
= Op
.getOperand(0);
1838 SDValue RHS
= Op
.getOperand(1);
1840 if (VT
== MVT::i32
) {
1841 if (DAG
.ComputeNumSignBits(Op
.getOperand(0)) > 8 &&
1842 DAG
.ComputeNumSignBits(Op
.getOperand(1)) > 8) {
1843 // TODO: We technically could do this for i64, but shouldn't that just be
1844 // handled by something generally reducing 64-bit division on 32-bit
1845 // values to 32-bit?
1846 return LowerDIVREM24(Op
, DAG
, true);
1850 SDValue Zero
= DAG
.getConstant(0, VT
);
1851 SDValue NegOne
= DAG
.getConstant(-1, VT
);
1853 SDValue LHSign
= DAG
.getSelectCC(DL
, LHS
, Zero
, NegOne
, Zero
, ISD::SETLT
);
1854 SDValue RHSign
= DAG
.getSelectCC(DL
, RHS
, Zero
, NegOne
, Zero
, ISD::SETLT
);
1855 SDValue DSign
= DAG
.getNode(ISD::XOR
, DL
, VT
, LHSign
, RHSign
);
1856 SDValue RSign
= LHSign
; // Remainder sign is the same as LHS
1858 LHS
= DAG
.getNode(ISD::ADD
, DL
, VT
, LHS
, LHSign
);
1859 RHS
= DAG
.getNode(ISD::ADD
, DL
, VT
, RHS
, RHSign
);
1861 LHS
= DAG
.getNode(ISD::XOR
, DL
, VT
, LHS
, LHSign
);
1862 RHS
= DAG
.getNode(ISD::XOR
, DL
, VT
, RHS
, RHSign
);
1864 SDValue Div
= DAG
.getNode(ISD::UDIVREM
, DL
, DAG
.getVTList(VT
, VT
), LHS
, RHS
);
1865 SDValue Rem
= Div
.getValue(1);
1867 Div
= DAG
.getNode(ISD::XOR
, DL
, VT
, Div
, DSign
);
1868 Rem
= DAG
.getNode(ISD::XOR
, DL
, VT
, Rem
, RSign
);
1870 Div
= DAG
.getNode(ISD::SUB
, DL
, VT
, Div
, DSign
);
1871 Rem
= DAG
.getNode(ISD::SUB
, DL
, VT
, Rem
, RSign
);
1877 return DAG
.getMergeValues(Res
, DL
);
1880 // (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
1881 SDValue
AMDGPUTargetLowering::LowerFREM(SDValue Op
, SelectionDAG
&DAG
) const {
1883 EVT VT
= Op
.getValueType();
1884 SDValue X
= Op
.getOperand(0);
1885 SDValue Y
= Op
.getOperand(1);
1887 SDValue Div
= DAG
.getNode(ISD::FDIV
, SL
, VT
, X
, Y
);
1888 SDValue Floor
= DAG
.getNode(ISD::FTRUNC
, SL
, VT
, Div
);
1889 SDValue Mul
= DAG
.getNode(ISD::FMUL
, SL
, VT
, Floor
, Y
);
1891 return DAG
.getNode(ISD::FSUB
, SL
, VT
, X
, Mul
);
1894 SDValue
AMDGPUTargetLowering::LowerFCEIL(SDValue Op
, SelectionDAG
&DAG
) const {
1896 SDValue Src
= Op
.getOperand(0);
1898 // result = trunc(src)
1899 // if (src > 0.0 && src != result)
1902 SDValue Trunc
= DAG
.getNode(ISD::FTRUNC
, SL
, MVT::f64
, Src
);
1904 const SDValue Zero
= DAG
.getConstantFP(0.0, MVT::f64
);
1905 const SDValue One
= DAG
.getConstantFP(1.0, MVT::f64
);
1907 EVT SetCCVT
= getSetCCResultType(*DAG
.getContext(), MVT::f64
);
1909 SDValue Lt0
= DAG
.getSetCC(SL
, SetCCVT
, Src
, Zero
, ISD::SETOGT
);
1910 SDValue NeTrunc
= DAG
.getSetCC(SL
, SetCCVT
, Src
, Trunc
, ISD::SETONE
);
1911 SDValue And
= DAG
.getNode(ISD::AND
, SL
, SetCCVT
, Lt0
, NeTrunc
);
1913 SDValue Add
= DAG
.getNode(ISD::SELECT
, SL
, MVT::f64
, And
, One
, Zero
);
1914 return DAG
.getNode(ISD::FADD
, SL
, MVT::f64
, Trunc
, Add
);
1917 SDValue
AMDGPUTargetLowering::LowerFTRUNC(SDValue Op
, SelectionDAG
&DAG
) const {
1919 SDValue Src
= Op
.getOperand(0);
1921 assert(Op
.getValueType() == MVT::f64
);
1923 const SDValue Zero
= DAG
.getConstant(0, MVT::i32
);
1924 const SDValue One
= DAG
.getConstant(1, MVT::i32
);
1926 SDValue VecSrc
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Src
);
1928 // Extract the upper half, since this is where we will find the sign and
1930 SDValue Hi
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, VecSrc
, One
);
1932 const unsigned FractBits
= 52;
1933 const unsigned ExpBits
= 11;
1935 // Extract the exponent.
1936 SDValue ExpPart
= DAG
.getNode(AMDGPUISD::BFE_U32
, SL
, MVT::i32
,
1938 DAG
.getConstant(FractBits
- 32, MVT::i32
),
1939 DAG
.getConstant(ExpBits
, MVT::i32
));
1940 SDValue Exp
= DAG
.getNode(ISD::SUB
, SL
, MVT::i32
, ExpPart
,
1941 DAG
.getConstant(1023, MVT::i32
));
1943 // Extract the sign bit.
1944 const SDValue SignBitMask
= DAG
.getConstant(UINT32_C(1) << 31, MVT::i32
);
1945 SDValue SignBit
= DAG
.getNode(ISD::AND
, SL
, MVT::i32
, Hi
, SignBitMask
);
1947 // Extend back to to 64-bits.
1948 SDValue SignBit64
= DAG
.getNode(ISD::BUILD_VECTOR
, SL
, MVT::v2i32
,
1950 SignBit64
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, SignBit64
);
1952 SDValue BcInt
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, Src
);
1953 const SDValue FractMask
1954 = DAG
.getConstant((UINT64_C(1) << FractBits
) - 1, MVT::i64
);
1956 SDValue Shr
= DAG
.getNode(ISD::SRA
, SL
, MVT::i64
, FractMask
, Exp
);
1957 SDValue Not
= DAG
.getNOT(SL
, Shr
, MVT::i64
);
1958 SDValue Tmp0
= DAG
.getNode(ISD::AND
, SL
, MVT::i64
, BcInt
, Not
);
1960 EVT SetCCVT
= getSetCCResultType(*DAG
.getContext(), MVT::i32
);
1962 const SDValue FiftyOne
= DAG
.getConstant(FractBits
- 1, MVT::i32
);
1964 SDValue ExpLt0
= DAG
.getSetCC(SL
, SetCCVT
, Exp
, Zero
, ISD::SETLT
);
1965 SDValue ExpGt51
= DAG
.getSetCC(SL
, SetCCVT
, Exp
, FiftyOne
, ISD::SETGT
);
1967 SDValue Tmp1
= DAG
.getNode(ISD::SELECT
, SL
, MVT::i64
, ExpLt0
, SignBit64
, Tmp0
);
1968 SDValue Tmp2
= DAG
.getNode(ISD::SELECT
, SL
, MVT::i64
, ExpGt51
, BcInt
, Tmp1
);
1970 return DAG
.getNode(ISD::BITCAST
, SL
, MVT::f64
, Tmp2
);
1973 SDValue
AMDGPUTargetLowering::LowerFRINT(SDValue Op
, SelectionDAG
&DAG
) const {
1975 SDValue Src
= Op
.getOperand(0);
1977 assert(Op
.getValueType() == MVT::f64
);
1979 APFloat
C1Val(APFloat::IEEEdouble
, "0x1.0p+52");
1980 SDValue C1
= DAG
.getConstantFP(C1Val
, MVT::f64
);
1981 SDValue CopySign
= DAG
.getNode(ISD::FCOPYSIGN
, SL
, MVT::f64
, C1
, Src
);
1983 SDValue Tmp1
= DAG
.getNode(ISD::FADD
, SL
, MVT::f64
, Src
, CopySign
);
1984 SDValue Tmp2
= DAG
.getNode(ISD::FSUB
, SL
, MVT::f64
, Tmp1
, CopySign
);
1986 SDValue Fabs
= DAG
.getNode(ISD::FABS
, SL
, MVT::f64
, Src
);
1988 APFloat
C2Val(APFloat::IEEEdouble
, "0x1.fffffffffffffp+51");
1989 SDValue C2
= DAG
.getConstantFP(C2Val
, MVT::f64
);
1991 EVT SetCCVT
= getSetCCResultType(*DAG
.getContext(), MVT::f64
);
1992 SDValue Cond
= DAG
.getSetCC(SL
, SetCCVT
, Fabs
, C2
, ISD::SETOGT
);
1994 return DAG
.getSelect(SL
, MVT::f64
, Cond
, Src
, Tmp2
);
1997 SDValue
AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op
, SelectionDAG
&DAG
) const {
1998 // FNEARBYINT and FRINT are the same, except in their handling of FP
1999 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2000 // rint, so just treat them as equivalent.
2001 return DAG
.getNode(ISD::FRINT
, SDLoc(Op
), Op
.getValueType(), Op
.getOperand(0));
2004 SDValue
AMDGPUTargetLowering::LowerFFLOOR(SDValue Op
, SelectionDAG
&DAG
) const {
2006 SDValue Src
= Op
.getOperand(0);
2008 // result = trunc(src);
2009 // if (src < 0.0 && src != result)
2012 SDValue Trunc
= DAG
.getNode(ISD::FTRUNC
, SL
, MVT::f64
, Src
);
2014 const SDValue Zero
= DAG
.getConstantFP(0.0, MVT::f64
);
2015 const SDValue NegOne
= DAG
.getConstantFP(-1.0, MVT::f64
);
2017 EVT SetCCVT
= getSetCCResultType(*DAG
.getContext(), MVT::f64
);
2019 SDValue Lt0
= DAG
.getSetCC(SL
, SetCCVT
, Src
, Zero
, ISD::SETOLT
);
2020 SDValue NeTrunc
= DAG
.getSetCC(SL
, SetCCVT
, Src
, Trunc
, ISD::SETONE
);
2021 SDValue And
= DAG
.getNode(ISD::AND
, SL
, SetCCVT
, Lt0
, NeTrunc
);
2023 SDValue Add
= DAG
.getNode(ISD::SELECT
, SL
, MVT::f64
, And
, NegOne
, Zero
);
2024 return DAG
.getNode(ISD::FADD
, SL
, MVT::f64
, Trunc
, Add
);
2027 SDValue
AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op
, SelectionDAG
&DAG
,
2028 bool Signed
) const {
2030 SDValue Src
= Op
.getOperand(0);
2032 SDValue BC
= DAG
.getNode(ISD::BITCAST
, SL
, MVT::v2i32
, Src
);
2034 SDValue Lo
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BC
,
2035 DAG
.getConstant(0, MVT::i32
));
2036 SDValue Hi
= DAG
.getNode(ISD::EXTRACT_VECTOR_ELT
, SL
, MVT::i32
, BC
,
2037 DAG
.getConstant(1, MVT::i32
));
2039 SDValue CvtHi
= DAG
.getNode(Signed
? ISD::SINT_TO_FP
: ISD::UINT_TO_FP
,
2042 SDValue CvtLo
= DAG
.getNode(ISD::UINT_TO_FP
, SL
, MVT::f64
, Lo
);
2044 SDValue LdExp
= DAG
.getNode(AMDGPUISD::LDEXP
, SL
, MVT::f64
, CvtHi
,
2045 DAG
.getConstant(32, MVT::i32
));
2047 return DAG
.getNode(ISD::FADD
, SL
, MVT::f64
, LdExp
, CvtLo
);
2050 SDValue
AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op
,
2051 SelectionDAG
&DAG
) const {
2052 SDValue S0
= Op
.getOperand(0);
2053 if (S0
.getValueType() != MVT::i64
)
2056 EVT DestVT
= Op
.getValueType();
2057 if (DestVT
== MVT::f64
)
2058 return LowerINT_TO_FP64(Op
, DAG
, false);
2060 assert(DestVT
== MVT::f32
);
2064 // f32 uint_to_fp i64
2065 SDValue Lo
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, MVT::i32
, S0
,
2066 DAG
.getConstant(0, MVT::i32
));
2067 SDValue FloatLo
= DAG
.getNode(ISD::UINT_TO_FP
, DL
, MVT::f32
, Lo
);
2068 SDValue Hi
= DAG
.getNode(ISD::EXTRACT_ELEMENT
, DL
, MVT::i32
, S0
,
2069 DAG
.getConstant(1, MVT::i32
));
2070 SDValue FloatHi
= DAG
.getNode(ISD::UINT_TO_FP
, DL
, MVT::f32
, Hi
);
2071 FloatHi
= DAG
.getNode(ISD::FMUL
, DL
, MVT::f32
, FloatHi
,
2072 DAG
.getConstantFP(4294967296.0f
, MVT::f32
)); // 2^32
2073 return DAG
.getNode(ISD::FADD
, DL
, MVT::f32
, FloatLo
, FloatHi
);
2076 SDValue
AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op
,
2077 SelectionDAG
&DAG
) const {
2078 SDValue Src
= Op
.getOperand(0);
2079 if (Src
.getValueType() == MVT::i64
&& Op
.getValueType() == MVT::f64
)
2080 return LowerINT_TO_FP64(Op
, DAG
, true);
2085 SDValue
AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op
, SelectionDAG
&DAG
,
2086 bool Signed
) const {
2089 SDValue Src
= Op
.getOperand(0);
2091 SDValue Trunc
= DAG
.getNode(ISD::FTRUNC
, SL
, MVT::f64
, Src
);
2094 = DAG
.getConstantFP(BitsToDouble(UINT64_C(0x3df0000000000000)), MVT::f64
);
2096 = DAG
.getConstantFP(BitsToDouble(UINT64_C(0xc1f0000000000000)), MVT::f64
);
2098 SDValue Mul
= DAG
.getNode(ISD::FMUL
, SL
, MVT::f64
, Trunc
, K0
);
2100 SDValue FloorMul
= DAG
.getNode(ISD::FFLOOR
, SL
, MVT::f64
, Mul
);
2103 SDValue Fma
= DAG
.getNode(ISD::FMA
, SL
, MVT::f64
, FloorMul
, K1
, Trunc
);
2105 SDValue Hi
= DAG
.getNode(Signed
? ISD::FP_TO_SINT
: ISD::FP_TO_UINT
, SL
,
2106 MVT::i32
, FloorMul
);
2107 SDValue Lo
= DAG
.getNode(ISD::FP_TO_UINT
, SL
, MVT::i32
, Fma
);
2109 SDValue Result
= DAG
.getNode(ISD::BUILD_VECTOR
, SL
, MVT::v2i32
, Lo
, Hi
);
2111 return DAG
.getNode(ISD::BITCAST
, SL
, MVT::i64
, Result
);
2114 SDValue
AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op
,
2115 SelectionDAG
&DAG
) const {
2116 SDValue Src
= Op
.getOperand(0);
2118 if (Op
.getValueType() == MVT::i64
&& Src
.getValueType() == MVT::f64
)
2119 return LowerFP64_TO_INT(Op
, DAG
, true);
2124 SDValue
AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op
,
2125 SelectionDAG
&DAG
) const {
2126 SDValue Src
= Op
.getOperand(0);
2128 if (Op
.getValueType() == MVT::i64
&& Src
.getValueType() == MVT::f64
)
2129 return LowerFP64_TO_INT(Op
, DAG
, false);
2134 SDValue
AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op
,
2135 SelectionDAG
&DAG
) const {
2136 EVT ExtraVT
= cast
<VTSDNode
>(Op
.getOperand(1))->getVT();
2137 MVT VT
= Op
.getSimpleValueType();
2138 MVT ScalarVT
= VT
.getScalarType();
2143 SDValue Src
= Op
.getOperand(0);
2146 // TODO: Don't scalarize on Evergreen?
2147 unsigned NElts
= VT
.getVectorNumElements();
2148 SmallVector
<SDValue
, 8> Args
;
2149 DAG
.ExtractVectorElements(Src
, Args
, 0, NElts
);
2151 SDValue VTOp
= DAG
.getValueType(ExtraVT
.getScalarType());
2152 for (unsigned I
= 0; I
< NElts
; ++I
)
2153 Args
[I
] = DAG
.getNode(ISD::SIGN_EXTEND_INREG
, DL
, ScalarVT
, Args
[I
], VTOp
);
2155 return DAG
.getNode(ISD::BUILD_VECTOR
, DL
, VT
, Args
);
2158 //===----------------------------------------------------------------------===//
2159 // Custom DAG optimizations
2160 //===----------------------------------------------------------------------===//
2162 static bool isU24(SDValue Op
, SelectionDAG
&DAG
) {
2163 APInt KnownZero
, KnownOne
;
2164 EVT VT
= Op
.getValueType();
2165 DAG
.computeKnownBits(Op
, KnownZero
, KnownOne
);
2167 return (VT
.getSizeInBits() - KnownZero
.countLeadingOnes()) <= 24;
2170 static bool isI24(SDValue Op
, SelectionDAG
&DAG
) {
2171 EVT VT
= Op
.getValueType();
2173 // In order for this to be a signed 24-bit value, bit 23, must
2175 return VT
.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
2176 // as unsigned 24-bit values.
2177 (VT
.getSizeInBits() - DAG
.ComputeNumSignBits(Op
)) < 24;
2180 static void simplifyI24(SDValue Op
, TargetLowering::DAGCombinerInfo
&DCI
) {
2182 SelectionDAG
&DAG
= DCI
.DAG
;
2183 const TargetLowering
&TLI
= DAG
.getTargetLoweringInfo();
2184 EVT VT
= Op
.getValueType();
2186 APInt Demanded
= APInt::getLowBitsSet(VT
.getSizeInBits(), 24);
2187 APInt KnownZero
, KnownOne
;
2188 TargetLowering::TargetLoweringOpt
TLO(DAG
, true, true);
2189 if (TLI
.SimplifyDemandedBits(Op
, Demanded
, KnownZero
, KnownOne
, TLO
))
2190 DCI
.CommitTargetLoweringOpt(TLO
);
2193 template <typename IntTy
>
2194 static SDValue
constantFoldBFE(SelectionDAG
&DAG
, IntTy Src0
,
2195 uint32_t Offset
, uint32_t Width
) {
2196 if (Width
+ Offset
< 32) {
2197 uint32_t Shl
= static_cast<uint32_t>(Src0
) << (32 - Offset
- Width
);
2198 IntTy Result
= static_cast<IntTy
>(Shl
) >> (32 - Width
);
2199 return DAG
.getConstant(Result
, MVT::i32
);
2202 return DAG
.getConstant(Src0
>> Offset
, MVT::i32
);
2205 static bool usesAllNormalStores(SDNode
*LoadVal
) {
2206 for (SDNode::use_iterator I
= LoadVal
->use_begin(); !I
.atEnd(); ++I
) {
2207 if (!ISD::isNormalStore(*I
))
2214 // If we have a copy of an illegal type, replace it with a load / store of an
2215 // equivalently sized legal type. This avoids intermediate bit pack / unpack
2216 // instructions emitted when handling extloads and truncstores. Ideally we could
2217 // recognize the pack / unpack pattern to eliminate it.
2218 SDValue
AMDGPUTargetLowering::performStoreCombine(SDNode
*N
,
2219 DAGCombinerInfo
&DCI
) const {
2220 if (!DCI
.isBeforeLegalize())
2223 StoreSDNode
*SN
= cast
<StoreSDNode
>(N
);
2224 SDValue Value
= SN
->getValue();
2225 EVT VT
= Value
.getValueType();
2227 if (isTypeLegal(VT
) || SN
->isVolatile() ||
2228 !ISD::isNormalLoad(Value
.getNode()) || VT
.getSizeInBits() < 8)
2231 LoadSDNode
*LoadVal
= cast
<LoadSDNode
>(Value
);
2232 if (LoadVal
->isVolatile() || !usesAllNormalStores(LoadVal
))
2235 EVT MemVT
= LoadVal
->getMemoryVT();
2238 SelectionDAG
&DAG
= DCI
.DAG
;
2239 EVT LoadVT
= getEquivalentMemType(*DAG
.getContext(), MemVT
);
2241 SDValue NewLoad
= DAG
.getLoad(ISD::UNINDEXED
, ISD::NON_EXTLOAD
,
2243 LoadVal
->getChain(),
2244 LoadVal
->getBasePtr(),
2245 LoadVal
->getOffset(),
2247 LoadVal
->getMemOperand());
2249 SDValue CastLoad
= DAG
.getNode(ISD::BITCAST
, SL
, VT
, NewLoad
.getValue(0));
2250 DCI
.CombineTo(LoadVal
, CastLoad
, NewLoad
.getValue(1), false);
2252 return DAG
.getStore(SN
->getChain(), SL
, NewLoad
,
2253 SN
->getBasePtr(), SN
->getMemOperand());
2256 SDValue
AMDGPUTargetLowering::performMulCombine(SDNode
*N
,
2257 DAGCombinerInfo
&DCI
) const {
2258 EVT VT
= N
->getValueType(0);
2260 if (VT
.isVector() || VT
.getSizeInBits() > 32)
2263 SelectionDAG
&DAG
= DCI
.DAG
;
2266 SDValue N0
= N
->getOperand(0);
2267 SDValue N1
= N
->getOperand(1);
2270 if (Subtarget
->hasMulU24() && isU24(N0
, DAG
) && isU24(N1
, DAG
)) {
2271 N0
= DAG
.getZExtOrTrunc(N0
, DL
, MVT::i32
);
2272 N1
= DAG
.getZExtOrTrunc(N1
, DL
, MVT::i32
);
2273 Mul
= DAG
.getNode(AMDGPUISD::MUL_U24
, DL
, MVT::i32
, N0
, N1
);
2274 } else if (Subtarget
->hasMulI24() && isI24(N0
, DAG
) && isI24(N1
, DAG
)) {
2275 N0
= DAG
.getSExtOrTrunc(N0
, DL
, MVT::i32
);
2276 N1
= DAG
.getSExtOrTrunc(N1
, DL
, MVT::i32
);
2277 Mul
= DAG
.getNode(AMDGPUISD::MUL_I24
, DL
, MVT::i32
, N0
, N1
);
2282 // We need to use sext even for MUL_U24, because MUL_U24 is used
2283 // for signed multiply of 8 and 16-bit types.
2284 return DAG
.getSExtOrTrunc(Mul
, DL
, VT
);
2287 SDValue
AMDGPUTargetLowering::PerformDAGCombine(SDNode
*N
,
2288 DAGCombinerInfo
&DCI
) const {
2289 SelectionDAG
&DAG
= DCI
.DAG
;
2292 switch(N
->getOpcode()) {
2295 return performMulCombine(N
, DCI
);
2296 case AMDGPUISD::MUL_I24
:
2297 case AMDGPUISD::MUL_U24
: {
2298 SDValue N0
= N
->getOperand(0);
2299 SDValue N1
= N
->getOperand(1);
2300 simplifyI24(N0
, DCI
);
2301 simplifyI24(N1
, DCI
);
2305 SDValue Cond
= N
->getOperand(0);
2306 if (Cond
.getOpcode() == ISD::SETCC
&& Cond
.hasOneUse()) {
2308 EVT VT
= N
->getValueType(0);
2309 SDValue LHS
= Cond
.getOperand(0);
2310 SDValue RHS
= Cond
.getOperand(1);
2311 SDValue CC
= Cond
.getOperand(2);
2313 SDValue True
= N
->getOperand(1);
2314 SDValue False
= N
->getOperand(2);
2317 return CombineFMinMaxLegacy(DL
, VT
, LHS
, RHS
, True
, False
, CC
, DCI
);
2319 // TODO: Implement min / max Evergreen instructions.
2320 if (VT
== MVT::i32
&&
2321 Subtarget
->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS
) {
2322 return CombineIMinMax(DL
, VT
, LHS
, RHS
, True
, False
, CC
, DAG
);
2328 case AMDGPUISD::BFE_I32
:
2329 case AMDGPUISD::BFE_U32
: {
2330 assert(!N
->getValueType(0).isVector() &&
2331 "Vector handling of BFE not implemented");
2332 ConstantSDNode
*Width
= dyn_cast
<ConstantSDNode
>(N
->getOperand(2));
2336 uint32_t WidthVal
= Width
->getZExtValue() & 0x1f;
2338 return DAG
.getConstant(0, MVT::i32
);
2340 ConstantSDNode
*Offset
= dyn_cast
<ConstantSDNode
>(N
->getOperand(1));
2344 SDValue BitsFrom
= N
->getOperand(0);
2345 uint32_t OffsetVal
= Offset
->getZExtValue() & 0x1f;
2347 bool Signed
= N
->getOpcode() == AMDGPUISD::BFE_I32
;
2349 if (OffsetVal
== 0) {
2350 // This is already sign / zero extended, so try to fold away extra BFEs.
2351 unsigned SignBits
= Signed
? (32 - WidthVal
+ 1) : (32 - WidthVal
);
2353 unsigned OpSignBits
= DAG
.ComputeNumSignBits(BitsFrom
);
2354 if (OpSignBits
>= SignBits
)
2357 EVT SmallVT
= EVT::getIntegerVT(*DAG
.getContext(), WidthVal
);
2359 // This is a sign_extend_inreg. Replace it to take advantage of existing
2360 // DAG Combines. If not eliminated, we will match back to BFE during
2363 // TODO: The sext_inreg of extended types ends, although we can could
2364 // handle them in a single BFE.
2365 return DAG
.getNode(ISD::SIGN_EXTEND_INREG
, DL
, MVT::i32
, BitsFrom
,
2366 DAG
.getValueType(SmallVT
));
2369 return DAG
.getZeroExtendInReg(BitsFrom
, DL
, SmallVT
);
2372 if (ConstantSDNode
*CVal
= dyn_cast
<ConstantSDNode
>(BitsFrom
)) {
2374 return constantFoldBFE
<int32_t>(DAG
,
2375 CVal
->getSExtValue(),
2380 return constantFoldBFE
<uint32_t>(DAG
,
2381 CVal
->getZExtValue(),
2386 if ((OffsetVal
+ WidthVal
) >= 32) {
2387 SDValue ShiftVal
= DAG
.getConstant(OffsetVal
, MVT::i32
);
2388 return DAG
.getNode(Signed
? ISD::SRA
: ISD::SRL
, DL
, MVT::i32
,
2389 BitsFrom
, ShiftVal
);
2392 if (BitsFrom
.hasOneUse()) {
2393 APInt Demanded
= APInt::getBitsSet(32,
2395 OffsetVal
+ WidthVal
);
2397 APInt KnownZero
, KnownOne
;
2398 TargetLowering::TargetLoweringOpt
TLO(DAG
, !DCI
.isBeforeLegalize(),
2399 !DCI
.isBeforeLegalizeOps());
2400 const TargetLowering
&TLI
= DAG
.getTargetLoweringInfo();
2401 if (TLO
.ShrinkDemandedConstant(BitsFrom
, Demanded
) ||
2402 TLI
.SimplifyDemandedBits(BitsFrom
, Demanded
,
2403 KnownZero
, KnownOne
, TLO
)) {
2404 DCI
.CommitTargetLoweringOpt(TLO
);
2412 return performStoreCombine(N
, DCI
);
2417 //===----------------------------------------------------------------------===//
2419 //===----------------------------------------------------------------------===//
2421 void AMDGPUTargetLowering::getOriginalFunctionArgs(
2424 const SmallVectorImpl
<ISD::InputArg
> &Ins
,
2425 SmallVectorImpl
<ISD::InputArg
> &OrigIns
) const {
2427 for (unsigned i
= 0, e
= Ins
.size(); i
< e
; ++i
) {
2428 if (Ins
[i
].ArgVT
== Ins
[i
].VT
) {
2429 OrigIns
.push_back(Ins
[i
]);
2434 if (Ins
[i
].ArgVT
.isVector() && !Ins
[i
].VT
.isVector()) {
2435 // Vector has been split into scalars.
2436 VT
= Ins
[i
].ArgVT
.getVectorElementType();
2437 } else if (Ins
[i
].VT
.isVector() && Ins
[i
].ArgVT
.isVector() &&
2438 Ins
[i
].ArgVT
.getVectorElementType() !=
2439 Ins
[i
].VT
.getVectorElementType()) {
2440 // Vector elements have been promoted
2443 // Vector has been spilt into smaller vectors.
2447 ISD::InputArg
Arg(Ins
[i
].Flags
, VT
, VT
, Ins
[i
].Used
,
2448 Ins
[i
].OrigArgIndex
, Ins
[i
].PartOffset
);
2449 OrigIns
.push_back(Arg
);
2453 bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op
) const {
2454 if (ConstantFPSDNode
* CFP
= dyn_cast
<ConstantFPSDNode
>(Op
)) {
2455 return CFP
->isExactlyValue(1.0);
2457 if (ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(Op
)) {
2458 return C
->isAllOnesValue();
2463 bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op
) const {
2464 if (ConstantFPSDNode
* CFP
= dyn_cast
<ConstantFPSDNode
>(Op
)) {
2465 return CFP
->getValueAPF().isZero();
2467 if (ConstantSDNode
*C
= dyn_cast
<ConstantSDNode
>(Op
)) {
2468 return C
->isNullValue();
2473 SDValue
AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG
&DAG
,
2474 const TargetRegisterClass
*RC
,
2475 unsigned Reg
, EVT VT
) const {
2476 MachineFunction
&MF
= DAG
.getMachineFunction();
2477 MachineRegisterInfo
&MRI
= MF
.getRegInfo();
2478 unsigned VirtualRegister
;
2479 if (!MRI
.isLiveIn(Reg
)) {
2480 VirtualRegister
= MRI
.createVirtualRegister(RC
);
2481 MRI
.addLiveIn(Reg
, VirtualRegister
);
2483 VirtualRegister
= MRI
.getLiveInVirtReg(Reg
);
2485 return DAG
.getRegister(VirtualRegister
, VT
);
// Maps an AMDGPU-specific DAG opcode to its printable name for debug dumps;
// the default case returns nullptr for opcodes this hook does not know.
// NOTE(review): the text below is extraction-mangled (original line numbers
// fused into the code, statements split across lines, and some case entries
// apparently dropped); it is preserved verbatim, with only comments added.
2488 #define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
2490 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode
) const {
2492 default: return nullptr;
2494 NODE_NAME_CASE(CALL
);
2495 NODE_NAME_CASE(UMUL
);
2496 NODE_NAME_CASE(RET_FLAG
);
2497 NODE_NAME_CASE(BRANCH_COND
);
2500 NODE_NAME_CASE(DWORDADDR
)
2501 NODE_NAME_CASE(FRACT
)
2502 NODE_NAME_CASE(CLAMP
)
2504 NODE_NAME_CASE(FMAX_LEGACY
)
2505 NODE_NAME_CASE(SMAX
)
2506 NODE_NAME_CASE(UMAX
)
2507 NODE_NAME_CASE(FMIN_LEGACY
)
2508 NODE_NAME_CASE(SMIN
)
2509 NODE_NAME_CASE(UMIN
)
2510 NODE_NAME_CASE(FMAX3
)
2511 NODE_NAME_CASE(SMAX3
)
2512 NODE_NAME_CASE(UMAX3
)
2513 NODE_NAME_CASE(FMIN3
)
2514 NODE_NAME_CASE(SMIN3
)
2515 NODE_NAME_CASE(UMIN3
)
2516 NODE_NAME_CASE(URECIP
)
2517 NODE_NAME_CASE(DIV_SCALE
)
2518 NODE_NAME_CASE(DIV_FMAS
)
2519 NODE_NAME_CASE(DIV_FIXUP
)
2520 NODE_NAME_CASE(TRIG_PREOP
)
2523 NODE_NAME_CASE(RSQ_LEGACY
)
2524 NODE_NAME_CASE(RSQ_CLAMPED
)
2525 NODE_NAME_CASE(LDEXP
)
2526 NODE_NAME_CASE(FP_CLASS
)
2527 NODE_NAME_CASE(DOT4
)
2528 NODE_NAME_CASE(BFE_U32
)
2529 NODE_NAME_CASE(BFE_I32
)
2532 NODE_NAME_CASE(BREV
)
2533 NODE_NAME_CASE(MUL_U24
)
2534 NODE_NAME_CASE(MUL_I24
)
2535 NODE_NAME_CASE(MAD_U24
)
2536 NODE_NAME_CASE(MAD_I24
)
2537 NODE_NAME_CASE(EXPORT
)
2538 NODE_NAME_CASE(CONST_ADDRESS
)
2539 NODE_NAME_CASE(REGISTER_LOAD
)
2540 NODE_NAME_CASE(REGISTER_STORE
)
2541 NODE_NAME_CASE(LOAD_CONSTANT
)
2542 NODE_NAME_CASE(LOAD_INPUT
)
2543 NODE_NAME_CASE(SAMPLE
)
2544 NODE_NAME_CASE(SAMPLEB
)
2545 NODE_NAME_CASE(SAMPLED
)
2546 NODE_NAME_CASE(SAMPLEL
)
2547 NODE_NAME_CASE(CVT_F32_UBYTE0
)
2548 NODE_NAME_CASE(CVT_F32_UBYTE1
)
2549 NODE_NAME_CASE(CVT_F32_UBYTE2
)
2550 NODE_NAME_CASE(CVT_F32_UBYTE3
)
2551 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR
)
2552 NODE_NAME_CASE(CONST_DATA_PTR
)
2553 NODE_NAME_CASE(STORE_MSKOR
)
2554 NODE_NAME_CASE(TBUFFER_STORE_FORMAT
)
2558 SDValue
AMDGPUTargetLowering::getRsqrtEstimate(SDValue Operand
,
2559 DAGCombinerInfo
&DCI
,
2560 unsigned &RefinementSteps
,
2561 bool &UseOneConstNR
) const {
2562 SelectionDAG
&DAG
= DCI
.DAG
;
2563 EVT VT
= Operand
.getValueType();
2565 if (VT
== MVT::f32
) {
2566 RefinementSteps
= 0;
2567 return DAG
.getNode(AMDGPUISD::RSQ
, SDLoc(Operand
), VT
, Operand
);
2570 // TODO: There is also f64 rsq instruction, but the documentation is less
2571 // clear on its precision.
2576 SDValue
AMDGPUTargetLowering::getRecipEstimate(SDValue Operand
,
2577 DAGCombinerInfo
&DCI
,
2578 unsigned &RefinementSteps
) const {
2579 SelectionDAG
&DAG
= DCI
.DAG
;
2580 EVT VT
= Operand
.getValueType();
2582 if (VT
== MVT::f32
) {
2583 // Reciprocal, < 1 ulp error.
2585 // This reciprocal approximation converges to < 0.5 ulp error with one
2586 // newton rhapson performed with two fused multiple adds (FMAs).
2588 RefinementSteps
= 0;
2589 return DAG
.getNode(AMDGPUISD::RCP
, SDLoc(Operand
), VT
, Operand
);
2592 // TODO: There is also f64 rcp instruction, but the documentation is less
2593 // clear on its precision.
2598 static void computeKnownBitsForMinMax(const SDValue Op0
,
2602 const SelectionDAG
&DAG
,
2604 APInt Op0Zero
, Op0One
;
2605 APInt Op1Zero
, Op1One
;
2606 DAG
.computeKnownBits(Op0
, Op0Zero
, Op0One
, Depth
);
2607 DAG
.computeKnownBits(Op1
, Op1Zero
, Op1One
, Depth
);
2609 KnownZero
= Op0Zero
& Op1Zero
;
2610 KnownOne
= Op0One
& Op1One
;
2613 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
2617 const SelectionDAG
&DAG
,
2618 unsigned Depth
) const {
2620 KnownZero
= KnownOne
= APInt(KnownOne
.getBitWidth(), 0); // Don't know anything.
2624 unsigned Opc
= Op
.getOpcode();
2629 case ISD::INTRINSIC_WO_CHAIN
: {
2630 // FIXME: The intrinsic should just use the node.
2631 switch (cast
<ConstantSDNode
>(Op
.getOperand(0))->getZExtValue()) {
2632 case AMDGPUIntrinsic::AMDGPU_imax
:
2633 case AMDGPUIntrinsic::AMDGPU_umax
:
2634 case AMDGPUIntrinsic::AMDGPU_imin
:
2635 case AMDGPUIntrinsic::AMDGPU_umin
:
2636 computeKnownBitsForMinMax(Op
.getOperand(1), Op
.getOperand(2),
2637 KnownZero
, KnownOne
, DAG
, Depth
);
2645 case AMDGPUISD::SMAX
:
2646 case AMDGPUISD::UMAX
:
2647 case AMDGPUISD::SMIN
:
2648 case AMDGPUISD::UMIN
:
2649 computeKnownBitsForMinMax(Op
.getOperand(0), Op
.getOperand(1),
2650 KnownZero
, KnownOne
, DAG
, Depth
);
2653 case AMDGPUISD::BFE_I32
:
2654 case AMDGPUISD::BFE_U32
: {
2655 ConstantSDNode
*CWidth
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(2));
2659 unsigned BitWidth
= 32;
2660 uint32_t Width
= CWidth
->getZExtValue() & 0x1f;
2662 if (Opc
== AMDGPUISD::BFE_U32
)
2663 KnownZero
= APInt::getHighBitsSet(BitWidth
, BitWidth
- Width
);
2670 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
2672 const SelectionDAG
&DAG
,
2673 unsigned Depth
) const {
2674 switch (Op
.getOpcode()) {
2675 case AMDGPUISD::BFE_I32
: {
2676 ConstantSDNode
*Width
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(2));
2680 unsigned SignBits
= 32 - Width
->getZExtValue() + 1;
2681 ConstantSDNode
*Offset
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(1));
2682 if (!Offset
|| !Offset
->isNullValue())
2685 // TODO: Could probably figure something out with non-0 offsets.
2686 unsigned Op0SignBits
= DAG
.ComputeNumSignBits(Op
.getOperand(0), Depth
+ 1);
2687 return std::max(SignBits
, Op0SignBits
);
2690 case AMDGPUISD::BFE_U32
: {
2691 ConstantSDNode
*Width
= dyn_cast
<ConstantSDNode
>(Op
.getOperand(2));
2692 return Width
? 32 - (Width
->getZExtValue() & 0x1f) : 1;