//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the AArch64TargetLowering class.
//
//===----------------------------------------------------------------------===//

#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "AArch64TargetObjectFile.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "aarch64-lower"

STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");

namespace {
enum AlignMode {
  StrictAlign,
  NoStrictAlign
};
}

static cl::opt<AlignMode>
Align(cl::desc("Load/store alignment support"),
      cl::Hidden, cl::init(NoStrictAlign),
      cl::values(
          clEnumValN(StrictAlign,   "aarch64-strict-align",
                     "Disallow all unaligned memory accesses"),
          clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
                     "Allow unaligned memory accesses"),
          clEnumValEnd));

// Placeholder until extr generation is tested fully.
static cl::opt<bool>
EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
                            cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
                            cl::init(true));

static cl::opt<bool>
EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
                           cl::desc("Allow AArch64 SLI/SRI formation"),
                           cl::init(false));

AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM)
    : TargetLowering(TM) {
  Subtarget = &TM.getSubtarget<AArch64Subtarget>();

  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
  // we have to make something up. Arbitrarily, choose ZeroOrOne.
  setBooleanContents(ZeroOrOneBooleanContent);
  // When comparing vectors the result sets the different elements in the
  // vector to all-one or all-zero.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

  if (Subtarget->hasFPARMv8()) {
    addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
    addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
    addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
    addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
  }

  if (Subtarget->hasNEON()) {
    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
    // Someone set us up the NEON.
    addDRTypeForNEON(MVT::v2f32);
    addDRTypeForNEON(MVT::v8i8);
    addDRTypeForNEON(MVT::v4i16);
    addDRTypeForNEON(MVT::v2i32);
    addDRTypeForNEON(MVT::v1i64);
    addDRTypeForNEON(MVT::v1f64);
    addDRTypeForNEON(MVT::v4f16);

    addQRTypeForNEON(MVT::v4f32);
    addQRTypeForNEON(MVT::v2f64);
    addQRTypeForNEON(MVT::v16i8);
    addQRTypeForNEON(MVT::v8i16);
    addQRTypeForNEON(MVT::v4i32);
    addQRTypeForNEON(MVT::v2i64);
    addQRTypeForNEON(MVT::v8f16);
  }

  // Compute derived properties from the register classes.
  computeRegisterProperties();

  // Provide all sorts of operation actions.
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::i32, Custom);
  setOperationAction(ISD::SETCC, MVT::i64, Custom);
  setOperationAction(ISD::SETCC, MVT::f32, Custom);
  setOperationAction(ISD::SETCC, MVT::f64, Custom);
  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
  setOperationAction(ISD::SELECT, MVT::i32, Custom);
  setOperationAction(ISD::SELECT, MVT::i64, Custom);
  setOperationAction(ISD::SELECT, MVT::f32, Custom);
  setOperationAction(ISD::SELECT, MVT::f64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);

  // Custom lowering hooks are needed for XOR
  // to fold it into CSINC/CSINV.
  setOperationAction(ISD::XOR, MVT::i32, Custom);
  setOperationAction(ISD::XOR, MVT::i64, Custom);
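  // For example (illustrative, not exhaustive): an XOR with all-ones of a
  // conditionally selected value can become a single CSINV
  // (Rd = cond ? Rn : ~Rm) instead of a CSEL followed by a separate MVN.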

  // Virtually no operation on f128 is legal, but LLVM can't expand them when
  // there's a valid register class, so we need custom operations in most
  // cases.
  setOperationAction(ISD::FABS, MVT::f128, Expand);
  setOperationAction(ISD::FADD, MVT::f128, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
  setOperationAction(ISD::FCOS, MVT::f128, Expand);
  setOperationAction(ISD::FDIV, MVT::f128, Custom);
  setOperationAction(ISD::FMA, MVT::f128, Expand);
  setOperationAction(ISD::FMUL, MVT::f128, Custom);
  setOperationAction(ISD::FNEG, MVT::f128, Expand);
  setOperationAction(ISD::FPOW, MVT::f128, Expand);
  setOperationAction(ISD::FREM, MVT::f128, Expand);
  setOperationAction(ISD::FRINT, MVT::f128, Expand);
  setOperationAction(ISD::FSIN, MVT::f128, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
  setOperationAction(ISD::FSUB, MVT::f128, Custom);
  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
  setOperationAction(ISD::SETCC, MVT::f128, Custom);
  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
  setOperationAction(ISD::SELECT, MVT::f128, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

  // Lowering for many of the conversions is actually specified by the non-f128
  // type. The LowerXXX function will be trivial when f128 isn't involved.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);

  // Variable arguments.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAARG, MVT::Other, Custom);
  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);

  // Variable-sized objects.
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

  // Exception handling.
  // FIXME: These are guesses. Has this been defined yet?
  setExceptionPointerRegister(AArch64::X0);
  setExceptionSelectorRegister(AArch64::X1);

  // Constant pool entries.
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

  // BlockAddress.
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

  // Add/Sub overflow ops with MVT::Glue are lowered to NZCV dependences.
  setOperationAction(ISD::ADDC, MVT::i32, Custom);
  setOperationAction(ISD::ADDE, MVT::i32, Custom);
  setOperationAction(ISD::SUBC, MVT::i32, Custom);
  setOperationAction(ISD::SUBE, MVT::i32, Custom);
  setOperationAction(ISD::ADDC, MVT::i64, Custom);
  setOperationAction(ISD::ADDE, MVT::i64, Custom);
  setOperationAction(ISD::SUBC, MVT::i64, Custom);
  setOperationAction(ISD::SUBE, MVT::i64, Custom);

  // AArch64 lacks both left-rotate and popcount instructions.
  setOperationAction(ISD::ROTL, MVT::i32, Expand);
  setOperationAction(ISD::ROTL, MVT::i64, Expand);

  // AArch64 doesn't have {U|S}MUL_LOHI.
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

  // Expand the undefined-at-zero variants of cttz/ctlz to their
  // defined-at-zero counterparts, which AArch64 supports directly.
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);

  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
  setOperationAction(ISD::CTPOP, MVT::i64, Custom);

  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);

  // Custom lower Add/Sub/Mul with overflow.
  setOperationAction(ISD::SADDO, MVT::i32, Custom);
  setOperationAction(ISD::SADDO, MVT::i64, Custom);
  setOperationAction(ISD::UADDO, MVT::i32, Custom);
  setOperationAction(ISD::UADDO, MVT::i64, Custom);
  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
  setOperationAction(ISD::USUBO, MVT::i32, Custom);
  setOperationAction(ISD::USUBO, MVT::i64, Custom);
  setOperationAction(ISD::SMULO, MVT::i32, Custom);
  setOperationAction(ISD::SMULO, MVT::i64, Custom);
  setOperationAction(ISD::UMULO, MVT::i32, Custom);
  setOperationAction(ISD::UMULO, MVT::i64, Custom);

  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

  // f16 is storage-only, so we promote operations to f32 if we know this is
  // valid, and ignore them otherwise. The operations not mentioned here will
  // fail to select, but this is not a major problem as no source language
  // should be emitting native f16 operations yet.
  setOperationAction(ISD::FADD, MVT::f16, Promote);
  setOperationAction(ISD::FDIV, MVT::f16, Promote);
  setOperationAction(ISD::FMUL, MVT::f16, Promote);
  setOperationAction(ISD::FSUB, MVT::f16, Promote);
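  // Promotion here means the work happens at single precision: the f16 inputs
  // are extended, the operation runs on f32, and the result is truncated back,
  // roughly "fcvt s0, h0; fcvt s1, h1; fadd s0, s0, s1; fcvt h0, s0"
  // (illustrative sequence).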

  // v4f16 is also a storage-only type, so promote it to v4f32 when that is
  // known to be safe.
  setOperationAction(ISD::FADD, MVT::v4f16, Promote);
  setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
  setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
  setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
  setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
  setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
  AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
  AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);

  // Expand all other v4f16 operations.
  // FIXME: We could generate better code by promoting some operations to
  // a pair of v4f32s.
  setOperationAction(ISD::FABS, MVT::v4f16, Expand);
  setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
  setOperationAction(ISD::FMA, MVT::v4f16, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
  setOperationAction(ISD::FREM, MVT::v4f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
  setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
  setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
  setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
  setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
  setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
  setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);

  // v8f16 is also a storage-only type, so expand it.
  setOperationAction(ISD::FABS, MVT::v8f16, Expand);
  setOperationAction(ISD::FADD, MVT::v8f16, Expand);
  setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
  setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
  setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
  setOperationAction(ISD::FMA, MVT::v8f16, Expand);
  setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
  setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
  setOperationAction(ISD::FREM, MVT::v8f16, Expand);
  setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
  setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
  setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
  setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
  setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
  setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
  setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
  setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
  setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
  setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
  setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
  setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
  setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
  setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

  // AArch64 has implementations of a lot of rounding-like FP operations.
  static MVT RoundingTypes[] = { MVT::f32, MVT::f64 };
  for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
    MVT Ty = RoundingTypes[I];
    setOperationAction(ISD::FFLOOR, Ty, Legal);
    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
    setOperationAction(ISD::FCEIL, Ty, Legal);
    setOperationAction(ISD::FRINT, Ty, Legal);
    setOperationAction(ISD::FTRUNC, Ty, Legal);
    setOperationAction(ISD::FROUND, Ty, Legal);
  }
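  // These map directly onto the FRINT* instruction family, e.g.
  // FFLOOR -> FRINTM, FCEIL -> FRINTP, FTRUNC -> FRINTZ, FROUND -> FRINTA,
  // FNEARBYINT -> FRINTI and FRINT -> FRINTX.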

  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

  if (Subtarget->isTargetMachO()) {
    // For iOS, we don't want the normal expansion of a libcall to
    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
    // traffic.
    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
  } else {
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  }
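  // (__sincos_stret returns both the sine and the cosine in registers rather
  // than through pointer out-parameters, which is where the saved memory
  // traffic comes from; illustrative note.)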

  // Make floating-point constants legal for the large code model, so they
  // don't become loads from the constant pool.
  if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
    setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
  }

  // AArch64 does not have floating-point extending loads, i1 sign-extending
  // loads, floating-point truncating stores, or v2i32->v2i16 truncating
  // stores.
  for (MVT VT : MVT::fp_valuetypes()) {
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
    setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
  }
  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
  setTruncStoreAction(MVT::f128, MVT::f16, Expand);

  setOperationAction(ISD::BITCAST, MVT::i16, Custom);
  setOperationAction(ISD::BITCAST, MVT::f16, Custom);

  // Indexed loads and stores are supported.
  for (unsigned im = (unsigned)ISD::PRE_INC;
       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
    setIndexedLoadAction(im, MVT::i8, Legal);
    setIndexedLoadAction(im, MVT::i16, Legal);
    setIndexedLoadAction(im, MVT::i32, Legal);
    setIndexedLoadAction(im, MVT::i64, Legal);
    setIndexedLoadAction(im, MVT::f64, Legal);
    setIndexedLoadAction(im, MVT::f32, Legal);
    setIndexedStoreAction(im, MVT::i8, Legal);
    setIndexedStoreAction(im, MVT::i16, Legal);
    setIndexedStoreAction(im, MVT::i32, Legal);
    setIndexedStoreAction(im, MVT::i64, Legal);
    setIndexedStoreAction(im, MVT::f64, Legal);
    setIndexedStoreAction(im, MVT::f32, Legal);
  }
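  // This enables selection of the pre/post-indexed addressing modes, e.g.
  //   ldr x0, [x1, #8]!   ; pre-index:  x1 is incremented before the access
  //   ldr x0, [x1], #8    ; post-index: x1 is incremented after the access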

  // Trap.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // We combine OR nodes for bitfield operations.
  setTargetDAGCombine(ISD::OR);

  // Vector add and sub nodes may conceal a high-half opportunity.
  // Also, try to fold ADD into CSINC/CSINV.
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::SUB);

  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);

  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::CONCAT_VECTORS);
  setTargetDAGCombine(ISD::STORE);

  setTargetDAGCombine(ISD::MUL);

  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::VSELECT);

  setTargetDAGCombine(ISD::INTRINSIC_VOID);
  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;

  setStackPointerRegisterToSaveRestore(AArch64::SP);

  setSchedulingPreference(Sched::Hybrid);

  // Enable TBZ/TBNZ.
  MaskAndBranchFoldingIsLegal = true;

  setMinFunctionAlignment(2);

  RequireStrictAlign = (Align == StrictAlign);

  setHasExtractBitsInsn(true);

  if (Subtarget->hasNEON()) {
    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
    // silliness like this:
    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

    setOperationAction(ISD::MUL, MVT::v1i64, Expand);

    // AArch64 doesn't have direct vector->f32 conversion instructions for
    // elements smaller than i32, so promote the input to i32 first.
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);

    // AArch64 doesn't have MUL.2d:
    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
    // Custom handling for some quad-vector types to detect MULL.
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
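    // For example, a v4i32 multiply whose operands are sign-extended from
    // v4i16 can be selected as a single SMULL
    // (e.g. "smull v0.4s, v1.4h, v2.4h") instead of two extends plus a mul;
    // illustrative example.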

    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
    // Likewise, narrowing and extending vector loads/stores aren't handled
    // directly.
    for (MVT VT : MVT::vector_valuetypes()) {
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      setOperationAction(ISD::MULHS, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::MULHU, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);

      setOperationAction(ISD::BSWAP, VT, Expand);

      for (MVT InnerVT : MVT::vector_valuetypes()) {
        setTruncStoreAction(VT, InnerVT, Expand);
        setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
        setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
      }
    }

    // AArch64 has implementations of a lot of rounding-like FP operations.
    static MVT RoundingVecTypes[] = { MVT::v2f32, MVT::v4f32, MVT::v2f64 };
    for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
      MVT Ty = RoundingVecTypes[I];
      setOperationAction(ISD::FFLOOR, Ty, Legal);
      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
      setOperationAction(ISD::FCEIL, Ty, Legal);
      setOperationAction(ISD::FRINT, Ty, Legal);
      setOperationAction(ISD::FTRUNC, Ty, Legal);
      setOperationAction(ISD::FROUND, Ty, Legal);
    }
  }

  // Prefer likely predicted branches to selects on out-of-order cores.
  if (Subtarget->isCortexA57())
    PredictableSelectIsExpensive = true;
}

void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);

    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);

    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
  }

  // Mark vector float intrinsics as expand.
  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
  }

  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);

  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
  for (MVT InnerVT : MVT::all_valuetypes())
    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT.getSimpleVT(), Expand);

  // CNT supports only B element sizes.
  if (VT != MVT::v8i8 && VT != MVT::v16i8)
    setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);

  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);

  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);

  if (Subtarget->isLittleEndian()) {
    for (unsigned im = (unsigned)ISD::PRE_INC;
         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
    }
  }
}

void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR64RegClass);
  addTypeForNEON(VT, MVT::v2i32);
}

void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
  addRegisterClass(VT, &AArch64::FPR128RegClass);
  addTypeForNEON(VT, MVT::v4i32);
}

EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
  if (!VT.isVector())
    return MVT::i32;
  return VT.changeVectorElementTypeToInteger();
}
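// For example, a setcc of v4f32 operands yields v4i32, with each lane either
// all-ones or all-zero, matching the ZeroOrNegativeOneBooleanContent choice
// made in the constructor above.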

/// computeKnownBitsForTargetNode - Determine which of the bits specified in
/// Mask are known to be either zero or one and return them in the
/// KnownZero/KnownOne bitsets.
void AArch64TargetLowering::computeKnownBitsForTargetNode(
    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
    const SelectionDAG &DAG, unsigned Depth) const {
  switch (Op.getOpcode()) {
  default:
    break;
  case AArch64ISD::CSEL: {
    // Either input may be selected, so a bit is known only if it is known in
    // both of them.
    APInt KnownZero2, KnownOne2;
    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
    KnownZero &= KnownZero2;
    KnownOne &= KnownOne2;
    break;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
    switch (IntID) {
    default: return;
    case Intrinsic::aarch64_ldaxr:
    case Intrinsic::aarch64_ldxr: {
      unsigned BitWidth = KnownOne.getBitWidth();
      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
      unsigned MemBits = VT.getScalarType().getSizeInBits();
      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN:
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
    switch (IntNo) {
    default:
      break;
    case Intrinsic::aarch64_neon_umaxv:
    case Intrinsic::aarch64_neon_uminv: {
      // Figure out the datatype of the vector operand. The UMINV instruction
      // will zero extend the result, so we can mark as known zero all the
      // bits larger than the element datatype. 32-bit or larger doesn't need
      // this as those are legal types and will be handled by isel directly.
      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
      unsigned BitWidth = KnownZero.getBitWidth();
      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
        assert(BitWidth >= 8 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
        KnownZero |= Mask;
      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
        assert(BitWidth >= 16 && "Unexpected width!");
        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
        KnownZero |= Mask;
      }
      break;
    }
    }
  }
  }
}

MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
  return MVT::i64;
}

unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
  // FIXME: On AArch64, this depends on the type.
  // Basically, the addressable offsets are up to 4095 * Ty.getSizeInBytes(),
  // and the offset has to be a multiple of the related size in bytes.
  return 4095;
}
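// (Concretely: with a scaled 12-bit offset field, an i64 access can reach up
// to 4095 * 8 = 32760 bytes; illustrative of the FIXME above.)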

FastISel *
AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
                                      const TargetLibraryInfo *libInfo) const {
  return AArch64::createFastISel(funcInfo, libInfo);
}

const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default:
    return nullptr;
  case AArch64ISD::CALL: return "AArch64ISD::CALL";
  case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
  case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
  case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
  case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
  case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
  case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
  case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
  case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
  case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
  case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
  case AArch64ISD::TLSDESC_CALL: return "AArch64ISD::TLSDESC_CALL";
  case AArch64ISD::ADC: return "AArch64ISD::ADC";
  case AArch64ISD::SBC: return "AArch64ISD::SBC";
  case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
  case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
  case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
  case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
  case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
  case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
  case AArch64ISD::FMIN: return "AArch64ISD::FMIN";
  case AArch64ISD::FMAX: return "AArch64ISD::FMAX";
  case AArch64ISD::DUP: return "AArch64ISD::DUP";
  case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
  case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
  case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
  case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
  case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
  case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
  case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
  case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
  case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
  case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
  case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
  case AArch64ISD::BICi: return "AArch64ISD::BICi";
  case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
  case AArch64ISD::BSL: return "AArch64ISD::BSL";
  case AArch64ISD::NEG: return "AArch64ISD::NEG";
  case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
  case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
  case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
  case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
  case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
  case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
  case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
  case AArch64ISD::REV16: return "AArch64ISD::REV16";
  case AArch64ISD::REV32: return "AArch64ISD::REV32";
  case AArch64ISD::REV64: return "AArch64ISD::REV64";
  case AArch64ISD::EXT: return "AArch64ISD::EXT";
  case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
  case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
  case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
  case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
  case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
  case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
  case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
  case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
  case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
  case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
  case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
  case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
  case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
  case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
  case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
  case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
  case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
  case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
  case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
  case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
  case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
  case AArch64ISD::NOT: return "AArch64ISD::NOT";
  case AArch64ISD::BIT: return "AArch64ISD::BIT";
  case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
  case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
  case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
  case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
  case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
  case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
  case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
  case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
  case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
  case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
  case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
  case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
  case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
  case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
  case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
  case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
  case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
  case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
  case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
  case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
  case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
  case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
  case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
  case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
  case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
  case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
  case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
  case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
  case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
  case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
  case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
  case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
  case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
  case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
  case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
  case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
  case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
  case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
  case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
  }
}

MachineBasicBlock *
AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
  // We materialise the F128CSEL pseudo-instruction as some control flow and a
  // phi node:

  // OrigBB:
  //     [... previous instrs leading to comparison ...]
  //     b.ne TrueBB
  //     b EndBB
  // TrueBB:
  //     ; Fallthrough
  // EndBB:
  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

  const TargetInstrInfo *TII =
      getTargetMachine().getSubtargetImpl()->getInstrInfo();
  MachineFunction *MF = MBB->getParent();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  DebugLoc DL = MI->getDebugLoc();
  MachineFunction::iterator It = MBB;
  ++It;

  unsigned DestReg = MI->getOperand(0).getReg();
  unsigned IfTrueReg = MI->getOperand(1).getReg();
  unsigned IfFalseReg = MI->getOperand(2).getReg();
  unsigned CondCode = MI->getOperand(3).getImm();
  bool NZCVKilled = MI->getOperand(4).isKill();

  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
  MF->insert(It, TrueBB);
  MF->insert(It, EndBB);

  // Transfer rest of current basic-block to EndBB.
  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
                MBB->end());
  EndBB->transferSuccessorsAndUpdatePHIs(MBB);

  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
  MBB->addSuccessor(TrueBB);
  MBB->addSuccessor(EndBB);

  // TrueBB falls through to the end.
  TrueBB->addSuccessor(EndBB);

  if (!NZCVKilled) {
    TrueBB->addLiveIn(AArch64::NZCV);
    EndBB->addLiveIn(AArch64::NZCV);
  }

  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
      .addReg(IfTrueReg)
      .addMBB(TrueBB)
      .addReg(IfFalseReg)
      .addMBB(MBB);

  MI->eraseFromParent();
  return EndBB;
}

MachineBasicBlock *
AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                   MachineBasicBlock *BB) const {
  switch (MI->getOpcode()) {
  default:
#ifndef NDEBUG
    MI->dump();
#endif
    llvm_unreachable("Unexpected instruction for custom inserter!");

  case AArch64::F128CSEL:
    return EmitF128CSEL(MI, BB);

  case TargetOpcode::STACKMAP:
  case TargetOpcode::PATCHPOINT:
    return emitPatchPoint(MI, BB);
  }
}

//===----------------------------------------------------------------------===//
// AArch64 Lowering private implementation.
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Lowering Code
//===----------------------------------------------------------------------===//

/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
/// CC.
static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
  switch (CC) {
  default:
    llvm_unreachable("Unknown condition code!");
  case ISD::SETNE:
    return AArch64CC::NE;
  case ISD::SETEQ:
    return AArch64CC::EQ;
  case ISD::SETGT:
    return AArch64CC::GT;
  case ISD::SETGE:
    return AArch64CC::GE;
  case ISD::SETLT:
    return AArch64CC::LT;
  case ISD::SETLE:
    return AArch64CC::LE;
  case ISD::SETUGT:
    return AArch64CC::HI;
  case ISD::SETUGE:
    return AArch64CC::HS;
  case ISD::SETULT:
    return AArch64CC::LO;
  case ISD::SETULE:
    return AArch64CC::LS;
  }
}

/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
static void changeFPCCToAArch64CC(ISD::CondCode CC,
                                  AArch64CC::CondCode &CondCode,
                                  AArch64CC::CondCode &CondCode2) {
  CondCode2 = AArch64CC::AL;
  switch (CC) {
  default:
    llvm_unreachable("Unknown FP condition!");
  case ISD::SETEQ:
  case ISD::SETOEQ:
    CondCode = AArch64CC::EQ;
    break;
  case ISD::SETGT:
  case ISD::SETOGT:
    CondCode = AArch64CC::GT;
    break;
  case ISD::SETGE:
  case ISD::SETOGE:
    CondCode = AArch64CC::GE;
    break;
  case ISD::SETOLT:
    CondCode = AArch64CC::MI;
    break;
  case ISD::SETOLE:
    CondCode = AArch64CC::LS;
    break;
  case ISD::SETONE:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    break;
  case ISD::SETO:
    CondCode = AArch64CC::VC;
    break;
  case ISD::SETUO:
    CondCode = AArch64CC::VS;
    break;
  case ISD::SETUEQ:
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    break;
  case ISD::SETUGT:
    CondCode = AArch64CC::HI;
    break;
  case ISD::SETUGE:
    CondCode = AArch64CC::PL;
    break;
  case ISD::SETLT:
  case ISD::SETULT:
    CondCode = AArch64CC::LT;
    break;
  case ISD::SETLE:
  case ISD::SETULE:
    CondCode = AArch64CC::LE;
    break;
  case ISD::SETNE:
  case ISD::SETUNE:
    CondCode = AArch64CC::NE;
    break;
  }
}

/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
/// CC usable with the vector instructions. Fewer operations are available
/// without a real NZCV register, so we have to use less efficient combinations
/// to get the same effect.
static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
                                        AArch64CC::CondCode &CondCode,
                                        AArch64CC::CondCode &CondCode2,
                                        bool &Invert) {
  Invert = false;
  switch (CC) {
  default:
    // Mostly the scalar mappings work fine.
    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
    break;
  case ISD::SETUO:
    Invert = true; // Fallthrough
  case ISD::SETO:
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GE;
    break;
  case ISD::SETUEQ:
  case ISD::SETULT:
  case ISD::SETULE:
  case ISD::SETUGT:
  case ISD::SETUGE:
    // All of the compare-mask comparisons are ordered, but we can switch
    // between the two by a double inversion. E.g. ULE == !OGT.
    Invert = true;
    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
    break;
  }
}

static bool isLegalArithImmed(uint64_t C) {
  // Matches AArch64DAGToDAGISel::SelectArithImmed().
  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
}
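// For example, 4095 (0xFFF) and 0xFFF000 (4095 << 12) are legal arithmetic
// immediates, while 0x1001 is not: ADDS/SUBS encode a 12-bit immediate,
// optionally shifted left by 12.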

static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                              SDLoc dl, SelectionDAG &DAG) {
  EVT VT = LHS.getValueType();

  if (VT.isFloatingPoint())
    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);

  // The CMP instruction is just an alias for SUBS, and representing it as
  // SUBS means that it's possible to get CSE with subtract operations.
  // A later phase can perform the optimization of setting the destination
  // register to WZR/XZR if it ends up being unused.
  unsigned Opcode = AArch64ISD::SUBS;

  if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
      cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    // We'd like to combine a (CMP op1, (sub 0, op2)) into a CMN instruction on
    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
    // can be set differently by this operation. It comes down to whether
    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
    // everything is fine. If not then the optimization is wrong. Thus general
    // comparisons are only valid if op2 != 0.

    // So, finally, the only LLVM-native comparisons that don't mention C and V
    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
    // the absence of information about op2.
    Opcode = AArch64ISD::ADDS;
    RHS = RHS.getOperand(1);
  } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
             cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
             !isUnsignedIntSetCC(CC)) {
    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
    // of the signed comparisons.
    Opcode = AArch64ISD::ANDS;
    RHS = LHS.getOperand(1);
    LHS = LHS.getOperand(0);
  }

  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
      .getValue(1);
}
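// Note: the SUBS/ADDS/ANDS node above is created with VTList(VT, MVT::i32),
// producing both the arithmetic result and the NZCV flags; getValue(1)
// selects the flags result, which is what comparison users consume.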

static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                             SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
  SDValue Cmp;
  AArch64CC::CondCode AArch64CC;
  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
    EVT VT = RHS.getValueType();
    uint64_t C = RHSC->getZExtValue();
    if (!isLegalArithImmed(C)) {
      // Constant does not fit, try adjusting it by one?
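      // For example, (x < 0x1001) has an unencodable immediate, but the
      // equivalent (x <= 0x1000) is fine, since 0x1000 is a legal shifted
      // immediate (illustrative values).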
1135 | switch (CC) { | |
1136 | default: | |
1137 | break; | |
1138 | case ISD::SETLT: | |
1139 | case ISD::SETGE: | |
1140 | if ((VT == MVT::i32 && C != 0x80000000 && | |
1141 | isLegalArithImmed((uint32_t)(C - 1))) || | |
1142 | (VT == MVT::i64 && C != 0x80000000ULL && | |
1143 | isLegalArithImmed(C - 1ULL))) { | |
1144 | CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT; | |
1145 | C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; | |
1146 | RHS = DAG.getConstant(C, VT); | |
1147 | } | |
1148 | break; | |
1149 | case ISD::SETULT: | |
1150 | case ISD::SETUGE: | |
1151 | if ((VT == MVT::i32 && C != 0 && | |
1152 | isLegalArithImmed((uint32_t)(C - 1))) || | |
1153 | (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) { | |
1154 | CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT; | |
1155 | C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1; | |
1156 | RHS = DAG.getConstant(C, VT); | |
1157 | } | |
1158 | break; | |
1159 | case ISD::SETLE: | |
1160 | case ISD::SETGT: | |
85aaf69f | 1161 | if ((VT == MVT::i32 && C != INT32_MAX && |
1a4d82fc | 1162 | isLegalArithImmed((uint32_t)(C + 1))) || |
85aaf69f | 1163 | (VT == MVT::i64 && C != INT64_MAX && |
1a4d82fc JJ |
1164 | isLegalArithImmed(C + 1ULL))) { |
1165 | CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE; | |
1166 | C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; | |
1167 | RHS = DAG.getConstant(C, VT); | |
1168 | } | |
1169 | break; | |
1170 | case ISD::SETULE: | |
1171 | case ISD::SETUGT: | |
85aaf69f | 1172 | if ((VT == MVT::i32 && C != UINT32_MAX && |
1a4d82fc | 1173 | isLegalArithImmed((uint32_t)(C + 1))) || |
85aaf69f | 1174 | (VT == MVT::i64 && C != UINT64_MAX && |
1a4d82fc JJ |
1175 | isLegalArithImmed(C + 1ULL))) { |
1176 | CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; | |
1177 | C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1; | |
1178 | RHS = DAG.getConstant(C, VT); | |
1179 | } | |
1180 | break; | |
1181 | } | |
1182 | } | |
1183 | } | |
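// Hedged example of the adjustment above: "x < 0x123001" (i32, SETLT) uses
// a constant that is not encodable, but it is equivalent to
// "x <= 0x123000" (SETLE), and 0x123000 is encodable as 0x123 << 12.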
1184 | // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095. | |
1185 | // For the i8 operand, the largest immediate is 255, so this can be easily | |
1186 | // encoded in the compare instruction. For the i16 operand, however, the | |
1187 | // largest immediate cannot be encoded in the compare. | |
1188 | // Therefore, use a sign extending load and cmn to avoid materializing the -1 | |
1189 | // constant. For example, | |
1190 | // movz w1, #65535 | |
1191 | // ldrh w0, [x0, #0] | |
1192 | // cmp w0, w1 | |
1193 | // > | |
1194 | // ldrsh w0, [x0, #0] | |
1195 | // cmn w0, #1 | |
1196 | // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS) | |
1197 | // if and only if (sext LHS) == (sext RHS). The checks are in place to ensure | |
1198 | // both the LHS and RHS are truly zero extended and to make sure the | |
1199 | // transformation is profitable. | |
1200 | if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) { | |
1201 | if ((cast<ConstantSDNode>(RHS)->getZExtValue() >> 16 == 0) && | |
1202 | isa<LoadSDNode>(LHS)) { | |
1203 | if (cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD && | |
1204 | cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 && | |
1205 | LHS.getNode()->hasNUsesOfValue(1, 0)) { | |
1206 | int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue(); | |
1207 | if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) { | |
1208 | SDValue SExt = | |
1209 | DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS, | |
1210 | DAG.getValueType(MVT::i16)); | |
1211 | Cmp = emitComparison(SExt, | |
1212 | DAG.getConstant(ValueofRHS, RHS.getValueType()), | |
1213 | CC, dl, DAG); | |
1214 | AArch64CC = changeIntCCToAArch64CC(CC); | |
1215 | AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); | |
1216 | return Cmp; | |
1217 | } | |
1218 | } | |
970d7e83 LB |
1219 | } |
1220 | } | |
1a4d82fc JJ |
1221 | Cmp = emitComparison(LHS, RHS, CC, dl, DAG); |
1222 | AArch64CC = changeIntCCToAArch64CC(CC); | |
1223 | AArch64cc = DAG.getConstant(AArch64CC, MVT::i32); | |
1224 | return Cmp; | |
1225 | } | |
970d7e83 | 1226 | |
1a4d82fc JJ |
1227 | static std::pair<SDValue, SDValue> |
1228 | getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) { | |
1229 | assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) && | |
1230 | "Unsupported value type"); | |
1231 | SDValue Value, Overflow; | |
1232 | SDLoc DL(Op); | |
1233 | SDValue LHS = Op.getOperand(0); | |
1234 | SDValue RHS = Op.getOperand(1); | |
1235 | unsigned Opc = 0; | |
1236 | switch (Op.getOpcode()) { | |
1237 | default: | |
1238 | llvm_unreachable("Unknown overflow instruction!"); | |
1239 | case ISD::SADDO: | |
1240 | Opc = AArch64ISD::ADDS; | |
1241 | CC = AArch64CC::VS; | |
1242 | break; | |
1243 | case ISD::UADDO: | |
1244 | Opc = AArch64ISD::ADDS; | |
1245 | CC = AArch64CC::HS; | |
1246 | break; | |
1247 | case ISD::SSUBO: | |
1248 | Opc = AArch64ISD::SUBS; | |
1249 | CC = AArch64CC::VS; | |
1250 | break; | |
1251 | case ISD::USUBO: | |
1252 | Opc = AArch64ISD::SUBS; | |
1253 | CC = AArch64CC::LO; | |
1254 | break; | |
1255 | // Multiply needs a little extra work. | |
1256 | case ISD::SMULO: | |
1257 | case ISD::UMULO: { | |
1258 | CC = AArch64CC::NE; | |
1259 | bool IsSigned = Op.getOpcode() == ISD::SMULO; | |
1260 | if (Op.getValueType() == MVT::i32) { | |
1261 | unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |
1262 | // For a 32 bit multiply with overflow check we want the instruction | |
1263 | // selector to generate a widening multiply (SMADDL/UMADDL). For that we | |
1264 | // need to generate the following pattern: | |
1265 | // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b)) | |
1266 | LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS); | |
1267 | RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS); | |
1268 | SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); | |
1269 | SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul, | |
1270 | DAG.getConstant(0, MVT::i64)); | |
1271 | // On AArch64 the upper 32 bits are always zero extended for a 32 bit | |
1272 | // operation. We need to clear out the upper 32 bits, because we used a | |
1273 | // widening multiply that wrote all 64 bits. In the end this should be a | |
1274 | // noop. | |
1275 | Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add); | |
1276 | if (IsSigned) { | |
1277 | // The signed overflow check requires more than just a simple check for | |
1278 | // any bit set in the upper 32 bits of the result. These bits could be | |
1279 | // just the sign bits of a negative number. To perform the overflow | |
1280 | // check we arithmetic-shift the low 32 bits of the result right by 31, | |
1281 | // replicating the sign bit, and compare that against the upper 32 bits. | |
1282 | SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add, | |
1283 | DAG.getConstant(32, MVT::i64)); | |
1284 | UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits); | |
1285 | SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value, | |
1286 | DAG.getConstant(31, MVT::i64)); | |
1287 | // It is important that LowerBits is last, otherwise the arithmetic | |
1288 | // shift will not be folded into the compare (SUBS). | |
1289 | SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32); | |
1290 | Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) | |
1291 | .getValue(1); | |
1292 | } else { | |
1293 | // The overflow check for unsigned multiply is easy. We only need to | |
1294 | // check if any of the upper 32 bits are set. This can be done with a | |
1295 | // CMP (shifted register). For that we need to generate the following | |
1296 | // pattern: | |
1297 | // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32) | |
1298 | SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul, | |
1299 | DAG.getConstant(32, MVT::i64)); | |
1300 | SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); | |
1301 | Overflow = | |
1302 | DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), | |
1303 | UpperBits).getValue(1); | |
1304 | } | |
1305 | break; | |
1306 | } | |
1307 | assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type"); | |
1308 | // For the 64-bit multiply, the low half is a plain MUL. | |
1309 | Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS); | |
1310 | if (IsSigned) { | |
1311 | SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS); | |
1312 | SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value, | |
1313 | DAG.getConstant(63, MVT::i64)); | |
1314 | // It is important that LowerBits is last, otherwise the arithmetic | |
1315 | // shift will not be folded into the compare (SUBS). | |
1316 | SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); | |
1317 | Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits) | |
1318 | .getValue(1); | |
1319 | } else { | |
1320 | SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS); | |
1321 | SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32); | |
1322 | Overflow = | |
1323 | DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64), | |
1324 | UpperBits).getValue(1); | |
1325 | } | |
1326 | break; | |
1327 | } | |
1328 | } // switch (...) | |
970d7e83 | 1329 | |
1a4d82fc JJ |
1330 | if (Opc) { |
1331 | SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32); | |
970d7e83 | 1332 | |
1a4d82fc JJ |
1333 | // Emit the AArch64 operation with overflow check. |
1334 | Value = DAG.getNode(Opc, DL, VTs, LHS, RHS); | |
1335 | Overflow = Value.getValue(1); | |
970d7e83 | 1336 | } |
1a4d82fc | 1337 | return std::make_pair(Value, Overflow); |
970d7e83 LB |
1338 | } |
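// Worked example for the widened 32-bit UMULO path above (illustrative):
// 0x10000 * 0x10000 produces the 64-bit product 0x100000000. Its upper 32
// bits are 1, so the SUBS of 0 against them is non-zero, Z stays clear,
// and the NE condition correctly reports overflow.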
1339 | ||
1a4d82fc JJ |
1340 | SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG, |
1341 | RTLIB::Libcall Call) const { | |
1342 | SmallVector<SDValue, 2> Ops; | |
1343 | for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) | |
1344 | Ops.push_back(Op.getOperand(i)); | |
970d7e83 | 1345 | |
1a4d82fc JJ |
1346 | return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false, |
1347 | SDLoc(Op)).first; | |
1348 | } | |
970d7e83 | 1349 | |
1a4d82fc JJ |
1350 | static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { |
1351 | SDValue Sel = Op.getOperand(0); | |
1352 | SDValue Other = Op.getOperand(1); | |
970d7e83 | 1353 | |
1a4d82fc JJ |
1354 | // If neither operand is a SELECT_CC, give up. |
1355 | if (Sel.getOpcode() != ISD::SELECT_CC) | |
1356 | std::swap(Sel, Other); | |
1357 | if (Sel.getOpcode() != ISD::SELECT_CC) | |
1358 | return Op; | |
970d7e83 | 1359 | |
1a4d82fc JJ |
1360 | // The folding we want to perform is: |
1361 | // (xor x, (select_cc a, b, cc, 0, -1) ) | |
1362 | // --> | |
1363 | // (csel x, (xor x, -1), cc ...) | |
1364 | // | |
1365 | // The latter will get matched to a CSINV instruction. | |
970d7e83 | 1366 | |
1a4d82fc JJ |
1367 | ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get(); |
1368 | SDValue LHS = Sel.getOperand(0); | |
1369 | SDValue RHS = Sel.getOperand(1); | |
1370 | SDValue TVal = Sel.getOperand(2); | |
1371 | SDValue FVal = Sel.getOperand(3); | |
1372 | SDLoc dl(Sel); | |
970d7e83 | 1373 | |
1a4d82fc JJ |
1374 | // FIXME: This could be generalized to non-integer comparisons. |
1375 | if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) | |
1376 | return Op; | |
970d7e83 | 1377 | |
1a4d82fc JJ |
1378 | ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); |
1379 | ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); | |
970d7e83 | 1380 | |
1a4d82fc JJ |
1381 | // If the values aren't constants, this isn't the pattern we're looking for. | |
1382 | if (!CFVal || !CTVal) | |
1383 | return Op; | |
970d7e83 | 1384 | |
1a4d82fc JJ |
1385 | // We can commute the SELECT_CC by inverting the condition. This |
1386 | // might be needed to make this fit into a CSINV pattern. | |
1387 | if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { | |
1388 | std::swap(TVal, FVal); | |
1389 | std::swap(CTVal, CFVal); | |
1390 | CC = ISD::getSetCCInverse(CC, true); | |
1391 | } | |
970d7e83 | 1392 | |
1a4d82fc JJ |
1393 | // If the constants line up, perform the transform! |
1394 | if (CTVal->isNullValue() && CFVal->isAllOnesValue()) { | |
1395 | SDValue CCVal; | |
1396 | SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); | |
970d7e83 | 1397 | |
1a4d82fc JJ |
1398 | FVal = Other; |
1399 | TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other, | |
1400 | DAG.getConstant(-1ULL, Other.getValueType())); | |
970d7e83 | 1401 | |
1a4d82fc JJ |
1402 | return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal, |
1403 | CCVal, Cmp); | |
1404 | } | |
970d7e83 | 1405 | |
1a4d82fc JJ |
1406 | return Op; |
1407 | } | |
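// A hedged sketch of the fold above, with illustrative register names:
//   r = x ^ (a < b ? 0 : -1)
// becomes r = (a < b) ? x : ~x, which the selector can match as
//   cmp   w_a, w_b
//   csinv w_r, w_x, w_x, lt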
970d7e83 | 1408 | |
1a4d82fc JJ |
1409 | static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) { |
1410 | EVT VT = Op.getValueType(); | |
1411 | ||
1412 | // Let legalize expand this if it isn't a legal type yet. | |
1413 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) | |
1414 | return SDValue(); | |
1415 | ||
1416 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); | |
970d7e83 | 1417 | |
1a4d82fc JJ |
1418 | unsigned Opc; |
1419 | bool ExtraOp = false; | |
1420 | switch (Op.getOpcode()) { | |
1421 | default: | |
1422 | llvm_unreachable("Invalid code"); | |
1423 | case ISD::ADDC: | |
1424 | Opc = AArch64ISD::ADDS; | |
1425 | break; | |
1426 | case ISD::SUBC: | |
1427 | Opc = AArch64ISD::SUBS; | |
1428 | break; | |
1429 | case ISD::ADDE: | |
1430 | Opc = AArch64ISD::ADCS; | |
1431 | ExtraOp = true; | |
1432 | break; | |
1433 | case ISD::SUBE: | |
1434 | Opc = AArch64ISD::SBCS; | |
1435 | ExtraOp = true; | |
1436 | break; | |
970d7e83 LB |
1437 | } |
1438 | ||
1a4d82fc JJ |
1439 | if (!ExtraOp) |
1440 | return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1)); | |
1441 | return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1), | |
1442 | Op.getOperand(2)); | |
1443 | } | |
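// e.g. once legalization splits an i128 add, the halves map to (hedged
// sketch, register names illustrative):
//   ISD::ADDC -> ADDS x_lo, x_lo0, x_lo1   ; sets the carry in NZCV
//   ISD::ADDE -> ADCS x_hi, x_hi0, x_hi1   ; consumes that carry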
970d7e83 | 1444 | |
1a4d82fc JJ |
1445 | static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { |
1446 | // Let legalize expand this if it isn't a legal type yet. | |
1447 | if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType())) | |
1448 | return SDValue(); | |
970d7e83 | 1449 | |
1a4d82fc JJ |
1450 | AArch64CC::CondCode CC; |
1451 | // The actual operation that sets the overflow or carry flag. | |
1452 | SDValue Value, Overflow; | |
1453 | std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG); | |
970d7e83 | 1454 | |
1a4d82fc JJ |
1455 | // We use 0 and 1 as false and true values. |
1456 | SDValue TVal = DAG.getConstant(1, MVT::i32); | |
1457 | SDValue FVal = DAG.getConstant(0, MVT::i32); | |
970d7e83 | 1458 | |
1a4d82fc JJ |
1459 | // We use an inverted condition, because the conditional select is inverted |
1460 | // too. This will allow it to be selected to a single instruction: | |
1461 | // CSINC Wd, WZR, WZR, invert(cond). | |
1462 | SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32); | |
1463 | Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, | |
1464 | CCVal, Overflow); | |
1465 | ||
1466 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); | |
1467 | return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow); | |
970d7e83 LB |
1468 | } |
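// Sketch: with the inverted condition, the overflow bit materializes as
//   csinc wd, wzr, wzr, invert(cond)   ; wd = cond ? 1 : 0
// e.g. for SADDO the condition is VS, so wd is 1 exactly when V was set.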
1469 | ||
1a4d82fc JJ |
1470 | // Prefetch operands are: |
1471 | // 1: Address to prefetch | |
1472 | // 2: bool isWrite | |
1473 | // 3: int locality (0 = no locality ... 3 = extreme locality) | |
1474 | // 4: bool isDataCache | |
1475 | static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) { | |
1476 | SDLoc DL(Op); | |
1477 | unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); | |
1478 | unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); | |
1479 | unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue(); | |
1480 | ||
1481 | bool IsStream = !Locality; | |
1482 | // When the locality number is set | |
1483 | if (Locality) { | |
1484 | // The front-end should have filtered out the out-of-range values | |
1485 | assert(Locality <= 3 && "Prefetch locality out-of-range"); | |
1485 | // The locality degree is the opposite of the cache level: higher | |
1486 | // locality means a faster (lower-numbered) cache, so flip the number. | |
1487 | // The encoding starts at 0 for level 1. | |
1489 | Locality = 3 - Locality; | |
1490 | } | |
970d7e83 | 1491 | |
1a4d82fc JJ |
1492 | // Build the mask value encoding the expected behavior. | |
1493 | unsigned PrfOp = (IsWrite << 4) | // Load/Store bit | |
1494 | (!IsData << 3) | // IsDataCache bit | |
1495 | (Locality << 1) | // Cache level bits | |
1496 | (unsigned)IsStream; // Stream bit | |
1497 | return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0), | |
1498 | DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1)); | |
1499 | } | |
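// Worked encoding example (illustrative): a read prefetch of the data
// cache with locality 3 gives IsWrite=0, IsData=1, Locality=3-3=0 and
// IsStream=0, so PrfOp = 0b00000 (PLDL1KEEP); locality 0 instead sets the
// stream bit, giving PrfOp = 0b00001 (PLDL1STRM).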
970d7e83 | 1500 | |
1a4d82fc JJ |
1501 | SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, |
1502 | SelectionDAG &DAG) const { | |
1503 | assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); | |
970d7e83 | 1504 | |
1a4d82fc JJ |
1505 | RTLIB::Libcall LC; |
1506 | LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType()); | |
970d7e83 | 1507 | |
1a4d82fc JJ |
1508 | return LowerF128Call(Op, DAG, LC); |
1509 | } | |
1510 | ||
1511 | SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, | |
1512 | SelectionDAG &DAG) const { | |
1513 | if (Op.getOperand(0).getValueType() != MVT::f128) { | |
1514 | // It's legal except when f128 is involved | |
1515 | return Op; | |
1516 | } | |
970d7e83 | 1517 | |
1a4d82fc JJ |
1518 | RTLIB::Libcall LC; |
1519 | LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType()); | |
970d7e83 | 1520 | |
1a4d82fc JJ |
1521 | // FP_ROUND node has a second operand indicating whether it is known to be |
1522 | // precise. That doesn't take part in the LibCall so we can't directly use | |
1523 | // LowerF128Call. | |
1524 | SDValue SrcVal = Op.getOperand(0); | |
1525 | return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1, | |
1526 | /*isSigned*/ false, SDLoc(Op)).first; | |
1527 | } | |
970d7e83 | 1528 | |
1a4d82fc JJ |
1529 | static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) { |
1530 | // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. | |
1531 | // Any additional optimization in this function should be recorded | |
1532 | // in the cost tables. | |
1533 | EVT InVT = Op.getOperand(0).getValueType(); | |
1534 | EVT VT = Op.getValueType(); | |
970d7e83 | 1535 | |
1a4d82fc JJ |
1536 | if (VT.getSizeInBits() < InVT.getSizeInBits()) { |
1537 | SDLoc dl(Op); | |
1538 | SDValue Cv = | |
1539 | DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(), | |
1540 | Op.getOperand(0)); | |
1541 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv); | |
970d7e83 LB |
1542 | } |
1543 | ||
1a4d82fc JJ |
1544 | if (VT.getSizeInBits() > InVT.getSizeInBits()) { |
1545 | SDLoc dl(Op); | |
1546 | MVT ExtVT = | |
1547 | MVT::getVectorVT(MVT::getFloatingPointVT(VT.getScalarSizeInBits()), | |
1548 | VT.getVectorNumElements()); | |
1549 | SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, ExtVT, Op.getOperand(0)); | |
1550 | return DAG.getNode(Op.getOpcode(), dl, VT, Ext); | |
1551 | } | |
970d7e83 | 1552 | |
1a4d82fc JJ |
1553 | // At this point the sizes match, so the conversion is legal as-is. | |
1554 | return Op; | |
970d7e83 LB |
1555 | } |
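// Hedged examples matching the two branches above:
//   (v4i16 fp_to_sint v4f32) converts in v4i32 and then truncates;
//   (v2i64 fp_to_sint v2f32) first extends the source to v2f64.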
1556 | ||
1a4d82fc JJ |
1557 | SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, |
1558 | SelectionDAG &DAG) const { | |
1559 | if (Op.getOperand(0).getValueType().isVector()) | |
1560 | return LowerVectorFP_TO_INT(Op, DAG); | |
970d7e83 | 1561 | |
1a4d82fc JJ |
1562 | if (Op.getOperand(0).getValueType() != MVT::f128) { |
1563 | // It's legal except when f128 is involved | |
1564 | return Op; | |
1565 | } | |
970d7e83 | 1566 | |
1a4d82fc JJ |
1567 | RTLIB::Libcall LC; |
1568 | if (Op.getOpcode() == ISD::FP_TO_SINT) | |
1569 | LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType()); | |
1570 | else | |
1571 | LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType()); | |
970d7e83 | 1572 | |
1a4d82fc JJ |
1573 | SmallVector<SDValue, 2> Ops; |
1574 | for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) | |
1575 | Ops.push_back(Op.getOperand(i)); | |
970d7e83 | 1576 | |
1a4d82fc JJ |
1577 | return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false, |
1578 | SDLoc(Op)).first; | |
1579 | } | |
970d7e83 | 1580 | |
1a4d82fc JJ |
1581 | static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) { |
1582 | // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp. | |
1583 | // Any additional optimization in this function should be recorded | |
1584 | // in the cost tables. | |
1585 | EVT VT = Op.getValueType(); | |
1586 | SDLoc dl(Op); | |
1587 | SDValue In = Op.getOperand(0); | |
1588 | EVT InVT = In.getValueType(); | |
1589 | ||
1590 | if (VT.getSizeInBits() < InVT.getSizeInBits()) { | |
1591 | MVT CastVT = | |
1592 | MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), | |
1593 | InVT.getVectorNumElements()); | |
1594 | In = DAG.getNode(Op.getOpcode(), dl, CastVT, In); | |
1595 | return DAG.getNode(ISD::FP_ROUND, dl, VT, In, DAG.getIntPtrConstant(0)); | |
970d7e83 LB |
1596 | } |
1597 | ||
1a4d82fc JJ |
1598 | if (VT.getSizeInBits() > InVT.getSizeInBits()) { |
1599 | unsigned CastOpc = | |
1600 | Op.getOpcode() == ISD::SINT_TO_FP ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |
1601 | EVT CastVT = VT.changeVectorElementTypeToInteger(); | |
1602 | In = DAG.getNode(CastOpc, dl, CastVT, In); | |
1603 | return DAG.getNode(Op.getOpcode(), dl, VT, In); | |
1604 | } | |
970d7e83 | 1605 | |
1a4d82fc JJ |
1606 | return Op; |
1607 | } | |
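// Hedged examples matching the two branches above:
//   (v2f32 sint_to_fp v2i64) converts in v2f64, then rounds to v2f32;
//   (v4f32 sint_to_fp v4i16) sign-extends the source to v4i32 first.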
970d7e83 | 1608 | |
1a4d82fc JJ |
1609 | SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, |
1610 | SelectionDAG &DAG) const { | |
1611 | if (Op.getValueType().isVector()) | |
1612 | return LowerVectorINT_TO_FP(Op, DAG); | |
970d7e83 | 1613 | |
1a4d82fc JJ |
1614 | // i128 conversions are libcalls. |
1615 | if (Op.getOperand(0).getValueType() == MVT::i128) | |
1616 | return SDValue(); | |
970d7e83 | 1617 | |
1a4d82fc JJ |
1618 | // Other conversions are legal, unless it's to the completely software-based |
1619 | // fp128. | |
1620 | if (Op.getValueType() != MVT::f128) | |
1621 | return Op; | |
970d7e83 | 1622 | |
1a4d82fc JJ |
1623 | RTLIB::Libcall LC; |
1624 | if (Op.getOpcode() == ISD::SINT_TO_FP) | |
1625 | LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); | |
1626 | else | |
1627 | LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType()); | |
970d7e83 | 1628 | |
1a4d82fc JJ |
1629 | return LowerF128Call(Op, DAG, LC); |
1630 | } | |
970d7e83 | 1631 | |
1a4d82fc JJ |
1632 | SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, |
1633 | SelectionDAG &DAG) const { | |
1634 | // For iOS, we want to call an alternative entry point: __sincos_stret, | |
1635 | // which returns the values in two S / D registers. | |
1636 | SDLoc dl(Op); | |
1637 | SDValue Arg = Op.getOperand(0); | |
1638 | EVT ArgVT = Arg.getValueType(); | |
1639 | Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); | |
970d7e83 | 1640 | |
1a4d82fc JJ |
1641 | ArgListTy Args; |
1642 | ArgListEntry Entry; | |
970d7e83 | 1643 | |
1a4d82fc JJ |
1644 | Entry.Node = Arg; |
1645 | Entry.Ty = ArgTy; | |
1646 | Entry.isSExt = false; | |
1647 | Entry.isZExt = false; | |
1648 | Args.push_back(Entry); | |
970d7e83 | 1649 | |
1a4d82fc JJ |
1650 | const char *LibcallName = |
1651 | (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret"; | |
1652 | SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy()); | |
970d7e83 | 1653 | |
85aaf69f | 1654 | StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); |
1a4d82fc JJ |
1655 | TargetLowering::CallLoweringInfo CLI(DAG); |
1656 | CLI.setDebugLoc(dl).setChain(DAG.getEntryNode()) | |
1657 | .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args), 0); | |
970d7e83 | 1658 | |
1a4d82fc JJ |
1659 | std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); |
1660 | return CallResult.first; | |
1661 | } | |
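// e.g. a sinf/cosf pair on iOS becomes a single __sincosf_stret call whose
// {sin, cos} results come back in s0/s1 (d0/d1 for the double variant), as
// the comment above describes.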
970d7e83 | 1662 | |
1a4d82fc JJ |
1663 | static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) { |
1664 | if (Op.getValueType() != MVT::f16) | |
1665 | return SDValue(); | |
970d7e83 | 1666 | |
1a4d82fc JJ |
1667 | assert(Op.getOperand(0).getValueType() == MVT::i16); |
1668 | SDLoc DL(Op); | |
970d7e83 | 1669 | |
1a4d82fc JJ |
1670 | Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0)); |
1671 | Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op); | |
1672 | return SDValue( | |
1673 | DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op, | |
1674 | DAG.getTargetConstant(AArch64::hsub, MVT::i32)), | |
1675 | 0); | |
1676 | } | |
970d7e83 | 1677 | |
85aaf69f SL |
1678 | static EVT getExtensionTo64Bits(const EVT &OrigVT) { |
1679 | if (OrigVT.getSizeInBits() >= 64) | |
1680 | return OrigVT; | |
1681 | ||
1682 | assert(OrigVT.isSimple() && "Expecting a simple value type"); | |
1683 | ||
1684 | MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; | |
1685 | switch (OrigSimpleTy) { | |
1686 | default: llvm_unreachable("Unexpected Vector Type"); | |
1687 | case MVT::v2i8: | |
1688 | case MVT::v2i16: | |
1689 | return MVT::v2i32; | |
1690 | case MVT::v4i8: | |
1691 | return MVT::v4i16; | |
1692 | } | |
1693 | } | |
1694 | ||
1695 | static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG, | |
1696 | const EVT &OrigTy, | |
1697 | const EVT &ExtTy, | |
1698 | unsigned ExtOpcode) { | |
1699 | // The vector originally had a size of OrigTy. It was then extended to ExtTy. | |
1700 | // We expect the ExtTy to be 128-bits total. If the OrigTy is less than | |
1701 | // 64-bits we need to insert a new extension so that it will be 64-bits. | |
1702 | assert(ExtTy.is128BitVector() && "Unexpected extension size"); | |
1703 | if (OrigTy.getSizeInBits() >= 64) | |
1704 | return N; | |
1705 | ||
1706 | // Must extend size to at least 64 bits to be used as an operand for VMULL. | |
1707 | EVT NewVT = getExtensionTo64Bits(OrigTy); | |
1708 | ||
1709 | return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N); | |
1710 | } | |
1711 | ||
1712 | static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG, | |
1713 | bool isSigned) { | |
1714 | EVT VT = N->getValueType(0); | |
1715 | ||
1716 | if (N->getOpcode() != ISD::BUILD_VECTOR) | |
1717 | return false; | |
1718 | ||
1719 | for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { | |
1720 | SDNode *Elt = N->getOperand(i).getNode(); | |
1721 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) { | |
1722 | unsigned EltSize = VT.getVectorElementType().getSizeInBits(); | |
1723 | unsigned HalfSize = EltSize / 2; | |
1724 | if (isSigned) { | |
1725 | if (!isIntN(HalfSize, C->getSExtValue())) | |
1726 | return false; | |
1727 | } else { | |
1728 | if (!isUIntN(HalfSize, C->getZExtValue())) | |
1729 | return false; | |
1730 | } | |
1731 | continue; | |
1732 | } | |
1733 | return false; | |
1734 | } | |
1735 | ||
1736 | return true; | |
1737 | } | |
1738 | ||
1739 | static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) { | |
1740 | if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND) | |
1741 | return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG, | |
1742 | N->getOperand(0)->getValueType(0), | |
1743 | N->getValueType(0), | |
1744 | N->getOpcode()); | |
1745 | ||
1746 | assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR"); | |
1747 | EVT VT = N->getValueType(0); | |
1748 | unsigned EltSize = VT.getVectorElementType().getSizeInBits() / 2; | |
1749 | unsigned NumElts = VT.getVectorNumElements(); | |
1750 | MVT TruncVT = MVT::getIntegerVT(EltSize); | |
1751 | SmallVector<SDValue, 8> Ops; | |
1752 | for (unsigned i = 0; i != NumElts; ++i) { | |
1753 | ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i)); | |
1754 | const APInt &CInt = C->getAPIntValue(); | |
1755 | // Element types smaller than 32 bits are not legal, so use i32 elements. | |
1756 | // The values are implicitly truncated so sext vs. zext doesn't matter. | |
1757 | Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32)); | |
1758 | } | |
1759 | return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), | |
1760 | MVT::getVectorVT(TruncVT, NumElts), Ops); | |
1761 | } | |
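// e.g. (zext v8i8 %x to v8i16) simply yields %x here, while a v8i16
// BUILD_VECTOR of half-width constants is rebuilt as a v8i8 vector (with
// i32 element constants that are implicitly truncated).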
1762 | ||
1763 | static bool isSignExtended(SDNode *N, SelectionDAG &DAG) { | |
1764 | if (N->getOpcode() == ISD::SIGN_EXTEND) | |
1765 | return true; | |
1766 | if (isExtendedBUILD_VECTOR(N, DAG, true)) | |
1767 | return true; | |
1768 | return false; | |
1769 | } | |
1770 | ||
1771 | static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { | |
1772 | if (N->getOpcode() == ISD::ZERO_EXTEND) | |
1773 | return true; | |
1774 | if (isExtendedBUILD_VECTOR(N, DAG, false)) | |
1775 | return true; | |
1776 | return false; | |
1777 | } | |
1778 | ||
1779 | static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) { | |
1780 | unsigned Opcode = N->getOpcode(); | |
1781 | if (Opcode == ISD::ADD || Opcode == ISD::SUB) { | |
1782 | SDNode *N0 = N->getOperand(0).getNode(); | |
1783 | SDNode *N1 = N->getOperand(1).getNode(); | |
1784 | return N0->hasOneUse() && N1->hasOneUse() && | |
1785 | isSignExtended(N0, DAG) && isSignExtended(N1, DAG); | |
1786 | } | |
1787 | return false; | |
1788 | } | |
1789 | ||
1790 | static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) { | |
1791 | unsigned Opcode = N->getOpcode(); | |
1792 | if (Opcode == ISD::ADD || Opcode == ISD::SUB) { | |
1793 | SDNode *N0 = N->getOperand(0).getNode(); | |
1794 | SDNode *N1 = N->getOperand(1).getNode(); | |
1795 | return N0->hasOneUse() && N1->hasOneUse() && | |
1796 | isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG); | |
1797 | } | |
1798 | return false; | |
1799 | } | |
1800 | ||
1801 | static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { | |
1802 | // Multiplications are only custom-lowered for 128-bit vectors so that | |
1803 | // VMULL can be detected. Otherwise v2i64 multiplications are not legal. | |
1804 | EVT VT = Op.getValueType(); | |
1805 | assert(VT.is128BitVector() && VT.isInteger() && | |
1806 | "unexpected type for custom-lowering ISD::MUL"); | |
1807 | SDNode *N0 = Op.getOperand(0).getNode(); | |
1808 | SDNode *N1 = Op.getOperand(1).getNode(); | |
1809 | unsigned NewOpc = 0; | |
1810 | bool isMLA = false; | |
1811 | bool isN0SExt = isSignExtended(N0, DAG); | |
1812 | bool isN1SExt = isSignExtended(N1, DAG); | |
1813 | if (isN0SExt && isN1SExt) | |
1814 | NewOpc = AArch64ISD::SMULL; | |
1815 | else { | |
1816 | bool isN0ZExt = isZeroExtended(N0, DAG); | |
1817 | bool isN1ZExt = isZeroExtended(N1, DAG); | |
1818 | if (isN0ZExt && isN1ZExt) | |
1819 | NewOpc = AArch64ISD::UMULL; | |
1820 | else if (isN1SExt || isN1ZExt) { | |
1821 | // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these | |
1822 | // into (s/zext A * s/zext C) + (s/zext B * s/zext C) | |
1823 | if (isN1SExt && isAddSubSExt(N0, DAG)) { | |
1824 | NewOpc = AArch64ISD::SMULL; | |
1825 | isMLA = true; | |
1826 | } else if (isN1ZExt && isAddSubZExt(N0, DAG)) { | |
1827 | NewOpc = AArch64ISD::UMULL; | |
1828 | isMLA = true; | |
1829 | } else if (isN0ZExt && isAddSubZExt(N1, DAG)) { | |
1830 | std::swap(N0, N1); | |
1831 | NewOpc = AArch64ISD::UMULL; | |
1832 | isMLA = true; | |
1833 | } | |
1834 | } | |
1835 | ||
1836 | if (!NewOpc) { | |
1837 | if (VT == MVT::v2i64) | |
1838 | // Fall through to expand this. It is not legal. | |
1839 | return SDValue(); | |
1840 | else | |
1841 | // Other vector multiplications are legal. | |
1842 | return Op; | |
1843 | } | |
1844 | } | |
1845 | ||
1846 | // Legalize to a S/UMULL instruction | |
1847 | SDLoc DL(Op); | |
1848 | SDValue Op0; | |
1849 | SDValue Op1 = skipExtensionForVectorMULL(N1, DAG); | |
1850 | if (!isMLA) { | |
1851 | Op0 = skipExtensionForVectorMULL(N0, DAG); | |
1852 | assert(Op0.getValueType().is64BitVector() && | |
1853 | Op1.getValueType().is64BitVector() && | |
1854 | "unexpected types for extended operands to VMULL"); | |
1855 | return DAG.getNode(NewOpc, DL, VT, Op0, Op1); | |
1856 | } | |
1857 | // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during | |
1858 | // isel lowering to take advantage of no-stall back to back s/umul + s/umla. | |
1859 | // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57. | |
1860 | SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG); | |
1861 | SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG); | |
1862 | EVT Op1VT = Op1.getValueType(); | |
1863 | return DAG.getNode(N0->getOpcode(), DL, VT, | |
1864 | DAG.getNode(NewOpc, DL, VT, | |
1865 | DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1), | |
1866 | DAG.getNode(NewOpc, DL, VT, | |
1867 | DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); | |
1868 | } | |
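// Hedged examples of the two paths above:
//   (v2i64 mul (sext v2i32 %a), (sext v2i32 %b))      -> SMULL %a, %b
//   (v2i64 mul (add (zext %A), (zext %B)), (zext %C)) -> UMULL(%A, %C) +
//                                                        UMULL(%B, %C)
// the latter via the isMLA path, so the accumulate can forward on cores
// such as Cortex-A53/A57.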
970d7e83 | 1869 | |
1a4d82fc JJ |
1870 | SDValue AArch64TargetLowering::LowerOperation(SDValue Op, |
1871 | SelectionDAG &DAG) const { | |
1872 | switch (Op.getOpcode()) { | |
1873 | default: | |
1874 | llvm_unreachable("unimplemented operand"); | |
1875 | return SDValue(); | |
1876 | case ISD::BITCAST: | |
1877 | return LowerBITCAST(Op, DAG); | |
1878 | case ISD::GlobalAddress: | |
1879 | return LowerGlobalAddress(Op, DAG); | |
1880 | case ISD::GlobalTLSAddress: | |
1881 | return LowerGlobalTLSAddress(Op, DAG); | |
1882 | case ISD::SETCC: | |
1883 | return LowerSETCC(Op, DAG); | |
1884 | case ISD::BR_CC: | |
1885 | return LowerBR_CC(Op, DAG); | |
1886 | case ISD::SELECT: | |
1887 | return LowerSELECT(Op, DAG); | |
1888 | case ISD::SELECT_CC: | |
1889 | return LowerSELECT_CC(Op, DAG); | |
1890 | case ISD::JumpTable: | |
1891 | return LowerJumpTable(Op, DAG); | |
1892 | case ISD::ConstantPool: | |
1893 | return LowerConstantPool(Op, DAG); | |
1894 | case ISD::BlockAddress: | |
1895 | return LowerBlockAddress(Op, DAG); | |
1896 | case ISD::VASTART: | |
1897 | return LowerVASTART(Op, DAG); | |
1898 | case ISD::VACOPY: | |
1899 | return LowerVACOPY(Op, DAG); | |
1900 | case ISD::VAARG: | |
1901 | return LowerVAARG(Op, DAG); | |
1902 | case ISD::ADDC: | |
1903 | case ISD::ADDE: | |
1904 | case ISD::SUBC: | |
1905 | case ISD::SUBE: | |
1906 | return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); | |
1907 | case ISD::SADDO: | |
1908 | case ISD::UADDO: | |
1909 | case ISD::SSUBO: | |
1910 | case ISD::USUBO: | |
1911 | case ISD::SMULO: | |
1912 | case ISD::UMULO: | |
1913 | return LowerXALUO(Op, DAG); | |
1914 | case ISD::FADD: | |
1915 | return LowerF128Call(Op, DAG, RTLIB::ADD_F128); | |
1916 | case ISD::FSUB: | |
1917 | return LowerF128Call(Op, DAG, RTLIB::SUB_F128); | |
1918 | case ISD::FMUL: | |
1919 | return LowerF128Call(Op, DAG, RTLIB::MUL_F128); | |
1920 | case ISD::FDIV: | |
1921 | return LowerF128Call(Op, DAG, RTLIB::DIV_F128); | |
1922 | case ISD::FP_ROUND: | |
1923 | return LowerFP_ROUND(Op, DAG); | |
1924 | case ISD::FP_EXTEND: | |
1925 | return LowerFP_EXTEND(Op, DAG); | |
1926 | case ISD::FRAMEADDR: | |
1927 | return LowerFRAMEADDR(Op, DAG); | |
1928 | case ISD::RETURNADDR: | |
1929 | return LowerRETURNADDR(Op, DAG); | |
1930 | case ISD::INSERT_VECTOR_ELT: | |
1931 | return LowerINSERT_VECTOR_ELT(Op, DAG); | |
1932 | case ISD::EXTRACT_VECTOR_ELT: | |
1933 | return LowerEXTRACT_VECTOR_ELT(Op, DAG); | |
1934 | case ISD::BUILD_VECTOR: | |
1935 | return LowerBUILD_VECTOR(Op, DAG); | |
1936 | case ISD::VECTOR_SHUFFLE: | |
1937 | return LowerVECTOR_SHUFFLE(Op, DAG); | |
1938 | case ISD::EXTRACT_SUBVECTOR: | |
1939 | return LowerEXTRACT_SUBVECTOR(Op, DAG); | |
1940 | case ISD::SRA: | |
1941 | case ISD::SRL: | |
1942 | case ISD::SHL: | |
1943 | return LowerVectorSRA_SRL_SHL(Op, DAG); | |
1944 | case ISD::SHL_PARTS: | |
1945 | return LowerShiftLeftParts(Op, DAG); | |
1946 | case ISD::SRL_PARTS: | |
1947 | case ISD::SRA_PARTS: | |
1948 | return LowerShiftRightParts(Op, DAG); | |
1949 | case ISD::CTPOP: | |
1950 | return LowerCTPOP(Op, DAG); | |
1951 | case ISD::FCOPYSIGN: | |
1952 | return LowerFCOPYSIGN(Op, DAG); | |
1953 | case ISD::AND: | |
1954 | return LowerVectorAND(Op, DAG); | |
1955 | case ISD::OR: | |
1956 | return LowerVectorOR(Op, DAG); | |
1957 | case ISD::XOR: | |
1958 | return LowerXOR(Op, DAG); | |
1959 | case ISD::PREFETCH: | |
1960 | return LowerPREFETCH(Op, DAG); | |
1961 | case ISD::SINT_TO_FP: | |
1962 | case ISD::UINT_TO_FP: | |
1963 | return LowerINT_TO_FP(Op, DAG); | |
1964 | case ISD::FP_TO_SINT: | |
1965 | case ISD::FP_TO_UINT: | |
1966 | return LowerFP_TO_INT(Op, DAG); | |
1967 | case ISD::FSINCOS: | |
1968 | return LowerFSINCOS(Op, DAG); | |
85aaf69f SL |
1969 | case ISD::MUL: |
1970 | return LowerMUL(Op, DAG); | |
970d7e83 | 1971 | } |
1a4d82fc | 1972 | } |
970d7e83 | 1973 | |
1a4d82fc JJ |
1974 | /// getFunctionAlignment - Return the Log2 alignment of this function. |
1975 | unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const { | |
1976 | return 2; | |
1977 | } | |
970d7e83 | 1978 | |
1a4d82fc JJ |
1979 | //===----------------------------------------------------------------------===// |
1980 | // Calling Convention Implementation | |
1981 | //===----------------------------------------------------------------------===// | |
970d7e83 | 1982 | |
1a4d82fc | 1983 | #include "AArch64GenCallingConv.inc" |
970d7e83 | 1984 | |
1a4d82fc JJ |
1985 | /// Selects the correct CCAssignFn for a given CallingConvention value. |
1986 | CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, | |
1987 | bool IsVarArg) const { | |
1988 | switch (CC) { | |
1989 | default: | |
1990 | llvm_unreachable("Unsupported calling convention."); | |
1991 | case CallingConv::WebKit_JS: | |
1992 | return CC_AArch64_WebKit_JS; | |
85aaf69f SL |
1993 | case CallingConv::GHC: |
1994 | return CC_AArch64_GHC; | |
1a4d82fc JJ |
1995 | case CallingConv::C: |
1996 | case CallingConv::Fast: | |
1997 | if (!Subtarget->isTargetDarwin()) | |
1998 | return CC_AArch64_AAPCS; | |
1999 | return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; | |
970d7e83 | 2000 | } |
1a4d82fc | 2001 | } |
970d7e83 | 2002 | |
1a4d82fc JJ |
2003 | SDValue AArch64TargetLowering::LowerFormalArguments( |
2004 | SDValue Chain, CallingConv::ID CallConv, bool isVarArg, | |
2005 | const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, | |
2006 | SmallVectorImpl<SDValue> &InVals) const { | |
2007 | MachineFunction &MF = DAG.getMachineFunction(); | |
2008 | MachineFrameInfo *MFI = MF.getFrameInfo(); | |
2009 | ||
2010 | // Assign locations to all of the incoming arguments. | |
2011 | SmallVector<CCValAssign, 16> ArgLocs; | |
2012 | CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs, | |
2013 | *DAG.getContext()); | |
2014 | ||
2015 | // At this point, Ins[].VT may already be promoted to i32. To correctly | |
2016 | // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and | |
2017 | // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. | |
2018 | // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here | |
2019 | // we use a special version of AnalyzeFormalArguments to pass in ValVT and | |
2020 | // LocVT. | |
2021 | unsigned NumArgs = Ins.size(); | |
2022 | Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin(); | |
2023 | unsigned CurArgIdx = 0; | |
2024 | for (unsigned i = 0; i != NumArgs; ++i) { | |
2025 | MVT ValVT = Ins[i].VT; | |
2026 | std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx); | |
2027 | CurArgIdx = Ins[i].OrigArgIndex; | |
2028 | ||
2029 | // Get type of the original argument. | |
2030 | EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true); | |
2031 | MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other; | |
2032 | // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. | |
2033 | if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) | |
2034 | ValVT = MVT::i8; | |
2035 | else if (ActualMVT == MVT::i16) | |
2036 | ValVT = MVT::i16; | |
2037 | ||
2038 | CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); | |
2039 | bool Res = | |
2040 | AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo); | |
2041 | assert(!Res && "Call operand has unhandled type"); | |
2042 | (void)Res; | |
970d7e83 | 2043 | } |
1a4d82fc JJ |
2044 | assert(ArgLocs.size() == Ins.size()); |
2045 | SmallVector<SDValue, 16> ArgValues; | |
2046 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { | |
2047 | CCValAssign &VA = ArgLocs[i]; | |
970d7e83 | 2048 | |
1a4d82fc JJ |
2049 | if (Ins[i].Flags.isByVal()) { |
2050 | // Byval is used for HFAs in the PCS, but the system should work in a | |
2051 | // non-compliant manner for larger structs. | |
2052 | EVT PtrTy = getPointerTy(); | |
2053 | int Size = Ins[i].Flags.getByValSize(); | |
2054 | unsigned NumRegs = (Size + 7) / 8; | |
970d7e83 | 2055 | |
1a4d82fc JJ |
2056 | // FIXME: This works on big-endian for composite byvals, which are the common |
2057 | // case. It should also work for fundamental types. | |
2058 | unsigned FrameIdx = | |
2059 | MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false); | |
2060 | SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy); | |
2061 | InVals.push_back(FrameIdxN); | |
970d7e83 | 2062 | |
1a4d82fc JJ |
2063 | continue; |
2064 | } | |
2065 | ||
2066 | if (VA.isRegLoc()) { | |
2067 | // Arguments stored in registers. | |
2068 | EVT RegVT = VA.getLocVT(); | |
2069 | ||
2070 | SDValue ArgValue; | |
2071 | const TargetRegisterClass *RC; | |
2072 | ||
2073 | if (RegVT == MVT::i32) | |
2074 | RC = &AArch64::GPR32RegClass; | |
2075 | else if (RegVT == MVT::i64) | |
2076 | RC = &AArch64::GPR64RegClass; | |
2077 | else if (RegVT == MVT::f16) | |
2078 | RC = &AArch64::FPR16RegClass; | |
2079 | else if (RegVT == MVT::f32) | |
2080 | RC = &AArch64::FPR32RegClass; | |
2081 | else if (RegVT == MVT::f64 || RegVT.is64BitVector()) | |
2082 | RC = &AArch64::FPR64RegClass; | |
2083 | else if (RegVT == MVT::f128 || RegVT.is128BitVector()) | |
2084 | RC = &AArch64::FPR128RegClass; | |
2085 | else | |
2086 | llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering"); | |
2087 | ||
2088 | // Transform the arguments in physical registers into virtual ones. | |
2089 | unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC); | |
2090 | ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT); | |
2091 | ||
2092 | // If this is an 8, 16 or 32-bit value, it is really passed promoted | |
2093 | // to 64 bits. Insert an assert[sz]ext to capture this, then | |
2094 | // truncate to the right size. | |
2095 | switch (VA.getLocInfo()) { | |
2096 | default: | |
2097 | llvm_unreachable("Unknown loc info!"); | |
2098 | case CCValAssign::Full: | |
2099 | break; | |
2100 | case CCValAssign::BCvt: | |
2101 | ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue); | |
2102 | break; | |
2103 | case CCValAssign::AExt: | |
2104 | case CCValAssign::SExt: | |
2105 | case CCValAssign::ZExt: | |
2106 | // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt | |
2107 | // nodes after our lowering. | |
2108 | assert(RegVT == Ins[i].VT && "incorrect register location selected"); | |
2109 | break; | |
2110 | } | |
970d7e83 | 2111 | |
1a4d82fc | 2112 | InVals.push_back(ArgValue); |
970d7e83 | 2113 | |
1a4d82fc JJ |
2114 | } else { // VA.isRegLoc() |
2115 | assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem"); | |
2116 | unsigned ArgOffset = VA.getLocMemOffset(); | |
2117 | unsigned ArgSize = VA.getValVT().getSizeInBits() / 8; | |
970d7e83 | 2118 | |
1a4d82fc | 2119 | uint32_t BEAlign = 0; |
85aaf69f SL |
2120 | if (!Subtarget->isLittleEndian() && ArgSize < 8 && |
2121 | !Ins[i].Flags.isInConsecutiveRegs()) | |
1a4d82fc | 2122 | BEAlign = 8 - ArgSize; |
970d7e83 | 2123 | |
1a4d82fc | 2124 | int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true); |
970d7e83 | 2125 | |
1a4d82fc JJ |
2126 | // Create load nodes to retrieve arguments from the stack. |
2127 | SDValue FIN = DAG.getFrameIndex(FI, getPointerTy()); | |
2128 | SDValue ArgValue; | |
2129 | ||
2130 | // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT) | |
2131 | ISD::LoadExtType ExtType = ISD::NON_EXTLOAD; | |
2132 | MVT MemVT = VA.getValVT(); | |
2133 | ||
2134 | switch (VA.getLocInfo()) { | |
2135 | default: | |
2136 | break; | |
2137 | case CCValAssign::BCvt: | |
2138 | MemVT = VA.getLocVT(); | |
2139 | break; | |
2140 | case CCValAssign::SExt: | |
2141 | ExtType = ISD::SEXTLOAD; | |
2142 | break; | |
2143 | case CCValAssign::ZExt: | |
2144 | ExtType = ISD::ZEXTLOAD; | |
2145 | break; | |
2146 | case CCValAssign::AExt: | |
2147 | ExtType = ISD::EXTLOAD; | |
2148 | break; | |
2149 | } | |
2150 | ||
2151 | ArgValue = DAG.getExtLoad(ExtType, DL, VA.getLocVT(), Chain, FIN, | |
2152 | MachinePointerInfo::getFixedStack(FI), | |
85aaf69f | 2153 | MemVT, false, false, false, 0); |
1a4d82fc JJ |
2154 | |
2155 | InVals.push_back(ArgValue); | |
2156 | } | |
970d7e83 LB |
2157 | } |
2158 | ||
1a4d82fc JJ |
2159 | // varargs |
2160 | if (isVarArg) { | |
2161 | if (!Subtarget->isTargetDarwin()) { | |
2162 | // The AAPCS variadic function ABI is identical to the non-variadic | |
2163 | // one. As a result there may be more arguments in registers and we should | |
2164 | // save them for future reference. | |
2165 | saveVarArgRegisters(CCInfo, DAG, DL, Chain); | |
2166 | } | |
970d7e83 | 2167 | |
1a4d82fc JJ |
2168 | AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); |
2169 | // This will point to the next argument passed via stack. | |
2170 | unsigned StackOffset = CCInfo.getNextStackOffset(); | |
2171 | // We currently pass all varargs at 8-byte alignment. | |
2172 | StackOffset = ((StackOffset + 7) & ~7); | |
2173 | AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true)); | |
2174 | } | |
970d7e83 | 2175 | |
1a4d82fc JJ |
2176 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
2177 | unsigned StackArgSize = CCInfo.getNextStackOffset(); | |
2178 | bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; | |
2179 | if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) { | |
2180 | // This is a non-standard ABI so by fiat I say we're allowed to make full | |
2181 | // use of the stack area to be popped, which must be aligned to 16 bytes in | |
2182 | // any case: | |
2183 | StackArgSize = RoundUpToAlignment(StackArgSize, 16); | |
2184 | ||
2185 | // If we're expected to restore the stack (e.g. fastcc) then we'll be adding | |
2186 | // a multiple of 16. | |
2187 | FuncInfo->setArgumentStackToRestore(StackArgSize); | |
2188 | ||
2189 | // This realignment carries over to the available bytes below. Our own | |
2190 | // callers will guarantee the space is free by giving an aligned value to | |
2191 | // CALLSEQ_START. | |
970d7e83 | 2192 | } |
1a4d82fc JJ |
2193 | // Even if we're not expected to free up the space, it's useful to know how |
2194 | // much is there while considering tail calls (because we can reuse it). | |
2195 | FuncInfo->setBytesInStackArgArea(StackArgSize); | |
970d7e83 | 2196 | |
1a4d82fc | 2197 | return Chain; |
970d7e83 LB |
2198 | } |
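// Hedged illustration: under GuaranteedTailCallOpt, a fastcc function that
// received 20 bytes of stack arguments records StackArgSize = 32 (rounded
// up to 16), so its epilogue pops a 16-byte-aligned area.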
2199 | ||
1a4d82fc JJ |
2200 | void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, |
2201 | SelectionDAG &DAG, SDLoc DL, | |
2202 | SDValue &Chain) const { | |
2203 | MachineFunction &MF = DAG.getMachineFunction(); | |
2204 | MachineFrameInfo *MFI = MF.getFrameInfo(); | |
2205 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); | |
2206 | ||
2207 | SmallVector<SDValue, 8> MemOps; | |
2208 | ||
2209 | static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2, | |
2210 | AArch64::X3, AArch64::X4, AArch64::X5, | |
2211 | AArch64::X6, AArch64::X7 }; | |
2212 | static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs); | |
2213 | unsigned FirstVariadicGPR = | |
2214 | CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs); | |
2215 | ||
2216 | unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); | |
2217 | int GPRIdx = 0; | |
2218 | if (GPRSaveSize != 0) { | |
2219 | GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false); | |
2220 | ||
2221 | SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy()); | |
2222 | ||
2223 | for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) { | |
2224 | unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass); | |
2225 | SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); | |
2226 | SDValue Store = | |
2227 | DAG.getStore(Val.getValue(1), DL, Val, FIN, | |
2228 | MachinePointerInfo::getStack(i * 8), false, false, 0); | |
2229 | MemOps.push_back(Store); | |
2230 | FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, | |
2231 | DAG.getConstant(8, getPointerTy())); | |
2232 | } | |
2233 | } | |
2234 | FuncInfo->setVarArgsGPRIndex(GPRIdx); | |
2235 | FuncInfo->setVarArgsGPRSize(GPRSaveSize); | |
2236 | ||
2237 | if (Subtarget->hasFPARMv8()) { | |
2238 | static const MCPhysReg FPRArgRegs[] = { | |
2239 | AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, | |
2240 | AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; | |
2241 | static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs); | |
2242 | unsigned FirstVariadicFPR = | |
2243 | CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs); | |
2244 | ||
2245 | unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR); | |
2246 | int FPRIdx = 0; | |
2247 | if (FPRSaveSize != 0) { | |
2248 | FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false); | |
2249 | ||
2250 | SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy()); | |
2251 | ||
2252 | for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) { | |
2253 | unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass); | |
2254 | SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128); | |
2255 | ||
2256 | SDValue Store = | |
2257 | DAG.getStore(Val.getValue(1), DL, Val, FIN, | |
2258 | MachinePointerInfo::getStack(i * 16), false, false, 0); | |
2259 | MemOps.push_back(Store); | |
2260 | FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, | |
2261 | DAG.getConstant(16, getPointerTy())); | |
2262 | } | |
2263 | } | |
2264 | FuncInfo->setVarArgsFPRIndex(FPRIdx); | |
2265 | FuncInfo->setVarArgsFPRSize(FPRSaveSize); | |
2266 | } | |
2267 | ||
2268 | if (!MemOps.empty()) { | |
2269 | Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); | |
2270 | } | |
2271 | } | |
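// Hedged example: for an AAPCS variadic "int f(int a, ...)", X0 carries
// 'a', so FirstVariadicGPR == 1 and X1..X7 are spilled (GPRSaveSize ==
// 56); with FPARMv8, Q0..Q7 are likewise saved (128 bytes) so va_arg can
// recover register-passed variadic values from the two save areas.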
2272 | ||
2273 | /// LowerCallResult - Lower the result values of a call into the | |
2274 | /// appropriate copies out of appropriate physical registers. | |
2275 | SDValue AArch64TargetLowering::LowerCallResult( | |
2276 | SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, | |
2277 | const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG, | |
2278 | SmallVectorImpl<SDValue> &InVals, bool isThisReturn, | |
2279 | SDValue ThisVal) const { | |
2280 | CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS | |
2281 | ? RetCC_AArch64_WebKit_JS | |
2282 | : RetCC_AArch64_AAPCS; | |
970d7e83 LB |
2283 | // Assign locations to each value returned by this call. |
2284 | SmallVector<CCValAssign, 16> RVLocs; | |
1a4d82fc JJ |
2285 | CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, |
2286 | *DAG.getContext()); | |
2287 | CCInfo.AnalyzeCallResult(Ins, RetCC); | |
970d7e83 | 2288 | |
1a4d82fc | 2289 | // Copy all of the result registers out of their specified physreg. |
970d7e83 LB |
2290 | for (unsigned i = 0; i != RVLocs.size(); ++i) { |
2291 | CCValAssign VA = RVLocs[i]; | |
2292 | ||
1a4d82fc JJ |
2293 | // Pass 'this' value directly from the argument to return value, to avoid |
2294 | // reg unit interference | |
2295 | if (i == 0 && isThisReturn) { | |
2296 | assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 && | |
2297 | "unexpected return calling convention register assignment"); | |
2298 | InVals.push_back(ThisVal); | |
2299 | continue; | |
2300 | } | |
970d7e83 | 2301 | |
1a4d82fc JJ |
2302 | SDValue Val = |
2303 | DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag); | |
970d7e83 LB |
2304 | Chain = Val.getValue(1); |
2305 | InFlag = Val.getValue(2); | |
2306 | ||
2307 | switch (VA.getLocInfo()) { | |
1a4d82fc JJ |
2308 | default: |
2309 | llvm_unreachable("Unknown loc info!"); | |
2310 | case CCValAssign::Full: | |
970d7e83 | 2311 | break; |
1a4d82fc JJ |
2312 | case CCValAssign::BCvt: |
2313 | Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); | |
970d7e83 LB |
2314 | break; |
2315 | } | |
2316 | ||
2317 | InVals.push_back(Val); | |
2318 | } | |
2319 | ||
2320 | return Chain; | |
2321 | } | |
2322 | ||
1a4d82fc JJ |
2323 | bool AArch64TargetLowering::isEligibleForTailCallOptimization( |
2324 | SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, | |
2325 | bool isCalleeStructRet, bool isCallerStructRet, | |
2326 | const SmallVectorImpl<ISD::OutputArg> &Outs, | |
2327 | const SmallVectorImpl<SDValue> &OutVals, | |
2328 | const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { | |
970d7e83 LB |
2329 | // For CallingConv::C this function knows whether the ABI needs |
2330 | // changing. That's not true for other conventions so they will have to opt in | |
2331 | // manually. | |
2332 | if (!IsTailCallConvention(CalleeCC) && CalleeCC != CallingConv::C) | |
2333 | return false; | |
2334 | ||
2335 | const MachineFunction &MF = DAG.getMachineFunction(); | |
2336 | const Function *CallerF = MF.getFunction(); | |
2337 | CallingConv::ID CallerCC = CallerF->getCallingConv(); | |
2338 | bool CCMatch = CallerCC == CalleeCC; | |
2339 | ||
2340 | // Byval parameters hand the function a pointer directly into the stack area | |
2341 | // we want to reuse during a tail call. Working around this *is* possible (see | |
2342 | // X86) but less efficient and uglier in LowerCall. | |
2343 | for (Function::const_arg_iterator i = CallerF->arg_begin(), | |
1a4d82fc JJ |
2344 | e = CallerF->arg_end(); |
2345 | i != e; ++i) | |
970d7e83 LB |
2346 | if (i->hasByValAttr()) |
2347 | return false; | |
2348 | ||
2349 | if (getTargetMachine().Options.GuaranteedTailCallOpt) { | |
2350 | if (IsTailCallConvention(CalleeCC) && CCMatch) | |
2351 | return true; | |
2352 | return false; | |
2353 | } | |
2354 | ||
1a4d82fc JJ |
2355 | // Externally-defined functions with weak linkage should not be |
2356 | // tail-called on AArch64 when the OS does not support dynamic | |
2357 | // pre-emption of symbols, as the AAELF spec requires normal calls | |
2358 | // to undefined weak functions to be replaced with a NOP or jump to the | |
2359 | // next instruction. The behaviour of branch instructions in this | |
2360 | // situation (as used for tail calls) is implementation-defined, so we | |
2361 | // cannot rely on the linker replacing the tail call with a return. | |
2362 | if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { | |
2363 | const GlobalValue *GV = G->getGlobal(); | |
85aaf69f SL |
2364 | const Triple TT(getTargetMachine().getTargetTriple()); |
2365 | if (GV->hasExternalWeakLinkage() && | |
2366 | (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) | |
1a4d82fc JJ |
2367 | return false; |
2368 | } | |
2369 | ||
970d7e83 LB |
2370 | // Now we search for cases where we can use a tail call without changing the |
2371 | // ABI. Sibcall is used in some places (particularly gcc) to refer to this | |
2372 | // concept. | |
2373 | ||
2374 | // I want anyone implementing a new calling convention to think long and hard | |
2375 | // about this assert. | |
1a4d82fc JJ |
2376 | assert((!isVarArg || CalleeCC == CallingConv::C) && |
2377 | "Unexpected variadic calling convention"); | |
970d7e83 | 2378 | |
1a4d82fc | 2379 | if (isVarArg && !Outs.empty()) { |
970d7e83 LB |
2380 | // At least two cases here: if caller is fastcc then we can't have any |
2381 | // memory arguments (we'd be expected to clean up the stack afterwards). If | |
2382 | // caller is C then we could potentially use its argument area. | |
2383 | ||
2384 | // FIXME: for now we take the most conservative of these in both cases: | |
2385 | // disallow all variadic memory operands. | |
2386 | SmallVector<CCValAssign, 16> ArgLocs; | |
1a4d82fc JJ |
2387 | CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, |
2388 | *DAG.getContext()); | |
970d7e83 | 2389 | |
1a4d82fc | 2390 | CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true)); |
970d7e83 LB |
2391 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) |
2392 | if (!ArgLocs[i].isRegLoc()) | |
2393 | return false; | |
2394 | } | |
2395 | ||
2396 | // If the calling conventions do not match, then we'd better make sure the | |
2397 | // results are returned in the same way as what the caller expects. | |
2398 | if (!CCMatch) { | |
2399 | SmallVector<CCValAssign, 16> RVLocs1; | |
1a4d82fc JJ |
2400 | CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1, |
2401 | *DAG.getContext()); | |
2402 | CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg)); | |
970d7e83 LB |
2403 | |
2404 | SmallVector<CCValAssign, 16> RVLocs2; | |
1a4d82fc JJ |
2405 | CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2, |
2406 | *DAG.getContext()); | |
2407 | CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg)); | |
970d7e83 LB |
2408 | |
2409 | if (RVLocs1.size() != RVLocs2.size()) | |
2410 | return false; | |
2411 | for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { | |
2412 | if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) | |
2413 | return false; | |
2414 | if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) | |
2415 | return false; | |
2416 | if (RVLocs1[i].isRegLoc()) { | |
2417 | if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) | |
2418 | return false; | |
2419 | } else { | |
2420 | if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) | |
2421 | return false; | |
2422 | } | |
2423 | } | |
2424 | } | |
2425 | ||
2426 | // Nothing more to check if the callee is taking no arguments | |
2427 | if (Outs.empty()) | |
2428 | return true; | |
2429 | ||
2430 | SmallVector<CCValAssign, 16> ArgLocs; | |
1a4d82fc JJ |
2431 | CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs, |
2432 | *DAG.getContext()); | |
970d7e83 | 2433 | |
1a4d82fc | 2434 | CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg)); |
970d7e83 | 2435 | |
1a4d82fc | 2436 | const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
970d7e83 LB |
2437 | |
2438 | // If the stack arguments for this call would fit into our own save area then | |
2439 | // the call can be made tail. | |
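// Editorial illustration (not upstream code): a caller that received 32
// bytes of stack-based arguments can tail-call any callee whose outgoing
// stack arguments also fit within those 32 bytes.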
2440 | return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea(); | |
2441 | } | |
2442 | ||
970d7e83 LB |
2443 | SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain, |
2444 | SelectionDAG &DAG, | |
2445 | MachineFrameInfo *MFI, | |
2446 | int ClobberedFI) const { | |
2447 | SmallVector<SDValue, 8> ArgChains; | |
2448 | int64_t FirstByte = MFI->getObjectOffset(ClobberedFI); | |
2449 | int64_t LastByte = FirstByte + MFI->getObjectSize(ClobberedFI) - 1; | |
2450 | ||
2451 | // Include the original chain at the beginning of the list. When this is | |
2452 | // used by target LowerCall hooks, this helps legalize find the | |
2453 | // CALLSEQ_BEGIN node. | |
2454 | ArgChains.push_back(Chain); | |
2455 | ||
2456 | // Add a chain value for each stack argument load that overlaps ClobberedFI. | |
2457 | for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(), | |
1a4d82fc JJ |
2458 | UE = DAG.getEntryNode().getNode()->use_end(); |
2459 | U != UE; ++U) | |
970d7e83 LB |
2460 | if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U)) |
2461 | if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) | |
2462 | if (FI->getIndex() < 0) { | |
2463 | int64_t InFirstByte = MFI->getObjectOffset(FI->getIndex()); | |
2464 | int64_t InLastByte = InFirstByte; | |
2465 | InLastByte += MFI->getObjectSize(FI->getIndex()) - 1; | |
2466 | ||
2467 | if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) || | |
2468 | (FirstByte <= InFirstByte && InFirstByte <= LastByte)) | |
2469 | ArgChains.push_back(SDValue(L, 1)); | |
2470 | } | |
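// Editorial restatement: the two byte ranges overlap exactly when one
// range's first byte falls inside the other range.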
2471 | ||
1a4d82fc JJ |
2472 | // Build a tokenfactor for all the chains. |
2473 | return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains); | |
970d7e83 LB |
2474 | } |
2475 | ||
1a4d82fc JJ |
2476 | bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC, |
2477 | bool TailCallOpt) const { | |
2478 | return CallCC == CallingConv::Fast && TailCallOpt; | |
970d7e83 LB |
2479 | } |
2480 | ||
1a4d82fc JJ |
2481 | bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const { |
2482 | return CallCC == CallingConv::Fast; | |
970d7e83 LB |
2483 | } |
2484 | ||
1a4d82fc JJ |
2485 | /// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain, |
2486 | /// and add input and output parameter nodes. | |
970d7e83 | 2487 | SDValue |
1a4d82fc JJ |
2488 | AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, |
2489 | SmallVectorImpl<SDValue> &InVals) const { | |
2490 | SelectionDAG &DAG = CLI.DAG; | |
2491 | SDLoc &DL = CLI.DL; | |
2492 | SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; | |
2493 | SmallVector<SDValue, 32> &OutVals = CLI.OutVals; | |
2494 | SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins; | |
2495 | SDValue Chain = CLI.Chain; | |
2496 | SDValue Callee = CLI.Callee; | |
2497 | bool &IsTailCall = CLI.IsTailCall; | |
2498 | CallingConv::ID CallConv = CLI.CallConv; | |
2499 | bool IsVarArg = CLI.IsVarArg; | |
970d7e83 | 2500 | |
1a4d82fc JJ |
2501 | MachineFunction &MF = DAG.getMachineFunction(); |
2502 | bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); | |
2503 | bool IsThisReturn = false; | |
970d7e83 | 2504 | |
1a4d82fc JJ |
2505 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); |
2506 | bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; | |
2507 | bool IsSibCall = false; | |
970d7e83 | 2508 | |
1a4d82fc JJ |
2509 | if (IsTailCall) { |
2510 | // Check if it's really possible to do a tail call. | |
2511 | IsTailCall = isEligibleForTailCallOptimization( | |
2512 | Callee, CallConv, IsVarArg, IsStructRet, | |
2513 | MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG); | |
2514 | if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall()) | |
2515 | report_fatal_error("failed to perform tail call elimination on a call " | |
2516 | "site marked musttail"); | |
970d7e83 | 2517 | |
1a4d82fc JJ |
2518 | // A sibling call is one where we're under the usual C ABI and not planning |
2519 | // to change that but can still do a tail call: | |
2520 | if (!TailCallOpt && IsTailCall) | |
2521 | IsSibCall = true; | |
970d7e83 | 2522 | |
1a4d82fc JJ |
2523 | if (IsTailCall) |
2524 | ++NumTailCalls; | |
2525 | } | |
970d7e83 | 2526 | |
1a4d82fc JJ |
2527 | // Analyze operands of the call, assigning locations to each operand. |
2528 | SmallVector<CCValAssign, 16> ArgLocs; | |
2529 | CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs, | |
2530 | *DAG.getContext()); | |
2531 | ||
2532 | if (IsVarArg) { | |
2533 | // Handle fixed and variable vector arguments differently. | |
2534 | // Variable vector arguments always go into memory. | |
2535 | unsigned NumArgs = Outs.size(); | |
2536 | ||
2537 | for (unsigned i = 0; i != NumArgs; ++i) { | |
2538 | MVT ArgVT = Outs[i].VT; | |
2539 | ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; | |
2540 | CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, | |
2541 | /*IsVarArg=*/ !Outs[i].IsFixed); | |
2542 | bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo); | |
2543 | assert(!Res && "Call operand has unhandled type"); | |
2544 | (void)Res; | |
2545 | } | |
2546 | } else { | |
2547 | // At this point, Outs[].VT may already be promoted to i32. To correctly | |
2548 | // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and | |
2549 | // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT. | |
2550 | // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here | |
2551 | // we use a special version of AnalyzeCallOperands to pass in ValVT and | |
2552 | // LocVT. | |
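// Editorial illustration (not upstream code): an i8 argument reaches this
// point promoted to i32 in Outs[].VT; resetting ValVT to MVT::i8 below lets
// the argument occupy a 1-byte stack slot instead of a 4-byte one.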
2553 | unsigned NumArgs = Outs.size(); | |
2554 | for (unsigned i = 0; i != NumArgs; ++i) { | |
2555 | MVT ValVT = Outs[i].VT; | |
2556 | // Get type of the original argument. | |
2557 | EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty, | |
2558 | /*AllowUnknown*/ true); | |
2559 | MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT; | |
2560 | ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; | |
2561 | // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16. | |
2562 | if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8) | |
2563 | ValVT = MVT::i8; | |
2564 | else if (ActualMVT == MVT::i16) | |
2565 | ValVT = MVT::i16; | |
2566 | ||
2567 | CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false); | |
2568 | bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo); | |
2569 | assert(!Res && "Call operand has unhandled type"); | |
2570 | (void)Res; | |
970d7e83 LB |
2571 | } |
2572 | } | |
2573 | ||
1a4d82fc JJ |
2574 | // Get a count of how many bytes are to be pushed on the stack. |
2575 | unsigned NumBytes = CCInfo.getNextStackOffset(); | |
970d7e83 | 2576 | |
1a4d82fc JJ |
2577 | if (IsSibCall) { |
2578 | // Since we're not changing the ABI to make this a tail call, the memory | |
2579 | // operands are already available in the caller's incoming argument space. | |
2580 | NumBytes = 0; | |
970d7e83 LB |
2581 | } |
2582 | ||
1a4d82fc JJ |
2583 | // FPDiff is the byte offset of the call's argument area from the callee's. |
2584 | // Stores to callee stack arguments will be placed in FixedStackSlots offset | |
2585 | // by this amount for a tail call. In a sibling call it must be 0 because the | |
2586 | // caller will deallocate the entire stack and the callee still expects its | |
2587 | // arguments to begin at SP+0. Completely unused for non-tail calls. | |
2588 | int FPDiff = 0; | |
970d7e83 | 2589 | |
1a4d82fc JJ |
2590 | if (IsTailCall && !IsSibCall) { |
2591 | unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea(); | |
970d7e83 | 2592 | |
1a4d82fc JJ |
2593 | // Since callee will pop argument stack as a tail call, we must keep the |
2594 | // popped size 16-byte aligned. | |
2595 | NumBytes = RoundUpToAlignment(NumBytes, 16); | |
970d7e83 | 2596 | |
1a4d82fc JJ |
2597 | // FPDiff will be negative if this tail call requires more space than we |
2598 | // would automatically have in our incoming argument space. Positive if we | |
2599 | // can actually shrink the stack. | |
2600 | FPDiff = NumReusableBytes - NumBytes; | |
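// Editorial illustration (not upstream code): with NumReusableBytes == 32
// and NumBytes == 48, FPDiff == -16, i.e. this tail call needs 16 bytes
// more than the incoming argument area provides.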
970d7e83 | 2601 | |
1a4d82fc JJ |
2602 | // The stack pointer must be 16-byte aligned at all times it's used for a |
2603 | // memory operation, which in practice means at *all* times and in | |
2604 | // particular across call boundaries. Therefore our own arguments started at | |
2605 | // a 16-byte aligned SP and the delta applied for the tail call should | |
2606 | // satisfy the same constraint. | |
2607 | assert(FPDiff % 16 == 0 && "unaligned stack on tail call"); | |
970d7e83 | 2608 | } |
970d7e83 | 2609 | |
1a4d82fc JJ |
2610 | // Adjust the stack pointer for the new arguments... |
2611 | // These operations are automatically eliminated by the prolog/epilog pass | |
2612 | if (!IsSibCall) | |
2613 | Chain = | |
2614 | DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL); | |
970d7e83 | 2615 | |
1a4d82fc | 2616 | SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy()); |
970d7e83 | 2617 | |
1a4d82fc JJ |
2618 | SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; |
2619 | SmallVector<SDValue, 8> MemOpChains; | |
970d7e83 | 2620 | |
1a4d82fc JJ |
2621 | // Walk the register/memloc assignments, inserting copies/loads. |
2622 | for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; | |
2623 | ++i, ++realArgIdx) { | |
2624 | CCValAssign &VA = ArgLocs[i]; | |
2625 | SDValue Arg = OutVals[realArgIdx]; | |
2626 | ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; | |
2627 | ||
2628 | // Promote the value if needed. | |
2629 | switch (VA.getLocInfo()) { | |
2630 | default: | |
2631 | llvm_unreachable("Unknown loc info!"); | |
2632 | case CCValAssign::Full: | |
2633 | break; | |
2634 | case CCValAssign::SExt: | |
2635 | Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg); | |
2636 | break; | |
2637 | case CCValAssign::ZExt: | |
2638 | Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); | |
2639 | break; | |
2640 | case CCValAssign::AExt: | |
2641 | if (Outs[realArgIdx].ArgVT == MVT::i1) { | |
2642 | // AAPCS requires i1 to be zero-extended to 8-bits by the caller. | |
2643 | Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); | |
2644 | Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); | |
2645 | } | |
2646 | Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg); | |
2647 | break; | |
2648 | case CCValAssign::BCvt: | |
2649 | Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); | |
2650 | break; | |
2651 | case CCValAssign::FPExt: | |
2652 | Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg); | |
2653 | break; | |
2654 | } | |
2655 | ||
2656 | if (VA.isRegLoc()) { | |
2657 | if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) { | |
2658 | assert(VA.getLocVT() == MVT::i64 && | |
2659 | "unexpected calling convention register assignment"); | |
2660 | assert(!Ins.empty() && Ins[0].VT == MVT::i64 && | |
2661 | "unexpected use of 'returned'"); | |
2662 | IsThisReturn = true; | |
2663 | } | |
2664 | RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); | |
2665 | } else { | |
2666 | assert(VA.isMemLoc()); | |
2667 | ||
2668 | SDValue DstAddr; | |
2669 | MachinePointerInfo DstInfo; | |
2670 | ||
2671 | // FIXME: This works on big-endian for composite byvals, which are the | |
2672 | // common case. It should also work for fundamental types. | |
2673 | uint32_t BEAlign = 0; | |
2674 | unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8 | |
2675 | : VA.getValVT().getSizeInBits(); | |
2676 | OpSize = (OpSize + 7) / 8; | |
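// (OpSize was computed in bits above; this rounds it up to whole bytes.)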
85aaf69f SL |
2677 | if (!Subtarget->isLittleEndian() && !Flags.isByVal() && |
2678 | !Flags.isInConsecutiveRegs()) { | |
1a4d82fc JJ |
2679 | if (OpSize < 8) |
2680 | BEAlign = 8 - OpSize; | |
2681 | } | |
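// Editorial illustration (not upstream code): a 4-byte operand gets
// BEAlign == 4, placing it in the higher-addressed half of its 8-byte slot
// as the big-endian stack layout expects.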
2682 | unsigned LocMemOffset = VA.getLocMemOffset(); | |
2683 | int32_t Offset = LocMemOffset + BEAlign; | |
2684 | SDValue PtrOff = DAG.getIntPtrConstant(Offset); | |
2685 | PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); | |
2686 | ||
2687 | if (IsTailCall) { | |
2688 | Offset = Offset + FPDiff; | |
2689 | int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true); | |
2690 | ||
2691 | DstAddr = DAG.getFrameIndex(FI, getPointerTy()); | |
2692 | DstInfo = MachinePointerInfo::getFixedStack(FI); | |
2693 | ||
2694 | // Make sure any stack arguments overlapping with where we're storing | |
2695 | // are loaded before this eventual operation. Otherwise they'll be | |
2696 | // clobbered. | |
2697 | Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI); | |
2698 | } else { | |
2699 | SDValue PtrOff = DAG.getIntPtrConstant(Offset); | |
2700 | ||
2701 | DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff); | |
2702 | DstInfo = MachinePointerInfo::getStack(LocMemOffset); | |
2703 | } | |
2704 | ||
2705 | if (Outs[i].Flags.isByVal()) { | |
2706 | SDValue SizeNode = | |
2707 | DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64); | |
2708 | SDValue Cpy = DAG.getMemcpy( | |
2709 | Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), | |
2710 | /*isVol = */ false, | |
2711 | /*AlwaysInline = */ false, DstInfo, MachinePointerInfo()); | |
2712 | ||
2713 | MemOpChains.push_back(Cpy); | |
2714 | } else { | |
2715 | // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already | |
2716 | // promoted to a legal register type i32, we should truncate Arg back to | |
2717 | // i1/i8/i16. | |
2718 | if (VA.getValVT() == MVT::i1 || VA.getValVT() == MVT::i8 || | |
2719 | VA.getValVT() == MVT::i16) | |
2720 | Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg); | |
2721 | ||
2722 | SDValue Store = | |
2723 | DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0); | |
2724 | MemOpChains.push_back(Store); | |
2725 | } | |
2726 | } | |
2727 | } | |
2728 | ||
2729 | if (!MemOpChains.empty()) | |
2730 | Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); | |
2731 | ||
2732 | // Build a sequence of copy-to-reg nodes chained together with token chain | |
2733 | // and flag operands which copy the outgoing args into the appropriate regs. | |
2734 | SDValue InFlag; | |
2735 | for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { | |
2736 | Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first, | |
2737 | RegsToPass[i].second, InFlag); | |
2738 | InFlag = Chain.getValue(1); | |
2739 | } | |
2740 | ||
2741 | // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every | |
2742 | // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol | |
2743 | // node so that legalize doesn't hack it. | |
2744 | if (getTargetMachine().getCodeModel() == CodeModel::Large && | |
2745 | Subtarget->isTargetMachO()) { | |
2746 | if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { | |
2747 | const GlobalValue *GV = G->getGlobal(); | |
2748 | bool InternalLinkage = GV->hasInternalLinkage(); | |
2749 | if (InternalLinkage) | |
2750 | Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); | |
2751 | else { | |
2752 | Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, | |
2753 | AArch64II::MO_GOT); | |
2754 | Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); | |
2755 | } | |
2756 | } else if (ExternalSymbolSDNode *S = | |
2757 | dyn_cast<ExternalSymbolSDNode>(Callee)) { | |
2758 | const char *Sym = S->getSymbol(); | |
2759 | Callee = | |
2760 | DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT); | |
2761 | Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee); | |
2762 | } | |
2763 | } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { | |
2764 | const GlobalValue *GV = G->getGlobal(); | |
2765 | Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0); | |
2766 | } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { | |
2767 | const char *Sym = S->getSymbol(); | |
2768 | Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0); | |
2769 | } | |
2770 | ||
2771 | // We don't usually want to end the call-sequence here because we would tidy | |
2772 | // the frame up *after* the call; however, in the ABI-changing tail-call case | |
2773 | // we've carefully laid out the parameters so that when sp is reset they'll be | |
2774 | // in the correct location. | |
2775 | if (IsTailCall && !IsSibCall) { | |
2776 | Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), | |
2777 | DAG.getIntPtrConstant(0, true), InFlag, DL); | |
2778 | InFlag = Chain.getValue(1); | |
2779 | } | |
2780 | ||
2781 | std::vector<SDValue> Ops; | |
2782 | Ops.push_back(Chain); | |
2783 | Ops.push_back(Callee); | |
2784 | ||
2785 | if (IsTailCall) { | |
2786 | // Each tail call may have to adjust the stack by a different amount, so | |
2787 | // this information must travel along with the operation for eventual | |
2788 | // consumption by emitEpilogue. | |
2789 | Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32)); | |
2790 | } | |
2791 | ||
2792 | // Add argument registers to the end of the list so that they are known live | |
2793 | // into the call. | |
2794 | for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) | |
2795 | Ops.push_back(DAG.getRegister(RegsToPass[i].first, | |
2796 | RegsToPass[i].second.getValueType())); | |
2797 | ||
2798 | // Add a register mask operand representing the call-preserved registers. | |
2799 | const uint32_t *Mask; | |
2800 | const TargetRegisterInfo *TRI = | |
2801 | getTargetMachine().getSubtargetImpl()->getRegisterInfo(); | |
2802 | const AArch64RegisterInfo *ARI = | |
2803 | static_cast<const AArch64RegisterInfo *>(TRI); | |
2804 | if (IsThisReturn) { | |
2805 | // For 'this' returns, use the X0-preserving mask if applicable | |
2806 | Mask = ARI->getThisReturnPreservedMask(CallConv); | |
2807 | if (!Mask) { | |
2808 | IsThisReturn = false; | |
2809 | Mask = ARI->getCallPreservedMask(CallConv); | |
2810 | } | |
2811 | } else | |
2812 | Mask = ARI->getCallPreservedMask(CallConv); | |
2813 | ||
2814 | assert(Mask && "Missing call preserved mask for calling convention"); | |
2815 | Ops.push_back(DAG.getRegisterMask(Mask)); | |
2816 | ||
2817 | if (InFlag.getNode()) | |
2818 | Ops.push_back(InFlag); | |
2819 | ||
2820 | SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); | |
2821 | ||
2822 | // If we're doing a tail call, use a TC_RETURN here rather than an | |
2823 | // actual call instruction. | |
2824 | if (IsTailCall) | |
2825 | return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops); | |
2826 | ||
2827 | // Returns a chain and a flag for retval copy to use. | |
2828 | Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops); | |
2829 | InFlag = Chain.getValue(1); | |
2830 | ||
2831 | uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt) | |
2832 | ? RoundUpToAlignment(NumBytes, 16) | |
2833 | : 0; | |
2834 | ||
2835 | Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true), | |
2836 | DAG.getIntPtrConstant(CalleePopBytes, true), | |
2837 | InFlag, DL); | |
2838 | if (!Ins.empty()) | |
2839 | InFlag = Chain.getValue(1); | |
2840 | ||
2841 | // Handle result values, copying them out of physregs into vregs that we | |
2842 | // return. | |
2843 | return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG, | |
2844 | InVals, IsThisReturn, | |
2845 | IsThisReturn ? OutVals[0] : SDValue()); | |
2846 | } | |
2847 | ||
2848 | bool AArch64TargetLowering::CanLowerReturn( | |
2849 | CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, | |
2850 | const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { | |
2851 | CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS | |
2852 | ? RetCC_AArch64_WebKit_JS | |
2853 | : RetCC_AArch64_AAPCS; | |
2854 | SmallVector<CCValAssign, 16> RVLocs; | |
2855 | CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); | |
2856 | return CCInfo.CheckReturn(Outs, RetCC); | |
2857 | } | |
970d7e83 LB |
2858 | |
2859 | SDValue | |
1a4d82fc JJ |
2860 | AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, |
2861 | bool isVarArg, | |
2862 | const SmallVectorImpl<ISD::OutputArg> &Outs, | |
2863 | const SmallVectorImpl<SDValue> &OutVals, | |
2864 | SDLoc DL, SelectionDAG &DAG) const { | |
2865 | CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS | |
2866 | ? RetCC_AArch64_WebKit_JS | |
2867 | : RetCC_AArch64_AAPCS; | |
2868 | SmallVector<CCValAssign, 16> RVLocs; | |
2869 | CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, | |
2870 | *DAG.getContext()); | |
2871 | CCInfo.AnalyzeReturn(Outs, RetCC); | |
2872 | ||
2873 | // Copy the result values into the output registers. | |
2874 | SDValue Flag; | |
2875 | SmallVector<SDValue, 4> RetOps(1, Chain); | |
2876 | for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size(); | |
2877 | ++i, ++realRVLocIdx) { | |
2878 | CCValAssign &VA = RVLocs[i]; | |
2879 | assert(VA.isRegLoc() && "Can only return in registers!"); | |
2880 | SDValue Arg = OutVals[realRVLocIdx]; | |
2881 | ||
2882 | switch (VA.getLocInfo()) { | |
2883 | default: | |
2884 | llvm_unreachable("Unknown loc info!"); | |
2885 | case CCValAssign::Full: | |
2886 | if (Outs[i].ArgVT == MVT::i1) { | |
2887 | // AAPCS requires i1 to be zero-extended to i8 by the producer of the | |
2888 | // value. This is strictly redundant on Darwin (which uses "zeroext | |
2889 | // i1"), but will be optimised out before ISel. | |
2890 | Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); | |
2891 | Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); | |
2892 | } | |
2893 | break; | |
2894 | case CCValAssign::BCvt: | |
2895 | Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg); | |
2896 | break; | |
2897 | } | |
2898 | ||
2899 | Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag); | |
2900 | Flag = Chain.getValue(1); | |
2901 | RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); | |
970d7e83 LB |
2902 | } |
2903 | ||
1a4d82fc | 2904 | RetOps[0] = Chain; // Update chain. |
970d7e83 | 2905 | |
1a4d82fc JJ |
2906 | // Add the flag if we have it. |
2907 | if (Flag.getNode()) | |
2908 | RetOps.push_back(Flag); | |
2909 | ||
2910 | return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps); | |
2911 | } | |
2912 | ||
2913 | //===----------------------------------------------------------------------===// | |
2914 | // Other Lowering Code | |
2915 | //===----------------------------------------------------------------------===// | |
2916 | ||
2917 | SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op, | |
2918 | SelectionDAG &DAG) const { | |
2919 | EVT PtrVT = getPointerTy(); | |
2920 | SDLoc DL(Op); | |
2921 | const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op); | |
2922 | const GlobalValue *GV = GN->getGlobal(); | |
2923 | unsigned char OpFlags = | |
2924 | Subtarget->ClassifyGlobalReference(GV, getTargetMachine()); | |
2925 | ||
2926 | assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 && | |
2927 | "unexpected offset in global node"); | |
2928 | ||
2929 | // This also catches the large code model case for Darwin. | |
2930 | if ((OpFlags & AArch64II::MO_GOT) != 0) { | |
2931 | SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); | |
2932 | // FIXME: Once remat is capable of dealing with instructions with register | |
2933 | // operands, expand this into two nodes instead of using a wrapper node. | |
2934 | return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); | |
2935 | } | |
2936 | ||
2937 | if ((OpFlags & AArch64II::MO_CONSTPOOL) != 0) { | |
2938 | assert(getTargetMachine().getCodeModel() == CodeModel::Small && | |
2939 | "use of MO_CONSTPOOL only supported on small model"); | |
2940 | SDValue Hi = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, AArch64II::MO_PAGE); | |
2941 | SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); | |
2942 | unsigned char LoFlags = AArch64II::MO_PAGEOFF | AArch64II::MO_NC; | |
2943 | SDValue Lo = DAG.getTargetConstantPool(GV, PtrVT, 0, 0, LoFlags); | |
2944 | SDValue PoolAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); | |
2945 | SDValue GlobalAddr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), PoolAddr, | |
2946 | MachinePointerInfo::getConstantPool(), | |
2947 | /*isVolatile=*/ false, | |
2948 | /*isNonTemporal=*/ true, | |
2949 | /*isInvariant=*/ true, 8); | |
2950 | if (GN->getOffset() != 0) | |
2951 | return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalAddr, | |
2952 | DAG.getConstant(GN->getOffset(), PtrVT)); | |
2953 | return GlobalAddr; | |
2954 | } | |
2955 | ||
2956 | if (getTargetMachine().getCodeModel() == CodeModel::Large) { | |
2957 | const unsigned char MO_NC = AArch64II::MO_NC; | |
2958 | return DAG.getNode( | |
2959 | AArch64ISD::WrapperLarge, DL, PtrVT, | |
2960 | DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3), | |
2961 | DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC), | |
2962 | DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC), | |
2963 | DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); | |
2964 | } else { | |
2965 | // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and | |
2966 | // the only correct model on Darwin. | |
2967 | SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, | |
2968 | OpFlags | AArch64II::MO_PAGE); | |
2969 | unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC; | |
2970 | SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags); | |
2971 | ||
2972 | SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); | |
2973 | return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); | |
2974 | } | |
970d7e83 LB |
2975 | } |
2976 | ||
1a4d82fc JJ |
2977 | /// \brief Convert a TLS address reference into the correct sequence of loads |
2978 | /// and calls to compute the variable's address (for Darwin, currently) and | |
2979 | /// return an SDValue containing the final node. | |
2980 | /// | |
2981 | /// Darwin only has one TLS scheme which must be capable of dealing with the | |
2982 | /// fully general situation, in the worst case. This means: | |
2983 | /// + "extern __thread" declaration. | |
2984 | /// + Defined in a possibly unknown dynamic library. | |
2985 | /// | |
2986 | /// The general system is that each __thread variable has a [3 x i64] descriptor | |
2987 | /// which contains information used by the runtime to calculate the address. The | |
2988 | /// only part of this the compiler needs to know about is the first xword, which | |
2989 | /// contains a function pointer that must be called with the address of the | |
2990 | /// entire descriptor in "x0". | |
2991 | /// | |
2992 | /// Since this descriptor may be in a different unit, in general even the | |
2993 | /// descriptor must be accessed via an indirect load. The "ideal" code sequence | |
2994 | /// is: | |
2995 | /// adrp x0, _var@TLVPPAGE | |
2996 | /// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor | |
2997 | /// ldr x1, [x0] ; x1 contains 1st entry of descriptor, | |
2998 | /// ; the function pointer | |
2999 | /// blr x1 ; Uses descriptor address in x0 | |
3000 | /// ; Address of _var is now in x0. | |
3001 | /// | |
3002 | /// If the address of _var's descriptor *is* known to the linker, then it can | |
3003 | /// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for | |
3004 | /// a slight efficiency gain. | |
970d7e83 | 3005 | SDValue |
1a4d82fc JJ |
3006 | AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, |
3007 | SelectionDAG &DAG) const { | |
3008 | assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin"); | |
3009 | ||
3010 | SDLoc DL(Op); | |
3011 | MVT PtrVT = getPointerTy(); | |
3012 | const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); | |
3013 | ||
3014 | SDValue TLVPAddr = | |
3015 | DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); | |
3016 | SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr); | |
3017 | ||
3018 | // The first entry in the descriptor is a function pointer that we must call | |
3019 | // to obtain the address of the variable. | |
3020 | SDValue Chain = DAG.getEntryNode(); | |
3021 | SDValue FuncTLVGet = | |
3022 | DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(), | |
3023 | false, true, true, 8); | |
3024 | Chain = FuncTLVGet.getValue(1); | |
3025 | ||
3026 | MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); | |
3027 | MFI->setAdjustsStack(true); | |
3028 | ||
3029 | // TLS calls preserve all registers except those that absolutely must be | |
3030 | // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be | |
3031 | // silly). | |
3032 | const TargetRegisterInfo *TRI = | |
3033 | getTargetMachine().getSubtargetImpl()->getRegisterInfo(); | |
3034 | const AArch64RegisterInfo *ARI = | |
3035 | static_cast<const AArch64RegisterInfo *>(TRI); | |
3036 | const uint32_t *Mask = ARI->getTLSCallPreservedMask(); | |
3037 | ||
3038 | // Finally, we can make the call. This is just a degenerate version of a | |
3039 | // normal AArch64 call node: x0 takes the address of the descriptor, and | |
3040 | // returns the address of the variable in this thread. | |
3041 | Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue()); | |
3042 | Chain = | |
3043 | DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue), | |
3044 | Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64), | |
3045 | DAG.getRegisterMask(Mask), Chain.getValue(1)); | |
3046 | return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); | |
3047 | } | |
970d7e83 | 3048 | |
1a4d82fc JJ |
3049 | /// When accessing thread-local variables under either the general-dynamic or |
3050 | /// local-dynamic system, we make a "TLS-descriptor" call. The variable will | |
3051 | /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry | |
3052 | /// is a function pointer to carry out the resolution. This function takes the | |
3053 | /// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All | |
3054 | /// other registers (except LR, NZCV) are preserved. | |
3055 | /// | |
3056 | /// Thus, the ideal call sequence on AArch64 is: | |
3057 | /// | |
3058 | /// adrp x0, :tlsdesc:thread_var | |
3059 | /// ldr x8, [x0, :tlsdesc_lo12:thread_var] | |
3060 | /// add x0, x0, :tlsdesc_lo12:thread_var | |
3061 | /// .tlsdesccall thread_var | |
3062 | /// blr x8 | |
3063 | /// (TPIDR_EL0 offset now in x0). | |
3064 | /// | |
3065 | /// The ".tlsdesccall" directive instructs the assembler to insert a particular | |
3066 | /// relocation to help the linker relax this sequence if it turns out to be too | |
3067 | /// conservative. | |
3068 | /// | |
3069 | /// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this | |
3070 | /// is harmless. | |
3071 | SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr, | |
3072 | SDValue DescAddr, SDLoc DL, | |
3073 | SelectionDAG &DAG) const { | |
3074 | EVT PtrVT = getPointerTy(); | |
3075 | ||
3076 | // The function we need to call is simply the first entry in the GOT for this | |
3077 | // descriptor, load it in preparation. | |
3078 | SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr); | |
3079 | ||
3080 | // TLS calls preserve all registers except those that absolutely must be | |
3081 | // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be | |
3082 | // silly). | |
3083 | const TargetRegisterInfo *TRI = | |
3084 | getTargetMachine().getSubtargetImpl()->getRegisterInfo(); | |
3085 | const AArch64RegisterInfo *ARI = | |
3086 | static_cast<const AArch64RegisterInfo *>(TRI); | |
3087 | const uint32_t *Mask = ARI->getTLSCallPreservedMask(); | |
970d7e83 | 3088 | |
1a4d82fc JJ |
3089 | // The function takes only one argument: the address of the descriptor itself |
3090 | // in X0. | |
3091 | SDValue Glue, Chain; | |
3092 | Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue); | |
3093 | Glue = Chain.getValue(1); | |
3094 | ||
3095 | // We're now ready to populate the argument list, as with a normal call: | |
3096 | SmallVector<SDValue, 6> Ops; | |
3097 | Ops.push_back(Chain); | |
3098 | Ops.push_back(Func); | |
3099 | Ops.push_back(SymAddr); | |
3100 | Ops.push_back(DAG.getRegister(AArch64::X0, PtrVT)); | |
3101 | Ops.push_back(DAG.getRegisterMask(Mask)); | |
3102 | Ops.push_back(Glue); | |
3103 | ||
3104 | SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); | |
3105 | Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops); | |
3106 | Glue = Chain.getValue(1); | |
3107 | ||
3108 | return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue); | |
970d7e83 LB |
3109 | } |
3110 | ||
3111 | SDValue | |
1a4d82fc JJ |
3112 | AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, |
3113 | SelectionDAG &DAG) const { | |
3114 | assert(Subtarget->isTargetELF() && "This function expects an ELF target"); | |
3115 | assert(getTargetMachine().getCodeModel() == CodeModel::Small && | |
3116 | "ELF TLS only supported in small memory model"); | |
3117 | const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); | |
3118 | ||
3119 | TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); | |
3120 | ||
3121 | SDValue TPOff; | |
3122 | EVT PtrVT = getPointerTy(); | |
3123 | SDLoc DL(Op); | |
3124 | const GlobalValue *GV = GA->getGlobal(); | |
3125 | ||
3126 | SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); | |
3127 | ||
3128 | if (Model == TLSModel::LocalExec) { | |
3129 | SDValue HiVar = DAG.getTargetGlobalAddress( | |
3130 | GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); | |
3131 | SDValue LoVar = DAG.getTargetGlobalAddress( | |
3132 | GV, DL, PtrVT, 0, | |
3133 | AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); | |
3134 | ||
3135 | TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, | |
3136 | DAG.getTargetConstant(16, MVT::i32)), | |
3137 | 0); | |
3138 | TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, | |
3139 | DAG.getTargetConstant(0, MVT::i32)), | |
3140 | 0); | |
3141 | } else if (Model == TLSModel::InitialExec) { | |
3142 | TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); | |
3143 | TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); | |
3144 | } else if (Model == TLSModel::LocalDynamic) { | |
3145 | // Local-dynamic accesses proceed in two phases. A general-dynamic TLS | |
3146 | // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate | |
3147 | // the beginning of the module's TLS region, followed by a DTPREL offset | |
3148 | // calculation. | |
3149 | ||
3150 | // These accesses will need deduplicating if there's more than one. | |
3151 | AArch64FunctionInfo *MFI = | |
3152 | DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); | |
3153 | MFI->incNumLocalDynamicTLSAccesses(); | |
3154 | ||
3155 | // Accesses used in this sequence go via the TLS descriptor which lives in | |
3156 | // the GOT. Prepare an address we can use to handle this. | |
3157 | SDValue HiDesc = DAG.getTargetExternalSymbol( | |
3158 | "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE); | |
3159 | SDValue LoDesc = DAG.getTargetExternalSymbol( | |
3160 | "_TLS_MODULE_BASE_", PtrVT, | |
3161 | AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); | |
3162 | ||
3163 | // First argument to the descriptor call is the address of the descriptor | |
3164 | // itself. | |
3165 | SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); | |
3166 | DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); | |
3167 | ||
3168 | // The call needs a relocation too for linker relaxation. It doesn't make | |
3169 | // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of | |
3170 | // the address. | |
3171 | SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT, | |
3172 | AArch64II::MO_TLS); | |
3173 | ||
3174 | // Now we can calculate the offset from TPIDR_EL0 to this module's | |
3175 | // thread-local area. | |
3176 | TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); | |
3177 | ||
3178 | // Now use :dtprel_whatever: operations to calculate this variable's offset | |
3179 | // in its thread-storage area. | |
3180 | SDValue HiVar = DAG.getTargetGlobalAddress( | |
3181 | GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1); | |
3182 | SDValue LoVar = DAG.getTargetGlobalAddress( | |
3183 | GV, DL, MVT::i64, 0, | |
3184 | AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); | |
3185 | ||
3186 | SDValue DTPOff = | |
3187 | SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, | |
3188 | DAG.getTargetConstant(16, MVT::i32)), | |
3189 | 0); | |
3190 | DTPOff = | |
3191 | SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar, | |
3192 | DAG.getTargetConstant(0, MVT::i32)), | |
3193 | 0); | |
3194 | ||
3195 | TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff); | |
3196 | } else if (Model == TLSModel::GeneralDynamic) { | |
3197 | // Accesses used in this sequence go via the TLS descriptor which lives in | |
3198 | // the GOT. Prepare an address we can use to handle this. | |
3199 | SDValue HiDesc = DAG.getTargetGlobalAddress( | |
3200 | GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE); | |
3201 | SDValue LoDesc = DAG.getTargetGlobalAddress( | |
3202 | GV, DL, PtrVT, 0, | |
3203 | AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); | |
3204 | ||
3205 | // First argument to the descriptor call is the address of the descriptor | |
3206 | // itself. | |
3207 | SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc); | |
3208 | DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc); | |
3209 | ||
3210 | // The call needs a relocation too for linker relaxation. It doesn't make | |
3211 | // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of | |
3212 | // the address. | |
3213 | SDValue SymAddr = | |
3214 | DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); | |
3215 | ||
3216 | // Finally we can make a call to calculate the offset from tpidr_el0. | |
3217 | TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG); | |
3218 | } else | |
3219 | llvm_unreachable("Unsupported ELF TLS access model"); | |
3220 | ||
3221 | return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); | |
3222 | } | |
3223 | ||
3224 | SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op, | |
3225 | SelectionDAG &DAG) const { | |
3226 | if (Subtarget->isTargetDarwin()) | |
3227 | return LowerDarwinGlobalTLSAddress(Op, DAG); | |
3228 | else if (Subtarget->isTargetELF()) | |
3229 | return LowerELFGlobalTLSAddress(Op, DAG); | |
3230 | ||
3231 | llvm_unreachable("Unexpected platform trying to use TLS"); | |
3232 | } | |
3233 | SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { | |
3234 | SDValue Chain = Op.getOperand(0); | |
3235 | ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get(); | |
3236 | SDValue LHS = Op.getOperand(2); | |
3237 | SDValue RHS = Op.getOperand(3); | |
3238 | SDValue Dest = Op.getOperand(4); | |
3239 | SDLoc dl(Op); | |
3240 | ||
3241 | // Handle f128 first, since lowering it will result in comparing the return | |
3242 | // value of a libcall against zero, which is just what the rest of LowerBR_CC | |
3243 | // is expecting to deal with. | |
3244 | if (LHS.getValueType() == MVT::f128) { | |
3245 | softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); | |
3246 | ||
3247 | // If softenSetCCOperands returned a scalar, we need to compare the result | |
3248 | // against zero to select between true and false values. | |
3249 | if (!RHS.getNode()) { | |
3250 | RHS = DAG.getConstant(0, LHS.getValueType()); | |
3251 | CC = ISD::SETNE; | |
3252 | } | |
970d7e83 LB |
3253 | } |
3254 | ||
1a4d82fc JJ |
3255 | // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch |
3256 | // instruction. | |
3257 | unsigned Opc = LHS.getOpcode(); | |
3258 | if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) && | |
3259 | cast<ConstantSDNode>(RHS)->isOne() && | |
3260 | (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || | |
3261 | Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { | |
3262 | assert((CC == ISD::SETEQ || CC == ISD::SETNE) && | |
3263 | "Unexpected condition code."); | |
3264 | // Only lower legal XALUO ops. | |
3265 | if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0))) | |
3266 | return SDValue(); | |
3267 | ||
3268 | // The actual operation with overflow check. | |
3269 | AArch64CC::CondCode OFCC; | |
3270 | SDValue Value, Overflow; | |
3271 | std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG); | |
3272 | ||
3273 | if (CC == ISD::SETNE) | |
3274 | OFCC = getInvertedCondCode(OFCC); | |
3275 | SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); | |
3276 | ||
3277 | return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest, | |
3278 | CCVal, Overflow); | |
3279 | } | |
3280 | ||
3281 | if (LHS.getValueType().isInteger()) { | |
3282 | assert((LHS.getValueType() == RHS.getValueType()) && | |
3283 | (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); | |
3284 | ||
3285 | // If the RHS of the comparison is zero, we can potentially fold this | |
3286 | // to a specialized branch. | |
3287 | const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS); | |
3288 | if (RHSC && RHSC->getZExtValue() == 0) { | |
3289 | if (CC == ISD::SETEQ) { | |
3290 | // See if we can use a TBZ to fold in an AND as well. | |
3291 | // TBZ has a smaller branch displacement than CBZ. If the offset is | |
3292 | // out of bounds, a late MI-layer pass rewrites branches. | |
3293 | // 403.gcc is an example that hits this case. | |
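// Editorial illustration (not upstream code): branching on
// "(x & 4) == 0" can lower to "tbz x, #2, dest" rather than a separate
// AND followed by CBZ.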
3294 | if (LHS.getOpcode() == ISD::AND && | |
3295 | isa<ConstantSDNode>(LHS.getOperand(1)) && | |
3296 | isPowerOf2_64(LHS.getConstantOperandVal(1))) { | |
3297 | SDValue Test = LHS.getOperand(0); | |
3298 | uint64_t Mask = LHS.getConstantOperandVal(1); | |
3299 | return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test, | |
3300 | DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); | |
3301 | } | |
3302 | ||
3303 | return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest); | |
3304 | } else if (CC == ISD::SETNE) { | |
3305 | // See if we can use a TBZ to fold in an AND as well. | |
3306 | // TBZ has a smaller branch displacement than CBZ. If the offset is | |
3307 | // out of bounds, a late MI-layer pass rewrites branches. | |
3308 | // 403.gcc is an example that hits this case. | |
3309 | if (LHS.getOpcode() == ISD::AND && | |
3310 | isa<ConstantSDNode>(LHS.getOperand(1)) && | |
3311 | isPowerOf2_64(LHS.getConstantOperandVal(1))) { | |
3312 | SDValue Test = LHS.getOperand(0); | |
3313 | uint64_t Mask = LHS.getConstantOperandVal(1); | |
3314 | return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test, | |
3315 | DAG.getConstant(Log2_64(Mask), MVT::i64), Dest); | |
3316 | } | |
3317 | ||
3318 | return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest); | |
3319 | } else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) { | |
3320 | // Don't combine AND since emitComparison converts the AND to an ANDS | |
3321 | // (a.k.a. TST) and the test in the test bit and branch instruction | |
3322 | // becomes redundant. This would also increase register pressure. | |
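// Instead, "x < 0" holds exactly when the sign bit (bit 31 or 63) is set,
// so a single TBNZ on that bit suffices (editorial note).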
3323 | uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; | |
3324 | return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS, | |
3325 | DAG.getConstant(Mask, MVT::i64), Dest); | |
3326 | } | |
3327 | } | |
3328 | if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT && | |
3329 | LHS.getOpcode() != ISD::AND) { | |
3330 | // Don't combine AND since emitComparison converts the AND to an ANDS | |
3331 | // (a.k.a. TST) and the test in the test bit and branch instruction | |
3332 | // becomes redundant. This would also increase register pressure. | |
3333 | uint64_t Mask = LHS.getValueType().getSizeInBits() - 1; | |
3334 | return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS, | |
3335 | DAG.getConstant(Mask, MVT::i64), Dest); | |
3336 | } | |
3337 | ||
3338 | SDValue CCVal; | |
3339 | SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); | |
3340 | return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, | |
3341 | Cmp); | |
3342 | } | |
3343 | ||
3344 | assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); | |
3345 | ||
3346 | // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally | |
3347 | // clean. Some of them require two branches to implement. | |
3348 | SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); | |
3349 | AArch64CC::CondCode CC1, CC2; | |
3350 | changeFPCCToAArch64CC(CC, CC1, CC2); | |
3351 | SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); | |
3352 | SDValue BR1 = | |
3353 | DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp); | |
3354 | if (CC2 != AArch64CC::AL) { | |
3355 | SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); | |
3356 | return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val, | |
3357 | Cmp); | |
3358 | } | |
3359 | ||
3360 | return BR1; | |
3361 | } | |
3362 | ||
3363 | SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op, | |
3364 | SelectionDAG &DAG) const { | |
3365 | EVT VT = Op.getValueType(); | |
3366 | SDLoc DL(Op); | |
3367 | ||
3368 | SDValue In1 = Op.getOperand(0); | |
3369 | SDValue In2 = Op.getOperand(1); | |
3370 | EVT SrcVT = In2.getValueType(); | |
3371 | if (SrcVT != VT) { | |
3372 | if (SrcVT == MVT::f32 && VT == MVT::f64) | |
3373 | In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2); | |
3374 | else if (SrcVT == MVT::f64 && VT == MVT::f32) | |
3375 | In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0)); | |
3376 | else | |
3377 | // FIXME: Src type is different, bail out for now. Can VT really be a | |
3378 | // vector type? | |
3379 | return SDValue(); | |
3380 | } | |
3381 | ||
3382 | EVT VecVT; | |
3383 | EVT EltVT; | |
3384 | SDValue EltMask, VecVal1, VecVal2; | |
3385 | if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) { | |
3386 | EltVT = MVT::i32; | |
3387 | VecVT = MVT::v4i32; | |
3388 | EltMask = DAG.getConstant(0x80000000ULL, EltVT); | |
3389 | ||
3390 | if (!VT.isVector()) { | |
3391 | VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, | |
3392 | DAG.getUNDEF(VecVT), In1); | |
3393 | VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT, | |
3394 | DAG.getUNDEF(VecVT), In2); | |
3395 | } else { | |
3396 | VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); | |
3397 | VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); | |
3398 | } | |
3399 | } else if (VT == MVT::f64 || VT == MVT::v2f64) { | |
3400 | EltVT = MVT::i64; | |
3401 | VecVT = MVT::v2i64; | |
3402 | ||
3403 | // We want to materialize a mask with the high bit set, but the AdvSIMD | |
3404 | // immediate moves cannot materialize that in a single instruction for | |
3405 | // 64-bit elements. Instead, materialize zero and then negate it. | |
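// (Editorial note: FNEG of +0.0 flips only the sign bit, yielding
// 0x8000000000000000 in each 64-bit lane.)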
3406 | EltMask = DAG.getConstant(0, EltVT); | |
3407 | ||
3408 | if (!VT.isVector()) { | |
3409 | VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, | |
3410 | DAG.getUNDEF(VecVT), In1); | |
3411 | VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT, | |
3412 | DAG.getUNDEF(VecVT), In2); | |
3413 | } else { | |
3414 | VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1); | |
3415 | VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2); | |
3416 | } | |
3417 | } else { | |
3418 | llvm_unreachable("Invalid type for copysign!"); | |
3419 | } | |
3420 | ||
3421 | std::vector<SDValue> BuildVectorOps; | |
3422 | for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) | |
3423 | BuildVectorOps.push_back(EltMask); | |
3424 | ||
3425 | SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps); | |
3426 | ||
3427 | // If we couldn't materialize the mask above, then the mask vector will be | |
3428 | // the zero vector, and we need to negate it here. | |
3429 | if (VT == MVT::f64 || VT == MVT::v2f64) { | |
3430 | BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec); | |
3431 | BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec); | |
3432 | BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec); | |
3433 | } | |
3434 | ||
3435 | SDValue Sel = | |
3436 | DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec); | |
3437 | ||
3438 | if (VT == MVT::f32) | |
3439 | return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel); | |
3440 | else if (VT == MVT::f64) | |
3441 | return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel); | |
3442 | else | |
3443 | return DAG.getNode(ISD::BITCAST, DL, VT, Sel); | |
3444 | } | |
3445 | ||
3446 | SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const { | |
3447 | if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute( | |
3448 | AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) | |
3449 | return SDValue(); | |
3450 | ||
85aaf69f SL |
3451 | if (!Subtarget->hasNEON()) |
3452 | return SDValue(); | |
3453 | ||
1a4d82fc JJ |
3454 | // While there is no integer popcount instruction, it can |
3455 | // be more efficiently lowered to the following sequence that uses | |
3456 | // AdvSIMD registers/instructions as long as the copies to/from | |
3457 | // the AdvSIMD registers are cheap. | |
3458 | // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd | |
3459 | // CNT V0.8B, V0.8B // 8xbyte pop-counts | |
3460 | // ADDV B0, V0.8B // sum 8xbyte pop-counts | |
3461 | // UMOV X0, V0.B[0] // copy byte result back to integer reg | |
3462 | SDValue Val = Op.getOperand(0); | |
3463 | SDLoc DL(Op); | |
3464 | EVT VT = Op.getValueType(); | |
3465 | SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8); | |
3466 | ||
3467 | SDValue VecVal; | |
3468 | if (VT == MVT::i32) { | |
3469 | VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val); | |
3470 | VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec, | |
3471 | VecVal); | |
3472 | } else { | |
3473 | VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val); | |
3474 | } | |
3475 | ||
3476 | SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal); | |
3477 | SDValue UaddLV = DAG.getNode( | |
3478 | ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32, | |
3479 | DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop); | |
3480 | ||
3481 | if (VT == MVT::i64) | |
3482 | UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV); | |
3483 | return UaddLV; | |
3484 | } | |
3485 | ||
3486 | SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { | |
3487 | ||
3488 | if (Op.getValueType().isVector()) | |
3489 | return LowerVSETCC(Op, DAG); | |
3490 | ||
3491 | SDValue LHS = Op.getOperand(0); | |
3492 | SDValue RHS = Op.getOperand(1); | |
3493 | ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); | |
3494 | SDLoc dl(Op); | |
3495 | ||
3496 | // We chose ZeroOrOneBooleanContents, so use zero and one. | |
3497 | EVT VT = Op.getValueType(); | |
3498 | SDValue TVal = DAG.getConstant(1, VT); | |
3499 | SDValue FVal = DAG.getConstant(0, VT); | |
3500 | ||
3501 | // Handle f128 first, since one possible outcome is a normal integer | |
3502 | // comparison which gets picked up by the next if statement. | |
3503 | if (LHS.getValueType() == MVT::f128) { | |
3504 | softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); | |
3505 | ||
3506 | // If softenSetCCOperands returned a scalar, use it. | |
3507 | if (!RHS.getNode()) { | |
3508 | assert(LHS.getValueType() == Op.getValueType() && | |
3509 | "Unexpected setcc expansion!"); | |
3510 | return LHS; | |
3511 | } | |
3512 | } | |
3513 | ||
3514 | if (LHS.getValueType().isInteger()) { | |
3515 | SDValue CCVal; | |
3516 | SDValue Cmp = | |
3517 | getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); | |
3518 | ||
3519 | // Note that we inverted the condition above, so we reverse the order of | |
3520 | // the true and false operands here. This will allow the setcc to be | |
3521 | // matched to a single CSINC instruction. | |
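// Editorial illustration (not upstream code): "w0 = (x == y)" becomes
// "cmp x, y; cset w0, eq", where cset is an alias of
// "csinc w0, wzr, wzr, ne".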
3522 | return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp); | |
3523 | } | |
3524 | ||
3525 | // Now we know we're dealing with FP values. | |
3526 | assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); | |
3527 | ||
3528 | // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead | |
3529 | // and do the comparison. | |
3530 | SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); | |
3531 | ||
3532 | AArch64CC::CondCode CC1, CC2; | |
3533 | changeFPCCToAArch64CC(CC, CC1, CC2); | |
3534 | if (CC2 == AArch64CC::AL) { | |
3535 | changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); | |
3536 | SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); | |
3537 | ||
3538 | // Note that we inverted the condition above, so we reverse the order of | |
3539 | // the true and false operands here. This will allow the setcc to be | |
3540 | // matched to a single CSINC instruction. | |
3541 | return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp); | |
3542 | } else { | |
3543 | // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't | |
3544 | // totally clean. Some of them require two CSELs to implement. As is in | |
3545 | // this case, we emit the first CSEL and then emit a second using the output | |
3546 | // of the first as the RHS. We're effectively OR'ing the two CC's together. | |
3547 | ||
3548 | // FIXME: It would be nice if we could match the two CSELs to two CSINCs. | |
3549 | SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); | |
3550 | SDValue CS1 = | |
3551 | DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); | |
3552 | ||
3553 | SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); | |
3554 | return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); | |
3555 | } | |
3556 | } | |
3557 | ||
3558 | /// A SELECT_CC operation is really some kind of max or min if both values being | |
3559 | /// compared are, in some sense, equal to the results in either case. However, | |
3560 | /// it is permissible to compare f32 values and produce directly extended f64 | |
3561 | /// values. | |
3562 | /// | |
3563 | /// Extending the comparison operands would also be allowed, but is less likely | |
3564 | /// to happen in practice since their use is right here. Note that truncate | |
3565 | /// operations would *not* be semantically equivalent. | |
3566 | static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) { | |
3567 | if (Cmp == Result) | |
3568 | return true; | |
3569 | ||
3570 | ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp); | |
3571 | ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result); | |
3572 | if (CCmp && CResult && Cmp.getValueType() == MVT::f32 && | |
3573 | Result.getValueType() == MVT::f64) { | |
3574 | bool Lossy; | |
3575 | APFloat CmpVal = CCmp->getValueAPF(); | |
3576 | CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy); | |
3577 | return CResult->getValueAPF().bitwiseIsEqual(CmpVal); | |
3578 | } | |
3579 | ||
3580 | return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp; | |
3581 | } | |
3582 | ||
3583 | SDValue AArch64TargetLowering::LowerSELECT(SDValue Op, | |
3584 | SelectionDAG &DAG) const { | |
3585 | SDValue CC = Op->getOperand(0); | |
3586 | SDValue TVal = Op->getOperand(1); | |
3587 | SDValue FVal = Op->getOperand(2); | |
3588 | SDLoc DL(Op); | |
3589 | ||
3590 | unsigned Opc = CC.getOpcode(); | |
3591 | // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select | |
3592 | // instruction. | |
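| // E.g. the overflow bit of @llvm.sadd.with.overflow.i32 feeding a select | |
| // becomes a single ADDS followed by a CSEL on the VS (overflow) condition. | |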
3593 | if (CC.getResNo() == 1 && | |
3594 | (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO || | |
3595 | Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) { | |
3596 | // Only lower legal XALUO ops. | |
3597 | if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0))) | |
3598 | return SDValue(); | |
3599 | ||
3600 | AArch64CC::CondCode OFCC; | |
3601 | SDValue Value, Overflow; | |
3602 | std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG); | |
3603 | SDValue CCVal = DAG.getConstant(OFCC, MVT::i32); | |
3604 | ||
3605 | return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, | |
3606 | CCVal, Overflow); | |
3607 | } | |
3608 | ||
3609 | if (CC.getOpcode() == ISD::SETCC) | |
3610 | return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal, | |
3611 | cast<CondCodeSDNode>(CC.getOperand(2))->get()); | |
3612 | else | |
3613 | return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal, | |
3614 | FVal, ISD::SETNE); | |
3615 | } | |
3616 | ||
3617 | SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op, | |
3618 | SelectionDAG &DAG) const { | |
3619 | ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get(); | |
3620 | SDValue LHS = Op.getOperand(0); | |
3621 | SDValue RHS = Op.getOperand(1); | |
3622 | SDValue TVal = Op.getOperand(2); | |
3623 | SDValue FVal = Op.getOperand(3); | |
3624 | SDLoc dl(Op); | |
3625 | ||
3626 | // Handle f128 first, because it will result in a comparison of some RTLIB | |
3627 | // call result against zero. | |
3628 | if (LHS.getValueType() == MVT::f128) { | |
3629 | softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl); | |
3630 | ||
3631 | // If softenSetCCOperands returned a scalar, we need to compare the result | |
3632 | // against zero to select between true and false values. | |
3633 | if (!RHS.getNode()) { | |
3634 | RHS = DAG.getConstant(0, LHS.getValueType()); | |
3635 | CC = ISD::SETNE; | |
3636 | } | |
3637 | } | |
3638 | ||
3639 | // Handle integers first. | |
3640 | if (LHS.getValueType().isInteger()) { | |
3641 | assert((LHS.getValueType() == RHS.getValueType()) && | |
3642 | (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64)); | |
3643 | ||
3644 | unsigned Opcode = AArch64ISD::CSEL; | |
3645 | ||
3646 | // If both the TVal and the FVal are constants, see if we can swap them in | |
3647 | // order to form a CSINV or CSINC out of them. | |
3648 | ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal); | |
3649 | ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal); | |
3650 | ||
3651 | if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { | |
3652 | std::swap(TVal, FVal); | |
3653 | std::swap(CTVal, CFVal); | |
3654 | CC = ISD::getSetCCInverse(CC, true); | |
3655 | } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { | |
3656 | std::swap(TVal, FVal); | |
3657 | std::swap(CTVal, CFVal); | |
3658 | CC = ISD::getSetCCInverse(CC, true); | |
3659 | } else if (TVal.getOpcode() == ISD::XOR) { | |
3660 | // If TVal is a NOT we want to swap TVal and FVal so that we can match | |
3661 | // with a CSINV rather than a CSEL. | |
3662 | ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1)); | |
3663 | ||
3664 | if (CVal && CVal->isAllOnesValue()) { | |
3665 | std::swap(TVal, FVal); | |
3666 | std::swap(CTVal, CFVal); | |
3667 | CC = ISD::getSetCCInverse(CC, true); | |
3668 | } | |
3669 | } else if (TVal.getOpcode() == ISD::SUB) { | |
3670 | // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so | |
3671 | // that we can match with a CSNEG rather than a CSEL. | |
3672 | ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0)); | |
3673 | ||
3674 | if (CVal && CVal->isNullValue()) { | |
3675 | std::swap(TVal, FVal); | |
3676 | std::swap(CTVal, CFVal); | |
3677 | CC = ISD::getSetCCInverse(CC, true); | |
3678 | } | |
3679 | } else if (CTVal && CFVal) { | |
3680 | const int64_t TrueVal = CTVal->getSExtValue(); | |
3681 | const int64_t FalseVal = CFVal->getSExtValue(); | |
3682 | bool Swap = false; | |
3683 | ||
3684 | // If both TVal and FVal are constants, see if FVal is the | |
3685 | // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC | |
3686 | // instead of a CSEL in that case. | |
3687 | if (TrueVal == ~FalseVal) { | |
3688 | Opcode = AArch64ISD::CSINV; | |
3689 | } else if (TrueVal == -FalseVal) { | |
3690 | Opcode = AArch64ISD::CSNEG; | |
3691 | } else if (TVal.getValueType() == MVT::i32) { | |
3692 | // If our operands are only 32-bit wide, make sure we use 32-bit | |
3693 | // arithmetic for the check whether we can use CSINC. This ensures that | |
3694 | // the addition in the check will wrap around properly in case there is | |
3695 | // an overflow (which would not be the case if we do the check with | |
3696 | // 64-bit arithmetic). | |
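| // E.g. for i32 TVal = INT32_MIN and FVal = INT32_MAX, the 32-bit check | |
| // 0x7fffffff + 1 == 0x80000000 succeeds thanks to the wraparound, while | |
| // the sign-extended 64-bit values 0x000000007fffffff + 1 and | |
| // 0xffffffff80000000 would not compare equal. | |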
3697 | const uint32_t TrueVal32 = CTVal->getZExtValue(); | |
3698 | const uint32_t FalseVal32 = CFVal->getZExtValue(); | |
3699 | ||
3700 | if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) { | |
3701 | Opcode = AArch64ISD::CSINC; | |
3702 | ||
3703 | if (TrueVal32 > FalseVal32) { | |
3704 | Swap = true; | |
3705 | } | |
3706 | } | |
3707 | // 64-bit check whether we can use CSINC. | |
3708 | } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) { | |
3709 | Opcode = AArch64ISD::CSINC; | |
3710 | ||
3711 | if (TrueVal > FalseVal) { | |
3712 | Swap = true; | |
3713 | } | |
3714 | } | |
3715 | ||
3716 | // Swap TVal and FVal if necessary. | |
3717 | if (Swap) { | |
3718 | std::swap(TVal, FVal); | |
3719 | std::swap(CTVal, CFVal); | |
3720 | CC = ISD::getSetCCInverse(CC, true); | |
3721 | } | |
3722 | ||
3723 | if (Opcode != AArch64ISD::CSEL) { | |
3724 | // Drop FVal since we can get its value by simply inverting/negating | |
3725 | // TVal. | |
3726 | FVal = TVal; | |
3727 | } | |
3728 | } | |
3729 | ||
3730 | SDValue CCVal; | |
3731 | SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl); | |
3732 | ||
3733 | EVT VT = Op.getValueType(); | |
3734 | return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp); | |
3735 | } | |
3736 | ||
3737 | // Now we know we're dealing with FP values. | |
3738 | assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64); | |
3739 | assert(LHS.getValueType() == RHS.getValueType()); | |
3740 | EVT VT = Op.getValueType(); | |
3741 | ||
3742 | // Try to match this select into a max/min operation, which has a dedicated | |
3743 | // opcode in the instruction set. | |
3744 | // FIXME: This is not correct in the presence of NaNs, so we only enable this | |
3745 | // in no-NaNs mode. | |
3746 | if (getTargetMachine().Options.NoNaNsFPMath) { | |
3747 | SDValue MinMaxLHS = TVal, MinMaxRHS = FVal; | |
3748 | if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) && | |
3749 | selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) { | |
3750 | CC = ISD::getSetCCSwappedOperands(CC); | |
3751 | std::swap(MinMaxLHS, MinMaxRHS); | |
3752 | } | |
3753 | ||
3754 | if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) && | |
3755 | selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) { | |
3756 | switch (CC) { | |
3757 | default: | |
3758 | break; | |
3759 | case ISD::SETGT: | |
3760 | case ISD::SETGE: | |
3761 | case ISD::SETUGT: | |
3762 | case ISD::SETUGE: | |
3763 | case ISD::SETOGT: | |
3764 | case ISD::SETOGE: | |
3765 | return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS); | |
3766 | break; | |
3767 | case ISD::SETLT: | |
3768 | case ISD::SETLE: | |
3769 | case ISD::SETULT: | |
3770 | case ISD::SETULE: | |
3771 | case ISD::SETOLT: | |
3772 | case ISD::SETOLE: | |
3773 | return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS); | |
3774 | break; | |
3775 | } | |
3776 | } | |
3777 | } | |
3778 | ||
3779 | // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead | |
3780 | // and do the comparison. | |
3781 | SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG); | |
3782 | ||
3783 | // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally | |
3784 | // clean. Some of them require two CSELs to implement. | |
3785 | AArch64CC::CondCode CC1, CC2; | |
3786 | changeFPCCToAArch64CC(CC, CC1, CC2); | |
3787 | SDValue CC1Val = DAG.getConstant(CC1, MVT::i32); | |
3788 | SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp); | |
3789 | ||
3790 | // If we need a second CSEL, emit it, using the output of the first as the | |
3791 | // RHS. We're effectively OR'ing the two CC's together. | |
3792 | if (CC2 != AArch64CC::AL) { | |
3793 | SDValue CC2Val = DAG.getConstant(CC2, MVT::i32); | |
3794 | return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp); | |
3795 | } | |
3796 | ||
3797 | // Otherwise, return the output of the first CSEL. | |
3798 | return CS1; | |
3799 | } | |
3800 | ||
3801 | SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op, | |
3802 | SelectionDAG &DAG) const { | |
3803 | // Jump table entries are PC-relative offsets. No additional tweaking | |
3804 | // is necessary here. Just get the address of the jump table. | |
3805 | JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); | |
3806 | EVT PtrVT = getPointerTy(); | |
3807 | SDLoc DL(Op); | |
3808 | ||
3809 | if (getTargetMachine().getCodeModel() == CodeModel::Large && | |
3810 | !Subtarget->isTargetMachO()) { | |
3811 | const unsigned char MO_NC = AArch64II::MO_NC; | |
3812 | return DAG.getNode( | |
3813 | AArch64ISD::WrapperLarge, DL, PtrVT, | |
3814 | DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3), | |
3815 | DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC), | |
3816 | DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC), | |
3817 | DAG.getTargetJumpTable(JT->getIndex(), PtrVT, | |
3818 | AArch64II::MO_G0 | MO_NC)); | |
3819 | } | |
3820 | ||
3821 | SDValue Hi = | |
3822 | DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE); | |
3823 | SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, | |
3824 | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); | |
3825 | SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); | |
3826 | return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); | |
3827 | } | |
3828 | ||
3829 | SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op, | |
3830 | SelectionDAG &DAG) const { | |
3831 | ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op); | |
3832 | EVT PtrVT = getPointerTy(); | |
3833 | SDLoc DL(Op); | |
3834 | ||
3835 | if (getTargetMachine().getCodeModel() == CodeModel::Large) { | |
3836 | // Use the GOT for the large code model on iOS. | |
3837 | if (Subtarget->isTargetMachO()) { | |
3838 | SDValue GotAddr = DAG.getTargetConstantPool( | |
3839 | CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), | |
3840 | AArch64II::MO_GOT); | |
3841 | return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr); | |
3842 | } | |
3843 | ||
3844 | const unsigned char MO_NC = AArch64II::MO_NC; | |
3845 | return DAG.getNode( | |
3846 | AArch64ISD::WrapperLarge, DL, PtrVT, | |
3847 | DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), | |
3848 | CP->getOffset(), AArch64II::MO_G3), | |
3849 | DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), | |
3850 | CP->getOffset(), AArch64II::MO_G2 | MO_NC), | |
3851 | DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), | |
3852 | CP->getOffset(), AArch64II::MO_G1 | MO_NC), | |
3853 | DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), | |
3854 | CP->getOffset(), AArch64II::MO_G0 | MO_NC)); | |
3855 | } else { | |
3856 | // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on | |
3857 | // ELF, and the only valid model on Darwin. | |
3858 | SDValue Hi = | |
3859 | DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(), | |
3860 | CP->getOffset(), AArch64II::MO_PAGE); | |
3861 | SDValue Lo = DAG.getTargetConstantPool( | |
3862 | CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), | |
3863 | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); | |
3864 | ||
3865 | SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); | |
3866 | return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); | |
3867 | } | |
3868 | } | |
3869 | ||
3870 | SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op, | |
3871 | SelectionDAG &DAG) const { | |
3872 | const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress(); | |
3873 | EVT PtrVT = getPointerTy(); | |
3874 | SDLoc DL(Op); | |
3875 | if (getTargetMachine().getCodeModel() == CodeModel::Large && | |
3876 | !Subtarget->isTargetMachO()) { | |
3877 | const unsigned char MO_NC = AArch64II::MO_NC; | |
3878 | return DAG.getNode( | |
3879 | AArch64ISD::WrapperLarge, DL, PtrVT, | |
3880 | DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3), | |
3881 | DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC), | |
3882 | DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC), | |
3883 | DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC)); | |
3884 | } else { | |
3885 | SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE); | |
3886 | SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF | | |
3887 | AArch64II::MO_NC); | |
3888 | SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi); | |
3889 | return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo); | |
3890 | } | |
3891 | } | |
3892 | ||
3893 | SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, | |
3894 | SelectionDAG &DAG) const { | |
3895 | AArch64FunctionInfo *FuncInfo = | |
3896 | DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); | |
3897 | ||
3898 | SDLoc DL(Op); | |
3899 | SDValue FR = | |
3900 | DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); | |
3901 | const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); | |
3902 | return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), | |
3903 | MachinePointerInfo(SV), false, false, 0); | |
3904 | } | |
3905 | ||
3906 | SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, | |
3907 | SelectionDAG &DAG) const { | |
3908 | // The layout of the va_list struct is specified in the AArch64 Procedure Call | |
3909 | // Standard, section B.3. | |
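| // For reference, the stores below initialize a struct equivalent to the | |
| // AAPCS-defined C type: | |
| //   struct va_list { | |
| //     void *__stack;   // offset 0: next stacked-argument slot | |
| //     void *__gr_top;  // offset 8: end of the GPR save area | |
| //     void *__vr_top;  // offset 16: end of the FPR/SIMD save area | |
| //     int __gr_offs;   // offset 24: negative offset from __gr_top | |
| //     int __vr_offs;   // offset 28: negative offset from __vr_top | |
| //   }; | |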
3910 | MachineFunction &MF = DAG.getMachineFunction(); | |
3911 | AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); | |
3912 | SDLoc DL(Op); | |
3913 | ||
3914 | SDValue Chain = Op.getOperand(0); | |
3915 | SDValue VAList = Op.getOperand(1); | |
3916 | const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); | |
3917 | SmallVector<SDValue, 4> MemOps; | |
3918 | ||
3919 | // void *__stack at offset 0 | |
3920 | SDValue Stack = | |
3921 | DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy()); | |
3922 | MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList, | |
3923 | MachinePointerInfo(SV), false, false, 8)); | |
3924 | ||
3925 | // void *__gr_top at offset 8 | |
3926 | int GPRSize = FuncInfo->getVarArgsGPRSize(); | |
3927 | if (GPRSize > 0) { | |
3928 | SDValue GRTop, GRTopAddr; | |
3929 | ||
3930 | GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, | |
3931 | DAG.getConstant(8, getPointerTy())); | |
3932 | ||
3933 | GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy()); | |
3934 | GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop, | |
3935 | DAG.getConstant(GPRSize, getPointerTy())); | |
3936 | ||
3937 | MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr, | |
3938 | MachinePointerInfo(SV, 8), false, false, 8)); | |
3939 | } | |
3940 | ||
3941 | // void *__vr_top at offset 16 | |
3942 | int FPRSize = FuncInfo->getVarArgsFPRSize(); | |
3943 | if (FPRSize > 0) { | |
3944 | SDValue VRTop, VRTopAddr; | |
3945 | VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, | |
3946 | DAG.getConstant(16, getPointerTy())); | |
3947 | ||
3948 | VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy()); | |
3949 | VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop, | |
3950 | DAG.getConstant(FPRSize, getPointerTy())); | |
3951 | ||
3952 | MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr, | |
3953 | MachinePointerInfo(SV, 16), false, false, 8)); | |
3954 | } | |
3955 | ||
3956 | // int __gr_offs at offset 24 | |
3957 | SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, | |
3958 | DAG.getConstant(24, getPointerTy())); | |
3959 | MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32), | |
3960 | GROffsAddr, MachinePointerInfo(SV, 24), false, | |
3961 | false, 4)); | |
3962 | ||
3963 | // int __vr_offs at offset 28 | |
3964 | SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, | |
3965 | DAG.getConstant(28, getPointerTy())); | |
3966 | MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32), | |
3967 | VROffsAddr, MachinePointerInfo(SV, 28), false, | |
3968 | false, 4)); | |
3969 | ||
3970 | return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); | |
3971 | } | |
3972 | ||
3973 | SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, | |
3974 | SelectionDAG &DAG) const { | |
3975 | return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) | |
3976 | : LowerAAPCS_VASTART(Op, DAG); | |
3977 | } | |
3978 | ||
3979 | SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, | |
3980 | SelectionDAG &DAG) const { | |
3981 | // AAPCS has three pointers and two ints (= 32 bytes); Darwin has a single | |
3982 | // pointer. | |
3983 | unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32; | |
3984 | const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); | |
3985 | const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); | |
3986 | ||
3987 | return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1), | |
3988 | Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32), | |
3989 | 8, false, false, MachinePointerInfo(DestSV), | |
3990 | MachinePointerInfo(SrcSV)); | |
3991 | } | |
3992 | ||
3993 | SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { | |
3994 | assert(Subtarget->isTargetDarwin() && | |
3995 | "automatic va_arg instruction only works on Darwin"); | |
3996 | ||
3997 | const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); | |
3998 | EVT VT = Op.getValueType(); | |
3999 | SDLoc DL(Op); | |
4000 | SDValue Chain = Op.getOperand(0); | |
4001 | SDValue Addr = Op.getOperand(1); | |
4002 | unsigned Align = Op.getConstantOperandVal(3); | |
4003 | ||
4004 | SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr, | |
4005 | MachinePointerInfo(V), false, false, false, 0); | |
4006 | Chain = VAList.getValue(1); | |
4007 | ||
4008 | if (Align > 8) { | |
4009 | assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2"); | |
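| // Round VAList up to the alignment: (VAList + Align - 1) & ~(Align - 1). | |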
4010 | VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, | |
4011 | DAG.getConstant(Align - 1, getPointerTy())); | |
4012 | VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList, | |
4013 | DAG.getConstant(-(int64_t)Align, getPointerTy())); | |
4014 | } | |
4015 | ||
4016 | Type *ArgTy = VT.getTypeForEVT(*DAG.getContext()); | |
4017 | uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy); | |
4018 | ||
4019 | // Scalar integer and FP values smaller than 64 bits are implicitly extended | |
4020 | // up to 64 bits. At the very least, we have to increase the striding of the | |
4021 | // vaargs list to match this, and for FP values we need to introduce | |
4022 | // FP_ROUND nodes as well. | |
4023 | if (VT.isInteger() && !VT.isVector()) | |
4024 | ArgSize = 8; | |
4025 | bool NeedFPTrunc = false; | |
4026 | if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) { | |
4027 | ArgSize = 8; | |
4028 | NeedFPTrunc = true; | |
4029 | } | |
4030 | ||
4031 | // Increment the pointer, VAList, to the next vaarg | |
4032 | SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList, | |
4033 | DAG.getConstant(ArgSize, getPointerTy())); | |
4034 | // Store the incremented VAList to the legalized pointer | |
4035 | SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V), | |
4036 | false, false, 0); | |
4037 | ||
4038 | // Load the actual argument out of the pointer VAList | |
4039 | if (NeedFPTrunc) { | |
4040 | // Load the value as an f64. | |
4041 | SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList, | |
4042 | MachinePointerInfo(), false, false, false, 0); | |
4043 | // Round the value down to an f32. | |
4044 | SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0), | |
4045 | DAG.getIntPtrConstant(1)); | |
4046 | SDValue Ops[] = { NarrowFP, WideFP.getValue(1) }; | |
4047 | // Merge the rounded value with the chain output of the load. | |
4048 | return DAG.getMergeValues(Ops, DL); | |
4049 | } | |
4050 | ||
4051 | return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false, | |
4052 | false, false, 0); | |
4053 | } | |
4054 | ||
4055 | SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, | |
4056 | SelectionDAG &DAG) const { | |
4057 | MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); | |
4058 | MFI->setFrameAddressIsTaken(true); | |
4059 | ||
4060 | EVT VT = Op.getValueType(); | |
4061 | SDLoc DL(Op); | |
4062 | unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); | |
4063 | SDValue FrameAddr = | |
4064 | DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT); | |
4065 | while (Depth--) | |
4066 | FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr, | |
4067 | MachinePointerInfo(), false, false, false, 0); | |
4068 | return FrameAddr; | |
4069 | } | |
4070 | ||
4071 | // FIXME? Maybe this could be a TableGen attribute on some registers and | |
4072 | // this table could be generated automatically from RegInfo. | |
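| // This hook resolves names used by the llvm.read_register and | |
| // llvm.write_register intrinsics; only "sp" is currently recognized. | |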
4073 | unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, | |
4074 | EVT VT) const { | |
4075 | unsigned Reg = StringSwitch<unsigned>(RegName) | |
4076 | .Case("sp", AArch64::SP) | |
4077 | .Default(0); | |
4078 | if (Reg) | |
4079 | return Reg; | |
4080 | report_fatal_error("Invalid register name for a global register variable"); | |
4081 | } | |
4082 | ||
4083 | SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, | |
4084 | SelectionDAG &DAG) const { | |
4085 | MachineFunction &MF = DAG.getMachineFunction(); | |
4086 | MachineFrameInfo *MFI = MF.getFrameInfo(); | |
4087 | MFI->setReturnAddressIsTaken(true); | |
4088 | ||
4089 | EVT VT = Op.getValueType(); | |
4090 | SDLoc DL(Op); | |
4091 | unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); | |
4092 | if (Depth) { | |
4093 | SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); | |
4094 | SDValue Offset = DAG.getConstant(8, getPointerTy()); | |
4095 | return DAG.getLoad(VT, DL, DAG.getEntryNode(), | |
4096 | DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset), | |
4097 | MachinePointerInfo(), false, false, false, 0); | |
4098 | } | |
4099 | ||
4100 | // Return LR, which contains the return address. Mark it an implicit live-in. | |
4101 | unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass); | |
4102 | return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); | |
4103 | } | |
4104 | ||
4105 | /// LowerShiftRightParts - Lower SRA_PARTS/SRL_PARTS, which return two | |
4106 | /// i64 values and take a 2 x i64 value to shift plus a shift amount. | |
4107 | SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op, | |
4108 | SelectionDAG &DAG) const { | |
4109 | assert(Op.getNumOperands() == 3 && "Not a double-shift!"); | |
4110 | EVT VT = Op.getValueType(); | |
4111 | unsigned VTBits = VT.getSizeInBits(); | |
4112 | SDLoc dl(Op); | |
4113 | SDValue ShOpLo = Op.getOperand(0); | |
4114 | SDValue ShOpHi = Op.getOperand(1); | |
4115 | SDValue ShAmt = Op.getOperand(2); | |
4116 | SDValue ARMcc; | |
4117 | unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL; | |
4118 | ||
4119 | assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS); | |
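| // Conceptually, with ExtraShAmt = ShAmt - 64 and the GE flag choosing | |
| // between the two arms, this computes (illustrative sketch): | |
| //   lo = ShAmt >= 64 ? hi >> (ShAmt - 64) | |
| //                    : (lo >> ShAmt) | (hi << (64 - ShAmt)) | |
| //   hi = ShAmt >= 64 ? (SRA ? hi >> 63 : 0) : hi >> ShAmt | |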
4120 | ||
4121 | SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, | |
4122 | DAG.getConstant(VTBits, MVT::i64), ShAmt); | |
4123 | SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt); | |
4124 | SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, | |
4125 | DAG.getConstant(VTBits, MVT::i64)); | |
4126 | SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt); | |
4127 | ||
4128 | SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), | |
4129 | ISD::SETGE, dl, DAG); | |
4130 | SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); | |
4131 | ||
4132 | SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); | |
4133 | SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt); | |
4134 | SDValue Lo = | |
4135 | DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); | |
4136 | ||
4137 | // AArch64 shifts larger than the register width are wrapped rather than | |
4138 | // clamped, so we can't just emit "hi >> x". | |
4139 | SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt); | |
4140 | SDValue TrueValHi = Opc == ISD::SRA | |
4141 | ? DAG.getNode(Opc, dl, VT, ShOpHi, | |
4142 | DAG.getConstant(VTBits - 1, MVT::i64)) | |
4143 | : DAG.getConstant(0, VT); | |
4144 | SDValue Hi = | |
4145 | DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp); | |
4146 | ||
4147 | SDValue Ops[2] = { Lo, Hi }; | |
4148 | return DAG.getMergeValues(Ops, dl); | |
4149 | } | |
4150 | ||
4151 | /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two | |
4152 | /// i64 values and takes a 2 x i64 value to shift plus a shift amount. | |
4153 | SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op, | |
4154 | SelectionDAG &DAG) const { | |
4155 | assert(Op.getNumOperands() == 3 && "Not a double-shift!"); | |
4156 | EVT VT = Op.getValueType(); | |
4157 | unsigned VTBits = VT.getSizeInBits(); | |
4158 | SDLoc dl(Op); | |
4159 | SDValue ShOpLo = Op.getOperand(0); | |
4160 | SDValue ShOpHi = Op.getOperand(1); | |
4161 | SDValue ShAmt = Op.getOperand(2); | |
4162 | SDValue ARMcc; | |
4163 | ||
4164 | assert(Op.getOpcode() == ISD::SHL_PARTS); | |
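| // Mirror image of LowerShiftRightParts (illustrative sketch): | |
| //   hi = ShAmt >= 64 ? lo << (ShAmt - 64) | |
| //                    : (hi << ShAmt) | (lo >> (64 - ShAmt)) | |
| //   lo = ShAmt >= 64 ? 0 : lo << ShAmt | |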
4165 | SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, | |
4166 | DAG.getConstant(VTBits, MVT::i64), ShAmt); | |
4167 | SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt); | |
4168 | SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt, | |
4169 | DAG.getConstant(VTBits, MVT::i64)); | |
4170 | SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt); | |
4171 | SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt); | |
4172 | ||
4173 | SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2); | |
4174 | ||
4175 | SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), | |
4176 | ISD::SETGE, dl, DAG); | |
4177 | SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32); | |
4178 | SDValue Hi = | |
4179 | DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp); | |
4180 | ||
4181 | // AArch64 shifts larger than the register width are wrapped rather than | |
4182 | // clamped, so we can't just emit "lo << a" if a is too big. | |
4183 | SDValue TrueValLo = DAG.getConstant(0, VT); | |
4184 | SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt); | |
4185 | SDValue Lo = | |
4186 | DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp); | |
4187 | ||
4188 | SDValue Ops[2] = { Lo, Hi }; | |
4189 | return DAG.getMergeValues(Ops, dl); | |
4190 | } | |
4191 | ||
4192 | bool AArch64TargetLowering::isOffsetFoldingLegal( | |
4193 | const GlobalAddressSDNode *GA) const { | |
4194 | // The AArch64 target doesn't support folding offsets into global addresses. | |
4195 | return false; | |
4196 | } | |
4197 | ||
4198 | bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { | |
4199 | // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases. | |
4200 | // FIXME: We should be able to handle f128 as well with a clever lowering. | |
4201 | if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32)) | |
4202 | return true; | |
4203 | ||
4204 | if (VT == MVT::f64) | |
4205 | return AArch64_AM::getFP64Imm(Imm) != -1; | |
4206 | else if (VT == MVT::f32) | |
4207 | return AArch64_AM::getFP32Imm(Imm) != -1; | |
4208 | return false; | |
4209 | } | |
4210 | ||
4211 | //===----------------------------------------------------------------------===// | |
4212 | // AArch64 Optimization Hooks | |
4213 | //===----------------------------------------------------------------------===// | |
4214 | ||
4215 | //===----------------------------------------------------------------------===// | |
4216 | // AArch64 Inline Assembly Support | |
4217 | //===----------------------------------------------------------------------===// | |
4218 | ||
4219 | // Table of Constraints | |
4220 | // TODO: This is the current set of constraints supported by ARM for the | |
4221 | // compiler; not all of them may make sense, e.g. S may be difficult to support. | |
4222 | // | |
4223 | // r - A general register | |
4224 | // w - An FP/SIMD register of some size in the range v0-v31 | |
4225 | // x - An FP/SIMD register of some size in the range v0-v15 | |
4226 | // I - Constant that can be used with an ADD instruction | |
4227 | // J - Constant that can be used with a SUB instruction | |
4228 | // K - Constant that can be used with a 32-bit logical instruction | |
4229 | // L - Constant that can be used with a 64-bit logical instruction | |
4230 | // M - Constant that can be used as a 32-bit MOV immediate | |
4231 | // N - Constant that can be used as a 64-bit MOV immediate | |
4232 | // Q - A memory reference with base register and no offset | |
4233 | // S - A symbolic address | |
4234 | // Y - Floating point constant zero | |
4235 | // Z - Integer constant zero | |
4236 | // | |
4237 | // Note that general register operands will be output using their 64-bit x | |
4238 | // register name, whatever the size of the variable, unless the asm operand | |
4239 | // is prefixed by the %w modifier. Floating-point and SIMD register operands | |
4240 | // will be output with the v prefix unless prefixed by the %b, %h, %s, %d or | |
4241 | // %q modifier. | |
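| // A hypothetical user-level example of the 'r' and 'w' constraints: | |
| //   asm("umulh %0, %1, %2" : "=r"(hi) : "r"(a), "r"(b)); | |
| //   asm("fabs %s0, %s1" : "=w"(out) : "w"(in)); | |
| // where the %s modifier selects the 32-bit s-form of the SIMD/FP register. | |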
4242 | ||
4243 | /// getConstraintType - Given a constraint letter, return the type of | |
4244 | /// constraint it is for this target. | |
4245 | AArch64TargetLowering::ConstraintType | |
4246 | AArch64TargetLowering::getConstraintType(const std::string &Constraint) const { | |
4247 | if (Constraint.size() == 1) { | |
4248 | switch (Constraint[0]) { | |
4249 | default: | |
4250 | break; | |
4251 | case 'z': | |
4252 | return C_Other; | |
4253 | case 'x': | |
4254 | case 'w': | |
4255 | return C_RegisterClass; | |
4256 | // An address with a single base register. Due to the way we | |
4257 | // currently handle addresses, it is the same as 'r'. | |
4258 | case 'Q': | |
4259 | return C_Memory; | |
4260 | } | |
4261 | } | |
4262 | return TargetLowering::getConstraintType(Constraint); | |
4263 | } | |
4264 | ||
4265 | /// Examine constraint type and operand type and determine a weight value. | |
4266 | /// This object must already have been set up with the operand type | |
4267 | /// and the current alternative constraint selected. | |
4268 | TargetLowering::ConstraintWeight | |
4269 | AArch64TargetLowering::getSingleConstraintMatchWeight( | |
4270 | AsmOperandInfo &info, const char *constraint) const { | |
4271 | ConstraintWeight weight = CW_Invalid; | |
4272 | Value *CallOperandVal = info.CallOperandVal; | |
4273 | // If we don't have a value, we can't do a match, | |
4274 | // but allow it at the lowest weight. | |
4275 | if (!CallOperandVal) | |
4276 | return CW_Default; | |
4277 | Type *type = CallOperandVal->getType(); | |
4278 | // Look at the constraint type. | |
4279 | switch (*constraint) { | |
4280 | default: | |
4281 | weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); | |
4282 | break; | |
4283 | case 'x': | |
4284 | case 'w': | |
4285 | if (type->isFloatingPointTy() || type->isVectorTy()) | |
4286 | weight = CW_Register; | |
4287 | break; | |
4288 | case 'z': | |
4289 | weight = CW_Constant; | |
4290 | break; | |
4291 | } | |
4292 | return weight; | |
4293 | } | |
4294 | ||
4295 | std::pair<unsigned, const TargetRegisterClass *> | |
4296 | AArch64TargetLowering::getRegForInlineAsmConstraint( | |
4297 | const std::string &Constraint, MVT VT) const { | |
4298 | if (Constraint.size() == 1) { | |
4299 | switch (Constraint[0]) { | |
4300 | case 'r': | |
4301 | if (VT.getSizeInBits() == 64) | |
4302 | return std::make_pair(0U, &AArch64::GPR64commonRegClass); | |
4303 | return std::make_pair(0U, &AArch64::GPR32commonRegClass); | |
4304 | case 'w': | |
4305 | if (VT == MVT::f32) | |
4306 | return std::make_pair(0U, &AArch64::FPR32RegClass); | |
4307 | if (VT.getSizeInBits() == 64) | |
4308 | return std::make_pair(0U, &AArch64::FPR64RegClass); | |
4309 | if (VT.getSizeInBits() == 128) | |
4310 | return std::make_pair(0U, &AArch64::FPR128RegClass); | |
4311 | break; | |
4312 | // The instructions that this constraint is designed for can | |
4313 | // only take 128-bit registers, so just use that regclass. | |
4314 | case 'x': | |
4315 | if (VT.getSizeInBits() == 128) | |
4316 | return std::make_pair(0U, &AArch64::FPR128_loRegClass); | |
4317 | break; | |
4318 | } | |
4319 | } | |
4320 | if (StringRef("{cc}").equals_lower(Constraint)) | |
4321 | return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass); | |
4322 | ||
4323 | // Use the default implementation in TargetLowering to convert the register | |
4324 | // constraint into a member of a register class. | |
4325 | std::pair<unsigned, const TargetRegisterClass *> Res; | |
4326 | Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT); | |
4327 | ||
4328 | // Not found as a standard register? | |
4329 | if (!Res.second) { | |
4330 | unsigned Size = Constraint.size(); | |
4331 | if ((Size == 4 || Size == 5) && Constraint[0] == '{' && | |
4332 | tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') { | |
4333 | const std::string Reg = | |
4334 | std::string(&Constraint[2], &Constraint[Size - 1]); | |
4335 | int RegNo = atoi(Reg.c_str()); | |
4336 | if (RegNo >= 0 && RegNo <= 31) { | |
4337 | // v0 - v31 are aliases of q0 - q31. | |
4338 | // By default we'll emit v0-v31 for this unless there's a modifier, in | |
4339 | // which case the correctly-sized register will be emitted instead. | |
4340 | Res.first = AArch64::FPR128RegClass.getRegister(RegNo); | |
4341 | Res.second = &AArch64::FPR128RegClass; | |
4342 | } | |
4343 | } | |
4344 | } | |
4345 | ||
4346 | return Res; | |
4347 | } | |
4348 | ||
4349 | /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops | |
4350 | /// vector. If it is invalid, don't add anything to Ops. | |
4351 | void AArch64TargetLowering::LowerAsmOperandForConstraint( | |
4352 | SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops, | |
4353 | SelectionDAG &DAG) const { | |
4354 | SDValue Result; | |
4355 | ||
4356 | // Currently only support length 1 constraints. | |
4357 | if (Constraint.length() != 1) | |
4358 | return; | |
4359 | ||
4360 | char ConstraintLetter = Constraint[0]; | |
4361 | switch (ConstraintLetter) { | |
4362 | default: | |
4363 | break; | |
4364 | ||
4365 | // This set of constraints deals with valid constants for various instructions. | |
4366 | // Validate and return a target constant for them if we can. | |
4367 | case 'z': { | |
4368 | // 'z' maps to xzr or wzr so it needs an input of 0. | |
4369 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); | |
4370 | if (!C || C->getZExtValue() != 0) | |
4371 | return; | |
4372 | ||
4373 | if (Op.getValueType() == MVT::i64) | |
4374 | Result = DAG.getRegister(AArch64::XZR, MVT::i64); | |
4375 | else | |
4376 | Result = DAG.getRegister(AArch64::WZR, MVT::i32); | |
4377 | break; | |
4378 | } | |
4379 | ||
4380 | case 'I': | |
4381 | case 'J': | |
4382 | case 'K': | |
4383 | case 'L': | |
4384 | case 'M': | |
4385 | case 'N': | |
4386 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op); | |
4387 | if (!C) | |
4388 | return; | |
4389 | ||
4390 | // Grab the value and do some validation. | |
4391 | uint64_t CVal = C->getZExtValue(); | |
4392 | switch (ConstraintLetter) { | |
4393 | // The I constraint applies only to simple ADD or SUB immediate operands: | |
4394 | // i.e. 0 to 4095 with optional shift by 12 | |
4395 | // The J constraint applies only to ADD or SUB immediates that would be | |
4396 | // valid when negated, i.e. if [an add pattern] were to be output as a SUB | |
4397 | // instruction [or vice versa], in other words -1 to -4095 with optional | |
4398 | // left shift by 12. | |
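| // E.g. 4095 (0xfff) and 0xfff000 both satisfy 'I', while 0x1001 does not; | |
| // correspondingly, -1 to -4095 (optionally shifted left by 12) satisfy 'J'. | |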
4399 | case 'I': | |
4400 | if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal)) | |
4401 | break; | |
4402 | return; | |
4403 | case 'J': { | |
4404 | uint64_t NVal = -C->getSExtValue(); | |
4405 | if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) { | |
4406 | CVal = C->getSExtValue(); | |
4407 | break; | |
4408 | } | |
4409 | return; | |
4410 | } | |
4411 | // The K and L constraints apply *only* to logical immediates, including | |
4412 | // what used to be the MOVI alias for ORR (though the MOVI alias has now | |
4413 | // been removed and MOV should be used). So these constraints have to | |
4414 | // distinguish between bit patterns that are valid 32-bit or 64-bit | |
4415 | // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but | |
4416 | // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice | |
4417 | // versa. | |
4418 | case 'K': | |
4419 | if (AArch64_AM::isLogicalImmediate(CVal, 32)) | |
4420 | break; | |
4421 | return; | |
4422 | case 'L': | |
4423 | if (AArch64_AM::isLogicalImmediate(CVal, 64)) | |
4424 | break; | |
4425 | return; | |
4426 | // The M and N constraints are a superset of K and L respectively, for use | |
4427 | // with the MOV (immediate) alias. As well as the logical immediates they | |
4428 | // also match 32 or 64-bit immediates that can be loaded either using a | |
4429 | // *single* MOVZ or MOVN, such as 32-bit 0x12340000, 0x00001234, 0xffffedca | |
4430 | // (M) or 64-bit 0x1234000000000000 (N) etc. | |
4431 | // As a note some of this code is liberally stolen from the asm parser. | |
4432 | case 'M': { | |
4433 | if (!isUInt<32>(CVal)) | |
4434 | return; | |
4435 | if (AArch64_AM::isLogicalImmediate(CVal, 32)) | |
4436 | break; | |
4437 | if ((CVal & 0xFFFF) == CVal) | |
4438 | break; | |
4439 | if ((CVal & 0xFFFF0000ULL) == CVal) | |
4440 | break; | |
4441 | uint64_t NCVal = ~(uint32_t)CVal; | |
4442 | if ((NCVal & 0xFFFFULL) == NCVal) | |
4443 | break; | |
4444 | if ((NCVal & 0xFFFF0000ULL) == NCVal) | |
4445 | break; | |
4446 | return; | |
4447 | } | |
4448 | case 'N': { | |
4449 | if (AArch64_AM::isLogicalImmediate(CVal, 64)) | |
4450 | break; | |
4451 | if ((CVal & 0xFFFFULL) == CVal) | |
4452 | break; | |
4453 | if ((CVal & 0xFFFF0000ULL) == CVal) | |
4454 | break; | |
4455 | if ((CVal & 0xFFFF00000000ULL) == CVal) | |
4456 | break; | |
4457 | if ((CVal & 0xFFFF000000000000ULL) == CVal) | |
4458 | break; | |
4459 | uint64_t NCVal = ~CVal; | |
4460 | if ((NCVal & 0xFFFFULL) == NCVal) | |
4461 | break; | |
4462 | if ((NCVal & 0xFFFF0000ULL) == NCVal) | |
4463 | break; | |
4464 | if ((NCVal & 0xFFFF00000000ULL) == NCVal) | |
4465 | break; | |
4466 | if ((NCVal & 0xFFFF000000000000ULL) == NCVal) | |
4467 | break; | |
4468 | return; | |
4469 | } | |
4470 | default: | |
4471 | return; | |
4472 | } | |
4473 | ||
4474 | // All assembler immediates are 64-bit integers. | |
4475 | Result = DAG.getTargetConstant(CVal, MVT::i64); | |
4476 | break; | |
4477 | } | |
4478 | ||
4479 | if (Result.getNode()) { | |
4480 | Ops.push_back(Result); | |
4481 | return; | |
4482 | } | |
4483 | ||
4484 | return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); | |
4485 | } | |
4486 | ||
4487 | //===----------------------------------------------------------------------===// | |
4488 | // AArch64 Advanced SIMD Support | |
4489 | //===----------------------------------------------------------------------===// | |
4490 | ||
4491 | /// WidenVector - Given a value in the V64 register class, produce the | |
4492 | /// equivalent value in the V128 register class. | |
4493 | static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) { | |
4494 | EVT VT = V64Reg.getValueType(); | |
4495 | unsigned NarrowSize = VT.getVectorNumElements(); | |
4496 | MVT EltTy = VT.getVectorElementType().getSimpleVT(); | |
4497 | MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize); | |
4498 | SDLoc DL(V64Reg); | |
4499 | ||
4500 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy), | |
4501 | V64Reg, DAG.getConstant(0, MVT::i32)); | |
4502 | } | |
4503 | ||
4504 | /// getExtFactor - Determine the adjustment factor for the position when | |
4505 | /// generating an "extract from vector registers" instruction. | |
4506 | static unsigned getExtFactor(SDValue &V) { | |
4507 | EVT EltType = V.getValueType().getVectorElementType(); | |
4508 | return EltType.getSizeInBits() / 8; | |
4509 | } | |
4510 | ||
4511 | /// NarrowVector - Given a value in the V128 register class, produce the | |
4512 | /// equivalent value in the V64 register class. | |
4513 | static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) { | |
4514 | EVT VT = V128Reg.getValueType(); | |
4515 | unsigned WideSize = VT.getVectorNumElements(); | |
4516 | MVT EltTy = VT.getVectorElementType().getSimpleVT(); | |
4517 | MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2); | |
4518 | SDLoc DL(V128Reg); | |
4519 | ||
4520 | return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg); | |
4521 | } | |
4522 | ||
4523 | // Gather data to see if the operation can be modelled as a | |
4524 | // shuffle in combination with VEXTs. | |
4525 | SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op, | |
4526 | SelectionDAG &DAG) const { | |
4527 | assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); | |
4528 | SDLoc dl(Op); | |
4529 | EVT VT = Op.getValueType(); | |
4530 | unsigned NumElts = VT.getVectorNumElements(); | |
4531 | ||
4532 | struct ShuffleSourceInfo { | |
4533 | SDValue Vec; | |
4534 | unsigned MinElt; | |
4535 | unsigned MaxElt; | |
4536 | ||
4537 | // We may insert some combination of BITCASTs and VEXT nodes to force Vec to | |
4538 | // be compatible with the shuffle we intend to construct. As a result | |
4539 | // ShuffleVec will be some sliding window into the original Vec. | |
4540 | SDValue ShuffleVec; | |
4541 | ||
4542 | // Code should guarantee that element i in Vec starts at element | |
4543 | // "WindowBase + i * WindowScale" in ShuffleVec. | |
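| // E.g. a v4i16 source feeding a v8i8-typed shuffle has WindowScale 2: | |
| // element i of Vec covers lanes 2*i and 2*i+1 of ShuffleVec. | |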
4544 | int WindowBase; | |
4545 | int WindowScale; | |
4546 | ||
4547 | bool operator ==(SDValue OtherVec) { return Vec == OtherVec; } | |
4548 | ShuffleSourceInfo(SDValue Vec) | |
4549 | : Vec(Vec), MinElt(UINT_MAX), MaxElt(0), ShuffleVec(Vec), WindowBase(0), | |
4550 | WindowScale(1) {} | |
4551 | }; | |
4552 | ||
4553 | // First gather all vectors used as an immediate source for this BUILD_VECTOR | |
4554 | // node. | |
4555 | SmallVector<ShuffleSourceInfo, 2> Sources; | |
4556 | for (unsigned i = 0; i < NumElts; ++i) { | |
4557 | SDValue V = Op.getOperand(i); | |
4558 | if (V.getOpcode() == ISD::UNDEF) | |
4559 | continue; | |
4560 | else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) { | |
4561 | // A shuffle can only come from building a vector from various | |
4562 | // elements of other vectors. | |
4563 | return SDValue(); | |
4564 | } | |
4565 | ||
4566 | // Add this element source to the list if it's not already there. | |
4567 | SDValue SourceVec = V.getOperand(0); | |
4568 | auto Source = std::find(Sources.begin(), Sources.end(), SourceVec); | |
4569 | if (Source == Sources.end()) | |
85aaf69f | 4570 | Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec)); |
1a4d82fc JJ |
4571 | |
4572 | // Update the minimum and maximum lane number seen. | |
4573 | unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue(); | |
4574 | Source->MinElt = std::min(Source->MinElt, EltNo); | |
4575 | Source->MaxElt = std::max(Source->MaxElt, EltNo); | |
4576 | } | |
4577 | ||
4578 | // Currently only do something sane when at most two source vectors | |
4579 | // are involved. | |
4580 | if (Sources.size() > 2) | |
4581 | return SDValue(); | |
4582 | ||
4583 | // Find out the smallest element size among the result and the two sources, | |
4584 | // and use it as the element size to build the shuffle_vector. | |
4585 | EVT SmallestEltTy = VT.getVectorElementType(); | |
4586 | for (auto &Source : Sources) { | |
4587 | EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType(); | |
4588 | if (SrcEltTy.bitsLT(SmallestEltTy)) { | |
4589 | SmallestEltTy = SrcEltTy; | |
4590 | } | |
4591 | } | |
4592 | unsigned ResMultiplier = | |
4593 | VT.getVectorElementType().getSizeInBits() / SmallestEltTy.getSizeInBits(); | |
4594 | NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits(); | |
4595 | EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts); | |
4596 | ||
4597 | // If the source vector is too wide or too narrow, we may nevertheless be able | |
4598 | // to construct a compatible shuffle either by concatenating it with UNDEF or | |
4599 | // extracting a suitable range of elements. | |
4600 | for (auto &Src : Sources) { | |
4601 | EVT SrcVT = Src.ShuffleVec.getValueType(); | |
4602 | ||
4603 | if (SrcVT.getSizeInBits() == VT.getSizeInBits()) | |
4604 | continue; | |
4605 | ||
4606 | // This stage of the search produces a source with the same element type as | |
4607 | // the original, but with a total width matching the BUILD_VECTOR output. | |
4608 | EVT EltVT = SrcVT.getVectorElementType(); | |
85aaf69f SL |
4609 | unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits(); |
4610 | EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts); | |
1a4d82fc JJ |
4611 | |
4612 | if (SrcVT.getSizeInBits() < VT.getSizeInBits()) { | |
4613 | assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits()); | |
4614 | // We can pad out the smaller vector for free, so if it's part of a | |
4615 | // shuffle... | |
4616 | Src.ShuffleVec = | |
4617 | DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec, | |
4618 | DAG.getUNDEF(Src.ShuffleVec.getValueType())); | |
4619 | continue; | |
4620 | } | |
4621 | ||
4622 | assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits()); | |
4623 | ||
85aaf69f | 4624 | if (Src.MaxElt - Src.MinElt >= NumSrcElts) { |
1a4d82fc JJ |
4625 | // Span is too large for a VEXT to cope with. |
4626 | return SDValue(); | |
4627 | } | |
4628 | ||
85aaf69f | 4629 | if (Src.MinElt >= NumSrcElts) { |
1a4d82fc JJ |
4630 | // The extraction can just take the second half |
4631 | Src.ShuffleVec = | |
4632 | DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, | |
85aaf69f SL |
4633 | DAG.getConstant(NumSrcElts, MVT::i64)); |
4634 | Src.WindowBase = -NumSrcElts; | |
4635 | } else if (Src.MaxElt < NumSrcElts) { | |
1a4d82fc | 4636 | // The extraction can just take the first half |
85aaf69f SL |
4637 | Src.ShuffleVec = |
4638 | DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, | |
4639 | DAG.getConstant(0, MVT::i64)); | |
1a4d82fc JJ |
4640 | } else { |
4641 | // An actual VEXT is needed | |
85aaf69f SL |
4642 | SDValue VEXTSrc1 = |
4643 | DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, | |
4644 | DAG.getConstant(0, MVT::i64)); | |
1a4d82fc JJ |
4645 | SDValue VEXTSrc2 = |
4646 | DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec, | |
85aaf69f | 4647 | DAG.getConstant(NumSrcElts, MVT::i64)); |
1a4d82fc JJ |
4648 | unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1); |
4649 | ||
4650 | Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1, | |
4651 | VEXTSrc2, DAG.getConstant(Imm, MVT::i32)); | |
4652 | Src.WindowBase = -Src.MinElt; | |
4653 | } | |
4654 | } | |
4655 | ||
4656 | // Another possible incompatibility occurs from the vector element types. We | |
4657 | // can fix this by bitcasting the source vectors to the same type we intend | |
4658 | // for the shuffle. | |
4659 | for (auto &Src : Sources) { | |
4660 | EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType(); | |
4661 | if (SrcEltTy == SmallestEltTy) | |
4662 | continue; | |
4663 | assert(ShuffleVT.getVectorElementType() == SmallestEltTy); | |
4664 | Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec); | |
4665 | Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits(); | |
4666 | Src.WindowBase *= Src.WindowScale; | |
4667 | } | |
4668 | ||
4669 | // Final sanity check before we try to actually produce a shuffle. | |
4670 | DEBUG( | |
4671 | for (auto Src : Sources) | |
4672 | assert(Src.ShuffleVec.getValueType() == ShuffleVT); | |
4673 | ); | |
4674 | ||
4675 | // The stars all align; our next step is to produce the mask for the shuffle. | |
4676 | SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1); | |
4677 | int BitsPerShuffleLane = ShuffleVT.getVectorElementType().getSizeInBits(); | |
4678 | for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) { | |
4679 | SDValue Entry = Op.getOperand(i); | |
4680 | if (Entry.getOpcode() == ISD::UNDEF) | |
4681 | continue; | |
4682 | ||
4683 | auto Src = std::find(Sources.begin(), Sources.end(), Entry.getOperand(0)); | |
4684 | int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue(); | |
4685 | ||
4686 | // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit | |
4687 | // trunc. So only std::min(SrcBits, DestBits) bits actually get defined in this | |
4688 | // segment. | |
4689 | EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType(); | |
4690 | int BitsDefined = std::min(OrigEltTy.getSizeInBits(), | |
4691 | VT.getVectorElementType().getSizeInBits()); | |
4692 | int LanesDefined = BitsDefined / BitsPerShuffleLane; | |
4693 | ||
4694 | // This source is expected to fill ResMultiplier lanes of the final shuffle, | |
4695 | // starting at the appropriate offset. | |
4696 | int *LaneMask = &Mask[i * ResMultiplier]; | |
4697 | ||
4698 | int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase; | |
4699 | ExtractBase += NumElts * (Src - Sources.begin()); | |
4700 | for (int j = 0; j < LanesDefined; ++j) | |
4701 | LaneMask[j] = ExtractBase + j; | |
4702 | } | |
4703 | ||
4704 | // Final check before we try to produce nonsense... | |
4705 | if (!isShuffleMaskLegal(Mask, ShuffleVT)) | |
4706 | return SDValue(); | |
4707 | ||
4708 | SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) }; | |
4709 | for (unsigned i = 0; i < Sources.size(); ++i) | |
4710 | ShuffleOps[i] = Sources[i].ShuffleVec; | |
4711 | ||
4712 | SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0], | |
4713 | ShuffleOps[1], &Mask[0]); | |
4714 | return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle); | |
4715 | } | |
4716 | ||
4717 | // Check if an EXT instruction can handle the shuffle mask when the | |
4718 | // vector sources of the shuffle are the same. | |
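| // E.g. for v8i8, <1, 2, 3, 4, 5, 6, 7, 0> is a singleton EXT mask with | |
| // Imm = 1: a rotation of the single source by one byte. | |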
4719 | static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) { | |
4720 | unsigned NumElts = VT.getVectorNumElements(); | |
4721 | ||
4722 | // Assume that the first shuffle index is not UNDEF. Fail if it is. | |
4723 | if (M[0] < 0) | |
4724 | return false; | |
4725 | ||
4726 | Imm = M[0]; | |
4727 | ||
4728 | // If this is a VEXT shuffle, the immediate value is the index of the first | |
4729 | // element. The other shuffle indices must be the successive elements after | |
4730 | // the first one. | |
4731 | unsigned ExpectedElt = Imm; | |
4732 | for (unsigned i = 1; i < NumElts; ++i) { | |
4733 | // Increment the expected index. If it wraps around, just follow it | |
4734 | // back to index zero and keep going. | |
4735 | ++ExpectedElt; | |
4736 | if (ExpectedElt == NumElts) | |
4737 | ExpectedElt = 0; | |
4738 | ||
4739 | if (M[i] < 0) | |
4740 | continue; // ignore UNDEF indices | |
4741 | if (ExpectedElt != static_cast<unsigned>(M[i])) | |
4742 | return false; | |
4743 | } | |
4744 | ||
4745 | return true; | |
4746 | } | |
4747 | ||
4748 | // Check if an EXT instruction can handle the shuffle mask when the | |
4749 | // vector sources of the shuffle are different. | |
4750 | static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT, | |
4751 | unsigned &Imm) { | |
4752 | // Look for the first non-undef element. | |
4753 | const int *FirstRealElt = std::find_if(M.begin(), M.end(), | |
4754 | [](int Elt) {return Elt >= 0;}); | |
4755 | ||
4756 | // Benefit from APInt to handle overflow when calculating the expected element. | |
4757 | unsigned NumElts = VT.getVectorNumElements(); | |
4758 | unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); | |
4759 | APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); | |
4760 | // The following shuffle indices must be the successive elements after the | |
4761 | // first real element. | |
4762 | const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(), | |
4763 | [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;}); | |
4764 | if (FirstWrongElt != M.end()) | |
4765 | return false; | |
4766 | ||
4767 | // The index of an EXT is the first element if it is not UNDEF. | |
4768 | // Watch out for the beginning UNDEFs. The EXT index should be the expected | |
4769 | // value of the first element. E.g. | |
4770 | // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>. | |
4771 | // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>. | |
4772 | // ExpectedElt is the last mask index plus 1. | |
4773 | Imm = ExpectedElt.getZExtValue(); | |
4774 | ||
4775 | // There are two different cases requiring us to reverse the input vectors. | |
4776 | // For example, for vector <4 x i32> we have the following cases, | |
4777 | // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>) | |
4778 | // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>) | |
4779 | // For both cases, we finally use mask <5, 6, 7, 0>, which requires | |
4780 | // to reverse two input vectors. | |
4781 | if (Imm < NumElts) | |
4782 | ReverseEXT = true; | |
4783 | else | |
4784 | Imm -= NumElts; | |
4785 | ||
4786 | return true; | |
4787 | } | |
4788 | ||
4789 | /// isREVMask - Check if a vector shuffle corresponds to a REV | |
4790 | /// instruction with the specified blocksize. (The order of the elements | |
4791 | /// within each block of the vector is reversed.) | |
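| /// E.g. a REV32 of v8i16 (BlockSize 32, 16-bit elements) corresponds to | |
| /// the mask <1, 0, 3, 2, 5, 4, 7, 6>. | |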
4792 | static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) { | |
4793 | assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) && | |
4794 | "Only possible block sizes for REV are: 16, 32, 64"); | |
4795 | ||
4796 | unsigned EltSz = VT.getVectorElementType().getSizeInBits(); | |
4797 | if (EltSz == 64) | |
4798 | return false; | |
4799 | ||
4800 | unsigned NumElts = VT.getVectorNumElements(); | |
4801 | unsigned BlockElts = M[0] + 1; | |
4802 | // If the first shuffle index is UNDEF, be optimistic. | |
4803 | if (M[0] < 0) | |
4804 | BlockElts = BlockSize / EltSz; | |
4805 | ||
4806 | if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz) | |
4807 | return false; | |
4808 | ||
4809 | for (unsigned i = 0; i < NumElts; ++i) { | |
4810 | if (M[i] < 0) | |
4811 | continue; // ignore UNDEF indices | |
4812 | if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts)) | |
4813 | return false; | |
4814 | } | |
4815 | ||
4816 | return true; | |
4817 | } | |
4818 | ||
4819 | static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { | |
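  // A ZIP mask interleaves one half of each input. E.g. for <4 x i32>,
  // ZIP1 is <0, 4, 1, 5> (low halves) and ZIP2 is <2, 6, 3, 7> (high
  // halves); WhichResult distinguishes the two.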
4820 | unsigned NumElts = VT.getVectorNumElements(); | |
4821 | WhichResult = (M[0] == 0 ? 0 : 1); | |
4822 | unsigned Idx = WhichResult * NumElts / 2; | |
4823 | for (unsigned i = 0; i != NumElts; i += 2) { | |
4824 | if ((M[i] >= 0 && (unsigned)M[i] != Idx) || | |
4825 | (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts)) | |
4826 | return false; | |
4827 | Idx += 1; | |
4828 | } | |
4829 | ||
4830 | return true; | |
4831 | } | |
4832 | ||
4833 | static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { | |
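  // A UZP mask takes the even (UZP1) or odd (UZP2) elements of the
  // concatenated inputs. E.g. for <4 x i32>, UZP1 is <0, 2, 4, 6> and
  // UZP2 is <1, 3, 5, 7>.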
4834 | unsigned NumElts = VT.getVectorNumElements(); | |
4835 | WhichResult = (M[0] == 0 ? 0 : 1); | |
4836 | for (unsigned i = 0; i != NumElts; ++i) { | |
4837 | if (M[i] < 0) | |
4838 | continue; // ignore UNDEF indices | |
4839 | if ((unsigned)M[i] != 2 * i + WhichResult) | |
4840 | return false; | |
4841 | } | |
4842 | ||
4843 | return true; | |
4844 | } | |
4845 | ||
4846 | static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { | |
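  // A TRN mask transposes 2x2 blocks of the two inputs. E.g. for <4 x i32>,
  // TRN1 is <0, 4, 2, 6> and TRN2 is <1, 5, 3, 7>.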
4847 | unsigned NumElts = VT.getVectorNumElements(); | |
4848 | WhichResult = (M[0] == 0 ? 0 : 1); | |
4849 | for (unsigned i = 0; i < NumElts; i += 2) { | |
4850 | if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || | |
4851 | (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult)) | |
4852 | return false; | |
4853 | } | |
4854 | return true; | |
4855 | } | |
4856 | ||
4857 | /// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of | |
4858 | /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". | |
4859 | /// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>. | |
4860 | static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { | |
4861 | unsigned NumElts = VT.getVectorNumElements(); | |
4862 | WhichResult = (M[0] == 0 ? 0 : 1); | |
4863 | unsigned Idx = WhichResult * NumElts / 2; | |
4864 | for (unsigned i = 0; i != NumElts; i += 2) { | |
4865 | if ((M[i] >= 0 && (unsigned)M[i] != Idx) || | |
4866 | (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx)) | |
4867 | return false; | |
4868 | Idx += 1; | |
4869 | } | |
4870 | ||
4871 | return true; | |
4872 | } | |
4873 | ||
4874 | /// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of | |
4875 | /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". | |
/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
4877 | static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { | |
4878 | unsigned Half = VT.getVectorNumElements() / 2; | |
4879 | WhichResult = (M[0] == 0 ? 0 : 1); | |
4880 | for (unsigned j = 0; j != 2; ++j) { | |
4881 | unsigned Idx = WhichResult; | |
4882 | for (unsigned i = 0; i != Half; ++i) { | |
4883 | int MIdx = M[i + j * Half]; | |
4884 | if (MIdx >= 0 && (unsigned)MIdx != Idx) | |
4885 | return false; | |
4886 | Idx += 2; | |
4887 | } | |
4888 | } | |
4889 | ||
4890 | return true; | |
4891 | } | |
4892 | ||
4893 | /// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of | |
4894 | /// "vector_shuffle v, v", i.e., "vector_shuffle v, undef". | |
4895 | /// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>. | |
4896 | static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) { | |
4897 | unsigned NumElts = VT.getVectorNumElements(); | |
4898 | WhichResult = (M[0] == 0 ? 0 : 1); | |
4899 | for (unsigned i = 0; i < NumElts; i += 2) { | |
4900 | if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) || | |
4901 | (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult)) | |
4902 | return false; | |
4903 | } | |
4904 | return true; | |
4905 | } | |
4906 | ||
4907 | static bool isINSMask(ArrayRef<int> M, int NumInputElements, | |
4908 | bool &DstIsLeft, int &Anomaly) { | |
4909 | if (M.size() != static_cast<size_t>(NumInputElements)) | |
4910 | return false; | |
4911 | ||
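  // An INS mask copies one input unchanged except for a single "anomalous"
  // lane sourced from either vector. E.g. for <4 x i32>, mask <0, 1, 6, 3>
  // yields DstIsLeft == true and Anomaly == 2: lane 2 of V2 is inserted
  // into lane 2 of V1.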
4912 | int NumLHSMatch = 0, NumRHSMatch = 0; | |
4913 | int LastLHSMismatch = -1, LastRHSMismatch = -1; | |
4914 | ||
4915 | for (int i = 0; i < NumInputElements; ++i) { | |
4916 | if (M[i] == -1) { | |
4917 | ++NumLHSMatch; | |
4918 | ++NumRHSMatch; | |
4919 | continue; | |
4920 | } | |
4921 | ||
4922 | if (M[i] == i) | |
4923 | ++NumLHSMatch; | |
4924 | else | |
4925 | LastLHSMismatch = i; | |
4926 | ||
4927 | if (M[i] == i + NumInputElements) | |
4928 | ++NumRHSMatch; | |
4929 | else | |
4930 | LastRHSMismatch = i; | |
4931 | } | |
4932 | ||
4933 | if (NumLHSMatch == NumInputElements - 1) { | |
4934 | DstIsLeft = true; | |
4935 | Anomaly = LastLHSMismatch; | |
4936 | return true; | |
4937 | } else if (NumRHSMatch == NumInputElements - 1) { | |
4938 | DstIsLeft = false; | |
4939 | Anomaly = LastRHSMismatch; | |
4940 | return true; | |
4941 | } | |
4942 | ||
4943 | return false; | |
4944 | } | |
4945 | ||
4946 | static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) { | |
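  // E.g. for a <4 x i32> result, mask <0, 1, 4, 5> selects the low half of
  // each 128-bit input (SplitLHS == true), which is exactly a
  // CONCAT_VECTORS of the two extracted halves.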
4947 | if (VT.getSizeInBits() != 128) | |
4948 | return false; | |
4949 | ||
4950 | unsigned NumElts = VT.getVectorNumElements(); | |
4951 | ||
4952 | for (int I = 0, E = NumElts / 2; I != E; I++) { | |
4953 | if (Mask[I] != I) | |
4954 | return false; | |
4955 | } | |
4956 | ||
4957 | int Offset = NumElts / 2; | |
4958 | for (int I = NumElts / 2, E = NumElts; I != E; I++) { | |
4959 | if (Mask[I] != I + SplitLHS * Offset) | |
4960 | return false; | |
4961 | } | |
4962 | ||
4963 | return true; | |
4964 | } | |
4965 | ||
4966 | static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) { | |
4967 | SDLoc DL(Op); | |
4968 | EVT VT = Op.getValueType(); | |
4969 | SDValue V0 = Op.getOperand(0); | |
4970 | SDValue V1 = Op.getOperand(1); | |
4971 | ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask(); | |
4972 | ||
4973 | if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() || | |
4974 | VT.getVectorElementType() != V1.getValueType().getVectorElementType()) | |
4975 | return SDValue(); | |
4976 | ||
4977 | bool SplitV0 = V0.getValueType().getSizeInBits() == 128; | |
4978 | ||
4979 | if (!isConcatMask(Mask, VT, SplitV0)) | |
4980 | return SDValue(); | |
4981 | ||
4982 | EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), | |
4983 | VT.getVectorNumElements() / 2); | |
4984 | if (SplitV0) { | |
4985 | V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0, | |
4986 | DAG.getConstant(0, MVT::i64)); | |
4987 | } | |
4988 | if (V1.getValueType().getSizeInBits() == 128) { | |
4989 | V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1, | |
4990 | DAG.getConstant(0, MVT::i64)); | |
4991 | } | |
4992 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1); | |
4993 | } | |
4994 | ||
4995 | /// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit | |
4996 | /// the specified operations to build the shuffle. | |
4997 | static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS, | |
4998 | SDValue RHS, SelectionDAG &DAG, | |
4999 | SDLoc dl) { | |
5000 | unsigned OpNum = (PFEntry >> 26) & 0x0F; | |
5001 | unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1); | |
5002 | unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1); | |
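  // PFEntry layout: bits 31-30 hold the cost, 29-26 the opcode, 25-13 the
  // LHSID and 12-0 the RHSID. Each 13-bit ID packs four mask indices as
  // base-9 digits (digit 8 meaning undef), so e.g. (1*9+2)*9+3 == 102
  // encodes the identity mask <0, 1, 2, 3> tested in OP_COPY below.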
5003 | ||
5004 | enum { | |
5005 | OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3> | |
5006 | OP_VREV, | |
5007 | OP_VDUP0, | |
5008 | OP_VDUP1, | |
5009 | OP_VDUP2, | |
5010 | OP_VDUP3, | |
5011 | OP_VEXT1, | |
5012 | OP_VEXT2, | |
5013 | OP_VEXT3, | |
5014 | OP_VUZPL, // VUZP, left result | |
5015 | OP_VUZPR, // VUZP, right result | |
5016 | OP_VZIPL, // VZIP, left result | |
5017 | OP_VZIPR, // VZIP, right result | |
5018 | OP_VTRNL, // VTRN, left result | |
5019 | OP_VTRNR // VTRN, right result | |
5020 | }; | |
5021 | ||
5022 | if (OpNum == OP_COPY) { | |
5023 | if (LHSID == (1 * 9 + 2) * 9 + 3) | |
5024 | return LHS; | |
5025 | assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!"); | |
5026 | return RHS; | |
5027 | } | |
5028 | ||
5029 | SDValue OpLHS, OpRHS; | |
5030 | OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl); | |
5031 | OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl); | |
5032 | EVT VT = OpLHS.getValueType(); | |
5033 | ||
5034 | switch (OpNum) { | |
5035 | default: | |
5036 | llvm_unreachable("Unknown shuffle opcode!"); | |
5037 | case OP_VREV: | |
5038 | // VREV divides the vector in half and swaps within the half. | |
5039 | if (VT.getVectorElementType() == MVT::i32 || | |
5040 | VT.getVectorElementType() == MVT::f32) | |
5041 | return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS); | |
5042 | // vrev <4 x i16> -> REV32 | |
5043 | if (VT.getVectorElementType() == MVT::i16 || | |
5044 | VT.getVectorElementType() == MVT::f16) | |
5045 | return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS); | |
5046 | // vrev <4 x i8> -> REV16 | |
5047 | assert(VT.getVectorElementType() == MVT::i8); | |
5048 | return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS); | |
5049 | case OP_VDUP0: | |
5050 | case OP_VDUP1: | |
5051 | case OP_VDUP2: | |
5052 | case OP_VDUP3: { | |
5053 | EVT EltTy = VT.getVectorElementType(); | |
5054 | unsigned Opcode; | |
5055 | if (EltTy == MVT::i8) | |
5056 | Opcode = AArch64ISD::DUPLANE8; | |
5057 | else if (EltTy == MVT::i16) | |
5058 | Opcode = AArch64ISD::DUPLANE16; | |
5059 | else if (EltTy == MVT::i32 || EltTy == MVT::f32) | |
5060 | Opcode = AArch64ISD::DUPLANE32; | |
5061 | else if (EltTy == MVT::i64 || EltTy == MVT::f64) | |
5062 | Opcode = AArch64ISD::DUPLANE64; | |
5063 | else | |
5064 | llvm_unreachable("Invalid vector element type?"); | |
5065 | ||
5066 | if (VT.getSizeInBits() == 64) | |
5067 | OpLHS = WidenVector(OpLHS, DAG); | |
5068 | SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64); | |
5069 | return DAG.getNode(Opcode, dl, VT, OpLHS, Lane); | |
5070 | } | |
5071 | case OP_VEXT1: | |
5072 | case OP_VEXT2: | |
5073 | case OP_VEXT3: { | |
5074 | unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS); | |
5075 | return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS, | |
5076 | DAG.getConstant(Imm, MVT::i32)); | |
5077 | } | |
5078 | case OP_VUZPL: | |
5079 | return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, | |
5080 | OpRHS); | |
5081 | case OP_VUZPR: | |
5082 | return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, | |
5083 | OpRHS); | |
5084 | case OP_VZIPL: | |
5085 | return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, | |
5086 | OpRHS); | |
5087 | case OP_VZIPR: | |
5088 | return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, | |
5089 | OpRHS); | |
5090 | case OP_VTRNL: | |
5091 | return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, | |
5092 | OpRHS); | |
5093 | case OP_VTRNR: | |
5094 | return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, | |
5095 | OpRHS); | |
5096 | } | |
5097 | } | |
5098 | ||
5099 | static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask, | |
5100 | SelectionDAG &DAG) { | |
5101 | // Check to see if we can use the TBL instruction. | |
5102 | SDValue V1 = Op.getOperand(0); | |
5103 | SDValue V2 = Op.getOperand(1); | |
5104 | SDLoc DL(Op); | |
5105 | ||
5106 | EVT EltVT = Op.getValueType().getVectorElementType(); | |
5107 | unsigned BytesPerElt = EltVT.getSizeInBits() / 8; | |
5108 | ||
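  // TBL indexes individual bytes, so each shuffle index is expanded into
  // BytesPerElt consecutive byte offsets. E.g. a v4i16 shuffle index of 5
  // becomes the byte indices <10, 11> in TBLMask.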
5109 | SmallVector<SDValue, 8> TBLMask; | |
5110 | for (int Val : ShuffleMask) { | |
5111 | for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) { | |
5112 | unsigned Offset = Byte + Val * BytesPerElt; | |
5113 | TBLMask.push_back(DAG.getConstant(Offset, MVT::i32)); | |
5114 | } | |
5115 | } | |
5116 | ||
5117 | MVT IndexVT = MVT::v8i8; | |
5118 | unsigned IndexLen = 8; | |
5119 | if (Op.getValueType().getSizeInBits() == 128) { | |
5120 | IndexVT = MVT::v16i8; | |
5121 | IndexLen = 16; | |
5122 | } | |
5123 | ||
5124 | SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1); | |
5125 | SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2); | |
5126 | ||
5127 | SDValue Shuffle; | |
5128 | if (V2.getNode()->getOpcode() == ISD::UNDEF) { | |
5129 | if (IndexLen == 8) | |
5130 | V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst); | |
5131 | Shuffle = DAG.getNode( | |
5132 | ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, | |
5133 | DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, | |
5134 | DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, | |
5135 | makeArrayRef(TBLMask.data(), IndexLen))); | |
5136 | } else { | |
5137 | if (IndexLen == 8) { | |
5138 | V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst); | |
5139 | Shuffle = DAG.getNode( | |
5140 | ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, | |
5141 | DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst, | |
5142 | DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, | |
5143 | makeArrayRef(TBLMask.data(), IndexLen))); | |
5144 | } else { | |
5145 | // FIXME: We cannot, for the moment, emit a TBL2 instruction because we | |
5146 | // cannot currently represent the register constraints on the input | |
5147 | // table registers. | |
5148 | // Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst, | |
5149 | // DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, | |
5150 | // &TBLMask[0], IndexLen)); | |
5151 | Shuffle = DAG.getNode( | |
5152 | ISD::INTRINSIC_WO_CHAIN, DL, IndexVT, | |
5153 | DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst, | |
5154 | DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, | |
5155 | makeArrayRef(TBLMask.data(), IndexLen))); | |
5156 | } | |
5157 | } | |
5158 | return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle); | |
5159 | } | |
5160 | ||
5161 | static unsigned getDUPLANEOp(EVT EltType) { | |
5162 | if (EltType == MVT::i8) | |
5163 | return AArch64ISD::DUPLANE8; | |
5164 | if (EltType == MVT::i16 || EltType == MVT::f16) | |
5165 | return AArch64ISD::DUPLANE16; | |
5166 | if (EltType == MVT::i32 || EltType == MVT::f32) | |
5167 | return AArch64ISD::DUPLANE32; | |
5168 | if (EltType == MVT::i64 || EltType == MVT::f64) | |
5169 | return AArch64ISD::DUPLANE64; | |
5170 | ||
5171 | llvm_unreachable("Invalid vector element type?"); | |
5172 | } | |
5173 | ||
5174 | SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, | |
5175 | SelectionDAG &DAG) const { | |
5176 | SDLoc dl(Op); | |
5177 | EVT VT = Op.getValueType(); | |
5178 | ||
5179 | ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); | |
5180 | ||
5181 | // Convert shuffles that are directly supported on NEON to target-specific | |
5182 | // DAG nodes, instead of keeping them as shuffles and matching them again | |
5183 | // during code selection. This is more efficient and avoids the possibility | |
5184 | // of inconsistencies between legalization and selection. | |
5185 | ArrayRef<int> ShuffleMask = SVN->getMask(); | |
5186 | ||
5187 | SDValue V1 = Op.getOperand(0); | |
5188 | SDValue V2 = Op.getOperand(1); | |
5189 | ||
5190 | if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], | |
5191 | V1.getValueType().getSimpleVT())) { | |
5192 | int Lane = SVN->getSplatIndex(); | |
    // If this is an undef splat, generate it via "just" vdup, if possible.
5194 | if (Lane == -1) | |
5195 | Lane = 0; | |
5196 | ||
5197 | if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) | |
5198 | return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(), | |
5199 | V1.getOperand(0)); | |
5200 | // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non- | |
5201 | // constant. If so, we can just reference the lane's definition directly. | |
5202 | if (V1.getOpcode() == ISD::BUILD_VECTOR && | |
5203 | !isa<ConstantSDNode>(V1.getOperand(Lane))) | |
5204 | return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane)); | |
5205 | ||
5206 | // Otherwise, duplicate from the lane of the input vector. | |
5207 | unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); | |
5208 | ||
    // SelectionDAGBuilder may have "helpfully" already extracted or
    // concatenated to make a vector of the same size as this SHUFFLE. We can
    // ignore the extract entirely, and canonicalise the concat using
    // WidenVector.
5212 | if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { | |
5213 | Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue(); | |
5214 | V1 = V1.getOperand(0); | |
5215 | } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { | |
5216 | unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; | |
5217 | Lane -= Idx * VT.getVectorNumElements() / 2; | |
5218 | V1 = WidenVector(V1.getOperand(Idx), DAG); | |
5219 | } else if (VT.getSizeInBits() == 64) | |
5220 | V1 = WidenVector(V1, DAG); | |
5221 | ||
5222 | return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64)); | |
5223 | } | |
5224 | ||
5225 | if (isREVMask(ShuffleMask, VT, 64)) | |
5226 | return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2); | |
5227 | if (isREVMask(ShuffleMask, VT, 32)) | |
5228 | return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2); | |
5229 | if (isREVMask(ShuffleMask, VT, 16)) | |
5230 | return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2); | |
5231 | ||
5232 | bool ReverseEXT = false; | |
5233 | unsigned Imm; | |
5234 | if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) { | |
5235 | if (ReverseEXT) | |
5236 | std::swap(V1, V2); | |
5237 | Imm *= getExtFactor(V1); | |
5238 | return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2, | |
5239 | DAG.getConstant(Imm, MVT::i32)); | |
5240 | } else if (V2->getOpcode() == ISD::UNDEF && | |
5241 | isSingletonEXTMask(ShuffleMask, VT, Imm)) { | |
5242 | Imm *= getExtFactor(V1); | |
5243 | return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1, | |
5244 | DAG.getConstant(Imm, MVT::i32)); | |
5245 | } | |
5246 | ||
5247 | unsigned WhichResult; | |
5248 | if (isZIPMask(ShuffleMask, VT, WhichResult)) { | |
5249 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; | |
5250 | return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); | |
5251 | } | |
5252 | if (isUZPMask(ShuffleMask, VT, WhichResult)) { | |
5253 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; | |
5254 | return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); | |
5255 | } | |
5256 | if (isTRNMask(ShuffleMask, VT, WhichResult)) { | |
5257 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; | |
5258 | return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2); | |
5259 | } | |
5260 | ||
5261 | if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { | |
5262 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2; | |
5263 | return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); | |
5264 | } | |
5265 | if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) { | |
5266 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2; | |
5267 | return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); | |
5268 | } | |
5269 | if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) { | |
5270 | unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2; | |
5271 | return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1); | |
5272 | } | |
5273 | ||
5274 | SDValue Concat = tryFormConcatFromShuffle(Op, DAG); | |
5275 | if (Concat.getNode()) | |
5276 | return Concat; | |
5277 | ||
5278 | bool DstIsLeft; | |
5279 | int Anomaly; | |
5280 | int NumInputElements = V1.getValueType().getVectorNumElements(); | |
5281 | if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) { | |
5282 | SDValue DstVec = DstIsLeft ? V1 : V2; | |
5283 | SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64); | |
5284 | ||
5285 | SDValue SrcVec = V1; | |
5286 | int SrcLane = ShuffleMask[Anomaly]; | |
5287 | if (SrcLane >= NumInputElements) { | |
5288 | SrcVec = V2; | |
5289 | SrcLane -= VT.getVectorNumElements(); | |
5290 | } | |
5291 | SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64); | |
5292 | ||
5293 | EVT ScalarVT = VT.getVectorElementType(); | |
5294 | ||
5295 | if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger()) | |
5296 | ScalarVT = MVT::i32; | |
5297 | ||
5298 | return DAG.getNode( | |
5299 | ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, | |
5300 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV), | |
5301 | DstLaneV); | |
5302 | } | |
5303 | ||
5304 | // If the shuffle is not directly supported and it has 4 elements, use | |
5305 | // the PerfectShuffle-generated table to synthesize it from other shuffles. | |
5306 | unsigned NumElts = VT.getVectorNumElements(); | |
5307 | if (NumElts == 4) { | |
5308 | unsigned PFIndexes[4]; | |
5309 | for (unsigned i = 0; i != 4; ++i) { | |
5310 | if (ShuffleMask[i] < 0) | |
5311 | PFIndexes[i] = 8; | |
5312 | else | |
5313 | PFIndexes[i] = ShuffleMask[i]; | |
5314 | } | |
5315 | ||
5316 | // Compute the index in the perfect shuffle table. | |
5317 | unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + | |
5318 | PFIndexes[2] * 9 + PFIndexes[3]; | |
5319 | unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; | |
5320 | unsigned Cost = (PFEntry >> 30); | |
5321 | ||
5322 | if (Cost <= 4) | |
5323 | return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl); | |
5324 | } | |
5325 | ||
5326 | return GenerateTBL(Op, ShuffleMask, DAG); | |
5327 | } | |
5328 | ||
5329 | static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, | |
5330 | APInt &UndefBits) { | |
5331 | EVT VT = BVN->getValueType(0); | |
5332 | APInt SplatBits, SplatUndef; | |
5333 | unsigned SplatBitSize; | |
5334 | bool HasAnyUndefs; | |
5335 | if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { | |
5336 | unsigned NumSplats = VT.getSizeInBits() / SplatBitSize; | |
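    // E.g. a v4i32 splat of 0x000000ff has SplatBitSize == 32 and
    // NumSplats == 4: the loop below replicates the 32-bit pattern into all
    // 128 bits of CnstBits, and likewise accumulates the undef pattern into
    // UndefBits for the callers' "undef toggled" retry.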
5337 | ||
5338 | for (unsigned i = 0; i < NumSplats; ++i) { | |
5339 | CnstBits <<= SplatBitSize; | |
5340 | UndefBits <<= SplatBitSize; | |
5341 | CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits()); | |
5342 | UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits()); | |
5343 | } | |
5344 | ||
5345 | return true; | |
5346 | } | |
5347 | ||
5348 | return false; | |
5349 | } | |
5350 | ||
5351 | SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op, | |
5352 | SelectionDAG &DAG) const { | |
5353 | BuildVectorSDNode *BVN = | |
5354 | dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); | |
5355 | SDValue LHS = Op.getOperand(0); | |
5356 | SDLoc dl(Op); | |
5357 | EVT VT = Op.getValueType(); | |
5358 | ||
5359 | if (!BVN) | |
5360 | return Op; | |
5361 | ||
5362 | APInt CnstBits(VT.getSizeInBits(), 0); | |
5363 | APInt UndefBits(VT.getSizeInBits(), 0); | |
5364 | if (resolveBuildVector(BVN, CnstBits, UndefBits)) { | |
5365 | // We only have BIC vector immediate instruction, which is and-not. | |
5366 | CnstBits = ~CnstBits; | |
5367 | ||
5368 | // We make use of a little bit of goto ickiness in order to avoid having to | |
5369 | // duplicate the immediate matching logic for the undef toggled case. | |
5370 | bool SecondTry = false; | |
5371 | AttemptModImm: | |
5372 | ||
5373 | if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { | |
5374 | CnstBits = CnstBits.zextOrTrunc(64); | |
5375 | uint64_t CnstVal = CnstBits.getZExtValue(); | |
5376 | ||
5377 | if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { | |
5378 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); | |
5379 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5380 | SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, | |
5381 | DAG.getConstant(CnstVal, MVT::i32), | |
5382 | DAG.getConstant(0, MVT::i32)); | |
5383 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5384 | } | |
5385 | ||
5386 | if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { | |
5387 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); | |
5388 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5389 | SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, | |
5390 | DAG.getConstant(CnstVal, MVT::i32), | |
5391 | DAG.getConstant(8, MVT::i32)); | |
5392 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5393 | } | |
5394 | ||
5395 | if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { | |
5396 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); | |
5397 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5398 | SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, | |
5399 | DAG.getConstant(CnstVal, MVT::i32), | |
5400 | DAG.getConstant(16, MVT::i32)); | |
5401 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5402 | } | |
5403 | ||
5404 | if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { | |
5405 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); | |
5406 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5407 | SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, | |
5408 | DAG.getConstant(CnstVal, MVT::i32), | |
5409 | DAG.getConstant(24, MVT::i32)); | |
5410 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5411 | } | |
5412 | ||
5413 | if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { | |
5414 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); | |
5415 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; | |
5416 | SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, | |
5417 | DAG.getConstant(CnstVal, MVT::i32), | |
5418 | DAG.getConstant(0, MVT::i32)); | |
5419 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5420 | } | |
5421 | ||
5422 | if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { | |
5423 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); | |
5424 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; | |
5425 | SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS, | |
5426 | DAG.getConstant(CnstVal, MVT::i32), | |
5427 | DAG.getConstant(8, MVT::i32)); | |
5428 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5429 | } | |
5430 | } | |
5431 | ||
5432 | if (SecondTry) | |
5433 | goto FailedModImm; | |
5434 | SecondTry = true; | |
5435 | CnstBits = ~UndefBits; | |
5436 | goto AttemptModImm; | |
5437 | } | |
5438 | ||
5439 | // We can always fall back to a non-immediate AND. | |
5440 | FailedModImm: | |
5441 | return Op; | |
5442 | } | |
5443 | ||
// Specialized code to quickly find if PotentialBVec is a BuildVector that
// consists of only the same constant int value, returned in the reference
// argument ConstVal.
5447 | static bool isAllConstantBuildVector(const SDValue &PotentialBVec, | |
5448 | uint64_t &ConstVal) { | |
5449 | BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec); | |
5450 | if (!Bvec) | |
5451 | return false; | |
5452 | ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0)); | |
5453 | if (!FirstElt) | |
5454 | return false; | |
5455 | EVT VT = Bvec->getValueType(0); | |
5456 | unsigned NumElts = VT.getVectorNumElements(); | |
5457 | for (unsigned i = 1; i < NumElts; ++i) | |
5458 | if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt) | |
5459 | return false; | |
5460 | ConstVal = FirstElt->getZExtValue(); | |
5461 | return true; | |
5462 | } | |
5463 | ||
5464 | static unsigned getIntrinsicID(const SDNode *N) { | |
5465 | unsigned Opcode = N->getOpcode(); | |
5466 | switch (Opcode) { | |
5467 | default: | |
5468 | return Intrinsic::not_intrinsic; | |
5469 | case ISD::INTRINSIC_WO_CHAIN: { | |
5470 | unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); | |
5471 | if (IID < Intrinsic::num_intrinsics) | |
5472 | return IID; | |
5473 | return Intrinsic::not_intrinsic; | |
5474 | } | |
5475 | } | |
5476 | } | |
5477 | ||
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// lowering it to (SLI X, Y, C2), where X and Y have matching vector types,
// BvecC1 is a BUILD_VECTOR with constant element C1, C2 is a constant, and
// C1 sets exactly the element bits that the shifted Y leaves clear.
// Also, logical shift right -> sri, with the same structure.
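// E.g. for <8 x i8>: (or (and X, <0x0f, ..., 0x0f>), (AArch64ISD::VSHL Y,
// #4)) becomes (SLI X, Y, #4), since the left shift by 4 clears exactly
// the low four bits that the 0x0f mask preserves in each lane of X.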
5482 | static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { | |
5483 | EVT VT = N->getValueType(0); | |
5484 | ||
5485 | if (!VT.isVector()) | |
5486 | return SDValue(); | |
5487 | ||
5488 | SDLoc DL(N); | |
5489 | ||
5490 | // Is the first op an AND? | |
5491 | const SDValue And = N->getOperand(0); | |
5492 | if (And.getOpcode() != ISD::AND) | |
5493 | return SDValue(); | |
5494 | ||
  // Is the second op a shl or lshr?
5496 | SDValue Shift = N->getOperand(1); | |
5497 | // This will have been turned into: AArch64ISD::VSHL vector, #shift | |
5498 | // or AArch64ISD::VLSHR vector, #shift | |
5499 | unsigned ShiftOpc = Shift.getOpcode(); | |
5500 | if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR)) | |
5501 | return SDValue(); | |
5502 | bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR; | |
5503 | ||
5504 | // Is the shift amount constant? | |
5505 | ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1)); | |
5506 | if (!C2node) | |
5507 | return SDValue(); | |
5508 | ||
5509 | // Is the and mask vector all constant? | |
5510 | uint64_t C1; | |
5511 | if (!isAllConstantBuildVector(And.getOperand(1), C1)) | |
5512 | return SDValue(); | |
5513 | ||
  // Does X keep exactly the element bits that the shifted Y leaves clear?
  // SLI preserves the low C2 bits of X; SRI preserves the high C2 bits.
  uint64_t C2 = C2node->getZExtValue();
  unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
  // Reject degenerate shifts so the mask arithmetic below cannot overflow.
  if (C2 == 0 || C2 >= ElemSizeInBits)
    return SDValue();
  uint64_t ElemMask =
      (ElemSizeInBits == 64) ? ~0ULL : (1ULL << ElemSizeInBits) - 1;
  uint64_t RequiredC1 = IsShiftRight ? (~(ElemMask >> C2) & ElemMask)
                                     : ((1ULL << C2) - 1);
  if ((C1 & ElemMask) != RequiredC1)
    return SDValue();
5523 | ||
5524 | SDValue X = And.getOperand(0); | |
5525 | SDValue Y = Shift.getOperand(0); | |
5526 | ||
5527 | unsigned Intrin = | |
5528 | IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli; | |
5529 | SDValue ResultSLI = | |
5530 | DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, | |
5531 | DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1)); | |
5532 | ||
5533 | DEBUG(dbgs() << "aarch64-lower: transformed: \n"); | |
5534 | DEBUG(N->dump(&DAG)); | |
5535 | DEBUG(dbgs() << "into: \n"); | |
5536 | DEBUG(ResultSLI->dump(&DAG)); | |
5537 | ||
5538 | ++NumShiftInserts; | |
5539 | return ResultSLI; | |
5540 | } | |
5541 | ||
5542 | SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op, | |
5543 | SelectionDAG &DAG) const { | |
5544 | // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2)) | |
5545 | if (EnableAArch64SlrGeneration) { | |
5546 | SDValue Res = tryLowerToSLI(Op.getNode(), DAG); | |
5547 | if (Res.getNode()) | |
5548 | return Res; | |
5549 | } | |
5550 | ||
5551 | BuildVectorSDNode *BVN = | |
5552 | dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode()); | |
5553 | SDValue LHS = Op.getOperand(1); | |
5554 | SDLoc dl(Op); | |
5555 | EVT VT = Op.getValueType(); | |
5556 | ||
5557 | // OR commutes, so try swapping the operands. | |
5558 | if (!BVN) { | |
5559 | LHS = Op.getOperand(0); | |
5560 | BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode()); | |
5561 | } | |
5562 | if (!BVN) | |
5563 | return Op; | |
5564 | ||
5565 | APInt CnstBits(VT.getSizeInBits(), 0); | |
5566 | APInt UndefBits(VT.getSizeInBits(), 0); | |
5567 | if (resolveBuildVector(BVN, CnstBits, UndefBits)) { | |
5568 | // We make use of a little bit of goto ickiness in order to avoid having to | |
5569 | // duplicate the immediate matching logic for the undef toggled case. | |
5570 | bool SecondTry = false; | |
5571 | AttemptModImm: | |
5572 | ||
5573 | if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { | |
5574 | CnstBits = CnstBits.zextOrTrunc(64); | |
5575 | uint64_t CnstVal = CnstBits.getZExtValue(); | |
5576 | ||
5577 | if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { | |
5578 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); | |
5579 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5580 | SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, | |
5581 | DAG.getConstant(CnstVal, MVT::i32), | |
5582 | DAG.getConstant(0, MVT::i32)); | |
5583 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5584 | } | |
5585 | ||
5586 | if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { | |
5587 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); | |
5588 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5589 | SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, | |
5590 | DAG.getConstant(CnstVal, MVT::i32), | |
5591 | DAG.getConstant(8, MVT::i32)); | |
5592 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5593 | } | |
5594 | ||
5595 | if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { | |
5596 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); | |
5597 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5598 | SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, | |
5599 | DAG.getConstant(CnstVal, MVT::i32), | |
5600 | DAG.getConstant(16, MVT::i32)); | |
5601 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5602 | } | |
5603 | ||
5604 | if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { | |
5605 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); | |
5606 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5607 | SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, | |
5608 | DAG.getConstant(CnstVal, MVT::i32), | |
5609 | DAG.getConstant(24, MVT::i32)); | |
5610 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5611 | } | |
5612 | ||
5613 | if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { | |
5614 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); | |
5615 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; | |
5616 | SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, | |
5617 | DAG.getConstant(CnstVal, MVT::i32), | |
5618 | DAG.getConstant(0, MVT::i32)); | |
5619 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5620 | } | |
5621 | ||
5622 | if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { | |
5623 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); | |
5624 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; | |
5625 | SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS, | |
5626 | DAG.getConstant(CnstVal, MVT::i32), | |
5627 | DAG.getConstant(8, MVT::i32)); | |
5628 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5629 | } | |
5630 | } | |
5631 | ||
5632 | if (SecondTry) | |
5633 | goto FailedModImm; | |
5634 | SecondTry = true; | |
5635 | CnstBits = UndefBits; | |
5636 | goto AttemptModImm; | |
5637 | } | |
5638 | ||
5639 | // We can always fall back to a non-immediate OR. | |
5640 | FailedModImm: | |
5641 | return Op; | |
5642 | } | |
5643 | ||
// Normalize the operands of BUILD_VECTOR. The value of constant operands will
// be truncated to fit the element width.
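// E.g. an i8 lane holding the constant 257 is rewritten as the constant 1
// (257 truncated to 8 bits), so later splat detection sees the value the
// v16i8 BUILD_VECTOR actually produces.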
5646 | static SDValue NormalizeBuildVector(SDValue Op, | |
5647 | SelectionDAG &DAG) { | |
5648 | assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!"); | |
5649 | SDLoc dl(Op); | |
5650 | EVT VT = Op.getValueType(); | |
  EVT EltTy = VT.getVectorElementType();
5652 | ||
5653 | if (EltTy.isFloatingPoint() || EltTy.getSizeInBits() > 16) | |
5654 | return Op; | |
5655 | ||
5656 | SmallVector<SDValue, 16> Ops; | |
5657 | for (unsigned I = 0, E = VT.getVectorNumElements(); I != E; ++I) { | |
5658 | SDValue Lane = Op.getOperand(I); | |
5659 | if (Lane.getOpcode() == ISD::Constant) { | |
5660 | APInt LowBits(EltTy.getSizeInBits(), | |
5661 | cast<ConstantSDNode>(Lane)->getZExtValue()); | |
5662 | Lane = DAG.getConstant(LowBits.getZExtValue(), MVT::i32); | |
5663 | } | |
5664 | Ops.push_back(Lane); | |
5665 | } | |
5666 | return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); | |
5667 | } | |
5668 | ||
5669 | SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, | |
5670 | SelectionDAG &DAG) const { | |
5671 | SDLoc dl(Op); | |
5672 | EVT VT = Op.getValueType(); | |
5673 | Op = NormalizeBuildVector(Op, DAG); | |
5674 | BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); | |
5675 | ||
5676 | APInt CnstBits(VT.getSizeInBits(), 0); | |
5677 | APInt UndefBits(VT.getSizeInBits(), 0); | |
5678 | if (resolveBuildVector(BVN, CnstBits, UndefBits)) { | |
5679 | // We make use of a little bit of goto ickiness in order to avoid having to | |
5680 | // duplicate the immediate matching logic for the undef toggled case. | |
5681 | bool SecondTry = false; | |
5682 | AttemptModImm: | |
5683 | ||
5684 | if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) { | |
5685 | CnstBits = CnstBits.zextOrTrunc(64); | |
5686 | uint64_t CnstVal = CnstBits.getZExtValue(); | |
5687 | ||
5688 | // Certain magic vector constants (used to express things like NOT | |
5689 | // and NEG) are passed through unmodified. This allows codegen patterns | |
5690 | // for these operations to match. Special-purpose patterns will lower | |
5691 | // these immediates to MOVIs if it proves necessary. | |
5692 | if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL)) | |
5693 | return Op; | |
5694 | ||
5695 | // The many faces of MOVI... | |
5696 | if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) { | |
5697 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal); | |
5698 | if (VT.getSizeInBits() == 128) { | |
5699 | SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64, | |
5700 | DAG.getConstant(CnstVal, MVT::i32)); | |
5701 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5702 | } | |
5703 | ||
5704 | // Support the V64 version via subregister insertion. | |
5705 | SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64, | |
5706 | DAG.getConstant(CnstVal, MVT::i32)); | |
5707 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5708 | } | |
5709 | ||
5710 | if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { | |
5711 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); | |
5712 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5713 | SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, | |
5714 | DAG.getConstant(CnstVal, MVT::i32), | |
5715 | DAG.getConstant(0, MVT::i32)); | |
5716 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5717 | } | |
5718 | ||
5719 | if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { | |
5720 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); | |
5721 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5722 | SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, | |
5723 | DAG.getConstant(CnstVal, MVT::i32), | |
5724 | DAG.getConstant(8, MVT::i32)); | |
5725 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5726 | } | |
5727 | ||
5728 | if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { | |
5729 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); | |
5730 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5731 | SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, | |
5732 | DAG.getConstant(CnstVal, MVT::i32), | |
5733 | DAG.getConstant(16, MVT::i32)); | |
5734 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5735 | } | |
5736 | ||
5737 | if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { | |
5738 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); | |
5739 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5740 | SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, | |
5741 | DAG.getConstant(CnstVal, MVT::i32), | |
5742 | DAG.getConstant(24, MVT::i32)); | |
5743 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5744 | } | |
5745 | ||
5746 | if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { | |
5747 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); | |
5748 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; | |
5749 | SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, | |
5750 | DAG.getConstant(CnstVal, MVT::i32), | |
5751 | DAG.getConstant(0, MVT::i32)); | |
5752 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5753 | } | |
5754 | ||
5755 | if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { | |
5756 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); | |
5757 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; | |
5758 | SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy, | |
5759 | DAG.getConstant(CnstVal, MVT::i32), | |
5760 | DAG.getConstant(8, MVT::i32)); | |
5761 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5762 | } | |
5763 | ||
5764 | if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { | |
5765 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); | |
5766 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5767 | SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, | |
5768 | DAG.getConstant(CnstVal, MVT::i32), | |
5769 | DAG.getConstant(264, MVT::i32)); | |
5770 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5771 | } | |
5772 | ||
5773 | if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { | |
5774 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); | |
5775 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5776 | SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy, | |
5777 | DAG.getConstant(CnstVal, MVT::i32), | |
5778 | DAG.getConstant(272, MVT::i32)); | |
5779 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5780 | } | |
5781 | ||
5782 | if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) { | |
5783 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal); | |
5784 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8; | |
5785 | SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy, | |
5786 | DAG.getConstant(CnstVal, MVT::i32)); | |
5787 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5788 | } | |
5789 | ||
5790 | // The few faces of FMOV... | |
5791 | if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) { | |
5792 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal); | |
5793 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32; | |
5794 | SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy, | |
5795 | DAG.getConstant(CnstVal, MVT::i32)); | |
5796 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5797 | } | |
5798 | ||
5799 | if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) && | |
5800 | VT.getSizeInBits() == 128) { | |
5801 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal); | |
5802 | SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64, | |
5803 | DAG.getConstant(CnstVal, MVT::i32)); | |
5804 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5805 | } | |
5806 | ||
5807 | // The many faces of MVNI... | |
5808 | CnstVal = ~CnstVal; | |
5809 | if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) { | |
5810 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal); | |
5811 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5812 | SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, | |
5813 | DAG.getConstant(CnstVal, MVT::i32), | |
5814 | DAG.getConstant(0, MVT::i32)); | |
5815 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5816 | } | |
5817 | ||
5818 | if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) { | |
5819 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal); | |
5820 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5821 | SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, | |
5822 | DAG.getConstant(CnstVal, MVT::i32), | |
5823 | DAG.getConstant(8, MVT::i32)); | |
5824 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5825 | } | |
5826 | ||
5827 | if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) { | |
5828 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal); | |
5829 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5830 | SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, | |
5831 | DAG.getConstant(CnstVal, MVT::i32), | |
5832 | DAG.getConstant(16, MVT::i32)); | |
5833 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5834 | } | |
5835 | ||
5836 | if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) { | |
5837 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal); | |
5838 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5839 | SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, | |
5840 | DAG.getConstant(CnstVal, MVT::i32), | |
5841 | DAG.getConstant(24, MVT::i32)); | |
5842 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5843 | } | |
5844 | ||
5845 | if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) { | |
5846 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal); | |
5847 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; | |
5848 | SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, | |
5849 | DAG.getConstant(CnstVal, MVT::i32), | |
5850 | DAG.getConstant(0, MVT::i32)); | |
5851 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5852 | } | |
5853 | ||
5854 | if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) { | |
5855 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal); | |
5856 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16; | |
5857 | SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy, | |
5858 | DAG.getConstant(CnstVal, MVT::i32), | |
5859 | DAG.getConstant(8, MVT::i32)); | |
5860 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5861 | } | |
5862 | ||
5863 | if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) { | |
5864 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal); | |
5865 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5866 | SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, | |
5867 | DAG.getConstant(CnstVal, MVT::i32), | |
5868 | DAG.getConstant(264, MVT::i32)); | |
5869 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5870 | } | |
5871 | ||
5872 | if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) { | |
5873 | CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal); | |
5874 | MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32; | |
5875 | SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy, | |
5876 | DAG.getConstant(CnstVal, MVT::i32), | |
5877 | DAG.getConstant(272, MVT::i32)); | |
5878 | return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov); | |
5879 | } | |
5880 | } | |
5881 | ||
5882 | if (SecondTry) | |
5883 | goto FailedModImm; | |
5884 | SecondTry = true; | |
5885 | CnstBits = UndefBits; | |
5886 | goto AttemptModImm; | |
5887 | } | |
5888 | FailedModImm: | |
5889 | ||
5890 | // Scan through the operands to find some interesting properties we can | |
5891 | // exploit: | |
5892 | // 1) If only one value is used, we can use a DUP, or | |
5893 | // 2) if only the low element is not undef, we can just insert that, or | |
5894 | // 3) if only one constant value is used (w/ some non-constant lanes), | |
5895 | // we can splat the constant value into the whole vector then fill | |
5896 | // in the non-constant lanes. | |
5897 | // 4) FIXME: If different constant values are used, but we can intelligently | |
5898 | // select the values we'll be overwriting for the non-constant | |
5899 | // lanes such that we can directly materialize the vector | |
5900 | // some other way (MOVI, e.g.), we can be sneaky. | |
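  // E.g. (i32 42, i32 42, i32 X, i32 42) hits case 3: DUP the constant 42
  // into all four lanes, then INSERT_VECTOR_ELT the non-constant X into
  // lane 2.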
5901 | unsigned NumElts = VT.getVectorNumElements(); | |
5902 | bool isOnlyLowElement = true; | |
5903 | bool usesOnlyOneValue = true; | |
5904 | bool usesOnlyOneConstantValue = true; | |
5905 | bool isConstant = true; | |
5906 | unsigned NumConstantLanes = 0; | |
5907 | SDValue Value; | |
5908 | SDValue ConstantValue; | |
5909 | for (unsigned i = 0; i < NumElts; ++i) { | |
5910 | SDValue V = Op.getOperand(i); | |
5911 | if (V.getOpcode() == ISD::UNDEF) | |
5912 | continue; | |
5913 | if (i > 0) | |
5914 | isOnlyLowElement = false; | |
5915 | if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V)) | |
5916 | isConstant = false; | |
5917 | ||
5918 | if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) { | |
5919 | ++NumConstantLanes; | |
5920 | if (!ConstantValue.getNode()) | |
5921 | ConstantValue = V; | |
5922 | else if (ConstantValue != V) | |
5923 | usesOnlyOneConstantValue = false; | |
5924 | } | |
5925 | ||
5926 | if (!Value.getNode()) | |
5927 | Value = V; | |
5928 | else if (V != Value) | |
5929 | usesOnlyOneValue = false; | |
5930 | } | |
5931 | ||
5932 | if (!Value.getNode()) | |
5933 | return DAG.getUNDEF(VT); | |
5934 | ||
5935 | if (isOnlyLowElement) | |
5936 | return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value); | |
5937 | ||
5938 | // Use DUP for non-constant splats. For f32 constant splats, reduce to | |
5939 | // i32 and try again. | |
5940 | if (usesOnlyOneValue) { | |
5941 | if (!isConstant) { | |
5942 | if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |
5943 | Value.getValueType() != VT) | |
5944 | return DAG.getNode(AArch64ISD::DUP, dl, VT, Value); | |
5945 | ||
      // This is actually a DUPLANExx operation, which keeps everything in
      // vector registers.
5947 | ||
5948 | // DUPLANE works on 128-bit vectors, widen it if necessary. | |
5949 | SDValue Lane = Value.getOperand(1); | |
5950 | Value = Value.getOperand(0); | |
5951 | if (Value.getValueType().getSizeInBits() == 64) | |
5952 | Value = WidenVector(Value, DAG); | |
5953 | ||
5954 | unsigned Opcode = getDUPLANEOp(VT.getVectorElementType()); | |
5955 | return DAG.getNode(Opcode, dl, VT, Value, Lane); | |
5956 | } | |
5957 | ||
5958 | if (VT.getVectorElementType().isFloatingPoint()) { | |
5959 | SmallVector<SDValue, 8> Ops; | |
5960 | MVT NewType = | |
5961 | (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64; | |
5962 | for (unsigned i = 0; i < NumElts; ++i) | |
5963 | Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i))); | |
5964 | EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts); | |
5965 | SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops); | |
5966 | Val = LowerBUILD_VECTOR(Val, DAG); | |
5967 | if (Val.getNode()) | |
5968 | return DAG.getNode(ISD::BITCAST, dl, VT, Val); | |
5969 | } | |
5970 | } | |
5971 | ||
  // If only one constant value was used (possibly in several lanes), start by
  // splatting that value, then replace the non-constant lanes. This is better
  // than the default, which will perform a separate initialization for each
  // lane.
5976 | if (NumConstantLanes > 0 && usesOnlyOneConstantValue) { | |
5977 | SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); | |
5978 | // Now insert the non-constant lanes. | |
5979 | for (unsigned i = 0; i < NumElts; ++i) { | |
5980 | SDValue V = Op.getOperand(i); | |
5981 | SDValue LaneIdx = DAG.getConstant(i, MVT::i64); | |
5982 | if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) { | |
5983 | // Note that type legalization likely mucked about with the VT of the | |
5984 | // source operand, so we may have to convert it here before inserting. | |
5985 | Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx); | |
5986 | } | |
5987 | } | |
5988 | return Val; | |
5989 | } | |
5990 | ||
5991 | // If all elements are constants and the case above didn't get hit, fall back | |
5992 | // to the default expansion, which will generate a load from the constant | |
5993 | // pool. | |
5994 | if (isConstant) | |
5995 | return SDValue(); | |
5996 | ||
5997 | // Empirical tests suggest this is rarely worth it for vectors of length <= 2. | |
5998 | if (NumElts >= 4) { | |
5999 | SDValue shuffle = ReconstructShuffle(Op, DAG); | |
6000 | if (shuffle != SDValue()) | |
6001 | return shuffle; | |
6002 | } | |
6003 | ||
  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
  // know the default expansion would otherwise fall back on something even
  // worse. For a vector with one or two non-undef values, the default is
  // scalar_to_vector for the elements followed by a shuffle (provided the
  // shuffle is valid for the target); for everything else, it is
  // materialization element by element on the stack followed by a load.
6010 | if (!isConstant && !usesOnlyOneValue) { | |
6011 | SDValue Vec = DAG.getUNDEF(VT); | |
6012 | SDValue Op0 = Op.getOperand(0); | |
6013 | unsigned ElemSize = VT.getVectorElementType().getSizeInBits(); | |
6014 | unsigned i = 0; | |
6015 | // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to | |
6016 | // a) Avoid a RMW dependency on the full vector register, and | |
6017 | // b) Allow the register coalescer to fold away the copy if the | |
6018 | // value is already in an S or D register. | |
6019 | if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) { | |
6020 | unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub; | |
6021 | MachineSDNode *N = | |
6022 | DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0, | |
6023 | DAG.getTargetConstant(SubIdx, MVT::i32)); | |
6024 | Vec = SDValue(N, 0); | |
6025 | ++i; | |
6026 | } | |
6027 | for (; i < NumElts; ++i) { | |
6028 | SDValue V = Op.getOperand(i); | |
6029 | if (V.getOpcode() == ISD::UNDEF) | |
6030 | continue; | |
6031 | SDValue LaneIdx = DAG.getConstant(i, MVT::i64); | |
6032 | Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx); | |
6033 | } | |
6034 | return Vec; | |
6035 | } | |
6036 | ||
6037 | // Just use the default expansion. We failed to find a better alternative. | |
6038 | return SDValue(); | |
6039 | } | |
6040 | ||
6041 | SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, | |
6042 | SelectionDAG &DAG) const { | |
6043 | assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!"); | |
6044 | ||
6045 | // Check for non-constant or out of range lane. | |
6046 | EVT VT = Op.getOperand(0).getValueType(); | |
6047 | ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2)); | |
6048 | if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) | |
6049 | return SDValue(); | |
6050 | ||
6052 | // Insertion/extraction are legal for V128 types. | |
6053 | if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || | |
6054 | VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || | |
6055 | VT == MVT::v8f16) | |
6056 | return Op; | |
6057 | ||
6058 | if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && | |
6059 | VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) | |
6060 | return SDValue(); | |
6061 | ||
  // For V64 types, we perform insertion by expanding the value
  // to a V128 type and performing the insertion on that.
6064 | SDLoc DL(Op); | |
6065 | SDValue WideVec = WidenVector(Op.getOperand(0), DAG); | |
6066 | EVT WideTy = WideVec.getValueType(); | |
6067 | ||
6068 | SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec, | |
6069 | Op.getOperand(1), Op.getOperand(2)); | |
6070 | // Re-narrow the resultant vector. | |
6071 | return NarrowVector(Node, DAG); | |
6072 | } | |
6073 | ||
6074 | SDValue | |
6075 | AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, | |
6076 | SelectionDAG &DAG) const { | |
6077 | assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!"); | |
6078 | ||
6079 | // Check for non-constant or out of range lane. | |
6080 | EVT VT = Op.getOperand(0).getValueType(); | |
6081 | ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1)); | |
6082 | if (!CI || CI->getZExtValue() >= VT.getVectorNumElements()) | |
6083 | return SDValue(); | |
6084 | ||
6086 | // Insertion/extraction are legal for V128 types. | |
6087 | if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 || | |
6088 | VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 || | |
6089 | VT == MVT::v8f16) | |
6090 | return Op; | |
6091 | ||
6092 | if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 && | |
6093 | VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16) | |
6094 | return SDValue(); | |
6095 | ||
  // For V64 types, we perform extraction by expanding the value
  // to a V128 type and performing the extraction on that.
6098 | SDLoc DL(Op); | |
6099 | SDValue WideVec = WidenVector(Op.getOperand(0), DAG); | |
6100 | EVT WideTy = WideVec.getValueType(); | |
6101 | ||
6102 | EVT ExtrTy = WideTy.getVectorElementType(); | |
6103 | if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8) | |
6104 | ExtrTy = MVT::i32; | |
6105 | ||
6106 | // For extractions, we just return the result directly. | |
6107 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec, | |
6108 | Op.getOperand(1)); | |
6109 | } | |
6110 | ||
6111 | SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op, | |
6112 | SelectionDAG &DAG) const { | |
6113 | EVT VT = Op.getOperand(0).getValueType(); | |
6114 | SDLoc dl(Op); | |
6115 | // Just in case... | |
6116 | if (!VT.isVector()) | |
6117 | return SDValue(); | |
6118 | ||
6119 | ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1)); | |
6120 | if (!Cst) | |
6121 | return SDValue(); | |
6122 | unsigned Val = Cst->getZExtValue(); | |
6123 | ||
6124 | unsigned Size = Op.getValueType().getSizeInBits(); | |
6125 | if (Val == 0) { | |
6126 | switch (Size) { | |
6127 | case 8: | |
6128 | return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(), | |
6129 | Op.getOperand(0)); | |
6130 | case 16: | |
6131 | return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(), | |
6132 | Op.getOperand(0)); | |
6133 | case 32: | |
6134 | return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(), | |
6135 | Op.getOperand(0)); | |
6136 | case 64: | |
6137 | return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(), | |
6138 | Op.getOperand(0)); | |
6139 | default: | |
6140 | llvm_unreachable("Unexpected vector type in extract_subvector!"); | |
6141 | } | |
6142 | } | |
6143 | // If this is extracting the upper 64 bits of a 128-bit vector, we match | |
6144 | // that directly. | |
6145 | if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64) | |
6146 | return Op; | |
6147 | ||
6148 | return SDValue(); | |
6149 | } | |
6150 | ||
6151 | bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, | |
6152 | EVT VT) const { | |
6153 | if (VT.getVectorNumElements() == 4 && | |
6154 | (VT.is128BitVector() || VT.is64BitVector())) { | |
6155 | unsigned PFIndexes[4]; | |
6156 | for (unsigned i = 0; i != 4; ++i) { | |
6157 | if (M[i] < 0) | |
6158 | PFIndexes[i] = 8; | |
6159 | else | |
6160 | PFIndexes[i] = M[i]; | |
6161 | } | |
6162 | ||
6163 | // Compute the index in the perfect shuffle table. | |
6164 | unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + | |
6165 | PFIndexes[2] * 9 + PFIndexes[3]; | |
6166 | unsigned PFEntry = PerfectShuffleTable[PFTableIndex]; | |
6167 | unsigned Cost = (PFEntry >> 30); | |
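// Illustrative example (table contents assumed): the mask <0, 2, 4, 6>
// gives PFTableIndex = 0*729 + 2*81 + 4*9 + 6 = 204, and the top two
// bits of that table entry encode the cost of the expansion.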
6168 | ||
6169 | if (Cost <= 4) | |
6170 | return true; | |
6171 | } | |
6172 | ||
6173 | bool DummyBool; | |
6174 | int DummyInt; | |
6175 | unsigned DummyUnsigned; | |
6176 | ||
6177 | return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) || | |
6178 | isREVMask(M, VT, 32) || isREVMask(M, VT, 16) || | |
6179 | isEXTMask(M, VT, DummyBool, DummyUnsigned) || | |
6180 | // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM. | |
6181 | isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) || | |
6182 | isZIPMask(M, VT, DummyUnsigned) || | |
6183 | isTRN_v_undef_Mask(M, VT, DummyUnsigned) || | |
6184 | isUZP_v_undef_Mask(M, VT, DummyUnsigned) || | |
6185 | isZIP_v_undef_Mask(M, VT, DummyUnsigned) || | |
6186 | isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) || | |
6187 | isConcatMask(M, VT, VT.getSizeInBits() == 128)); | |
6188 | } | |
6189 | ||
6190 | /// getVShiftImm - Check if this is a valid build_vector for the immediate | |
6191 | /// operand of a vector shift operation, where all the elements of the | |
6192 | /// build_vector must have the same constant integer value. | |
6193 | static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) { | |
6194 | // Ignore bit_converts. | |
6195 | while (Op.getOpcode() == ISD::BITCAST) | |
6196 | Op = Op.getOperand(0); | |
6197 | BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode()); | |
6198 | APInt SplatBits, SplatUndef; | |
6199 | unsigned SplatBitSize; | |
6200 | bool HasAnyUndefs; | |
6201 | if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, | |
6202 | HasAnyUndefs, ElementBits) || | |
6203 | SplatBitSize > ElementBits) | |
6204 | return false; | |
6205 | Cnt = SplatBits.getSExtValue(); | |
6206 | return true; | |
6207 | } | |
6208 | ||
6209 | /// isVShiftLImm - Check if this is a valid build_vector for the immediate | |
6210 | /// operand of a vector shift left operation. That value must be in the range: | |
6211 | /// 0 <= Value < ElementBits for a left shift; or | |
6212 | /// 0 <= Value <= ElementBits for a long left shift. | |
6213 | static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) { | |
6214 | assert(VT.isVector() && "vector shift count is not a vector type"); | |
6215 | unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); | |
6216 | if (!getVShiftImm(Op, ElementBits, Cnt)) | |
6217 | return false; | |
6218 | return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits); | |
6219 | } | |
6220 | ||
6221 | /// isVShiftRImm - Check if this is a valid build_vector for the immediate | |
6222 | /// operand of a vector shift right operation. For a shift opcode, the count | |
6223 | /// is positive, but for an intrinsic the count must be negative. The | |
6224 | /// absolute value must be in the range: | |
6225 | /// 1 <= |Value| <= ElementBits for a right shift; or | |
6226 | /// 1 <= |Value| <= ElementBits/2 for a narrow right shift. | |
6227 | static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic, | |
6228 | int64_t &Cnt) { | |
6229 | assert(VT.isVector() && "vector shift count is not a vector type"); | |
6230 | unsigned ElementBits = VT.getVectorElementType().getSizeInBits(); | |
6231 | if (!getVShiftImm(Op, ElementBits, Cnt)) | |
6232 | return false; | |
6233 | if (isIntrinsic) | |
6234 | Cnt = -Cnt; | |
6235 | return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits)); | |
6236 | } | |
6237 | ||
6238 | SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op, | |
6239 | SelectionDAG &DAG) const { | |
6240 | EVT VT = Op.getValueType(); | |
6241 | SDLoc DL(Op); | |
6242 | int64_t Cnt; | |
6243 | ||
6244 | if (!Op.getOperand(1).getValueType().isVector()) | |
6245 | return Op; | |
6246 | unsigned EltSize = VT.getVectorElementType().getSizeInBits(); | |
6247 | ||
6248 | switch (Op.getOpcode()) { | |
6249 | default: | |
6250 | llvm_unreachable("unexpected shift opcode"); | |
6251 | ||
6252 | case ISD::SHL: | |
6253 | if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) | |
6254 | return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0), | |
6255 | DAG.getConstant(Cnt, MVT::i32)); | |
6256 | return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, | |
6257 | DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32), | |
6258 | Op.getOperand(0), Op.getOperand(1)); | |
6259 | case ISD::SRA: | |
6260 | case ISD::SRL: | |
6261 | // Right shift immediate | |
6262 | if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) && | |
6263 | Cnt < EltSize) { | |
6264 | unsigned Opc = | |
6265 | (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR; | |
6266 | return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0), | |
6267 | DAG.getConstant(Cnt, MVT::i32)); | |
6268 | } | |
6269 | ||
6270 | // Right shift register. Note that there is no shift-right-register | |
6271 | // instruction; the shift-left-register instruction takes a signed | |
6272 | // value, where negative amounts specify a right shift. | |
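// For example, (srl v, splat(3)) is emitted as an unsigned shift left by
// splat(-3), built from the NEG node below.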
6273 | unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl | |
6274 | : Intrinsic::aarch64_neon_ushl; | |
6275 | // Negate the shift amount. | |
6276 | SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1)); | |
6277 | SDValue NegShiftLeft = | |
6278 | DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, | |
6279 | DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift); | |
6280 | return NegShiftLeft; | |
6281 | } | |
6282 | ||
6283 | return SDValue(); | |
6284 | } | |
6285 | ||
6286 | static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS, | |
6287 | AArch64CC::CondCode CC, bool NoNans, EVT VT, | |
6288 | SDLoc dl, SelectionDAG &DAG) { | |
6289 | EVT SrcVT = LHS.getValueType(); | |
6290 | assert(VT.getSizeInBits() == SrcVT.getSizeInBits() && |
6291 | "function only supposed to emit natural comparisons"); | |
6292 | |
6293 | BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode()); | |
6294 | APInt CnstBits(VT.getSizeInBits(), 0); | |
6295 | APInt UndefBits(VT.getSizeInBits(), 0); | |
6296 | bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits); | |
6297 | bool IsZero = IsCnst && (CnstBits == 0); | |
6298 | ||
6299 | if (SrcVT.getVectorElementType().isFloatingPoint()) { | |
6300 | switch (CC) { | |
6301 | default: | |
6302 | return SDValue(); | |
6303 | case AArch64CC::NE: { | |
6304 | SDValue Fcmeq; | |
6305 | if (IsZero) | |
6306 | Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); | |
6307 | else | |
6308 | Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); | |
6309 | return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq); | |
6310 | } | |
6311 | case AArch64CC::EQ: | |
6312 | if (IsZero) | |
6313 | return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS); | |
6314 | return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS); | |
6315 | case AArch64CC::GE: | |
6316 | if (IsZero) | |
6317 | return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS); | |
6318 | return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS); | |
6319 | case AArch64CC::GT: | |
6320 | if (IsZero) | |
6321 | return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS); | |
6322 | return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS); | |
6323 | case AArch64CC::LS: | |
6324 | if (IsZero) | |
6325 | return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS); | |
6326 | return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS); | |
6327 | case AArch64CC::LT: | |
6328 | if (!NoNans) | |
6329 | return SDValue(); | |
6330 | // If we ignore NaNs then we can use the MI implementation. | |
6331 | // Fallthrough. | |
6332 | case AArch64CC::MI: | |
6333 | if (IsZero) | |
6334 | return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS); | |
6335 | return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS); | |
6336 | } | |
6337 | } | |
6338 | ||
6339 | switch (CC) { | |
6340 | default: | |
6341 | return SDValue(); | |
6342 | case AArch64CC::NE: { | |
6343 | SDValue Cmeq; | |
6344 | if (IsZero) | |
6345 | Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); | |
6346 | else | |
6347 | Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); | |
6348 | return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq); | |
6349 | } | |
6350 | case AArch64CC::EQ: | |
6351 | if (IsZero) | |
6352 | return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS); | |
6353 | return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS); | |
6354 | case AArch64CC::GE: | |
6355 | if (IsZero) | |
6356 | return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS); | |
6357 | return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS); | |
6358 | case AArch64CC::GT: | |
6359 | if (IsZero) | |
6360 | return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS); | |
6361 | return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS); | |
6362 | case AArch64CC::LE: | |
6363 | if (IsZero) | |
6364 | return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS); | |
6365 | return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS); | |
6366 | case AArch64CC::LS: | |
6367 | return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS); | |
6368 | case AArch64CC::LO: | |
6369 | return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS); | |
6370 | case AArch64CC::LT: | |
6371 | if (IsZero) | |
6372 | return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS); | |
6373 | return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS); | |
6374 | case AArch64CC::HI: | |
6375 | return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS); | |
6376 | case AArch64CC::HS: | |
6377 | return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS); | |
6378 | } | |
6379 | } | |
6380 | ||
6381 | SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, | |
6382 | SelectionDAG &DAG) const { | |
6383 | ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); | |
6384 | SDValue LHS = Op.getOperand(0); | |
6385 | SDValue RHS = Op.getOperand(1); | |
6386 | EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger(); | |
6387 | SDLoc dl(Op); |
6388 | ||
6389 | if (LHS.getValueType().getVectorElementType().isInteger()) { | |
6390 | assert(LHS.getValueType() == RHS.getValueType()); | |
6391 | AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC); | |
6392 | SDValue Cmp = | |
6393 | EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG); | |
6394 | return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); | |
6395 | } |
6396 | ||
6397 | assert(LHS.getValueType().getVectorElementType() == MVT::f32 || | |
6398 | LHS.getValueType().getVectorElementType() == MVT::f64); | |
6399 | ||
6400 | // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally | |
6401 | // clean. Some of them require two branches to implement. | |
6402 | AArch64CC::CondCode CC1, CC2; | |
6403 | bool ShouldInvert; | |
6404 | changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert); | |
6405 | ||
6406 | bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath; | |
6407 | SDValue Cmp = | |
6408 | EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG); | |
6409 | if (!Cmp.getNode()) |
6410 | return SDValue(); | |
6411 | ||
6412 | if (CC2 != AArch64CC::AL) { | |
6413 | SDValue Cmp2 = | |
6414 | EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG); | |
6415 | if (!Cmp2.getNode()) |
6416 | return SDValue(); | |
6417 | ||
6418 | Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2); | |
6419 | } |
6420 | ||
6421 | Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType()); | |
6422 | ||
6423 | if (ShouldInvert) |
6424 | return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType()); | |
6425 | ||
6426 | return Cmp; | |
6427 | } | |
6428 | ||
6429 | /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as | |
6430 | /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment | |
6431 | /// specified in the intrinsic calls. | |
6432 | bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, | |
6433 | const CallInst &I, | |
6434 | unsigned Intrinsic) const { | |
6435 | switch (Intrinsic) { | |
6436 | case Intrinsic::aarch64_neon_ld2: | |
6437 | case Intrinsic::aarch64_neon_ld3: | |
6438 | case Intrinsic::aarch64_neon_ld4: | |
6439 | case Intrinsic::aarch64_neon_ld1x2: | |
6440 | case Intrinsic::aarch64_neon_ld1x3: | |
6441 | case Intrinsic::aarch64_neon_ld1x4: | |
6442 | case Intrinsic::aarch64_neon_ld2lane: | |
6443 | case Intrinsic::aarch64_neon_ld3lane: | |
6444 | case Intrinsic::aarch64_neon_ld4lane: | |
6445 | case Intrinsic::aarch64_neon_ld2r: | |
6446 | case Intrinsic::aarch64_neon_ld3r: | |
6447 | case Intrinsic::aarch64_neon_ld4r: { | |
6448 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |
6449 | // Conservatively set memVT to the entire set of vectors loaded. | |
6450 | uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8; | |
6451 | Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); | |
6452 | Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); | |
6453 | Info.offset = 0; | |
6454 | Info.align = 0; | |
6455 | Info.vol = false; // volatile loads with NEON intrinsics not supported | |
6456 | Info.readMem = true; | |
6457 | Info.writeMem = false; | |
6458 | return true; | |
6459 | } | |
6460 | case Intrinsic::aarch64_neon_st2: | |
6461 | case Intrinsic::aarch64_neon_st3: | |
6462 | case Intrinsic::aarch64_neon_st4: | |
6463 | case Intrinsic::aarch64_neon_st1x2: | |
6464 | case Intrinsic::aarch64_neon_st1x3: | |
6465 | case Intrinsic::aarch64_neon_st1x4: | |
6466 | case Intrinsic::aarch64_neon_st2lane: | |
6467 | case Intrinsic::aarch64_neon_st3lane: | |
6468 | case Intrinsic::aarch64_neon_st4lane: { | |
6469 | Info.opc = ISD::INTRINSIC_VOID; | |
6470 | // Conservatively set memVT to the entire set of vectors stored. | |
6471 | unsigned NumElts = 0; | |
6472 | for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) { | |
6473 | Type *ArgTy = I.getArgOperand(ArgI)->getType(); | |
6474 | if (!ArgTy->isVectorTy()) | |
6475 | break; | |
6476 | NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8; | |
6477 | } | |
6478 | Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts); | |
6479 | Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1); | |
6480 | Info.offset = 0; | |
6481 | Info.align = 0; | |
6482 | Info.vol = false; // volatile stores with NEON intrinsics not supported | |
6483 | Info.readMem = false; | |
6484 | Info.writeMem = true; | |
6485 | return true; | |
6486 | } | |
6487 | case Intrinsic::aarch64_ldaxr: | |
6488 | case Intrinsic::aarch64_ldxr: { | |
6489 | PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType()); | |
6490 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |
6491 | Info.memVT = MVT::getVT(PtrTy->getElementType()); | |
6492 | Info.ptrVal = I.getArgOperand(0); | |
6493 | Info.offset = 0; | |
6494 | Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); | |
6495 | Info.vol = true; | |
6496 | Info.readMem = true; | |
6497 | Info.writeMem = false; | |
6498 | return true; | |
6499 | } | |
6500 | case Intrinsic::aarch64_stlxr: | |
6501 | case Intrinsic::aarch64_stxr: { | |
6502 | PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); | |
6503 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |
6504 | Info.memVT = MVT::getVT(PtrTy->getElementType()); | |
6505 | Info.ptrVal = I.getArgOperand(1); | |
6506 | Info.offset = 0; | |
6507 | Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType()); | |
6508 | Info.vol = true; | |
6509 | Info.readMem = false; | |
6510 | Info.writeMem = true; | |
6511 | return true; | |
6512 | } | |
6513 | case Intrinsic::aarch64_ldaxp: | |
6514 | case Intrinsic::aarch64_ldxp: { | |
6515 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |
6516 | Info.memVT = MVT::i128; | |
6517 | Info.ptrVal = I.getArgOperand(0); | |
6518 | Info.offset = 0; | |
6519 | Info.align = 16; | |
6520 | Info.vol = true; | |
6521 | Info.readMem = true; | |
6522 | Info.writeMem = false; | |
6523 | return true; | |
6524 | } | |
6525 | case Intrinsic::aarch64_stlxp: | |
6526 | case Intrinsic::aarch64_stxp: { | |
6527 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |
6528 | Info.memVT = MVT::i128; | |
6529 | Info.ptrVal = I.getArgOperand(2); | |
6530 | Info.offset = 0; | |
6531 | Info.align = 16; | |
6532 | Info.vol = true; | |
6533 | Info.readMem = false; | |
6534 | Info.writeMem = true; | |
6535 | return true; | |
6536 | } | |
6537 | default: | |
6538 | break; | |
6539 | } | |
6540 | ||
6541 | return false; | |
6542 | } | |
6543 | ||
6544 | // Truncations from a 64-bit GPR to a 32-bit GPR are free. | |
6545 | bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { | |
6546 | if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) | |
6547 | return false; | |
6548 | unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); | |
6549 | unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); | |
6550 | return NumBits1 > NumBits2; | |
6551 | } | |
6552 | bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { | |
6553 | if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) | |
6554 | return false; | |
6555 | unsigned NumBits1 = VT1.getSizeInBits(); | |
6556 | unsigned NumBits2 = VT2.getSizeInBits(); | |
6557 | return NumBits1 > NumBits2; | |
6558 | } | |
6559 | ||
6560 | // All 32-bit GPR operations implicitly zero the high-half of the corresponding | |
6561 | // 64-bit GPR. | |
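// For example, "add w0, w1, w2" also clears bits [63:32] of x0, so a later
// zero-extension from i32 to i64 costs no extra instruction.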
6562 | bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { | |
6563 | if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) | |
6564 | return false; | |
6565 | unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); | |
6566 | unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); | |
6567 | return NumBits1 == 32 && NumBits2 == 64; | |
6568 | } | |
6569 | bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { | |
6570 | if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger()) | |
6571 | return false; | |
6572 | unsigned NumBits1 = VT1.getSizeInBits(); | |
6573 | unsigned NumBits2 = VT2.getSizeInBits(); | |
6574 | return NumBits1 == 32 && NumBits2 == 64; | |
6575 | } | |
6576 | ||
6577 | bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { | |
6578 | EVT VT1 = Val.getValueType(); | |
6579 | if (isZExtFree(VT1, VT2)) { | |
6580 | return true; | |
6581 | } | |
6582 | ||
6583 | if (Val.getOpcode() != ISD::LOAD) | |
6584 | return false; | |
6585 | ||
6586 | // 8-, 16-, and 32-bit integer loads all implicitly zero-extend. | |
6587 | return (VT1.isSimple() && !VT1.isVector() && VT1.isInteger() && | |
6588 | VT2.isSimple() && !VT2.isVector() && VT2.isInteger() && | |
6589 | VT1.getSizeInBits() <= 32); | |
6590 | } | |
6591 | ||
6592 | bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType, | |
6593 | unsigned &RequiredAlignment) const { | |
6594 | if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy()) | |
6595 | return false; | |
6596 | // Cyclone supports unaligned accesses. | |
6597 | RequiredAlignment = 0; | |
6598 | unsigned NumBits = LoadedType->getPrimitiveSizeInBits(); | |
6599 | return NumBits == 32 || NumBits == 64; | |
6600 | } | |
6601 | ||
6602 | bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType, | |
6603 | unsigned &RequiredAlignment) const { | |
6604 | if (!LoadedType.isSimple() || | |
6605 | (!LoadedType.isInteger() && !LoadedType.isFloatingPoint())) | |
6606 | return false; | |
6607 | // Cyclone supports unaligned accesses. | |
6608 | RequiredAlignment = 0; | |
6609 | unsigned NumBits = LoadedType.getSizeInBits(); | |
6610 | return NumBits == 32 || NumBits == 64; | |
6611 | } | |
6612 | ||
6613 | static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, | |
6614 | unsigned AlignCheck) { | |
6615 | return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) && | |
6616 | (DstAlign == 0 || DstAlign % AlignCheck == 0)); | |
6617 | } | |
6618 | ||
6619 | EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, | |
6620 | unsigned SrcAlign, bool IsMemset, | |
6621 | bool ZeroMemset, | |
6622 | bool MemcpyStrSrc, | |
6623 | MachineFunction &MF) const { | |
6624 | // Don't use AdvSIMD to implement 16-byte memset. It would take one | |
6625 | // instruction to materialize the v2i64 zero and one store (with a | |
6626 | // restrictive addressing mode). Just do two i64 stores of the zero register. | |
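// For example, memset(p, 0, 16) becomes "str xzr, [p]" followed by
// "str xzr, [p, #8]" rather than "movi v0.2d, #0" plus "str q0, [p]".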
6627 | bool Fast; | |
6628 | const Function *F = MF.getFunction(); | |
6629 | if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 && | |
6630 | !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, | |
6631 | Attribute::NoImplicitFloat) && | |
6632 | (memOpAlign(SrcAlign, DstAlign, 16) || | |
6633 | (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast))) | |
6634 | return MVT::f128; | |
6635 | ||
6636 | return Size >= 8 ? MVT::i64 : MVT::i32; | |
6637 | } | |
6638 | ||
6639 | // 12-bit optionally shifted immediates are legal for adds. | |
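// For example, 0xfff and 0xfff000 are both legal add immediates, but
// 0x1001 is not: it has more than 12 significant bits and is not a
// 12-bit value shifted left by 12.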
6640 | bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { | |
6641 | if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0)) | |
6642 | return true; | |
6643 | return false; | |
6644 | } | |
6645 | ||
6646 | // Integer comparisons are implemented with ADDS/SUBS, so the range of valid | |
6647 | // immediates is the same as for an add or a sub. | |
6648 | bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const { | |
6649 | if (Immed < 0) | |
6650 | Immed *= -1; | |
6651 | return isLegalAddImmediate(Immed); | |
6652 | } | |
6653 | ||
6654 | /// isLegalAddressingMode - Return true if the addressing mode represented | |
6655 | /// by AM is legal for this target, for a load/store of the specified type. | |
6656 | bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM, | |
6657 | Type *Ty) const { | |
6658 | // AArch64 has five basic addressing modes: | |
6659 | // reg | |
6660 | // reg + 9-bit signed offset | |
6661 | // reg + SIZE_IN_BYTES * 12-bit unsigned offset | |
6662 | // reg1 + reg2 | |
6663 | // reg + SIZE_IN_BYTES * reg | |
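// For example, for an i64 access (SIZE_IN_BYTES == 8), all of [x0],
// [x0, #-256], [x0, #32760] and [x0, x1, lsl #3] are representable, but
// [x0, #32764] is not: that offset is outside the signed 9-bit range and
// is not a multiple of 8 (illustrative; register names assumed).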
6664 | ||
6665 | // No global is ever allowed as a base. | |
6666 | if (AM.BaseGV) | |
6667 | return false; | |
6668 | ||
6669 | // No reg+reg+imm addressing. | |
6670 | if (AM.HasBaseReg && AM.BaseOffs && AM.Scale) | |
6671 | return false; | |
6672 | ||
6673 | // check reg + imm case: | |
6674 | // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12 | |
6675 | uint64_t NumBytes = 0; | |
6676 | if (Ty->isSized()) { | |
6677 | uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty); | |
6678 | NumBytes = NumBits / 8; | |
6679 | if (!isPowerOf2_64(NumBits)) | |
6680 | NumBytes = 0; | |
6681 | } | |
6682 | ||
6683 | if (!AM.Scale) { | |
6684 | int64_t Offset = AM.BaseOffs; | |
6685 | ||
6686 | // 9-bit signed offset | |
6687 | if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1) | |
6688 | return true; | |
6689 | ||
6690 | // 12-bit unsigned offset | |
6691 | unsigned shift = Log2_64(NumBytes); | |
6692 | if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 && | |
6693 | // Must be a multiple of NumBytes (NumBytes is a power of 2) | |
6694 | (Offset >> shift) << shift == Offset) | |
6695 | return true; | |
6696 | return false; | |
6697 | } | |
6698 | ||
6699 | // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2 | |
6700 | ||
6701 | if (!AM.Scale || AM.Scale == 1 || | |
6702 | (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes)) | |
6703 | return true; | |
6704 | return false; | |
6705 | } | |
6706 | ||
6707 | int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM, | |
6708 | Type *Ty) const { | |
6709 | // Scaling factors are not free at all. | |
6710 | // Operands | Rt Latency | |
6711 | // ------------------------------------------- | |
6712 | // Rt, [Xn, Xm] | 4 | |
6713 | // ------------------------------------------- | |
6714 | // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5 | |
6715 | // Rt, [Xn, Wm, <extend> #imm] | | |
6716 | if (isLegalAddressingMode(AM, Ty)) | |
6717 | // Scale represents reg2 * scale, thus account for 1 if | |
6718 | // it is not equal to 0 or 1. | |
6719 | return AM.Scale != 0 && AM.Scale != 1; | |
6720 | return -1; | |
6721 | } | |
6722 | ||
6723 | bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { | |
6724 | VT = VT.getScalarType(); | |
6725 | ||
6726 | if (!VT.isSimple()) | |
6727 | return false; | |
6728 | ||
6729 | switch (VT.getSimpleVT().SimpleTy) { | |
6730 | case MVT::f32: | |
6731 | case MVT::f64: | |
6732 | return true; | |
6733 | default: | |
6734 | break; | |
6735 | } | |
6736 | ||
6737 | return false; | |
6738 | } | |
6739 | ||
6740 | const MCPhysReg * | |
6741 | AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { | |
6742 | // LR is a callee-save register, but we must treat it as clobbered by any call | |
6743 | // site. Hence we include LR in the scratch registers, which are in turn added | |
6744 | // as implicit-defs for stackmaps and patchpoints. | |
6745 | static const MCPhysReg ScratchRegs[] = { | |
6746 | AArch64::X16, AArch64::X17, AArch64::LR, 0 | |
6747 | }; | |
6748 | return ScratchRegs; | |
6749 | } | |
6750 | ||
6751 | bool | |
6752 | AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const { | |
6753 | EVT VT = N->getValueType(0); | |
6754 | // If N is an unsigned bit extraction, ((x >> C) & mask), do not combine | |
6755 | // it with the shift so that it can still be lowered to UBFX. | |
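// For example, ((x >> 3) & 0xff) can select to "ubfx w0, w1, #3, #8"; that
// pattern would be lost if the AND were commuted with an outer shift
// (register names assumed for illustration).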
6756 | if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) && | |
6757 | isa<ConstantSDNode>(N->getOperand(1))) { | |
6758 | uint64_t TruncMask = N->getConstantOperandVal(1); | |
6759 | if (isMask_64(TruncMask) && | |
6760 | N->getOperand(0).getOpcode() == ISD::SRL && | |
6761 | isa<ConstantSDNode>(N->getOperand(0)->getOperand(1))) | |
6762 | return false; | |
6763 | } | |
6764 | return true; | |
6765 | } | |
6766 | ||
6767 | bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, | |
6768 | Type *Ty) const { | |
6769 | assert(Ty->isIntegerTy()); | |
6770 | ||
6771 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |
6772 | if (BitSize == 0) | |
6773 | return false; | |
6774 | ||
6775 | int64_t Val = Imm.getSExtValue(); | |
6776 | if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize)) | |
6777 | return true; | |
6778 | ||
6779 | if ((int64_t)Val < 0) | |
6780 | Val = ~Val; | |
6781 | if (BitSize == 32) | |
6782 | Val &= (1LL << 32) - 1; | |
6783 | ||
6784 | unsigned LZ = countLeadingZeros((uint64_t)Val); | |
6785 | unsigned Shift = (63 - LZ) / 16; | |
6786 | // MOVZ is free so return true for one or fewer MOVK. | |
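// For example, 0x12345678 needs MOVZ plus a single MOVK (Shift == 1), so it
// is cheaper as an immediate; something like 0x1234567890abcdef needs three
// MOVKs (Shift == 3) and is better left as a load.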
6787 | return Shift < 3; | |
6788 | } | |
6789 | ||
6790 | // Generate SUBS and CSEL for integer abs. | |
6791 | static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) { | |
6792 | EVT VT = N->getValueType(0); | |
6793 | ||
6794 | SDValue N0 = N->getOperand(0); | |
6795 | SDValue N1 = N->getOperand(1); | |
6796 | SDLoc DL(N); | |
6797 | ||
6798 | // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) | |
6799 | // and change it to SUB and CSEL. | |
6800 | if (VT.isInteger() && N->getOpcode() == ISD::XOR && | |
6801 | N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && | |
6802 | N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) | |
6803 | if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1))) | |
6804 | if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { | |
6805 | SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), | |
6806 | N0.getOperand(0)); | |
6807 | // Generate SUBS & CSEL. | |
6808 | SDValue Cmp = | |
6809 | DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32), | |
6810 | N0.getOperand(0), DAG.getConstant(0, VT)); | |
6811 | return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg, | |
6812 | DAG.getConstant(AArch64CC::PL, MVT::i32), | |
6813 | SDValue(Cmp.getNode(), 1)); | |
6814 | } | |
6815 | return SDValue(); | |
6816 | } | |
6817 | ||
6818 | // performXorCombine - Attempts to handle integer ABS. | |
6819 | static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG, | |
6820 | TargetLowering::DAGCombinerInfo &DCI, | |
6821 | const AArch64Subtarget *Subtarget) { | |
6822 | if (DCI.isBeforeLegalizeOps()) | |
6823 | return SDValue(); | |
6824 | ||
6825 | return performIntegerAbsCombine(N, DAG); | |
6826 | } | |
6827 | ||
6828 | SDValue | |
6829 | AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, | |
6830 | SelectionDAG &DAG, | |
6831 | std::vector<SDNode *> *Created) const { | |
6832 | // fold (sdiv X, pow2) | |
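// For example, (sdiv x, 8) becomes roughly (register choices illustrative):
//   cmp  x0, #0
//   add  x8, x0, #7
//   csel x8, x8, x0, lt
//   asr  x0, x8, #3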
6833 | EVT VT = N->getValueType(0); | |
6834 | if ((VT != MVT::i32 && VT != MVT::i64) || | |
6835 | !(Divisor.isPowerOf2() || (-Divisor).isPowerOf2())) | |
6836 | return SDValue(); | |
6837 | ||
6838 | SDLoc DL(N); | |
6839 | SDValue N0 = N->getOperand(0); | |
6840 | unsigned Lg2 = Divisor.countTrailingZeros(); | |
6841 | SDValue Zero = DAG.getConstant(0, VT); | |
6842 | SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, VT); | |
6843 | |
6844 | // Add (N0 < 0) ? Pow2 - 1 : 0; | |
6845 | SDValue CCVal; | |
6846 | SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL); | |
6847 | SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); | |
6848 | SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp); | |
6849 | ||
6850 | if (Created) { | |
6851 | Created->push_back(Cmp.getNode()); | |
6852 | Created->push_back(Add.getNode()); | |
6853 | Created->push_back(CSel.getNode()); | |
6854 | } | |
6855 | ||
6856 | // Divide by pow2. | |
6857 | SDValue SRA = | |
6858 | DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, MVT::i64)); | |
6859 | ||
6860 | // If we're dividing by a positive value, we're done. Otherwise, we must | |
6861 | // negate the result. | |
6862 | if (Divisor.isNonNegative()) | |
6863 | return SRA; | |
6864 | ||
6865 | if (Created) | |
6866 | Created->push_back(SRA.getNode()); | |
6867 | return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), SRA); | |
6868 | } | |
6869 | ||
6870 | static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, | |
6871 | TargetLowering::DAGCombinerInfo &DCI, | |
6872 | const AArch64Subtarget *Subtarget) { | |
6873 | if (DCI.isBeforeLegalizeOps()) | |
6874 | return SDValue(); | |
6875 | ||
6876 | // Multiplication of a power of two plus/minus one can be done more | |
6877 | // cheaply as a shift+add/sub. For now, this is true unilaterally. If | |
6878 | // future CPUs have a cheaper MADD instruction, this may need to be | |
6879 | // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and | |
6880 | // 64-bit is 5 cycles, so this is always a win. | |
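// For example, (mul x, 9) becomes (add (shl x, 3), x) and (mul x, 7)
// becomes (sub (shl x, 3), x).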
6881 | if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) { | |
6882 | APInt Value = C->getAPIntValue(); | |
6883 | EVT VT = N->getValueType(0); | |
6884 | if (Value.isNonNegative()) { | |
6885 | // (mul x, 2^N + 1) => (add (shl x, N), x) | |
6886 | APInt VM1 = Value - 1; | |
6887 | if (VM1.isPowerOf2()) { | |
6888 | SDValue ShiftedVal = | |
6889 | DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), | |
6890 | DAG.getConstant(VM1.logBase2(), MVT::i64)); | |
6891 | return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, | |
6892 | N->getOperand(0)); | |
6893 | } | |
6894 | // (mul x, 2^N - 1) => (sub (shl x, N), x) | |
6895 | APInt VP1 = Value + 1; | |
6896 | if (VP1.isPowerOf2()) { | |
6897 | SDValue ShiftedVal = | |
6898 | DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), | |
6899 | DAG.getConstant(VP1.logBase2(), MVT::i64)); | |
6900 | return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, | |
6901 | N->getOperand(0)); | |
6902 | } | |
6903 | } else { | |
6904 | // (mul x, -(2^N + 1)) => - (add (shl x, N), x) | |
6905 | APInt VNM1 = -Value - 1; | |
6906 | if (VNM1.isPowerOf2()) { | |
6907 | SDValue ShiftedVal = | |
6908 | DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), | |
6909 | DAG.getConstant(VNM1.logBase2(), MVT::i64)); | |
6910 | SDValue Add = | |
6911 | DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0)); | |
6912 | return DAG.getNode(ISD::SUB, SDLoc(N), VT, DAG.getConstant(0, VT), Add); | |
6913 | } | |
6914 | // (mul x, -(2^N - 1)) => (sub x, (shl x, N)) | |
6915 | APInt VNP1 = -Value + 1; | |
6916 | if (VNP1.isPowerOf2()) { | |
6917 | SDValue ShiftedVal = | |
6918 | DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), | |
6919 | DAG.getConstant(VNP1.logBase2(), MVT::i64)); | |
6920 | return DAG.getNode(ISD::SUB, SDLoc(N), VT, N->getOperand(0), | |
6921 | ShiftedVal); | |
6922 | } | |
6923 | } | |
6924 | } | |
6925 | return SDValue(); | |
6926 | } | |
6927 | ||
6928 | static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N, | |
6929 | SelectionDAG &DAG) { | |
6930 | // Take advantage of vector comparisons producing 0 or -1 in each lane to | |
6931 | // optimize away the operation when it is applied to a constant. | |
6932 | // | |
6933 | // The general transformation is: | |
6934 | // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> | |
6935 | // AND(VECTOR_CMP(x,y), constant2) | |
6936 | // constant2 = UNARYOP(constant) | |
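// For example, (sint_to_fp (and (setcc ...), splat(1))) can become an AND
// of the comparison with the integer bit pattern of splat(1.0), since each
// lane of the comparison result is either all-zeros or all-ones.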
6937 | ||
6938 | // Early exit if this isn't a vector operation, the operand of the | |
6939 | // unary operation isn't a bitwise AND, or if the sizes of the operations | |
6940 | // aren't the same. | |
6941 | EVT VT = N->getValueType(0); | |
6942 | if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND || | |
6943 | N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC || | |
6944 | VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits()) | |
6945 | return SDValue(); | |
6946 | ||
6947 | // Now check that the other operand of the AND is a constant. We could | |
6948 | // make the transformation for non-constant splats as well, but it's unclear | |
6949 | // that would be a benefit as it would not eliminate any operations, just | |
6950 | // perform one more step in scalar code before moving to the vector unit. | |
6951 | if (BuildVectorSDNode *BV = | |
6952 | dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) { | |
6953 | // Bail out if the vector isn't a constant. | |
6954 | if (!BV->isConstant()) | |
6955 | return SDValue(); | |
6956 | ||
6957 | // Everything checks out. Build up the new and improved node. | |
6958 | SDLoc DL(N); | |
6959 | EVT IntVT = BV->getValueType(0); | |
6960 | // Create a new constant of the appropriate type for the transformed | |
6961 | // DAG. | |
6962 | SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); | |
6963 | // The AND node needs bitcasts to/from an integer vector type around it. | |
6964 | SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst); | |
6965 | SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, | |
6966 | N->getOperand(0)->getOperand(0), MaskConst); | |
6967 | SDValue Res = DAG.getNode(ISD::BITCAST, DL, VT, NewAnd); | |
6968 | return Res; | |
6969 | } | |
6970 | ||
6971 | return SDValue(); | |
6972 | } | |
6973 | ||
6974 | static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG, | |
6975 | const AArch64Subtarget *Subtarget) { | |
6976 | // First try to optimize away the conversion when it's conditionally from |
6977 | // a constant. Vectors only. | |
6978 | SDValue Res = performVectorCompareAndMaskUnaryOpCombine(N, DAG); | |
6979 | if (Res != SDValue()) | |
6980 | return Res; | |
6981 | ||
6982 | EVT VT = N->getValueType(0); | |
6983 | if (VT != MVT::f32 && VT != MVT::f64) | |
6984 | return SDValue(); | |
6985 | ||
6986 | // Only optimize when the source and destination types have the same width. | |
6987 | if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits()) | |
6988 | return SDValue(); | |
6989 | ||
6990 | // If the result of an integer load is only used by an integer-to-float | |
6991 | // conversion, use an FP load and an AdvSIMD scalar {S|U}CVTF instead. | |
6992 | // This eliminates an "integer-to-vector-move" UOP and improves throughput. | |
6993 | SDValue N0 = N->getOperand(0); | |
6994 | if (Subtarget->hasNEON() && ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() && | |
6995 | // Do not change the width of a volatile load. |
6996 | !cast<LoadSDNode>(N0)->isVolatile()) { | |
6997 | LoadSDNode *LN0 = cast<LoadSDNode>(N0); | |
6998 | SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(), | |
6999 | LN0->getPointerInfo(), LN0->isVolatile(), | |
7000 | LN0->isNonTemporal(), LN0->isInvariant(), | |
7001 | LN0->getAlignment()); | |
7002 | ||
7003 | // Make sure successors of the original load stay after it by updating them | |
7004 | // to use the new Chain. | |
7005 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1)); | |
7006 | ||
7007 | unsigned Opcode = | |
7008 | (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF; | |
7009 | return DAG.getNode(Opcode, SDLoc(N), VT, Load); | |
7010 | } | |
7011 | ||
7012 | return SDValue(); | |
7013 | } | |
7014 | ||
7015 | /// An EXTR instruction is made up of two shifts, ORed together. This helper | |
7016 | /// searches for and classifies those shifts. | |
7017 | static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, | |
7018 | bool &FromHi) { | |
7019 | if (N.getOpcode() == ISD::SHL) | |
7020 | FromHi = false; | |
7021 | else if (N.getOpcode() == ISD::SRL) | |
7022 | FromHi = true; | |
7023 | else | |
7024 | return false; | |
7025 | ||
7026 | if (!isa<ConstantSDNode>(N.getOperand(1))) | |
7027 | return false; | |
7028 | ||
7029 | ShiftAmount = N->getConstantOperandVal(1); | |
7030 | Src = N->getOperand(0); | |
7031 | return true; | |
7032 | } | |
7033 | ||
7034 | /// EXTR instruction extracts a contiguous chunk of bits from two existing | |
7035 | /// registers viewed as a high/low pair. This function looks for the pattern: | |
7036 | /// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an | |
7037 | /// EXTR. Can't quite be done in TableGen because the two immediates aren't | |
7038 | /// independent. | |
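/// For example, with i32 operands,
///   (or (shl x, 24), (srl y, 8)) --> (EXTR x, y, 8)
/// which reads 32 bits of the x:y concatenation starting at bit 8.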
7039 | static SDValue tryCombineToEXTR(SDNode *N, | |
7040 | TargetLowering::DAGCombinerInfo &DCI) { | |
7041 | SelectionDAG &DAG = DCI.DAG; | |
7042 | SDLoc DL(N); | |
7043 | EVT VT = N->getValueType(0); | |
7044 | ||
7045 | assert(N->getOpcode() == ISD::OR && "Unexpected root"); | |
7046 | ||
7047 | if (VT != MVT::i32 && VT != MVT::i64) | |
7048 | return SDValue(); | |
7049 | ||
7050 | SDValue LHS; | |
7051 | uint32_t ShiftLHS = 0; | |
7052 | bool LHSFromHi = false; | |
7053 | if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi)) | |
7054 | return SDValue(); | |
7055 | ||
7056 | SDValue RHS; | |
7057 | uint32_t ShiftRHS = 0; | |
7058 | bool RHSFromHi = false; | |
7059 | if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi)) | |
7060 | return SDValue(); | |
7061 | ||
7062 | // If they're both trying to come from the high part of the register, they're | |
7063 | // not really an EXTR. | |
7064 | if (LHSFromHi == RHSFromHi) | |
7065 | return SDValue(); | |
7066 | ||
7067 | if (ShiftLHS + ShiftRHS != VT.getSizeInBits()) | |
7068 | return SDValue(); | |
7069 | ||
7070 | if (LHSFromHi) { | |
7071 | std::swap(LHS, RHS); | |
7072 | std::swap(ShiftLHS, ShiftRHS); | |
7073 | } | |
7074 | ||
7075 | return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS, | |
7076 | DAG.getConstant(ShiftRHS, MVT::i64)); | |
7077 | } | |
7078 | ||
7079 | static SDValue tryCombineToBSL(SDNode *N, | |
7080 | TargetLowering::DAGCombinerInfo &DCI) { | |
7081 | EVT VT = N->getValueType(0); | |
7082 | SelectionDAG &DAG = DCI.DAG; | |
7083 | SDLoc DL(N); | |
7084 | ||
7085 | if (!VT.isVector()) | |
7086 | return SDValue(); | |
7087 | ||
7088 | SDValue N0 = N->getOperand(0); | |
7089 | if (N0.getOpcode() != ISD::AND) | |
7090 | return SDValue(); | |
7091 | ||
7092 | SDValue N1 = N->getOperand(1); | |
7093 | if (N1.getOpcode() != ISD::AND) | |
7094 | return SDValue(); | |
7095 | ||
7096 | // We only have to look for constant vectors here since the general, variable | |
7097 | // case can be handled in TableGen. | |
7098 | unsigned Bits = VT.getVectorElementType().getSizeInBits(); | |
7099 | uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1); | |
7100 | for (int i = 1; i >= 0; --i) | |
7101 | for (int j = 1; j >= 0; --j) { | |
7102 | BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i)); | |
7103 | BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j)); | |
7104 | if (!BVN0 || !BVN1) | |
7105 | continue; | |
7106 | ||
7107 | bool FoundMatch = true; | |
7108 | for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) { | |
7109 | ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k)); | |
7110 | ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k)); | |
7111 | if (!CN0 || !CN1 || | |
7112 | CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) { | |
7113 | FoundMatch = false; | |
7114 | break; | |
7115 | } | |
7116 | } | |
7117 | ||
7118 | if (FoundMatch) | |
7119 | return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0), | |
7120 | N0->getOperand(1 - i), N1->getOperand(1 - j)); | |
7121 | } | |
7122 | ||
7123 | return SDValue(); | |
7124 | } | |
7125 | ||
7126 | static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, | |
7127 | const AArch64Subtarget *Subtarget) { | |
7128 | // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) | |
7129 | if (!EnableAArch64ExtrGeneration) | |
7130 | return SDValue(); | |
7131 | SelectionDAG &DAG = DCI.DAG; | |
7132 | EVT VT = N->getValueType(0); | |
7133 | ||
7134 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) | |
7135 | return SDValue(); | |
7136 | ||
7137 | SDValue Res = tryCombineToEXTR(N, DCI); | |
7138 | if (Res.getNode()) | |
7139 | return Res; | |
7140 | ||
7141 | Res = tryCombineToBSL(N, DCI); | |
7142 | if (Res.getNode()) | |
7143 | return Res; | |
7144 | ||
7145 | return SDValue(); | |
7146 | } | |
7147 | ||
7148 | static SDValue performBitcastCombine(SDNode *N, | |
7149 | TargetLowering::DAGCombinerInfo &DCI, | |
7150 | SelectionDAG &DAG) { | |
7151 | // Wait 'til after everything is legalized to try this. That way we have | |
7152 | // legal vector types and such. | |
7153 | if (DCI.isBeforeLegalizeOps()) | |
7154 | return SDValue(); | |
7155 | ||
7156 | // Remove extraneous bitcasts around an extract_subvector. | |
7157 | // For example, | |
7158 | // (v4i16 (bitconvert | |
7159 | // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) | |
7160 | // becomes | |
7161 | // (extract_subvector ((v8i16 ...), (i64 4))) | |
7162 | ||
7163 | // Only interested in 64-bit vectors as the ultimate result. | |
7164 | EVT VT = N->getValueType(0); | |
7165 | if (!VT.isVector()) | |
7166 | return SDValue(); | |
7167 | if (VT.getSimpleVT().getSizeInBits() != 64) | |
7168 | return SDValue(); | |
7169 | // Is the operand an extract_subvector starting at the beginning or halfway | |
7170 | // point of the vector? A low half may also come through as an | |
7171 | // EXTRACT_SUBREG, so look for that, too. | |
7172 | SDValue Op0 = N->getOperand(0); | |
7173 | if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && | |
7174 | !(Op0->isMachineOpcode() && | |
7175 | Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) | |
7176 | return SDValue(); | |
7177 | uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue(); | |
7178 | if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { | |
7179 | if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) | |
7180 | return SDValue(); | |
7181 | } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { | |
7182 | if (idx != AArch64::dsub) | |
7183 | return SDValue(); | |
7184 | // The dsub reference is equivalent to a lane zero subvector reference. | |
7185 | idx = 0; | |
7186 | } | |
7187 | // Look through the bitcast of the input to the extract. | |
7188 | if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) | |
7189 | return SDValue(); | |
7190 | SDValue Source = Op0->getOperand(0)->getOperand(0); | |
7191 | // If the source type has twice the number of elements as our destination | |
7192 | // type, we know this is an extract of the high or low half of the vector. | |
7193 | EVT SVT = Source->getValueType(0); | |
7194 | if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) | |
7195 | return SDValue(); | |
7196 | ||
7197 | DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); | |
7198 | ||
7199 | // Create the simplified form to just extract the low or high half of the | |
7200 | // vector directly rather than bothering with the bitcasts. | |
7201 | SDLoc dl(N); | |
7202 | unsigned NumElements = VT.getVectorNumElements(); | |
7203 | if (idx) { | |
7204 | SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64); | |
7205 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); | |
7206 | } else { | |
7207 | SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32); | |
7208 | return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, | |
7209 | Source, SubReg), | |
7210 | 0); | |
7211 | } | |
7212 | } | |
7213 | ||
7214 | static SDValue performConcatVectorsCombine(SDNode *N, | |
7215 | TargetLowering::DAGCombinerInfo &DCI, | |
7216 | SelectionDAG &DAG) { | |
7217 | // Wait 'til after everything is legalized to try this. That way we have | |
7218 | // legal vector types and such. | |
7219 | if (DCI.isBeforeLegalizeOps()) | |
7220 | return SDValue(); | |
7221 | ||
7222 | SDLoc dl(N); | |
7223 | EVT VT = N->getValueType(0); | |
7224 | ||
7225 | // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector | |
7226 | // splat. The indexed instructions are going to be expecting a DUPLANE64, so | |
7227 | // canonicalise to that. | |
7228 | if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) { | |
7229 | assert(VT.getVectorElementType().getSizeInBits() == 64); | |
7230 | return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, | |
7231 | WidenVector(N->getOperand(0), DAG), | |
7232 | DAG.getConstant(0, MVT::i64)); | |
7233 | } | |
7234 | ||
7235 | // Canonicalise concat_vectors so that the right-hand vector has as few | |
7236 | // bit-casts as possible before its real operation. The primary matching | |
7237 | // destination for these operations will be the narrowing "2" instructions, | |
7238 | // which depend on the operation being performed on this right-hand vector. | |
7239 | // For example, | |
7240 | // (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS)))) | |
7241 | // becomes | |
7242 | // (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS)) | |
7243 | ||
7244 | SDValue Op1 = N->getOperand(1); | |
7245 | if (Op1->getOpcode() != ISD::BITCAST) | |
7246 | return SDValue(); | |
7247 | SDValue RHS = Op1->getOperand(0); | |
7248 | MVT RHSTy = RHS.getValueType().getSimpleVT(); | |
7249 | // If the RHS is not a vector, this is not the pattern we're looking for. | |
7250 | if (!RHSTy.isVector()) | |
7251 | return SDValue(); | |
7252 | ||
7253 | DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n"); | |
7254 | ||
7255 | MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(), | |
7256 | RHSTy.getVectorNumElements() * 2); | |
7257 | return DAG.getNode( | |
7258 | ISD::BITCAST, dl, VT, | |
7259 | DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy, | |
7260 | DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS)); | |
7261 | } | |
7262 | ||
7263 | static SDValue tryCombineFixedPointConvert(SDNode *N, | |
7264 | TargetLowering::DAGCombinerInfo &DCI, | |
7265 | SelectionDAG &DAG) { | |
7266 | // Wait 'til after everything is legalized to try this. That way we have | |
7267 | // legal vector types and such. | |
7268 | if (DCI.isBeforeLegalizeOps()) | |
7269 | return SDValue(); | |
7270 | // Transform a scalar conversion of a value from a lane extract into a | |
7271 | // lane extract of a vector conversion. E.g., from foo1 to foo2: | |
7272 | // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); } | |
7273 | // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; } | |
7274 | // | |
7275 | // The second form interacts better with instruction selection and the | |
7276 | // register allocator to avoid cross-class register copies that aren't | |
7277 | // coalescable due to a lane reference. | |
7278 | ||
7279 | // Check the operand and see if it originates from a lane extract. | |
7280 | SDValue Op1 = N->getOperand(1); | |
7281 | if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { | |
7282 | // Yep, no additional predication needed. Perform the transform. | |
7283 | SDValue IID = N->getOperand(0); | |
7284 | SDValue Shift = N->getOperand(2); | |
7285 | SDValue Vec = Op1.getOperand(0); | |
7286 | SDValue Lane = Op1.getOperand(1); | |
7287 | EVT ResTy = N->getValueType(0); | |
7288 | EVT VecResTy; | |
7289 | SDLoc DL(N); | |
7290 | ||
7291 | // The vector width should be 128 bits by the time we get here, even | |
7292 | // if it started as 64 bits (the extract_vector handling will have | |
7293 | // done so). | |
7294 | assert(Vec.getValueType().getSizeInBits() == 128 && | |
7295 | "unexpected vector size on extract_vector_elt!"); | |
7296 | if (Vec.getValueType() == MVT::v4i32) | |
7297 | VecResTy = MVT::v4f32; | |
7298 | else if (Vec.getValueType() == MVT::v2i64) | |
7299 | VecResTy = MVT::v2f64; | |
7300 | else | |
7301 | llvm_unreachable("unexpected vector type!"); | |
7302 | ||
7303 | SDValue Convert = | |
7304 | DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift); | |
7305 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane); | |
7306 | } | |
7307 | return SDValue(); | |
7308 | } | |
7309 | ||
7310 | // AArch64 high-vector "long" operations are formed by performing the non-high | |
7311 | // version on an extract_subvector of each operand which gets the high half: | |
7312 | // | |
7313 | // (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS)) | |
7314 | // | |
7315 | // However, there are cases which don't have an extract_high explicitly, but | |
7316 | // have another operation that can be made compatible with one for free. For | |
7317 | // example: | |
7318 | // | |
7319 | // (dupv64 scalar) --> (extract_high (dup128 scalar)) | |
7320 | // | |
7321 | // This routine does the actual conversion of such DUPs, once outer routines | |
7322 | // have determined that everything else is in order. | |
7323 | static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) { | |
7324 | // We can handle most types of duplicate, but the lane ones have an extra | |
7325 | // operand saying *which* lane, so we need to know. | |
7326 | bool IsDUPLANE; | |
7327 | switch (N.getOpcode()) { | |
7328 | case AArch64ISD::DUP: | |
7329 | IsDUPLANE = false; | |
7330 | break; | |
7331 | case AArch64ISD::DUPLANE8: | |
7332 | case AArch64ISD::DUPLANE16: | |
7333 | case AArch64ISD::DUPLANE32: | |
7334 | case AArch64ISD::DUPLANE64: | |
7335 | IsDUPLANE = true; | |
7336 | break; | |
7337 | default: | |
7338 | return SDValue(); | |
7339 | } | |
7340 | ||
7341 | MVT NarrowTy = N.getSimpleValueType(); | |
7342 | if (!NarrowTy.is64BitVector()) | |
7343 | return SDValue(); | |
7344 | ||
7345 | MVT ElementTy = NarrowTy.getVectorElementType(); | |
7346 | unsigned NumElems = NarrowTy.getVectorNumElements(); | |
7347 | MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2); | |
7348 | ||
7349 | SDValue NewDUP; | |
7350 | if (IsDUPLANE) | |
7351 | NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0), | |
7352 | N.getOperand(1)); | |
7353 | else | |
7354 | NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0)); | |
7355 | ||
7356 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy, | |
7357 | NewDUP, DAG.getConstant(NumElems, MVT::i64)); | |
7358 | } | |
7359 | ||
7360 | static bool isEssentiallyExtractSubvector(SDValue N) { |
7361 | if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR) | |
7362 | return true; | |
7363 | ||
7364 | return N.getOpcode() == ISD::BITCAST && |
7365 | N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR; | |
7366 | } | |
7367 | ||
7368 | /// \brief Helper structure to keep track of ISD::SET_CC operands. |
7369 | struct GenericSetCCInfo { | |
7370 | const SDValue *Opnd0; | |
7371 | const SDValue *Opnd1; | |
7372 | ISD::CondCode CC; | |
7373 | }; | |
7374 | ||
7375 | /// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code. |
7376 | struct AArch64SetCCInfo { | |
7377 | const SDValue *Cmp; | |
7378 | AArch64CC::CondCode CC; | |
7379 | }; | |
7380 | ||
7381 | /// \brief Helper structure to keep track of SetCC information. |
7382 | union SetCCInfo { | |
7383 | GenericSetCCInfo Generic; | |
7384 | AArch64SetCCInfo AArch64; | |
7385 | }; | |
7386 | ||
7387 | /// \brief Helper structure to be able to read SetCC information. If set to |
7388 | /// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a | |
7389 | /// GenericSetCCInfo. | |
7390 | struct SetCCInfoAndKind { | |
7391 | SetCCInfo Info; | |
7392 | bool IsAArch64; | |
7393 | }; | |
7394 | ||
7395 | /// \brief Check whether or not \p Op is a SET_CC operation, either a | |
7396 | /// generic one or an AArch64 lowered one. | |
7397 | /// \p SetCCInfo is filled accordingly. | |
7398 | /// \post SetCCInfo is meaningful only when this function returns true. | |
7399 | /// \return True when Op is a kind of SET_CC operation. | |
7401 | static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) { | |
7402 | // If this is a setcc, this is straight forward. | |
7403 | if (Op.getOpcode() == ISD::SETCC) { | |
7404 | SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0); | |
7405 | SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1); | |
7406 | SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get(); | |
7407 | SetCCInfo.IsAArch64 = false; | |
7408 | return true; | |
7409 | } | |
7410 | // Otherwise, check if this is a matching csel instruction. |
7411 | // In other words: | |
7412 | // - csel 1, 0, cc | |
7413 | // - csel 0, 1, !cc | |
7414 | if (Op.getOpcode() != AArch64ISD::CSEL) | |
7415 | return false; | |
7416 | // Set the information about the operands. | |
7417 | // TODO: we want the operands of the Cmp, not the csel. | |
7418 | SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3); | |
7419 | SetCCInfo.IsAArch64 = true; | |
7420 | SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>( | |
7421 | cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); | |
7422 | ||
7423 | // Check that the operands match the constraints: | |
7424 | // (1) Both operands must be constants. | |
7425 | // (2) One must be 1 and the other must be 0. | |
7426 | ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0)); | |
7427 | ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1)); | |
7428 | ||
7429 | // Check (1). | |
7430 | if (!TValue || !FValue) | |
7431 | return false; | |
970d7e83 | 7432 | |
1a4d82fc JJ |
7433 | // Check (2). |
7434 | if (!TValue->isOne()) { | |
7435 | // Update the comparison when we are interested in !cc. | |
7436 | std::swap(TValue, FValue); | |
7437 | SetCCInfo.Info.AArch64.CC = | |
7438 | AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC); | |
970d7e83 | 7439 | } |
1a4d82fc JJ |
7440 | return TValue->isOne() && FValue->isNullValue(); |
7441 | } | |
970d7e83 | 7442 | |
1a4d82fc JJ |
7443 | // Returns true if Op is setcc or zext of setcc. |
7444 | static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) { | |
7445 | if (isSetCC(Op, Info)) | |
7446 | return true; | |
7447 | return ((Op.getOpcode() == ISD::ZERO_EXTEND) && | |
7448 | isSetCC(Op->getOperand(0), Info)); | |
970d7e83 LB |
7449 | } |
7450 | ||
1a4d82fc JJ |
7451 | // The folding we want to perform is: |
7452 | // (add x, [zext] (setcc cc ...) ) | |
7453 | // --> | |
7454 | // (csel x, (add x, 1), !cc ...) | |
7455 | // | |
7456 | // The latter will get matched to a CSINC instruction. | |
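//
// As an illustrative sketch (not from the original source; registers and
// condition are assumed), for C code like "x + (a == b)" this lets isel
// produce:
//
//   cmp   w1, w2           // set flags for a == b
//   csinc w0, w0, w0, ne   // w0 = (a == b) ? w0 + 1 : w0
//
// rather than materializing the setcc with a cset and then adding it.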
7457 | static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { | |
7458 | assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!"); | |
7459 | SDValue LHS = Op->getOperand(0); | |
7460 | SDValue RHS = Op->getOperand(1); | |
7461 | SetCCInfoAndKind InfoAndKind; | |
7462 | ||
7463 | // If neither operand is a SET_CC, give up. | |
7464 | if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) { | |
7465 | std::swap(LHS, RHS); | |
7466 | if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) | |
7467 | return SDValue(); | |
7468 | } | |
970d7e83 | 7469 | |
1a4d82fc JJ |
7470 | // FIXME: This could be generalized to work for FP comparisons. | |
7471 | EVT CmpVT = InfoAndKind.IsAArch64 | |
7472 | ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType() | |
7473 | : InfoAndKind.Info.Generic.Opnd0->getValueType(); | |
7474 | if (CmpVT != MVT::i32 && CmpVT != MVT::i64) | |
7475 | return SDValue(); | |
970d7e83 | 7476 | |
1a4d82fc JJ |
7477 | SDValue CCVal; |
7478 | SDValue Cmp; | |
7479 | SDLoc dl(Op); | |
7480 | if (InfoAndKind.IsAArch64) { | |
7481 | CCVal = DAG.getConstant( | |
7482 | AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32); | |
7483 | Cmp = *InfoAndKind.Info.AArch64.Cmp; | |
7484 | } else | |
7485 | Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, | |
7486 | *InfoAndKind.Info.Generic.Opnd1, | |
7487 | ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), | |
7488 | CCVal, DAG, dl); | |
7489 | ||
7490 | EVT VT = Op->getValueType(0); | |
7491 | LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT)); | |
7492 | return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp); | |
7493 | } | |
970d7e83 | 7494 | |
1a4d82fc JJ |
7495 | // The basic add/sub long vector instructions have variants with "2" on the end |
7496 | // which act on the high-half of their inputs. They are normally matched by | |
7497 | // patterns like: | |
7498 | // | |
7499 | // (add (zeroext (extract_high LHS)), | |
7500 | // (zeroext (extract_high RHS))) | |
7501 | // -> uaddl2 vD, vN, vM | |
7502 | // | |
7503 | // However, if one of the extracts is something like a duplicate, this | |
7504 | // instruction can still be used profitably. This function puts the DAG into a | |
7505 | // more appropriate form for those patterns to trigger. | |
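//
// A sketch of the rewrite this enables (types assumed for illustration):
//
//   (add (zext (extract_high (v16i8 LHS))), (zext (v8i8 (dup scalar))))
//
// has its DUP widened to a v16i8 DUP followed by an extract_high, so both
// operands become extract_high nodes and the existing uaddl2 pattern can
// fire.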
7506 | static SDValue performAddSubLongCombine(SDNode *N, | |
7507 | TargetLowering::DAGCombinerInfo &DCI, | |
7508 | SelectionDAG &DAG) { | |
7509 | if (DCI.isBeforeLegalizeOps()) | |
7510 | return SDValue(); | |
970d7e83 | 7511 | |
1a4d82fc JJ |
7512 | MVT VT = N->getSimpleValueType(0); |
7513 | if (!VT.is128BitVector()) { | |
7514 | if (N->getOpcode() == ISD::ADD) | |
7515 | return performSetccAddFolding(N, DAG); | |
7516 | return SDValue(); | |
7517 | } | |
970d7e83 | 7518 | |
1a4d82fc JJ |
7519 | // Make sure both branches are extended in the same way. |
7520 | SDValue LHS = N->getOperand(0); | |
7521 | SDValue RHS = N->getOperand(1); | |
7522 | if ((LHS.getOpcode() != ISD::ZERO_EXTEND && | |
7523 | LHS.getOpcode() != ISD::SIGN_EXTEND) || | |
7524 | LHS.getOpcode() != RHS.getOpcode()) | |
7525 | return SDValue(); | |
970d7e83 | 7526 | |
1a4d82fc | 7527 | unsigned ExtType = LHS.getOpcode(); |
970d7e83 | 7528 | |
1a4d82fc JJ |
7529 | // It's not worth doing unless at least one of the inputs is already an | |
7530 | // extract, but we don't know which it'll be so we have to try both. | |
7531 | if (isEssentiallyExtractSubvector(LHS.getOperand(0))) { | |
7532 | RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG); | |
7533 | if (!RHS.getNode()) | |
7534 | return SDValue(); | |
970d7e83 | 7535 | |
1a4d82fc JJ |
7536 | RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS); |
7537 | } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) { | |
7538 | LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG); | |
7539 | if (!LHS.getNode()) | |
7540 | return SDValue(); | |
970d7e83 | 7541 | |
1a4d82fc JJ |
7542 | LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS); |
7543 | } | |
970d7e83 | 7544 | |
1a4d82fc JJ |
7545 | return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS); |
7546 | } | |
970d7e83 | 7547 | |
1a4d82fc JJ |
7548 | // Massage DAGs on which we can use the high-half "long" operations into | |
7549 | // something isel will recognize better. E.g. | |
7550 | // | |
7551 | // (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) --> | |
7552 | //   (aarch64_neon_umull (extract_high (v2i64 vec)) | |
7553 | //                       (extract_high (v2i64 (dup128 scalar)))) | |
7554 | // | |
7555 | static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N, | |
7556 | TargetLowering::DAGCombinerInfo &DCI, | |
7557 | SelectionDAG &DAG) { | |
7558 | if (DCI.isBeforeLegalizeOps()) | |
7559 | return SDValue(); | |
970d7e83 | 7560 | |
1a4d82fc JJ |
7561 | SDValue LHS = N->getOperand(1); |
7562 | SDValue RHS = N->getOperand(2); | |
7563 | assert(LHS.getValueType().is64BitVector() && | |
7564 | RHS.getValueType().is64BitVector() && | |
7565 | "unexpected shape for long operation"); | |
7566 | ||
7567 | // Either node could be a DUP, but it's not worth doing both of them (you'd | |
7568 | // just as well use the non-high version) so look for a corresponding extract | |
7569 | // operation on the other "wing". | |
7570 | if (isEssentiallyExtractSubvector(LHS)) { | |
7571 | RHS = tryExtendDUPToExtractHigh(RHS, DAG); | |
7572 | if (!RHS.getNode()) | |
7573 | return SDValue(); | |
7574 | } else if (isEssentiallyExtractSubvector(RHS)) { | |
7575 | LHS = tryExtendDUPToExtractHigh(LHS, DAG); | |
7576 | if (!LHS.getNode()) | |
7577 | return SDValue(); | |
7578 | } | |
970d7e83 | 7579 | |
1a4d82fc JJ |
7580 | return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0), |
7581 | N->getOperand(0), LHS, RHS); | |
7582 | } | |
970d7e83 | 7583 | |
1a4d82fc JJ |
7584 | static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) { |
7585 | MVT ElemTy = N->getSimpleValueType(0).getScalarType(); | |
7586 | unsigned ElemBits = ElemTy.getSizeInBits(); | |
7587 | ||
7588 | int64_t ShiftAmount; | |
7589 | if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) { | |
7590 | APInt SplatValue, SplatUndef; | |
7591 | unsigned SplatBitSize; | |
7592 | bool HasAnyUndefs; | |
7593 | if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, | |
7594 | HasAnyUndefs, ElemBits) || | |
7595 | SplatBitSize != ElemBits) | |
7596 | return SDValue(); | |
7597 | ||
7598 | ShiftAmount = SplatValue.getSExtValue(); | |
7599 | } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) { | |
7600 | ShiftAmount = CVN->getSExtValue(); | |
970d7e83 | 7601 | } else |
1a4d82fc | 7602 | return SDValue(); |
970d7e83 | 7603 | |
1a4d82fc JJ |
7604 | unsigned Opcode; |
7605 | bool IsRightShift; | |
7606 | switch (IID) { | |
7607 | default: | |
7608 | llvm_unreachable("Unknown shift intrinsic"); | |
7609 | case Intrinsic::aarch64_neon_sqshl: | |
7610 | Opcode = AArch64ISD::SQSHL_I; | |
7611 | IsRightShift = false; | |
7612 | break; | |
7613 | case Intrinsic::aarch64_neon_uqshl: | |
7614 | Opcode = AArch64ISD::UQSHL_I; | |
7615 | IsRightShift = false; | |
7616 | break; | |
7617 | case Intrinsic::aarch64_neon_srshl: | |
7618 | Opcode = AArch64ISD::SRSHR_I; | |
7619 | IsRightShift = true; | |
7620 | break; | |
7621 | case Intrinsic::aarch64_neon_urshl: | |
7622 | Opcode = AArch64ISD::URSHR_I; | |
7623 | IsRightShift = true; | |
7624 | break; | |
7625 | case Intrinsic::aarch64_neon_sqshlu: | |
7626 | Opcode = AArch64ISD::SQSHLU_I; | |
7627 | IsRightShift = false; | |
7628 | break; | |
970d7e83 LB |
7629 | } |
7630 | ||
1a4d82fc JJ |
7631 | if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) |
7632 | return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), | |
7633 | DAG.getConstant(-ShiftAmount, MVT::i32)); | |
7634 | else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) | |
7635 | return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1), | |
7636 | DAG.getConstant(ShiftAmount, MVT::i32)); | |
970d7e83 | 7637 | |
1a4d82fc | 7638 | return SDValue(); |
970d7e83 LB |
7639 | } |
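
// Illustrative sketch (values assumed): the srshl/urshl intrinsics express
// a rounding right shift as a shift by a negative amount, so a urshl of a
// v4i32 input by a constant splat of -4 becomes the immediate node
// URSHR_I, selectable as
//
//   urshr v0.4s, v0.4s, #4
//
// and a left shift with 0 <= amount < element bits maps onto the matching
// *SHL_I immediate node.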
7640 | ||
1a4d82fc JJ |
7641 | // The CRC32[BH] instructions ignore the high bits of their data operand. Since |
7642 | // the intrinsics must be legal and take an i32, this means there's almost | |
7643 | // certainly going to be a zext in the DAG which we can eliminate. | |
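//
// For example (a sketch with assumed IR), front ends typically emit
//
//   %b = and i32 %x, 255
//   %r = call i32 @llvm.aarch64.crc32b(i32 %crc, i32 %b)
//
// and because crc32b only reads the low 8 bits of its data operand, the AND
// can be dropped and %x fed to the intrinsic directly.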
7644 | static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) { | |
7645 | SDValue AndN = N->getOperand(2); | |
7646 | if (AndN.getOpcode() != ISD::AND) | |
7647 | return SDValue(); | |
970d7e83 | 7648 | |
1a4d82fc JJ |
7649 | ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1)); |
7650 | if (!CMask || CMask->getZExtValue() != Mask) | |
7651 | return SDValue(); | |
970d7e83 | 7652 | |
1a4d82fc JJ |
7653 | return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32, |
7654 | N->getOperand(0), N->getOperand(1), AndN.getOperand(0)); | |
970d7e83 LB |
7655 | } |
7656 | ||
1a4d82fc JJ |
7657 | static SDValue performIntrinsicCombine(SDNode *N, |
7658 | TargetLowering::DAGCombinerInfo &DCI, | |
7659 | const AArch64Subtarget *Subtarget) { | |
7660 | SelectionDAG &DAG = DCI.DAG; | |
7661 | unsigned IID = getIntrinsicID(N); | |
7662 | switch (IID) { | |
7663 | default: | |
7664 | break; | |
7665 | case Intrinsic::aarch64_neon_vcvtfxs2fp: | |
7666 | case Intrinsic::aarch64_neon_vcvtfxu2fp: | |
7667 | return tryCombineFixedPointConvert(N, DCI, DAG); | |
7669 | case Intrinsic::aarch64_neon_fmax: | |
7670 | return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0), | |
7671 | N->getOperand(1), N->getOperand(2)); | |
7672 | case Intrinsic::aarch64_neon_fmin: | |
7673 | return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0), | |
7674 | N->getOperand(1), N->getOperand(2)); | |
7675 | case Intrinsic::aarch64_neon_smull: | |
7676 | case Intrinsic::aarch64_neon_umull: | |
7677 | case Intrinsic::aarch64_neon_pmull: | |
7678 | case Intrinsic::aarch64_neon_sqdmull: | |
7679 | return tryCombineLongOpWithDup(IID, N, DCI, DAG); | |
7680 | case Intrinsic::aarch64_neon_sqshl: | |
7681 | case Intrinsic::aarch64_neon_uqshl: | |
7682 | case Intrinsic::aarch64_neon_sqshlu: | |
7683 | case Intrinsic::aarch64_neon_srshl: | |
7684 | case Intrinsic::aarch64_neon_urshl: | |
7685 | return tryCombineShiftImm(IID, N, DAG); | |
7686 | case Intrinsic::aarch64_crc32b: | |
7687 | case Intrinsic::aarch64_crc32cb: | |
7688 | return tryCombineCRC32(0xff, N, DAG); | |
7689 | case Intrinsic::aarch64_crc32h: | |
7690 | case Intrinsic::aarch64_crc32ch: | |
7691 | return tryCombineCRC32(0xffff, N, DAG); | |
7692 | } | |
7693 | return SDValue(); | |
7694 | } | |
970d7e83 | 7695 | |
1a4d82fc JJ |
7696 | static SDValue performExtendCombine(SDNode *N, |
7697 | TargetLowering::DAGCombinerInfo &DCI, | |
7698 | SelectionDAG &DAG) { | |
7699 | // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then | |
7700 | // we can convert that DUP into another extract_high (of a bigger DUP), which | |
7701 | // helps the backend to decide that an sabdl2 would be useful, saving a real | |
7702 | // extract_high operation. | |
7703 | if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND && | |
7704 | N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) { | |
7705 | SDNode *ABDNode = N->getOperand(0).getNode(); | |
7706 | unsigned IID = getIntrinsicID(ABDNode); | |
7707 | if (IID == Intrinsic::aarch64_neon_sabd || | |
7708 | IID == Intrinsic::aarch64_neon_uabd) { | |
7709 | SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG); | |
7710 | if (!NewABD.getNode()) | |
7711 | return SDValue(); | |
7712 | ||
7713 | return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), | |
7714 | NewABD); | |
970d7e83 LB |
7715 | } |
7716 | } | |
7717 | ||
1a4d82fc JJ |
7718 | // This is effectively a custom type legalization for AArch64. |
7719 | // | |
7720 | // Type legalization will split an extend of a small, legal, type to a larger | |
7721 | // illegal type by first splitting the destination type, often creating | |
7722 | // illegal source types, which then get legalized in isel-confusing ways, | |
7723 | // leading to really terrible codegen. E.g., | |
7724 | // %result = v8i32 sext v8i8 %value | |
7725 | // becomes | |
7726 | // %losrc = extract_subreg %value, ... | |
7727 | // %hisrc = extract_subreg %value, ... | |
7728 | // %lo = v4i32 sext v4i8 %losrc | |
7729 | // %hi = v4i32 sext v4i8 %hisrc | |
7730 | // Things go rapidly downhill from there. | |
7731 | // | |
7732 | // For AArch64, the [sz]ext vector instructions can only go up one element | |
7733 | // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32 | |
7734 | // take two instructions. | |
7735 | // | |
7736 | // This implies that the most efficient way to do the extend from v8i8 | |
7737 | // to two v4i32 values is to first extend the v8i8 to v8i16, then allow | |
7738 | // the normal splitting to happen for the v8i16->v8i32. | |
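//
// A worked sketch of the two-step scheme described above:
//
//   v8i32 sext (v8i8 %value)
//     step 1: %wide = v8i16 sext %value          (a single sshll)
//     step 2: split %wide and extend each v4i16 half to v4i32
//
// yielding an sshll followed by an sshll/sshll2 pair instead of the
// scalarized code the default legalization would produce.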
970d7e83 | 7739 | |
1a4d82fc JJ |
7740 | // This is pre-legalization to catch some cases where the default |
7741 | // type legalization will create ill-tempered code. | |
7742 | if (!DCI.isBeforeLegalizeOps()) | |
7743 | return SDValue(); | |
970d7e83 | 7744 | |
1a4d82fc JJ |
7745 | // We're only interested in cleaning things up for non-legal vector types |
7746 | // here. If both the source and destination are legal, things will just | |
7747 | // work naturally without any fiddling. | |
7748 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |
7749 | EVT ResVT = N->getValueType(0); | |
7750 | if (!ResVT.isVector() || TLI.isTypeLegal(ResVT)) | |
7751 | return SDValue(); | |
7752 | // If the vector type isn't a simple VT, it's beyond the scope of what | |
7753 | // we're worried about here. Let legalization do its thing and hope for | |
7754 | // the best. | |
7755 | SDValue Src = N->getOperand(0); | |
7756 | EVT SrcVT = Src->getValueType(0); | |
7757 | if (!ResVT.isSimple() || !SrcVT.isSimple()) | |
7758 | return SDValue(); | |
970d7e83 | 7759 | |
1a4d82fc JJ |
7760 | // If the source VT is a 64-bit vector, we can play games and get the |
7761 | // better results we want. | |
7762 | if (SrcVT.getSizeInBits() != 64) | |
7763 | return SDValue(); | |
970d7e83 | 7764 | |
1a4d82fc JJ |
7765 | unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits(); |
7766 | unsigned ElementCount = SrcVT.getVectorNumElements(); | |
7767 | SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount); | |
7768 | SDLoc DL(N); | |
7769 | Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src); | |
7770 | ||
7771 | // Now split the rest of the operation into two halves, each with a 64 | |
7772 | // bit source. | |
7773 | EVT LoVT, HiVT; | |
7774 | SDValue Lo, Hi; | |
7775 | unsigned NumElements = ResVT.getVectorNumElements(); | |
7776 | assert(!(NumElements & 1) && "Splitting vector, but not in half!"); | |
7777 | LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(), | |
7778 | ResVT.getVectorElementType(), NumElements / 2); | |
7779 | ||
7780 | EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(), | |
7781 | LoVT.getVectorNumElements()); | |
7782 | Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, | |
85aaf69f | 7783 | DAG.getConstant(0, MVT::i64)); |
1a4d82fc | 7784 | Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src, |
85aaf69f | 7785 | DAG.getConstant(InNVT.getVectorNumElements(), MVT::i64)); |
1a4d82fc JJ |
7786 | Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo); |
7787 | Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi); | |
7788 | ||
7789 | // Now combine the parts back together so we still have a single result | |
7790 | // like the combiner expects. | |
7791 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); | |
970d7e83 LB |
7792 | } |
7793 | ||
1a4d82fc JJ |
7794 | /// Replace a vector store of a splatted scalar by scalar stores of the scalar | |
7795 | /// value. The load store optimizer pass will merge them to store pair stores. | |
7796 | /// This has better performance than a splat of the scalar followed by a split | |
7797 | /// vector store. Even if the stores are not merged it is four stores vs a dup, | |
7798 | /// followed by an ext.b and two stores. | |
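//
// Illustrative sketch (registers assumed): for a store of (v4i32 splat w1)
// to [x0], instead of
//
//   dup v0.4s, w1
//   str q0, [x0]
//
// we emit four scalar stores, which the load/store optimizer can pair:
//
//   stp w1, w1, [x0]
//   stp w1, w1, [x0, #8]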
7799 | static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) { | |
7800 | SDValue StVal = St->getValue(); | |
7801 | EVT VT = StVal.getValueType(); | |
7802 | ||
7803 | // Don't replace floating point stores, they possibly won't be transformed to | |
7804 | // stp because of the store pair suppress pass. | |
7805 | if (VT.isFloatingPoint()) | |
7806 | return SDValue(); | |
970d7e83 | 7807 | |
1a4d82fc JJ |
7808 | // Check for insert vector elements. |
7809 | if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT) | |
7810 | return SDValue(); | |
970d7e83 | 7811 | |
1a4d82fc JJ |
7812 | // We can express a splat as store pair(s) for 2 or 4 elements. |
7813 | unsigned NumVecElts = VT.getVectorNumElements(); | |
7814 | if (NumVecElts != 4 && NumVecElts != 2) | |
7815 | return SDValue(); | |
7816 | SDValue SplatVal = StVal.getOperand(1); | |
7817 | unsigned RemainInsertElts = NumVecElts - 1; | |
7818 | ||
7819 | // Check that this is a splat. | |
7820 | while (--RemainInsertElts) { | |
7821 | SDValue NextInsertElt = StVal.getOperand(0); | |
7822 | if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT) | |
7823 | return SDValue(); | |
7824 | if (NextInsertElt.getOperand(1) != SplatVal) | |
7825 | return SDValue(); | |
7826 | StVal = NextInsertElt; | |
970d7e83 | 7827 | } |
1a4d82fc JJ |
7828 | unsigned OrigAlignment = St->getAlignment(); |
7829 | unsigned EltOffset = NumVecElts == 4 ? 4 : 8; | |
7830 | unsigned Alignment = std::min(OrigAlignment, EltOffset); | |
7831 | ||
7832 | // Create scalar stores. This is at least as good as the code sequence for a | |
7833 | // split unaligned store which is a dup.s, ext.b, and two stores. | |
7834 | // Most of the time the three stores should be replaced by store pair | |
7835 | // instructions (stp). | |
7836 | SDLoc DL(St); | |
7837 | SDValue BasePtr = St->getBasePtr(); | |
7838 | SDValue NewST1 = | |
7839 | DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(), | |
7840 | St->isVolatile(), St->isNonTemporal(), St->getAlignment()); | |
7841 | ||
7842 | unsigned Offset = EltOffset; | |
7843 | while (--NumVecElts) { | |
7844 | SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, | |
7845 | DAG.getConstant(Offset, MVT::i64)); | |
7846 | NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, | |
7847 | St->getPointerInfo(), St->isVolatile(), | |
7848 | St->isNonTemporal(), Alignment); | |
7849 | Offset += EltOffset; | |
7850 | } | |
7851 | return NewST1; | |
7852 | } | |
970d7e83 | 7853 | |
1a4d82fc JJ |
7854 | static SDValue performSTORECombine(SDNode *N, |
7855 | TargetLowering::DAGCombinerInfo &DCI, | |
7856 | SelectionDAG &DAG, | |
7857 | const AArch64Subtarget *Subtarget) { | |
7858 | if (!DCI.isBeforeLegalize()) | |
7859 | return SDValue(); | |
970d7e83 | 7860 | |
1a4d82fc JJ |
7861 | StoreSDNode *S = cast<StoreSDNode>(N); |
7862 | if (S->isVolatile()) | |
7863 | return SDValue(); | |
970d7e83 | 7864 | |
1a4d82fc JJ |
7865 | // Cyclone has bad performance on unaligned 16B stores when crossing line and |
7866 | // page boundaries. We want to split such stores. | |
7867 | if (!Subtarget->isCyclone()) | |
7868 | return SDValue(); | |
970d7e83 | 7869 | |
1a4d82fc JJ |
7870 | // Don't split at Oz. |
7871 | MachineFunction &MF = DAG.getMachineFunction(); | |
7872 | bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute( | |
7873 | AttributeSet::FunctionIndex, Attribute::MinSize); | |
7874 | if (IsMinSize) | |
7875 | return SDValue(); | |
970d7e83 | 7876 | |
1a4d82fc JJ |
7877 | SDValue StVal = S->getValue(); |
7878 | EVT VT = StVal.getValueType(); | |
970d7e83 | 7879 | |
1a4d82fc JJ |
7880 | // Don't split v2i64 vectors. Memcpy lowering produces those and splitting |
7881 | // those up regresses performance on micro-benchmarks and olden/bh. | |
7882 | if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64) | |
7883 | return SDValue(); | |
970d7e83 | 7884 | |
1a4d82fc JJ |
7885 | // Split unaligned 16B stores. They are terrible for performance. |
7886 | // Don't split stores with alignment of 1 or 2. Code that uses clang vector | |
7887 | // extensions can use this to mark that it does not want splitting to happen | |
7888 | // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of | |
7889 | // eliminating alignment hazards is only 1 in 8 for alignment of 2. | |
7890 | if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 || | |
7891 | S->getAlignment() <= 2) | |
7892 | return SDValue(); | |
970d7e83 | 7893 | |
1a4d82fc JJ |
7894 | // If we get a splat of a scalar convert this vector store to a store of |
7895 | // scalars. They will be merged into store pairs thereby removing two | |
7896 | // instructions. | |
7897 | SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S); | |
7898 | if (ReplacedSplat != SDValue()) | |
7899 | return ReplacedSplat; | |
7900 | ||
7901 | SDLoc DL(S); | |
7902 | unsigned NumElts = VT.getVectorNumElements() / 2; | |
7903 | // Split VT into two. | |
7904 | EVT HalfVT = | |
7905 | EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts); | |
7906 | SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, | |
85aaf69f | 7907 | DAG.getConstant(0, MVT::i64)); |
1a4d82fc | 7908 | SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal, |
85aaf69f | 7909 | DAG.getConstant(NumElts, MVT::i64)); |
1a4d82fc JJ |
7910 | SDValue BasePtr = S->getBasePtr(); |
7911 | SDValue NewST1 = | |
7912 | DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(), | |
7913 | S->isVolatile(), S->isNonTemporal(), S->getAlignment()); | |
7914 | SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, | |
7915 | DAG.getConstant(8, MVT::i64)); | |
7916 | return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr, | |
7917 | S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(), | |
7918 | S->getAlignment()); | |
970d7e83 LB |
7919 | } |
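
// Net effect, sketched with assumed registers: an unaligned 16B
//
//   str q0, [x0]
//
// becomes two 8B stores of the low and high halves, e.g.
//
//   str d0, [x0]
//   ext v1.16b, v0.16b, v0.16b, #8
//   str d1, [x0, #8]
//
// which Cyclone handles better when the single 16B store would have
// crossed a cache-line or page boundary.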
7920 | ||
1a4d82fc JJ |
7921 | /// Target-specific DAG combine function for post-increment LD1 (lane) and |
7922 | /// post-increment LD1R. | |
7923 | static SDValue performPostLD1Combine(SDNode *N, | |
7924 | TargetLowering::DAGCombinerInfo &DCI, | |
7925 | bool IsLaneOp) { | |
7926 | if (DCI.isBeforeLegalizeOps()) | |
7927 | return SDValue(); | |
970d7e83 | 7928 | |
1a4d82fc JJ |
7929 | SelectionDAG &DAG = DCI.DAG; |
7930 | EVT VT = N->getValueType(0); | |
970d7e83 | 7931 | |
1a4d82fc JJ |
7932 | unsigned LoadIdx = IsLaneOp ? 1 : 0; |
7933 | SDNode *LD = N->getOperand(LoadIdx).getNode(); | |
7934 | // If it is not a LOAD, we cannot do this combine. | |
7935 | if (LD->getOpcode() != ISD::LOAD) | |
7936 | return SDValue(); | |
970d7e83 | 7937 | |
1a4d82fc JJ |
7938 | LoadSDNode *LoadSDN = cast<LoadSDNode>(LD); |
7939 | EVT MemVT = LoadSDN->getMemoryVT(); | |
7940 | // Check if memory operand is the same type as the vector element. | |
7941 | if (MemVT != VT.getVectorElementType()) | |
7942 | return SDValue(); | |
970d7e83 | 7943 | |
1a4d82fc JJ |
7944 | // Check if there are other uses. If so, do not combine as it will introduce |
7945 | // an extra load. | |
7946 | for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE; | |
7947 | ++UI) { | |
7948 | if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result. | |
7949 | continue; | |
7950 | if (*UI != N) | |
7951 | return SDValue(); | |
7952 | } | |
970d7e83 | 7953 | |
1a4d82fc JJ |
7954 | SDValue Addr = LD->getOperand(1); |
7955 | SDValue Vector = N->getOperand(0); | |
7956 | // Search for a use of the address operand that is an increment. | |
7957 | for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE = | |
7958 | Addr.getNode()->use_end(); UI != UE; ++UI) { | |
7959 | SDNode *User = *UI; | |
7960 | if (User->getOpcode() != ISD::ADD | |
7961 | || UI.getUse().getResNo() != Addr.getResNo()) | |
7962 | continue; | |
970d7e83 | 7963 | |
1a4d82fc JJ |
7964 | // Check that the add is independent of the load. Otherwise, folding it |
7965 | // would create a cycle. | |
7966 | if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User)) | |
7967 | continue; | |
7968 | // Also check that add is not used in the vector operand. This would also | |
7969 | // create a cycle. | |
7970 | if (User->isPredecessorOf(Vector.getNode())) | |
7971 | continue; | |
970d7e83 | 7972 | |
1a4d82fc JJ |
7973 | // If the increment is a constant, it must match the memory ref size. |
7974 | SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); | |
7975 | if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { | |
7976 | uint32_t IncVal = CInc->getZExtValue(); | |
7977 | unsigned NumBytes = VT.getScalarSizeInBits() / 8; | |
7978 | if (IncVal != NumBytes) | |
7979 | continue; | |
7980 | Inc = DAG.getRegister(AArch64::XZR, MVT::i64); | |
7981 | } | |
970d7e83 | 7982 | |
1a4d82fc JJ |
7983 | SmallVector<SDValue, 8> Ops; |
7984 | Ops.push_back(LD->getOperand(0)); // Chain | |
7985 | if (IsLaneOp) { | |
7986 | Ops.push_back(Vector); // The vector to be inserted | |
7987 | Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector | |
7988 | } | |
7989 | Ops.push_back(Addr); | |
7990 | Ops.push_back(Inc); | |
7991 | ||
7992 | EVT Tys[3] = { VT, MVT::i64, MVT::Other }; | |
7993 | SDVTList SDTys = DAG.getVTList(Tys); | |
7994 | unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost; | |
7995 | SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops, | |
7996 | MemVT, | |
7997 | LoadSDN->getMemOperand()); | |
7998 | ||
7999 | // Update the uses. | |
8000 | std::vector<SDValue> NewResults; | |
8001 | NewResults.push_back(SDValue(LD, 0)); // The result of load | |
8002 | NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain | |
8003 | DCI.CombineTo(LD, NewResults); | |
8004 | DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result | |
8005 | DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register | |
970d7e83 | 8006 | |
1a4d82fc | 8007 | break; |
970d7e83 | 8008 | } |
1a4d82fc JJ |
8009 | return SDValue(); |
8010 | } | |
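
// Illustrative result (assembly assumed): a scalar load feeding a DUP,
// with a separate "add %ptr, 4" bumping the address, is rewritten to the
// write-back node LD1DUPpost, selectable as
//
//   ld1r { v0.4s }, [x0], #4
//
// XZR is used as the increment operand to mark the fixed post-increment
// form when the constant step matches the transfer size.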
970d7e83 | 8011 | |
1a4d82fc JJ |
8012 | /// Target-specific DAG combine function for NEON load/store intrinsics |
8013 | /// to merge base address updates. | |
8014 | static SDValue performNEONPostLDSTCombine(SDNode *N, | |
8015 | TargetLowering::DAGCombinerInfo &DCI, | |
8016 | SelectionDAG &DAG) { | |
8017 | if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) | |
8018 | return SDValue(); | |
970d7e83 | 8019 | |
1a4d82fc JJ |
8020 | unsigned AddrOpIdx = N->getNumOperands() - 1; |
8021 | SDValue Addr = N->getOperand(AddrOpIdx); | |
8022 | ||
8023 | // Search for a use of the address operand that is an increment. | |
8024 | for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), | |
8025 | UE = Addr.getNode()->use_end(); UI != UE; ++UI) { | |
8026 | SDNode *User = *UI; | |
8027 | if (User->getOpcode() != ISD::ADD || | |
8028 | UI.getUse().getResNo() != Addr.getResNo()) | |
8029 | continue; | |
8030 | ||
8031 | // Check that the add is independent of the load/store. Otherwise, folding | |
8032 | // it would create a cycle. | |
8033 | if (User->isPredecessorOf(N) || N->isPredecessorOf(User)) | |
8034 | continue; | |
8035 | ||
8036 | // Find the new opcode for the updating load/store. | |
8037 | bool IsStore = false; | |
8038 | bool IsLaneOp = false; | |
8039 | bool IsDupOp = false; | |
8040 | unsigned NewOpc = 0; | |
8041 | unsigned NumVecs = 0; | |
8042 | unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); | |
8043 | switch (IntNo) { | |
8044 | default: llvm_unreachable("unexpected intrinsic for Neon base update"); | |
8045 | case Intrinsic::aarch64_neon_ld2: NewOpc = AArch64ISD::LD2post; | |
8046 | NumVecs = 2; break; | |
8047 | case Intrinsic::aarch64_neon_ld3: NewOpc = AArch64ISD::LD3post; | |
8048 | NumVecs = 3; break; | |
8049 | case Intrinsic::aarch64_neon_ld4: NewOpc = AArch64ISD::LD4post; | |
8050 | NumVecs = 4; break; | |
8051 | case Intrinsic::aarch64_neon_st2: NewOpc = AArch64ISD::ST2post; | |
8052 | NumVecs = 2; IsStore = true; break; | |
8053 | case Intrinsic::aarch64_neon_st3: NewOpc = AArch64ISD::ST3post; | |
8054 | NumVecs = 3; IsStore = true; break; | |
8055 | case Intrinsic::aarch64_neon_st4: NewOpc = AArch64ISD::ST4post; | |
8056 | NumVecs = 4; IsStore = true; break; | |
8057 | case Intrinsic::aarch64_neon_ld1x2: NewOpc = AArch64ISD::LD1x2post; | |
8058 | NumVecs = 2; break; | |
8059 | case Intrinsic::aarch64_neon_ld1x3: NewOpc = AArch64ISD::LD1x3post; | |
8060 | NumVecs = 3; break; | |
8061 | case Intrinsic::aarch64_neon_ld1x4: NewOpc = AArch64ISD::LD1x4post; | |
8062 | NumVecs = 4; break; | |
8063 | case Intrinsic::aarch64_neon_st1x2: NewOpc = AArch64ISD::ST1x2post; | |
8064 | NumVecs = 2; IsStore = true; break; | |
8065 | case Intrinsic::aarch64_neon_st1x3: NewOpc = AArch64ISD::ST1x3post; | |
8066 | NumVecs = 3; IsStore = true; break; | |
8067 | case Intrinsic::aarch64_neon_st1x4: NewOpc = AArch64ISD::ST1x4post; | |
8068 | NumVecs = 4; IsStore = true; break; | |
8069 | case Intrinsic::aarch64_neon_ld2r: NewOpc = AArch64ISD::LD2DUPpost; | |
8070 | NumVecs = 2; IsDupOp = true; break; | |
8071 | case Intrinsic::aarch64_neon_ld3r: NewOpc = AArch64ISD::LD3DUPpost; | |
8072 | NumVecs = 3; IsDupOp = true; break; | |
8073 | case Intrinsic::aarch64_neon_ld4r: NewOpc = AArch64ISD::LD4DUPpost; | |
8074 | NumVecs = 4; IsDupOp = true; break; | |
8075 | case Intrinsic::aarch64_neon_ld2lane: NewOpc = AArch64ISD::LD2LANEpost; | |
8076 | NumVecs = 2; IsLaneOp = true; break; | |
8077 | case Intrinsic::aarch64_neon_ld3lane: NewOpc = AArch64ISD::LD3LANEpost; | |
8078 | NumVecs = 3; IsLaneOp = true; break; | |
8079 | case Intrinsic::aarch64_neon_ld4lane: NewOpc = AArch64ISD::LD4LANEpost; | |
8080 | NumVecs = 4; IsLaneOp = true; break; | |
8081 | case Intrinsic::aarch64_neon_st2lane: NewOpc = AArch64ISD::ST2LANEpost; | |
8082 | NumVecs = 2; IsStore = true; IsLaneOp = true; break; | |
8083 | case Intrinsic::aarch64_neon_st3lane: NewOpc = AArch64ISD::ST3LANEpost; | |
8084 | NumVecs = 3; IsStore = true; IsLaneOp = true; break; | |
8085 | case Intrinsic::aarch64_neon_st4lane: NewOpc = AArch64ISD::ST4LANEpost; | |
8086 | NumVecs = 4; IsStore = true; IsLaneOp = true; break; | |
8087 | } | |
8088 | ||
8089 | EVT VecTy; | |
8090 | if (IsStore) | |
8091 | VecTy = N->getOperand(2).getValueType(); | |
8092 | else | |
8093 | VecTy = N->getValueType(0); | |
8094 | ||
8095 | // If the increment is a constant, it must match the memory ref size. | |
8096 | SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0); | |
8097 | if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) { | |
8098 | uint32_t IncVal = CInc->getZExtValue(); | |
8099 | unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8; | |
8100 | if (IsLaneOp || IsDupOp) | |
8101 | NumBytes /= VecTy.getVectorNumElements(); | |
8102 | if (IncVal != NumBytes) | |
8103 | continue; | |
8104 | Inc = DAG.getRegister(AArch64::XZR, MVT::i64); | |
8105 | } | |
8106 | SmallVector<SDValue, 8> Ops; | |
8107 | Ops.push_back(N->getOperand(0)); // Incoming chain | |
8108 | // Load lane and store have vector list as input. | |
8109 | if (IsLaneOp || IsStore) | |
8110 | for (unsigned i = 2; i < AddrOpIdx; ++i) | |
8111 | Ops.push_back(N->getOperand(i)); | |
8112 | Ops.push_back(Addr); // Base register | |
8113 | Ops.push_back(Inc); | |
8114 | ||
8115 | // Return Types. | |
8116 | EVT Tys[6]; | |
8117 | unsigned NumResultVecs = (IsStore ? 0 : NumVecs); | |
8118 | unsigned n; | |
8119 | for (n = 0; n < NumResultVecs; ++n) | |
8120 | Tys[n] = VecTy; | |
8121 | Tys[n++] = MVT::i64; // Type of write back register | |
8122 | Tys[n] = MVT::Other; // Type of the chain | |
8123 | SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2)); | |
8124 | ||
8125 | MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N); | |
8126 | SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops, | |
8127 | MemInt->getMemoryVT(), | |
8128 | MemInt->getMemOperand()); | |
8129 | ||
8130 | // Update the uses. | |
8131 | std::vector<SDValue> NewResults; | |
8132 | for (unsigned i = 0; i < NumResultVecs; ++i) { | |
8133 | NewResults.push_back(SDValue(UpdN.getNode(), i)); | |
8134 | } | |
8135 | NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); | |
8136 | DCI.CombineTo(N, NewResults); | |
8137 | DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs)); | |
970d7e83 | 8138 | |
1a4d82fc JJ |
8139 | break; |
8140 | } | |
8141 | return SDValue(); | |
970d7e83 LB |
8142 | } |
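
// For example (a sketch; exact intrinsic mangling assumed): a structure
// load followed by a matching pointer bump,
//
//   %vld = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0i8(i8* %p)
//   %p.next = getelementptr i8, i8* %p, i64 32
//
// is replaced by the write-back node LD2post, selectable as
//
//   ld2 { v0.4s, v1.4s }, [x0], #32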
8143 | ||
1a4d82fc JJ |
8144 | // Checks to see if the value is the prescribed width and returns information |
8145 | // about its extension mode. | |
8146 | static | |
8147 | bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) { | |
8148 | ExtType = ISD::NON_EXTLOAD; | |
8149 | switch(V.getNode()->getOpcode()) { | |
8150 | default: | |
8151 | return false; | |
8152 | case ISD::LOAD: { | |
8153 | LoadSDNode *LoadNode = cast<LoadSDNode>(V.getNode()); | |
8154 | if ((LoadNode->getMemoryVT() == MVT::i8 && width == 8) | |
8155 | || (LoadNode->getMemoryVT() == MVT::i16 && width == 16)) { | |
8156 | ExtType = LoadNode->getExtensionType(); | |
8157 | return true; | |
8158 | } | |
8159 | return false; | |
8160 | } | |
8161 | case ISD::AssertSext: { | |
8162 | VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); | |
8163 | if ((TypeNode->getVT() == MVT::i8 && width == 8) | |
8164 | || (TypeNode->getVT() == MVT::i16 && width == 16)) { | |
8165 | ExtType = ISD::SEXTLOAD; | |
8166 | return true; | |
8167 | } | |
8168 | return false; | |
8169 | } | |
8170 | case ISD::AssertZext: { | |
8171 | VTSDNode *TypeNode = cast<VTSDNode>(V.getNode()->getOperand(1)); | |
8172 | if ((TypeNode->getVT() == MVT::i8 && width == 8) | |
8173 | || (TypeNode->getVT() == MVT::i16 && width == 16)) { | |
8174 | ExtType = ISD::ZEXTLOAD; | |
8175 | return true; | |
8176 | } | |
8177 | return false; | |
8178 | } | |
8179 | case ISD::Constant: | |
8180 | case ISD::TargetConstant: { | |
8181 | if (std::abs(cast<ConstantSDNode>(V.getNode())->getSExtValue()) < | |
8182 | 1LL << (width - 1)) | |
8183 | return true; | |
8184 | return false; | |
8185 | } | |
970d7e83 LB |
8186 | } |
8187 | ||
1a4d82fc | 8188 | return true; |
970d7e83 LB |
8189 | } |
8190 | ||
1a4d82fc JJ |
8191 | // This function does a whole lot of voodoo to determine if the tests are |
8192 | // equivalent without and with a mask. Essentially what happens is that given a | |
8193 | // DAG resembling: | |
8194 | // | |
8195 | // +-------------+ +-------------+ +-------------+ +-------------+ | |
8196 | // | Input | | AddConstant | | CompConstant| | CC | | |
8197 | // +-------------+ +-------------+ +-------------+ +-------------+ | |
8198 | // | | | | | |
8199 | // V V | +----------+ | |
8200 | // +-------------+ +----+ | | | |
8201 | // | ADD | |0xff| | | | |
8202 | // +-------------+ +----+ | | | |
8203 | // | | | | | |
8204 | // V V | | | |
8205 | // +-------------+ | | | |
8206 | // | AND | | | | |
8207 | // +-------------+ | | | |
8208 | // | | | | |
8209 | // +-----+ | | | |
8210 | // | | | | |
8211 | // V V V | |
8212 | // +-------------+ | |
8213 | // | CMP | | |
8214 | // +-------------+ | |
8215 | // | |
8216 | // The AND node may be safely removed for some combinations of inputs. In | |
8217 | // particular we need to take into account the extension type of the Input, | |
8218 | // the exact values of AddConstant, CompConstant, and CC, along with the nominal | |
8219 | // width of the input (this can work for any width inputs, the above graph is | |
8220 | // specific to 8 bits). | |
8221 | // | |
8222 | // The specific equations were worked out by generating output tables for each | |
8223 | // AArch64CC value in terms of an AddConstant (w1) and a CompConstant (w2). The | |
8224 | // problem was simplified by working with 4 bit inputs, which means we only | |
8225 | // needed to reason about 24 distinct bit patterns: 8 patterns unique to zero | |
8226 | // extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8 | |
8227 | // patterns present in both extensions (0,7). For every distinct set of | |
8228 | // AddConstant and CompConstant bit patterns we can consider the masked and | |
8229 | // unmasked versions to be equivalent if the result of this function is true | |
8230 | // for all 16 distinct bit patterns for the current extension type of Input (w0). | |
8231 | // | |
8232 | // sub w8, w0, w1 | |
8233 | // and w10, w8, #0x0f | |
8234 | // cmp w8, w2 | |
8235 | // cset w9, AArch64CC | |
8236 | // cmp w10, w2 | |
8237 | // cset w11, AArch64CC | |
8238 | // cmp w9, w11 | |
8239 | // cset w0, eq | |
8240 | // ret | |
8241 | // | |
8242 | // Since the above function shows when the outputs are equivalent it defines | |
8243 | // when it is safe to remove the AND. Unfortunately it only runs on AArch64 and | |
8244 | // would be expensive to run during compiles. The equations below were written | |
8245 | // in a test harness that confirmed they gave outputs equivalent to the above | |
8246 | // function for all inputs, so they can be used to determine if the removal is | |
8247 | // legal instead. | |
8248 | // | |
8249 | // isEquivalentMaskless() is the code for testing if the AND can be removed, | |
8250 | // factored out of the DAG recognition as the DAG can take several forms. | |
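//
// A minimal host-side sketch of such a harness (hypothetical, not part of
// LLVM; 4-bit inputs, zero extension, EQ only):
//
//   bool eqRemovable(int w1, int w2) {
//     for (int w0 = 0; w0 < 16; ++w0) {       // every 4-bit input
//       int sum = w0 - w1;                    // the SUBS result
//       bool masked = ((sum & 0x0f) == w2);   // compare with the AND
//       bool unmasked = (sum == w2);          // compare without it
//       if (masked != unmasked)
//         return false;                       // the AND is not removable
//     }
//     return true;
//   }
//
// isEquivalentMaskless() below encodes closed-form equations validated
// against exactly this kind of exhaustive enumeration.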
8251 | ||
8252 | static | |
8253 | bool isEquivalentMaskless(unsigned CC, unsigned width, | |
8254 | ISD::LoadExtType ExtType, signed AddConstant, | |
8255 | signed CompConstant) { | |
8256 | // By being careful about our equations and writing them only in terms of | |
8257 | // symbolic values and well-known constants (0, 1, -1, MaxUInt) we can | |
8258 | // make them generally applicable to all bit widths. | |
8259 | signed MaxUInt = (1 << width); | |
8260 | ||
8261 | // For the purposes of these comparisons sign extending the type is | |
8262 | // equivalent to zero extending the add and displacing it by half the integer | |
8263 | // width. Provided we are careful and make sure our equations are valid over | |
8264 | // the whole range we can just adjust the input and avoid writing equations | |
8265 | // for sign extended inputs. | |
8266 | if (ExtType == ISD::SEXTLOAD) | |
8267 | AddConstant -= (1 << (width-1)); | |
970d7e83 | 8268 | |
1a4d82fc JJ |
8269 | switch(CC) { |
8270 | case AArch64CC::LE: | |
8271 | case AArch64CC::GT: { | |
8272 | if ((AddConstant == 0) || | |
8273 | (CompConstant == MaxUInt - 1 && AddConstant < 0) || | |
8274 | (AddConstant >= 0 && CompConstant < 0) || | |
8275 | (AddConstant <= 0 && CompConstant <= 0 && CompConstant < AddConstant)) | |
8276 | return true; | |
8277 | } break; | |
8278 | case AArch64CC::LT: | |
8279 | case AArch64CC::GE: { | |
8280 | if ((AddConstant == 0) || | |
8281 | (AddConstant >= 0 && CompConstant <= 0) || | |
8282 | (AddConstant <= 0 && CompConstant <= 0 && CompConstant <= AddConstant)) | |
8283 | return true; | |
8284 | } break; | |
8285 | case AArch64CC::HI: | |
8286 | case AArch64CC::LS: { | |
8287 | if ((AddConstant >= 0 && CompConstant < 0) || | |
8288 | (AddConstant <= 0 && CompConstant >= -1 && | |
8289 | CompConstant < AddConstant + MaxUInt)) | |
8290 | return true; | |
8291 | } break; | |
8292 | case AArch64CC::PL: | |
8293 | case AArch64CC::MI: { | |
8294 | if ((AddConstant == 0) || | |
8295 | (AddConstant > 0 && CompConstant <= 0) || | |
8296 | (AddConstant < 0 && CompConstant <= AddConstant)) | |
8297 | return true; | |
8298 | } break; | |
8299 | case AArch64CC::LO: | |
8300 | case AArch64CC::HS: { | |
8301 | if ((AddConstant >= 0 && CompConstant <= 0) || | |
8302 | (AddConstant <= 0 && CompConstant >= 0 && | |
8303 | CompConstant <= AddConstant + MaxUInt)) | |
8304 | return true; | |
8305 | } break; | |
8306 | case AArch64CC::EQ: | |
8307 | case AArch64CC::NE: { | |
8308 | if ((AddConstant > 0 && CompConstant < 0) || | |
8309 | (AddConstant < 0 && CompConstant >= 0 && | |
8310 | CompConstant < AddConstant + MaxUInt) || | |
8311 | (AddConstant >= 0 && CompConstant >= 0 && | |
8312 | CompConstant >= AddConstant) || | |
8313 | (AddConstant <= 0 && CompConstant < 0 && CompConstant < AddConstant)) | |
970d7e83 | 8314 | |
1a4d82fc JJ |
8315 | return true; |
8316 | } break; | |
8317 | case AArch64CC::VS: | |
8318 | case AArch64CC::VC: | |
8319 | case AArch64CC::AL: | |
8320 | case AArch64CC::NV: | |
8321 | return true; | |
8322 | case AArch64CC::Invalid: | |
8323 | break; | |
8324 | } | |
970d7e83 | 8325 | |
1a4d82fc JJ |
8326 | return false; |
8327 | } | |
970d7e83 | 8328 | |
1a4d82fc JJ |
8329 | static |
8330 | SDValue performCONDCombine(SDNode *N, | |
8331 | TargetLowering::DAGCombinerInfo &DCI, | |
8332 | SelectionDAG &DAG, unsigned CCIndex, | |
8333 | unsigned CmpIndex) { | |
8334 | unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue(); | |
8335 | SDNode *SubsNode = N->getOperand(CmpIndex).getNode(); | |
8336 | unsigned CondOpcode = SubsNode->getOpcode(); | |
970d7e83 | 8337 | |
1a4d82fc | 8338 | if (CondOpcode != AArch64ISD::SUBS) |
970d7e83 LB |
8339 | return SDValue(); |
8340 | ||
1a4d82fc JJ |
8341 | // There is a SUBS feeding this condition. Is it fed by a mask we can |
8342 | // use? | |
970d7e83 | 8343 | |
1a4d82fc JJ |
8344 | SDNode *AndNode = SubsNode->getOperand(0).getNode(); |
8345 | unsigned MaskBits = 0; | |
970d7e83 | 8346 | |
1a4d82fc | 8347 | if (AndNode->getOpcode() != ISD::AND) |
970d7e83 LB |
8348 | return SDValue(); |
8349 | ||
1a4d82fc JJ |
8350 | if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) { |
8351 | uint32_t CNV = CN->getZExtValue(); | |
8352 | if (CNV == 255) | |
8353 | MaskBits = 8; | |
8354 | else if (CNV == 65535) | |
8355 | MaskBits = 16; | |
8356 | } | |
970d7e83 | 8357 | |
1a4d82fc | 8358 | if (!MaskBits) |
970d7e83 LB |
8359 | return SDValue(); |
8360 | ||
1a4d82fc | 8361 | SDValue AddValue = AndNode->getOperand(0); |
970d7e83 | 8362 | |
1a4d82fc | 8363 | if (AddValue.getOpcode() != ISD::ADD) |
970d7e83 LB |
8364 | return SDValue(); |
8365 | ||
1a4d82fc | 8366 | // The basic dag structure is correct, grab the inputs and validate them. |
970d7e83 | 8367 | |
1a4d82fc JJ |
8368 | SDValue AddInputValue1 = AddValue.getNode()->getOperand(0); |
8369 | SDValue AddInputValue2 = AddValue.getNode()->getOperand(1); | |
8370 | SDValue SubsInputValue = SubsNode->getOperand(1); | |
970d7e83 | 8371 | |
1a4d82fc JJ |
8372 | // The mask is present and the provenance of all the values is a smaller type, |
8373 | // so let's see if the mask is superfluous. | |
970d7e83 | 8374 | |
1a4d82fc JJ |
8375 | if (!isa<ConstantSDNode>(AddInputValue2.getNode()) || |
8376 | !isa<ConstantSDNode>(SubsInputValue.getNode())) | |
970d7e83 LB |
8377 | return SDValue(); |
8378 | ||
1a4d82fc | 8379 | ISD::LoadExtType ExtType; |
970d7e83 | 8380 | |
1a4d82fc JJ |
8381 | if (!checkValueWidth(SubsInputValue, MaskBits, ExtType) || |
8382 | !checkValueWidth(AddInputValue2, MaskBits, ExtType) || | |
8383 | !checkValueWidth(AddInputValue1, MaskBits, ExtType) ) | |
970d7e83 LB |
8384 | return SDValue(); |
8385 | ||
1a4d82fc JJ |
8386 | if (!isEquivalentMaskless(CC, MaskBits, ExtType, | |
8387 | cast<ConstantSDNode>(AddInputValue2.getNode())->getSExtValue(), | |
8388 | cast<ConstantSDNode>(SubsInputValue.getNode())->getSExtValue())) | |
8389 | return SDValue(); | |
970d7e83 | 8390 | |
1a4d82fc | 8391 | // The AND is not necessary, remove it. |
970d7e83 | 8392 | |
1a4d82fc JJ |
8393 | SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0), |
8394 | SubsNode->getValueType(1)); | |
8395 | SDValue Ops[] = { AddValue, SubsNode->getOperand(1) }; | |
970d7e83 | 8396 | |
1a4d82fc JJ |
8397 | SDValue NewValue = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops); |
8398 | DAG.ReplaceAllUsesWith(SubsNode, NewValue.getNode()); | |
970d7e83 | 8399 | |
1a4d82fc | 8400 | return SDValue(N, 0); |
970d7e83 LB |
8401 | } |
8402 | ||
1a4d82fc JJ |
8403 | // Optimize compare with zero and branch. |
8404 | static SDValue performBRCONDCombine(SDNode *N, | |
8405 | TargetLowering::DAGCombinerInfo &DCI, | |
8406 | SelectionDAG &DAG) { | |
8407 | SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3); | |
8408 | if (NV.getNode()) | |
8409 | N = NV.getNode(); | |
8410 | SDValue Chain = N->getOperand(0); | |
8411 | SDValue Dest = N->getOperand(1); | |
8412 | SDValue CCVal = N->getOperand(2); | |
8413 | SDValue Cmp = N->getOperand(3); | |
8414 | ||
8415 | assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!"); | |
8416 | unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue(); | |
8417 | if (CC != AArch64CC::EQ && CC != AArch64CC::NE) | |
970d7e83 LB |
8418 | return SDValue(); |
8419 | ||
1a4d82fc JJ |
8420 | unsigned CmpOpc = Cmp.getOpcode(); |
8421 | if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS) | |
970d7e83 LB |
8422 | return SDValue(); |
8423 | ||
1a4d82fc JJ |
8424 | // Only attempt folding if there is only one use of the flag and no use of the |
8425 | // value. | |
8426 | if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1)) | |
970d7e83 LB |
8427 | return SDValue(); |
8428 | ||
1a4d82fc JJ |
8429 | SDValue LHS = Cmp.getOperand(0); |
8430 | SDValue RHS = Cmp.getOperand(1); | |
970d7e83 | 8431 | |
1a4d82fc JJ |
8432 | assert(LHS.getValueType() == RHS.getValueType() && |
8433 | "Expected the value type to be the same for both operands!"); | |
8434 | if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64) | |
970d7e83 LB |
8435 | return SDValue(); |
8436 | ||
1a4d82fc | 8437 | if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue()) |
970d7e83 | 8438 | std::swap(LHS, RHS); |
970d7e83 | 8439 | |
1a4d82fc | 8440 | if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue()) |
970d7e83 LB |
8441 | return SDValue(); |
8442 | ||
1a4d82fc JJ |
8443 | if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA || |
8444 | LHS.getOpcode() == ISD::SRL) | |
8445 | return SDValue(); | |
970d7e83 | 8446 | |
1a4d82fc JJ |
8447 | // Fold the compare into the branch instruction. |
8448 | SDValue BR; | |
8449 | if (CC == AArch64CC::EQ) | |
8450 | BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); | |
8451 | else | |
8452 | BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest); | |
970d7e83 | 8453 | |
1a4d82fc JJ |
8454 | // Do not add new nodes to DAG combiner worklist. |
8455 | DCI.CombineTo(N, BR, false); | |
970d7e83 | 8456 | |
1a4d82fc | 8457 | return SDValue(); |
970d7e83 LB |
8458 | } |
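
// The payoff, sketched (label assumed): a compare-with-zero branch
//
//   cmp  w0, #0
//   b.eq .LBB0_2
//
// collapses into the single compare-and-branch instruction
//
//   cbz  w0, .LBB0_2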
8459 | ||
1a4d82fc JJ |
8460 | // vselect (v1i1 setcc) -> |
8461 | // vselect (v1iXX setcc) (XX is the size of the compared operand type) | |
8462 | // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as | |
8463 | // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine | |
8464 | // such VSELECT. | |
8465 | static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) { | |
8466 | SDValue N0 = N->getOperand(0); | |
8467 | EVT CCVT = N0.getValueType(); | |
8468 | ||
8469 | if (N0.getOpcode() != ISD::SETCC || CCVT.getVectorNumElements() != 1 || | |
8470 | CCVT.getVectorElementType() != MVT::i1) | |
970d7e83 LB |
8471 | return SDValue(); |
8472 | ||
1a4d82fc JJ |
8473 | EVT ResVT = N->getValueType(0); |
8474 | EVT CmpVT = N0.getOperand(0).getValueType(); | |
8475 | // Only combine when the result type is of the same size as the compared | |
8476 | // operands. | |
8477 | if (ResVT.getSizeInBits() != CmpVT.getSizeInBits()) | |
970d7e83 LB |
8478 | return SDValue(); |
8479 | ||
1a4d82fc JJ |
8480 | SDValue IfTrue = N->getOperand(1); |
8481 | SDValue IfFalse = N->getOperand(2); | |
8482 | SDValue SetCC = | |
8483 | DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(), | |
8484 | N0.getOperand(0), N0.getOperand(1), | |
8485 | cast<CondCodeSDNode>(N0.getOperand(2))->get()); | |
8486 | return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, SetCC, | |
8487 | IfTrue, IfFalse); | |
8488 | } | |
970d7e83 | 8489 | |
1a4d82fc JJ |
8490 | /// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with |
8491 | /// the compare-mask instructions rather than going via NZCV, even if LHS and | |
8492 | /// RHS are really scalar. This replaces any scalar setcc in the above pattern | |
8493 | /// with a vector one followed by a DUP shuffle on the result. | |
8494 | static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) { | |
8495 | SDValue N0 = N->getOperand(0); | |
8496 | EVT ResVT = N->getValueType(0); | |
970d7e83 | 8497 | |
1a4d82fc | 8498 | if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1) |
970d7e83 LB |
8499 | return SDValue(); |
8500 | ||
1a4d82fc JJ |
8501 | // If NumMaskElts == 0, the comparison is larger than the select result. The | |
8502 | // largest real NEON comparison is 64 bits per lane, which means the result is | |
8503 | // at most 32 bits and an illegal vector. Just bail out for now. | |
8504 | EVT SrcVT = N0.getOperand(0).getValueType(); | |
85aaf69f SL |
8505 | |
8506 | // Don't try to do this optimization when the setcc itself has i1 operands. | |
8507 | // There are no legal vectors of i1, so this would be pointless. | |
8508 | if (SrcVT == MVT::i1) | |
8509 | return SDValue(); | |
8510 | ||
1a4d82fc JJ |
8511 | int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits(); |
8512 | if (!ResVT.isVector() || NumMaskElts == 0) | |
8513 | return SDValue(); | |
970d7e83 | 8514 | |
1a4d82fc JJ |
8515 | SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts); |
8516 | EVT CCVT = SrcVT.changeVectorElementTypeToInteger(); | |
8517 | ||
8518 | // First perform a vector comparison, where lane 0 is the one we're interested | |
8519 | // in. | |
8520 | SDLoc DL(N0); | |
8521 | SDValue LHS = | |
8522 | DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0)); | |
8523 | SDValue RHS = | |
8524 | DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1)); | |
8525 | SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2)); | |
8526 | ||
8527 | // Now duplicate the comparison mask we want across all other lanes. | |
8528 | SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0); | |
8529 | SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data()); | |
8530 | Mask = DAG.getNode(ISD::BITCAST, DL, | |
8531 | ResVT.changeVectorElementTypeToInteger(), Mask); | |
8532 | ||
8533 | return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2)); | |
8534 | } | |
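
// Worked sketch (types and registers assumed): for
//
//   %c = setcc i64 %a, %b, seteq
//   %r = select i1 %c, <2 x i64> %vL, <2 x i64> %vR
//
// the scalars are moved into lane 0, compared as vectors, and the resulting
// all-ones/all-zeros lane is broadcast, roughly:
//
//   fmov d0, x0
//   fmov d1, x1
//   cmeq v0.2d, v0.2d, v1.2d
//   dup  v0.2d, v0.d[0]
//   bsl  v0.16b, vL.16b, vR.16b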
970d7e83 | 8535 | |
1a4d82fc JJ |
8536 | SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, |
8537 | DAGCombinerInfo &DCI) const { | |
8538 | SelectionDAG &DAG = DCI.DAG; | |
8539 | switch (N->getOpcode()) { | |
8540 | default: | |
8541 | break; | |
8542 | case ISD::ADD: | |
8543 | case ISD::SUB: | |
8544 | return performAddSubLongCombine(N, DCI, DAG); | |
8545 | case ISD::XOR: | |
8546 | return performXorCombine(N, DAG, DCI, Subtarget); | |
8547 | case ISD::MUL: | |
8548 | return performMulCombine(N, DAG, DCI, Subtarget); | |
8549 | case ISD::SINT_TO_FP: | |
8550 | case ISD::UINT_TO_FP: | |
85aaf69f | 8551 | return performIntToFpCombine(N, DAG, Subtarget); |
1a4d82fc JJ |
8552 | case ISD::OR: |
8553 | return performORCombine(N, DCI, Subtarget); | |
8554 | case ISD::INTRINSIC_WO_CHAIN: | |
8555 | return performIntrinsicCombine(N, DCI, Subtarget); | |
8556 | case ISD::ANY_EXTEND: | |
8557 | case ISD::ZERO_EXTEND: | |
8558 | case ISD::SIGN_EXTEND: | |
8559 | return performExtendCombine(N, DCI, DAG); | |
8560 | case ISD::BITCAST: | |
8561 | return performBitcastCombine(N, DCI, DAG); | |
8562 | case ISD::CONCAT_VECTORS: | |
8563 | return performConcatVectorsCombine(N, DCI, DAG); | |
8564 | case ISD::SELECT: | |
8565 | return performSelectCombine(N, DAG); | |
8566 | case ISD::VSELECT: | |
8567 | return performVSelectCombine(N, DCI.DAG); | |
8568 | case ISD::STORE: | |
8569 | return performSTORECombine(N, DCI, DAG, Subtarget); | |
8570 | case AArch64ISD::BRCOND: | |
8571 | return performBRCONDCombine(N, DCI, DAG); | |
8572 | case AArch64ISD::CSEL: | |
8573 | return performCONDCombine(N, DCI, DAG, 2, 3); | |
8574 | case AArch64ISD::DUP: | |
8575 | return performPostLD1Combine(N, DCI, false); | |
8576 | case ISD::INSERT_VECTOR_ELT: | |
8577 | return performPostLD1Combine(N, DCI, true); | |
8578 | case ISD::INTRINSIC_VOID: | |
8579 | case ISD::INTRINSIC_W_CHAIN: | |
8580 | switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) { | |
8581 | case Intrinsic::aarch64_neon_ld2: | |
8582 | case Intrinsic::aarch64_neon_ld3: | |
8583 | case Intrinsic::aarch64_neon_ld4: | |
8584 | case Intrinsic::aarch64_neon_ld1x2: | |
8585 | case Intrinsic::aarch64_neon_ld1x3: | |
8586 | case Intrinsic::aarch64_neon_ld1x4: | |
8587 | case Intrinsic::aarch64_neon_ld2lane: | |
8588 | case Intrinsic::aarch64_neon_ld3lane: | |
8589 | case Intrinsic::aarch64_neon_ld4lane: | |
8590 | case Intrinsic::aarch64_neon_ld2r: | |
8591 | case Intrinsic::aarch64_neon_ld3r: | |
8592 | case Intrinsic::aarch64_neon_ld4r: | |
8593 | case Intrinsic::aarch64_neon_st2: | |
8594 | case Intrinsic::aarch64_neon_st3: | |
8595 | case Intrinsic::aarch64_neon_st4: | |
8596 | case Intrinsic::aarch64_neon_st1x2: | |
8597 | case Intrinsic::aarch64_neon_st1x3: | |
8598 | case Intrinsic::aarch64_neon_st1x4: | |
8599 | case Intrinsic::aarch64_neon_st2lane: | |
8600 | case Intrinsic::aarch64_neon_st3lane: | |
8601 | case Intrinsic::aarch64_neon_st4lane: | |
8602 | return performNEONPostLDSTCombine(N, DCI, DAG); | |
8603 | default: | |
8604 | break; | |
8605 | } | |
8606 | } | |
8607 | return SDValue(); | |
970d7e83 LB |
8608 | } |
8609 | ||
1a4d82fc JJ |
8610 | // Check if the return value is used only as a return value, as otherwise | |
8611 | // we can't perform a tail-call. In particular, we need to check for | |
8612 | // target ISD nodes that are returns and any other "odd" constructs | |
8613 | // that the generic analysis code won't necessarily catch. | |
8614 | bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N, | |
8615 | SDValue &Chain) const { | |
8616 | if (N->getNumValues() != 1) | |
970d7e83 | 8617 | return false; |
1a4d82fc | 8618 | if (!N->hasNUsesOfValue(1, 0)) |
970d7e83 LB |
8619 | return false; |
8620 | ||
1a4d82fc JJ |
8621 | SDValue TCChain = Chain; |
8622 | SDNode *Copy = *N->use_begin(); | |
8623 | if (Copy->getOpcode() == ISD::CopyToReg) { | |
8624 | // If the copy has a glue operand, we conservatively assume it isn't safe to | |
8625 | // perform a tail call. | |
8626 | if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() == | |
8627 | MVT::Glue) | |
8628 | return false; | |
8629 | TCChain = Copy->getOperand(0); | |
8630 | } else if (Copy->getOpcode() != ISD::FP_EXTEND) | |
8631 | return false; | |
970d7e83 | 8632 | |
1a4d82fc JJ |
8633 | bool HasRet = false; |
8634 | for (SDNode *Node : Copy->uses()) { | |
8635 | if (Node->getOpcode() != AArch64ISD::RET_FLAG) | |
8636 | return false; | |
8637 | HasRet = true; | |
8638 | } | |
970d7e83 | 8639 | |
1a4d82fc JJ |
8640 | if (!HasRet) |
8641 | return false; | |
970d7e83 | 8642 | |
1a4d82fc JJ |
8643 | Chain = TCChain; |
8644 | return true; | |
8645 | } | |
970d7e83 | 8646 | |
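// For illustration, the use chain accepted above is roughly:
//   N -> CopyToReg (with no glue operand) -> AArch64ISD::RET_FLAG
// or N -> FP_EXTEND whose only users are RET_FLAG nodes. Any other user
// of the copy defeats the tail-call transformation.
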
// Return whether an instruction can potentially be optimized to a tail
// call. This will cause the optimizers to attempt to move, or duplicate,
// return instructions to help enable tail call optimizations for this
// instruction.
bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
  return CI->isTailCall();
}

bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
                                                   SDValue &Offset,
                                                   ISD::MemIndexedMode &AM,
                                                   bool &IsInc,
                                                   SelectionDAG &DAG) const {
  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
    return false;

  Base = Op->getOperand(0);
  // All of the indexed addressing mode instructions take a signed
  // 9 bit immediate offset.
  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
    int64_t RHSC = (int64_t)RHS->getZExtValue();
    if (RHSC >= 256 || RHSC <= -256)
      return false;
    IsInc = (Op->getOpcode() == ISD::ADD);
    Offset = Op->getOperand(1);
    return true;
  }
  return false;
}

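// For example, "ldr x0, [x1, #16]!" folds "add x1, x1, #16" into the load.
// The writeback immediate is a signed 9-bit field (architecturally
// -256..255); the guard above accepts -255..255, staying safely inside
// that range.
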
bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
                                                      SDValue &Offset,
                                                      ISD::MemIndexedMode &AM,
                                                      SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
    return false;
  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
  return true;
}

bool AArch64TargetLowering::getPostIndexedAddressParts(
    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
  EVT VT;
  SDValue Ptr;
  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
    VT = LD->getMemoryVT();
    Ptr = LD->getBasePtr();
  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
    VT = ST->getMemoryVT();
    Ptr = ST->getBasePtr();
  } else
    return false;

  bool IsInc;
  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
    return false;
  // Post-indexing updates the base, so it's not a valid transform
  // if that's not the same as the load's pointer.
  if (Ptr != Base)
    return false;
  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
  return true;
}

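// The two writeback forms, schematically:
//   pre-index:  ldr x0, [x1, #8]!   ; x1 += 8 first, then load from x1
//   post-index: ldr x0, [x1], #8    ; load from x1 first, then x1 += 8
// In the post-indexed case the ADD/SUB must update the very pointer the
// memory operation reads through, hence the Ptr == Base requirement above.
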
static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                                  SelectionDAG &DAG) {
  SDLoc DL(N);
  SDValue Op = N->getOperand(0);

  if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
    return;

  Op = SDValue(
      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
                         DAG.getUNDEF(MVT::i32), Op,
                         DAG.getTargetConstant(AArch64::hsub, MVT::i32)),
      0);
  Op = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op);
  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Op));
}

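// The replacement built above is, roughly:
//   (i16 (truncate (i32 (bitcast (f32 (INSERT_SUBREG undef, x, hsub))))))
// i.e. the f16 value is placed in the hsub lane of an FP register, the
// 32-bit pattern is moved to the integer side, and the low 16 bits are
// kept, since i16 is not a legal result type and must be custom-expanded.
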
void AArch64TargetLowering::ReplaceNodeResults(
    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Don't know how to custom expand this");
  case ISD::BITCAST:
    ReplaceBITCASTResults(N, Results, DAG);
    return;
  case ISD::FP_TO_UINT:
  case ISD::FP_TO_SINT:
    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
    // Let normal code take care of it by not adding anything to Results.
    return;
  }
}

bool AArch64TargetLowering::useLoadStackGuardNode() const {
  return true;
}

bool AArch64TargetLowering::combineRepeatedFPDivisors(unsigned NumUsers) const {
  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
  // reciprocal if there are three or more FDIVs.
  return NumUsers > 2;
}

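// For example, with a common divisor d:
//   a/d, b/d, c/d  ==>  r = 1.0/d;  a*r, b*r, c*r
// trading three (or more) expensive divides for one divide plus cheap
// multiplies.
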
TargetLoweringBase::LegalizeTypeAction
AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
  MVT SVT = VT.getSimpleVT();
  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
  // v4i16, v2i32 instead of promoting.
  if (SVT == MVT::v1i8 || SVT == MVT::v1i16 || SVT == MVT::v1i32 ||
      SVT == MVT::v1f32)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}

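// Widening keeps the value in a single NEON register (e.g. a v1i16 becomes
// the low lane of a v4i16), whereas promotion would change the element type
// and tend to require extra extend/truncate operations around each use.
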
// Loads and stores less than 128 bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
  return Size == 128;
}

// Loads and stores less than 128 bits are already atomic; ones above that
// are doomed anyway, so defer to the default libcall and blame the OS when
// things go wrong.
bool AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  unsigned Size = LI->getType()->getPrimitiveSizeInBits();
  return Size == 128;
}

// For the real atomic operations, we have ldxr/stxr up to 128 bits.
bool AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
  return Size <= 128;
}

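// When these hooks return true, the AtomicExpand pass rewrites the access
// into an exclusive-monitor loop built from emitLoadLinked and
// emitStoreConditional below; schematically (illustrative assembly for an
// atomicrmw add):
//   loop:
//     ldaxr x0, [x2]        ; load-exclusive (acquire)
//     add   x1, x0, #1      ; the RMW operation
//     stlxr w3, x1, [x2]    ; store-exclusive (release), w3 = status
//     cbnz  w3, loop        ; 0 means success; retry otherwise
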
bool AArch64TargetLowering::hasLoadLinkedStoreConditional() const {
  return true;
}

Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                                             AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
  bool IsAcquire = isAtLeastAcquire(Ord);

  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldxp
  // intrinsic must return {i64, i64} and we have to recombine them into a
  // single i128 here.
  if (ValTy->getPrimitiveSizeInBits() == 128) {
    Intrinsic::ID Int =
        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
    Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);

    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");

    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
    return Builder.CreateOr(
        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
  }

  Type *Tys[] = { Addr->getType() };
  Intrinsic::ID Int =
      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
  Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateTruncOrBitCast(
      Builder.CreateCall(Ldxr, Addr),
      cast<PointerType>(Addr->getType())->getElementType());
}

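// For the i128 case, the IR produced above looks roughly like:
//   %lohi = call { i64, i64 } @llvm.aarch64.ldaxp(i8* %addr)
//   %lo   = extractvalue { i64, i64 } %lohi, 0
//   %hi   = extractvalue { i64, i64 } %lohi, 1
//   %lo64 = zext i64 %lo to i128
//   %hi64 = zext i64 %hi to i128
//   %shl  = shl i128 %hi64, 64
//   %val64 = or i128 %lo64, %shl
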
Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
                                                   Value *Val, Value *Addr,
                                                   AtomicOrdering Ord) const {
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  bool IsRelease = isAtLeastRelease(Ord);

  // Since the intrinsics must have legal type, the i128 intrinsics take two
  // parameters: "i64, i64". We must marshal Val into the appropriate form
  // before the call.
  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
    Intrinsic::ID Int =
        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
    Function *Stxr = Intrinsic::getDeclaration(M, Int);
    Type *Int64Ty = Type::getInt64Ty(M->getContext());

    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
    return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
  }

  Intrinsic::ID Int =
      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
  Type *Tys[] = { Addr->getType() };
  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

  return Builder.CreateCall2(
      Stxr, Builder.CreateZExtOrBitCast(
                Val, Stxr->getFunctionType()->getParamType(0)),
      Addr);
}

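// The store-conditional intrinsics return an i32 status value: 0 if the
// store succeeded, nonzero if the exclusive monitor was lost and the LL/SC
// sequence must be retried.
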
bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
  return Ty->isArrayTy();
}
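
// Under AAPCS64 this covers aggregates lowered to LLVM array types, e.g.
// homogeneous floating-point aggregates such as [3 x double], which must
// either occupy consecutive registers or be passed entirely on the stack.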