// ARM Neon intrinsic specification.
//
// This file contains the specification for a number of
// intrinsics that allow us to generate them along with
// their test cases.
//
// A note on the syntax of the file - it's not very intelligently parsed!
//
// # Comments
// Comments start with AT LEAST two slashes (or four or more), so // is a
// comment and /////// is too.
//
// # Sections
// Sections start with EXACTLY three slashes followed
// by AT LEAST one space. Sections are used for two things:
//
// 1) they serve as the doc comment for the given intrinsics.
// 2) they reset all variables (name, fn, etc.)
//
// # Variables
//
// name    - The prefix of the function; suffixes are auto-
//           generated based on the type they get passed.
//
// fn      - The function to call in rust-land.
//
// aarch64 - The intrinsic to check on the aarch64 architecture.
//           If this is given but no arm intrinsic is provided,
//           the function will exclusively be generated for
//           aarch64.
//           This is used to generate both aarch64-specific and
//           shared intrinsics, by first specifying only the aarch64
//           variant and then the arm variant.
//
// arm     - The arm v7 intrinsic used to check arm code
//           generation. All neon functions available in arm are
//           also available in aarch64. If no aarch64 intrinsic was
//           set, they are assumed to be the same.
//           Intrinsics ending with a `.` will have a size suffix
//           added (such as `i8` or `i64`) that is not sign specific.
//           Intrinsics ending with a `.s` will have a size suffix
//           added (such as `s8` or `u64`) that is sign specific.
//
// a       - First input for tests; it gets scaled to the size of
//           the type.
//
// b       - Second input for tests; it gets scaled to the size of
//           the type.
//
// # Special values
//
// TRUE  - 'true', all bits are set to 1
// FALSE - 'false', all bits are set to 0
// FF    - same as 'true'
// MIN   - minimal value (either 0 or the lowest negative number)
// MAX   - maximal value (prone to overflow)
//
// # validate <values>
// Validates a and b against the expected result of the test.
// The special values 'TRUE' and 'FALSE' can be used to
// represent the correct NEON representation of true or
// false values. They too get scaled to the type.
//
// Validate needs to be called before generate, as it sets
// up the rules for validation that get generated for each
// type.
//
// # generate <types>
// The generate command generates the intrinsics; it uses the
// variables set above and can be called multiple times while
// overwriting some of the variables.
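//
// As an illustrative example of the suffix rule (matching the standard
// NEON naming scheme): with name = vand, the int8x8_t instantiation is
// emitted as vand_s8, while the 128-bit int8x16_t one gains the `q`
// infix and becomes vandq_s8.
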
/// Vector bitwise and
name = vand
fn = simd_and
arm = vand
aarch64 = and
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
b = 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x00
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
generate int*_t, uint*_t, int64x*_t, uint64x*_t
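
// Illustrative sketch of what the generate line above produces (assumed
// shape, not the generator's verbatim output): one Rust function per listed
// type, plus a test that feeds `a`/`b` in and asserts the `validate` row.
//
//     #[inline]
//     #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vand))]
//     #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(and))]
//     pub unsafe fn vand_s8(a: int8x8_t, b: int8x8_t) -> int8x8_t {
//         simd_and(a, b)
//     }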

/// Vector bitwise or (immediate, inclusive)
name = vorr
fn = simd_or
arm = vorr
aarch64 = orr
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
generate int*_t, uint*_t, int64x*_t, uint64x*_t


/// Vector bitwise exclusive or (vector)
name = veor
fn = simd_xor
arm = veor
aarch64 = eor
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Three-way exclusive OR
name = veor3
a = 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
b = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
c = 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
validate 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F
target = sha3

aarch64 = eor3
link-aarch64 = llvm.aarch64.crypto.eor3s._EXT_
generate int8x16_t, int16x8_t, int32x4_t, int64x2_t
link-aarch64 = llvm.aarch64.crypto.eor3u._EXT_
generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t
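
// Sketch of how a link-aarch64 line is consumed (assumed shape): the
// `_EXT_` placeholder is replaced per type with the LLVM vector name, and
// the public intrinsic forwards to an extern binding, e.g. for int8x16_t:
//
//     #[allow(improper_ctypes)]
//     extern "unadjusted" {
//         #[link_name = "llvm.aarch64.crypto.eor3s.v16i8"]
//         fn veor3q_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t) -> int8x16_t;
//     }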

////////////////////
// Absolute difference between the arguments
////////////////////

/// Absolute difference between the arguments
name = vabd
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
validate 15, 13, 11, 9, 7, 5, 3, 1, 1, 3, 5, 7, 9, 11, 13, 15

arm = vabd.s
aarch64 = sabd
link-arm = vabds._EXT_
link-aarch64 = sabd._EXT_
generate int*_t

arm = vabd.s
aarch64 = uabd
link-arm = vabdu._EXT_
link-aarch64 = uabd._EXT_
generate uint*_t

/// Absolute difference between the arguments (floating-point)
name = vabd
a = 1.0, 2.0, 5.0, -4.0
b = 9.0, 3.0, 2.0, 8.0
validate 8.0, 1.0, 3.0, 12.0

aarch64 = fabd
link-aarch64 = fabd._EXT_
generate float64x*_t

arm = vabd.s
aarch64 = fabd
link-arm = vabds._EXT_
link-aarch64 = fabd._EXT_
generate float*_t

/// Floating-point absolute difference
name = vabd
multi_fn = simd_extract!, {vabd-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1.0
b = 9.0
validate 8.0

aarch64 = fabd
generate f32, f64

////////////////////
// Absolute difference Long
////////////////////

/// Unsigned Absolute difference Long
name = vabdl
multi_fn = simd_cast, {vabd-unsigned-noext, a, b}
a = 1, 2, 3, 4, 4, 3, 2, 1
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 9, 8, 7, 6, 6, 7, 8, 9

arm = vabdl.s
aarch64 = uabdl
generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t
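
// The multi_fn line composes helper calls, so for the uint8x8_t case the
// generated body is equivalent to (illustrative):
//
//     pub unsafe fn vabdl_u8(a: uint8x8_t, b: uint8x8_t) -> uint16x8_t {
//         simd_cast(vabd_u8(a, b)) // widen the 8-bit differences to 16 bits
//     }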

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint8x8_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 2, 3, 4, 4, 3, 2, 1
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 9, 8, 7, 6, 6, 7, 8, 9

arm = vabdl.s
aarch64 = sabdl
generate int8x8_t:int8x8_t:int16x8_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint16x4_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 2, 11, 12
b = 10, 10, 10, 10
validate 9, 8, 1, 2

arm = vabdl.s
aarch64 = sabdl
generate int16x4_t:int16x4_t:int32x4_t

/// Signed Absolute difference Long
name = vabdl
multi_fn = simd_cast, c:uint32x2_t, {vabd-signed-noext, a, b}
multi_fn = simd_cast, c
a = 1, 11
b = 10, 10
validate 9, 1

arm = vabdl.s
aarch64 = sabdl
generate int32x2_t:int32x2_t:int64x2_t

/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, {vabd_u8, c, d}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
validate 1, 0, 1, 2, 3, 4, 5, 6

aarch64 = uabdl
generate uint8x16_t:uint8x16_t:uint16x8_t
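
// The two simd_shuffle! lines select the high halves of `a` and `b` before
// the difference is taken; as a sketch (assumed shape):
//
//     pub unsafe fn vabdl_high_u8(a: uint8x16_t, b: uint8x16_t) -> uint16x8_t {
//         let c: uint8x8_t = simd_shuffle!(a, a, [8, 9, 10, 11, 12, 13, 14, 15]);
//         let d: uint8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
//         simd_cast(vabd_u8(c, d))
//     }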

/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle!, c:uint16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle!, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, {vabd_u16, c, d}
a = 1, 2, 3, 4, 8, 9, 11, 12
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 2, 1, 1, 2

aarch64 = uabdl
generate uint16x8_t:uint16x8_t:uint32x4_t

/// Unsigned Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle!, c:uint32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle!, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_cast, {vabd_u32, c, d}
a = 1, 2, 3, 4
b = 10, 10, 10, 10
validate 7, 6

aarch64 = uabdl
generate uint32x4_t:uint32x4_t:uint64x2_t

/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_cast, e:uint8x8_t, {vabd_s8, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
validate 1, 0, 1, 2, 3, 4, 5, 6

aarch64 = sabdl
generate int8x16_t:int8x16_t:int16x8_t

/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle!, c:int16x4_t, a, a, [4, 5, 6, 7]
multi_fn = simd_shuffle!, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_cast, e:uint16x4_t, {vabd_s16, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4, 9, 10, 11, 12
b = 10, 10, 10, 10, 10, 10, 10, 10
validate 1, 0, 1, 2

aarch64 = sabdl
generate int16x8_t:int16x8_t:int32x4_t

/// Signed Absolute difference Long
name = vabdl_high
no-q
multi_fn = simd_shuffle!, c:int32x2_t, a, a, [2, 3]
multi_fn = simd_shuffle!, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_cast, e:uint32x2_t, {vabd_s32, c, d}
multi_fn = simd_cast, e
a = 1, 2, 3, 4
b = 10, 10, 10, 10
validate 7, 6

aarch64 = sabdl
generate int32x4_t:int32x4_t:int64x2_t

////////////////////
// Equality
////////////////////

/// Compare bitwise Equal (vector)
name = vceq
fn = simd_eq
a = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
b = MIN, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, MAX
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
a = MIN, MIN, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0xCC, 0x0D, 0xEE, MAX
b = MIN, MAX, 0x02, 0x04, 0x04, 0x00, 0x06, 0x08, 0x08, 0x00, 0x0A, 0x0A, 0xCC, 0xD0, 0xEE, MIN
validate TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE

aarch64 = cmeq
generate uint64x*_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t

arm = vceq.
generate uint*_t, int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t

/// Floating-point compare equal
name = vceq
fn = simd_eq
a = 1.2, 3.4, 5.6, 7.8
b = 1.2, 3.4, 5.6, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmeq
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vceq.
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare bitwise equal
name = vceq
multi_fn = transmute, {vceq-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 1
b = 2
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare equal
name = vceq
multi_fn = simd_extract!, {vceq-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1.
b = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Signed compare bitwise equal to zero
name = vceqz
fn = simd_eq
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmeq
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t

/// Unsigned compare bitwise equal to zero
name = vceqz
fn = simd_eq
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmeq
generate uint*_t, uint64x*_t

/// Floating-point compare bitwise equal to zero
name = vceqz
fn = simd_eq
a = 0.0, 1.2, 3.4, 5.6
fixed = 0.0, 0.0, 0.0, 0.0
validate TRUE, FALSE, FALSE, FALSE

aarch64 = fcmeq
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare bitwise equal to zero
name = vceqz
multi_fn = transmute, {vceqz-in_ntt-noext, {transmute, a}}
a = 1
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare bitwise equal to zero
name = vceqz
multi_fn = simd_extract!, {vceqz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = 1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Signed compare bitwise Test bits nonzero
name = vtst
multi_fn = simd_and, c:in_t, a, b
multi_fn = fixed, d:in_t
multi_fn = simd_ne, c, transmute(d)
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmtst
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t, poly64x1_t:uint64x1_t, poly64x2_t:uint64x2_t

arm = vtst
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, poly8x8_t:uint8x8_t, poly8x16_t:uint8x16_t, poly16x4_t:uint16x4_t, poly16x8_t:uint16x8_t
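
// Stacked multi_fn lines run in order, each feeding the next; for int8x8_t
// the body is roughly (illustrative; i8x8 stands in for stdarch's internal
// constant-vector helper type):
//
//     pub unsafe fn vtst_s8(a: int8x8_t, b: int8x8_t) -> uint8x8_t {
//         let c: int8x8_t = simd_and(a, b);
//         let d: i8x8 = i8x8::new(0, 0, 0, 0, 0, 0, 0, 0); // the `fixed` row
//         simd_ne(c, transmute(d))
//     }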

/// Unsigned compare bitwise Test bits nonzero
name = vtst
multi_fn = simd_and, c:in_t, a, b
multi_fn = fixed, d:in_t
multi_fn = simd_ne, c, transmute(d)
a = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
b = MIN, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmtst
generate uint64x*_t

arm = vtst
generate uint*_t

/// Compare bitwise test bits nonzero
name = vtst
multi_fn = transmute, {vtst-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 0
b = 0
validate 0

aarch64 = tst
generate i64:i64:u64, u64

/// Signed saturating accumulate of unsigned value
name = vuqadd
out-suffix
a = 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
b = 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
validate 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8

aarch64 = suqadd
link-aarch64 = suqadd._EXT_
generate i32:u32:i32, i64:u64:i64

/// Signed saturating accumulate of unsigned value
name = vuqadd
out-suffix
multi_fn = simd_extract!, {vuqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1
b = 2
validate 3

aarch64 = suqadd
generate i8:u8:i8, i16:u16:i16

////////////////////
// Floating-point absolute value
////////////////////

/// Floating-point absolute value
name = vabs
fn = simd_fabs
a = -0.1, -2.2, -3.3, -6.6
validate 0.1, 2.2, 3.3, 6.6
aarch64 = fabs
generate float64x1_t:float64x1_t, float64x2_t:float64x2_t

arm = vabs
generate float32x2_t:float32x2_t, float32x4_t:float32x4_t

////////////////////
// Greater than
////////////////////

/// Compare signed greater than
name = vcgt
fn = simd_gt
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmgt
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcgt.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned greater than
name = vcgt
fn = simd_gt
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhi
generate uint64x*_t

arm = vcgt.s
generate uint*_t

/// Floating-point compare greater than
name = vcgt
fn = simd_gt
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcgt.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare greater than
name = vcgt
multi_fn = transmute, {vcgt-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 1
b = 2
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare greater than
name = vcgt
multi_fn = simd_extract!, {vcgt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1.
b = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

////////////////////
// Less than
////////////////////

/// Compare signed less than
name = vclt
fn = simd_lt
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = cmgt
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcgt.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned less than
name = vclt
fn = simd_lt
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhi
generate uint64x*_t

arm = vcgt.s
generate uint*_t

/// Floating-point compare less than
name = vclt
fn = simd_lt
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcgt.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare less than
name = vclt
multi_fn = transmute, {vclt-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 2
b = 1
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare less than
name = vclt
multi_fn = simd_extract!, {vclt-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 2.
b = 1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

////////////////////
// Less than or equal
////////////////////

/// Compare signed less than or equal
name = vcle
fn = simd_le
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmge
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcge.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare greater than or equal
name = vcge
multi_fn = transmute, {vcge-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 1
b = 2
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare greater than or equal
name = vcge
multi_fn = simd_extract!, {vcge-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 1.
b = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Compare unsigned less than or equal
name = vcle
fn = simd_le
a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhs
generate uint64x*_t

arm = vcge.s
generate uint*_t

/// Floating-point compare less than or equal
name = vcle
fn = simd_le
a = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
b = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE
aarch64 = fcmge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare less than or equal
name = vcle
multi_fn = transmute, {vcle-in_ntt-noext, {transmute, a}, {transmute, b}}
a = 2
b = 1
validate 0

aarch64 = cmp
generate i64:u64, u64

/// Floating-point compare less than or equal
name = vcle
multi_fn = simd_extract!, {vcle-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0
a = 2.
b = 1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

////////////////////
// Greater than or equal
////////////////////

/// Compare signed greater than or equal
name = vcge
fn = simd_ge
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmge
generate int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

arm = vcge.s
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t

/// Compare unsigned greater than or equal
name = vcge
fn = simd_ge
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmhs
generate uint64x*_t

arm = vcge.s
generate uint*_t

/// Floating-point compare greater than or equal
name = vcge
fn = simd_ge
a = 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8, 8.9
b = 0.1, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7, 7.8
validate TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

arm = vcge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Compare signed greater than or equal to zero
name = vcgez
fn = simd_ge
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmge
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare greater than or equal to zero
name = vcgez
fn = simd_ge
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmge
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed greater than or equal to zero
name = vcgez
multi_fn = transmute, {vcgez-in_ntt-noext, {transmute, a}}
a = -1
validate 0

aarch64 = nop
generate i64:u64

/// Floating-point compare greater than or equal to zero
name = vcgez
multi_fn = simd_extract!, {vcgez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = -1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Compare signed greater than zero
name = vcgtz
fn = simd_gt
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate FALSE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = cmgt
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare greater than zero
name = vcgtz
fn = simd_gt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE

aarch64 = fcmgt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare signed greater than zero
name = vcgtz
multi_fn = transmute, {vcgtz-in_ntt-noext, {transmute, a}}
a = -1
validate 0

aarch64 = cmp
generate i64:u64

/// Floating-point compare greater than zero
name = vcgtz
multi_fn = simd_extract!, {vcgtz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = -1.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Compare signed less than or equal to zero
name = vclez
fn = simd_le
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmle
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare less than or equal to zero
name = vclez
fn = simd_le
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = fcmle
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare less than or equal to zero
name = vclez
multi_fn = transmute, {vclez-in_ntt-noext, {transmute, a}}
a = 2
validate 0

aarch64 = cmp
generate i64:u64

/// Floating-point compare less than or equal to zero
name = vclez
multi_fn = simd_extract!, {vclez-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Compare signed less than zero
name = vcltz
fn = simd_lt
a = MIN, -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, MAX
fixed = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
validate TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = cmlt
generate int8x8_t:uint8x8_t, int8x16_t:uint8x16_t, int16x4_t:uint16x4_t, int16x8_t:uint16x8_t, int32x2_t:uint32x2_t, int32x4_t:uint32x4_t, int64x1_t:uint64x1_t, int64x2_t:uint64x2_t

/// Floating-point compare less than zero
name = vcltz
fn = simd_lt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
fixed = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
validate TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE

aarch64 = fcmlt
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t

/// Compare less than zero
name = vcltz
multi_fn = transmute, {vcltz-in_ntt-noext, {transmute, a}}
a = 2
validate 0

aarch64 = asr
generate i64:u64

/// Floating-point compare less than zero
name = vcltz
multi_fn = simd_extract!, {vcltz-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = 2.
validate 0

aarch64 = fcmp
generate f32:u32, f64:u64

/// Count leading sign bits
name = vcls
a = MIN, -1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX
validate 0, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0

arm = vcls.s
aarch64 = cls
link-arm = vcls._EXT_
link-aarch64 = cls._EXT_
generate int*_t

/// Count leading sign bits
name = vcls
multi_fn = vcls-signed-noext, {transmute, a}
a = MIN, MAX, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, MAX
validate BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1

arm = vcls
aarch64 = cls
generate uint8x8_t:int8x8_t, uint8x16_t:int8x16_t, uint16x4_t:int16x4_t, uint16x8_t:int16x8_t, uint32x2_t:int32x2_t, uint32x4_t:int32x4_t

/// Count leading zero bits
name = vclz
multi_fn = self-signed-ext, a
a = MIN, -1, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
validate 0, 0, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 1

arm = vclz.
aarch64 = clz
generate int*_t

/// Count leading zero bits
name = vclz
multi_fn = transmute, {self-signed-ext, transmute(a)}
a = MIN, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, MAX
validate BITS, BITS, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, BITS_M1, 0

arm = vclz.
aarch64 = clz
generate uint*_t

/// Floating-point absolute compare greater than
name = vcagt
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate !0, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE

aarch64 = facgt
link-aarch64 = facgt._EXT2_._EXT_
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

arm = vacgt.s
link-arm = vacgt._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point absolute compare greater than or equal
name = vcage
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate !0, TRUE, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE

aarch64 = facge
link-aarch64 = facge._EXT2_._EXT_
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

arm = vacge.s
link-arm = vacge._EXT2_._EXT_
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point absolute compare less than
name = vcalt
multi_fn = vcagt-self-noext, b, a
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate 0, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE

aarch64 = facgt
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

arm = vacgt.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Floating-point absolute compare less than or equal
name = vcale
multi_fn = vcage-self-noext, b, a
a = -1.2, 0.0, 1.2, 2.3, 3.4, 4.5, 5.6, 6.7
b = -1.1, 0.0, 1.1, 2.4, 3.3, 4.6, 5.5, 6.8
validate 0, TRUE, FALSE, TRUE, FALSE, TRUE, FALSE, TRUE

aarch64 = facge
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

arm = vacge.s
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = mov
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x2_t, int32x4_t, int64x2_t
generate uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x2_t, uint32x4_t, uint64x2_t
generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t
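
// Sketch of what constn = LANE1:LANE2 turns into (assumed shape; the exact
// static-assert macros differ across stdarch versions): two const generics,
// compile-time lane checks, and a lane-selected shuffle, e.g. for the
// two-lane float32x2_t case:
//
//     pub unsafe fn vcopy_lane_f32<const LANE1: i32, const LANE2: i32>(
//         a: float32x2_t,
//         b: float32x2_t,
//     ) -> float32x2_t {
//         static_assert_imm1!(LANE1);
//         static_assert_imm1!(LANE2);
//         match LANE1 & 0b1 {
//             0 => simd_shuffle!(a, b, [2 + LANE2 as u32, 1]),
//             1 => simd_shuffle!(a, b, [0, 2 + LANE2 as u32]),
//             _ => unreachable_unchecked(),
//         }
//     }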

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
validate 0.5, 2., 3., 4.

aarch64 = mov
generate float32x2_t, float32x4_t, float64x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = mov
generate int8x8_t:int8x16_t:int8x8_t, int16x4_t:int16x8_t:int16x4_t, int32x2_t:int32x4_t:int32x2_t
generate uint8x8_t:uint8x16_t:uint8x8_t, uint16x4_t:uint16x8_t:uint16x4_t, uint32x2_t:uint32x4_t:uint32x2_t
generate poly8x8_t:poly8x16_t:poly8x8_t, poly16x4_t:poly16x8_t:poly16x4_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle!, a:in_t, a, a, {asc-0-in_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in_len-LANE2}
a = 1., 2., 3., 4.
b = 0., 0.5, 0., 0.
n = 0:1
validate 0.5, 2., 3., 4.

aarch64 = mov
generate float32x2_t:float32x4_t:float32x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 0, MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 0:1
validate MAX, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = mov
generate int8x16_t:int8x8_t:int8x16_t, int16x8_t:int16x4_t:int16x8_t, int32x4_t:int32x2_t:int32x4_t
generate uint8x16_t:uint8x8_t:uint8x16_t, uint16x8_t:uint16x4_t:uint16x8_t, uint32x4_t:uint32x2_t:uint32x4_t
generate poly8x16_t:poly8x8_t:poly8x16_t, poly16x8_t:poly16x4_t:poly16x8_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = MAX, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
n = 1:0
validate 1, MAX

aarch64 = mov
generate int64x2_t:int64x1_t:int64x2_t, uint64x2_t:uint64x1_t:uint64x2_t, poly64x2_t:poly64x1_t:poly64x2_t

/// Insert vector element from another vector element
name = vcopy
lane-suffixes
constn = LANE1:LANE2
multi_fn = static_assert_imm-in0_exp_len-LANE1
multi_fn = static_assert_imm-in_exp_len-LANE2
multi_fn = simd_shuffle!, b:in_t0, b, b, {asc-0-in0_len}
multi_fn = matchn-in0_exp_len-LANE1, simd_shuffle!, a, b, {ins-in0_len-in0_len-LANE2}
a = 1., 2., 3., 4.
b = 0.5, 0., 0., 0.
n = 1:0
validate 1., 0.5, 3., 4.

aarch64 = mov
generate float32x4_t:float32x2_t:float32x4_t
aarch64 = mov
generate float64x2_t:float64x1_t:float64x2_t

/// Create a new vector from a 64-bit value
name = vcreate
out-suffix
multi_fn = transmute, a
a = 1
validate 1, 0, 0, 0, 0, 0, 0, 0

aarch64 = nop
arm = nop
generate u64:int8x8_t, u64:int16x4_t, u64:int32x2_t, u64:int64x1_t
generate u64:uint8x8_t, u64:uint16x4_t, u64:uint32x2_t, u64:uint64x1_t
generate u64:poly8x8_t, u64:poly16x4_t
target = aes
generate u64:poly64x1_t

/// Create a new vector from a 64-bit value
name = vcreate
out-suffix
multi_fn = transmute, a
a = 0
validate 0., 0.

aarch64 = nop
generate u64:float64x1_t
arm = nop
generate u64:float32x2_t
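
// The multi_fn = transmute, a line is a plain bit-pattern reinterpretation;
// one generated variant looks roughly like:
//
//     pub unsafe fn vcreate_s8(a: u64) -> int8x8_t {
//         transmute(a) // reinterpret the 64 bits as eight i8 lanes
//     }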

/// Fixed-point convert to floating-point
name = vcvt
double-suffixes
fn = simd_cast
a = 1, 2, 3, 4
validate 1., 2., 3., 4.

aarch64 = scvtf
generate int64x1_t:float64x1_t, int64x2_t:float64x2_t
aarch64 = ucvtf
generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t

arm = vcvt
aarch64 = scvtf
generate int32x2_t:float32x2_t, int32x4_t:float32x4_t
aarch64 = ucvtf
generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t

/// Floating-point convert to higher precision long
name = vcvt
double-suffixes
fn = simd_cast
a = -1.2, 1.2
validate -1.2f32 as f64, 1.2f32 as f64

aarch64 = fcvtl
generate float32x2_t:float64x2_t

/// Floating-point convert to higher precision long
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle!, b:float32x2_t, a, a, [2, 3]
multi_fn = simd_cast, b
a = -1.2, 1.2, 2.3, 3.4
validate 2.3f32 as f64, 3.4f32 as f64

aarch64 = fcvtl
generate float32x4_t:float64x2_t

/// Floating-point convert to lower precision narrow
name = vcvt
double-suffixes
fn = simd_cast
a = -1.2, 1.2
validate -1.2f64 as f32, 1.2f64 as f32

aarch64 = fcvtn
generate float64x2_t:float32x2_t

/// Floating-point convert to lower precision narrow
name = vcvt_high
noq-double-suffixes
multi_fn = simd_shuffle!, a, {simd_cast, b}, [0, 1, 2, 3]
a = -1.2, 1.2
b = -2.3, 3.4
validate -1.2, 1.2, -2.3f64 as f32, 3.4f64 as f32

aarch64 = fcvtn
generate float32x2_t:float64x2_t:float32x4_t

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx
double-suffixes
a = -1.0, 2.0
validate -1.0, 2.0

aarch64 = fcvtxn
link-aarch64 = fcvtxn._EXT2_._EXT_
generate float64x2_t:float32x2_t

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx
double-suffixes
multi_fn = simd_extract!, {vcvtx-_f32_f64-noext, {vdupq_n-in_ntt-noext, a}}, 0
a = -1.0
validate -1.0

aarch64 = fcvtxn
generate f64:f32

/// Floating-point convert to lower precision narrow, rounding to odd
name = vcvtx_high
noq-double-suffixes
multi_fn = simd_shuffle!, a, {vcvtx-noq_doubleself-noext, b}, [0, 1, 2, 3]
a = -1.0, 2.0
b = -3.0, 4.0
validate -1.0, 2.0, -3.0, 4.0

aarch64 = fcvtxn
generate float32x2_t:float64x2_t:float32x4_t

/// Fixed-point convert to floating-point
name = vcvt
double-n-suffixes
constn = N
multi_fn = static_assert-N-1-bits
a = 1, 2, 3, 4
n = 2
validate 0.25, 0.5, 0.75, 1.
arm-aarch64-separate

aarch64 = scvtf
link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
const-aarch64 = N
generate int64x1_t:float64x1_t, int64x2_t:float64x2_t, i32:f32, i64:f64

aarch64 = ucvtf
link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
const-aarch64 = N
generate uint64x1_t:float64x1_t, uint64x2_t:float64x2_t, u32:f32, u64:f64

aarch64 = scvtf
link-aarch64 = vcvtfxs2fp._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfxs2fp._EXT2_._EXT_
const-arm = N:i32
generate int32x2_t:float32x2_t, int32x4_t:float32x4_t

aarch64 = ucvtf
link-aarch64 = vcvtfxu2fp._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfxu2fp._EXT2_._EXT_
const-arm = N:i32
generate uint32x2_t:float32x2_t, uint32x4_t:float32x4_t
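
// Sketch of how constn = N with const-aarch64 = N surfaces in the API
// (assumed shape; the static-assert spelling varies): the immediate becomes
// a const generic, range-checked at compile time per static_assert-N-1-bits,
// then passed to the LLVM binding:
//
//     pub unsafe fn vcvtq_n_f32_s32<const N: i32>(a: int32x4_t) -> float32x4_t {
//         static_assert!(N >= 1 && N <= 32); // N must lie within the bit width
//         vcvtq_n_f32_s32_(a, N) // assumed name of the generated extern binding
//     }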

/// Floating-point convert to fixed-point, rounding toward zero
name = vcvt
double-n-suffixes
constn = N
multi_fn = static_assert-N-1-bits
a = 0.25, 0.5, 0.75, 1.
n = 2
validate 1, 2, 3, 4
arm-aarch64-separate

aarch64 = fcvtzs
link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
const-aarch64 = N
generate float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64

aarch64 = fcvtzu
link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
const-aarch64 = N
generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64

aarch64 = fcvtzs
link-aarch64 = vcvtfp2fxs._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfp2fxs._EXT2_._EXT_
const-arm = N:i32
generate float32x2_t:int32x2_t, float32x4_t:int32x4_t

aarch64 = fcvtzu
link-aarch64 = vcvtfp2fxu._EXT2_._EXT_
arm = vcvt
link-arm = vcvtfp2fxu._EXT2_._EXT_
const-arm = N:i32
generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t

/// Fixed-point convert to floating-point
name = vcvt
double-suffixes
multi_fn = a as out_t
a = 1
validate 1.

aarch64 = scvtf
generate i32:f32, i64:f64
aarch64 = ucvtf
generate u32:f32, u64:f64

/// Floating-point convert to fixed-point, rounding toward zero
name = vcvt
double-suffixes
multi_fn = a as out_t
a = 1.
validate 1

aarch64 = fcvtzs
generate f32:i32, f64:i64
aarch64 = fcvtzu
generate f32:u32, f64:u64

1304 | /// Floating-point convert to signed fixed-point, rounding toward zero | |
1305 | name = vcvt | |
1306 | double-suffixes | |
94222f64 | 1307 | link-aarch64 = llvm.fptosi.sat._EXT2_._EXT_ |
17df50a5 XL |
1308 | a = -1.1, 2.1, -2.9, 3.9 |
1309 | validate -1, 2, -2, 3 | |
1310 | ||
1311 | aarch64 = fcvtzs | |
1312 | generate float64x1_t:int64x1_t, float64x2_t:int64x2_t | |
1313 | ||
94222f64 | 1314 | link-arm = llvm.fptosi.sat._EXT2_._EXT_ |
17df50a5 XL |
1315 | arm = vcvt |
1316 | generate float32x2_t:int32x2_t, float32x4_t:int32x4_t | |
1317 | ||
1318 | /// Floating-point convert to unsigned fixed-point, rounding toward zero | |
1319 | name = vcvt | |
1320 | double-suffixes | |
94222f64 | 1321 | link-aarch64 = llvm.fptoui.sat._EXT2_._EXT_ |
17df50a5 XL |
1322 | a = 1.1, 2.1, 2.9, 3.9 |
1323 | validate 1, 2, 2, 3 | |
1324 | ||
1325 | aarch64 = fcvtzu | |
1326 | generate float64x1_t:uint64x1_t, float64x2_t:uint64x2_t | |
1327 | ||
94222f64 | 1328 | link-arm = llvm.fptoui.sat._EXT2_._EXT_ |
17df50a5 XL |
1329 | arm = vcvt |
1330 | generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t | |
1331 | ||
1332 | /// Floating-point convert to signed integer, rounding to nearest with ties to away | |
1333 | name = vcvta | |
1334 | double-suffixes | |
1335 | a = -1.1, 2.1, -2.9, 3.9 | |
1336 | validate -1, 2, -3, 4 | |
1337 | ||
1338 | aarch64 = fcvtas | |
1339 | link-aarch64 = fcvtas._EXT2_._EXT_ | |
1340 | generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t | |
1341 | ||
1342 | /// Floating-point convert to integer, rounding to nearest with ties to away | |
1343 | name = vcvta | |
1344 | double-suffixes | |
1345 | a = 2.9 | |
1346 | validate 3 | |
1347 | ||
1348 | aarch64 = fcvtas | |
1349 | link-aarch64 = fcvtas._EXT2_._EXT_ | |
1350 | generate f32:i32, f64:i64 | |
1351 | ||
1352 | aarch64 = fcvtau | |
1353 | link-aarch64 = fcvtau._EXT2_._EXT_ | |
1354 | generate f32:u32, f64:u64 | |
1355 | ||
1356 | /// Floating-point convert to signed integer, rounding to nearest with ties to even | |
1357 | name = vcvtn | |
1358 | double-suffixes | |
1359 | a = -1.5, 2.1, -2.9, 3.9 | |
1360 | validate -2, 2, -3, 4 | |
1361 | ||
1362 | aarch64 = fcvtns | |
1363 | link-aarch64 = fcvtns._EXT2_._EXT_ | |
1364 | generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 | |
1365 | ||
1366 | /// Floating-point convert to signed integer, rounding toward minus infinity | |
1367 | name = vcvtm | |
1368 | double-suffixes | |
1369 | a = -1.1, 2.1, -2.9, 3.9 | |
1370 | validate -2, 2, -3, 3 | |
1371 | ||
1372 | aarch64 = fcvtms | |
1373 | link-aarch64 = fcvtms._EXT2_._EXT_ | |
1374 | generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 | |
1375 | ||
1376 | /// Floating-point convert to signed integer, rounding toward plus infinity | |
1377 | name = vcvtp | |
1378 | double-suffixes | |
1379 | a = -1.1, 2.1, -2.9, 3.9 | |
1380 | validate -1, 3, -2, 4 | |
1381 | ||
1382 | aarch64 = fcvtps | |
1383 | link-aarch64 = fcvtps._EXT2_._EXT_ | |
1384 | generate float32x2_t:int32x2_t, float32x4_t:int32x4_t, float64x1_t:int64x1_t, float64x2_t:int64x2_t, f32:i32, f64:i64 | |
1385 | ||
1386 | /// Floating-point convert to unsigned integer, rounding to nearest with ties to away | |
1387 | name = vcvta | |
1388 | double-suffixes | |
1389 | a = 1.1, 2.1, 2.9, 3.9 | |
1390 | validate 1, 2, 3, 4 | |
1391 | ||
1392 | aarch64 = fcvtau | |
1393 | link-aarch64 = fcvtau._EXT2_._EXT_ | |
1394 | generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t | |
1395 | ||
1396 | /// Floating-point convert to unsigned integer, rounding to nearest with ties to even | |
1397 | name = vcvtn | |
1398 | double-suffixes | |
1399 | a = 1.5, 2.1, 2.9, 3.9 | |
1400 | validate 2, 2, 3, 4 | |
1401 | ||
1402 | aarch64 = fcvtnu | |
1403 | link-aarch64 = fcvtnu._EXT2_._EXT_ | |
1404 | generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 | |
1405 | ||
1406 | /// Floating-point convert to unsigned integer, rounding toward minus infinity | |
1407 | name = vcvtm | |
1408 | double-suffixes | |
1409 | a = 1.1, 2.1, 2.9, 3.9 | |
1410 | validate 1, 2, 2, 3 | |
1411 | ||
1412 | aarch64 = fcvtmu | |
1413 | link-aarch64 = fcvtmu._EXT2_._EXT_ | |
1414 | generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 | |
1415 | ||
1416 | /// Floating-point convert to unsigned integer, rounding toward plus infinity | |
1417 | name = vcvtp | |
1418 | double-suffixes | |
1419 | a = 1.1, 2.1, 2.9, 3.9 | |
1420 | validate 2, 3, 3, 4 | |
1421 | ||
1422 | aarch64 = fcvtpu | |
1423 | link-aarch64 = fcvtpu._EXT2_._EXT_ | |
1424 | generate float32x2_t:uint32x2_t, float32x4_t:uint32x4_t, float64x1_t:uint64x1_t, float64x2_t:uint64x2_t, f32:u32, f64:u64 | |
1425 | ||
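// Quick reference for the four rounding converts above, applied to the
// lane value 2.5 (illustrative):
//
//     vcvta (ties to away)          -> 3
//     vcvtn (ties to even)          -> 2
//     vcvtm (toward minus infinity) -> 2
//     vcvtp (toward plus infinity)  -> 3
//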
1426 | /// Set all vector lanes to the same value | |
1427 | name = vdup | |
1428 | lane-suffixes | |
1429 | constn = N | |
1430 | multi_fn = static_assert_imm-in_exp_len-N | |
353b0b11 | 1431 | multi_fn = simd_shuffle!, a, a, {dup-out_len-N as u32} |
1432 | a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 |
1433 | n = HFLEN | |
1434 | validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
1435 | ||
1436 | aarch64 = dup | |
1437 | generate poly64x2_t, poly64x1_t:poly64x2_t | |
1438 | ||
1439 | arm = vdup.l | |
1440 | generate int*_t | |
1441 | generate int8x16_t:int8x8_t, int16x8_t:int16x4_t, int32x4_t:int32x2_t | |
1442 | generate int8x8_t:int8x16_t, int16x4_t:int16x8_t, int32x2_t:int32x4_t | |
1443 | ||
1444 | generate uint*_t | |
1445 | generate uint8x16_t:uint8x8_t, uint16x8_t:uint16x4_t, uint32x4_t:uint32x2_t | |
1446 | generate uint8x8_t:uint8x16_t, uint16x4_t:uint16x8_t, uint32x2_t:uint32x4_t | |
1447 | ||
1448 | generate poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t | |
1449 | generate poly8x16_t:poly8x8_t, poly16x8_t:poly16x4_t | |
1450 | generate poly8x8_t:poly8x16_t, poly16x4_t:poly16x8_t | |
1451 | ||
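// Illustration: `dup-out_len-N` expands to an index array holding N
// repeated out_len times, so vdup_lane_s8::<3> becomes roughly:
//
//     simd_shuffle!(a, a, [3u32; 8])    // copy lane 3 into all 8 lanes
//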
1452 | /// Set all vector lanes to the same value | |
1453 | name = vdup | |
1454 | lane-suffixes | |
1455 | constn = N | |
1456 | multi_fn = static_assert_imm-in_exp_len-N | |
353b0b11 | 1457 | multi_fn = simd_shuffle!, a, a, {dup-out_len-N as u32} |
1458 | a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 |
1459 | n = HFLEN | |
1460 | validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
1461 | ||
1462 | aarch64 = dup | |
1463 | arm = vmov | |
1464 | generate int64x2_t, int64x1_t:int64x2_t, uint64x2_t, uint64x1_t:uint64x2_t | |
1465 | ||
1466 | /// Set all vector lanes to the same value | |
1467 | name = vdup | |
1468 | lane-suffixes | |
1469 | constn = N | |
1470 | multi_fn = static_assert_imm-in_exp_len-N | |
353b0b11 | 1471 | multi_fn = simd_shuffle!, a, a, {dup-out_len-N as u32} |
1472 | a = 1., 1., 1., 4. |
1473 | n = HFLEN | |
1474 | validate 1., 1., 1., 1. | |
1475 | ||
1476 | aarch64 = dup | |
1477 | generate float64x2_t, float64x1_t:float64x2_t | |
1478 | ||
1479 | arm = vdup.l | |
1480 | generate float*_t, float32x4_t:float32x2_t, float32x2_t:float32x4_t | |
1481 | ||
1482 | /// Set all vector lanes to the same value | |
1483 | name = vdup | |
1484 | lane-suffixes | |
1485 | constn = N | |
1486 | multi_fn = static_assert_imm-in_exp_len-N | |
1487 | multi_fn = a | |
1488 | a = 0 | |
1489 | n = HFLEN | |
1490 | validate 0 | |
1491 | ||
1492 | aarch64 = nop | |
1493 | generate poly64x1_t | |
1494 | ||
1495 | arm = nop | |
1496 | generate int64x1_t, uint64x1_t | |
1497 | ||
1498 | /// Set all vector lanes to the same value | |
1499 | name = vdup | |
1500 | lane-suffixes | |
1501 | constn = N | |
1502 | multi_fn = static_assert_imm-in_exp_len-N | |
1503 | multi_fn = a | |
1504 | a = 0. | |
1505 | n = HFLEN | |
1506 | validate 0. | |
1507 | ||
1508 | aarch64 = nop | |
1509 | generate float64x1_t | |
1510 | ||
1511 | /// Set all vector lanes to the same value | |
1512 | name = vdup | |
1513 | lane-suffixes | |
1514 | constn = N | |
1515 | multi_fn = static_assert_imm-in_exp_len-N | |
c620b35d | 1516 | multi_fn = transmute--<element_t _>, {simd_extract!, a, N as u32} |
1517 | a = 0, 1 |
1518 | n = HFLEN | |
1519 | validate 1 | |
1520 | ||
1521 | aarch64 = nop | |
1522 | generate poly64x2_t:poly64x1_t | |
1523 | ||
1524 | arm = vmov | |
1525 | generate int64x2_t:int64x1_t, uint64x2_t:uint64x1_t | |
1526 | ||
1527 | /// Set all vector lanes to the same value | |
1528 | name = vdup | |
1529 | lane-suffixes | |
1530 | constn = N | |
1531 | multi_fn = static_assert_imm-in_exp_len-N | |
c620b35d | 1532 | multi_fn = transmute--<element_t _>, {simd_extract!, a, N as u32} |
1533 | a = 0., 1. |
1534 | n = HFLEN | |
1535 | validate 1. | |
1536 | ||
1537 | aarch64 = nop | |
1538 | generate float64x2_t:float64x1_t | |
1539 | ||
1540 | /// Set all vector lanes to the same value | |
1541 | name = vdup | |
1542 | lane-suffixes | |
1543 | constn = N | |
1544 | multi_fn = static_assert_imm-in_exp_len-N | |
c620b35d | 1545 | multi_fn = simd_extract!, a, N as u32 |
1546 | a = 1, 1, 1, 4, 1, 6, 7, 8, 1, 10, 11, 12, 13, 14, 15, 16 |
1547 | n = HFLEN | |
1548 | validate 1 | |
1549 | ||
1550 | aarch64 = nop | |
1551 | generate int8x8_t:i8, int8x16_t:i8, int16x4_t:i16, int16x8_t:i16, int32x2_t:i32, int32x4_t:i32, int64x1_t:i64, int64x2_t:i64 | |
1552 | generate uint8x8_t:u8, uint8x16_t:u8, uint16x4_t:u16, uint16x8_t:u16, uint32x2_t:u32, uint32x4_t:u32, uint64x1_t:u64, uint64x2_t:u64 | |
1553 | generate poly8x8_t:p8, poly8x16_t:p8, poly16x4_t:p16, poly16x8_t:p16 | |
1554 | ||
1555 | /// Set all vector lanes to the same value | |
1556 | name = vdup | |
1557 | lane-suffixes | |
1558 | constn = N | |
1559 | multi_fn = static_assert_imm-in_exp_len-N | |
c620b35d | 1560 | multi_fn = simd_extract!, a, N as u32 |
1561 | a = 1., 1., 1., 4. |
1562 | n = HFLEN | |
1563 | validate 1. | |
1564 | ||
1565 | aarch64 = nop | |
1566 | generate float32x2_t:f32, float32x4_t:f32, float64x1_t:f64, float64x2_t:f64 | |
1567 | ||
1568 | /// Extract vector from pair of vectors | |
1569 | name = vext | |
1570 | constn = N | |
1571 | multi_fn = static_assert_imm-out_exp_len-N | |
353b0b11 | 1572 | multi_fn = matchn-out_exp_len-N, simd_shuffle!, a, b, {asc-n-out_len} |
1573 | a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
1574 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1575 | n = LEN_M1 | |
1576 | validate 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1577 | |
1578 | arm = "vext.8" | |
1579 | aarch64 = ext | |
1580 | generate int*_t, uint*_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t | |
1581 | ||
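// Illustration: `asc-n-out_len` is an ascending index list starting at N
// over the concatenation of a and b, so the N = 3 match arm of
// vext_s8::<3> expands to roughly:
//
//     simd_shuffle!(a, b, [3, 4, 5, 6, 7, 8, 9, 10])
//     // indices 3..=7 take a[3..], 8..=10 wrap into b[0..=2]
//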
1582 | /// Extract vector from pair of vectors | |
1583 | name = vext | |
1584 | constn = N | |
1585 | multi_fn = static_assert_imm-out_exp_len-N | |
353b0b11 | 1586 | multi_fn = matchn-out_exp_len-N, simd_shuffle!, a, b, {asc-n-out_len} |
1587 | a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 |
1588 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1589 | n = LEN_M1 | |
1590 | validate 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1591 | |
1592 | aarch64 = ext | |
1593 | generate poly64x2_t | |
1594 | ||
1595 | arm = vmov | |
1596 | generate int64x2_t, uint64x2_t | |
1597 | ||
1598 | /// Extract vector from pair of vectors | |
1599 | name = vext | |
1600 | constn = N | |
1601 | multi_fn = static_assert_imm-out_exp_len-N | |
353b0b11 | 1602 | multi_fn = matchn-out_exp_len-N, simd_shuffle!, a, b, {asc-n-out_len} |
1603 | a = 1., 1., 1., 1. |
1604 | b = 2., 2., 2., 2. | |
1605 | n = LEN_M1 | |
1606 | validate 1., 2., 2., 2. | |
1607 | |
1608 | aarch64 = ext | |
1609 | generate float64x2_t | |
1610 | ||
1611 | arm = "vext.8" | |
1612 | generate float*_t | |
1613 | ||
1614 | /// Multiply-add to accumulator | |
1615 | name = vmla | |
1616 | multi_fn = simd_add, a, {simd_mul, b, c} | |
1617 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1618 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1619 | c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 | |
1620 | validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1621 | ||
1622 | arm = vmla. | |
1623 | aarch64 = mla | |
1624 | generate int*_t, uint*_t | |
1625 | ||
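// Worked example: the multi_fn above computes r[i] = a[i] + b[i] * c[i]
// per lane, so with the test vectors lane 0 is 0 + 2 * 3 = 6 and lane 1
// is 1 + 2 * 3 = 7, matching the validate line.
//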
1626 | /// Floating-point multiply-add to accumulator | |
1627 | name = vmla | |
1628 | multi_fn = simd_add, a, {simd_mul, b, c} | |
1629 | a = 0., 1., 2., 3. | |
1630 | b = 2., 2., 2., 2. | |
1631 | c = 3., 3., 3., 3. | |
1632 | validate 6., 7., 8., 9. | |
1633 | ||
1634 | aarch64 = fmul | |
1635 | generate float64x*_t | |
1636 | ||
1637 | arm = vmla. | |
1638 | generate float*_t | |
1639 | ||
1640 | /// Vector multiply accumulate with scalar | |
1641 | name = vmla | |
1642 | n-suffix | |
1643 | multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c} | |
1644 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1645 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1646 | c = 3 | |
1647 | validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1648 | ||
1649 | aarch64 = mla | |
1650 | arm = vmla. | |
1651 | generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t | |
1652 | generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t | |
1653 | ||
1654 | /// Vector multiply accumulate with scalar | |
1655 | name = vmla | |
1656 | n-suffix | |
1657 | multi_fn = vmla-self-noext, a, b, {vdup-nself-noext, c} | |
1658 | a = 0., 1., 2., 3. | |
1659 | b = 2., 2., 2., 2. | |
1660 | c = 3. | |
1661 | validate 6., 7., 8., 9. | |
1662 | ||
1663 | aarch64 = fmul | |
1664 | arm = vmla. | |
1665 | generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t | |
1666 | ||
1667 | /// Vector multiply accumulate with scalar | |
1668 | name = vmla | |
1669 | in2-lane-suffixes | |
1670 | constn = LANE | |
1671 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 1672 | multi_fn = vmla-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}} |
1673 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
1674 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1675 | c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
1676 | n = 1 | |
1677 | validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1678 | ||
1679 | aarch64 = mla | |
1680 | arm = vmla. | |
1681 | generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t | |
1682 | generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t | |
1683 | generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t | |
1684 | generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t | |
1685 | ||
1686 | /// Vector multiply accumulate with scalar | |
1687 | name = vmla | |
1688 | in2-lane-suffixes | |
1689 | constn = LANE | |
1690 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 1691 | multi_fn = vmla-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}} |
1692 | a = 0., 1., 2., 3. |
1693 | b = 2., 2., 2., 2. | |
1694 | c = 0., 3., 0., 0. | |
1695 | n = 1 | |
1696 | validate 6., 7., 8., 9. | |
1697 | ||
1698 | aarch64 = fmul | |
1699 | arm = vmla. | |
1700 | generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
1701 | ||
1702 | /// Signed multiply-add long | |
1703 | name = vmlal | |
1704 | multi_fn = simd_add, a, {vmull-self-noext, b, c} | |
1705 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1706 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1707 | c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 | |
1708 | validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1709 | ||
1710 | arm = vmlal.s | |
1711 | aarch64 = smlal | |
1712 | generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t | |
1713 | ||
1714 | /// Unsigned multiply-add long | |
1715 | name = vmlal | |
1716 | multi_fn = simd_add, a, {vmull-self-noext, b, c} | |
1717 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1718 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1719 | c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 | |
1720 | validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1721 | ||
1722 | arm = vmlal.s | |
1723 | aarch64 = umlal | |
1724 | generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t | |
1725 | ||
1726 | /// Vector widening multiply accumulate with scalar | |
1727 | name = vmlal | |
1728 | n-suffix | |
1729 | multi_fn = vmlal-self-noext, a, b, {vdup-nself-noext, c} | |
1730 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1731 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1732 | c = 3 | |
1733 | validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1734 | ||
1735 | arm = vmlal.s | |
1736 | aarch64 = smlal | |
1737 | generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t | |
1738 | aarch64 = umlal | |
1739 | generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t | |
1740 | ||
1741 | /// Vector widening multiply accumulate with scalar | |
1742 | name = vmlal_lane | |
1743 | in2-suffix | |
1744 | constn = LANE | |
1745 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 1746 | multi_fn = vmlal-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}} |
1747 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
1748 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1749 | c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
1750 | n = 1 | |
1751 | validate 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1752 | ||
1753 | arm = vmlal.s | |
1754 | aarch64 = smlal | |
1755 | generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t | |
1756 | generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t | |
1757 | aarch64 = umlal | |
1758 | generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t | |
1759 | generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t | |
1760 | ||
1761 | /// Signed multiply-add long | |
1762 | name = vmlal_high | |
1763 | no-q | |
1764 | multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right} |
1765 | multi_fn = simd_shuffle!, c:half, c, c, {fixed-half-right} | |
1766 | multi_fn = vmlal-noqself-noext, a, b, c |
1767 | a = 8, 7, 6, 5, 4, 3, 2, 1 | |
1768 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1769 | c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 | |
1770 | fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1771 | validate 8, 9, 10, 11, 12, 13, 14, 15 | |
1772 | ||
1773 | aarch64 = smlal2 | |
1774 | generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t | |
1775 | ||
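// Illustration: `fixed-half-right` selects the upper half of each quad
// input before the widening multiply-add, i.e. for an int8x16_t operand
// the shuffles above reduce b and c to their high eight lanes, roughly:
//
//     let b: int8x8_t = simd_shuffle!(b, b, [8, 9, 10, 11, 12, 13, 14, 15]);
//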
1776 | /// Unsigned multiply-add long | |
1777 | name = vmlal_high | |
1778 | no-q | |
1779 | multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right} |
1780 | multi_fn = simd_shuffle!, c:half, c, c, {fixed-half-right} | |
1781 | multi_fn = vmlal-noqself-noext, a, b, c |
1782 | a = 8, 7, 6, 5, 4, 3, 2, 1 | |
1783 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1784 | c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 | |
1785 | fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1786 | validate 8, 9, 10, 11, 12, 13, 14, 15 | |
1787 | ||
1788 | aarch64 = umlal2 | |
1789 | generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t | |
1790 | ||
1791 | /// Multiply-add long | |
1792 | name = vmlal_high_n | |
1793 | no-q | |
1794 | multi_fn = vmlal_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c} | |
1795 | a = 8, 7, 6, 5, 4, 3, 2, 1 | |
1796 | b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 | |
1797 | c = 2 | |
1798 | validate 8, 9, 10, 11, 12, 13, 14, 15 | |
1799 | ||
1800 | aarch64 = smlal2 | |
1801 | generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t | |
1802 | aarch64 = umlal2 | |
1803 | generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t | |
1804 | ||
1805 | /// Multiply-add long | |
1806 | name = vmlal_high_lane | |
1807 | in2-suffix | |
1808 | constn = LANE | |
1809 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 1810 | multi_fn = vmlal_high-noqself-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}} |
1811 | a = 8, 7, 6, 5, 4, 3, 2, 1 |
1812 | b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 | |
1813 | c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
1814 | n = 1 | |
1815 | validate 8, 9, 10, 11, 12, 13, 14, 15 | |
1816 | ||
1817 | aarch64 = smlal2 | |
1818 | generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t | |
1819 | generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t | |
1820 | aarch64 = umlal2 | |
1821 | generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t | |
1822 | generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t | |
1823 | ||
1824 | /// Multiply-subtract from accumulator | |
1825 | name = vmls | |
1826 | multi_fn = simd_sub, a, {simd_mul, b, c} | |
1827 | a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1828 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1829 | c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 | |
1830 | validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1831 | ||
1832 | arm = vmls. | |
1833 | aarch64 = mls | |
1834 | generate int*_t, uint*_t | |
1835 | ||
1836 | /// Floating-point multiply-subtract from accumulator | |
1837 | name = vmls | |
1838 | multi_fn = simd_sub, a, {simd_mul, b, c} | |
1839 | a = 6., 7., 8., 9. | |
1840 | b = 2., 2., 2., 2. | |
1841 | c = 3., 3., 3., 3. | |
1842 | validate 0., 1., 2., 3. | |
1843 | ||
1844 | aarch64 = fmul | |
1845 | generate float64x*_t | |
1846 | ||
1847 | arm = vmls. | |
1848 | generate float*_t | |
1849 | ||
1850 | /// Vector multiply subtract with scalar | |
1851 | name = vmls | |
1852 | n-suffix | |
1853 | multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c} | |
1854 | a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1855 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1856 | c = 3 | |
1857 | validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1858 | ||
1859 | aarch64 = mls | |
1860 | arm = vmls. | |
1861 | generate int16x4_t:int16x4_t:i16:int16x4_t, int16x8_t:int16x8_t:i16:int16x8_t, int32x2_t:int32x2_t:i32:int32x2_t, int32x4_t:int32x4_t:i32:int32x4_t | |
1862 | generate uint16x4_t:uint16x4_t:u16:uint16x4_t, uint16x8_t:uint16x8_t:u16:uint16x8_t, uint32x2_t:uint32x2_t:u32:uint32x2_t, uint32x4_t:uint32x4_t:u32:uint32x4_t | |
1863 | ||
1864 | /// Vector multiply subtract with scalar | |
1865 | name = vmls | |
1866 | n-suffix | |
1867 | multi_fn = vmls-self-noext, a, b, {vdup-nself-noext, c} | |
1868 | a = 6., 7., 8., 9. | |
1869 | b = 2., 2., 2., 2. | |
1870 | c = 3. | |
1871 | validate 0., 1., 2., 3. | |
1872 | ||
1873 | aarch64 = fmul | |
1874 | arm = vmls. | |
1875 | generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t | |
1876 | ||
1877 | /// Vector multiply subtract with scalar | |
1878 | name = vmls | |
1879 | in2-lane-suffixes | |
1880 | constn = LANE | |
1881 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 1882 | multi_fn = vmls-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}} |
1883 | a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
1884 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1885 | c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
1886 | n = 1 | |
1887 | validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1888 | ||
1889 | aarch64 = mls | |
1890 | arm = vmls. | |
1891 | generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t | |
1892 | generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t | |
1893 | generate uint16x4_t, uint16x4_t:uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t | |
1894 | generate uint32x2_t, uint32x2_t:uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t | |
1895 | ||
1896 | /// Vector multiply subtract with scalar | |
1897 | name = vmls | |
1898 | in2-lane-suffixes | |
1899 | constn = LANE | |
1900 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 1901 | multi_fn = vmls-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}} |
1902 | a = 6., 7., 8., 9. |
1903 | b = 2., 2., 2., 2. | |
1904 | c = 0., 3., 0., 0. | |
1905 | n = 1 | |
1906 | validate 0., 1., 2., 3. | |
1907 | ||
1908 | aarch64 = fmul | |
1909 | arm = vmls. | |
1910 | generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
1911 | ||
1912 | /// Signed multiply-subtract long | |
1913 | name = vmlsl | |
1914 | multi_fn = simd_sub, a, {vmull-self-noext, b, c} | |
1915 | a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1916 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1917 | c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 | |
1918 | validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1919 | ||
1920 | arm = vmlsl.s | |
1921 | aarch64 = smlsl | |
1922 | generate int16x8_t:int8x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t | |
1923 | ||
1924 | /// Unsigned multiply-subtract long | |
1925 | name = vmlsl | |
1926 | multi_fn = simd_sub, a, {vmull-self-noext, b, c} | |
1927 | a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1928 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1929 | c = 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 | |
1930 | validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1931 | ||
1932 | arm = vmlsl.s | |
1933 | aarch64 = umlsl | |
1934 | generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t | |
1935 | ||
1936 | /// Vector widening multiply subtract with scalar | |
1937 | name = vmlsl | |
1938 | n-suffix | |
1939 | multi_fn = vmlsl-self-noext, a, b, {vdup-nself-noext, c} | |
1940 | a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 | |
1941 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1942 | c = 3 | |
1943 | validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1944 | ||
1945 | arm = vmlsl.s | |
1946 | aarch64 = smlsl | |
1947 | generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t | |
1948 | aarch64 = umlsl | |
1949 | generate uint32x4_t:uint16x4_t:u16:uint32x4_t, uint64x2_t:uint32x2_t:u32:uint64x2_t | |
1950 | ||
1951 | /// Vector widening multiply subtract with scalar | |
1952 | name = vmlsl_lane | |
1953 | in2-suffix | |
1954 | constn = LANE | |
1955 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 1956 | multi_fn = vmlsl-self-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}} |
1957 | a = 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 |
1958 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1959 | c = 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
1960 | n = 1 | |
1961 | validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1962 | ||
1963 | arm = vmlsl.s | |
1964 | aarch64 = smlsl | |
1965 | generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int32x4_t:int16x4_t:int16x8_t:int32x4_t | |
1966 | generate int64x2_t:int32x2_t:int32x2_t:int64x2_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t | |
1967 | aarch64 = umlsl | |
1968 | generate uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x4_t:uint16x8_t:uint32x4_t | |
1969 | generate uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x2_t:uint32x4_t:uint64x2_t | |
1970 | ||
1971 | /// Signed multiply-subtract long | |
1972 | name = vmlsl_high | |
1973 | no-q | |
1974 | multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right} |
1975 | multi_fn = simd_shuffle!, c:half, c, c, {fixed-half-right} | |
1976 | multi_fn = vmlsl-noqself-noext, a, b, c |
1977 | a = 14, 15, 16, 17, 18, 19, 20, 21 | |
1978 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1979 | c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 | |
1980 | fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1981 | validate 14, 13, 12, 11, 10, 9, 8, 7 | |
1982 | ||
1983 | aarch64 = smlsl2 | |
1984 | generate int16x8_t:int8x16_t:int8x16_t:int16x8_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t | |
1985 | ||
1986 | /// Unsigned multiply-subtract long | |
1987 | name = vmlsl_high | |
1988 | no-q | |
1989 | multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right} |
1990 | multi_fn = simd_shuffle!, c:half, c, c, {fixed-half-right} | |
1991 | multi_fn = vmlsl-noqself-noext, a, b, c |
1992 | a = 14, 15, 16, 17, 18, 19, 20, 21 | |
1993 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
1994 | c = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 | |
1995 | fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
1996 | validate 14, 13, 12, 11, 10, 9, 8, 7 | |
1997 | ||
1998 | aarch64 = umlsl2 | |
1999 | generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t | |
2000 | ||
2001 | /// Multiply-subtract long | |
2002 | name = vmlsl_high_n | |
2003 | no-q | |
2004 | multi_fn = vmlsl_high-noqself-noext, a, b, {vdupq_n-noqself-noext, c} | |
2005 | a = 14, 15, 16, 17, 18, 19, 20, 21 | |
2006 | b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 | |
2007 | c = 2 | |
2008 | validate 14, 13, 12, 11, 10, 9, 8, 7 | |
2009 | ||
2010 | aarch64 = smlsl2 | |
2011 | generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t | |
2012 | aarch64 = umlsl2 | |
2013 | generate uint32x4_t:uint16x8_t:u16:uint32x4_t, uint64x2_t:uint32x4_t:u32:uint64x2_t | |
2014 | ||
2015 | /// Multiply-subtract long | |
2016 | name = vmlsl_high_lane | |
2017 | in2-suffix | |
2018 | constn = LANE | |
2019 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 2020 | multi_fn = vmlsl_high-noqself-noext, a, b, {simd_shuffle!, c, c, {dup-in_len-LANE as u32}} |
2021 | a = 14, 15, 16, 17, 18, 19, 20, 21 |
2022 | b = 3, 3, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7 | |
2023 | c = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
2024 | n = 1 | |
2025 | validate 14, 13, 12, 11, 10, 9, 8, 7 | |
2026 | ||
2027 | aarch64 = smlsl2 | |
2028 | generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t | |
2029 | generate int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t | |
2030 | aarch64 = umlsl2 | |
2031 | generate uint32x4_t:uint16x8_t:uint16x4_t:uint32x4_t, uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t | |
2032 | generate uint64x2_t:uint32x4_t:uint32x2_t:uint64x2_t, uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t | |
2033 | ||
2034 | /// Extract narrow | |
2035 | name = vmovn_high | |
2036 | no-q | |
2037 | multi_fn = simd_cast, c:in_t0, b | |
353b0b11 | 2038 | multi_fn = simd_shuffle!, a, c, {asc-0-out_len} |
2039 | a = 0, 1, 2, 3, 2, 3, 4, 5 |
2040 | b = 2, 3, 4, 5, 12, 13, 14, 15 | |
2041 | validate 0, 1, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 12, 13, 14, 15 | |
2042 | ||
2043 | aarch64 = xtn2 | |
2044 | generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t | |
2045 | generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t | |
2046 | ||
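// Illustration: vmovn_high narrows b and appends the result to a; for an
// int16x8_t b the two multi_fn steps above are roughly:
//
//     let c: int8x8_t = simd_cast(b);    // truncate each 16-bit lane to 8 bits
//     simd_shuffle!(a, c, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
//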
2047 | /// Negate | |
2048 | name = vneg | |
2049 | fn = simd_neg | |
2050 | a = 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 8 | |
2051 | validate 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7, -8 | |
2052 | ||
2053 | aarch64 = neg | |
2054 | generate int64x*_t | |
2055 | ||
2056 | arm = vneg.s | |
2057 | generate int*_t | |
2058 | ||
2059 | /// Negate |
2060 | name = vneg | |
a2a8927a | 2061 | multi_fn = a.wrapping_neg() |
2062 | a = 1 |
2063 | validate -1 | |
2064 | ||
2065 | aarch64 = neg | |
2066 | generate i64 | |
2067 | ||
2068 | /// Negate |
2069 | name = vneg | |
2070 | fn = simd_neg | |
2071 | a = 0., 1., -1., 2., -2., 3., -3., 4. | |
2072 | validate 0., -1., 1., -2., 2., -3., 3., -4. | |
2073 | ||
2074 | aarch64 = fneg | |
2075 | generate float64x*_t | |
2076 | ||
2077 | arm = vneg.s | |
2078 | generate float*_t | |
2079 | ||
2080 | /// Signed saturating negate | |
2081 | name = vqneg | |
2082 | a = MIN, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7 | |
2083 | validate MAX, 0, -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7 | |
2084 | link-arm = vqneg._EXT_ | |
2085 | link-aarch64 = sqneg._EXT_ | |
2086 | ||
2087 | aarch64 = sqneg | |
2088 | generate int64x*_t | |
2089 | ||
2090 | arm = vqneg.s | |
2091 | generate int*_t | |
2092 | ||
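// Worked edge case: plain negation of MIN wraps (for i8, -(-128) stays
// -128), while the saturating form clamps to MAX (127) -- hence the
// MIN -> MAX pair at the head of the validate line above.
//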
2093 | /// Signed saturating negate |
2094 | name = vqneg | |
c620b35d | 2095 | multi_fn = simd_extract!, {vqneg-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0 |
2096 | a = 1 |
2097 | validate -1 | |
2098 | ||
2099 | aarch64 = sqneg | |
2100 | generate i8, i16, i32, i64 | |
2101 | ||
2102 | /// Saturating subtract |
2103 | name = vqsub | |
2104 | a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 | |
2105 | b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
2106 | validate 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26 | |
2107 | ||
2108 | arm = vqsub.s | |
2109 | aarch64 = uqsub | |
2110 | link-arm = llvm.usub.sat._EXT_ | |
2111 | link-aarch64 = uqsub._EXT_ | |
2112 | generate uint*_t, uint64x*_t | |
2113 | ||
2114 | arm = vqsub.s | |
2115 | aarch64 = sqsub | |
2116 | link-arm = llvm.ssub.sat._EXT_ | |
2117 | link-aarch64 = sqsub._EXT_ | |
2118 | generate int*_t, int64x*_t | |
2119 | ||
2120 | /// Saturating subtract | |
2121 | name = vqsub | |
2122 | multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a | |
2123 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b | |
c620b35d | 2124 | multi_fn = simd_extract!, {vqsub-in_ntt-noext, a, b}, 0 |
2125 | a = 42 |
2126 | b = 1 | |
2127 | validate 41 | |
2128 | ||
2129 | aarch64 = sqsub | |
2130 | generate i8, i16 | |
2131 | aarch64 = uqsub | |
2132 | generate u8, u16 | |
2133 | ||
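// Illustration of the dup/extract idiom above: the 8/16-bit scalar forms
// are routed through a one-off vector -- duplicate both scalars, do the
// saturating subtract on vectors, then pull lane 0 back out, roughly:
//
//     simd_extract!(vqsub_s8(vdup_n_s8(a), vdup_n_s8(b)), 0)
//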
2134 | /// Saturating subtract | |
2135 | name = vqsub | |
2136 | a = 42 | |
2137 | b = 1 | |
2138 | validate 41 | |
2139 | ||
2140 | aarch64 = uqsub | |
2141 | link-aarch64 = uqsub._EXT_ | |
2142 | generate u32, u64 | |
2143 | ||
2144 | aarch64 = sqsub | |
2145 | link-aarch64 = sqsub._EXT_ | |
2146 | generate i32, i64 | |
2147 | ||
2148 | /// Halving add | |
2149 | name = vhadd | |
2150 | a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 | |
2151 | b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
2152 | validate 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29 | |
2153 | ||
2154 | arm = vhadd.s | |
2155 | aarch64 = uhadd | |
2156 | link-aarch64 = uhadd._EXT_ | |
2157 | link-arm = vhaddu._EXT_ | |
2158 | generate uint*_t | |
2159 | ||
2160 | arm = vhadd.s | |
2161 | aarch64 = shadd | |
2162 | link-aarch64 = shadd._EXT_ | |
2163 | link-arm = vhadds._EXT_ | |
2164 | generate int*_t | |
2165 | ||
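// Worked example: halving add computes (a + b) >> 1 in double-width
// arithmetic, so no intermediate overflow is possible; lane 0 above is
// (42 + 1) >> 1 = 21 and lane 1 is (42 + 2) >> 1 = 22.
//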
2166 | /// Reverse bit order | |
2167 | name = vrbit | |
2168 | a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 | |
2169 | validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 | |
2170 | ||
2171 | aarch64 = rbit | |
2172 | link-aarch64 = rbit._EXT_ | |
2173 | ||
2174 | generate int8x8_t, int8x16_t | |
2175 | ||
2176 | /// Reverse bit order | |
2177 | name = vrbit | |
2178 | multi_fn = transmute, {vrbit-signed-noext, transmute(a)} | |
2179 | a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 | |
2180 | validate 0, 64, 32, 96, 16, 80, 48, 112, 8, 72, 40, 104, 24, 88, 56, 120 | |
2181 | ||
2182 | aarch64 = rbit | |
2183 | ||
2184 | generate uint8x8_t, uint8x16_t, poly8x8_t, poly8x16_t | |
2185 | ||
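// Worked example: vrbit reverses the bit order within each byte, so the
// input value 2 (0b0000_0010) becomes 64 (0b0100_0000) and the value
// 6 (0b0000_0110) becomes 96 (0b0110_0000).
//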
2186 | /// Rounding halving add | |
2187 | name = vrhadd | |
2188 | a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 | |
2189 | b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
2190 | validate 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29 | |
2191 | ||
2192 | arm = vrhadd.s | |
2193 | aarch64 = urhadd | |
2194 | link-arm = vrhaddu._EXT_ | |
2195 | link-aarch64 = urhadd._EXT_ | |
2196 | generate uint*_t | |
2197 | ||
2198 | arm = vrhadd.s | |
2199 | aarch64 = srhadd | |
2200 | link-arm = vrhadds._EXT_ | |
2201 | link-aarch64 = srhadd._EXT_ | |
2202 | generate int*_t | |
2203 | ||
2204 | /// Floating-point round to integral exact, using current rounding mode | |
2205 | name = vrndx | |
2206 | a = -1.5, 0.5, 1.5, 2.5 | |
2207 | validate -2.0, 0.0, 2.0, 2.0 | |
2208 | ||
2209 | aarch64 = frintx | |
2210 | link-aarch64 = llvm.rint._EXT_ | |
2211 | generate float*_t, float64x*_t | |
2212 | ||
2213 | /// Floating-point round to integral, to nearest with ties to away | |
2214 | name = vrnda | |
2215 | a = -1.5, 0.5, 1.5, 2.5 | |
2216 | validate -2.0, 1.0, 2.0, 3.0 | |
2217 | ||
2218 | aarch64 = frinta | |
2219 | link-aarch64 = llvm.round._EXT_ | |
2220 | generate float*_t, float64x*_t | |
2221 | ||
2222 | /// Floating-point round to integral, to nearest with ties to even | |
2223 | name = vrndn | |
2224 | a = -1.5, 0.5, 1.5, 2.5 | |
2225 | validate -2.0, 0.0, 2.0, 2.0 | |
2226 | ||
2227 | link-aarch64 = frintn._EXT_ | |
2228 | aarch64 = frintn | |
2229 | generate float64x*_t | |
2230 | ||
2231 | target = fp-armv8 | |
2232 | arm = vrintn | |
2233 | link-arm = vrintn._EXT_ | |
2234 | generate float*_t | |
2235 | ||
2236 | /// Floating-point round to integral, to nearest with ties to even |
2237 | name = vrndn | |
2238 | a = -1.5 | |
2239 | validate -2.0 | |
2240 | ||
2241 | aarch64 = frintn | |
2242 | link-aarch64 = llvm.roundeven._EXT_ | |
2243 | generate f32 | |
2244 | ||
2245 | /// Floating-point round to integral, toward minus infinity |
2246 | name = vrndm | |
2247 | a = -1.5, 0.5, 1.5, 2.5 | |
2248 | validate -2.0, 0.0, 1.0, 2.0 | |
2249 | ||
2250 | aarch64 = frintm | |
2251 | link-aarch64 = llvm.floor._EXT_ | |
2252 | generate float*_t, float64x*_t | |
2253 | ||
2254 | /// Floating-point round to integral, toward plus infinity | |
2255 | name = vrndp | |
2256 | a = -1.5, 0.5, 1.5, 2.5 | |
2257 | validate -1.0, 1.0, 2.0, 3.0 | |
2258 | ||
2259 | aarch64 = frintp | |
2260 | link-aarch64 = llvm.ceil._EXT_ | |
2261 | generate float*_t, float64x*_t | |
2262 | ||
2263 | /// Floating-point round to integral, toward zero | |
2264 | name = vrnd | |
2265 | a = -1.5, 0.5, 1.5, 2.5 | |
2266 | validate -1.0, 0.0, 1.0, 2.0 | |
2267 | ||
2268 | aarch64 = frintz | |
2269 | link-aarch64 = llvm.trunc._EXT_ | |
2270 | generate float*_t, float64x*_t | |
2271 | ||
2272 | /// Floating-point round to integral, using current rounding mode | |
2273 | name = vrndi | |
2274 | a = -1.5, 0.5, 1.5, 2.5 | |
2275 | validate -2.0, 0.0, 2.0, 2.0 | |
2276 | ||
2277 | aarch64 = frinti | |
2278 | link-aarch64 = llvm.nearbyint._EXT_ | |
2279 | generate float*_t, float64x*_t | |
2280 | ||
2281 | /// Saturating add | |
2282 | name = vqadd | |
2283 | a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 | |
2284 | b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
2285 | validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 | |
2286 | ||
2287 | arm = vqadd.s | |
2288 | aarch64 = uqadd | |
2289 | link-arm = llvm.uadd.sat._EXT_ | |
2290 | link-aarch64 = uqadd._EXT_ | |
2291 | generate uint*_t, uint64x*_t | |
2292 | ||
2293 | arm = vqadd.s | |
2294 | aarch64 = sqadd | |
2295 | link-arm = llvm.sadd.sat._EXT_ | |
2296 | link-aarch64 = sqadd._EXT_ | |
2297 | generate int*_t, int64x*_t | |
2298 | ||
2299 | /// Saturating add | |
2300 | name = vqadd | |
2301 | multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a | |
2302 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b | |
c620b35d | 2303 | multi_fn = simd_extract!, {vqadd-in_ntt-noext, a, b}, 0 |
2304 | a = 42 |
2305 | b = 1 | |
2306 | validate 43 | |
2307 | ||
2308 | aarch64 = sqadd | |
2309 | generate i8, i16 | |
2310 | aarch64 = uqadd | |
2311 | generate u8, u16 | |
2312 | ||
2313 | /// Saturating add | |
2314 | name = vqadd | |
2315 | a = 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 | |
2316 | b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
2317 | validate 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 | |
2318 | ||
2319 | aarch64 = uqadd | |
2320 | link-aarch64 = uqadd._EXT_ | |
2321 | generate u32, u64 | |
2322 | ||
2323 | aarch64 = sqadd | |
2324 | link-aarch64 = sqadd._EXT_ | |
2325 | generate i32, i64 | |
2326 | ||
2327 | /// Load multiple single-element structures to one, two, three, or four registers |
2328 | name = vld1 | |
2329 | out-suffix | |
2330 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 | |
2331 | validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 | |
2332 | load_fn | |
2333 | ||
2334 | aarch64 = ld1 | |
2335 | link-aarch64 = ld1x2._EXT2_ | |
2336 | arm = vld1 | |
2337 | link-arm = vld1x2._EXT2_ | |
2338 | generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t | |
2339 | generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t, *const i64:int64x2x2_t | |
2340 | ||
2341 | link-aarch64 = ld1x3._EXT2_ | |
2342 | link-arm = vld1x3._EXT2_ | |
2343 | generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t | |
2344 | generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t, *const i64:int64x2x3_t | |
2345 | ||
2346 | link-aarch64 = ld1x4._EXT2_ | |
2347 | link-arm = vld1x4._EXT2_ | |
2348 | generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t | |
2349 | generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t, *const i64:int64x2x4_t | |
2350 | ||
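// Usage sketch (hypothetical buffer): the x2/x3/x4 forms load that many
// consecutive full vectors from a single pointer:
//
//     let buf: [i8; 16] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16];
//     let pair: int8x8x2_t = vld1_s8_x2(buf.as_ptr());    // two int8x8_t registers
//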
2351 | /// Load multiple single-element structures to one, two, three, or four registers | |
2352 | name = vld1 | |
2353 | out-suffix | |
2354 | multi_fn = transmute, {vld1-outsigned-noext, transmute(a)} | |
2355 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 | |
2356 | validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 | |
2357 | ||
2358 | load_fn | |
2359 | aarch64 = ld1 | |
2360 | arm = vld1 | |
2361 | generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t | |
2362 | generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t, *const u64:uint64x2x2_t | |
2363 | generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t | |
2364 | generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t, *const u64:uint64x2x3_t | |
2365 | generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t | |
2366 | generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t, *const u64:uint64x2x4_t | |
2367 | generate *const p8:poly8x8x2_t, *const p8:poly8x8x3_t, *const p8:poly8x8x4_t | |
2368 | generate *const p8:poly8x16x2_t, *const p8:poly8x16x3_t, *const p8:poly8x16x4_t | |
2369 | generate *const p16:poly16x4x2_t, *const p16:poly16x4x3_t, *const p16:poly16x4x4_t | |
2370 | generate *const p16:poly16x8x2_t, *const p16:poly16x8x3_t, *const p16:poly16x8x4_t | |
2371 | target = aes | |
2372 | generate *const p64:poly64x1x2_t | |
3c0e092e | 2373 | arm = nop |
2374 | generate *const p64:poly64x1x3_t, *const p64:poly64x1x4_t |
2375 | generate *const p64:poly64x2x2_t, *const p64:poly64x2x3_t, *const p64:poly64x2x4_t | |
2376 | /// Load multiple single-element structures to one, two, three, or four registers |
2377 | name = vld1 | |
2378 | out-suffix | |
2379 | a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. | |
2380 | validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. | |
2381 | load_fn | |
2382 | ||
2383 | aarch64 = ld1 | |
2384 | link-aarch64 = ld1x2._EXT2_ | |
2385 | generate *const f64:float64x1x2_t, *const f64:float64x2x2_t | |
2386 | ||
2387 | link-aarch64 = ld1x3._EXT2_ | |
2388 | generate *const f64:float64x1x3_t, *const f64:float64x2x3_t | |
2389 | ||
2390 | link-aarch64 = ld1x4._EXT2_ | |
2391 | generate *const f64:float64x1x4_t, *const f64:float64x2x4_t | |
2392 | ||
2393 | arm = vld1 | |
2394 | link-aarch64 = ld1x2._EXT2_ | |
2395 | link-arm = vld1x2._EXT2_ | |
2396 | generate *const f32:float32x2x2_t, *const f32:float32x4x2_t | |
2397 | ||
2398 | link-aarch64 = ld1x3._EXT2_ | |
2399 | link-arm = vld1x3._EXT2_ | |
2400 | generate *const f32:float32x2x3_t, *const f32:float32x4x3_t | |
2401 | ||
2402 | link-aarch64 = ld1x4._EXT2_ | |
2403 | link-arm = vld1x4._EXT2_ | |
2404 | generate *const f32:float32x2x4_t, *const f32:float32x4x4_t | |
2405 | ||
2406 | /// Load multiple 2-element structures to two registers | |
2407 | name = vld2 | |
2408 | out-nox | |
2409 | a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 | |
2410 | validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 | |
2411 | load_fn | |
3c0e092e | 2412 | arm-aarch64-separate |
c295e0f8 XL |
2413 | |
2414 | aarch64 = ld2 | |
2415 | link-aarch64 = ld2._EXTv2_ | |
2416 | generate *const i64:int64x2x2_t |
2417 | ||
2418 | arm = vld2 |
2419 | link-arm = vld2._EXTpi82_ | |
2420 | generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t |
2421 | generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t | |
2422 | arm = nop | |
2423 | aarch64 = nop | |
2424 | generate *const i64:int64x1x2_t | |
2425 | |
2426 | /// Load multiple 2-element structures to two registers | |
2427 | name = vld2 | |
2428 | out-nox | |
2429 | multi_fn = transmute, {vld2-outsignednox-noext, transmute(a)} | |
2430 | a = 0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 | |
2431 | validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 | |
2432 | load_fn | |
2433 | ||
2434 | aarch64 = ld2 | |
2435 | generate *const u64:uint64x2x2_t |
2436 | target = aes | |
2437 | generate *const p64:poly64x2x2_t | |
2438 | ||
2439 | target = default | |
c295e0f8 | 2440 | arm = vld2 |
2441 | generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t |
2442 | generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t | |
2443 | generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t | |
2444 | arm = nop | |
2445 | aarch64 = nop | |
2446 | generate *const u64:uint64x1x2_t | |
2447 | target = aes | |
2448 | generate *const p64:poly64x1x2_t | |
2449 | ||
2450 | |
2451 | /// Load multiple 2-element structures to two registers | |
2452 | name = vld2 | |
2453 | out-nox | |
2454 | a = 0., 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9. | |
2455 | validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. | |
2456 | load_fn | |
3c0e092e | 2457 | arm-aarch64-separate |
c295e0f8 | 2458 | |
3c0e092e | 2459 | aarch64 = nop |
c295e0f8 | 2460 | link-aarch64 = ld2._EXTv2_ |
2461 | generate *const f64:float64x1x2_t |
2462 | aarch64 = ld2 | |
2463 | generate *const f64:float64x2x2_t | |
2464 | |
2465 | arm = vld2 | |
2466 | link-arm = vld2._EXTpi82_ | |
3c0e092e | 2467 | generate *const f32:float32x2x2_t, *const f32:float32x4x2_t |
2468 | |
2469 | /// Load single 2-element structure and replicate to all lanes of two registers | |
2470 | name = vld2 | |
2471 | out-dup-nox | |
2472 | a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 | |
2473 | validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
2474 | load_fn | |
3c0e092e | 2475 | arm-aarch64-separate |
c295e0f8 | 2476 | |
2477 | aarch64 = ld2r |
2478 | link-aarch64 = ld2r._EXT2_ | |
2479 | generate *const i64:int64x2x2_t |
2480 | ||
2481 | arm = vld2 | |
2482 | link-arm = vld2dup._EXTpi82_ | |
2483 | generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t | |
2484 | generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t | |
2485 | arm = nop | |
2486 | generate *const i64:int64x1x2_t | |
2487 | |
2488 | /// Load single 2-element structure and replicate to all lanes of two registers | |
2489 | name = vld2 | |
2490 | out-dup-nox | |
2491 | multi_fn = transmute, {vld2-outsigneddupnox-noext, transmute(a)} | |
2492 | a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 | |
2493 | validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
2494 | load_fn | |
2495 | ||
c295e0f8 | 2496 | aarch64 = ld2r |
2497 | generate *const u64:uint64x2x2_t |
2498 | target = aes | |
2499 | generate *const p64:poly64x2x2_t | |
2500 | ||
2501 | target = default | |
2502 | arm = vld2 | |
2503 | generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t | |
2504 | generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t | |
2505 | generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t | |
2506 | arm = nop | |
2507 | generate *const u64:uint64x1x2_t | |
2508 | target = aes | |
2509 | generate *const p64:poly64x1x2_t | |
2510 | |
2511 | /// Load single 2-element structure and replicate to all lanes of two registers | |
2512 | name = vld2 | |
2513 | out-dup-nox | |
2514 | a = 0., 1., 1., 2., 3., 1., 4., 3., 5. | |
2515 | validate 1., 1., 1., 1., 1., 1., 1., 1. | |
2516 | load_fn | |
3c0e092e | 2517 | arm-aarch64-separate |
2518 | |
2519 | aarch64 = ld2r | |
2520 | link-aarch64 = ld2r._EXT2_ | |
3c0e092e | 2521 | generate *const f64:float64x1x2_t, *const f64:float64x2x2_t |
c295e0f8 | 2522 | |
3c0e092e | 2523 | arm = vld2 |
c295e0f8 | 2524 | link-arm = vld2dup._EXTpi82_ |
3c0e092e | 2525 | generate *const f32:float32x2x2_t, *const f32:float32x4x2_t |
2526 | |
2527 | /// Load multiple 2-element structures to two registers | |
2528 | name = vld2 | |
2529 | out-lane-nox | |
2530 | multi_fn = static_assert_imm-in_exp_len-LANE | |
2531 | constn = LANE | |
2532 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
2533 | b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 | |
2534 | n = 0 | |
2535 | validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 | |
2536 | load_fn | |
2537 | arm-aarch64-separate | |
2538 | ||
3c0e092e | 2539 | aarch64 = ld2 |
2540 | const-aarch64 = LANE |
2541 | link-aarch64 = ld2lane._EXTpi82_ | |
3c0e092e | 2542 | generate *const i8:int8x16x2_t:int8x16x2_t, *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t |
c295e0f8 | 2543 | |
3c0e092e | 2544 | arm = vld2 |
2545 | const-arm = LANE |
2546 | link-arm = vld2lane._EXTpi82_ | |
2547 | generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t |
2548 | generate *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t | |
2549 | |
2550 | /// Load multiple 2-element structures to two registers | |
2551 | name = vld2 | |
2552 | out-lane-nox | |
2553 | multi_fn = static_assert_imm-in_exp_len-LANE | |
2554 | multi_fn = transmute, {vld2-outsignedlanenox-::<LANE>, transmute(a), transmute(b)} | |
2555 | constn = LANE | |
2556 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
2557 | b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 | |
2558 | n = 0 | |
2559 | validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 | |
2560 | load_fn | |
c295e0f8 | 2561 | |
3c0e092e | 2562 | aarch64 = ld2 |
2563 | const-aarch64 = LANE |
2564 | ||
2565 | target = aes | |
3c0e092e | 2566 | generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t |
2567 | |
2568 | target = default | |
2569 | generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t |
2570 | generate *const p8:poly8x16x2_t:poly8x16x2_t | |
c295e0f8 | 2571 | |
3c0e092e | 2572 | arm = vld2 |
c295e0f8 | 2573 | const-arm = LANE |
2574 | generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t |
2575 | generate *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t | |
2576 | generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t | |
2577 | generate *const p16:poly16x8x2_t:poly16x8x2_t | |
2578 | |
2579 | /// Load multiple 2-element structures to two registers | |
2580 | name = vld2 | |
2581 | out-lane-nox | |
2582 | multi_fn = static_assert_imm-in_exp_len-LANE | |
2583 | constn = LANE | |
2584 | a = 0., 1., 2., 3., 4., 5., 6., 7., 8. | |
2585 | b = 0., 2., 2., 14., 2., 16., 17., 18. | |
2586 | n = 0 | |
2587 | validate 1., 2., 2., 14., 2., 16., 17., 18. | |
2588 | load_fn | |
2589 | arm-aarch64-separate | |
2590 | ||
3c0e092e | 2591 | aarch64 = ld2 |
2592 | const-aarch64 = LANE |
2593 | link-aarch64 = ld2lane._EXTpi82_ | |
3c0e092e | 2594 | generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t |
c295e0f8 | 2595 | |
3c0e092e | 2596 | arm = vld2 |
2597 | const-arm = LANE |
2598 | link-arm = vld2lane._EXTpi82_ | |
3c0e092e | 2599 | generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t |
c295e0f8 | 2600 | |
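// Illustration: unlike the dup forms, the lane forms above reload only a
// single element pair from memory; with n = 0 just lane 0 of each of the
// two registers in b is replaced, which is why the validate lists keep the
// original b values in every other lane.
//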
2601 | /// Load multiple 3-element structures to three registers |
2602 | name = vld3 | |
2603 | out-nox | |
2604 | a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 | |
2605 | validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 | |
2606 | load_fn | |
2607 | arm-aarch64-separate |
2608 | ||
2609 | aarch64 = ld3 |
2610 | link-aarch64 = ld3._EXTv2_ | |
2611 | generate *const i64:int64x2x3_t | |
c295e0f8 | 2612 | |
2613 | arm = vld3 |
2614 | link-arm = vld3._EXTpi82_ | |
2615 | generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t | |
2616 | generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t | |
2617 | arm = nop | |
2618 | aarch64 = nop | |
2619 | generate *const i64:int64x1x3_t | |
c295e0f8 | 2620 | |
2621 | /// Load multiple 3-element structures to three registers |
2622 | name = vld3 | |
2623 | out-nox | |
2624 | multi_fn = transmute, {vld3-outsignednox-noext, transmute(a)} | |
2625 | a = 0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 | |
2626 | validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 | |
2627 | load_fn | |
2628 | ||
2629 | aarch64 = ld3 | |
2630 | generate *const u64:uint64x2x3_t | |
2631 | target = aes | |
2632 | generate *const p64:poly64x2x3_t | |
2633 | ||
2634 | target = default | |
2635 | arm = vld3 | |
2636 | generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t | |
2637 | generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t | |
2638 | generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t | |
2639 | arm = nop | |
2640 | aarch64 = nop | |
2641 | generate *const u64:uint64x1x3_t | |
2642 | target = aes | |
2643 | generate *const p64:poly64x1x3_t | |
2644 | ||
2645 | /// Load multiple 3-element structures to three registers | |
2646 | name = vld3 | |
2647 | out-nox | |
2648 | a = 0., 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8. | |
2649 | validate 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8. | |
2650 | load_fn | |
2651 | arm-aarch64-separate | |
2652 | ||
2653 | aarch64 = nop | |
2654 | link-aarch64 = ld3._EXTv2_ | |
2655 | generate *const f64:float64x1x3_t | |
2656 | aarch64 = ld3 | |
2657 | generate *const f64:float64x2x3_t | |
2658 | ||
2659 | arm = vld3 | |
2660 | link-arm = vld3._EXTpi82_ | |
2661 | generate *const f32:float32x2x3_t, *const f32:float32x4x3_t | |
2662 | ||
2663 | /// Load single 3-element structure and replicate to all lanes of three registers | |
2664 | name = vld3 | |
2665 | out-dup-nox | |
2666 | a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17 | |
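// ld3r reads a single 3-element structure (1, 1, 1) and replicates it to
// every lane of the three registers, so each result lane is 1.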
2667 | validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
2668 | load_fn | |
2669 | arm-aarch64-separate | |
2670 | ||
2671 | aarch64 = ld3r | |
2672 | link-aarch64 = ld3r._EXT2_ | |
2673 | generate *const i64:int64x2x3_t | |
2674 | ||
2675 | arm = vld3 | |
2676 | link-arm = vld3dup._EXTpi82_ | |
2677 | generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t | |
2678 | generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t | |
2679 | arm = nop | |
2680 | generate *const i64:int64x1x3_t | |
2681 | ||
2682 | /// Load single 3-element structure and replicate to all lanes of three registers | |
2683 | name = vld3 | |
2684 | out-dup-nox | |
2685 | multi_fn = transmute, {vld3-outsigneddupnox-noext, transmute(a)} | |
2686 | a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17 | |
2687 | validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
2688 | load_fn | |
2689 | ||
2690 | aarch64 = ld3r | |
2691 | generate *const u64:uint64x2x3_t | |
2692 | target = aes | |
2693 | generate *const p64:poly64x2x3_t | |
2694 | ||
2695 | target = default | |
2696 | arm = vld3 | |
2697 | generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t | |
2698 | generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t | |
2699 | generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t | |
2700 | arm = nop | |
2701 | generate *const u64:uint64x1x3_t | |
2702 | target = aes | |
2703 | generate *const p64:poly64x1x3_t | |
2704 | ||
2705 | /// Load single 3-element structure and replicate to all lanes of three registers | |
2706 | name = vld3 | |
2707 | out-dup-nox | |
2708 | a = 0., 1., 1., 1., 3., 1., 4., 3., 5., 1., 4., 3., 5. | |
2709 | validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. | |
2710 | load_fn | |
2711 | arm-aarch64-separate | |
2712 | ||
2713 | aarch64 = ld3r | |
2714 | link-aarch64 = ld3r._EXT2_ | |
2715 | generate *const f64:float64x1x3_t, *const f64:float64x2x3_t | |
2716 | ||
2717 | arm = vld3 | |
2718 | link-arm = vld3dup._EXTpi82_ | |
2719 | generate *const f32:float32x2x3_t, *const f32:float32x4x3_t | |
2720 | ||
2721 | /// Load multiple 3-element structures to three registers | |
2722 | name = vld3 | |
2723 | out-lane-nox | |
2724 | multi_fn = static_assert_imm-in_exp_len-LANE | |
2725 | constn = LANE | |
2726 | a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
2727 | b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
2728 | n = 0 | |
2729 | validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
2730 | load_fn | |
2731 | arm-aarch64-separate | |
2732 | ||
2733 | aarch64 = ld3 | |
2734 | const-aarch64 = LANE | |
2735 | link-aarch64 = ld3lane._EXTpi82_ | |
2736 | generate *const i8:int8x16x3_t:int8x16x3_t, *const i64:int64x1x3_t:int64x1x3_t, *const i64:int64x2x3_t:int64x2x3_t | |
2737 | ||
2738 | arm = vld3 | |
2739 | const-arm = LANE | |
2740 | link-arm = vld3lane._EXTpi82_ | |
2741 | generate *const i8:int8x8x3_t:int8x8x3_t, *const i16:int16x4x3_t:int16x4x3_t, *const i32:int32x2x3_t:int32x2x3_t | |
2742 | generate *const i16:int16x8x3_t:int16x8x3_t, *const i32:int32x4x3_t:int32x4x3_t | |
2743 | ||
2744 | /// Load multiple 3-element structures to three registers | |
2745 | name = vld3 | |
2746 | out-lane-nox | |
2747 | multi_fn = static_assert_imm-in_exp_len-LANE | |
2748 | multi_fn = transmute, {vld3-outsignedlanenox-::<LANE>, transmute(a), transmute(b)} | |
2749 | constn = LANE | |
2750 | a = 0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
2751 | b = 0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
2752 | n = 0 | |
2753 | validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
2754 | load_fn | |
2755 | ||
2756 | aarch64 = ld3 | |
2757 | const-aarch64 = LANE | |
2758 | target = aes | |
2759 | generate *const p64:poly64x1x3_t:poly64x1x3_t, *const p64:poly64x2x3_t:poly64x2x3_t | |
2760 | target = default | |
2761 | generate *const p8:poly8x16x3_t:poly8x16x3_t, *const u8:uint8x16x3_t:uint8x16x3_t, *const u64:uint64x1x3_t:uint64x1x3_t, *const u64:uint64x2x3_t:uint64x2x3_t | |
2762 | ||
2763 | arm = vld3 | |
2764 | const-arm = LANE | |
2765 | generate *const u8:uint8x8x3_t:uint8x8x3_t, *const u16:uint16x4x3_t:uint16x4x3_t, *const u32:uint32x2x3_t:uint32x2x3_t | |
2766 | generate *const u16:uint16x8x3_t:uint16x8x3_t, *const u32:uint32x4x3_t:uint32x4x3_t | |
2767 | generate *const p8:poly8x8x3_t:poly8x8x3_t, *const p16:poly16x4x3_t:poly16x4x3_t | |
2768 | generate *const p16:poly16x8x3_t:poly16x8x3_t | |
2769 | ||
2770 | /// Load multiple 3-element structures to three registers | |
2771 | name = vld3 | |
2772 | out-lane-nox | |
2773 | multi_fn = static_assert_imm-in_exp_len-LANE | |
2774 | constn = LANE | |
2775 | a = 0., 1., 2., 2., 4., 5., 6., 7., 8., 5., 6., 7., 8. | |
2776 | b = 0., 2., 2., 14., 9., 16., 17., 18., 5., 6., 7., 8. | |
2777 | n = 0 | |
2778 | validate 1., 2., 2., 14., 2., 16., 17., 18., 2., 6., 7., 8. | |
2779 | load_fn | |
2780 | arm-aarch64-separate | |
2781 | ||
2782 | aarch64 = ld3 | |
2783 | const-aarch64 = LANE | |
2784 | link-aarch64 = ld3lane._EXTpi82_ | |
2785 | generate *const f64:float64x1x3_t:float64x1x3_t, *const f64:float64x2x3_t:float64x2x3_t | |
2786 | ||
2787 | arm = vld3 | |
2788 | const-arm = LANE | |
2789 | link-arm = vld3lane._EXTpi82_ | |
2790 | generate *const f32:float32x2x3_t:float32x2x3_t, *const f32:float32x4x3_t:float32x4x3_t | |
2791 | ||
2792 | /// Load multiple 4-element structures to four registers | |
2793 | name = vld4 | |
2794 | out-nox | |
2795 | a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 | |
2796 | validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 | |
2797 | load_fn | |
2798 | arm-aarch64-separate | |
2799 | ||
2800 | aarch64 = ld4 | |
2801 | link-aarch64 = ld4._EXTv2_ | |
2802 | generate *const i64:int64x2x4_t | |
2803 | ||
2804 | arm = vld4 | |
2805 | link-arm = vld4._EXTpi82_ | |
2806 | generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t | |
2807 | generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t | |
2808 | aarch64 = nop | |
2809 | arm = nop | |
2810 | generate *const i64:int64x1x4_t | |
2811 | ||
2812 | /// Load multiple 4-element structures to four registers | |
2813 | name = vld4 | |
2814 | out-nox | |
2815 | multi_fn = transmute, {vld4-outsignednox-noext, transmute(a)} | |
2816 | a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 | |
2817 | validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 | |
2818 | load_fn | |
2819 | ||
2820 | aarch64 = ld4 | |
2821 | generate *const u64:uint64x2x4_t | |
2822 | target = aes | |
2823 | generate *const p64:poly64x2x4_t | |
2824 | ||
2825 | target = default | |
2826 | arm = vld4 | |
2827 | generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t | |
2828 | generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t | |
2829 | generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t | |
2830 | aarch64 = nop | |
2831 | arm = nop | |
2832 | generate *const u64:uint64x1x4_t | |
2833 | target = aes | |
2834 | generate *const p64:poly64x1x4_t | |
2835 | ||
2836 | /// Load multiple 4-element structures to four registers | |
2837 | name = vld4 | |
2838 | out-nox | |
2839 | a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 15., 16. | |
2840 | validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 15., 6., 8., 8., 16. | |
2841 | load_fn | |
2842 | arm-aarch64-separate | |
2843 | ||
2844 | aarch64 = nop | |
2845 | link-aarch64 = ld4._EXTv2_ | |
2846 | generate *const f64:float64x1x4_t | |
2847 | aarch64 = ld4 | |
2848 | generate *const f64:float64x2x4_t | |
2849 | ||
2850 | arm = vld4 | |
2851 | link-arm = vld4._EXTpi82_ | |
2852 | generate *const f32:float32x2x4_t, *const f32:float32x4x4_t | |
2853 | ||
2854 | /// Load single 4-element structure and replicate to all lanes of four registers | |
2855 | name = vld4 | |
2856 | out-dup-nox | |
2857 | a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9 | |
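// ld4r reads a single 4-element structure (1, 1, 1, 1) and replicates it to
// every lane of the four registers.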
2858 | validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
2859 | load_fn | |
2860 | arm-aarch64-separate | |
2861 | ||
2862 | aarch64 = ld4r | |
2863 | link-aarch64 = ld4r._EXT2_ | |
2864 | generate *const i64:int64x2x4_t | |
2865 | ||
2866 | arm = vld4 | |
2867 | link-arm = vld4dup._EXTpi82_ | |
2868 | generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t | |
2869 | generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t | |
2870 | arm = nop | |
2871 | generate *const i64:int64x1x4_t | |
2872 | ||
2873 | /// Load single 4-element structure and replicate to all lanes of four registers | |
2874 | name = vld4 | |
2875 | out-dup-nox | |
2876 | multi_fn = transmute, {vld4-outsigneddupnox-noext, transmute(a)} | |
2877 | a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9 | |
2878 | validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
2879 | load_fn | |
2880 | ||
2881 | aarch64 = ld4r | |
2882 | generate *const u64:uint64x2x4_t | |
2883 | target = aes | |
2884 | generate *const p64:poly64x2x4_t | |
2885 | ||
2886 | target = default | |
2887 | arm = vld4 | |
2888 | generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t | |
2889 | generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t | |
2890 | generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t | |
2891 | arm = nop | |
2892 | generate *const u64:uint64x1x4_t | |
2893 | target = aes | |
2894 | generate *const p64:poly64x1x4_t | |
2895 | ||
2896 | /// Load single 4-element structure and replicate to all lanes of four registers | |
2897 | name = vld4 | |
2898 | out-dup-nox | |
2899 | a = 0., 1., 1., 1., 1., 6., 4., 3., 5., 7., 4., 3., 5., 8., 4., 3., 5., 9., 4., 3., 5. | |
2900 | validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. | |
2901 | load_fn | |
2902 | arm-aarch64-separate | |
2903 | ||
2904 | aarch64 = ld4r | |
2905 | link-aarch64 = ld4r._EXT2_ | |
2906 | generate *const f64:float64x1x4_t, *const f64:float64x2x4_t | |
2907 | ||
2908 | arm = vld4 | |
2909 | link-arm = vld4dup._EXTpi82_ | |
2910 | generate *const f32:float32x2x4_t, *const f32:float32x4x4_t | |
2911 | ||
2912 | /// Load multiple 4-element structures to four registers | |
2913 | name = vld4 | |
2914 | out-lane-nox | |
2915 | multi_fn = static_assert_imm-in_exp_len-LANE | |
2916 | constn = LANE | |
2917 | a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 | |
2918 | b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 | |
2919 | n = 0 | |
2920 | validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 | |
2921 | load_fn | |
2922 | arm-aarch64-separate | |
2923 | ||
2924 | aarch64 = ld4 | |
2925 | const-aarch64 = LANE | |
2926 | link-aarch64 = ld4lane._EXTpi82_ | |
2927 | generate *const i8:int8x16x4_t:int8x16x4_t, *const i64:int64x1x4_t:int64x1x4_t, *const i64:int64x2x4_t:int64x2x4_t | |
2928 | ||
2929 | arm = vld4 | |
2930 | const-arm = LANE | |
2931 | link-arm = vld4lane._EXTpi82_ | |
2932 | generate *const i8:int8x8x4_t:int8x8x4_t, *const i16:int16x4x4_t:int16x4x4_t, *const i32:int32x2x4_t:int32x2x4_t | |
2933 | generate *const i16:int16x8x4_t:int16x8x4_t, *const i32:int32x4x4_t:int32x4x4_t | |
2934 | ||
2935 | /// Load multiple 4-element structures to four registers | |
2936 | name = vld4 | |
2937 | out-lane-nox | |
2938 | multi_fn = static_assert_imm-in_exp_len-LANE | |
2939 | multi_fn = transmute, {vld4-outsignedlanenox-::<LANE>, transmute(a), transmute(b)} | |
2940 | constn = LANE | |
2941 | a = 0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 | |
2942 | b = 0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 | |
2943 | n = 0 | |
2944 | validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 | |
2945 | load_fn | |
2946 | ||
2947 | aarch64 = ld4 | |
2948 | const-aarch64 = LANE | |
2949 | target = aes | |
2950 | generate *const p64:poly64x1x4_t:poly64x1x4_t, *const p64:poly64x2x4_t:poly64x2x4_t | |
2951 | target = default | |
2952 | generate *const p8:poly8x16x4_t:poly8x16x4_t, *const u8:uint8x16x4_t:uint8x16x4_t, *const u64:uint64x1x4_t:uint64x1x4_t, *const u64:uint64x2x4_t:uint64x2x4_t | |
2953 | ||
2954 | arm = vld4 | |
2955 | const-arm = LANE | |
2956 | generate *const u8:uint8x8x4_t:uint8x8x4_t, *const u16:uint16x4x4_t:uint16x4x4_t, *const u32:uint32x2x4_t:uint32x2x4_t | |
2957 | generate *const u16:uint16x8x4_t:uint16x8x4_t, *const u32:uint32x4x4_t:uint32x4x4_t | |
2958 | generate *const p8:poly8x8x4_t:poly8x8x4_t, *const p16:poly16x4x4_t:poly16x4x4_t | |
2959 | generate *const p16:poly16x8x4_t:poly16x8x4_t | |
2960 | ||
2961 | /// Load multiple 4-element structures to four registers | |
2962 | name = vld4 | |
2963 | out-lane-nox | |
2964 | multi_fn = static_assert_imm-in_exp_len-LANE | |
2965 | constn = LANE | |
2966 | a = 0., 1., 2., 2., 2., 5., 6., 7., 8., 5., 6., 7., 8., 1., 4., 3., 5. | |
2967 | b = 0., 2., 2., 2., 2., 16., 2., 18., 5., 6., 7., 8., 1., 4., 3., 5. | |
2968 | n = 0 | |
2969 | validate 1., 2., 2., 2., 2., 16., 2., 18., 2., 6., 7., 8., 2., 4., 3., 5. | |
2970 | load_fn | |
2971 | arm-aarch64-separate | |
2972 | ||
2973 | aarch64 = ld4 | |
2974 | const-aarch64 = LANE | |
2975 | link-aarch64 = ld4lane._EXTpi82_ | |
2976 | generate *const f64:float64x1x4_t:float64x1x4_t, *const f64:float64x2x4_t:float64x2x4_t | |
2977 | ||
2978 | arm = vld4 | |
2979 | const-arm = LANE | |
2980 | link-arm = vld4lane._EXTpi82_ | |
2981 | generate *const f32:float32x2x4_t:float32x2x4_t, *const f32:float32x4x4_t:float32x4x4_t | |
2982 | ||
2983 | /// Store multiple single-element structures from one, two, three, or four registers | |
2984 | name = vst1 | |
2985 | in1-lane-nox | |
2986 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 2987 | multi_fn = *a, {simd_extract!, b, LANE as u32} |
2988 | constn = LANE |
2989 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
2990 | n = 0 | |
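// n = 0: only b[0] = 1 is written through the pointer; the rest of the
// output buffer is left at zero.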
2991 | validate 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
2992 | store_fn | |
2993 | ||
2994 | aarch64 = nop | |
2995 | arm = nop | |
2996 | generate *mut i8:int8x8_t:void, *mut i16:int16x4_t:void, *mut i32:int32x2_t:void, *mut i64:int64x1_t:void | |
2997 | generate *mut i8:int8x16_t:void, *mut i16:int16x8_t:void, *mut i32:int32x4_t:void, *mut i64:int64x2_t:void | |
2998 | generate *mut u8:uint8x8_t:void, *mut u16:uint16x4_t:void, *mut u32:uint32x2_t:void, *mut u64:uint64x1_t:void | |
2999 | generate *mut u8:uint8x16_t:void, *mut u16:uint16x8_t:void, *mut u32:uint32x4_t:void, *mut u64:uint64x2_t:void | |
3000 | generate *mut p8:poly8x8_t:void, *mut p16:poly16x4_t:void, *mut p8:poly8x16_t:void, *mut p16:poly16x8_t:void | |
3001 | target = aes | |
3002 | generate *mut p64:poly64x1_t:void, *mut p64:poly64x2_t:void | |
3003 | ||
3004 | /// Store multiple single-element structures from one, two, three, or four registers | |
3005 | name = vst1 | |
3006 | in1-lane-nox | |
3007 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 3008 | multi_fn = *a, {simd_extract!, b, LANE as u32} |
3009 | constn = LANE |
3010 | a = 0., 1., 2., 3., 4., 5., 6., 7., 8. | |
3011 | n = 0 | |
3012 | validate 1., 0., 0., 0., 0., 0., 0., 0. | |
3013 | store_fn | |
3014 | ||
3015 | aarch64 = nop | |
3016 | generate *mut f64:float64x1_t:void, *mut f64:float64x2_t:void | |
3017 | ||
3018 | arm = nop | |
3019 | generate *mut f32:float32x2_t:void, *mut f32:float32x4_t:void | |
3020 | ||
3021 | /// Store multiple single-element structures from one, two, three, or four registers | |
3022 | name = vst1 | |
3023 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 | |
3024 | validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 | |
3025 | store_fn | |
3026 | arm-aarch64-separate | |
3027 | ||
3028 | aarch64 = st1 | |
3029 | link-aarch64 = st1x2._EXT3_ | |
3030 | arm = vst1 | |
3031 | link-arm = vst1x2._EXTr3_ | |
3032 | generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void | |
3033 | generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void, *mut i64:int64x2x2_t:void | |
3034 | ||
3035 | link-aarch64 = st1x3._EXT3_ | |
3036 | link-arm = vst1x3._EXTr3_ | |
3037 | generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void | |
3038 | generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void, *mut i64:int64x2x3_t:void | |
3039 | ||
3040 | link-aarch64 = st1x4._EXT3_ | |
3041 | link-arm = vst1x4._EXTr3_ | |
3042 | generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void | |
3043 | generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void, *mut i64:int64x2x4_t:void | |
3044 | |
3045 | /// Store multiple single-element structures from one, two, three, or four registers | |
3046 | name = vst1 | |
3047 | multi_fn = vst1-signed-noext, transmute(a), transmute(b) | |
3048 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 | |
3049 | validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 | |
3050 | store_fn | |
3051 | ||
3052 | aarch64 = st1 | |
3053 | arm = vst1 | |
3054 | generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void | |
3055 | generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void, *mut u64:uint64x2x2_t:void | |
3056 | generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void | |
3057 | generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void, *mut u64:uint64x2x3_t:void | |
3058 | generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void | |
3059 | generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void, *mut u64:uint64x2x4_t:void | |
3060 | generate *mut p8:poly8x8x2_t:void, *mut p8:poly8x8x3_t:void, *mut p8:poly8x8x4_t:void | |
3061 | generate *mut p8:poly8x16x2_t:void, *mut p8:poly8x16x3_t:void, *mut p8:poly8x16x4_t:void | |
3062 | generate *mut p16:poly16x4x2_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x4x4_t:void | |
3063 | generate *mut p16:poly16x8x2_t:void, *mut p16:poly16x8x3_t:void, *mut p16:poly16x8x4_t:void | |
3064 | target = aes |
3065 | generate *mut p64:poly64x1x2_t:void | |
3066 | arm = nop | |
3067 | generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x1x4_t:void | |
3068 | generate *mut p64:poly64x2x2_t:void, *mut p64:poly64x2x3_t:void, *mut p64:poly64x2x4_t:void | |
3069 | |
3070 | /// Store multiple single-element structures from one, two, three, or four registers | |
3071 | name = vst1 | |
3072 | a = 0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. | |
3073 | validate 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16. | |
3074 | store_fn | |
3075 | arm-aarch64-separate | |
3076 | ||
3077 | aarch64 = st1 | |
3078 | link-aarch64 = st1x2._EXT3_ | |
3079 | generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void | |
3080 | ||
3081 | link-aarch64 = st1x3._EXT3_ | |
3082 | generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void | |
3083 | ||
3084 | link-aarch64 = st1x4._EXT3_ | |
3085 | generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void | |
3086 | ||
3087 | arm = vst1 | |
3088 | link-aarch64 = st1x2._EXT3_ | |
3089 | link-arm = vst1x2._EXTr3_ | |
3090 | generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void | |
3091 | ||
3092 | link-aarch64 = st1x3._EXT3_ | |
3093 | link-arm = vst1x3._EXTr3_ | |
3094 | generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void | |
3095 | ||
3096 | link-aarch64 = st1x4._EXT3_ | |
3097 | link-arm = vst1x4._EXTr3_ | |
3098 | generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void | |
3099 | ||
3100 | /// Store multiple 2-element structures from two registers |
3101 | name = vst2 | |
3102 | in1-nox | |
3103 | a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 | |
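// vst2 interleaves the two input registers into memory:
// b.0[0], b.1[0], b.0[1], b.1[1], ...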
3104 | validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 | |
3105 | store_fn | |
3106 | arm-aarch64-separate | |
3107 | ||
3108 | aarch64 = st2 | |
3109 | link-aarch64 = st2._EXTpi8_ | |
3110 | generate *mut i64:int64x2x2_t:void | |
3111 | ||
3112 | arm = vst2 | |
3113 | link-arm = vst2._EXTpi8r_ | |
3114 | generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void | |
3115 | generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void | |
3116 | arm = nop | |
3117 | aarch64 = nop | |
3118 | generate *mut i64:int64x1x2_t:void | |
3119 | ||
3120 | /// Store multiple 2-element structures from two registers | |
3121 | name = vst2 | |
c620b35d | 3122 | multi_fn = vst2-in1signednox-noext, transmute(a), transmute(b) |
3123 | in1-nox |
3124 | a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 | |
3125 | validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 | |
3126 | store_fn | |
3127 | ||
3128 | aarch64 = st2 | |
3129 | generate *mut u64:uint64x2x2_t:void | |
3130 | target = aes | |
3131 | generate *mut p64:poly64x2x2_t:void | |
3132 | ||
3133 | target = default | |
3134 | arm = vst2 | |
3135 | generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void | |
3136 | generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void | |
3137 | generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p8:poly8x16x2_t:void, *mut p16:poly16x8x2_t:void | |
3138 | arm = nop | |
3139 | aarch64 = nop | |
3140 | generate *mut u64:uint64x1x2_t:void | |
3141 | target = aes | |
3142 | generate *mut p64:poly64x1x2_t:void | |
3143 | ||
3144 | /// Store multiple 2-element structures from two registers | |
3145 | name = vst2 | |
3146 | in1-nox | |
3147 | a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. | |
3148 | validate 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9. | |
3149 | store_fn | |
3150 | arm-aarch64-separate | |
3151 | ||
3152 | aarch64 = st1 | |
3153 | link-aarch64 = st2._EXTpi8_ | |
3154 | generate *mut f64:float64x1x2_t:void | |
3155 | aarch64 = st2 | |
3156 | generate *mut f64:float64x2x2_t:void | |
3157 | ||
3158 | arm = vst2 | |
3159 | link-arm = vst2._EXTpi8r_ | |
3160 | generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void | |
3161 | ||
3162 | /// Store multiple 2-element structures from two registers | |
3163 | name = vst2 | |
3164 | in1-lane-nox | |
3165 | constn = LANE | |
3166 | multi_fn = static_assert_imm-in_exp_len-LANE | |
3167 | a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 | |
3168 | n = 0 | |
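// n = 0: a single 2-element structure (1, 2) is stored; the rest of the
// output buffer is left at zero.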
3169 | validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
3170 | store_fn | |
3171 | arm-aarch64-separate | |
3172 | ||
3173 | aarch64 = st2 | |
3174 | link-aarch64 = st2lane._EXTpi8_ | |
3175 | const-aarch64 = LANE | |
3176 | generate *mut i8:int8x16x2_t:void, *mut i64:int64x1x2_t:void, *mut i64:int64x2x2_t:void | |
3177 | ||
3178 | arm = vst2 | |
3179 | link-arm = vst2lane._EXTpi8r_ | |
3180 | const-arm = LANE | |
3181 | generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void | |
3182 | generate *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void | |
3183 | ||
3184 | /// Store multiple 2-element structures from two registers | |
3185 | name = vst2 | |
3186 | in1-lane-nox | |
3187 | constn = LANE | |
3188 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 3189 | multi_fn = vst2-in1signedlanenox-::<LANE>, transmute(a), transmute(b) |
3190 | a = 0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17 |
3191 | n = 0 | |
3192 | validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
3193 | store_fn | |
3194 | ||
3195 | aarch64 = st2 | |
3196 | generate *mut u8:uint8x16x2_t:void, *mut u64:uint64x1x2_t:void, *mut u64:uint64x2x2_t:void, *mut p8:poly8x16x2_t:void | |
3197 | target = aes | |
3198 | generate *mut p64:poly64x1x2_t:void, *mut p64:poly64x2x2_t:void | |
3199 | ||
3200 | target = default | |
3201 | arm = vst2 | |
3202 | generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void | |
3203 | generate *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void | |
3204 | generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p16:poly16x8x2_t:void | |
3205 | ||
3206 | /// Store multiple 2-element structures from two registers | |
3207 | name = vst2 | |
3208 | in1-lane-nox | |
3209 | constn = LANE | |
3210 | multi_fn = static_assert_imm-in_exp_len-LANE | |
3211 | a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. | |
3212 | n = 0 | |
3213 | validate 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. | |
3214 | store_fn | |
3215 | arm-aarch64-separate | |
3216 | ||
3217 | aarch64 = st2 | |
3218 | link-aarch64 = st2lane._EXTpi8_ | |
3219 | const-aarch64 = LANE | |
3220 | generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void | |
3221 | ||
3222 | arm = vst2 | |
3223 | link-arm = vst2lane._EXTpi8r_ | |
3224 | const-arm = LANE | |
3225 | generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void | |
3226 | ||
3227 | /// Store multiple 3-element structures from three registers | |
3228 | name = vst3 | |
3229 | in1-nox | |
3230 | a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 | |
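// vst3 interleaves the three input registers into memory:
// b.0[0], b.1[0], b.2[0], b.0[1], ...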
3231 | validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 | |
3232 | store_fn | |
3233 | arm-aarch64-separate | |
3234 | ||
3235 | aarch64 = st3 | |
3236 | link-aarch64 = st3._EXTpi8_ | |
3237 | generate *mut i64:int64x2x3_t:void | |
3238 | ||
3239 | arm = vst3 | |
3240 | link-arm = vst3._EXTpi8r_ | |
3241 | generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void | |
3242 | generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void | |
3243 | arm = nop | |
3244 | aarch64 = nop | |
3245 | generate *mut i64:int64x1x3_t:void | |
3246 | ||
3247 | /// Store multiple 3-element structures from three registers | |
3248 | name = vst3 | |
c620b35d | 3249 | multi_fn = vst3-in1signednox-noext, transmute(a), transmute(b) |
3250 | in1-nox |
3251 | a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 | |
3252 | validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48 | |
3253 | store_fn | |
3254 | ||
3255 | aarch64 = st3 | |
3256 | generate *mut u64:uint64x2x3_t:void | |
3257 | target = aes | |
3258 | generate *mut p64:poly64x2x3_t:void | |
3259 | ||
3260 | target = default | |
3261 | arm = vst3 | |
3262 | generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void | |
3263 | generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void | |
3264 | generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p8:poly8x16x3_t:void, *mut p16:poly16x8x3_t:void | |
3265 | arm = nop | |
3266 | aarch64 = nop | |
3267 | generate *mut u64:uint64x1x3_t:void | |
3268 | target = aes | |
3269 | generate *mut p64:poly64x1x3_t:void | |
3270 | ||
3271 | /// Store multiple 3-element structures from three registers | |
3272 | name = vst3 | |
3273 | in1-nox | |
3274 | a = 0., 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8., 13., 14., 15., 16 | |
3275 | validate 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8., 2., 13., 13., 4. | |
3276 | store_fn | |
3277 | arm-aarch64-separate | |
3278 | ||
3279 | aarch64 = nop | |
3280 | link-aarch64 = st3._EXTpi8_ | |
3281 | generate *mut f64:float64x1x3_t:void | |
3282 | aarch64 = st3 | |
3283 | generate *mut f64:float64x2x3_t:void | |
3284 | ||
3285 | arm = vst3 | |
3286 | link-arm = vst3._EXTpi8r_ | |
3287 | generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void | |
3288 | ||
3289 | /// Store multiple 3-element structures from three registers | |
3290 | name = vst3 | |
3291 | in1-lane-nox | |
3292 | constn = LANE | |
3293 | multi_fn = static_assert_imm-in_exp_len-LANE | |
3294 | a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 | |
3295 | n = 0 | |
3296 | validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
3297 | store_fn | |
3298 | arm-aarch64-separate | |
3299 | ||
3300 | aarch64 = st3 | |
3301 | link-aarch64 = st3lane._EXTpi8_ | |
3302 | const-aarch64 = LANE | |
3303 | generate *mut i8:int8x16x3_t:void, *mut i64:int64x1x3_t:void, *mut i64:int64x2x3_t:void | |
3304 | ||
3305 | arm = vst3 | |
3306 | link-arm = vst3lane._EXTpi8r_ | |
3307 | const-arm = LANE | |
3308 | generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void | |
3309 | generate *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void | |
3310 | ||
3311 | /// Store multiple 3-element structures from three registers | |
3312 | name = vst3 | |
3313 | in1-lane-nox | |
3314 | constn = LANE | |
3315 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 3316 | multi_fn = vst3-in1signedlanenox-::<LANE>, transmute(a), transmute(b) |
3317 | a = 0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48 |
3318 | n = 0 | |
3319 | validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
3320 | store_fn | |
3321 | ||
3322 | aarch64 = st3 | |
3323 | generate *mut u8:uint8x16x3_t:void, *mut u64:uint64x1x3_t:void, *mut u64:uint64x2x3_t:void, *mut p8:poly8x16x3_t:void | |
3324 | target = aes | |
3325 | generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x2x3_t:void | |
3326 | ||
3327 | target = default | |
3328 | arm = vst3 | |
3329 | generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void | |
3330 | generate *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void | |
3331 | generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x8x3_t:void | |
3332 | ||
3333 | /// Store multiple 3-element structures from three registers | |
3334 | name = vst3 | |
3335 | in1-lane-nox | |
3336 | constn = LANE | |
3337 | multi_fn = static_assert_imm-in_exp_len-LANE | |
3338 | a = 0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. | |
3339 | n = 0 | |
3340 | validate 1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. | |
3341 | store_fn | |
3342 | arm-aarch64-separate | |
3343 | ||
3344 | aarch64 = st3 | |
3345 | link-aarch64 = st3lane._EXTpi8_ | |
3346 | const-aarch64 = LANE | |
3347 | generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void | |
3348 | ||
3349 | arm = vst3 | |
3350 | link-arm = vst3lane._EXTpi8r_ | |
3351 | const-arm = LANE | |
3352 | generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void | |
3353 | ||
3354 | /// Store multiple 4-element structures from four registers | |
3355 | name = vst4 | |
3356 | in1-nox | |
3357 | a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 | |
3358 | validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 | |
3359 | store_fn | |
3360 | arm-aarch64-separate | |
3361 | ||
3362 | aarch64 = st4 | |
3363 | link-aarch64 = st4._EXTpi8_ | |
3364 | generate *mut i64:int64x2x4_t:void | |
3365 | ||
3366 | arm = vst4 | |
3367 | link-arm = vst4._EXTpi8r_ | |
3368 | generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void | |
3369 | generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void | |
3370 | arm = nop | |
3371 | aarch64 = nop | |
3372 | generate *mut i64:int64x1x4_t:void | |
3373 | ||
3374 | /// Store multiple 4-element structures from four registers | |
3375 | name = vst4 | |
c620b35d | 3376 | multi_fn = vst4-in1signednox-noext, transmute(a), transmute(b) |
3377 | in1-nox |
3378 | a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 | |
3379 | validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 | |
3380 | store_fn | |
3381 | ||
3382 | aarch64 = st4 | |
3383 | generate *mut u64:uint64x2x4_t:void | |
3384 | target = aes | |
3385 | generate *mut p64:poly64x2x4_t:void | |
3386 | ||
3387 | target = default | |
3388 | arm = vst4 | |
3389 | generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void | |
3390 | generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void | |
3391 | generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p8:poly8x16x4_t:void, *mut p16:poly16x8x4_t:void | |
3392 | arm = nop | |
3393 | aarch64 = nop | |
3394 | generate *mut u64:uint64x1x4_t:void | |
3395 | target = aes | |
3396 | generate *mut p64:poly64x1x4_t:void | |
3397 | ||
3398 | /// Store multiple 4-element structures from four registers | |
3399 | name = vst4 | |
3400 | in1-nox | |
3401 | a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16. | |
3402 | validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16. | |
3403 | store_fn | |
3404 | arm-aarch64-separate | |
3405 | ||
3406 | aarch64 = nop | |
3407 | link-aarch64 = st4._EXTpi8_ | |
3408 | generate *mut f64:float64x1x4_t:void | |
3409 | aarch64 = st4 | |
3410 | generate *mut f64:float64x2x4_t:void | |
3411 | ||
3412 | arm = vst4 | |
3413 | link-arm = vst4._EXTpi8r_ | |
3414 | generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void | |
3415 | ||
3416 | /// Store multiple 4-element structures from four registers | |
3417 | name = vst4 | |
3418 | in1-lane-nox | |
3419 | constn = LANE | |
3420 | multi_fn = static_assert_imm-in_exp_len-LANE | |
3421 | a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 | |
3422 | n = 0 | |
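// n = 0: a single 4-element structure (1, 2, 2, 6) is stored.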
3423 | validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
3424 | store_fn | |
3425 | arm-aarch64-separate | |
3426 | ||
3427 | aarch64 = st4 | |
3428 | link-aarch64 = st4lane._EXTpi8_ | |
3429 | const-aarch64 = LANE | |
3430 | generate *mut i8:int8x16x4_t:void, *mut i64:int64x1x4_t:void, *mut i64:int64x2x4_t:void | |
3431 | ||
3432 | arm = vst4 | |
3433 | link-arm = vst4lane._EXTpi8r_ | |
3434 | const-arm = LANE | |
3435 | generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void | |
3436 | generate *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void | |
3437 | ||
3438 | /// Store multiple 4-element structures from four registers | |
3439 | name = vst4 | |
3440 | in1-lane-nox | |
3441 | constn = LANE | |
3442 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 3443 | multi_fn = vst4-in1signedlanenox-::<LANE>, transmute(a), transmute(b) |
3444 | a = 0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64 |
3445 | n = 0 | |
3446 | validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
3447 | store_fn | |
3448 | ||
3449 | aarch64 = st4 | |
3450 | generate *mut u8:uint8x16x4_t:void, *mut u64:uint64x1x4_t:void, *mut u64:uint64x2x4_t:void, *mut p8:poly8x16x4_t:void | |
3451 | target = aes | |
3452 | generate *mut p64:poly64x1x4_t:void, *mut p64:poly64x2x4_t:void | |
3453 | ||
3454 | target = default | |
3455 | arm = vst4 | |
3456 | generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void | |
3457 | generate *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void | |
3458 | generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p16:poly16x8x4_t:void | |
3459 | ||
3460 | /// Store multiple 4-element structures from four registers | |
3461 | name = vst4 | |
3462 | in1-lane-nox | |
3463 | constn = LANE | |
3464 | multi_fn = static_assert_imm-in_exp_len-LANE | |
3465 | a = 0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16. | |
3466 | n = 0 | |
3467 | validate 1., 2., 2., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0. | |
3468 | store_fn | |
3469 | arm-aarch64-separate | |
3470 | ||
3471 | aarch64 = st4 | |
3472 | link-aarch64 = st4lane._EXTpi8_ | |
3473 | const-aarch64 = LANE | |
3474 | generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void | |
3475 | ||
3476 | arm = vst4 | |
3477 | link-arm = vst4lane._EXTpi8r_ | |
3478 | const-arm = LANE | |
3479 | generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void | |
3480 | ||
3481 | /// Dot product vector form with unsigned and signed integers |
3482 | name = vusdot | |
3483 | out-suffix | |
3484 | a = 1000, -4200, -1000, 2000 | |
3485 | b = 100, 205, 110, 195, 120, 185, 130, 175, 140, 165, 150, 155, 160, 145, 170, 135 | |
3486 | c = 0, 1, 2, 3, -1, -2, -3, -4, 4, 5, 6, 7, -5, -6, -7, -8 | |
3487 | aarch64 = usdot | |
3488 | arm = vusdot | |
3489 | target = i8mm | |
3490 | ||
3491 | // 1000 + (100, 205, 110, 195) . ( 0, 1, 2, 3) | |
3492 | // -4200 + (120, 185, 130, 175) . (-1, -2, -3, -4) | |
3493 | // ... | |
3494 | validate 2010, -5780, 2370, -1940 | |
3495 | ||
3496 | link-arm = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t | |
3497 | link-aarch64 = usdot._EXT2_._EXT4_:int32x2_t:uint8x8_t:int8x8_t:int32x2_t | |
3498 | generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t | |
3499 | ||
3500 | link-arm = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t | |
3501 | link-aarch64 = usdot._EXT2_._EXT4_:int32x4_t:uint8x16_t:int8x16_t:int32x4_t | |
3502 | generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t | |
3503 | ||
3504 | /// Dot product index form with unsigned and signed integers | |
3505 | name = vusdot | |
3506 | out-lane-suffixes | |
3507 | constn = LANE | |
3508 | aarch64 = usdot | |
3509 | arm = vusdot | |
3510 | target = i8mm | |
3511 | multi_fn = static_assert_imm-in2_dot-LANE | |
3512 | multi_fn = transmute, c:merge4_t2, c | |
3513 | multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32} | |
3514 | multi_fn = vusdot-out-noext, a, b, {transmute, c} | |
3515 | a = 1000, -4200, -1000, 2000 | |
3516 | b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 | |
3517 | c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 | |
3518 | ||
3519 | // 1000 + (100, 110, 120, 130) . (4, 3, 2, 1) | |
3520 | // -4200 + (140, 150, 160, 170) . (4, 3, 2, 1) | |
3521 | // ... | |
3522 | n = 0 | |
3523 | validate 2100, -2700, 900, 4300 | |
3524 | ||
3525 | // 1000 + (100, 110, 120, 130) . (0, -1, -2, -3) | |
3526 | // -4200 + (140, 150, 160, 170) . (0, -1, -2, -3) | |
3527 | // ... | |
3528 | n = 1 | |
3529 | validate 260, -5180, -2220, 540 | |
3530 | ||
3531 | generate int32x2_t:uint8x8_t:int8x8_t:int32x2_t | |
3532 | generate int32x4_t:uint8x16_t:int8x8_t:int32x4_t | |
3533 | ||
3534 | /// Dot product index form with unsigned and signed integers | |
3535 | name = vusdot | |
3536 | out-lane-suffixes | |
3537 | constn = LANE | |
3538 | // Only AArch64 has the laneq forms. | |
3539 | aarch64 = usdot | |
3540 | target = i8mm | |
3541 | multi_fn = static_assert_imm-in2_dot-LANE | |
3542 | multi_fn = transmute, c:merge4_t2, c | |
3543 | multi_fn = simd_shuffle!, c:out_signed, c, c, {dup-out_len-LANE as u32} | |
3544 | multi_fn = vusdot-out-noext, a, b, {transmute, c} | |
3545 | a = 1000, -4200, -1000, 2000 | |
3546 | b = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 | |
3547 | c = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 | |
3548 | ||
3549 | // 1000 + (100, 110, 120, 130) . (-8, -9, -10, -11) | |
3550 | // -4200 + (140, 150, 160, 170) . (-8, -9, -10, -11) | |
3551 | // ... | |
3552 | n = 3 | |
3553 | validate -3420, -10140, -8460, -6980 | |
3554 | ||
3555 | generate int32x2_t:uint8x8_t:int8x16_t:int32x2_t | |
3556 | generate int32x4_t:uint8x16_t:int8x16_t:int32x4_t | |
3557 | ||
3558 | /// Dot product index form with signed and unsigned integers |
3559 | name = vsudot | |
3560 | out-lane-suffixes | |
3561 | constn = LANE | |
3562 | aarch64 = sudot |
3563 | arm = vsudot | |
3564 | target = i8mm | |
3565 | ||
3c0e092e | 3566 | multi_fn = static_assert_imm-in2_dot-LANE |
3567 | multi_fn = transmute, c:merge4_t2, c |
3568 | multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32} | |
3569 | multi_fn = vusdot-out-noext, a, {transmute, c}, b | |
3570 | a = -2000, 4200, -1000, 2000 | |
3571 | b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 | |
3572 | c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 | |
3573 | ||
3574 | // -2000 + (4, 3, 2, 1) . (100, 110, 120, 130) | |
3575 | // 4200 + (0, -1, -2, -3) . (100, 110, 120, 130) | |
3576 | // ... | |
3c0e092e | 3577 | n = 0 |
3578 | validate -900, 3460, -3580, -2420 |
3579 | ||
3580 | // -2000 + (4, 3, 2, 1) . (140, 150, 160, 170) | |
3581 | // 4200 + (0, -1, -2, -3) . (140, 150, 160, 170) | |
3582 | // ... | |
3583 | n = 1 | |
3584 | validate -500, 3220, -4460, -3940 | |
3c0e092e | 3585 | |
3586 | generate int32x2_t:int8x8_t:uint8x8_t:int32x2_t |
3587 | generate int32x4_t:int8x16_t:uint8x8_t:int32x4_t | |
3588 | ||
3589 | /// Dot product index form with signed and unsigned integers | |
3590 | name = vsudot | |
3591 | out-lane-suffixes | |
3592 | constn = LANE | |
3593 | // Only AArch64 has the laneq forms. | |
3c0e092e | 3594 | aarch64 = sudot |
3595 | target = i8mm |
3596 | ||
3597 | multi_fn = static_assert_imm-in2_dot-LANE | |
3598 | multi_fn = transmute, c:merge4_t2, c | |
3599 | multi_fn = simd_shuffle!, c:out_unsigned, c, c, {dup-out_len-LANE as u32} | |
3600 | multi_fn = vusdot-out-noext, a, {transmute, c}, b | |
3601 | a = -2000, 4200, -1000, 2000 | |
3602 | b = 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11 | |
3603 | c = 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250 | |
3604 | ||
3605 | // -2000 + (4, 3, 2, 1) . (220, 230, 240, 250) | |
3606 | // 4200 + (0, -1, -2, -3) . (220, 230, 240, 250) | |
3607 | // ... | |
3608 | n = 3 | |
3609 | validate 300, 2740, -6220, -6980 | |
3610 | ||
3611 | generate int32x2_t:int8x8_t:uint8x16_t:int32x2_t | |
3612 | generate int32x4_t:int8x16_t:uint8x16_t:int32x4_t | |
3c0e092e | 3613 | |
3614 | /// Multiply |
3615 | name = vmul | |
3616 | a = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 | |
3617 | b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
3618 | validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 | |
3619 | arm = vmul. | |
3620 | aarch64 = mul | |
3621 | fn = simd_mul | |
3622 | generate int*_t, uint*_t | |
3623 | ||
3624 | /// Polynomial multiply | |
3625 | name = vmul | |
3626 | a = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 | |
3627 | b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
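// Polynomial multiplication is carry-less (XOR instead of add), e.g.
// 3 * 6 = 0b011 * 0b110 = 0b0110 ^ 0b1100 = 0b1010 = 10, not 18.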
3628 | validate 1, 6, 3, 12, 5, 10, 7, 24, 9, 30, 11, 20, 13, 18, 15, 48 | |
3629 | ||
3630 | aarch64 = pmul | |
3631 | link-aarch64 = pmul._EXT_ | |
3632 | arm = vmul | |
3633 | link-arm = vmulp._EXT_ | |
3634 | generate poly8x8_t, poly8x16_t | |
3635 | ||
3636 | /// Multiply | |
3637 | name = vmul | |
3638 | fn = simd_mul | |
3639 | a = 1.0, 2.0, 1.0, 2.0 | |
3640 | b = 2.0, 3.0, 4.0, 5.0 | |
3641 | validate 2.0, 6.0, 4.0, 10.0 | |
3642 | ||
3643 | aarch64 = fmul | |
3644 | generate float64x*_t | |
3645 | ||
3646 | arm = vmul. | |
3647 | generate float*_t | |
3648 | ||
3649 | /// Vector multiply by scalar | |
3650 | name = vmul | |
3651 | out-n-suffix | |
3652 | multi_fn = simd_mul, a, {vdup-nout-noext, b} | |
3653 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
3654 | b = 2 | |
3655 | validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 | |
3656 | ||
3657 | arm = vmul | |
3658 | aarch64 = mul | |
3659 | generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t | |
3660 | generate uint16x4_t:u16:uint16x4_t, uint16x8_t:u16:uint16x8_t, uint32x2_t:u32:uint32x2_t, uint32x4_t:u32:uint32x4_t | |
3661 | ||
3662 | /// Vector multiply by scalar | |
3663 | name = vmul | |
3664 | out-n-suffix | |
3665 | multi_fn = simd_mul, a, {vdup-nout-noext, b} | |
3666 | a = 1., 2., 3., 4. | |
3667 | b = 2. | |
3668 | validate 2., 4., 6., 8. | |
3669 | ||
3670 | aarch64 = fmul | |
3671 | generate float64x1_t:f64:float64x1_t, float64x2_t:f64:float64x2_t | |
3672 | ||
3673 | arm = vmul | |
3674 | generate float32x2_t:f32:float32x2_t, float32x4_t:f32:float32x4_t | |
3675 | ||
3676 | /// Multiply | |
3677 | name = vmul | |
3678 | lane-suffixes | |
3679 | constn = LANE | |
3680 | multi_fn = static_assert_imm-in_exp_len-LANE | |
353b0b11 | 3681 | multi_fn = simd_mul, a, {simd_shuffle!, b, b, {dup-out_len-LANE as u32}} |
3682 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
3683 | b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
3684 | n = 1 | |
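// n = 1 selects b[1] = 2, which is broadcast to all lanes, so each lane
// of a is doubled.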
3685 | validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 | |
3686 | ||
3687 | aarch64 = mul | |
3688 | arm = vmul | |
3689 | generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t | |
3690 | generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t | |
3691 | generate uint16x4_t, uint16x4_t:uint16x8_t:uint16x4_t, uint16x8_t:uint16x4_t:uint16x8_t, uint16x8_t | |
3692 | generate uint32x2_t, uint32x2_t:uint32x4_t:uint32x2_t, uint32x4_t:uint32x2_t:uint32x4_t, uint32x4_t | |
3693 | ||
3694 | /// Floating-point multiply | |
3695 | name = vmul | |
3696 | lane-suffixes | |
3697 | constn = LANE | |
3698 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 3699 | multi_fn = simd_mul, a, {transmute--<element_t _>, {simd_extract!, b, LANE as u32}} |
3700 | a = 1., 2., 3., 4. |
3701 | b = 2., 0., 0., 0. | |
3702 | n = 0 | |
3703 | validate 2., 4., 6., 8. | |
3704 | ||
3705 | aarch64 = fmul | |
3706 | generate float64x1_t, float64x1_t:float64x2_t:float64x1_t | |
3707 | ||
3708 | /// Floating-point multiply | |
3709 | name = vmul | |
3710 | lane-suffixes | |
3711 | constn = LANE | |
3712 | multi_fn = static_assert_imm-in_exp_len-LANE | |
353b0b11 | 3713 | multi_fn = simd_mul, a, {simd_shuffle!, b, b, {dup-out_len-LANE as u32}} |
3714 | a = 1., 2., 3., 4. |
3715 | b = 2., 0., 0., 0. | |
3716 | n = 0 | |
3717 | validate 2., 4., 6., 8. | |
3718 | ||
3719 | aarch64 = fmul | |
3720 | generate float64x2_t:float64x1_t:float64x2_t, float64x2_t | |
3721 | ||
3722 | arm = vmul | |
3723 | generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
3724 | ||
3725 | /// Floating-point multiply | |
3726 | name = vmuls_lane | |
3727 | constn = LANE | |
3728 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 3729 | multi_fn = simd_extract!, b:f32, b, LANE as u32 |
3730 | multi_fn = a * b |
3731 | a = 1. | |
3732 | b = 2., 0., 0., 0. | |
3733 | n = 0 | |
3734 | validate 2. | |
3735 | aarch64 = fmul | |
3736 | generate f32:float32x2_t:f32, f32:float32x4_t:f32 | |
3737 | ||
3738 | /// Floating-point multiply | |
3739 | name = vmuld_lane | |
3740 | constn = LANE | |
3741 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 3742 | multi_fn = simd_extract!, b:f64, b, LANE as u32 |
3743 | multi_fn = a * b |
3744 | a = 1. | |
3745 | b = 2., 0. | |
3746 | n = 0 | |
3747 | validate 2. | |
3748 | aarch64 = fmul | |
3749 | generate f64:float64x1_t:f64, f64:float64x2_t:f64 | |
3750 | ||
3751 | /// Signed multiply long | |
3752 | name = vmull | |
3753 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
3754 | b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 | |
3755 | validate 1, 4, 3, 8, 5, 12, 7, 16, 9, 20, 11, 24, 13, 28, 15, 32 | |
3756 | ||
3757 | arm = vmull.s | |
3758 | aarch64 = smull | |
3759 | link-arm = vmulls._EXT_ | |
3760 | link-aarch64 = smull._EXT_ | |
3761 | generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t | |
3762 | ||
3763 | /// Signed multiply long | |
3764 | name = vmull_high | |
3765 | no-q | |
3766 | multi_fn = simd_shuffle!, a:half, a, a, {fixed-half-right} |
3767 | multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right} | |
3768 | multi_fn = vmull-noqself-noext, a, b |
3769 | a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 | |
3770 | b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 | |
3771 | fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
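// Only the high half of each input is widened and multiplied:
// (9, 10, 11, 12, 13, 14, 15, 16) * (1, 2, 1, 2, 1, 2, 1, 2).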
3772 | validate 9, 20, 11, 24, 13, 28, 15, 32 | |
3773 | ||
3774 | aarch64 = smull2 | |
3775 | generate int8x16_t:int8x16_t:int16x8_t, int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t | |
3776 | ||
3777 | /// Unsigned multiply long | |
3778 | name = vmull | |
3779 | a = 1, 2, 3, 4, 5, 6, 7, 8 | |
3780 | b = 1, 2, 1, 2, 1, 2, 1, 2 | |
3781 | validate 1, 4, 3, 8, 5, 12, 7, 16 | |
3782 | ||
3783 | arm = vmull.s | |
3784 | aarch64 = umull | |
3785 | link-arm = vmullu._EXT_ | |
3786 | link-aarch64 = umull._EXT_ | |
3787 | generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t | |
3788 | ||
3789 | /// Unsigned multiply long | |
3790 | name = vmull_high | |
3791 | no-q | |
3792 | multi_fn = simd_shuffle!, a:half, a, a, {fixed-half-right} |
3793 | multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right} | |
3794 | multi_fn = vmull-noqself-noext, a, b |
3795 | a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 | |
3796 | b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 | |
3797 | fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
3798 | validate 9, 20, 11, 24, 13, 28, 15, 32 | |
3799 | ||
3800 | aarch64 = umull2 | |
3801 | generate uint8x16_t:uint8x16_t:uint16x8_t, uint16x8_t:uint16x8_t:uint32x4_t, uint32x4_t:uint32x4_t:uint64x2_t | |
3802 | ||
3803 | /// Polynomial multiply long | |
3804 | name = vmull | |
3805 | a = 1, 2, 3, 4, 5, 6, 7, 8 | |
3806 | b = 1, 3, 1, 3, 1, 3, 1, 3 | |
3807 | validate 1, 6, 3, 12, 5, 10, 7, 24 | |
3808 | ||
3809 | arm = vmull.s | |
3810 | aarch64 = pmull | |
3811 | link-arm = vmullp._EXT_ | |
3812 | link-aarch64 = pmull._EXT_ | |
3813 | generate poly8x8_t:poly8x8_t:poly16x8_t | |
3814 | ||
3815 | /// Polynomial multiply long | |
3816 | name = vmull | |
3817 | no-q | |
3818 | a = 15 | |
3819 | b = 3 | |
3820 | validate 17 | |
94222f64 | 3821 | target = aes |
17df50a5 XL |
3822 | |
3823 | aarch64 = pmull | |
3824 | link-aarch64 = pmull64:p64:p64:p64:int8x16_t | |
3c0e092e | 3825 | // Because of the support status of llvm, vmull_p64 is currently only available on aarch64 |
17df50a5 XL |
3826 | // arm = vmull |
3827 | // link-arm = vmullp.v2i64:int64x1_t:int64x1_t:int64x1_t:int64x2_t | |
3828 | generate p64:p64:p128 | |
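//
// Illustration only: polynomial multiplication is carry-less, i.e. partial
// products are combined with XOR instead of addition. A scalar Rust model of
// the p64 -> p128 case (the fn name is made up for the sketch):
//
//     fn pmull64_model(a: u64, b: u64) -> u128 {
//         let mut r: u128 = 0;
//         for i in 0..64 {
//             if (b >> i) & 1 == 1 {
//                 r ^= (a as u128) << i; // XOR partial product: no carries
//             }
//         }
//         r
//     }
//
// This reproduces the test vector above: pmull64_model(15, 3) == 17.
//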
3829 | ||
3830 | ||
3831 | /// Polynomial multiply long | |
3832 | name = vmull_high | |
3833 | no-q | |
353b0b11 FG |
3834 | multi_fn = simd_shuffle!, a:half, a, a, {fixed-half-right} |
3835 | multi_fn = simd_shuffle!, b:half, b, b, {fixed-half-right} | |
17df50a5 XL |
3836 | multi_fn = vmull-noqself-noext, a, b |
3837 | a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 | |
3838 | b = 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3 | |
3839 | fixed = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
3840 | validate 9, 30, 11, 20, 13, 18, 15, 48 | |
3841 | ||
3842 | aarch64 = pmull | |
3843 | generate poly8x16_t:poly8x16_t:poly16x8_t | |
3844 | ||
3845 | /// Polynomial multiply long | |
3846 | name = vmull_high | |
3847 | no-q | |
c620b35d | 3848 | multi_fn = vmull-noqself-noext, {simd_extract!, a, 1}, {simd_extract!, b, 1} |
17df50a5 XL |
3849 | a = 1, 15 |
3850 | b = 1, 3 | |
3851 | validate 17 | |
94222f64 | 3852 | target = aes |
17df50a5 XL |
3853 | |
3854 | aarch64 = pmull | |
3855 | generate poly64x2_t:poly64x2_t:p128 | |
3856 | ||
3857 | /// Vector long multiply with scalar | |
c295e0f8 XL |
3858 | name = vmull_n |
3859 | no-q | |
17df50a5 XL |
3860 | multi_fn = vmull-in0-noext, a, {vdup-nin0-noext, b} |
3861 | a = 1, 2, 3, 4, 5, 6, 7, 8 | |
3862 | b = 2 | |
3863 | validate 2, 4, 6, 8, 10, 12, 14, 16 | |
3864 | ||
3865 | arm = vmull | |
3866 | aarch64 = smull | |
3867 | generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t | |
3868 | aarch64 = umull | |
3869 | generate uint16x4_t:u16:uint32x4_t, uint32x2_t:u32:uint64x2_t | |
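//
// Illustration only: the `_n` forms duplicate the scalar into every lane
// before the widening multiply. A sketch matching the test vectors above,
// assuming NEON is available:
//
//     unsafe fn demo_vmull_n() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: int16x4_t = transmute([1i16, 2, 3, 4]);
//         let r: int32x4_t = vmull_n_s16(a, 2); // b broadcast to all lanes
//         assert_eq!(transmute::<_, [i32; 4]>(r), [2, 4, 6, 8]);
//     }
//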
3870 | ||
3871 | /// Vector long multiply by scalar | |
3872 | name = vmull_lane | |
3873 | constn = LANE | |
3874 | multi_fn = static_assert_imm-in_exp_len-LANE | |
353b0b11 | 3875 | multi_fn = vmull-in0-noext, a, {simd_shuffle!, b, b, {dup-in0_len-LANE as u32}} |
17df50a5 XL |
3876 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 |
3877 | b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
3878 | n = 1 | |
3879 | validate 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32 | |
3880 | ||
3881 | arm = vmull | |
3882 | aarch64 = smull | |
3883 | generate int16x4_t:int16x4_t:int32x4_t, int16x4_t:int16x8_t:int32x4_t | |
3884 | generate int32x2_t:int32x2_t:int64x2_t, int32x2_t:int32x4_t:int64x2_t | |
3885 | aarch64 = umull | |
3886 | generate uint16x4_t:uint16x4_t:uint32x4_t, uint16x4_t:uint16x8_t:uint32x4_t | |
3887 | generate uint32x2_t:uint32x2_t:uint64x2_t, uint32x2_t:uint32x4_t:uint64x2_t | |
3888 | ||
3889 | /// Multiply long | |
3890 | name = vmull_high_n | |
3891 | no-q | |
3892 | multi_fn = vmull_high-noqself-noext, a, {vdup-nin0-noext, b} | |
3893 | a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 | |
3894 | b = 2 | |
3895 | validate 18, 20, 22, 24, 26, 28, 30, 32 | |
3896 | ||
3897 | aarch64 = smull2 | |
3898 | generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t | |
3899 | aarch64 = umull2 | |
3900 | generate uint16x8_t:u16:uint32x4_t, uint32x4_t:u32:uint64x2_t | |
3901 | ||
3902 | /// Multiply long | |
3903 | name = vmull_high_lane | |
3904 | constn = LANE | |
3905 | multi_fn = static_assert_imm-in_exp_len-LANE | |
353b0b11 | 3906 | multi_fn = vmull_high-noqself-noext, a, {simd_shuffle!, b, b, {dup-in0_len-LANE as u32}} |
17df50a5 XL |
3907 | a = 1, 2, 9, 10, 9, 10, 11, 12, 9, 10, 11, 12, 13, 14, 15, 16 |
3908 | b = 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
3909 | n = 1 | |
3910 | validate 18, 20, 22, 24, 26, 28, 30, 32 | |
3911 | ||
3912 | aarch64 = smull2 | |
3913 | generate int16x8_t:int16x4_t:int32x4_t, int16x8_t:int16x8_t:int32x4_t | |
3914 | generate int32x4_t:int32x2_t:int64x2_t, int32x4_t:int32x4_t:int64x2_t | |
3915 | aarch64 = umull2 | |
3916 | generate uint16x8_t:uint16x4_t:uint32x4_t, uint16x8_t:uint16x8_t:uint32x4_t | |
3917 | generate uint32x4_t:uint32x2_t:uint64x2_t, uint32x4_t:uint32x4_t:uint64x2_t | |
3918 | ||
3919 | /// Floating-point multiply extended | |
3920 | name = vmulx | |
3921 | a = 1., 2., 3., 4. | |
3922 | b = 2., 2., 2., 2. | |
3923 | validate 2., 4., 6., 8. | |
3924 | ||
3925 | aarch64 = fmulx | |
3926 | link-aarch64 = fmulx._EXT_ | |
3927 | generate float*_t, float64x*_t | |
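//
// Illustration only: vmulx links to the dedicated fmulx intrinsic rather than
// simd_mul because FMULX defines 0.0 * +/-Inf as +/-2.0 instead of NaN (useful
// in reciprocal-step algorithms). A rough sketch (demo fn name made up):
//
//     unsafe fn demo_vmulx() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: float32x2_t = transmute([0.0f32, 1.0]);
//         let b: float32x2_t = transmute([f32::INFINITY, 2.0]);
//         let r = vmulx_f32(a, b); // 0 * Inf yields 2.0, not NaN
//         assert_eq!(transmute::<_, [f32; 2]>(r), [2.0, 2.0]);
//     }
//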
3928 | ||
3929 | /// Floating-point multiply extended | |
3930 | name = vmulx | |
3931 | lane-suffixes | |
3932 | constn = LANE | |
3933 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 3934 | multi_fn = vmulx-in0-noext, a, {transmute--<element_t _>, {simd_extract!, b, LANE as u32}} |
17df50a5 XL |
3935 | a = 1. |
3936 | b = 2., 0. | |
3937 | n = 0 | |
3938 | validate 2. | |
3939 | ||
3940 | aarch64 = fmulx | |
3941 | generate float64x1_t, float64x1_t:float64x2_t:float64x1_t | |
3942 | ||
3943 | /// Floating-point multiply extended | |
3944 | name = vmulx | |
3945 | lane-suffixes | |
3946 | constn = LANE | |
3947 | multi_fn = static_assert_imm-in_exp_len-LANE | |
353b0b11 | 3948 | multi_fn = vmulx-in0-noext, a, {simd_shuffle!, b, b, {dup-in0_len-LANE as u32}} |
17df50a5 XL |
3949 | a = 1., 2., 3., 4. |
3950 | b = 2., 0., 0., 0. | |
3951 | n = 0 | |
3952 | validate 2., 4., 6., 8. | |
3953 | ||
3954 | aarch64 = fmulx | |
3955 | generate float32x2_t, float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
3956 | generate float64x2_t:float64x1_t:float64x2_t, float64x2_t | |
3957 | ||
3958 | /// Floating-point multiply extended | |
3959 | name = vmulx | |
3960 | a = 2. | |
3961 | b = 3. | |
3962 | validate 6. | |
3963 | ||
3964 | aarch64 = fmulx | |
3965 | link-aarch64 = fmulx._EXT_ | |
3966 | generate f32, f64 | |
3967 | ||
3968 | /// Floating-point multiply extended | |
3969 | name = vmulx | |
3970 | lane-suffixes | |
3971 | constn = LANE | |
3972 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 3973 | multi_fn = vmulx-out-noext, a, {simd_extract!, b, LANE as u32} |
17df50a5 XL |
3974 | |
3975 | a = 2. | |
3976 | b = 3., 0., 0., 0. | |
3977 | n = 0 | |
3978 | validate 6. | |
3979 | ||
3980 | aarch64 = fmulx | |
3981 | generate f32:float32x2_t:f32, f32:float32x4_t:f32, f64:float64x1_t:f64, f64:float64x2_t:f64 | |
3982 | ||
3983 | /// Floating-point fused Multiply-Add to accumulator (vector) |
3984 | name = vfma | |
3985 | multi_fn = vfma-self-_, b, c, a | |
3986 | a = 8.0, 18.0, 12.0, 10.0 | |
3987 | b = 6.0, 4.0, 7.0, 8.0 | |
3988 | c = 2.0, 3.0, 4.0, 5.0 | |
3989 | validate 20.0, 30.0, 40.0, 50.0 | |
3990 | ||
3991 | link-aarch64 = llvm.fma._EXT_ | |
3992 | aarch64 = fmadd | |
3993 | generate float64x1_t | |
3994 | aarch64 = fmla | |
3995 | generate float64x2_t | |
3996 | ||
c295e0f8 | 3997 | target = vfp4 |
17df50a5 XL |
3998 | arm = vfma |
3999 | link-arm = llvm.fma._EXT_ | |
4000 | generate float*_t | |
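//
// Illustration only: the accumulator is the first argument, so the generated
// intrinsics compute `a + b * c` with a single rounding step. A sketch using
// the entry's first lanes:
//
//     unsafe fn demo_vfma() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: float32x2_t = transmute([8.0f32, 18.0]);
//         let b: float32x2_t = transmute([6.0f32, 4.0]);
//         let c: float32x2_t = transmute([2.0f32, 3.0]);
//         let r = vfma_f32(a, b, c); // fused: a + (b * c)
//         assert_eq!(transmute::<_, [f32; 2]>(r), [20.0, 30.0]);
//     }
//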
4001 | ||
4002 | /// Floating-point fused Multiply-Add to accumulator (vector) |
4003 | name = vfma | |
4004 | n-suffix | |
c295e0f8 | 4005 | multi_fn = vfma-self-noext, a, b, {vdup-nselfvfp4-noext, c} |
17df50a5 XL |
4006 | a = 2.0, 3.0, 4.0, 5.0 |
4007 | b = 6.0, 4.0, 7.0, 8.0 | |
4008 | c = 8.0 | |
4009 | validate 50.0, 35.0, 60.0, 69.0 | |
4010 | ||
4011 | aarch64 = fmadd | |
4012 | generate float64x1_t:float64x1_t:f64:float64x1_t | |
4013 | aarch64 = fmla | |
4014 | generate float64x2_t:float64x2_t:f64:float64x2_t | |
4015 | ||
c295e0f8 | 4016 | target = vfp4 |
17df50a5 XL |
4017 | arm = vfma |
4018 | generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t | |
4019 | ||
4020 | /// Floating-point fused multiply-add to accumulator | |
4021 | name = vfma | |
4022 | in2-lane-suffixes | |
4023 | constn = LANE | |
4024 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
c620b35d | 4025 | multi_fn = vfma-out-noext, a, b, {vdup-nout-noext, {simd_extract!, c, LANE as u32}} |
17df50a5 XL |
4026 | a = 2., 3., 4., 5. |
4027 | b = 6., 4., 7., 8. | |
4028 | c = 2., 0., 0., 0. | |
4029 | n = 0 | |
4030 | validate 14., 11., 18., 21. | |
4031 | ||
4032 | aarch64 = fmla | |
4033 | generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
4034 | aarch64 = fmadd | |
c620b35d | 4035 | generate float64x1_t, float64x1_t:float64x1_t:float64x2_t:float64x1_t |
17df50a5 | 4036 | aarch64 = fmla |
c620b35d | 4037 | generate float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t |
17df50a5 XL |
4038 | |
4039 | /// Floating-point fused multiply-add to accumulator | |
4040 | name = vfma | |
4041 | in2-lane-suffixes | |
4042 | constn = LANE | |
4043 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
c620b35d | 4044 | multi_fn = simd_extract!, c:out_t, c, LANE as u32 |
17df50a5 XL |
4045 | multi_fn = vfma-in2lane-_, b, c, a |
4046 | a = 2. | |
4047 | b = 6. | |
4048 | c = 3., 0., 0., 0. | |
4049 | n = 0 | |
4050 | validate 20. | |
4051 | ||
c620b35d | 4052 | aarch64 = fmadd |
17df50a5 XL |
4053 | link-aarch64 = llvm.fma._EXT_:f32:f32:f32:f32 |
4054 | generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32 | |
4055 | link-aarch64 = llvm.fma._EXT_:f64:f64:f64:f64 | |
c620b35d | 4056 | generate f64:f64:float64x1_t:f64, f64:f64:float64x2_t:f64 |
17df50a5 XL |
4057 | |
4058 | /// Floating-point fused multiply-subtract from accumulator | |
4059 | name = vfms | |
4060 | multi_fn = simd_neg, b:in_t, b | |
4061 | multi_fn = vfma-self-noext, a, b, c | |
4062 | a = 20.0, 30.0, 40.0, 50.0 | |
4063 | b = 6.0, 4.0, 7.0, 8.0 | |
4064 | c = 2.0, 3.0, 4.0, 5.0 | |
4065 | validate 8.0, 18.0, 12.0, 10.0 | |
4066 | ||
4067 | aarch64 = fmsub | |
4068 | generate float64x1_t | |
4069 | aarch64 = fmls | |
4070 | generate float64x2_t | |
4071 | ||
c295e0f8 | 4072 | target = vfp4 |
17df50a5 XL |
4073 | arm = vfms |
4074 | generate float*_t | |
4075 | ||
4076 | /// Floating-point fused Multiply-Subtract from accumulator (vector) |
4077 | name = vfms | |
4078 | n-suffix | |
c295e0f8 | 4079 | multi_fn = vfms-self-noext, a, b, {vdup-nselfvfp4-noext, c} |
17df50a5 XL |
4080 | a = 50.0, 35.0, 60.0, 69.0 |
4081 | b = 6.0, 4.0, 7.0, 8.0 | |
4082 | c = 8.0 | |
4083 | validate 2.0, 3.0, 4.0, 5.0 | |
4084 | ||
4085 | aarch64 = fmsub | |
4086 | generate float64x1_t:float64x1_t:f64:float64x1_t | |
4087 | aarch64 = fmls | |
4088 | generate float64x2_t:float64x2_t:f64:float64x2_t | |
4089 | ||
c295e0f8 | 4090 | target = vfp4 |
17df50a5 XL |
4091 | arm = vfms |
4092 | generate float32x2_t:float32x2_t:f32:float32x2_t, float32x4_t:float32x4_t:f32:float32x4_t | |
4093 | ||
4094 | /// Floating-point fused multiply-subtract from accumulator |
4095 | name = vfms | |
4096 | in2-lane-suffixes | |
4097 | constn = LANE | |
4098 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
c620b35d | 4099 | multi_fn = vfms-out-noext, a, b, {vdup-nout-noext, {simd_extract!, c, LANE as u32}} |
17df50a5 XL |
4100 | a = 14., 11., 18., 21. |
4101 | b = 6., 4., 7., 8. | |
4102 | c = 2., 0., 0., 0. | |
4103 | n = 0 | |
4104 | validate 2., 3., 4., 5. | |
4105 | ||
4106 | aarch64 = fmls | |
4107 | generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t, float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
4108 | aarch64 = fmsub | |
c620b35d | 4109 | generate float64x1_t, float64x1_t:float64x1_t:float64x2_t:float64x1_t |
17df50a5 | 4110 | aarch64 = fmls |
c620b35d | 4111 | generate float64x2_t:float64x2_t:float64x1_t:float64x2_t, float64x2_t |
17df50a5 XL |
4112 | |
4113 | /// Floating-point fused multiply-subtract from accumulator |
4114 | name = vfms | |
4115 | in2-lane-suffixes | |
4116 | constn = LANE | |
4117 | multi_fn = vfma-in2lane-::<LANE>, a, -b, c | |
4118 | a = 14. | |
4119 | b = 6. | |
4120 | c = 2., 0., 0., 0. | |
4121 | n = 0 | |
4122 | validate 2. | |
4123 | ||
17df50a5 | 4124 | aarch64 = fmsub |
c620b35d | 4125 | generate f32:f32:float32x2_t:f32, f32:f32:float32x4_t:f32, f64:f64:float64x1_t:f64, f64:f64:float64x2_t:f64 |
17df50a5 XL |
4126 | |
4127 | /// Divide | |
4128 | name = vdiv | |
4129 | fn = simd_div | |
4130 | a = 2.0, 6.0, 4.0, 10.0 | |
4131 | b = 1.0, 2.0, 1.0, 2.0 | |
4132 | validate 2.0, 3.0, 4.0, 5.0 | |
4133 | ||
4134 | aarch64 = fdiv | |
4135 | generate float*_t, float64x*_t | |
4136 | ||
4137 | /// Subtract | |
4138 | name = vsub | |
4139 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
4140 | b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 | |
4141 | validate 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 | |
4142 | arm = vsub. | |
4143 | aarch64 = sub | |
4144 | fn = simd_sub | |
4145 | generate int*_t, uint*_t, int64x*_t, uint64x*_t | |
4146 | ||
4147 | /// Subtract | |
4148 | name = vsub | |
4149 | fn = simd_sub | |
4150 | a = 1.0, 4.0, 3.0, 8.0 | |
4151 | b = 1.0, 2.0, 3.0, 4.0 | |
4152 | validate 0.0, 2.0, 0.0, 4.0 | |
4153 | ||
4154 | aarch64 = fsub | |
4155 | generate float64x*_t | |
4156 | ||
4157 | arm = vsub. | |
4158 | generate float*_t | |
4159 | ||
3c0e092e XL |
4160 | /// Subtract |
4161 | name = vsub | |
a2a8927a | 4162 | multi_fn = a.wrapping_sub(b) |
3c0e092e XL |
4163 | a = 3 |
4164 | b = 2 | |
4165 | validate 1 | |
4166 | ||
4167 | aarch64 = nop | |
4168 | generate i64, u64 | |
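//
// Illustration only: the scalar d-form maps to a plain register subtract, so
// overflow wraps exactly like `wrapping_sub` above:
//
//     unsafe fn demo_vsubd() {
//         use core::arch::aarch64::*;
//         assert_eq!(vsubd_s64(i64::MIN, 1), i64::MAX); // wraps, no trap
//     }
//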
4169 | ||
4170 | /// Add | |
4171 | name = vadd | |
a2a8927a | 4172 | multi_fn = a.wrapping_add(b) |
3c0e092e XL |
4173 | a = 1 |
4174 | b = 2 | |
4175 | validate 3 | |
4176 | ||
4177 | aarch64 = nop | |
4178 | generate i64, u64 | |
4179 | ||
4180 | /// Bitwise exclusive OR | |
4181 | name = vadd | |
4182 | multi_fn = simd_xor, a, b | |
4183 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
4184 | b = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
4185 | validate 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 17 | |
4186 | ||
4187 | aarch64 = nop | |
4188 | arm = nop | |
4189 | generate poly8x8_t, poly16x4_t, poly8x16_t, poly16x8_t, poly64x1_t, poly64x2_t | |
4190 | ||
4191 | /// Bitwise exclusive OR | |
4192 | name = vaddq | |
4193 | no-q | |
4194 | multi_fn = a ^ b | |
4195 | a = 16 | |
4196 | b = 1 | |
4197 | validate 17 | |
4198 | ||
4199 | aarch64 = nop | |
4200 | arm = nop | |
4201 | generate p128 | |
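//
// Illustration only: polynomial coefficients live in GF(2), so "adding" two
// poly values is a bitwise XOR, matching the test vector above:
//
//     unsafe fn demo_vaddq_p128() {
//         use core::arch::aarch64::*;
//         assert_eq!(vaddq_p128(16, 1), 17); // 0b10000 ^ 0b00001
//     }
//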
4202 | ||
4203 | /// Floating-point add across vector | |
4204 | name = vaddv | |
4205 | a = 1., 2., 0., 0. | |
4206 | validate 3. | |
4207 | ||
4208 | aarch64 = faddp | |
4209 | link-aarch64 = faddv._EXT2_._EXT_ | |
4210 | generate float32x2_t:f32, float32x4_t:f32, float64x2_t:f64 | |
4211 | ||
17df50a5 XL |
4212 | /// Signed Add Long across Vector |
4213 | name = vaddlv | |
4214 | a = 1, 2, 3, 4 | |
4215 | validate 10 | |
4216 | ||
4217 | aarch64 = saddlv | |
4218 | link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ | |
4219 | generate int16x4_t:i32 | |
4220 | ||
4221 | /// Signed Add Long across Vector | |
4222 | name = vaddlv | |
4223 | a = 1, 2, 3, 4, 5, 6, 7, 8 | |
4224 | validate 36 | |
4225 | ||
4226 | aarch64 = saddlv | |
4227 | link-aarch64 = llvm.aarch64.neon.saddlv.i32._EXT_ | |
4228 | generate int16x8_t:i32 | |
4229 | ||
4230 | /// Signed Add Long across Vector | |
4231 | name = vaddlv | |
4232 | a = 1, 2 | |
4233 | validate 3 | |
4234 | ||
4235 | aarch64 = saddlp | |
4236 | link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ | |
4237 | generate int32x2_t:i64 | |
4238 | ||
4239 | /// Signed Add Long across Vector | |
4240 | name = vaddlv | |
4241 | a = 1, 2, 3, 4 | |
4242 | validate 10 | |
4243 | ||
4244 | aarch64 = saddlv | |
4245 | link-aarch64 = llvm.aarch64.neon.saddlv.i64._EXT_ | |
4246 | generate int32x4_t:i64 | |
4247 | ||
4248 | /// Unsigned Add Long across Vector | |
4249 | name = vaddlv | |
4250 | a = 1, 2, 3, 4 | |
4251 | validate 10 | |
4252 | ||
4253 | aarch64 = uaddlv | |
4254 | link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ | |
4255 | generate uint16x4_t:u32 | |
4256 | ||
4257 | /// Unsigned Add Long across Vector | |
4258 | name = vaddlv | |
4259 | a = 1, 2, 3, 4, 5, 6, 7, 8 | |
4260 | validate 36 | |
4261 | ||
4262 | aarch64 = uaddlv | |
4263 | link-aarch64 = llvm.aarch64.neon.uaddlv.i32._EXT_ | |
4264 | generate uint16x8_t:u32 | |
4265 | ||
4266 | /// Unsigned Add Long across Vector | |
4267 | name = vaddlv | |
4268 | a = 1, 2 | |
4269 | validate 3 | |
4270 | ||
4271 | aarch64 = uaddlp | |
4272 | link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ | |
4273 | generate uint32x2_t:u64 | |
4274 | ||
4275 | /// Unsigned Add Long across Vector | |
4276 | name = vaddlv | |
4277 | a = 1, 2, 3, 4 | |
4278 | validate 10 | |
4279 | ||
4280 | aarch64 = uaddlv | |
4281 | link-aarch64 = llvm.aarch64.neon.uaddlv.i64._EXT_ | |
4282 | generate uint32x4_t:u64 | |
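//
// Illustration only: the lanes are widened before the horizontal sum, so the
// reduction cannot wrap at the element width. A sketch (demo fn name made up):
//
//     unsafe fn demo_vaddlv() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: uint16x4_t = transmute([1u16, 2, 3, 4]);
//         let s: u32 = vaddlv_u16(a); // u16 lanes summed into a u32
//         assert_eq!(s, 10);
//     }
//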
4283 | ||
4284 | /// Subtract returning high narrow | |
4285 | name = vsubhn | |
4286 | no-q | |
4287 | multi_fn = fixed, c:in_t | |
4288 | multi_fn = simd_cast, {simd_shr, {simd_sub, a, b}, transmute(c)} | |
4289 | a = MAX, MIN, 1, 1, MAX, MIN, 1, 1 | |
4290 | b = 1, 0, 0, 0, 1, 0, 0, 0 | |
4291 | fixed = HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS, HFBITS | |
4292 | validate MAX, MIN, 0, 0, MAX, MIN, 0, 0 | |
4293 | ||
4294 | arm = vsubhn | |
4295 | aarch64 = subhn | |
4296 | generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t | |
4297 | generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t | |
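//
// Illustration only: "high narrow" keeps only the upper half of each element
// of the difference, which is what the shift-by-HFBITS above implements:
//
//     unsafe fn demo_vsubhn() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: int16x8_t = transmute([0x1234i16; 8]);
//         let b: int16x8_t = transmute([0x0034i16; 8]);
//         let r: int8x8_t = vsubhn_s16(a, b); // (a - b) >> 8, then narrowed
//         assert_eq!(transmute::<_, [i8; 8]>(r), [0x12i8; 8]);
//     }
//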
4298 | ||
4299 | /// Subtract returning high narrow | |
4300 | name = vsubhn_high | |
4301 | no-q | |
4302 | multi_fn = vsubhn-noqself-noext, d:in_t0, b, c | |
353b0b11 | 4303 | multi_fn = simd_shuffle!, a, d, {asc-0-out_len} |
17df50a5 XL |
4304 | a = MAX, 0, MAX, 0, MAX, 0, MAX, 0 |
4305 | b = MAX, 1, MAX, 1, MAX, 1, MAX, 1 | |
4306 | c = 1, 0, 1, 0, 1, 0, 1, 0 | |
4307 | validate MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0, MAX, 0 | |
4308 | ||
4309 | arm = vsubhn | |
4310 | aarch64 = subhn2 | |
4311 | generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t | |
4312 | generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t | |
4313 | ||
4314 | /// Signed halving subtract | |
4315 | name = vhsub | |
4316 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
4317 | b = 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2 | |
4318 | validate 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7 | |
4319 | ||
4320 | arm = vhsub.s | |
4321 | aarch64 = uhsub | |
4322 | link-arm = vhsubu._EXT_ | |
4323 | link-aarch64 = uhsub._EXT_ | |
4324 | generate uint*_t | |
4325 | ||
4326 | arm = vhsub.s | |
4327 | aarch64 = shsub | |
4328 | link-arm = vhsubs._EXT_ | |
4329 | link-aarch64 = shsub._EXT_ | |
4330 | generate int*_t | |
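//
// Illustration only: halving subtract computes (a - b) >> 1 without losing
// the intermediate bit to overflow:
//
//     unsafe fn demo_vhsub() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: int8x8_t = transmute([7i8; 8]);
//         let b: int8x8_t = transmute([2i8; 8]);
//         let r = vhsub_s8(a, b); // (7 - 2) >> 1 == 2 in every lane
//         assert_eq!(transmute::<_, [i8; 8]>(r), [2i8; 8]);
//     }
//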
4331 | ||
4332 | /// Signed Subtract Wide | |
4333 | name = vsubw | |
4334 | no-q | |
4335 | multi_fn = simd_sub, a, {simd_cast, b} | |
4336 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 | |
4337 | b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 | |
4338 | validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
4339 | ||
4340 | arm = vsubw | |
4341 | aarch64 = ssubw | |
4342 | generate int16x8_t:int8x8_t:int16x8_t, int32x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int64x2_t | |
4343 | ||
4344 | /// Unsigned Subtract Wide | |
4345 | name = vsubw | |
4346 | no-q | |
4347 | multi_fn = simd_sub, a, {simd_cast, b} | |
4348 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 | |
4349 | b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 | |
4350 | validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
4351 | ||
4352 | arm = vsubw | |
4353 | aarch64 = usubw | |
4354 | generate uint16x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint64x2_t | |
4355 | ||
4356 | /// Signed Subtract Wide | |
4357 | name = vsubw_high | |
4358 | no-q | |
353b0b11 | 4359 | multi_fn = simd_shuffle!, c:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
17df50a5 XL |
4360 | multi_fn = simd_sub, a, {simd_cast, c} |
4361 | a = 8, 9, 10, 12, 13, 14, 15, 16 | |
4362 | b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16 | |
4363 | validate 0, 0, 0, 0, 0, 0, 0, 0 | |
4364 | ||
4365 | aarch64 = ssubw | |
4366 | generate int16x8_t:int8x16_t:int16x8_t | |
4367 | ||
4368 | /// Signed Subtract Wide | |
4369 | name = vsubw_high | |
4370 | no-q | |
353b0b11 | 4371 | multi_fn = simd_shuffle!, c:int16x4_t, b, b, [4, 5, 6, 7] |
17df50a5 XL |
4372 | multi_fn = simd_sub, a, {simd_cast, c} |
4373 | a = 8, 9, 10, 11 | |
4374 | b = 0, 1, 2, 3, 8, 9, 10, 11 | |
4375 | validate 0, 0, 0, 0 | |
4376 | ||
4377 | aarch64 = ssubw | |
4378 | generate int32x4_t:int16x8_t:int32x4_t | |
4379 | ||
4380 | /// Signed Subtract Wide | |
4381 | name = vsubw_high | |
4382 | no-q | |
353b0b11 | 4383 | multi_fn = simd_shuffle!, c:int32x2_t, b, b, [2, 3] |
17df50a5 XL |
4384 | multi_fn = simd_sub, a, {simd_cast, c} |
4385 | a = 8, 9 | |
4386 | b = 6, 7, 8, 9 | |
4387 | validate 0, 0 | |
4388 | ||
4389 | aarch64 = ssubw | |
4390 | generate int64x2_t:int32x4_t:int64x2_t | |
4391 | ||
4392 | /// Unsigned Subtract Wide | |
4393 | name = vsubw_high | |
4394 | no-q | |
353b0b11 | 4395 | multi_fn = simd_shuffle!, c:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
17df50a5 XL |
4396 | multi_fn = simd_sub, a, {simd_cast, c} |
4397 | a = 8, 9, 10, 11, 12, 13, 14, 15 | |
4398 | b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
4399 | validate 0, 0, 0, 0, 0, 0, 0, 0 | |
4400 | ||
4401 | aarch64 = usubw | |
4402 | generate uint16x8_t:uint8x16_t:uint16x8_t | |
4403 | ||
4404 | /// Unsigned Subtract Wide | |
4405 | name = vsubw_high | |
4406 | no-q | |
353b0b11 | 4407 | multi_fn = simd_shuffle!, c:uint16x4_t, b, b, [4, 5, 6, 7] |
17df50a5 XL |
4408 | multi_fn = simd_sub, a, {simd_cast, c} |
4409 | a = 8, 9, 10, 11 | |
4410 | b = 0, 1, 2, 3, 8, 9, 10, 11 | |
4411 | validate 0, 0, 0, 0 | |
4412 | ||
4413 | aarch64 = usubw | |
4414 | generate uint32x4_t:uint16x8_t:uint32x4_t | |
4415 | ||
4416 | /// Unsigned Subtract Wide | |
4417 | name = vsubw_high | |
4418 | no-q | |
353b0b11 | 4419 | multi_fn = simd_shuffle!, c:uint32x2_t, b, b, [2, 3] |
17df50a5 XL |
4420 | multi_fn = simd_sub, a, {simd_cast, c} |
4421 | a = 8, 9 | |
4422 | b = 6, 7, 8, 9 | |
4423 | validate 0, 0 | |
4424 | ||
4425 | aarch64 = usubw | |
4426 | generate uint64x2_t:uint32x4_t:uint64x2_t | |
4427 | ||
4428 | /// Signed Subtract Long | |
4429 | name = vsubl | |
4430 | no-q | |
4431 | multi_fn = simd_cast, c:out_t, a | |
4432 | multi_fn = simd_cast, d:out_t, b | |
4433 | multi_fn = simd_sub, c, d | |
4434 | ||
4435 | a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
4436 | b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
4437 | validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
4438 | ||
4439 | arm = vsubl | |
4440 | aarch64 = ssubl | |
4441 | generate int8x8_t:int8x8_t:int16x8_t, int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t | |
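//
// Illustration only: both operands are widened before subtracting, so the
// full i8 range can be subtracted without overflow:
//
//     unsafe fn demo_vsubl() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: int8x8_t = transmute([i8::MAX; 8]);
//         let b: int8x8_t = transmute([i8::MIN; 8]);
//         let r: int16x8_t = vsubl_s8(a, b); // 127 - (-128) = 255 fits in i16
//         assert_eq!(transmute::<_, [i16; 8]>(r), [255i16; 8]);
//     }
//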
4442 | ||
4443 | /// Unsigned Subtract Long | |
4444 | name = vsubl | |
4445 | no-q | |
4446 | multi_fn = simd_cast, c:out_t, a | |
4447 | multi_fn = simd_cast, d:out_t, b | |
4448 | multi_fn = simd_sub, c, d | |
4449 | ||
4450 | a = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
4451 | b = MAX, MIN, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
4452 | validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
4453 | ||
4454 | arm = vsubl | |
4455 | aarch64 = usubl | |
4456 | generate uint8x8_t:uint8x8_t:uint16x8_t, uint16x4_t:uint16x4_t:uint32x4_t, uint32x2_t:uint32x2_t:uint64x2_t | |
4457 | ||
4458 | /// Signed Subtract Long | |
4459 | name = vsubl_high | |
4460 | no-q | |
353b0b11 | 4461 | multi_fn = simd_shuffle!, c:int8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] |
17df50a5 | 4462 | multi_fn = simd_cast, d:out_t, c |
353b0b11 | 4463 | multi_fn = simd_shuffle!, e:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
17df50a5 XL |
4464 | multi_fn = simd_cast, f:out_t, e |
4465 | multi_fn = simd_sub, d, f | |
4466 | ||
4467 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
4468 | b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 | |
4469 | validate 6, 7, 8, 9, 10, 11, 12, 13 | |
4470 | ||
4471 | aarch64 = ssubl | |
4472 | generate int8x16_t:int8x16_t:int16x8_t | |
4473 | ||
4474 | /// Signed Subtract Long | |
4475 | name = vsubl_high | |
4476 | no-q | |
353b0b11 | 4477 | multi_fn = simd_shuffle!, c:int16x4_t, a, a, [4, 5, 6, 7] |
17df50a5 | 4478 | multi_fn = simd_cast, d:out_t, c |
353b0b11 | 4479 | multi_fn = simd_shuffle!, e:int16x4_t, b, b, [4, 5, 6, 7] |
17df50a5 XL |
4480 | multi_fn = simd_cast, f:out_t, e |
4481 | multi_fn = simd_sub, d, f | |
4482 | ||
4483 | a = 8, 9, 10, 11, 12, 13, 14, 15 | |
4484 | b = 6, 6, 6, 6, 8, 8, 8, 8 | |
4485 | validate 4, 5, 6, 7 | |
4486 | ||
4487 | aarch64 = ssubl | |
4488 | generate int16x8_t:int16x8_t:int32x4_t | |
4489 | ||
4490 | /// Signed Subtract Long | |
4491 | name = vsubl_high | |
4492 | no-q | |
353b0b11 | 4493 | multi_fn = simd_shuffle!, c:int32x2_t, a, a, [2, 3] |
17df50a5 | 4494 | multi_fn = simd_cast, d:out_t, c |
353b0b11 | 4495 | multi_fn = simd_shuffle!, e:int32x2_t, b, b, [2, 3] |
17df50a5 XL |
4496 | multi_fn = simd_cast, f:out_t, e |
4497 | multi_fn = simd_sub, d, f | |
4498 | ||
4499 | a = 12, 13, 14, 15 | |
4500 | b = 6, 6, 8, 8 | |
4501 | validate 6, 7 | |
4502 | ||
4503 | aarch64 = ssubl | |
4504 | generate int32x4_t:int32x4_t:int64x2_t | |
4505 | ||
4506 | /// Unsigned Subtract Long | |
4507 | name = vsubl_high | |
4508 | no-q | |
353b0b11 | 4509 | multi_fn = simd_shuffle!, c:uint8x8_t, a, a, [8, 9, 10, 11, 12, 13, 14, 15] |
17df50a5 | 4510 | multi_fn = simd_cast, d:out_t, c |
353b0b11 | 4511 | multi_fn = simd_shuffle!, e:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15] |
17df50a5 XL |
4512 | multi_fn = simd_cast, f:out_t, e |
4513 | multi_fn = simd_sub, d, f | |
4514 | ||
4515 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
4516 | b = 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2 | |
4517 | validate 6, 7, 8, 9, 10, 11, 12, 13 | |
4518 | ||
4519 | aarch64 = usubl | |
4520 | generate uint8x16_t:uint8x16_t:uint16x8_t | |
4521 | ||
4522 | /// Unsigned Subtract Long | |
4523 | name = vsubl_high | |
4524 | no-q | |
353b0b11 | 4525 | multi_fn = simd_shuffle!, c:uint16x4_t, a, a, [4, 5, 6, 7] |
17df50a5 | 4526 | multi_fn = simd_cast, d:out_t, c |
353b0b11 | 4527 | multi_fn = simd_shuffle!, e:uint16x4_t, b, b, [4, 5, 6, 7] |
17df50a5 XL |
4528 | multi_fn = simd_cast, f:out_t, e |
4529 | multi_fn = simd_sub, d, f | |
4530 | ||
4531 | a = 8, 9, 10, 11, 12, 13, 14, 15 | |
4532 | b = 6, 6, 6, 6, 8, 8, 8, 8 | |
4533 | validate 4, 5, 6, 7 | |
4534 | ||
4535 | aarch64 = usubl | |
4536 | generate uint16x8_t:uint16x8_t:uint32x4_t | |
4537 | ||
4538 | /// Unsigned Subtract Long | |
4539 | name = vsubl_high | |
4540 | no-q | |
353b0b11 | 4541 | multi_fn = simd_shuffle!, c:uint32x2_t, a, a, [2, 3] |
17df50a5 | 4542 | multi_fn = simd_cast, d:out_t, c |
353b0b11 | 4543 | multi_fn = simd_shuffle!, e:uint32x2_t, b, b, [2, 3] |
17df50a5 XL |
4544 | multi_fn = simd_cast, f:out_t, e |
4545 | multi_fn = simd_sub, d, f | |
4546 | ||
4547 | a = 12, 13, 14, 15 | |
4548 | b = 6, 6, 8, 8 | |
4549 | validate 6, 7 | |
4550 | ||
4551 | aarch64 = usubl | |
4552 | generate uint32x4_t:uint32x4_t:uint64x2_t | |
4553 | ||
3c0e092e XL |
4554 | /// Bit clear and exclusive OR |
4555 | name = vbcax | |
4556 | a = 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0 | |
4557 | b = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
4558 | c = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 | |
4559 | validate 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 | |
4560 | target = sha3 | |
4561 | ||
4562 | aarch64 = bcax | |
4563 | link-aarch64 = llvm.aarch64.crypto.bcaxs._EXT_ | |
4564 | generate int8x16_t, int16x8_t, int32x4_t, int64x2_t | |
4565 | link-aarch64 = llvm.aarch64.crypto.bcaxu._EXT_ | |
4566 | generate uint8x16_t, uint16x8_t, uint32x4_t, uint64x2_t | |
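//
// Illustration only: BCAX computes `a ^ (b & !c)`, i.e. clear the bits of b
// selected by c, then exclusive-OR into a. Requires the sha3 target feature,
// which may be nightly-only:
//
//     unsafe fn demo_vbcax() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: uint8x16_t = transmute([1u8; 16]);
//         let b: uint8x16_t = transmute([3u8; 16]);
//         let c: uint8x16_t = transmute([1u8; 16]);
//         let r = vbcaxq_u8(a, b, c); // 1 ^ (3 & !1) == 3
//         assert_eq!(transmute::<_, [u8; 16]>(r), [3u8; 16]);
//     }
//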
4567 | ||
4568 | /// Floating-point complex add | |
4569 | name = vcadd_rot270 | |
4570 | no-q | |
4571 | a = 1., -1., 1., -1. | |
4572 | b = -1., 1., -1., 1. | |
4573 | validate 2., 0., 2., 0. | |
4574 | target = fcma | |
4575 | ||
4576 | aarch64 = fcadd | |
4577 | link-aarch64 = vcadd.rot270._EXT_ | |
4578 | generate float32x2_t | |
4579 | name = vcaddq_rot270 | |
4580 | generate float32x4_t, float64x2_t | |
4581 | ||
4582 | /// Floating-point complex add | |
4583 | name = vcadd_rot90 | |
4584 | no-q | |
4585 | a = 1., -1., 1., -1. | |
4586 | b = -1., 1., -1., 1. | |
4587 | validate 0., -2., 0., -2. | |
4588 | target = fcma | |
4589 | ||
4590 | aarch64 = fcadd | |
4591 | link-aarch64 = vcadd.rot90._EXT_ | |
4592 | generate float32x2_t | |
4593 | name = vcaddq_rot90 | |
4594 | generate float32x4_t, float64x2_t | |
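//
// Illustration only: adjacent lane pairs are treated as complex numbers
// (re, im); rot90 multiplies b by i before the add, so the result is a + i*b.
// Requires the fcma target feature, which may be nightly-only:
//
//     unsafe fn demo_vcadd_rot90() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: float32x2_t = transmute([1.0f32, -1.0]); // 1 - i
//         let b: float32x2_t = transmute([-1.0f32, 1.0]); // -1 + i
//         let r = vcadd_rot90_f32(a, b); // (1 - i) + i*(-1 + i) = -2i
//         assert_eq!(transmute::<_, [f32; 2]>(r), [0.0, -2.0]);
//     }
//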
4595 | ||
4596 | /// Floating-point complex multiply accumulate | |
4597 | name = vcmla | |
4598 | a = 1., -1., 1., -1. | |
4599 | b = -1., 1., -1., 1. | |
4600 | c = 1., 1., -1., -1. | |
4601 | validate 0., -2., 2., 0. | |
4602 | target = fcma | |
4603 | ||
4604 | aarch64 = fcmla | |
4605 | link-aarch64 = vcmla.rot0._EXT_ | |
4606 | generate float32x2_t, float32x4_t, float64x2_t | |
4607 | ||
4608 | /// Floating-point complex multiply accumulate | |
4609 | name = vcmla_rot90 | |
4610 | rot-suffix | |
4611 | a = 1., 1., 1., 1. | |
4612 | b = 1., -1., 1., -1. | |
4613 | c = 1., 1., 1., 1. | |
4614 | validate 2., 0., 2., 0. | |
4615 | target = fcma | |
4616 | ||
4617 | aarch64 = fcmla | |
4618 | link-aarch64 = vcmla.rot90._EXT_ | |
4619 | generate float32x2_t, float32x4_t, float64x2_t | |
4620 | ||
4621 | /// Floating-point complex multiply accumulate | |
4622 | name = vcmla_rot180 | |
4623 | rot-suffix | |
4624 | a = 1., 1., 1., 1. | |
4625 | b = 1., -1., 1., -1. | |
4626 | c = 1., 1., 1., 1. | |
4627 | validate 0., 0., 0., 0. | |
4628 | target = fcma | |
4629 | ||
4630 | aarch64 = fcmla | |
4631 | link-aarch64 = vcmla.rot180._EXT_ | |
4632 | generate float32x2_t, float32x4_t, float64x2_t | |
4633 | ||
4634 | /// Floating-point complex multiply accumulate | |
4635 | name = vcmla_rot270 | |
4636 | rot-suffix | |
4637 | a = 1., 1., 1., 1. | |
4638 | b = 1., -1., 1., -1. | |
4639 | c = 1., 1., 1., 1. | |
4640 | validate 0., 2., 0., 2. | |
4641 | target = fcma | |
4642 | ||
4643 | aarch64 = fcmla | |
4644 | link-aarch64 = vcmla.rot270._EXT_ | |
4645 | generate float32x2_t, float32x4_t, float64x2_t | |
4646 | ||
4647 | /// Floating-point complex multiply accumulate | |
4648 | name = vcmla | |
4649 | in2-lane-suffixes | |
4650 | constn = LANE | |
4651 | multi_fn = static_assert_imm-in2_rot-LANE | |
353b0b11 | 4652 | multi_fn = simd_shuffle!, c:out_t, c, c, {base-2-LANE} |
3c0e092e XL |
4653 | multi_fn = vcmla-self-noext, a, b, c |
4654 | a = 1., -1., 1., -1. | |
4655 | b = -1., 1., -1., 1. | |
4656 | c = 1., 1., -1., -1. | |
4657 | n = 0 | |
4658 | validate 0., -2., 0., -2. | |
4659 | target = fcma | |
4660 | ||
4661 | aarch64 = fcmla | |
4662 | generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t | |
4663 | generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
4664 | ||
4665 | /// Floating-point complex multiply accumulate | |
4666 | name = vcmla_rot90 | |
4667 | rot-lane-suffixes | |
4668 | constn = LANE | |
4669 | multi_fn = static_assert_imm-in2_rot-LANE | |
353b0b11 | 4670 | multi_fn = simd_shuffle!, c:out_t, c, c, {base-2-LANE} |
3c0e092e XL |
4671 | multi_fn = vcmla_rot90-rot-noext, a, b, c |
4672 | a = 1., -1., 1., -1. | |
4673 | b = -1., 1., -1., 1. | |
4674 | c = 1., 1., -1., -1. | |
4675 | n = 0 | |
4676 | validate 0., 0., 0., 0. | |
4677 | target = fcma | |
4678 | ||
4679 | aarch64 = fcmla | |
4680 | generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t | |
4681 | generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
4682 | ||
4683 | /// Floating-point complex multiply accumulate | |
4684 | name = vcmla_rot180 | |
4685 | rot-lane-suffixes | |
4686 | constn = LANE | |
4687 | multi_fn = static_assert_imm-in2_rot-LANE | |
353b0b11 | 4688 | multi_fn = simd_shuffle!, c:out_t, c, c, {base-2-LANE} |
3c0e092e XL |
4689 | multi_fn = vcmla_rot180-rot-noext, a, b, c |
4690 | a = 1., -1., 1., -1. | |
4691 | b = -1., 1., -1., 1. | |
4692 | c = 1., 1., -1., -1. | |
4693 | n = 0 | |
4694 | validate 2., 0., 2., 0. | |
4695 | target = fcma | |
4696 | ||
4697 | aarch64 = fcmla | |
4698 | generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t | |
4699 | generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
4700 | ||
4701 | /// Floating-point complex multiply accumulate | |
4702 | name = vcmla_rot270 | |
4703 | rot-lane-suffixes | |
4704 | constn = LANE | |
4705 | multi_fn = static_assert_imm-in2_rot-LANE | |
353b0b11 | 4706 | multi_fn = simd_shuffle!, c:out_t, c, c, {base-2-LANE} |
3c0e092e XL |
4707 | multi_fn = vcmla_rot270-rot-noext, a, b, c |
4708 | a = 1., -1., 1., -1. | |
4709 | b = -1., 1., -1., 1. | |
4710 | c = 1., 1., -1., -1. | |
4711 | n = 0 | |
4712 | validate 2., -2., 2., -2. | |
4713 | target = fcma | |
4714 | ||
4715 | aarch64 = fcmla | |
4716 | generate float32x2_t, float32x2_t:float32x2_t:float32x4_t:float32x2_t | |
4717 | generate float32x4_t:float32x4_t:float32x2_t:float32x4_t, float32x4_t | |
4718 | ||
781aab86 | 4719 | /// Dot product arithmetic (vector) |
3c0e092e XL |
4720 | name = vdot |
4721 | out-suffix | |
4722 | a = 1, 2, 1, 2 | |
4723 | b = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
4724 | c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
4725 | validate 31, 176, 31, 176 | |
4726 | target = dotprod | |
4727 | ||
781aab86 | 4728 | arm = vsdot |
3c0e092e | 4729 | aarch64 = sdot |
781aab86 | 4730 | link-arm = sdot._EXT_._EXT3_ |
3c0e092e XL |
4731 | link-aarch64 = sdot._EXT_._EXT3_ |
4732 | generate int32x2_t:int8x8_t:int8x8_t:int32x2_t, int32x4_t:int8x16_t:int8x16_t:int32x4_t | |
4733 | ||
781aab86 | 4734 | arm = vudot |
3c0e092e | 4735 | aarch64 = udot |
781aab86 | 4736 | link-arm = udot._EXT_._EXT3_ |
3c0e092e XL |
4737 | link-aarch64 = udot._EXT_._EXT3_ |
4738 | generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t, uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t | |
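//
// Illustration only: each 32-bit accumulator lane absorbs the dot product of
// one 4-byte group. A sketch of the signed 64-bit vector form, assuming the
// dotprod target feature (possibly nightly-only):
//
//     unsafe fn demo_vdot() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: int32x2_t = transmute([1i32, 2]);
//         let b: int8x8_t = transmute([1i8, 2, 3, 4, 5, 6, 7, 8]);
//         let c: int8x8_t = transmute([1i8, 2, 3, 4, 5, 6, 7, 8]);
//         let r = vdot_s32(a, b, c); // r[i] = a[i] + sum(b[4i+j] * c[4i+j])
//         assert_eq!(transmute::<_, [i32; 2]>(r), [31, 176]);
//     }
//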
4739 | ||
781aab86 | 4740 | /// Dot product arithmetic (indexed) |
3c0e092e XL |
4741 | name = vdot |
4742 | out-lane-suffixes | |
4743 | constn = LANE | |
4744 | multi_fn = static_assert_imm-in2_dot-LANE | |
781aab86 FG |
4745 | multi_fn = transmute, c:merge4_t2, c |
4746 | multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32} | |
4747 | multi_fn = vdot-out-noext, a, b, {transmute, c} | |
3c0e092e | 4748 | a = 1, 2, 1, 2 |
781aab86 | 4749 | b = -1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 |
3c0e092e XL |
4750 | c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 |
4751 | n = 0 | |
781aab86 | 4752 | validate 29, 72, 31, 72 |
3c0e092e XL |
4753 | target = dotprod |
4754 | ||
781aab86 | 4755 | // Only AArch64 has the laneq forms. |
3c0e092e | 4756 | aarch64 = sdot |
781aab86 FG |
4757 | generate int32x2_t:int8x8_t:int8x16_t:int32x2_t |
4758 | generate int32x4_t:int8x16_t:int8x16_t:int32x4_t | |
4759 | ||
4760 | arm = vsdot | |
4761 | generate int32x2_t:int8x8_t:int8x8_t:int32x2_t | |
4762 | generate int32x4_t:int8x16_t:int8x8_t:int32x4_t | |
4763 | ||
4764 | /// Dot product arithmetic (indexed) | |
4765 | name = vdot | |
4766 | out-lane-suffixes | |
4767 | constn = LANE | |
4768 | multi_fn = static_assert_imm-in2_dot-LANE | |
4769 | multi_fn = transmute, c:merge4_t2, c | |
4770 | multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32} | |
4771 | multi_fn = vdot-out-noext, a, b, {transmute, c} | |
4772 | a = 1, 2, 1, 2 | |
4773 | b = 255, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
4774 | c = 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 | |
4775 | n = 0 | |
4776 | validate 285, 72, 31, 72 | |
4777 | target = dotprod | |
3c0e092e | 4778 | |
781aab86 | 4779 | // Only AArch64 has the laneq forms. |
3c0e092e | 4780 | aarch64 = udot |
781aab86 FG |
4781 | generate uint32x2_t:uint8x8_t:uint8x16_t:uint32x2_t |
4782 | generate uint32x4_t:uint8x16_t:uint8x16_t:uint32x4_t | |
4783 | ||
4784 | arm = vudot | |
4785 | generate uint32x2_t:uint8x8_t:uint8x8_t:uint32x2_t | |
4786 | generate uint32x4_t:uint8x16_t:uint8x8_t:uint32x4_t | |
3c0e092e | 4787 | |
17df50a5 XL |
4788 | /// Maximum (vector) |
4789 | name = vmax | |
4790 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
4791 | b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 | |
4792 | validate 16, 15, 14, 13, 12, 11, 10, 9, 9, 10, 11, 12, 13, 14, 15, 16 | |
4793 | ||
4794 | arm = vmax | |
4795 | aarch64 = smax | |
4796 | link-arm = vmaxs._EXT_ | |
4797 | link-aarch64 = smax._EXT_ | |
4798 | generate int*_t | |
4799 | ||
4800 | arm = vmax | |
4801 | aarch64 = umax | |
4802 | link-arm = vmaxu._EXT_ | |
4803 | link-aarch64 = umax._EXT_ | |
4804 | generate uint*_t | |
4805 | ||
4806 | /// Maximum (vector) | |
4807 | name = vmax | |
4808 | a = 1.0, -2.0, 3.0, -4.0 | |
4809 | b = 0.0, 3.0, 2.0, 8.0 | |
4810 | validate 1.0, 3.0, 3.0, 8.0 | |
4811 | ||
4812 | aarch64 = fmax | |
4813 | link-aarch64 = fmax._EXT_ | |
4814 | generate float64x*_t | |
4815 | ||
4816 | arm = vmax | |
4817 | aarch64 = fmax | |
4818 | link-arm = vmaxs._EXT_ | |
4819 | link-aarch64 = fmax._EXT_ | |
4820 | generate float*_t | |
4821 | ||
a2a8927a | 4822 | /// Floating-point Maximum Number (vector) |
17df50a5 XL |
4823 | name = vmaxnm |
4824 | a = 1.0, 2.0, 3.0, -4.0 | |
4825 | b = 8.0, 16.0, -1.0, 6.0 | |
4826 | validate 8.0, 16.0, 3.0, 6.0 | |
4827 | ||
4828 | aarch64 = fmaxnm | |
4829 | link-aarch64 = fmaxnm._EXT_ | |
4830 | generate float64x*_t | |
4831 | ||
4832 | target = fp-armv8 | |
4833 | arm = vmaxnm | |
4834 | aarch64 = fmaxnm | |
4835 | link-arm = vmaxnm._EXT_ | |
4836 | link-aarch64 = fmaxnm._EXT_ | |
4837 | generate float*_t | |
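//
// Illustration only: unlike fmax, fmaxnm implements IEEE 754-2008 maxNum, so
// a quiet NaN loses against a numeric operand:
//
//     unsafe fn demo_vmaxnm() {
//         use core::arch::aarch64::*;
//         use core::mem::transmute;
//         let a: float32x2_t = transmute([f32::NAN, 1.0f32]);
//         let b: float32x2_t = transmute([8.0f32, 16.0]);
//         let r = vmaxnm_f32(a, b); // NaN is dropped, not propagated
//         assert_eq!(transmute::<_, [f32; 2]>(r), [8.0, 16.0]);
//     }
//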
4838 | ||
3c0e092e XL |
4839 | /// Floating-point maximum number across vector |
4840 | name = vmaxnmv | |
4841 | a = 1., 2., 0., 1. | |
4842 | validate 2. | |
4843 | ||
4844 | aarch64 = fmaxnmp | |
4845 | link-aarch64 = fmaxnmv._EXT2_._EXT_ | |
4846 | generate float32x2_t:f32, float64x2_t:f64 | |
4847 | aarch64 = fmaxnmv | |
4848 | generate float32x4_t:f32 | |
4849 | ||
17df50a5 XL |
4850 | /// Floating-point Maximum Number Pairwise (vector). |
4851 | name = vpmaxnm | |
4852 | a = 1.0, 2.0 | |
4853 | b = 6.0, -3.0 | |
4854 | validate 2.0, 6.0 | |
4855 | aarch64 = fmaxnmp | |
4856 | link-aarch64 = fmaxnmp._EXT_ | |
4857 | generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t | |
4858 | ||
4859 | /// Floating-point Maximum Number Pairwise (vector). | |
4860 | name = vpmaxnm | |
4861 | a = 1.0, 2.0, 3.0, -4.0 | |
4862 | b = 8.0, 16.0, -1.0, 6.0 | |
4863 | validate 2.0, 3.0, 16.0, 6.0 | |
4864 | aarch64 = fmaxnmp | |
4865 | link-aarch64 = fmaxnmp._EXT_ | |
4866 | generate float32x4_t:float32x4_t:float32x4_t | |
4867 | ||
3c0e092e XL |
4868 | /// Floating-point maximum number pairwise |
4869 | name = vpmaxnm | |
4870 | out-suffix | |
4871 | a = 1., 2. | |
4872 | validate 2. | |
4873 | ||
4874 | aarch64 = fmaxnmp | |
4875 | link-aarch64 = fmaxnmv._EXT2_._EXT_ | |
4876 | generate float32x2_t:f32 | |
4877 | name = vpmaxnmq | |
4878 | generate float64x2_t:f64 | |
4879 | ||
4880 | /// Floating-point maximum pairwise | |
4881 | name = vpmax | |
4882 | out-suffix | |
4883 | a = 1., 2. | |
4884 | validate 2. | |
4885 | ||
4886 | aarch64 = fmaxp | |
4887 | link-aarch64 = fmaxv._EXT2_._EXT_ | |
4888 | generate float32x2_t:f32 | |
4889 | name = vpmaxq | |
4890 | generate float64x2_t:f64 | |
4891 | ||
17df50a5 XL |
4892 | /// Minimum (vector) |
4893 | name = vmin | |
4894 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
4895 | b = 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1 | |
4896 | validate 1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1 | |
4897 | ||
4898 | arm = vmin | |
4899 | aarch64 = smin | |
4900 | link-arm = vmins._EXT_ | |
4901 | link-aarch64 = smin._EXT_ | |
4902 | generate int*_t | |
4903 | ||
4904 | arm = vmin | |
4905 | aarch64 = umin | |
4906 | link-arm = vminu._EXT_ | |
4907 | link-aarch64 = umin._EXT_ | |
4908 | generate uint*_t | |
4909 | ||
4910 | /// Minimum (vector) | |
4911 | name = vmin | |
4912 | a = 1.0, -2.0, 3.0, -4.0 | |
4913 | b = 0.0, 3.0, 2.0, 8.0 | |
4914 | validate 0.0, -2.0, 2.0, -4.0 | |
4915 | ||
4916 | aarch64 = fmin | |
4917 | link-aarch64 = fmin._EXT_ | |
4918 | generate float64x*_t | |
4919 | ||
4920 | arm = vmin | |
4921 | aarch64 = fmin | |
fc512014 XL |
4922 | link-arm = vmins._EXT_ |
4923 | link-aarch64 = fmin._EXT_ | |
4924 | generate float*_t | |
17df50a5 | 4925 | |
a2a8927a | 4926 | /// Floating-point Minimum Number (vector) |
17df50a5 XL |
4927 | name = vminnm |
4928 | a = 1.0, 2.0, 3.0, -4.0 | |
4929 | b = 8.0, 16.0, -1.0, 6.0 | |
4930 | validate 1.0, 2.0, -1.0, -4.0 | |
4931 | ||
4932 | aarch64 = fminnm | |
4933 | link-aarch64 = fminnm._EXT_ | |
4934 | generate float64x*_t | |
4935 | ||
4936 | target = fp-armv8 | |
4937 | arm = vminnm | |
4938 | aarch64 = fminnm | |
4939 | link-arm = vminnm._EXT_ | |
4940 | link-aarch64 = fminnm._EXT_ | |
4941 | generate float*_t | |
4942 | ||
3c0e092e XL |
4943 | /// Floating-point minimum number across vector |
4944 | name = vminnmv | |
4945 | a = 1., 0., 2., 3. | |
4946 | validate 0. | |
4947 | ||
4948 | aarch64 = fminnmp | |
4949 | link-aarch64 = fminnmv._EXT2_._EXT_ | |
4950 | generate float32x2_t:f32, float64x2_t:f64 | |
4951 | aarch64 = fminnmv | |
4952 | generate float32x4_t:f32 | |
4953 | ||
4954 | /// Vector move | |
4955 | name = vmovl_high | |
4956 | no-q | |
353b0b11 | 4957 | multi_fn = simd_shuffle!, a:half, a, a, {asc-halflen-halflen} |
3c0e092e XL |
4958 | multi_fn = vmovl-noqself-noext, a |
4959 | a = 1, 2, 3, 4, 3, 4, 5, 6, 3, 4, 5, 6, 7, 8, 9, 10 | |
4960 | validate 3, 4, 5, 6, 7, 8, 9, 10 | |
4961 | ||
4962 | aarch64 = sxtl2 | |
4963 | generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t | |
4964 | ||
4965 | aarch64 = uxtl2 | |
4966 | generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t | |
4967 | ||
4968 | /// Floating-point add pairwise | |
4969 | name = vpadd | |
4970 | a = 1., 2., 3., 4. | |
4971 | b = 3., 4., 5., 6. | |
4972 | validate 3., 7., 7., 11. | |
4973 | ||
4974 | aarch64 = faddp | |
4975 | link-aarch64 = faddp._EXT_ | |
4976 | generate float32x4_t, float64x2_t | |
4977 | ||
4978 | arm = vpadd | |
4979 | link-arm = vpadd._EXT_ | |
4980 | generate float32x2_t | |
4981 | ||
4982 | /// Floating-point add pairwise | |
4983 | name = vpadd | |
4984 | out-suffix | |
c620b35d FG |
4985 | multi_fn = simd_extract!, a1:out_t, a, 0 |
4986 | multi_fn = simd_extract!, a2:out_t, a, 1 | |
3c0e092e XL |
4987 | multi_fn = a1 + a2 |
4988 | a = 1., 2. | |
4989 | validate 3. | |
4990 | ||
4991 | aarch64 = nop | |
4992 | generate float32x2_t:f32, float64x2_t:f64 | |
4993 | ||
17df50a5 XL |
4994 | /// Floating-point Minimum Number Pairwise (vector). |
4995 | name = vpminnm | |
4996 | a = 1.0, 2.0 | |
4997 | b = 6.0, -3.0 | |
4998 | validate 1.0, -3.0 | |
3c0e092e | 4999 | |
17df50a5 XL |
5000 | aarch64 = fminnmp |
5001 | link-aarch64 = fminnmp._EXT_ | |
5002 | generate float32x2_t:float32x2_t:float32x2_t, float64x2_t:float64x2_t:float64x2_t | |
5003 | ||
5004 | /// Floating-point Minimum Number Pairwise (vector). | |
5005 | name = vpminnm | |
5006 | a = 1.0, 2.0, 3.0, -4.0 | |
5007 | b = 8.0, 16.0, -1.0, 6.0 | |
5008 | validate 1.0, -4.0, 8.0, -1.0 | |
5009 | aarch64 = fminnmp | |
5010 | link-aarch64 = fminnmp._EXT_ | |
5011 | generate float32x4_t:float32x4_t:float32x4_t | |
5012 | ||
3c0e092e XL |
5013 | /// Floating-point minimum number pairwise |
5014 | name = vpminnm | |
5015 | out-suffix | |
5016 | a = 1., 2. | |
5017 | validate 1. | |
5018 | ||
5019 | aarch64 = fminnmp | |
5020 | link-aarch64 = fminnmv._EXT2_._EXT_ | |
5021 | generate float32x2_t:f32 | |
5022 | name = vpminnmq | |
5023 | generate float64x2_t:f64 | |
5024 | ||
5025 | /// Floating-point minimum pairwise | |
5026 | name = vpmin | |
5027 | out-suffix | |
5028 | a = 1., 2. | |
5029 | validate 1. | |
5030 | ||
5031 | aarch64 = fminp | |
5032 | link-aarch64 = fminv._EXT2_._EXT_ | |
5033 | generate float32x2_t:f32 | |
5034 | name = vpminq | |
5035 | generate float64x2_t:f64 | |
5036 | ||
17df50a5 XL |
5037 | /// Signed saturating doubling multiply long |
5038 | name = vqdmull | |
5039 | a = 0, 1, 2, 3, 4, 5, 6, 7 | |
5040 | b = 1, 2, 3, 4, 5, 6, 7, 8 | |
5041 | validate 0, 4, 12, 24, 40, 60, 84, 112 |
5042 | ||
5043 | aarch64 = sqdmull | |
5044 | link-aarch64 = sqdmull._EXT2_ | |
5045 | arm = vqdmull | |
5046 | link-arm = vqdmull._EXT2_ | |
5047 | generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t | |
5048 | ||
5049 | /// Signed saturating doubling multiply long | |
5050 | name = vqdmull | |
5051 | multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a | |
5052 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b | |
c620b35d | 5053 | multi_fn = simd_extract!, {vqdmull-in_ntt-noext, a, b}, 0 |
17df50a5 XL |
5054 | a = 2 |
5055 | b = 3 | |
5056 | validate 12 | |
5057 | ||
5058 | aarch64 = sqdmull | |
5059 | generate i16:i16:i32 | |
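//
// Illustration only: the doubled product saturates instead of wrapping, which
// matters exactly at the MIN * MIN corner case:
//
//     unsafe fn demo_vqdmull() {
//         use core::arch::aarch64::*;
//         // 2 * (-32768) * (-32768) exceeds i32::MAX, so it saturates
//         assert_eq!(vqdmullh_s16(i16::MIN, i16::MIN), i32::MAX);
//     }
//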
5060 | ||
5061 | /// Signed saturating doubling multiply long | |
5062 | name = vqdmull | |
5063 | a = 2 | |
5064 | b = 3 | |
5065 | validate 12 | |
5066 | ||
5067 | aarch64 = sqdmull | |
5068 | link-aarch64 = sqdmulls.scalar | |
5069 | generate i32:i32:i64 | |
5070 | ||
5071 | /// Vector saturating doubling long multiply with scalar | |
5072 | name = vqdmull_n | |
5073 | no-q | |
5074 | multi_fn = vqdmull-in_ntt-noext, a, {vdup_n-in_ntt-noext, b} | |
5075 | a = 2, 4, 6, 8 | |
5076 | b = 2 | |
5077 | validate 8, 16, 24, 32 | |
5078 | ||
5079 | aarch64 = sqdmull | |
5080 | arm = vqdmull | |
5081 | generate int16x4_t:i16:int32x4_t, int32x2_t:i32:int64x2_t | |
5082 | ||
5083 | /// Signed saturating doubling multiply long | |
5084 | name = vqdmull_high | |
5085 | no-q | |
353b0b11 FG |
5086 | multi_fn = simd_shuffle!, a:half, a, a, {asc-halflen-halflen} |
5087 | multi_fn = simd_shuffle!, b:half, b, b, {asc-halflen-halflen} | |
17df50a5 XL |
5088 | multi_fn = vqdmull-noqself-noext, a, b |
5089 | a = 0, 1, 4, 5, 4, 5, 6, 7 | |
5090 | b = 1, 2, 5, 6, 5, 6, 7, 8 | |
5091 | validate 40, 60, 84, 112 | |
5092 | ||
5093 | aarch64 = sqdmull2 | |
5094 | generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t | |
5095 | ||
5096 | /// Signed saturating doubling multiply long | |
5097 | name = vqdmull_high_n | |
5098 | no-q | |
353b0b11 | 5099 | multi_fn = simd_shuffle!, a:in_ntt, a, a, {asc-out_len-out_len} |
17df50a5 XL |
5100 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b |
5101 | multi_fn = vqdmull-in_ntt-noext, a, b | |
5102 | a = 0, 2, 8, 10, 8, 10, 12, 14 | |
5103 | b = 2 | |
5104 | validate 32, 40, 48, 56 | |
5105 | ||
5106 | aarch64 = sqdmull2 | |
5107 | generate int16x8_t:i16:int32x4_t, int32x4_t:i32:int64x2_t | |
5108 | ||
5109 | /// Vector saturating doubling long multiply by scalar | |
5110 | name = vqdmull_lane | |
5111 | constn = N | |
5112 | multi_fn = static_assert_imm-in_exp_len-N | |
353b0b11 | 5113 | multi_fn = simd_shuffle!, b:in_t0, b, b, {dup-out_len-N as u32} |
17df50a5 XL |
5114 | multi_fn = vqdmull-noqself-noext, a, b |
5115 | a = 1, 2, 3, 4 | |
5116 | b = 0, 2, 2, 0, 2, 0, 0, 0 | |
5117 | n = HFLEN | |
5118 | validate 4, 8, 12, 16 | |
5119 | ||
5120 | aarch64 = sqdmull | |
5121 | generate int16x4_t:int16x8_t:int32x4_t, int32x2_t:int32x4_t:int64x2_t | |
5122 | ||
5123 | arm = vqdmull | |
5124 | generate int16x4_t:int16x4_t:int32x4_t, int32x2_t:int32x2_t:int64x2_t | |
5125 | ||
5126 | /// Signed saturating doubling multiply long | |
5127 | name = vqdmullh_lane | |
5128 | constn = N | |
5129 | multi_fn = static_assert_imm-in_exp_len-N | |
c620b35d | 5130 | multi_fn = simd_extract!, b:in_t0, b, N as u32 |
17df50a5 XL |
5131 | multi_fn = vqdmullh-noqself-noext, a, b |
5132 | a = 2 | |
5133 | b = 0, 2, 2, 0, 2, 0, 0, 0 | |
5134 | n = HFLEN | |
5135 | validate 8 | |
5136 | ||
5137 | aarch64 = sqdmull | |
5138 | generate i16:int16x4_t:i32, i16:int16x8_t:i32 | |
5139 | ||
5140 | /// Signed saturating doubling multiply long | |
5141 | name = vqdmulls_lane | |
5142 | constn = N | |
5143 | multi_fn = static_assert_imm-in_exp_len-N | |
c620b35d | 5144 | multi_fn = simd_extract!, b:in_t0, b, N as u32 |
17df50a5 XL |
5145 | multi_fn = vqdmulls-noqself-noext, a, b |
5146 | a = 2 | |
5147 | b = 0, 2, 2, 0, 2, 0, 0, 0 | |
5148 | n = HFLEN | |
5149 | validate 8 | |
5150 | ||
5151 | aarch64 = sqdmull | |
5152 | generate i32:int32x2_t:i64, i32:int32x4_t:i64 | |
5153 | ||
5154 | /// Signed saturating doubling multiply long | |
5155 | name = vqdmull_high_lane | |
5156 | constn = N | |
5157 | multi_fn = static_assert_imm-in_exp_len-N | |
353b0b11 FG |
5158 | multi_fn = simd_shuffle!, a:in_t, a, a, {asc-out_len-out_len} |
5159 | multi_fn = simd_shuffle!, b:in_t, b, b, {dup-out_len-N as u32} | |
17df50a5 XL |
5160 | multi_fn = vqdmull-self-noext, a, b |
5161 | a = 0, 1, 4, 5, 4, 5, 6, 7 | |
5162 | b = 0, 2, 2, 0, 2, 0, 0, 0 | |
5163 | n = HFLEN | |
5164 | validate 16, 20, 24, 28 | |
5165 | ||
5166 | aarch64 = sqdmull2 | |
5167 | generate int16x8_t:int16x4_t:int32x4_t, int32x4_t:int32x2_t:int64x2_t | |
5168 | ||
5169 | /// Signed saturating doubling multiply long | |
5170 | name = vqdmull_high_lane | |
5171 | constn = N | |
5172 | multi_fn = static_assert_imm-in_exp_len-N | |
353b0b11 FG |
5173 | multi_fn = simd_shuffle!, a:half, a, a, {asc-out_len-out_len} |
5174 | multi_fn = simd_shuffle!, b:half, b, b, {dup-out_len-N as u32} | |
17df50a5 XL |
5175 | multi_fn = vqdmull-noqself-noext, a, b |
5176 | a = 0, 1, 4, 5, 4, 5, 6, 7 | |
5177 | b = 0, 2, 2, 0, 2, 0, 0, 0 | |
5178 | n = HFLEN | |
5179 | validate 16, 20, 24, 28 | |
5180 | ||
5181 | aarch64 = sqdmull2 | |
5182 | generate int16x8_t:int16x8_t:int32x4_t, int32x4_t:int32x4_t:int64x2_t | |
5183 | ||
5184 | /// Signed saturating doubling multiply-add long | |
5185 | name = vqdmlal | |
5186 | multi_fn = vqadd-out-noext, a, {vqdmull-self-noext, b, c} | |
5187 | a = 1, 1, 1, 1 | |
5188 | b = 1, 2, 3, 4 | |
5189 | c = 2, 2, 2, 2 | |
5190 | validate 5, 9, 13, 17 | |
5191 | ||
5192 | aarch64 = sqdmlal | |
5193 | arm = vqdmlal | |
5194 | generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t | |
5195 | ||
5196 | /// Vector widening saturating doubling multiply accumulate with scalar | |
5197 | name = vqdmlal | |
5198 | n-suffix | |
5199 | multi_fn = vqadd-out-noext, a, {vqdmull_n-self-noext, b, c} | |
5200 | a = 1, 1, 1, 1 | |
5201 | b = 1, 2, 3, 4 | |
5202 | c = 2 | |
5203 | validate 5, 9, 13, 17 | |
5204 | ||
5205 | aarch64 = sqdmlal | |
5206 | arm = vqdmlal | |
5207 | generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t | |
5208 | ||
5209 | /// Signed saturating doubling multiply-add long | |
5210 | name = vqdmlal_high | |
5211 | no-q | |
5212 | multi_fn = vqadd-out-noext, a, {vqdmull_high-noqself-noext, b, c} | |
5213 | a = 1, 2, 3, 4 | |
5214 | b = 0, 1, 4, 5, 4, 5, 6, 7 | |
5215 | c = 1, 2, 5, 6, 5, 6, 7, 8 | |
5216 | validate 41, 62, 87, 116 | |
5217 | ||
5218 | aarch64 = sqdmlal2 | |
5219 | generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t | |
5220 | ||
5221 | /// Signed saturating doubling multiply-add long | |
5222 | name = vqdmlal_high_n | |
5223 | no-q | |
5224 | multi_fn = vqadd-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} | |
5225 | a = 1, 2, 3, 4 | |
5226 | b = 0, 2, 8, 10, 8, 10, 12, 14 | |
5227 | c = 2 | |
5228 | validate 33, 42, 51, 60 | |
5229 | ||
5230 | aarch64 = sqdmlal2 | |
5231 | generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t | |
5232 | ||
5233 | /// Vector widening saturating doubling multiply accumulate with scalar | |
5234 | name = vqdmlal_lane | |
5235 | in2-suffix | |
5236 | constn = N | |
5237 | multi_fn = static_assert_imm-in2_exp_len-N | |
5238 | multi_fn = vqadd-out-noext, a, {vqdmull_lane-in2-::<N>, b, c} | |
5239 | a = 1, 2, 3, 4 | |
5240 | b = 1, 2, 3, 4 | |
5241 | c = 0, 2, 2, 0, 2, 0, 0, 0 | |
5242 | n = HFLEN | |
5243 | validate 5, 10, 15, 20 | |
5244 | ||
5245 | aarch64 = sqdmlal | |
5246 | generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t | |
5247 | ||
5248 | arm = vqdmlal | |
5249 | generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t | |
5250 | ||
5251 | /// Signed saturating doubling multiply-add long | |
5252 | name = vqdmlal_high_lane | |
5253 | in2-suffix | |
5254 | constn = N | |
5255 | multi_fn = static_assert_imm-in2_exp_len-N | |
5256 | multi_fn = vqadd-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c} | |
5257 | a = 1, 2, 3, 4 | |
5258 | b = 0, 1, 4, 5, 4, 5, 6, 7 | |
5259 | c = 0, 2, 0, 0, 0, 0, 0, 0 | |
5260 | n = 1 | |
5261 | validate 17, 22, 27, 32 | |
5262 | ||
5263 | aarch64 = sqdmlal2 | |
5264 | generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t |
5265 | ||
3c0e092e XL |
5266 | /// Signed saturating doubling multiply-add long |
5267 | name = vqdmlal | |
5268 | multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c} | |
c620b35d | 5269 | multi_fn = vqadd-out-noext, a, {simd_extract!, x, 0} |
3c0e092e XL |
5270 | a = 1 |
5271 | b = 1 | |
5272 | c = 2 | |
5273 | validate 5 | |
5274 | ||
49aad941 FG |
5275 | aarch64 = sqdmlal |
5276 | generate i32:i16:i16:i32 | |
5277 | ||
5278 | /// Signed saturating doubling multiply-add long | |
5279 | name = vqdmlal | |
5280 | multi_fn = vqadd-out-noext, x:out_t, a, {vqdmulls-in_ntt-noext, b, c} | |
5281 | multi_fn = x as out_t | |
5282 | a = 1 | |
5283 | b = 1 | |
5284 | c = 2 | |
5285 | validate 5 | |
5286 | ||
5287 | aarch64 = sqdmlal | |
5288 | generate i64:i32:i32:i64 | |
3c0e092e XL |
5289 | |
5290 | /// Signed saturating doubling multiply-add long | |
5291 | name = vqdmlalh_lane | |
5292 | in2-suffix | |
5293 | constn = LANE | |
5294 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
c620b35d | 5295 | multi_fn = vqdmlal-self-noext, a, b, {simd_extract!, c, LANE as u32} |
3c0e092e XL |
5296 | a = 1 |
5297 | b = 1 | |
5298 | c = 2, 1, 1, 1, 1, 1, 1, 1 | |
5299 | n = 0 | |
5300 | validate 5 | |
5301 | ||
5302 | aarch64 = sqdmlal | |
5303 | generate i32:i16:int16x4_t:i32, i32:i16:int16x8_t:i32 | |
5304 | name = vqdmlals_lane | |
49aad941 | 5305 | aarch64 = sqdmlal |
3c0e092e XL |
5306 | generate i64:i32:int32x2_t:i64, i64:i32:int32x4_t:i64 |
5307 | ||
17df50a5 XL |
5308 | /// Signed saturating doubling multiply-subtract long |
5309 | name = vqdmlsl | |
5310 | multi_fn = vqsub-out-noext, a, {vqdmull-self-noext, b, c} | |
5311 | a = 3, 7, 11, 15 | |
5312 | b = 1, 2, 3, 4 | |
5313 | c = 2, 2, 2, 2 | |
5314 | validate -1, -1, -1, -1 | |
5315 | ||
5316 | aarch64 = sqdmlsl | |
5317 | arm = vqdmlsl | |
5318 | generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t | |
5319 | ||
5320 | /// Vector widening saturating doubling multiply subtract with scalar | |
5321 | name = vqdmlsl | |
5322 | n-suffix | |
5323 | multi_fn = vqsub-out-noext, a, {vqdmull_n-self-noext, b, c} | |
5324 | a = 3, 7, 11, 15 | |
5325 | b = 1, 2, 3, 4 | |
5326 | c = 2 | |
5327 | validate -1, -1, -1, -1 | |
5328 | ||
5329 | aarch64 = sqdmlsl | |
5330 | arm = vqdmlsl | |
5331 | generate int32x4_t:int16x4_t:i16:int32x4_t, int64x2_t:int32x2_t:i32:int64x2_t | |
5332 | ||
5333 | /// Signed saturating doubling multiply-subtract long | |
5334 | name = vqdmlsl_high | |
5335 | no-q | |
5336 | multi_fn = vqsub-out-noext, a, {vqdmull_high-noqself-noext, b, c} | |
5337 | a = 39, 58, 81, 108 | |
5338 | b = 0, 1, 4, 5, 4, 5, 6, 7 | |
5339 | c = 1, 2, 5, 6, 5, 6, 7, 8 | |
5340 | validate -1, -2, -3, -4 | |
5341 | ||
5342 | aarch64 = sqdmlsl2 | |
5343 | generate int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t | |
5344 | ||
5345 | /// Signed saturating doubling multiply-subtract long | |
5346 | name = vqdmlsl_high_n | |
5347 | no-q | |
5348 | multi_fn = vqsub-out-noext, a, {vqdmull_high_n-noqself-noext, b, c} | |
5349 | a = 31, 38, 45, 52 | |
5350 | b = 0, 2, 8, 10, 8, 10, 12, 14 | |
5351 | c = 2 | |
5352 | validate -1, -2, -3, -4 | |
5353 | ||
5354 | aarch64 = sqdmlsl2 | |
5355 | generate int32x4_t:int16x8_t:i16:int32x4_t, int64x2_t:int32x4_t:i32:int64x2_t | |
5356 | ||
5357 | /// Vector widening saturating doubling multiply subtract with scalar | |
5358 | name = vqdmlsl_lane | |
5359 | in2-suffix | |
5360 | constn = N | |
5361 | multi_fn = static_assert_imm-in2_exp_len-N | |
5362 | multi_fn = vqsub-out-noext, a, {vqdmull_lane-in2-::<N>, b, c} | |
5363 | a = 3, 6, 9, 12 | |
5364 | b = 1, 2, 3, 4 | |
5365 | c = 0, 2, 2, 0, 2, 0, 0, 0 | |
5366 | n = HFLEN | |
5367 | validate -1, -2, -3, -4 | |
5368 | ||
5369 | aarch64 = sqdmlsl | |
5370 | generate int32x4_t:int16x4_t:int16x8_t:int32x4_t, int64x2_t:int32x2_t:int32x4_t:int64x2_t | |
5371 | ||
5372 | arm = vqdmlsl | |
5373 | generate int32x4_t:int16x4_t:int16x4_t:int32x4_t, int64x2_t:int32x2_t:int32x2_t:int64x2_t | |
5374 | ||
5375 | /// Signed saturating doubling multiply-subtract long | |
5376 | name = vqdmlsl_high_lane | |
5377 | in2-suffix | |
5378 | constn = N | |
5379 | multi_fn = static_assert_imm-in2_exp_len-N | |
5380 | multi_fn = vqsub-out-noext, a, {vqdmull_high_lane-in2-::<N>, b, c} | |
5381 | a = 15, 18, 21, 24 | |
5382 | b = 0, 1, 4, 5, 4, 5, 6, 7 | |
5383 | c = 0, 2, 0, 0, 0, 0, 0, 0 | |
5384 | n = 1 | |
5385 | validate -1, -2, -3, -4 | |
5386 | ||
5387 | aarch64 = sqdmlsl2 | |
5388 | generate int32x4_t:int16x8_t:int16x4_t:int32x4_t, int32x4_t:int16x8_t:int16x8_t:int32x4_t, int64x2_t:int32x4_t:int32x2_t:int64x2_t, int64x2_t:int32x4_t:int32x4_t:int64x2_t | |
5389 | ||
5390 | /// Signed saturating doubling multiply-subtract long |
5391 | name = vqdmlsl | |
5392 | multi_fn = vqdmull-in_ntt-noext, x:out_long_ntt, {vdup_n-in_ntt-noext, b}, {vdup_n-in_ntt-noext, c} | |
c620b35d | 5393 | multi_fn = vqsub-out-noext, a, {simd_extract!, x, 0} |
5394 | a = 10 |
5395 | b = 1 | |
5396 | c = 2 | |
5397 | validate 6 | |
5398 | ||
5399 | aarch64 = sqdmlsl |
5400 | generate i32:i16:i16:i32 | |
5401 | ||
5402 | /// Signed saturating doubling multiply-subtract long | |
5403 | name = vqdmlsl | |
5404 | multi_fn = vqsub-out-noext, x:out_t, a, {vqdmulls-in_ntt-noext, b, c} | |
5405 | multi_fn = x as out_t | |
5406 | a = 10 | |
5407 | b = 1 | |
5408 | c = 2 | |
5409 | validate 6 | |
5410 | ||
5411 | aarch64 = sqdmlsl | |
5412 | generate i64:i32:i32:i64 | |
5413 | |
5414 | /// Signed saturating doubling multiply-subtract long | |
5415 | name = vqdmlslh_lane | |
5416 | in2-suffix | |
5417 | constn = LANE | |
5418 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
c620b35d | 5419 | multi_fn = vqdmlsl-self-noext, a, b, {simd_extract!, c, LANE as u32} |
5420 | a = 10 |
5421 | b = 1 | |
5422 | c = 2, 1, 1, 1, 1, 1, 1, 1 | |
5423 | n = 0 | |
5424 | validate 6 | |
5425 | ||
5426 | aarch64 = sqdmlsl | |
5427 | generate i32:i16:int16x4_t:i32, i32:i16:int16x8_t:i32 | |
5428 | name = vqdmlsls_lane | |
49aad941 | 5429 | aarch64 = sqdmlsl |
5430 | generate i64:i32:int32x2_t:i64, i64:i32:int32x4_t:i64 |
5431 | ||
5432 | /// Signed saturating doubling multiply returning high half |
5433 | name = vqdmulh | |
5434 | a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5435 | b = 2, 2, 2, 2, 2, 2, 2, 2 | |
5436 | validate 1, 1, 1, 1, 1, 1, 1, 1 | |
5437 | ||
5438 | aarch64 = sqdmulh | |
5439 | link-aarch64 = sqdmulh._EXT_ | |
5440 | arm = vqdmulh | |
5441 | link-arm = vqdmulh._EXT_ | |
5442 | generate int16x4_t, int16x8_t, int32x2_t, int32x4_t | |
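// The truncating high-half semantics, as a hedged scalar sketch (helper
// name hypothetical; the real instruction also saturates the
// a = b = i16::MIN case, which this sketch ignores):
//
//     fn qdmulh16(a: i16, b: i16) -> i16 {
//         ((2 * a as i32 * b as i32) >> 16) as i16
//     }
//     assert_eq!(qdmulh16(i16::MAX, 2), 1); // (2*32767*2) >> 16 == 1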
5443 | ||
5444 | /// Signed saturating doubling multiply returning high half | |
5445 | name = vqdmulh | |
5446 | multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a | |
5447 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b | |
c620b35d | 5448 | multi_fn = simd_extract!, {vqdmulh-in_ntt-noext, a, b}, 0 |
5449 | a = 1 |
5450 | b = 2 | |
5451 | validate 0 | |
5452 | ||
5453 | aarch64 = sqdmulh | |
5454 | generate i16, i32 | |
5455 | ||
5456 | /// Vector saturating doubling multiply high with scalar | |
5457 | name = vqdmulh_n | |
5458 | out-suffix | |
5459 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b | |
5460 | multi_fn = vqdmulh-out-noext, a, b | |
5461 | a = MAX, MAX, MAX, MAX | |
5462 | b = 2 | |
5463 | validate 1, 1, 1, 1 | |
5464 | ||
5465 | aarch64 = sqdmulh | |
5466 | arm = vqdmulh | |
5467 | generate int16x4_t:i16:int16x4_t, int32x2_t:i32:int32x2_t | |
5468 | ||
5469 | /// Vector saturating doubling multiply high with scalar | |
5470 | name = vqdmulhq_n | |
c295e0f8 | 5471 | no-q |
5472 | multi_fn = vdupq_n-in_ntt-noext, b:out_t, b |
5473 | multi_fn = vqdmulh-out-noext, a, b | |
5474 | a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5475 | b = 2 | |
5476 | validate 1, 1, 1, 1, 1, 1, 1, 1 | |
5477 | ||
5478 | aarch64 = sqdmulh | |
5479 | arm = vqdmulh | |
5480 | generate int16x8_t:i16:int16x8_t, int32x4_t:i32:int32x4_t | |
5481 | ||
5482 | /// Signed saturating doubling multiply returning high half | |
5483 | name = vqdmulhh_lane | |
5484 | constn = N | |
5485 | multi_fn = static_assert_imm-in_exp_len-N | |
c620b35d | 5486 | multi_fn = simd_extract!, b:in_t0, b, N as u32 |
5487 | multi_fn = vqdmulhh-out_ntt-noext, a, b |
5488 | a = 2 | |
5489 | b = 0, 0, MAX, 0, 0, 0, 0, 0 | |
5490 | n = 2 | |
5491 | validate 1 | |
5492 | ||
5493 | aarch64 = sqdmulh | |
5494 | generate i16:int16x4_t:i16, i16:int16x8_t:i16 | |
5495 | ||
5496 | /// Signed saturating doubling multiply returning high half | |
5497 | name = vqdmulhs_lane | |
5498 | constn = N | |
5499 | multi_fn = static_assert_imm-in_exp_len-N | |
c620b35d | 5500 | multi_fn = simd_extract!, b:in_t0, b, N as u32 |
5501 | multi_fn = vqdmulhs-out_ntt-noext, a, b |
5502 | a = 2 | |
5503 | b = 0, MAX, 0, 0 | |
5504 | n = 1 | |
5505 | validate 1 | |
5506 | ||
5507 | aarch64 = sqdmulh | |
5508 | generate i32:int32x2_t:i32, i32:int32x4_t:i32 | |
5509 | ||
5510 | /// Vector saturating doubling multiply high by scalar |
5511 | name = vqdmulh | |
5512 | lane-suffixes | |
5513 | constn = LANE | |
5514 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
c620b35d | 5515 | multi_fn = vqdmulh-out-noext, a, {vdup-nout-noext, {simd_extract!, b, LANE as u32}} |
5516 | a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
5517 | b = 2, 1, 1, 1, 1, 1, 1, 1 | |
5518 | n = 0 | |
5519 | validate 1, 1, 1, 1, 1, 1, 1, 1 | |
5520 | ||
5521 | aarch64 = sqdmulh | |
5522 | generate int16x4_t, int16x8_t:int16x4_t:int16x8_t | |
5523 | generate int32x2_t, int32x4_t:int32x2_t:int32x4_t | |
5524 | arm = vqdmulh | |
5525 | generate int16x8_t, int16x4_t:int16x8_t:int16x4_t | |
5526 | generate int32x4_t, int32x2_t:int32x4_t:int32x2_t | |
5527 | ||
5528 | /// Signed saturating extract narrow |
5529 | name = vqmovn | |
5530 | no-q | |
5531 | a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5532 | validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5533 | ||
5534 | aarch64 = sqxtn | |
5535 | link-aarch64 = sqxtn._EXT2_ | |
5536 | arm = vqmovn | |
5537 | link-arm = vqmovns._EXT2_ | |
5538 | generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t | |
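// Saturating narrow clamps each wide lane to the narrow type's range
// before truncating; a minimal sketch of one i16 -> i8 lane (helper name
// hypothetical):
//
//     fn qxtn8(x: i16) -> i8 {
//         x.clamp(i8::MIN as i16, i8::MAX as i16) as i8
//     }
//     assert_eq!(qxtn8(i16::MAX), i8::MAX); // why `validate` is all MAX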
5539 | ||
5540 | /// Unsigned saturating extract narrow | |
5541 | name = vqmovn | |
5542 | no-q | |
5543 | a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5544 | validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5545 | ||
5546 | aarch64 = uqxtn | |
5547 | link-aarch64 = uqxtn._EXT2_ | |
5548 | arm = vqmovn | |
5549 | link-arm = vqmovnu._EXT2_ | |
5550 | generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t | |
5551 | ||
5552 | /// Saturating extract narrow | |
5553 | name = vqmovn | |
c620b35d | 5554 | multi_fn = simd_extract!, {vqmovn-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 |
5555 | a = 1 |
5556 | validate 1 | |
5557 | ||
5558 | aarch64 = sqxtn | |
5559 | generate i16:i8, i32:i16 | |
5560 | aarch64 = uqxtn | |
5561 | generate u16:u8, u32:u16 | |
5562 | ||
5563 | /// Saturating extract narrow | |
5564 | name = vqmovn | |
5565 | a = 1 | |
5566 | validate 1 | |
5567 | ||
5568 | aarch64 = sqxtn | |
5569 | link-aarch64 = scalar.sqxtn._EXT2_._EXT_ | |
5570 | generate i64:i32 | |
5571 | ||
5572 | aarch64 = uqxtn | |
5573 | link-aarch64 = scalar.uqxtn._EXT2_._EXT_ | |
5574 | generate u64:u32 | |
5575 | ||
5576 | /// Saturating extract narrow | |
5577 | name = vqmovn_high | |
5578 | no-q | |
353b0b11 | 5579 | multi_fn = simd_shuffle!, a, {vqmovn-noqself-noext, b}, {asc-0-out_len} |
5580 | a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX |
5581 | b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5582 | validate MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5583 | ||
5584 | aarch64 = sqxtn2 | |
5585 | generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t | |
5586 | aarch64 = uqxtn2 | |
5587 | generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t | |
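// The `_high` forms fill the upper half of a wider result: the shuffle
// concatenates `a` (kept as the low lanes) with the saturated, narrowed
// `b` (placed in the high lanes), e.g.
// vqmovn_high_s16(a: int8x8_t, b: int16x8_t) -> int8x16_t.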
5588 | ||
5589 | /// Signed saturating extract unsigned narrow | |
5590 | name = vqmovun | |
5591 | no-q | |
5592 | a = -1, -1, -1, -1, -1, -1, -1, -1 | |
5593 | validate 0, 0, 0, 0, 0, 0, 0, 0 | |
5594 | ||
5595 | aarch64 = sqxtun | |
5596 | link-aarch64 = sqxtun._EXT2_ | |
5597 | arm = vqmovun | |
5598 | link-arm = vqmovnsu._EXT2_ | |
5599 | generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t | |
5600 | ||
5601 | /// Signed saturating extract unsigned narrow | |
5602 | name = vqmovun | |
c620b35d | 5603 | multi_fn = simd_extract!, {vqmovun-in_ntt-noext, {vdupq_n-in_ntt-noext, a}}, 0 |
5604 | a = 1 |
5605 | validate 1 | |
5606 | ||
5607 | aarch64 = sqxtun | |
5608 | generate i16:u8, i32:u16, i64:u32 | |
5609 | ||
5610 | /// Signed saturating extract unsigned narrow | |
5611 | name = vqmovun_high | |
5612 | no-q | |
353b0b11 | 5613 | multi_fn = simd_shuffle!, a, {vqmovun-noqself-noext, b}, {asc-0-out_len} |
5614 | a = 0, 0, 0, 0, 0, 0, 0, 0 |
5615 | b = -1, -1, -1, -1, -1, -1, -1, -1 | |
5616 | validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
5617 | ||
5618 | aarch64 = sqxtun2 | |
5619 | generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t | |
5620 | ||
5621 | /// Signed saturating rounding doubling multiply returning high half | |
5622 | name = vqrdmulh | |
5623 | a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5624 | b = 2, 2, 2, 2, 2, 2, 2, 2 | |
5625 | validate 2, 2, 2, 2, 2, 2, 2, 2 | |
5626 | ||
5627 | aarch64 = sqrdmulh | |
5628 | link-aarch64 = sqrdmulh._EXT_ | |
5629 | arm = vqrdmulh | |
5630 | link-arm = vqrdmulh._EXT_ | |
5631 | generate int16x4_t, int16x8_t, int32x2_t, int32x4_t | |
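// vqrdmulh adds a rounding constant of 1 << (bits - 1) before taking the
// high half, which is why the expected lane is 2 here where vqdmulh
// yields 1. Hedged scalar sketch (helper name hypothetical):
//
//     fn qrdmulh16(a: i16, b: i16) -> i16 {
//         ((2 * a as i32 * b as i32 + (1 << 15)) >> 16) as i16
//     }
//     assert_eq!(qrdmulh16(i16::MAX, 2), 2); // (131068 + 32768) >> 16 == 2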
5632 | ||
5633 | /// Signed saturating rounding doubling multiply returning high half | |
5634 | name = vqrdmulh | |
c620b35d | 5635 | multi_fn = simd_extract!, {vqrdmulh-in_ntt-noext, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 |
5636 | a = 1 |
5637 | b = 2 | |
5638 | validate 0 | |
5639 | ||
5640 | aarch64 = sqrdmulh | |
5641 | generate i16, i32 | |
5642 | ||
5643 | /// Vector saturating rounding doubling multiply high with scalar | |
5644 | name = vqrdmulh | |
5645 | out-n-suffix | |
5646 | multi_fn = vqrdmulh-out-noext, a, {vdup-nout-noext, b} | |
5647 | a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5648 | b = 2 | |
5649 | validate 2, 2, 2, 2, 2, 2, 2, 2 | |
5650 | ||
5651 | aarch64 = sqrdmulh | |
5652 | arm = vqrdmulh | |
5653 | generate int16x4_t:i16:int16x4_t, int16x8_t:i16:int16x8_t, int32x2_t:i32:int32x2_t, int32x4_t:i32:int32x4_t | |
5654 | ||
5655 | /// Vector saturating rounding doubling multiply high by scalar | |
5656 | name = vqrdmulh | |
5657 | lane-suffixes | |
5658 | constn = LANE | |
5659 | multi_fn = static_assert_imm-in_exp_len-LANE | |
353b0b11 | 5660 | multi_fn = simd_shuffle!, b:out_t, b, b, {dup-out_len-LANE as u32} |
5661 | multi_fn = vqrdmulh-out-noext, a, b |
5662 | a = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5663 | b = 0, 2, 0, 0, 0, 0, 0, 0 | |
5664 | n = 1 | |
5665 | validate 2, 2, 2, 2, 2, 2, 2, 2 | |
5666 | ||
5667 | aarch64 = sqrdmulh | |
5668 | arm = vqrdmulh | |
5669 | generate int16x4_t, int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x4_t:int16x8_t, int16x8_t | |
5670 | generate int32x2_t, int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x2_t:int32x4_t, int32x4_t | |
5671 | ||
5672 | /// Signed saturating rounding doubling multiply returning high half | |
5673 | name = vqrdmulh | |
5674 | lane-suffixes | |
5675 | constn = LANE | |
5676 | multi_fn = static_assert_imm-in_exp_len-LANE | |
c620b35d | 5677 | multi_fn = vqrdmulh-out-noext, a, {simd_extract!, b, LANE as u32} |
5678 | a = 1 |
5679 | b = 0, 2, 0, 0, 0, 0, 0, 0 | |
5680 | n = 1 | |
5681 | validate 0 | |
5682 | ||
5683 | aarch64 = sqrdmulh | |
5684 | generate i16:int16x4_t:i16, i16:int16x8_t:i16, i32:int32x2_t:i32, i32:int32x4_t:i32 | |
5685 | ||
5686 | /// Signed saturating rounding doubling multiply accumulate returning high half | |
5687 | name = vqrdmlah | |
5688 | a = 1, 1, 1, 1, 1, 1, 1, 1 |
5689 | b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5690 | c = 2, 2, 2, 2, 2, 2, 2, 2 | |
5691 | validate 3, 3, 3, 3, 3, 3, 3, 3 | |
5692 | ||
3c0e092e | 5693 | aarch64 = sqrdmlah |
5e7ed085 | 5694 | link-aarch64 = sqrdmlah._EXT_ |
3c0e092e | 5695 | target = rdm |
5696 | generate int16x4_t, int16x8_t, int32x2_t, int32x4_t |
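// Worked check: vqrdmlah saturating-adds the vqrdmulh product onto the
// accumulator, so each lane above is 1 + 2 = 3.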
5697 | ||
5698 | /// Signed saturating rounding doubling multiply accumulate returning high half | |
5699 | name = vqrdmlah | |
5700 | multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a |
5701 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b | |
5702 | multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c | |
c620b35d | 5703 | multi_fn = simd_extract!, {vqrdmlah-in_ntt-noext, a, b, c}, 0 |
5704 | a = 1 |
5705 | b = 1 | |
5706 | c = 2 | |
5707 | validate 1 | |
5708 | ||
5709 | aarch64 = sqrdmlah |
5710 | target = rdm | |
5711 | generate i16, i32 |
5712 | ||
5713 | /// Signed saturating rounding doubling multiply accumulate returning high half | |
5714 | name = vqrdmlah | |
5715 | in2-lane-suffixes | |
5716 | constn = LANE | |
5717 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 5718 | multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32} |
5e7ed085 | 5719 | multi_fn = vqrdmlah-out-noext, a, b, c |
5720 | a = 1, 1, 1, 1, 1, 1, 1, 1 |
5721 | b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5722 | c = 0, 2, 0, 0, 0, 0, 0, 0 | |
5723 | n = 1 | |
5724 | validate 3, 3, 3, 3, 3, 3, 3, 3 | |
5725 | ||
5726 | aarch64 = sqrdmlah |
5727 | target = rdm | |
5728 | generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t |
5729 | generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t | |
5730 | ||
5731 | /// Signed saturating rounding doubling multiply accumulate returning high half | |
5732 | name = vqrdmlah | |
5733 | in2-lane-suffixes | |
5734 | constn = LANE | |
5735 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
c620b35d | 5736 | multi_fn = vqrdmlah-self-noext, a, b, {simd_extract!, c, LANE as u32} |
5737 | a = 1 |
5738 | b = 1 | |
5739 | c = 0, 2, 0, 0, 0, 0, 0, 0 | |
5740 | n = 1 | |
5741 | validate 1 | |
5742 | ||
5743 | aarch64 = sqrdmlah |
5744 | target = rdm | |
5745 | generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 |
5746 | ||
5747 | /// Signed saturating rounding doubling multiply subtract returning high half | |
5748 | name = vqrdmlsh | |
04454e1e | 5749 | link-aarch64 = sqrdmlsh._EXT_ |
5750 | a = 1, 1, 1, 1, 1, 1, 1, 1 |
5751 | b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5752 | c = 2, 2, 2, 2, 2, 2, 2, 2 | |
5753 | validate -1, -1, -1, -1, -1, -1, -1, -1 | |
5754 | ||
5755 | aarch64 = sqrdmlsh |
5756 | target = rdm | |
5757 | generate int16x4_t, int16x8_t, int32x2_t, int32x4_t |
5758 | ||
5759 | /// Signed saturating rounding doubling multiply subtract returning high half | |
5760 | name = vqrdmlsh | |
5761 | multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a |
5762 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b | |
5763 | multi_fn = vdup_n-in_ntt-noext, c:in_ntt, c | |
c620b35d | 5764 | multi_fn = simd_extract!, {vqrdmlsh-in_ntt-noext, a, b, c}, 0 |
5765 | a = 1 |
5766 | b = 1 | |
5767 | c = 2 | |
5768 | validate 1 | |
5769 | ||
5770 | aarch64 = sqrdmlsh |
5771 | target = rdm | |
5772 | generate i16, i32 |
5773 | ||
5774 | /// Signed saturating rounding doubling multiply subtract returning high half | |
5775 | name = vqrdmlsh | |
5776 | in2-lane-suffixes | |
5777 | constn = LANE | |
5778 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
353b0b11 | 5779 | multi_fn = simd_shuffle!, c:out_t, c, c, {dup-out_len-LANE as u32} |
04454e1e | 5780 | multi_fn = vqrdmlsh-out-noext, a, b, c |
5781 | a = 1, 1, 1, 1, 1, 1, 1, 1 |
5782 | b = MAX, MAX, MAX, MAX, MAX, MAX, MAX, MAX | |
5783 | c = 0, 2, 0, 0, 0, 0, 0, 0 | |
5784 | n = 1 | |
5785 | validate -1, -1, -1, -1, -1, -1, -1, -1 | |
5786 | ||
5787 | aarch64 = sqrdmlsh |
5788 | target = rdm | |
5789 | generate int16x4_t, int16x4_t:int16x4_t:int16x8_t:int16x4_t, int16x8_t:int16x8_t:int16x4_t:int16x8_t, int16x8_t |
5790 | generate int32x2_t, int32x2_t:int32x2_t:int32x4_t:int32x2_t, int32x4_t:int32x4_t:int32x2_t:int32x4_t, int32x4_t | |
5791 | ||
5792 | /// Signed saturating rounding doubling multiply subtract returning high half | |
5793 | name = vqrdmlsh | |
5794 | in2-lane-suffixes | |
5795 | constn = LANE | |
5796 | multi_fn = static_assert_imm-in2_exp_len-LANE | |
c620b35d | 5797 | multi_fn = vqrdmlsh-self-noext, a, b, {simd_extract!, c, LANE as u32} |
5798 | a = 1 |
5799 | b = 1 | |
5800 | c = 0, 2, 0, 0, 0, 0, 0, 0 | |
5801 | n = 1 | |
5802 | validate 1 | |
5803 | ||
5804 | aarch64 = sqrdmlsh |
5805 | target = rdm | |
5806 | generate i16:i16:int16x4_t:i16, i16:i16:int16x8_t:i16, i32:i32:int32x2_t:i32, i32:i32:int32x4_t:i32 |
5807 | ||
5808 | /// Signed saturating rounding shift left | |
5809 | name = vqrshl | |
5810 | a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
5811 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
5812 | validate 8, MIN, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 | |
5813 | ||
5814 | aarch64 = sqrshl | |
5815 | link-aarch64 = sqrshl._EXT_ | |
5816 | generate i32, i64 | |
5817 | ||
5818 | arm = vqrshl | |
5819 | link-arm = vqrshifts._EXT_ | |
5820 | generate int*_t, int64x*_t | |
5821 | ||
5822 | /// Signed saturating rounding shift left | |
5823 | name = vqrshl | |
5824 | multi_fn = vdup_n-in_ntt-noext, a:in_ntt, a | |
5825 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b | |
c620b35d | 5826 | multi_fn = simd_extract!, {vqrshl-in_ntt-noext, a, b}, 0 |
5827 | a = 1 |
5828 | b = 2 | |
5829 | validate 4 | |
5830 | ||
5831 | aarch64 = sqrshl | |
5832 | generate i8, i16 | |
5833 | ||
5834 | /// Unsigned saturating rounding shift left | |
5835 | name = vqrshl | |
5836 | out-suffix | |
5837 | a = 2, MIN, MAX, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
5838 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
5839 | validate 8, 0, MAX, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 | |
5840 | ||
5841 | aarch64 = uqrshl | |
5842 | link-aarch64 = uqrshl._EXT_ | |
5843 | generate u32:i32:u32, u64:i64:u64 | |
5844 | ||
5845 | arm = vqrshl | |
5846 | link-arm = vqrshiftu._EXT_ | |
5847 | generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t | |
5848 | generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t | |
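// Even for unsigned data the shift vector stays signed: positive lane
// counts shift left, negative ones shift right with rounding. A hedged
// sketch of one u8 lane with s in -8..=8 (helper name hypothetical):
//
//     fn uqrshl8(a: u8, s: i8) -> u8 {
//         if s >= 0 {
//             let v = (a as u16) << s;        // widen, then shift left
//             v.min(u8::MAX as u16) as u8     // saturate when narrowing back
//         } else {
//             let n = (-s) as u32;
//             (((a as u16) + (1u16 << (n - 1))) >> n) as u8 // rounded right shift
//         }
//     }
//     assert_eq!(uqrshl8(2, 2), 8); // first lane of the vector test above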
5849 | ||
5850 | /// Unsigned saturating rounding shift left | |
5851 | name = vqrshl | |
5852 | out-suffix | |
5853 | multi_fn = vdup_n-out_ntt-noext, a:out_ntt, a | |
5854 | multi_fn = vdup_n-in_ntt-noext, b:in_ntt, b | |
c620b35d | 5855 | multi_fn = simd_extract!, {vqrshl-out_ntt-noext, a, b}, 0 |
5856 | a = 1 |
5857 | b = 2 | |
5858 | validate 4 | |
5859 | ||
5860 | aarch64 = uqrshl | |
5861 | generate u8:i8:u8, u16:i16:u16 | |
5862 | ||
5863 | /// Signed saturating rounded shift right narrow | |
5864 | name = vqrshrn | |
5865 | noq-n-suffix | |
5866 | constn = N | |
5867 | multi_fn = static_assert-N-1-halfbits | |
5868 | a = MIN, 4, 8, 12, 16, 20, 24, 28 | |
5869 | n = 2 | |
5870 | validate MIN, 1, 2, 3, 4, 5, 6, 7 | |
5871 | ||
5872 | aarch64 = sqrshrn | |
5873 | link-aarch64 = sqrshrn._EXT2_ | |
5874 | const-aarch64 = N | |
5875 | ||
5876 | arm = vqrshrn | |
5877 | link-arm = vqrshiftns._EXT2_ | |
5878 | const-arm = -N as ttn | |
c295e0f8 | 5879 | arm-aarch64-separate |
5880 | generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t |
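// Worked check: with N = 2 the rounding constant is 1 << 1 = 2, so each
// lane becomes (v + 2) >> 2 and is then saturated into the narrow type;
// 4 -> 1, 8 -> 2, and the i16::MIN lane still saturates to i8::MIN.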
5881 | ||
5882 | /// Signed saturating rounded shift right narrow | |
5883 | name = vqrshrn | |
5884 | noq-n-suffix | |
5885 | constn = N | |
5886 | multi_fn = static_assert-N-1-halfbits | |
5887 | multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a | |
c620b35d | 5888 | multi_fn = simd_extract!, {vqrshrn_n-in_ntt-::<N>, a}, 0 |
5889 | a = 4 |
5890 | n = 2 | |
5891 | validate 1 | |
5892 | ||
5893 | aarch64 = sqrshrn | |
5894 | generate i16:i8, i32:i16, i64:i32 | |
5895 | ||
5896 | /// Signed saturating rounded shift right narrow | |
5897 | name = vqrshrn_high | |
5898 | noq-n-suffix | |
5899 | constn = N | |
5900 | multi_fn = static_assert-N-1-halfbits | |
353b0b11 | 5901 | multi_fn = simd_shuffle!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
5902 | a = 0, 1, 2, 3, 2, 3, 6, 7 |
5903 | b = 8, 12, 24, 28, 48, 52, 56, 60 | |
5904 | n = 2 | |
5905 | validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 | |
5906 | ||
5907 | aarch64 = sqrshrn2 | |
5908 | generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t | |
5909 | ||
5910 | /// Unsigned saturating rounded shift right narrow | |
5911 | name = vqrshrn | |
5912 | noq-n-suffix | |
5913 | constn = N | |
5914 | multi_fn = static_assert-N-1-halfbits | |
5915 | a = MIN, 4, 8, 12, 16, 20, 24, 28 | |
5916 | n = 2 | |
5917 | validate 0, 1, 2, 3, 4, 5, 6, 7 | |
5918 | ||
5919 | aarch64 = uqrshrn | |
5920 | link-aarch64 = uqrshrn._EXT2_ | |
5921 | const-aarch64 = N | |
5922 | ||
5923 | arm = vqrshrn | |
5924 | link-arm = vqrshiftnu._EXT2_ | |
5925 | const-arm = -N as ttn | |
c295e0f8 | 5926 | arm-aarch64-separate |
5927 | generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t |
5928 | ||
5929 | /// Unsigned saturating rounded shift right narrow | |
5930 | name = vqrshrn | |
5931 | noq-n-suffix | |
5932 | constn = N | |
5933 | multi_fn = static_assert-N-1-halfbits | |
5934 | multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a | |
c620b35d | 5935 | multi_fn = simd_extract!, {vqrshrn_n-in_ntt-::<N>, a}, 0 |
5936 | a = 4 |
5937 | n = 2 | |
5938 | validate 1 | |
5939 | ||
5940 | aarch64 = uqrshrn | |
5941 | generate u16:u8, u32:u16, u64:u32 | |
5942 | ||
5943 | /// Unsigned saturating rounded shift right narrow | |
5944 | name = vqrshrn_high | |
5945 | noq-n-suffix | |
5946 | constn = N | |
5947 | multi_fn = static_assert-N-1-halfbits | |
353b0b11 | 5948 | multi_fn = simd_shuffle!, a, {vqrshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
5949 | a = 0, 1, 2, 3, 2, 3, 6, 7 |
5950 | b = 8, 12, 24, 28, 48, 52, 56, 60 | |
5951 | n = 2 | |
5952 | validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 | |
5953 | ||
5954 | aarch64 = uqrshrn2 | |
5955 | generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t | |
5956 | ||
5957 | /// Signed saturating rounded shift right unsigned narrow | |
5958 | name = vqrshrun | |
5959 | noq-n-suffix | |
5960 | constn = N | |
5961 | multi_fn = static_assert-N-1-halfbits | |
5962 | a = 0, 4, 8, 12, 16, 20, 24, 28 | |
5963 | n = 2 | |
5964 | validate 0, 1, 2, 3, 4, 5, 6, 7 | |
5965 | ||
5966 | aarch64 = sqrshrun | |
5967 | link-aarch64 = sqrshrun._EXT2_ | |
5968 | const-aarch64 = N | |
5969 | ||
5970 | arm = vqrshrun | |
5971 | link-arm = vqrshiftnsu._EXT2_ | |
5972 | const-arm = -N as ttn | |
c295e0f8 | 5973 | arm-aarch64-separate |
5974 | generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t |
5975 | ||
5976 | /// Signed saturating rounded shift right unsigned narrow | |
5977 | name = vqrshrun | |
5978 | noq-n-suffix | |
5979 | constn = N | |
5980 | multi_fn = static_assert-N-1-halfbits | |
5981 | multi_fn = vdupq_n-in_ntt-noext, a:in_long_ntt, a | |
c620b35d | 5982 | multi_fn = simd_extract!, {vqrshrun_n-in_ntt-::<N>, a}, 0 |
5983 | a = 4 |
5984 | n = 2 | |
5985 | validate 1 | |
5986 | ||
5987 | aarch64 = sqrshrun | |
5988 | generate i16:u8, i32:u16, i64:u32 | |
5989 | ||
5990 | /// Signed saturating rounded shift right unsigned narrow | |
5991 | name = vqrshrun_high | |
5992 | noq-n-suffix | |
5993 | constn = N | |
5994 | multi_fn = static_assert-N-1-halfbits | |
353b0b11 | 5995 | multi_fn = simd_shuffle!, a, {vqrshrun_n-noqself-::<N>, b}, {asc-0-out_len} |
5996 | a = 0, 1, 2, 3, 2, 3, 6, 7 |
5997 | b = 8, 12, 24, 28, 48, 52, 56, 60 | |
5998 | n = 2 | |
5999 | validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 12, 13, 14, 15 | |
6000 | ||
6001 | aarch64 = sqrshrun2 | |
6002 | generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t | |
6003 | ||
6004 | /// Signed saturating shift left | |
6005 | name = vqshl | |
6006 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
6007 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
6008 | validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 | |
6009 | ||
6010 | aarch64 = sqshl | |
6011 | link-aarch64 = sqshl._EXT_ | |
6012 | generate i64 | |
6013 | ||
6014 | arm = vqshl | |
6015 | link-arm = vqshifts._EXT_ | |
6016 | generate int*_t, int64x*_t | |
6017 | ||
6018 | /// Signed saturating shift left | |
6019 | name = vqshl | |
6020 | multi_fn = vqshl-in_ntt-noext, c:in_ntt, {vdup_n-in_ntt-noext, a}, {vdup_n-in_ntt-noext, b} | |
c620b35d | 6021 | multi_fn = simd_extract!, c, 0 |
6022 | a = 1 |
6023 | b = 2 | |
6024 | validate 4 | |
6025 | ||
6026 | aarch64 = sqshl | |
6027 | generate i8, i16, i32 | |
6028 | ||
6029 | /// Unsigned saturating shift left | |
6030 | name = vqshl | |
6031 | out-suffix | |
6032 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
6033 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
6034 | validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 | |
6035 | ||
6036 | aarch64 = uqshl | |
6037 | link-aarch64 = uqshl._EXT_ | |
6038 | generate u64:i64:u64 | |
6039 | ||
6040 | arm = vqshl | |
6041 | link-arm = vqshiftu._EXT_ | |
6042 | generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t | |
6043 | generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t | |
6044 | ||
6045 | /// Unsigned saturating shift left | |
6046 | name = vqshl | |
6047 | out-suffix | |
6048 | multi_fn = vqshl-out_ntt-noext, c:out_ntt, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b} | |
c620b35d | 6049 | multi_fn = simd_extract!, c, 0 |
6050 | a = 1 |
6051 | b = 2 | |
6052 | validate 4 | |
6053 | ||
6054 | aarch64 = uqshl | |
6055 | generate u8:i8:u8, u16:i16:u16, u32:i32:u32 | |
6056 | ||
6057 | /// Signed saturating shift left | |
6058 | name = vqshl | |
6059 | n-suffix | |
6060 | constn = N | |
6061 | multi_fn = static_assert_imm-out_bits_exp_len-N | |
a2a8927a | 6062 | multi_fn = vqshl-self-noext, a, {vdup-nself-noext, N as _} |
6063 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
6064 | n = 2 | |
6065 | validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 | |
6066 | ||
6067 | aarch64 = sqshl | |
6068 | arm = vqshl | |
6069 | generate int*_t, int64x*_t | |
6070 | ||
6071 | /// Signed saturating shift left | |
6072 | name = vqshl | |
6073 | n-suffix | |
6074 | constn = N | |
6075 | multi_fn = static_assert_imm-out_bits_exp_len-N | |
c620b35d | 6076 | multi_fn = simd_extract!, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0 |
6077 | a = 1 |
6078 | n = 2 | |
6079 | validate 4 | |
6080 | ||
6081 | aarch64 = sqshl | |
6082 | generate i8, i16, i32, i64 | |
6083 | ||
6084 | /// Unsigned saturating shift left | |
6085 | name = vqshl | |
6086 | n-suffix | |
6087 | constn = N | |
6088 | multi_fn = static_assert_imm-out_bits_exp_len-N | |
a2a8927a | 6089 | multi_fn = vqshl-self-noext, a, {vdup-nsigned-noext, N as _} |
6090 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
6091 | n = 2 | |
6092 | validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 | |
6093 | ||
6094 | aarch64 = uqshl | |
6095 | arm = vqshl | |
6096 | generate uint*_t, uint64x*_t | |
6097 | ||
6098 | /// Unsigned saturating shift left | |
6099 | name = vqshl | |
6100 | n-suffix | |
6101 | constn = N | |
6102 | multi_fn = static_assert_imm-out_bits_exp_len-N | |
c620b35d | 6103 | multi_fn = simd_extract!, {vqshl_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0 |
6104 | a = 1 |
6105 | n = 2 | |
6106 | validate 4 | |
6107 | ||
6108 | aarch64 = uqshl | |
6109 | generate u8, u16, u32, u64 | |
6110 | ||
6111 | /// Signed saturating shift left unsigned |
6112 | name = vqshlu | |
6113 | n-suffix | |
6114 | constn = N | |
6115 | multi_fn = static_assert_imm-out_bits_exp_len-N | |
6116 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
6117 | n = 2 | |
6118 | validate 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60 | |
6119 | arm-aarch64-separate | |
6120 | ||
6121 | aarch64 = sqshlu | |
6122 | link-aarch64 = sqshlu._EXT_ | |
6123 | const-aarch64 = {dup-in_len-N as ttn} | |
6124 | arm = vqshlu | |
6125 | link-arm = vqshiftsu._EXT_ | |
6126 | const-arm = N as ttn | |
6127 | generate int8x8_t:uint8x8_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t | |
6128 | generate int8x16_t:uint8x16_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t | |
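// vqshlu takes signed input but produces an unsigned, saturated result,
// so negative inputs clamp to zero. A hedged sketch of one i8 lane
// (helper name hypothetical):
//
//     fn qshlu8<const N: u32>(a: i8) -> u8 {
//         let v = (a as i32) << N;
//         v.clamp(0, u8::MAX as i32) as u8
//     }
//     assert_eq!(qshlu8::<2>(1), 4);  // 1 << 2 = 4: lane 1 of the test above
//     assert_eq!(qshlu8::<2>(-1), 0); // negative input saturates to 0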
6129 | ||
6130 | /// Signed saturating shift left unsigned | |
6131 | name = vqshlu | |
6132 | n-suffix | |
6133 | constn = N | |
6134 | multi_fn = static_assert_imm-out_bits_exp_len-N | |
c620b35d | 6135 | multi_fn = simd_extract!, {vqshlu_n-in_ntt-::<N>, {vdup_n-in_ntt-noext, a}}, 0 |
6136 | a = 1 |
6137 | n = 2 | |
6138 | validate 4 | |
6139 | ||
6140 | aarch64 = sqshlu | |
6141 | generate i8:u8, i16:u16, i32:u32, i64:u64 | |
6142 | ||
6143 | /// Signed saturating shift right narrow |
6144 | name = vqshrn | |
6145 | noq-n-suffix | |
6146 | constn = N | |
6147 | multi_fn = static_assert-N-1-halfbits | |
6148 | a = 0, 4, 8, 12, 16, 20, 24, 28 | |
6149 | n = 2 | |
6150 | validate 0, 1, 2, 3, 4, 5, 6, 7 | |
c295e0f8 | 6151 | arm-aarch64-separate |
6152 | |
6153 | aarch64 = sqshrn | |
6154 | link-aarch64 = sqshrn._EXT2_ | |
6155 | const-aarch64 = N | |
6156 | generate i64:i32 | |
6157 | ||
6158 | arm = vqshrn | |
6159 | link-arm = vqshiftns._EXT2_ | |
6160 | const-arm = -N as ttn | |
6161 | generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t | |
6162 | ||
6163 | /// Signed saturating shift right narrow | |
6164 | name = vqshrn | |
6165 | noq-n-suffix | |
6166 | constn = N | |
6167 | multi_fn = static_assert-N-1-halfbits | |
c620b35d | 6168 | multi_fn = simd_extract!, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0 |
6169 | a = 4 |
6170 | n = 2 | |
6171 | validate 1 | |
6172 | ||
6173 | aarch64 = sqshrn | |
6174 | generate i16:i8, i32:i16 | |
6175 | ||
6176 | /// Signed saturating shift right narrow | |
6177 | name = vqshrn_high | |
6178 | noq-n-suffix | |
6179 | constn = N | |
6180 | multi_fn = static_assert-N-1-halfbits | |
353b0b11 | 6181 | multi_fn = simd_shuffle!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
6182 | a = 0, 1, 8, 9, 8, 9, 10, 11 |
6183 | b = 32, 36, 40, 44, 48, 52, 56, 60 | |
6184 | n = 2 | |
6185 | validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 | |
6186 | ||
6187 | aarch64 = sqshrn2 | |
6188 | generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t | |
6189 | ||
6190 | /// Unsigned saturating shift right narrow | |
6191 | name = vqshrn | |
6192 | noq-n-suffix | |
6193 | constn = N | |
6194 | multi_fn = static_assert-N-1-halfbits | |
6195 | a = 0, 4, 8, 12, 16, 20, 24, 28 | |
6196 | n = 2 | |
6197 | validate 0, 1, 2, 3, 4, 5, 6, 7 | |
c295e0f8 | 6198 | arm-aarch64-separate |
6199 | |
6200 | aarch64 = uqshrn | |
6201 | link-aarch64 = uqshrn._EXT2_ | |
6202 | const-aarch64 = N | |
6203 | generate u64:u32 | |
6204 | ||
6205 | arm = vqshrn | |
6206 | link-arm = vqshiftnu._EXT2_ | |
6207 | const-arm = -N as ttn | |
6208 | generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t | |
6209 | ||
6210 | /// Unsigned saturating shift right narrow | |
6211 | name = vqshrn | |
6212 | noq-n-suffix | |
6213 | constn = N | |
6214 | multi_fn = static_assert-N-1-halfbits | |
c620b35d | 6215 | multi_fn = simd_extract!, {vqshrn_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0 |
6216 | a = 4 |
6217 | n = 2 | |
6218 | validate 1 | |
6219 | ||
6220 | aarch64 = uqshrn | |
6221 | generate u16:u8, u32:u16 | |
6222 | ||
6223 | /// Unsigned saturating shift right narrow | |
6224 | name = vqshrn_high | |
6225 | noq-n-suffix | |
6226 | constn = N | |
6227 | multi_fn = static_assert-N-1-halfbits | |
353b0b11 | 6228 | multi_fn = simd_shuffle!, a, {vqshrn_n-noqself-::<N>, b}, {asc-0-out_len} |
6229 | a = 0, 1, 8, 9, 8, 9, 10, 11 |
6230 | b = 32, 36, 40, 44, 48, 52, 56, 60 | |
6231 | n = 2 | |
6232 | validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 | |
6233 | ||
6234 | aarch64 = uqshrn2 | |
6235 | generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t | |
6236 | ||
6237 | /// Signed saturating shift right unsigned narrow | |
6238 | name = vqshrun | |
6239 | noq-n-suffix | |
6240 | constn = N | |
6241 | multi_fn = static_assert-N-1-halfbits | |
6242 | a = 0, 4, 8, 12, 16, 20, 24, 28 | |
6243 | n = 2 | |
6244 | validate 0, 1, 2, 3, 4, 5, 6, 7 | |
c295e0f8 | 6245 | arm-aarch64-separate |
6246 | |
6247 | aarch64 = sqshrun | |
6248 | link-aarch64 = sqshrun._EXT2_ | |
6249 | const-aarch64 = N | |
6250 | ||
6251 | arm = vqshrun | |
6252 | link-arm = vqshiftnsu._EXT2_ | |
6253 | const-arm = -N as ttn | |
6254 | generate int16x8_t:uint8x8_t, int32x4_t:uint16x4_t, int64x2_t:uint32x2_t | |
6255 | ||
6256 | /// Signed saturating shift right unsigned narrow | |
6257 | name = vqshrun | |
6258 | noq-n-suffix | |
6259 | constn = N | |
6260 | multi_fn = static_assert-N-1-halfbits | |
c620b35d | 6261 | multi_fn = simd_extract!, {vqshrun_n-in_ntt-::<N>, {vdupq_n-in_ntt-noext, a}}, 0 |
6262 | a = 4 |
6263 | n = 2 | |
6264 | validate 1 | |
6265 | ||
6266 | aarch64 = sqshrun | |
6267 | generate i16:u8, i32:u16, i64:u32 | |
6268 | ||
6269 | /// Signed saturating shift right unsigned narrow | |
6270 | name = vqshrun_high | |
6271 | noq-n-suffix | |
6272 | constn = N | |
6273 | multi_fn = static_assert-N-1-halfbits | |
353b0b11 | 6274 | multi_fn = simd_shuffle!, a, {vqshrun_n-noqself-::<N>, b}, {asc-0-out_len} |
6275 | a = 0, 1, 8, 9, 8, 9, 10, 11 |
6276 | b = 32, 36, 40, 44, 48, 52, 56, 60 | |
6277 | n = 2 | |
6278 | validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15 | |
6279 | ||
6280 | aarch64 = sqshrun2 | |
6281 | generate uint8x8_t:int16x8_t:uint8x16_t, uint16x4_t:int32x4_t:uint16x8_t, uint32x2_t:int64x2_t:uint32x4_t | |
6282 | ||
6283 | /// Unsigned saturating accumulate of signed value |
6284 | name = vsqadd | |
6285 | out-suffix | |
c620b35d | 6286 | multi_fn = simd_extract!, {vsqadd-out_ntt-noext, {vdup_n-out_ntt-noext, a}, {vdup_n-in_ntt-noext, b}}, 0 |
6287 | a = 2 |
6288 | b = 2 | |
6289 | validate 4 | |
6290 | ||
6291 | aarch64 = usqadd | |
6292 | generate u8:i8:u8, u16:i16:u16 | |
6293 | ||
6294 | /// Unsigned saturating accumulate of signed value | |
6295 | name = vsqadd | |
6296 | out-suffix | |
6297 | a = 2 | |
6298 | b = 2 | |
6299 | validate 4 | |
6300 | ||
6301 | aarch64 = usqadd | |
6302 | link-aarch64 = usqadd._EXT_ | |
6303 | generate u32:i32:u32, u64:i64:u64 | |
6304 | ||
6305 | /// Calculates the square root of each lane. |
6306 | name = vsqrt | |
6307 | fn = simd_fsqrt | |
6308 | a = 4.0, 9.0, 16.0, 25.0 | |
6309 | validate 2.0, 3.0, 4.0, 5.0 | |
6310 | ||
6311 | aarch64 = fsqrt | |
6312 | generate float*_t, float64x*_t | |
6313 | ||
6314 | /// Reciprocal square-root estimate. | |
6315 | name = vrsqrte | |
6316 | a = 1.0, 2.0, 3.0, 4.0 | |
6317 | validate 0.998046875, 0.705078125, 0.576171875, 0.4990234375 | |
6318 | ||
6319 | aarch64 = frsqrte | |
6320 | link-aarch64 = frsqrte._EXT_ | |
6321 | generate float64x*_t, f32, f64 |
6322 | ||
6323 | arm = vrsqrte | |
6324 | link-arm = vrsqrte._EXT_ | |
6325 | generate float*_t | |
6326 | ||
6327 | /// Unsigned reciprocal square root estimate | |
6328 | name = vrsqrte | |
6329 | a = 1, 2, 3, 4 | |
6330 | validate 4294967295, 4294967295, 4294967295, 4294967295 | |
17df50a5 | 6331 | |
6332 | aarch64 = ursqrte |
6333 | link-aarch64 = ursqrte._EXT_ | |
17df50a5 XL |
6334 | arm = vrsqrte |
6335 | link-arm = vrsqrte._EXT_ | |
6336 | generate uint32x2_t, uint32x4_t |
6337 | ||
6338 | /// Floating-point reciprocal square root step | |
6339 | name = vrsqrts | |
6340 | a = 1.0, 2.0, 3.0, 4.0 | |
6341 | b = 1.0, 2.0, 3.0, 4.0 | |
6342 | validate 1., -0.5, -3.0, -6.5 | |
6343 | ||
6344 | aarch64 = frsqrts | |
6345 | link-aarch64 = frsqrts._EXT_ | |
6346 | generate float64x*_t, f32, f64 | |
6347 | ||
6348 | arm = vrsqrts | |
6349 | link-arm = vrsqrts._EXT_ | |
6350 | generate float*_t |
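// frsqrts(a, b) computes (3 - a*b) / 2, the correction factor for one
// Newton-Raphson step on a vrsqrte estimate (x' = x * frsqrts(d*x, x)).
// Worked check of the values above: a = b = (1, 2, 3, 4) gives
// (3 - 1)/2 = 1, (3 - 4)/2 = -0.5, (3 - 9)/2 = -3, (3 - 16)/2 = -6.5.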
6351 | ||
6352 | /// Reciprocal estimate. | |
6353 | name = vrecpe | |
6354 | a = 4.0, 3.0, 2.0, 1.0 | |
6355 | validate 0.24951171875, 0.3330078125, 0.4990234375, 0.998046875 | |
6356 | ||
6357 | aarch64 = frecpe | |
6358 | link-aarch64 = frecpe._EXT_ | |
3c0e092e | 6359 | generate float64x*_t, f32, f64 |
6360 | |
6361 | arm = vrecpe | |
6362 | link-arm = vrecpe._EXT_ | |
6363 | generate float*_t | |
6364 | ||
6365 | /// Unsigned reciprocal estimate |
6366 | name = vrecpe | |
6367 | a = 4, 3, 2, 1 | |
6368 | validate 4294967295, 4294967295, 4294967295, 4294967295 | |
6369 | ||
6370 | aarch64 = urecpe | |
6371 | link-aarch64 = urecpe._EXT_ | |
6372 | arm = vrecpe | |
6373 | link-arm = vrecpe._EXT_ | |
6374 | generate uint32x2_t, uint32x4_t | |
6375 | ||
6376 | /// Floating-point reciprocal step | |
6377 | name = vrecps | |
6378 | a = 4.0, 3.0, 2.0, 1.0 | |
6379 | b = 4.0, 3.0, 2.0, 1.0 | |
6380 | validate -14., -7., -2., 1. | |
6381 | ||
6382 | aarch64 = frecps | |
6383 | link-aarch64 = frecps._EXT_ | |
6384 | generate float64x*_t, f32, f64 | |
6385 | ||
6386 | arm = vrecps | |
6387 | link-arm = vrecps._EXT_ | |
6388 | generate float*_t | |
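// vrecpe and vrecps pair up for Newton-Raphson refinement of a
// reciprocal: frecps(d, x) = 2 - d*x, so x * frecps(d, x) roughly doubles
// the correct bits of x. A hedged scalar sketch (hypothetical helper; on
// NEON this is one vmul plus one vrecps per step):
//
//     fn refine_recip(d: f32, x: f32) -> f32 {
//         x * (2.0 - d * x)
//     }
//     // seeded with vrecpe's estimate for 3.0 from the section above:
//     // refine_recip(3.0, 0.3330078125) ≈ 0.333333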
6389 | ||
6390 | /// Floating-point reciprocal exponent | |
6391 | name = vrecpx | |
6392 | a = 4.0 | |
6393 | validate 0.5 | |
6394 | ||
6395 | aarch64 = frecpx | |
6396 | link-aarch64 = frecpx._EXT_ | |
6397 | generate f32, f64 | |
6398 | ||
6399 | /// Vector reinterpret cast operation |
6400 | name = vreinterpret | |
6401 | double-suffixes | |
6402 | fn = transmute | |
6403 | a = 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
6404 | validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 | |
6405 | ||
c295e0f8 | 6406 | aarch64 = nop |
6407 | generate poly64x1_t:int64x1_t, poly64x1_t:uint64x1_t, int64x1_t:poly64x1_t, uint64x1_t:poly64x1_t |
6408 | generate poly64x2_t:int64x2_t, poly64x2_t:uint64x2_t, int64x2_t:poly64x2_t, uint64x2_t:poly64x2_t | |
6409 | ||
c295e0f8 | 6410 | arm = nop |
6411 | generate uint8x8_t:int8x8_t, poly8x8_t:int8x8_t, poly16x4_t:int16x4_t, uint16x4_t:int16x4_t, uint32x2_t:int32x2_t, uint64x1_t:int64x1_t |
6412 | generate uint8x16_t:int8x16_t, poly8x16_t:int8x16_t, poly16x8_t:int16x8_t, uint16x8_t:int16x8_t, uint32x4_t:int32x4_t, uint64x2_t:int64x2_t | |
6413 | generate poly8x8_t:uint8x8_t, int8x8_t:uint8x8_t, poly16x4_t:uint16x4_t, int16x4_t:uint16x4_t, int32x2_t:uint32x2_t, int64x1_t:uint64x1_t | |
6414 | generate poly8x16_t:uint8x16_t, int8x16_t:uint8x16_t, poly16x8_t:uint16x8_t, int16x8_t:uint16x8_t, int32x4_t:uint32x4_t, int64x2_t:uint64x2_t | |
6415 | generate int8x8_t:poly8x8_t, uint8x8_t:poly8x8_t, int16x4_t:poly16x4_t, uint16x4_t:poly16x4_t | |
6416 | generate int8x16_t:poly8x16_t, uint8x16_t:poly8x16_t, int16x8_t:poly16x8_t, uint16x8_t:poly16x8_t | |
6417 | ||
6418 | /// Vector reinterpret cast operation | |
6419 | name = vreinterpret | |
6420 | double-suffixes | |
6421 | fn = transmute | |
6422 | a = 0, 1, 2, 3, 4, 5, 6, 7 | |
6423 | validate 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 | |
6424 | ||
c295e0f8 | 6425 | aarch64 = nop |
c295e0f8 | 6426 | arm = nop |
6427 | generate int16x4_t:int8x8_t, uint16x4_t:int8x8_t, poly16x4_t:int8x8_t, int32x2_t:int16x4_t, uint32x2_t:int16x4_t, int64x1_t:int32x2_t, uint64x1_t:int32x2_t |
6428 | generate int16x8_t:int8x16_t, uint16x8_t:int8x16_t, poly16x8_t:int8x16_t, int32x4_t:int16x8_t, uint32x4_t:int16x8_t, int64x2_t:int32x4_t, uint64x2_t:int32x4_t | |
6429 | generate poly16x4_t:uint8x8_t, int16x4_t:uint8x8_t, uint16x4_t:uint8x8_t, int32x2_t:uint16x4_t, uint32x2_t:uint16x4_t, int64x1_t:uint32x2_t, uint64x1_t:uint32x2_t | |
6430 | generate poly16x8_t:uint8x16_t, int16x8_t:uint8x16_t, uint16x8_t:uint8x16_t, int32x4_t:uint16x8_t, uint32x4_t:uint16x8_t, int64x2_t:uint32x4_t, uint64x2_t:uint32x4_t | |
6431 | generate poly16x4_t:poly8x8_t, int16x4_t:poly8x8_t, uint16x4_t:poly8x8_t, int32x2_t:poly16x4_t, uint32x2_t:poly16x4_t | |
6432 | generate poly16x8_t:poly8x16_t, int16x8_t:poly8x16_t, uint16x8_t:poly8x16_t, int32x4_t:poly16x8_t, uint32x4_t:poly16x8_t | |
6433 | target = aes |
6434 | generate poly64x1_t:int32x2_t, poly64x1_t:uint32x2_t | |
6435 | generate poly64x2_t:int32x4_t, poly64x2_t:uint32x4_t | |
6436 | generate p128:int64x2_t, p128:uint64x2_t, p128:poly64x2_t | |
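// Reinterpret casts are plain bit casts (the spec's `fn = transmute`):
// lanes are re-partitioned, never value-converted. The splitting patterns
// above assume little-endian lane order, the usual NEON configuration;
// a minimal sketch of the same split:
//
//     let halfwords: [i16; 4] = [0, 1, 2, 3];
//     let bytes: [i8; 8] = unsafe { core::mem::transmute(halfwords) };
//     assert_eq!(bytes, [0, 0, 1, 0, 2, 0, 3, 0]); // on little-endian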
6437 | |
6438 | /// Vector reinterpret cast operation | |
6439 | name = vreinterpret | |
6440 | double-suffixes | |
6441 | fn = transmute | |
6442 | a = 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 | |
6443 | validate 0, 1, 2, 3, 4, 5, 6, 7 | |
6444 | ||
c295e0f8 | 6445 | aarch64 = nop |
c295e0f8 | 6446 | arm = nop |
6447 | generate poly8x8_t:int16x4_t, int8x8_t:int16x4_t, uint8x8_t:int16x4_t, poly16x4_t:int32x2_t, int16x4_t:int32x2_t, uint16x4_t:int32x2_t, int32x2_t:int64x1_t, uint32x2_t:int64x1_t |
6448 | generate poly8x16_t:int16x8_t, int8x16_t:int16x8_t, uint8x16_t:int16x8_t, poly16x8_t:int32x4_t, int16x8_t:int32x4_t, uint16x8_t:int32x4_t, int32x4_t:int64x2_t, uint32x4_t:int64x2_t | |
6449 | generate poly8x8_t:uint16x4_t, int8x8_t:uint16x4_t, uint8x8_t:uint16x4_t, poly16x4_t:uint32x2_t, int16x4_t:uint32x2_t, uint16x4_t:uint32x2_t, int32x2_t:uint64x1_t, uint32x2_t:uint64x1_t | |
6450 | generate poly8x16_t:uint16x8_t, int8x16_t:uint16x8_t, uint8x16_t:uint16x8_t, poly16x8_t:uint32x4_t, int16x8_t:uint32x4_t, uint16x8_t:uint32x4_t, int32x4_t:uint64x2_t, uint32x4_t:uint64x2_t | |
6451 | generate poly8x8_t:poly16x4_t, int8x8_t:poly16x4_t, uint8x8_t:poly16x4_t | |
6452 | generate poly8x16_t:poly16x8_t, int8x16_t:poly16x8_t, uint8x16_t:poly16x8_t | |
6453 | target = aes |
6454 | generate int32x2_t:poly64x1_t, uint32x2_t:poly64x1_t | |
6455 | generate int32x4_t:poly64x2_t, uint32x4_t:poly64x2_t | |
6456 | generate int64x2_t:p128, uint64x2_t:p128, poly64x2_t:p128 | |
6457 | |
6458 | /// Vector reinterpret cast operation | |
6459 | name = vreinterpret | |
6460 | double-suffixes | |
6461 | fn = transmute | |
6462 | a = 0, 1, 2, 3 | |
6463 | validate 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 | |
6464 | ||
c295e0f8 | 6465 | aarch64 = nop |
c295e0f8 | 6466 | arm = nop |
6467 | generate int32x2_t:int8x8_t, uint32x2_t:int8x8_t, int64x1_t:int16x4_t, uint64x1_t:int16x4_t |
6468 | generate int32x4_t:int8x16_t, uint32x4_t:int8x16_t, int64x2_t:int16x8_t, uint64x2_t:int16x8_t | |
6469 | generate int32x2_t:uint8x8_t, uint32x2_t:uint8x8_t, int64x1_t:uint16x4_t, uint64x1_t:uint16x4_t | |
6470 | generate int32x4_t:uint8x16_t, uint32x4_t:uint8x16_t, int64x2_t:uint16x8_t, uint64x2_t:uint16x8_t | |
6471 | generate int32x2_t:poly8x8_t, uint32x2_t:poly8x8_t, int64x1_t:poly16x4_t, uint64x1_t:poly16x4_t | |
6472 | generate int32x4_t:poly8x16_t, uint32x4_t:poly8x16_t, int64x2_t:poly16x8_t, uint64x2_t:poly16x8_t | |
6473 | target = aes |
6474 | generate poly64x1_t:int16x4_t, poly64x1_t:uint16x4_t, poly64x1_t:poly16x4_t | |
6475 | generate poly64x2_t:int16x8_t, poly64x2_t:uint16x8_t, poly64x2_t:poly16x8_t | |
6476 | generate p128:int32x4_t, p128:uint32x4_t | |
6477 | |
6478 | /// Vector reinterpret cast operation | |
6479 | name = vreinterpret | |
6480 | double-suffixes | |
6481 | fn = transmute | |
6482 | a = 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0 | |
6483 | validate 0, 1, 2, 3 | |
6484 | ||
c295e0f8 | 6485 | aarch64 = nop |
c295e0f8 | 6486 | arm = nop |
6487 | generate poly8x8_t:int32x2_t, int8x8_t:int32x2_t, uint8x8_t:int32x2_t, poly16x4_t:int64x1_t, int16x4_t:int64x1_t, uint16x4_t:int64x1_t |
6488 | generate poly8x16_t:int32x4_t, int8x16_t:int32x4_t, uint8x16_t:int32x4_t, poly16x8_t:int64x2_t, int16x8_t:int64x2_t, uint16x8_t:int64x2_t | |
6489 | generate poly8x8_t:uint32x2_t, int8x8_t:uint32x2_t, uint8x8_t:uint32x2_t, poly16x4_t:uint64x1_t, int16x4_t:uint64x1_t, uint16x4_t:uint64x1_t | |
6490 | generate poly8x16_t:uint32x4_t, int8x16_t:uint32x4_t, uint8x16_t:uint32x4_t, poly16x8_t:uint64x2_t, int16x8_t:uint64x2_t, uint16x8_t:uint64x2_t | |
6491 | target = aes |
6492 | generate poly16x4_t:poly64x1_t, int16x4_t:poly64x1_t, uint16x4_t:poly64x1_t | |
6493 | generate poly16x8_t:poly64x2_t, int16x8_t:poly64x2_t, uint16x8_t:poly64x2_t | |
6494 | generate int32x4_t:p128, uint32x4_t:p128 | |
6495 | |
6496 | /// Vector reinterpret cast operation | |
6497 | name = vreinterpret | |
6498 | double-suffixes | |
6499 | fn = transmute | |
6500 | a = 0, 1 | |
6501 | validate 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | |
6502 | ||
c295e0f8 | 6503 | aarch64 = nop |
c295e0f8 | 6504 | arm = nop |
6505 | generate int64x1_t:int8x8_t, uint64x1_t:int8x8_t, int64x1_t:uint8x8_t, uint64x1_t:uint8x8_t, int64x1_t:poly8x8_t, uint64x1_t:poly8x8_t |
6506 | generate int64x2_t:int8x16_t, uint64x2_t:int8x16_t, int64x2_t:uint8x16_t, uint64x2_t:uint8x16_t, int64x2_t:poly8x16_t, uint64x2_t:poly8x16_t | |
6507 | target = aes |
6508 | generate poly64x1_t:int8x8_t, poly64x1_t:uint8x8_t, poly64x1_t:poly8x8_t | |
6509 | generate poly64x2_t:int8x16_t, poly64x2_t:uint8x16_t, poly64x2_t:poly8x16_t | |
6510 | generate p128:int16x8_t, p128:uint16x8_t, p128:poly16x8_t | |
6511 | |
6512 | /// Vector reinterpret cast operation | |
6513 | name = vreinterpret | |
6514 | double-suffixes | |
6515 | fn = transmute | |
6516 | a = 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 | |
6517 | validate 0, 1 | |
6518 | ||
c295e0f8 | 6519 | aarch64 = nop |
6520 | arm = nop |
6521 | generate poly8x8_t:int64x1_t, int8x8_t:int64x1_t, uint8x8_t:int64x1_t, poly8x8_t:uint64x1_t, int8x8_t:uint64x1_t, uint8x8_t:uint64x1_t | |
6522 | generate poly8x16_t:int64x2_t, int8x16_t:int64x2_t, uint8x16_t:int64x2_t, poly8x16_t:uint64x2_t, int8x16_t:uint64x2_t, uint8x16_t:uint64x2_t | |
6523 | target = aes | |
6524 | generate poly8x8_t:poly64x1_t, int8x8_t:poly64x1_t, uint8x8_t:poly64x1_t |
6525 | generate poly8x16_t:poly64x2_t, int8x16_t:poly64x2_t, uint8x16_t:poly64x2_t | |
6526 | generate int16x8_t:p128, uint16x8_t:p128, poly16x8_t:p128 |
6527 | ||
6528 | /// Vector reinterpret cast operation | |
6529 | name = vreinterpret | |
6530 | double-suffixes | |
6531 | fn = transmute | |
6532 | a = 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
6533 | validate 1 | |
6534 | target = aes | |
17df50a5 | 6535 | |
3c0e092e | 6536 | aarch64 = nop |
c295e0f8 | 6537 | arm = nop |
6538 | generate int8x16_t:p128, uint8x16_t:p128, poly8x16_t:p128 |
6539 | ||
6540 | /// Vector reinterpret cast operation | |
6541 | name = vreinterpret | |
6542 | double-suffixes | |
6543 | fn = transmute | |
6544 | a = 1 | |
6545 | validate 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
6546 | target = aes | |
6547 | ||
6548 | aarch64 = nop | |
6549 | arm = nop | |
6550 | generate p128:int8x16_t, p128:uint8x16_t, p128:poly8x16_t | |
6551 | |
6552 | /// Vector reinterpret cast operation | |
6553 | name = vreinterpret | |
6554 | double-suffixes | |
6555 | fn = transmute | |
6556 | a = 0., 0., 0., 0., 0., 0., 0., 0. | |
6557 | validate 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
6558 | ||
c295e0f8 | 6559 | aarch64 = nop |
6560 | generate float64x1_t:int8x8_t, float64x1_t:int16x4_t, float64x1_t:int32x2_t, float64x1_t:int64x1_t |
6561 | generate float64x2_t:int8x16_t, float64x2_t:int16x8_t, float64x2_t:int32x4_t, float64x2_t:int64x2_t | |
6562 | generate float64x1_t:uint8x8_t, float64x1_t:uint16x4_t, float64x1_t:uint32x2_t, float64x1_t:uint64x1_t | |
6563 | generate float64x2_t:uint8x16_t, float64x2_t:uint16x8_t, float64x2_t:uint32x4_t, float64x2_t:uint64x2_t | |
6564 | generate float64x1_t:poly8x8_t, float64x1_t:poly16x4_t, float32x2_t:poly64x1_t, float64x1_t:poly64x1_t | |
6565 | generate float64x2_t:poly8x16_t, float64x2_t:poly16x8_t, float32x4_t:poly64x2_t, float64x2_t:poly64x2_t | |
3c0e092e | 6566 | generate float64x2_t:p128 |
17df50a5 | 6567 | |
c295e0f8 | 6568 | arm = nop |
6569 | generate float32x2_t:int8x8_t, float32x2_t:int16x4_t, float32x2_t:int32x2_t, float32x2_t:int64x1_t |
6570 | generate float32x4_t:int8x16_t, float32x4_t:int16x8_t, float32x4_t:int32x4_t, float32x4_t:int64x2_t | |
6571 | generate float32x2_t:uint8x8_t, float32x2_t:uint16x4_t, float32x2_t:uint32x2_t, float32x2_t:uint64x1_t | |
6572 | generate float32x4_t:uint8x16_t, float32x4_t:uint16x8_t, float32x4_t:uint32x4_t, float32x4_t:uint64x2_t | |
6573 | generate float32x2_t:poly8x8_t, float32x2_t:poly16x4_t | |
6574 | generate float32x4_t:poly8x16_t, float32x4_t:poly16x8_t | |
3c0e092e | 6575 | generate float32x4_t:p128 |
6576 | |
6577 | /// Vector reinterpret cast operation | |
6578 | name = vreinterpret | |
6579 | double-suffixes | |
6580 | fn = transmute | |
6581 | a = 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 | |
6582 | validate 0., 0., 0., 0., 0., 0., 0., 0. | |
6583 | ||
c295e0f8 | 6584 | aarch64 = nop |
6585 | generate int8x8_t:float64x1_t, int16x4_t:float64x1_t, int32x2_t:float64x1_t, int64x1_t:float64x1_t |
6586 | generate int8x16_t:float64x2_t, int16x8_t:float64x2_t, int32x4_t:float64x2_t, int64x2_t:float64x2_t | |
6587 | generate poly8x8_t:float64x1_t, uint16x4_t:float64x1_t, uint32x2_t:float64x1_t, uint64x1_t:float64x1_t | |
6588 | generate poly8x16_t:float64x2_t, uint16x8_t:float64x2_t, uint32x4_t:float64x2_t, uint64x2_t:float64x2_t | |
6589 | generate uint8x8_t:float64x1_t, poly16x4_t:float64x1_t, poly64x1_t:float64x1_t, poly64x1_t:float32x2_t | |
6590 | generate uint8x16_t:float64x2_t, poly16x8_t:float64x2_t, poly64x2_t:float64x2_t, poly64x2_t:float32x4_t | |
3c0e092e | 6591 | generate p128:float64x2_t |
17df50a5 | 6592 | |
c295e0f8 | 6593 | arm = nop |
6594 | generate int8x8_t:float32x2_t, int16x4_t:float32x2_t, int32x2_t:float32x2_t, int64x1_t:float32x2_t |
6595 | generate int8x16_t:float32x4_t, int16x8_t:float32x4_t, int32x4_t:float32x4_t, int64x2_t:float32x4_t | |
6596 | generate uint8x8_t:float32x2_t, uint16x4_t:float32x2_t, uint32x2_t:float32x2_t, uint64x1_t:float32x2_t | |
6597 | generate uint8x16_t:float32x4_t, uint16x8_t:float32x4_t, uint32x4_t:float32x4_t, uint64x2_t:float32x4_t | |
6598 | generate poly8x8_t:float32x2_t, poly16x4_t:float32x2_t | |
6599 | generate poly8x16_t:float32x4_t, poly16x8_t:float32x4_t | |
3c0e092e | 6600 | generate p128:float32x4_t |
6601 | |
6602 | /// Vector reinterpret cast operation | |
6603 | name = vreinterpret | |
6604 | double-suffixes | |
6605 | fn = transmute | |
6606 | a = 0., 0., 0., 0., 0., 0., 0., 0. | |
6607 | validate 0., 0., 0., 0., 0., 0., 0., 0. | |
6608 | ||
c295e0f8 | 6609 | aarch64 = nop |
6610 | generate float32x2_t:float64x1_t, float64x1_t:float32x2_t |
6611 | generate float32x4_t:float64x2_t, float64x2_t:float32x4_t | |
6612 | ||
6613 | /// Signed rounding shift left | |
6614 | name = vrshl | |
6615 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
6616 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
6617 | validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 | |
6618 | ||
6619 | aarch64 = srshl | |
6620 | link-aarch64 = srshl._EXT_ | |
6621 | generate i64 | |
6622 | ||
6623 | arm = vrshl | |
6624 | link-arm = vrshifts._EXT_ | |
6625 | generate int*_t, int64x*_t | |
6626 | ||
6627 | /// Unsigned rounding shift left | |
6628 | name = vrshl | |
6629 | out-suffix | |
6630 | a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
6631 | b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 | |
6632 | validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 | |
6633 | ||
6634 | aarch64 = urshl | |
6635 | link-aarch64 = urshl._EXT_ | |
6636 | generate u64:i64:u64 | |
6637 | ||
6638 | arm = vrshl | |
6639 | link-arm = vrshiftu._EXT_ | |
6640 | generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t | |
6641 | generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t | |
6642 | ||
6643 | /// Signed rounding shift right | |
6644 | name = vrshr | |
6645 | n-suffix | |
6646 | constn = N | |
6647 | multi_fn = static_assert-N-1-bits | |
781aab86 | 6648 | multi_fn = vrshl-self-noext, a, {vdup-nself-noext, -N as _} |
6649 | a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
6650 | n = 2 | |
6651 | validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
6652 | ||
6653 | aarch64 = srshr | |
6654 | arm = vrshr | |
6655 | generate int*_t, int64x*_t | |
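// Worked check: vrshr_n::<N> is vrshl by -N, i.e. the rounding constant
// 1 << (N - 1) is added before the arithmetic shift; with N = 2,
// (4 + 2) >> 2 = 1, (8 + 2) >> 2 = 2, and so on down the lanes.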
6656 | ||
6657 | /// Signed rounding shift right | |
6658 | name = vrshr | |
6659 | n-suffix | |
6660 | constn = N | |
6661 | multi_fn = static_assert-N-1-bits | |
6662 | multi_fn = vrshl-self-noext, a, -N as i64 | |
6663 | a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 | |
6664 | n = 2 | |
6665 | validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
6666 | ||
6667 | aarch64 = srshr | |
6668 | generate i64 | |
6669 | ||
6670 | /// Unsigned rounding shift right | |
6671 | name = vrshr | |
6672 | n-suffix | |
6673 | constn = N | |
6674 | multi_fn = static_assert-N-1-bits | |
781aab86 | 6675 | multi_fn = vrshl-self-noext, a, {vdup-nsigned-noext, -N as _} |
6676 | a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 |
6677 | n = 2 | |
6678 | validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
6679 | ||
6680 | aarch64 = urshr | |
6681 | arm = vrshr | |
6682 | generate uint*_t, uint64x*_t | |
6683 | ||
6684 | /// Unsigned rounding shift right | |
6685 | name = vrshr | |
6686 | n-suffix | |
6687 | constn = N | |
6688 | multi_fn = static_assert-N-1-bits | |
6689 | multi_fn = vrshl-self-noext, a, -N as i64 | |
6690 | a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64 | |
6691 | n = 2 | |
6692 | validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 | |
6693 | ||
6694 | aarch64 = urshr | |
6695 | generate u64 | |

/// Rounding shift right narrow
name = vrshrn
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
arm-aarch64-separate

aarch64 = rshrn
link-aarch64 = rshrn._EXT2_
const-aarch64 = N

arm = vrshrn
link-arm = vrshiftn._EXT2_
const-arm = -N as ttn
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t

/// Rounding shift right narrow
name = vrshrn
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = transmute, {vrshrn_n-noqsigned-::<N>, transmute(a)}
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = rshrn
arm = vrshrn
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t

/// Rounding shift right narrow
name = vrshrn_high
noq-n-suffix
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle!, a, {vrshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 0, 1, 8, 9, 8, 9, 10, 11
b = 32, 36, 40, 44, 48, 52, 56, 60
n = 2
validate 0, 1, 8, 9, 8, 9, 10, 11, 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = rshrn2
generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t
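
// Illustration only: the _high form keeps `a` as the low half of the result
// and appends the rounding-narrowed `b` as the high half, exactly what the
// simd_shuffle! above expresses.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let a = vdup_n_s8(1);
//         let r = vrshrn_high_n_s16::<2>(a, vdupq_n_s16(32));
//         assert_eq!(vgetq_lane_s8::<0>(r), 1);   // low half: a unchanged
//         assert_eq!(vgetq_lane_s8::<15>(r), 8);  // high half: 32 >> 2
//     }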

/// Signed rounding shift right and accumulate
name = vrsra
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = simd_add, a, {vrshr-nself-::<N>, b}
a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17

aarch64 = srsra
arm = vrsra
generate int*_t, int64x*_t

/// Unsigned rounding shift right and accumulate
name = vrsra
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = simd_add, a, {vrshr-nself-::<N>, b}
a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17

aarch64 = ursra
arm = vrsra
generate uint*_t, uint64x*_t
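
// Illustration only: vrsra_n is exactly `a + vrshr_n(b)`, as the multi_fn
// above spells out.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let r = vrsra_n_u8::<2>(vdup_n_u8(1), vdup_n_u8(6));
//         assert_eq!(vget_lane_u8::<0>(r), 3);  // 1 + ((6 + 2) >> 2)
//     }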

/// Signed rounding shift right and accumulate
name = vrsra
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = vrshr-nself-::<N>, b:in_t, b
multi_fn = a.wrapping_add(b)
a = 1
b = 4
n = 2
validate 2

aarch64 = srshr
generate i64

/// Unsigned rounding shift right and accumulate
name = vrsra
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = vrshr-nself-::<N>, b:in_t, b
multi_fn = a.wrapping_add(b)
a = 1
b = 4
n = 2
validate 2

aarch64 = urshr
generate u64

/// Rounding subtract returning high narrow
name = vrsubhn
no-q
a = MAX, MIN, 0, 4, 5, 6, 7, 8
b = 1, 2, 3, 4, 5, 6, 7, 8
validate MIN, MIN, 0, 0, 0, 0, 0, 0

aarch64 = rsubhn
link-aarch64 = rsubhn._EXT2_
arm = vrsubhn
link-arm = vrsubhn._EXT2_
generate int16x8_t:int16x8_t:int8x8_t, int32x4_t:int32x4_t:int16x4_t, int64x2_t:int64x2_t:int32x2_t

/// Rounding subtract returning high narrow
name = vrsubhn
no-q
multi_fn = transmute, {vrsubhn-noqsigned-noext, {transmute, a}, {transmute, b}}
a = MAX, MIN, 3, 4, 5, 6, 7, 8
b = 1, 2, 3, 4, 5, 6, 7, 8
validate 0, 0, 0, 0, 0, 0, 0, 0

aarch64 = rsubhn
arm = vrsubhn
generate uint16x8_t:uint16x8_t:uint8x8_t, uint32x4_t:uint32x4_t:uint16x4_t, uint64x2_t:uint64x2_t:uint32x2_t
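
// Illustration only: vrsubhn keeps the *high* half of each rounded difference;
// for 16-bit lanes that is ((a - b) + (1 << 7)) >> 8, narrowed to 8 bits,
// which is why the small differences above validate to zero.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let r = vrsubhn_s16(vdupq_n_s16(0x1234), vdupq_n_s16(0x0034));
//         assert_eq!(vget_lane_s8::<0>(r), 0x12);  // (0x1200 + 0x80) >> 8
//     }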

/// Rounding subtract returning high narrow
name = vrsubhn_high
no-q
multi_fn = vrsubhn-noqself-noext, x:in_t0, b, c
multi_fn = simd_shuffle!, a, x, {asc-0-out_len}
a = 1, 2, 0, 0, 0, 0, 0, 0
b = 1, 2, 3, 4, 5, 6, 7, 8
c = 1, 2, 3, 4, 5, 6, 7, 8
validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

aarch64 = rsubhn2
generate int8x8_t:int16x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int64x2_t:int32x4_t
generate uint8x8_t:uint16x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint64x2_t:uint32x4_t

/// Insert vector element from another vector element
name = vset_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_insert!, b, LANE as u32, a
a = 1
b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
n = 0
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = nop
arm = nop
generate i8:int8x8_t:int8x8_t, i16:int16x4_t:int16x4_t
generate i32:int32x2_t:int32x2_t, i64:int64x1_t:int64x1_t
generate u8:uint8x8_t:uint8x8_t, u16:uint16x4_t:uint16x4_t
generate u32:uint32x2_t:uint32x2_t, u64:uint64x1_t:uint64x1_t
generate p8:poly8x8_t:poly8x8_t, p16:poly16x4_t:poly16x4_t

target = aes
generate p64:poly64x1_t:poly64x1_t
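
// Illustration only: the generated functions take LANE as a const generic
// checked at compile time by the static_assert above.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let r = vset_lane_s16::<3>(42, vdup_n_s16(0));
//         assert_eq!(vget_lane_s16::<3>(r), 42);
//     }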

/// Insert vector element from another vector element
name = vsetq_lane
no-q
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_insert!, b, LANE as u32, a
a = 1
b = 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
n = 0
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

aarch64 = nop
arm = nop
generate i8:int8x16_t:int8x16_t, i16:int16x8_t:int16x8_t
generate i32:int32x4_t:int32x4_t, i64:int64x2_t:int64x2_t
generate u8:uint8x16_t:uint8x16_t, u16:uint16x8_t:uint16x8_t
generate u32:uint32x4_t:uint32x4_t, u64:uint64x2_t:uint64x2_t
generate p8:poly8x16_t:poly8x16_t, p16:poly16x8_t:poly16x8_t

target = aes
generate p64:poly64x2_t:poly64x2_t

/// Insert vector element from another vector element
name = vset_lane
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_insert!, b, LANE as u32, a
a = 1.
b = 0., 2., 3., 4.
n = 0
validate 1., 2., 3., 4.

aarch64 = nop
generate f64:float64x1_t:float64x1_t

arm = nop
generate f32:float32x2_t:float32x2_t

/// Insert vector element from another vector element
name = vsetq_lane
no-q
constn = LANE
multi_fn = static_assert_imm-in_exp_len-LANE
multi_fn = simd_insert!, b, LANE as u32, a
a = 1.
b = 0., 2., 3., 4.
n = 0
validate 1., 2., 3., 4.

aarch64 = nop
generate f64:float64x2_t:float64x2_t

arm = nop
generate f32:float32x4_t:float32x4_t

/// Signed shift left
name = vshl
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64

aarch64 = sshl
link-aarch64 = sshl._EXT_
arm = vshl
link-arm = vshifts._EXT_
generate int*_t, int64x*_t

/// Signed shift left
name = vshl
multi_fn = transmute, {vshl-in_ntt-noext, transmute(a), transmute(b)}
a = 1
b = 2
validate 4

aarch64 = sshl
generate i64

/// Unsigned shift left
name = vshl
out-suffix
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64

aarch64 = ushl
link-aarch64 = ushl._EXT_
arm = vshl
link-arm = vshiftu._EXT_
generate uint8x8_t:int8x8_t:uint8x8_t, uint8x16_t:int8x16_t:uint8x16_t, uint16x4_t:int16x4_t:uint16x4_t, uint16x8_t:int16x8_t:uint16x8_t
generate uint32x2_t:int32x2_t:uint32x2_t, uint32x4_t:int32x4_t:uint32x4_t, uint64x1_t:int64x1_t:uint64x1_t, uint64x2_t:int64x2_t:uint64x2_t

/// Unsigned shift left
out-suffix
name = vshl
multi_fn = transmute, {vshl-out_ntt-noext, transmute(a), transmute(b)}
a = 1
b = 2
validate 4

aarch64 = ushl
generate u64:i64:u64
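
// Illustration only: like the rounding form earlier, vshl takes a signed
// per-lane shift, but a negative element shifts right *truncating* instead
// of rounding.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let r = vshl_s8(vdup_n_s8(7), vdup_n_s8(-2));
//         assert_eq!(vget_lane_s8::<0>(r), 1);  // 7 >> 2, no rounding
//     }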

/// Shift left
name = vshl
n-suffix
constn = N
multi_fn = static_assert_imm-out_bits_exp_len-N
multi_fn = simd_shl, a, {vdup-nself-noext, N as _}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
n = 2
validate 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64

arm = vshl
aarch64 = shl
generate int*_t, uint*_t, int64x*_t, uint64x*_t

/// Shift left long
name = vshll
n-suffix
constn = N
multi_fn = static_assert-N-0-bits
multi_fn = simd_shl, {simd_cast, a}, {vdup-nout-noext, N as _}
a = 1, 2, 3, 4, 5, 6, 7, 8
n = 2
validate 4, 8, 12, 16, 20, 24, 28, 32

arm = vshll.s
aarch64 = sshll
generate int8x8_t:int16x8_t, int16x4_t:int32x4_t, int32x2_t:int64x2_t
aarch64 = ushll
generate uint8x8_t:uint16x8_t, uint16x4_t:uint32x4_t, uint32x2_t:uint64x2_t

/// Shift left long
name = vshll_high_n
no-q
constn = N
multi_fn = static_assert-N-0-bits
multi_fn = simd_shuffle!, b:half, a, a, {asc-halflen-halflen}
multi_fn = vshll_n-noqself-::<N>, b
a = 0, 0, 1, 2, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8
n = 2
validate 4, 8, 12, 16, 20, 24, 28, 32

aarch64 = sshll2
generate int8x16_t:int16x8_t, int16x8_t:int32x4_t, int32x4_t:int64x2_t
aarch64 = ushll2
generate uint8x16_t:uint16x8_t, uint16x8_t:uint32x4_t, uint32x4_t:uint64x2_t
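
// Illustration only: vshll_n widens each element *before* shifting, so a
// shift up to the full source element width (N == 8 for s8) cannot overflow
// the destination lane.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let r = vshll_n_s8::<4>(vdup_n_s8(100));
//         assert_eq!(vgetq_lane_s16::<0>(r), 1600);  // 100 << 4 in an i16 lane
//     }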

/// Shift right
name = vshr
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = fix_right_shift_imm-N-bits
multi_fn = simd_shr, a, {vdup-nself-noext, n as _}
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

arm = vshr.s
aarch64 = sshr
generate int*_t, int64x*_t
aarch64 = ushr
generate uint*_t, uint64x*_t

/// Shift right narrow
name = vshrn_n
no-q
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_cast, {simd_shr, a, {vdup-nself-noext, N as _}}
a = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16

arm = vshrn.
aarch64 = shrn
generate int16x8_t:int8x8_t, int32x4_t:int16x4_t, int64x2_t:int32x2_t
generate uint16x8_t:uint8x8_t, uint32x4_t:uint16x4_t, uint64x2_t:uint32x2_t

/// Shift right narrow
name = vshrn_high_n
no-q
constn = N
multi_fn = static_assert-N-1-halfbits
multi_fn = simd_shuffle!, a, {vshrn_n-noqself-::<N>, b}, {asc-0-out_len}
a = 1, 2, 5, 6, 5, 6, 7, 8
b = 20, 24, 28, 32, 52, 56, 60, 64
n = 2
validate 1, 2, 5, 6, 5, 6, 7, 8, 5, 6, 7, 8, 13, 14, 15, 16

aarch64 = shrn2
generate int8x8_t:int16x8_t:int8x16_t, int16x4_t:int32x4_t:int16x8_t, int32x2_t:int64x2_t:int32x4_t
generate uint8x8_t:uint16x8_t:uint8x16_t, uint16x4_t:uint32x4_t:uint16x8_t, uint32x2_t:uint64x2_t:uint32x4_t

/// Signed shift right and accumulate
name = vsra
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = simd_add, a, {vshr-nself-::<N>, b}
a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17

aarch64 = ssra
arm = vsra
generate int*_t, int64x*_t

/// Unsigned shift right and accumulate
name = vsra
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = simd_add, a, {vshr-nself-::<N>, b}
a = 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
b = 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, 64
n = 2
validate 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17

aarch64 = usra
arm = vsra
generate uint*_t, uint64x*_t

/// SM3PARTW1
name = vsm3partw1
a = 1, 2, 3, 4
b = 1, 2, 3, 4
c = 1, 2, 3, 4
validate 2147549312, 3221323968, 131329, 2684362752
target = sm4

aarch64 = sm3partw1
link-aarch64 = llvm.aarch64.crypto.sm3partw1
generate uint32x4_t

/// SM3PARTW2
name = vsm3partw2
a = 1, 2, 3, 4
b = 1, 2, 3, 4
c = 1, 2, 3, 4
validate 128, 256, 384, 1077977696
target = sm4

aarch64 = sm3partw2
link-aarch64 = llvm.aarch64.crypto.sm3partw2
generate uint32x4_t

/// SM3SS1
name = vsm3ss1
a = 1, 2, 3, 4
b = 1, 2, 3, 4
c = 1, 2, 3, 4
validate 0, 0, 0, 2098176
target = sm4

aarch64 = sm3ss1
link-aarch64 = llvm.aarch64.crypto.sm3ss1
generate uint32x4_t

/// SM4 key
name = vsm4ekey
a = 1, 2, 3, 4
b = 1, 2, 3, 4
validate 1784948604, 136020997, 2940231695, 3789947679
target = sm4

aarch64 = sm4ekey
link-aarch64 = llvm.aarch64.crypto.sm4ekey
generate uint32x4_t

/// SM4 encode
name = vsm4e
a = 1, 2, 3, 4
b = 1, 2, 3, 4
validate 1093874472, 3616769504, 3878330411, 2765298765
target = sm4

aarch64 = sm4e
link-aarch64 = llvm.aarch64.crypto.sm4e
generate uint32x4_t

/// Rotate and exclusive OR
name = vrax1
a = 1, 2
b = 3, 4
validate 7, 10
target = sha3

aarch64 = rax1
link-aarch64 = llvm.aarch64.crypto.rax1
generate uint64x2_t

/// SHA512 hash update part 1
name = vsha512h
a = 1, 2
b = 3, 4
c = 5, 6
validate 11189044327219203, 7177611956453380
target = sha3

aarch64 = sha512h
link-aarch64 = llvm.aarch64.crypto.sha512h
generate uint64x2_t

/// SHA512 hash update part 2
name = vsha512h2
a = 1, 2
b = 3, 4
c = 5, 6
validate 5770237651009406214, 349133864969
target = sha3

aarch64 = sha512h2
link-aarch64 = llvm.aarch64.crypto.sha512h2
generate uint64x2_t

/// SHA512 schedule update 0
name = vsha512su0
a = 1, 2
b = 3, 4
validate 144115188075855874, 9439544818968559619
target = sha3

aarch64 = sha512su0
link-aarch64 = llvm.aarch64.crypto.sha512su0
generate uint64x2_t

/// SHA512 schedule update 1
name = vsha512su1
a = 1, 2
b = 3, 4
c = 5, 6
validate 105553116266526, 140737488355368
target = sha3

aarch64 = sha512su1
link-aarch64 = llvm.aarch64.crypto.sha512su1
generate uint64x2_t
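
// Illustration only: the sha3/sm4 intrinsics above are feature-gated, so a
// caller typically detects support at runtime before dispatching into a
// #[target_feature]-annotated function, e.g.:
//
//     if std::arch::is_aarch64_feature_detected!("sha3") {
//         // safe to dispatch to code using vsha512h, vrax1, ...
//     }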

/// Floating-point round to 32-bit integer, using current rounding mode
name = vrnd32x
target = frintts

// For validation, the rounding mode should be the default: round-to-nearest (ties-to-even).
a = -1.5, 2.9, 1.5, -2.5
validate -2.0, 3.0, 2.0, -2.0

aarch64 = frint32x
link-aarch64 = frint32x._EXT_
generate float32x2_t, float32x4_t

// The float64x1_t form uses a different LLVM link and isn't supported by Clang
// (and so has no intrinsic-test), so perform extra validation to make sure
// that it matches the float64x2_t form.

a = 1.5, -2.5
validate 2.0, -2.0
// - The biggest f64 that rounds to i32::MAX.
// - The smallest positive f64 that rounds out of range.
a = 2147483647.499999762, 2147483647.5
validate 2147483647.0, -2147483648.0
// - The smallest f64 that rounds to i32::MIN + 1.
// - The largest negative f64 that rounds out of range.
a = -2147483647.499999762, -2147483648.500000477
validate -2147483647.0, -2147483648.0
generate float64x2_t

// Odd-numbered tests for float64x1_t coverage.
a = 2.9
validate 3.0
a = -2.5
validate -2.0
a = 2147483647.5
validate -2147483648.0
a = -2147483648.500000477
validate -2147483648.0

multi_fn = transmute, {self-out-_, {simd_extract!, a, 0}}
link-aarch64 = llvm.aarch64.frint32x.f64:f64:::f64
generate float64x1_t
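
// Illustration only: the out-of-range rows above reflect the instruction's
// defined behaviour - a value that cannot round into i32 yields the integer
// result i32::MIN, reinterpreted as a float. Assuming a CPU with frintts:
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let r = vrnd32x_f32(vdup_n_f32(2147483648.0));  // one past i32::MAX
//         assert_eq!(vget_lane_f32::<0>(r), -2147483648.0);
//     }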

/// Floating-point round to 32-bit integer toward zero
name = vrnd32z
target = frintts

a = -1.5, 2.9, 1.5, -2.5
validate -1.0, 2.0, 1.0, -2.0

aarch64 = frint32z
link-aarch64 = frint32z._EXT_
generate float32x2_t, float32x4_t

// The float64x1_t form uses a different LLVM link and isn't supported by Clang
// (and so has no intrinsic-test), so perform extra validation to make sure
// that it matches the float64x2_t form.

a = 1.5, -2.5
validate 1.0, -2.0
// - The biggest f64 that rounds to i32::MAX.
// - The smallest positive f64 that rounds out of range.
a = 2147483647.999999762, 2147483648.0
validate 2147483647.0, -2147483648.0
// - The smallest f64 that rounds to i32::MIN + 1.
// - The largest negative f64 that rounds out of range.
a = -2147483647.999999762, -2147483649.0
validate -2147483647.0, -2147483648.0
generate float64x2_t

// Odd-numbered tests for float64x1_t coverage.
a = 2.9
validate 2.0
a = -2.5
validate -2.0
a = 2147483648.0
validate -2147483648.0
a = -2147483649.0
validate -2147483648.0

multi_fn = transmute, {self-out-_, {simd_extract!, a, 0}}
link-aarch64 = llvm.aarch64.frint32z.f64:f64:::f64
generate float64x1_t

/// Floating-point round to 64-bit integer, using current rounding mode
name = vrnd64x
target = frintts

// For validation, the rounding mode should be the default: round-to-nearest (ties-to-even).
a = -1.5, 2.9, 1.5, -2.5
validate -2.0, 3.0, 2.0, -2.0

aarch64 = frint64x
link-aarch64 = frint64x._EXT_
generate float32x2_t, float32x4_t

// The float64x1_t form uses a different LLVM link and isn't supported by Clang
// (and so has no intrinsic-test), so perform extra validation to make sure
// that it matches the float64x2_t form.

a = 1.5, -2.5
validate 2.0, -2.0
// - The biggest f64 representable as an i64 (0x7ffffffffffffc00).
// - The smallest positive f64 that is out of range (2^63).
a = 9223372036854774784.0, 9223372036854775808.0
validate 9223372036854774784.0, -9223372036854775808.0
// - The smallest f64 representable as an i64 (i64::MIN).
// - The biggest negative f64 that is out of range.
a = -9223372036854775808.0, -9223372036854777856.0
validate -9223372036854775808.0, -9223372036854775808.0
generate float64x2_t

// Odd-numbered tests for float64x1_t coverage.
a = 2.9
validate 3.0
a = -2.5
validate -2.0
a = 9223372036854775808.0
validate -9223372036854775808.0
a = -9223372036854777856.0
validate -9223372036854775808.0

multi_fn = transmute, {self-out-_, {simd_extract!, a, 0}}
link-aarch64 = llvm.aarch64.frint64x.f64:f64:::f64
generate float64x1_t

/// Floating-point round to 64-bit integer toward zero
name = vrnd64z
target = frintts

a = -1.5, 2.9, 1.5, -2.5
validate -1.0, 2.0, 1.0, -2.0

aarch64 = frint64z
link-aarch64 = frint64z._EXT_
generate float32x2_t, float32x4_t

// The float64x1_t form uses a different LLVM link and isn't supported by Clang
// (and so has no intrinsic-test), so perform extra validation to make sure
// that it matches the float64x2_t form.

a = 1.5, -2.5
validate 1.0, -2.0
// - The biggest f64 representable as an i64 (0x7ffffffffffffc00).
// - The smallest positive f64 that is out of range (2^63).
a = 9223372036854774784.0, 9223372036854775808.0
validate 9223372036854774784.0, -9223372036854775808.0
// - The smallest f64 representable as an i64 (i64::MIN).
// - The biggest negative f64 that is out of range.
a = -9223372036854775808.0, -9223372036854777856.0
validate -9223372036854775808.0, -9223372036854775808.0
generate float64x2_t

// Odd-numbered tests for float64x1_t coverage.
a = 2.9
validate 2.0
a = -2.5
validate -2.0
a = 9223372036854775808.0
validate -9223372036854775808.0
a = -9223372036854777856.0
validate -9223372036854775808.0

multi_fn = transmute, {self-out-_, {simd_extract!, a, 0}}
link-aarch64 = llvm.aarch64.frint64z.f64:f64:::f64
generate float64x1_t

/// Transpose elements
name = vtrn
multi_fn = simd_shuffle!, a1:in_t, a, b, {transpose-1-in_len}
multi_fn = simd_shuffle!, b1:in_t, a, b, {transpose-2-in_len}
multi_fn = transmute, (a1, b1)
a = 0, 2, 2, 6, 2, 10, 6, 14, 2, 18, 6, 22, 10, 26, 14, 30
b = 1, 3, 3, 7, 3, 1, 7, 15, 3, 19, 7, 23, 1, 27, 15, 31
validate 0, 1, 2, 3, 2, 3, 6, 7, 2, 3, 6, 7, 10, 1, 14, 15, 2, 3, 6, 7, 10, 1, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31

aarch64 = trn
arm = vtrn
generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t, int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t
generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t, uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t
generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t, poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t
aarch64 = zip
generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t

/// Transpose elements
name = vtrn
multi_fn = simd_shuffle!, a1:in_t, a, b, {transpose-1-in_len}
multi_fn = simd_shuffle!, b1:in_t, a, b, {transpose-2-in_len}
multi_fn = transmute, (a1, b1)
a = 0., 2., 2., 6.
b = 1., 3., 3., 7.
validate 0., 1., 2., 3., 2., 3., 6., 7.

aarch64 = zip
arm = vtrn
generate float32x2_t:float32x2_t:float32x2x2_t
aarch64 = trn
generate float32x4_t:float32x4_t:float32x4x2_t
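
// Illustration only: for 8 lanes the two shuffles above pick lane indices
// [0, 8, 2, 10, 4, 12, 6, 14] and [1, 9, 3, 11, 5, 13, 7, 15], interleaving
// even and odd lane pairs of `a` and `b`.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let r = vtrn_s8(vdup_n_s8(0), vdup_n_s8(1));
//         assert_eq!(vget_lane_s8::<0>(r.0), 0);  // r.0 = 0,1,0,1,...
//         assert_eq!(vget_lane_s8::<1>(r.0), 1);
//     }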

/// Transpose vectors
name = vtrn1
multi_fn = simd_shuffle!, a, b, {transpose-1-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29

aarch64 = trn1
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t

aarch64 = zip1
generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

/// Transpose vectors
name = vtrn1
multi_fn = simd_shuffle!, a, b, {transpose-1-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 0., 1., 4., 5., 8., 9., 12., 13.

aarch64 = trn1
generate float32x4_t

aarch64 = zip1
generate float32x2_t, float64x2_t

/// Transpose vectors
name = vtrn2
multi_fn = simd_shuffle!, a, b, {transpose-2-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31

aarch64 = trn2
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t

aarch64 = zip2
generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

/// Transpose vectors
name = vtrn2
multi_fn = simd_shuffle!, a, b, {transpose-2-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 2., 3., 6., 7., 10., 11., 14., 15.

aarch64 = trn2
generate float32x4_t

aarch64 = zip2
generate float32x2_t, float64x2_t

/// Zip vectors
name = vzip
multi_fn = simd_shuffle!, a0:in_t, a, b, {zip-1-in_len}
multi_fn = simd_shuffle!, b0:in_t, a, b, {zip-2-in_len}
multi_fn = transmute, (a0, b0)
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31

aarch64 = zip
arm = vzip
generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t
generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t
generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t
arm = vtrn
generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t
aarch64 = zip
arm = vorr
generate int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t
generate uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t
generate poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t

/// Zip vectors
name = vzip
multi_fn = simd_shuffle!, a0:in_t, a, b, {zip-1-in_len}
multi_fn = simd_shuffle!, b0:in_t, a, b, {zip-2-in_len}
multi_fn = transmute, (a0, b0)
a = 1., 2., 3., 4.
b = 5., 6., 7., 8.
validate 1., 5., 2., 6., 3., 7., 4., 8.

aarch64 = zip
arm = vtrn
generate float32x2_t:float32x2_t:float32x2x2_t
aarch64 = zip
arm = vorr
generate float32x4_t:float32x4_t:float32x4x2_t

/// Zip vectors
name = vzip1
multi_fn = simd_shuffle!, a, b, {zip-1-in_len}
a = 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31
validate 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15

aarch64 = zip1
generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t

/// Zip vectors
name = vzip1
multi_fn = simd_shuffle!, a, b, {zip-1-in_len}
a = 0., 2., 4., 6., 8., 10., 12., 14.
b = 1., 3., 5., 7., 9., 11., 13., 15.
validate 0., 1., 2., 3., 4., 5., 6., 7.

aarch64 = zip1
generate float32x2_t, float32x4_t, float64x2_t

/// Zip vectors
name = vzip2
multi_fn = simd_shuffle!, a, b, {zip-2-in_len}
a = 0, 16, 16, 18, 16, 18, 20, 22, 16, 18, 20, 22, 24, 26, 28, 30
b = 1, 17, 17, 19, 17, 19, 21, 23, 17, 19, 21, 23, 25, 27, 29, 31
validate 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31

aarch64 = zip2
generate int*_t, int64x2_t, uint*_t, uint64x2_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t, poly64x2_t

/// Zip vectors
name = vzip2
multi_fn = simd_shuffle!, a, b, {zip-2-in_len}
a = 0., 8., 8., 10., 8., 10., 12., 14.
b = 1., 9., 9., 11., 9., 11., 13., 15.
validate 8., 9., 10., 11., 12., 13., 14., 15.

aarch64 = zip2
generate float32x2_t, float32x4_t, float64x2_t

/// Unzip vectors
name = vuzp
multi_fn = simd_shuffle!, a0:in_t, a, b, {unzip-1-in_len}
multi_fn = simd_shuffle!, b0:in_t, a, b, {unzip-2-in_len}
multi_fn = transmute, (a0, b0)
a = 1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 15, 8, 16
b = 2, 3, 3, 8, 3, 15, 8, 16, 3, 29, 8, 30, 15, 31, 16, 32
validate 1, 2, 2, 3, 2, 3, 3, 8, 2, 3, 3, 8, 3, 8, 15, 16, 2, 3, 3, 8, 3, 8, 15, 16, 3, 8, 15, 16, 29, 30, 31, 32

aarch64 = uzp
arm = vuzp
generate int8x8_t:int8x8_t:int8x8x2_t, int16x4_t:int16x4_t:int16x4x2_t, int8x16_t:int8x16_t:int8x16x2_t, int16x8_t:int16x8_t:int16x8x2_t, int32x4_t:int32x4_t:int32x4x2_t
generate uint8x8_t:uint8x8_t:uint8x8x2_t, uint16x4_t:uint16x4_t:uint16x4x2_t, uint8x16_t:uint8x16_t:uint8x16x2_t, uint16x8_t:uint16x8_t:uint16x8x2_t, uint32x4_t:uint32x4_t:uint32x4x2_t
generate poly8x8_t:poly8x8_t:poly8x8x2_t, poly16x4_t:poly16x4_t:poly16x4x2_t, poly8x16_t:poly8x16_t:poly8x16x2_t, poly16x8_t:poly16x8_t:poly16x8x2_t
aarch64 = zip
arm = vtrn
generate int32x2_t:int32x2_t:int32x2x2_t, uint32x2_t:uint32x2_t:uint32x2x2_t

/// Unzip vectors
name = vuzp
multi_fn = simd_shuffle!, a0:in_t, a, b, {unzip-1-in_len}
multi_fn = simd_shuffle!, b0:in_t, a, b, {unzip-2-in_len}
multi_fn = transmute, (a0, b0)
a = 1., 2., 2., 4.
b = 2., 6., 6., 8.
validate 1., 2., 2., 6., 2., 4., 6., 8.

aarch64 = zip
arm = vtrn
generate float32x2_t:float32x2_t:float32x2x2_t
aarch64 = uzp
arm = vuzp
generate float32x4_t:float32x4_t:float32x4x2_t

/// Unzip vectors
name = vuzp1
multi_fn = simd_shuffle!, a, b, {unzip-1-in_len}
a = 1, 0, 2, 0, 2, 0, 3, 0, 2, 0, 3, 0, 7, 0, 8, 0
b = 2, 0, 3, 0, 7, 0, 8, 0, 13, 0, 14, 0, 15, 0, 16, 0
validate 1, 2, 2, 3, 2, 3, 7, 8, 2, 3, 7, 8, 13, 14, 15, 16

aarch64 = uzp1
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t

aarch64 = zip1
generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

/// Unzip vectors
name = vuzp1
multi_fn = simd_shuffle!, a, b, {unzip-1-in_len}
a = 0., 8., 1., 9., 4., 12., 5., 13.
b = 1., 10., 3., 11., 6., 14., 7., 15.
validate 0., 1., 1., 3., 4., 5., 6., 7.

aarch64 = uzp1
generate float32x4_t

aarch64 = zip1
generate float32x2_t, float64x2_t

/// Unzip vectors
name = vuzp2
multi_fn = simd_shuffle!, a, b, {unzip-2-in_len}
a = 0, 17, 0, 18, 0, 18, 0, 19, 0, 18, 0, 19, 0, 23, 0, 24
b = 0, 18, 0, 19, 0, 23, 0, 24, 0, 29, 0, 30, 0, 31, 0, 32
validate 17, 18, 18, 19, 18, 19, 23, 24, 18, 19, 23, 24, 29, 30, 31, 32

aarch64 = uzp2
generate int8x8_t, int8x16_t, int16x4_t, int16x8_t, int32x4_t, uint8x8_t, uint8x16_t, uint16x4_t, uint16x8_t, uint32x4_t, poly8x8_t, poly8x16_t, poly16x4_t, poly16x8_t

aarch64 = zip2
generate int32x2_t, int64x2_t, uint32x2_t, uint64x2_t, poly64x2_t

/// Unzip vectors
name = vuzp2
multi_fn = simd_shuffle!, a, b, {unzip-2-in_len}
a = 0., 8., 1., 9., 4., 12., 5., 13.
b = 2., 9., 3., 11., 6., 14., 7., 15.
validate 8., 9., 9., 11., 12., 13., 14., 15.

aarch64 = uzp2
generate float32x4_t

aarch64 = zip2
generate float32x2_t, float64x2_t
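
// Illustration only: vzip and vuzp are mutual inverses, which makes a cheap
// sanity check - unzipping a zipped pair restores the inputs.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let z = vzip_u8(vdup_n_u8(1), vdup_n_u8(2));  // 1,2,1,2,... twice
//         let u = vuzp_u8(z.0, z.1);
//         assert_eq!(vget_lane_u8::<0>(u.0), 1);  // all-1s again
//         assert_eq!(vget_lane_u8::<0>(u.1), 2);  // all-2s again
//     }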

////////////////////
// Unsigned Absolute difference and Accumulate Long
////////////////////

/// Unsigned Absolute difference and Accumulate Long
name = vabal
multi_fn = vabd-unsigned-noext, b, c, d:in_t
multi_fn = simd_add, a, {simd_cast, d}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20

arm = vabal.s
aarch64 = uabal
generate uint16x8_t:uint8x8_t:uint8x8_t:uint16x8_t, uint32x4_t:uint16x4_t:uint16x4_t:uint32x4_t, uint64x2_t:uint32x2_t:uint32x2_t:uint64x2_t

/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle!, d:uint8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle!, e:uint8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = vabd_u8, d, e, f:uint8x8_t
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 20, 20, 20, 20, 20, 20, 20, 20

aarch64 = uabal
generate uint16x8_t:uint8x16_t:uint8x16_t:uint16x8_t

/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle!, d:uint16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle!, e:uint16x4_t, c, c, [4, 5, 6, 7]
multi_fn = vabd_u16, d, e, f:uint16x4_t
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12
b = 1, 2, 3, 4, 9, 10, 11, 12
c = 10, 10, 10, 10, 20, 0, 2, 4
validate 20, 20, 20, 20

aarch64 = uabal
generate uint32x4_t:uint16x8_t:uint16x8_t:uint32x4_t

/// Unsigned Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle!, d:uint32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle!, e:uint32x2_t, c, c, [2, 3]
multi_fn = vabd_u32, d, e, f:uint32x2_t
multi_fn = simd_add, a, {simd_cast, f}
a = 15, 16
b = 1, 2, 15, 16
c = 10, 10, 10, 12
validate 20, 20

aarch64 = uabal
generate uint64x2_t:uint32x4_t:uint32x4_t:uint64x2_t
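
// Illustration only: vabal adds the widened absolute difference |b - c| into
// the accumulator, which is what the validate rows above exercise.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let r = vabal_u8(vdupq_n_u16(9), vdup_n_u8(1), vdup_n_u8(12));
//         assert_eq!(vgetq_lane_u16::<0>(r), 20);  // 9 + |1 - 12|
//     }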

////////////////////
// Signed Absolute difference and Accumulate Long
////////////////////

/// Signed Absolute difference and Accumulate Long
name = vabal
multi_fn = vabd-signed-noext, b, c, d:int8x8_t
multi_fn = simd_cast, e:uint8x8_t, d
multi_fn = simd_add, a, {simd_cast, e}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20

arm = vabal.s
aarch64 = sabal
generate int16x8_t:int8x8_t:int8x8_t:int16x8_t

/// Signed Absolute difference and Accumulate Long
name = vabal
multi_fn = vabd-signed-noext, b, c, d:int16x4_t
multi_fn = simd_cast, e:uint16x4_t, d
multi_fn = simd_add, a, {simd_cast, e}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20

arm = vabal.s
aarch64 = sabal
generate int32x4_t:int16x4_t:int16x4_t:int32x4_t

/// Signed Absolute difference and Accumulate Long
name = vabal
multi_fn = vabd-signed-noext, b, c, d:int32x2_t
multi_fn = simd_cast, e:uint32x2_t, d
multi_fn = simd_add, a, {simd_cast, e}
a = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 10, 10, 10, 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20

arm = vabal.s
aarch64 = sabal
generate int64x2_t:int32x2_t:int32x2_t:int64x2_t

/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle!, d:int8x8_t, b, b, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = simd_shuffle!, e:int8x8_t, c, c, [8, 9, 10, 11, 12, 13, 14, 15]
multi_fn = vabd_s8, d, e, f:int8x8_t
multi_fn = simd_cast, f:uint8x8_t, f
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12, 13, 14, 15, 16
b = 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
c = 10, 10, 10, 10, 10, 10, 10, 10, 20, 0, 2, 4, 6, 8, 10, 12
validate 20, 20, 20, 20, 20, 20, 20, 20

aarch64 = sabal
generate int16x8_t:int8x16_t:int8x16_t:int16x8_t

/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle!, d:int16x4_t, b, b, [4, 5, 6, 7]
multi_fn = simd_shuffle!, e:int16x4_t, c, c, [4, 5, 6, 7]
multi_fn = vabd_s16, d, e, f:int16x4_t
multi_fn = simd_cast, f:uint16x4_t, f
multi_fn = simd_add, a, {simd_cast, f}
a = 9, 10, 11, 12
b = 1, 2, 3, 4, 9, 10, 11, 12
c = 10, 10, 10, 10, 20, 0, 2, 4
validate 20, 20, 20, 20

aarch64 = sabal
generate int32x4_t:int16x8_t:int16x8_t:int32x4_t

/// Signed Absolute difference and Accumulate Long
name = vabal_high
no-q
multi_fn = simd_shuffle!, d:int32x2_t, b, b, [2, 3]
multi_fn = simd_shuffle!, e:int32x2_t, c, c, [2, 3]
multi_fn = vabd_s32, d, e, f:int32x2_t
multi_fn = simd_cast, f:uint32x2_t, f
multi_fn = simd_add, a, {simd_cast, f}
a = 15, 16
b = 1, 2, 15, 16
c = 10, 10, 10, 12
validate 20, 20

aarch64 = sabal
generate int64x2_t:int32x4_t:int32x4_t:int64x2_t

////////////////////
// Signed saturating absolute value
////////////////////

/// Signed saturating absolute value
name = vqabs
a = MIN, MAX, -6, -5, -4, -3, -2, -1, 0, -127, 127, 1, 2, 3, 4, 5
validate MAX, MAX, 6, 5, 4, 3, 2, 1, 0, 127, 127, 1, 2, 3, 4, 5

arm = vqabs.s
aarch64 = sqabs
link-arm = vqabs._EXT_
link-aarch64 = sqabs._EXT_
generate int*_t

/// Signed saturating absolute value
name = vqabs
a = MIN, -7
validate MAX, 7

aarch64 = sqabs
link-aarch64 = sqabs._EXT_
generate int64x*_t

/// Signed saturating absolute value
name = vqabs
multi_fn = simd_extract!, {vqabs-in_ntt-noext, {vdup_n-in_ntt-noext, a}}, 0
a = -7
validate 7

aarch64 = sqabs
generate i8:i8, i16:i16

/// Signed saturating absolute value
name = vqabs
a = -7
validate 7

aarch64 = sqabs
link-aarch64 = sqabs._EXT_
generate i32:i32, i64:i64
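
// Illustration only: the one input where vqabs differs from a plain absolute
// value is MIN, which saturates to MAX instead of wrapping back to MIN.
//
//     use core::arch::aarch64::*;
//     unsafe {
//         let r = vqabs_s8(vdup_n_s8(i8::MIN));
//         assert_eq!(vget_lane_s8::<0>(r), i8::MAX);  // 127, not -128
//     }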

/// Shift left and insert
name = vsli
n-suffix
constn = N
multi_fn = static_assert-N-0-63
multi_fn = transmute, {vsli_n-in_ntt-::<N>, transmute(a), transmute(b)}
a = 333
b = 2042
n = 2
validate 8169

aarch64 = sli
generate i64, u64

/// Shift right and insert
name = vsri
n-suffix
constn = N
multi_fn = static_assert-N-1-bits
multi_fn = transmute, {vsri_n-in_ntt-::<N>, transmute(a), transmute(b)}
a = 333
b = 2042
n = 2
validate 510

aarch64 = sri
generate i64, u64