1 /*
2 * ARM NEON vector operations.
3 *
4 * Copyright (c) 2007, 2008 CodeSourcery.
5 * Written by Paul Brook
6 *
7 * This code is licensed under the GNU GPL v2.
8 */
9 #include "qemu/osdep.h"
10
11 #include "cpu.h"
12 #include "exec/helper-proto.h"
13 #include "fpu/softfloat.h"
14 #include "vec_internal.h"
15
/* Most-significant-bit masks for 32-bit and 64-bit lanes. */
#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

/* Record that a saturating op clamped its result (cumulative QC flag). */
#define SET_QC() env->vfp.qc[0] = 1
20
/*
 * Structs holding 1, 2 or 4 lanes of a given element type packed into
 * a uint32_t.  The big-endian variants reverse the field order so that
 * v1 is always the least significant lane on either host endianness.
 */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Instantiate the lane structs used by the helpers below. */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
67
/* Copy from a uint32_t to a vector structure type (type-pun via union). */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply the current NEON_FN to each of the 1/2/4 lane pairs. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Common body: unpack both operands, apply NEON_FN per lane, repack. */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define a two-operand helper without/with access to CPU state (env). */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
119
/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
/* The low half of the result combines adjacent pairs from src1, the
   high half adjacent pairs from src2. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
145
/* Unary operators. */
/*
 * This reuses NEON_DO##n with a NEON_FN that ignores its third
 * argument, so the textual vsrc2 references expand away and no
 * vsrc2 variable is needed.
 */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
157
158
/*
 * Unsigned saturating add: if the widened sum does not fit back in the
 * lane type, set QC and saturate to the type's maximum (~0 truncates).
 */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

/* 32-bit unsigned saturating add: wrap-around (res < a) means overflow. */
uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        SET_QC();
        res = ~0;
    }
    return res;
}

/* 64-bit unsigned saturating add. */
uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}
196
/*
 * Signed saturating add: compute in 32 bits; if the result does not
 * round-trip through the lane type, set QC and clamp towards the sign
 * of src2 (positive addend -> MAX, otherwise -> MIN).
 */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
} while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

/* 32-bit signed saturating add: overflow iff operands share a sign
 * that differs from the result's sign; clamp to INT32_MAX/MIN. */
uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        /* 0x7fffffff for positive a, 0x80000000 for negative a. */
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

/* 64-bit signed saturating add, same sign-bit overflow test. */
uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}
238
/* Unsigned saturating accumulate of signed value
 *
 * Op1/Rn is treated as signed
 * Op2/Rd is treated as unsigned
 *
 * Explicit casting is used to ensure the correct sign extension of
 * inputs. The result is treated as a unsigned value and saturated as such.
 *
 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 */

/* One lane: signed lane of a plus unsigned lane of b, clamped to
 * [0, UINT<bits>_MAX]; the clamped lane is deposited back into r. */
#define USATACC(bits, shift) \
    do { \
        va = sextract32(a, shift, bits); \
        vb = extract32(b, shift, bits); \
        vr = va + vb; \
        if (vr > UINT##bits##_MAX) { \
            SET_QC(); \
            vr = UINT##bits##_MAX; \
        } else if (vr < 0) { \
            SET_QC(); \
            vr = 0; \
        } \
        r = deposit32(r, shift, bits, vr); \
    } while (0)

uint32_t HELPER(neon_uqadd_s8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* int16_t is wide enough for any int8 + uint8 sum. */
    int16_t va, vb, vr;
    uint32_t r = 0;

    USATACC(8, 0);
    USATACC(8, 8);
    USATACC(8, 16);
    USATACC(8, 24);
    return r;
}

uint32_t HELPER(neon_uqadd_s16)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int32_t va, vb, vr;
    /* NOTE(review): r is uint64_t here (uint32_t in the s8 variant);
     * only the low 32 bits are used/returned, so this looks merely
     * inconsistent rather than wrong. */
    uint64_t r = 0;

    USATACC(16, 0);
    USATACC(16, 16);
    return r;
}

#undef USATACC

uint32_t HELPER(neon_uqadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    /* Widen to 64 bits: a sign-extended, b zero-extended. */
    int64_t va = (int32_t)a;
    int64_t vb = (uint32_t)b;
    int64_t vr = va + vb;
    if (vr > UINT32_MAX) {
        SET_QC();
        vr = UINT32_MAX;
    } else if (vr < 0) {
        SET_QC();
        vr = 0;
    }
    return vr;
}

uint64_t HELPER(neon_uqadd_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect
     * +ve/-ve saturation
     */
    if (~a & b & ~res & SIGNBIT64) {
        SET_QC();
        res = UINT64_MAX;
    } else if (a & ~b & res & SIGNBIT64) {
        SET_QC();
        res = 0;
    }
    return res;
}
321
/* Signed saturating accumulate of unsigned value
 *
 * Op1/Rn is treated as unsigned
 * Op2/Rd is treated as signed
 *
 * The result is treated as a signed value and saturated as such
 *
 * We use a macro for the 8/16 bit cases which expects signed integers of va,
 * vb, and vr for interim calculation and an unsigned 32 bit result value r.
 */

/* One lane: unsigned lane of a plus signed lane of b, clamped to
 * [INT<bits>_MIN, INT<bits>_MAX]; deposited back into r. */
#define SSATACC(bits, shift) \
    do { \
        va = extract32(a, shift, bits); \
        vb = sextract32(b, shift, bits); \
        vr = va + vb; \
        if (vr > INT##bits##_MAX) { \
            SET_QC(); \
            vr = INT##bits##_MAX; \
        } else if (vr < INT##bits##_MIN) { \
            SET_QC(); \
            vr = INT##bits##_MIN; \
        } \
        r = deposit32(r, shift, bits, vr); \
    } while (0)

uint32_t HELPER(neon_sqadd_u8)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int16_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(8, 0);
    SSATACC(8, 8);
    SSATACC(8, 16);
    SSATACC(8, 24);
    return r;
}

uint32_t HELPER(neon_sqadd_u16)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int32_t va, vb, vr;
    uint32_t r = 0;

    SSATACC(16, 0);
    SSATACC(16, 16);

    return r;
}

#undef SSATACC

uint32_t HELPER(neon_sqadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    int64_t res;
    /* Widen to 64 bits: a zero-extended, b sign-extended. */
    int64_t op1 = (uint32_t)a;
    int64_t op2 = (int32_t)b;
    res = op1 + op2;
    if (res > INT32_MAX) {
        SET_QC();
        res = INT32_MAX;
    } else if (res < INT32_MIN) {
        SET_QC();
        res = INT32_MIN;
    }
    return res;
}

uint64_t HELPER(neon_sqadd_u64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t res;
    res = a + b;
    /* We only need to look at the pattern of SIGN bits to detect an overflow */
    /* Only positive saturation is possible: a >= 0 and b >= INT64_MIN,
     * so the signed sum can never underflow. */
    if (((a & res)
        | (~b & res)
        | (a & ~b)) & SIGNBIT64) {
        SET_QC();
        res = INT64_MAX;
    }
    return res;
}
402
403
/*
 * Unsigned saturating subtract: if the difference wraps below zero
 * (fails the round-trip through the lane type), set QC and clamp to 0.
 */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

/* 32-bit unsigned saturating subtract: wrap (res > a) means borrow. */
uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        SET_QC();
        res = 0;
    }
    return res;
}

/* 64-bit unsigned saturating subtract. */
uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    if (src1 < src2) {
        SET_QC();
        res = 0;
    } else {
        res = src1 - src2;
    }
    return res;
}
442
/*
 * Signed saturating subtract: on overflow, clamp away from the sign of
 * src2 (subtracting a negative -> MAX, otherwise -> MIN).
 */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
} while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

/* 32-bit signed saturating subtract: overflow iff operand signs differ
 * and the result's sign differs from src1's. */
uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

/* 64-bit signed saturating subtract. */
uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}
484
/* Halving add: (src1 + src2) >> 1.  The lane values are promoted to
 * int before the add, so the intermediate sum cannot overflow. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN
491
492 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
493 {
494 int32_t dest;
495
496 dest = (src1 >> 1) + (src2 >> 1);
497 if (src1 & src2 & 1)
498 dest++;
499 return dest;
500 }
501
502 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
503 {
504 uint32_t dest;
505
506 dest = (src1 >> 1) + (src2 >> 1);
507 if (src1 & src2 & 1)
508 dest++;
509 return dest;
510 }
511
/* Rounding halving add: (src1 + src2 + 1) >> 1, promoted to int so the
 * intermediate sum cannot overflow. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN
518
519 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
520 {
521 int32_t dest;
522
523 dest = (src1 >> 1) + (src2 >> 1);
524 if ((src1 | src2) & 1)
525 dest++;
526 return dest;
527 }
528
529 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
530 {
531 uint32_t dest;
532
533 dest = (src1 >> 1) + (src2 >> 1);
534 if ((src1 | src2) & 1)
535 dest++;
536 return dest;
537 }
538
/* Halving subtract: (src1 - src2) >> 1, promoted to int so the
 * intermediate difference cannot overflow. */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN
545
546 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
547 {
548 int32_t dest;
549
550 dest = (src1 >> 1) - (src2 >> 1);
551 if ((~src1) & src2 & 1)
552 dest--;
553 return dest;
554 }
555
556 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
557 {
558 uint32_t dest;
559
560 dest = (src1 >> 1) - (src2 >> 1);
561 if ((~src1) & src2 & 1)
562 dest--;
563 return dest;
564 }
565
/* Pairwise minimum of adjacent lane pairs. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Pairwise maximum of adjacent lane pairs. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
579
/*
 * Shifts.  The shift count is the low byte of the second operand taken
 * as signed: positive shifts left, negative shifts right.  These wrap
 * the do_{u,s}qrshl_* helpers from vec_internal.h; the bool argument
 * enables rounding (true for the rshl variants) and a NULL qc pointer
 * means no saturation tracking.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, NULL))
NEON_VOP(shl_s16, neon_s16, 2)
#undef NEON_FN

/* Rounding shifts, signed lanes. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_s32)(uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_s64)(uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, NULL);
}

/* Rounding shifts, unsigned lanes. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, NULL))
NEON_VOP(rshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, NULL))
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, NULL);
}

uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, NULL);
}
629
/*
 * Saturating shifts.  Same wrappers as above but passing env->vfp.qc so
 * the helpers record saturation in the cumulative QC flag.  The bool
 * argument selects rounding: false for qshl/qshlu, true for qrshl.
 */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed saturating shift. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Signed input, saturating shift to an unsigned result (VQSHLU). */
#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 8, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_suqrshl_bhs(src1, (int8_t)src2, 16, false, env->vfp.qc))
NEON_VOP_ENV(qshlu_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_suqrshl_bhs(val, (int8_t)shift, 32, false, env->vfp.qc);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_suqrshl_d(val, (int8_t)shift, false, env->vfp.qc);
}

/* Rounding saturating shifts. */
#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_uqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_uqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_uqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 8, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    (dest = do_sqrshl_bhs(src1, (int8_t)src2, 16, true, env->vfp.qc))
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t val, uint32_t shift)
{
    return do_sqrshl_bhs(val, (int8_t)shift, 32, true, env->vfp.qc);
}

uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t val, uint64_t shift)
{
    return do_sqrshl_d(val, (int8_t)shift, true, env->vfp.qc);
}
729
730 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
731 {
732 uint32_t mask;
733 mask = (a ^ b) & 0x80808080u;
734 a &= ~0x80808080u;
735 b &= ~0x80808080u;
736 return (a + b) ^ mask;
737 }
738
739 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
740 {
741 uint32_t mask;
742 mask = (a ^ b) & 0x80008000u;
743 a &= ~0x80008000u;
744 b &= ~0x80008000u;
745 return (a + b) ^ mask;
746 }
747
/* Pairwise add of adjacent lanes (modulo arithmetic). */
#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise subtract (modulo arithmetic). */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise multiply (low half kept by lane truncation). */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Lane-wise test: all-ones if the lanes share any set bit, else zero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN
768
769 /* Count Leading Sign/Zero Bits. */
/* Count leading zero bits of an 8-bit value; returns 8 for x == 0. */
static inline int do_clz8(uint8_t x)
{
    int zeros = 8;

    while (x) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
777
/* Count leading zero bits of a 16-bit value; returns 16 for x == 0. */
static inline int do_clz16(uint16_t x)
{
    int zeros = 16;

    while (x) {
        x >>= 1;
        zeros--;
    }
    return zeros;
}
785
/* Per-lane count leading zeros. */
#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

/* Per-lane count leading sign bits: complement negative values so the
 * redundant sign copies become leading zeros, then subtract 1 for the
 * sign bit itself. */
#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN
801
802 uint32_t HELPER(neon_cls_s32)(uint32_t x)
803 {
804 int count;
805 if ((int32_t)x < 0)
806 x = ~x;
807 for (count = 32; x; count--)
808 x = x >> 1;
809 return count - 1;
810 }
811
812 /* Bit count. */
813 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
814 {
815 x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
816 x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
817 x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
818 return x;
819 }
820
821 /* Reverse bits in each 8 bit word */
822 uint32_t HELPER(neon_rbit_u8)(uint32_t x)
823 {
824 x = ((x & 0xf0f0f0f0) >> 4)
825 | ((x & 0x0f0f0f0f) << 4);
826 x = ((x & 0x88888888) >> 3)
827 | ((x & 0x44444444) >> 1)
828 | ((x & 0x22222222) << 1)
829 | ((x & 0x11111111) << 3);
830 return x;
831 }
832
/*
 * Saturating doubling multiply returning high half (VQDMULH), with an
 * optional rounding step (VQRDMULH).  Doubling 0x8000 * 0x8000 is the
 * one case that overflows; the (tmp ^ (tmp << 1)) test catches it and
 * saturates.  The rounding add can overflow too, checked separately.
 */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

/* 32-bit variant of the above, working in 64-bit intermediates. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32
884
/* Narrow each 16-bit lane of a 64-bit value to 8 bits, keeping the low
 * byte of each lane. */
uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

/* Narrow each 32-bit lane to 16 bits, keeping the low half. */
uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

/* Narrow each 16-bit lane to 8 bits, keeping the HIGH byte. */
uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Narrow each 32-bit lane to 16 bits, keeping the high half. */
uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

/* Rounding narrow to high byte: add 0x80 per lane first.  Masking off
 * the low 7 bits beforehand is safe because they can never influence
 * the carry into bit 8. */
uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

/* Rounding narrow to high half: add 0x8000 per 32-bit lane. */
uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
921
/* Saturating narrow of signed 16-bit lanes to unsigned 8 bits:
 * negative lanes saturate to 0 (the lane is simply left zero in res),
 * lanes above 0xff saturate to 0xff; both set QC. */
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Saturating narrow of unsigned 16-bit lanes to unsigned 8 bits. */
uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

/* Saturating narrow of signed 16-bit lanes to signed 8 bits:
 * (s >> 15) ^ 0x7f yields 0x7f for positive overflow, 0x80 for
 * negative overflow. */
uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}
994
/* Saturating narrow of signed 32-bit lanes to unsigned 16 bits:
 * negative -> 0, > 0xffff -> 0xffff, either sets QC. */
uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Saturating narrow of unsigned 32-bit lanes to unsigned 16 bits. */
uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

/* Saturating narrow of signed 32-bit lanes to signed 16 bits:
 * (v >> 31) ^ 0x7fff gives 0x7fff or 0x8000 by overflow direction. */
uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

/* Saturating narrow of one signed 64-bit value to unsigned 32 bits. */
uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Saturating narrow of one unsigned 64-bit value to unsigned 32 bits. */
uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

/* Saturating narrow of one signed 64-bit value to signed 32 bits. */
uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
1082
/* Widen four unsigned 8-bit lanes to four 16-bit lanes in a uint64_t. */
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen four signed 8-bit lanes to four 16-bit lanes; the
 * (uint16_t)(int8_t) double cast confines the sign extension to the
 * destination lane. */
uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

/* Widen two unsigned 16-bit lanes to two 32-bit lanes. */
uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

/* Widen two signed 16-bit lanes to two 32-bit lanes. */
uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}
1122
/* Lane-wise add of four 16-bit lanes in a uint64_t: clear the lane top
 * bits so carries cannot cross lanes, restore them with xor. */
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

/* Same carry-isolation trick for two 32-bit lanes. */
uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

/* Pairwise long add: sum adjacent 16-bit lanes into 32-bit results;
 * low pair comes from a, high pair from b. */
uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

/* Pairwise long add of two 32-bit lanes into 64-bit halves. */
uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

/* Lane-wise subtract of four 16-bit lanes: force a's lane top bits set
 * so borrows cannot cross lanes, then fix the top bits via the mask. */
uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

/* Same borrow-isolation trick for two 32-bit lanes. */
uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}
1180
/* Two independent signed 32-bit saturating adds on the halves of a
 * 64-bit pair; each half clamps to INT32_MAX/MIN and sets QC. */
uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

/* Signed 64-bit saturating add (same test as neon_qadd_s64). */
uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
1214
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
/* dest = |x - y|, with x and y first truncated/sign-extended to intype
 * and the subtraction done in the wider arithtype. */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Absolute difference of four unsigned 8-bit lanes, widened to four
 * 16-bit lanes of the 64-bit result. */
uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

/* Signed 8-bit variant. */
uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

/* Two unsigned 16-bit lanes widened to 32-bit differences. */
uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

/* Signed 16-bit variant. */
uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

/* Single unsigned 32-bit difference widened to 64 bits. */
uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

/* Single signed 32-bit difference widened to 64 bits. */
uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
1285
/* Widening multiply. Named type is the source type; each pair of
 * source lanes is multiplied into a lane of twice the width, with
 * the arithmetic done modulo the wider type.
 */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)
1292
1293 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
1294 {
1295 uint64_t tmp;
1296 uint64_t result;
1297
1298 DO_MULL(result, a, b, uint8_t, uint16_t);
1299 DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
1300 result |= tmp << 16;
1301 DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
1302 result |= tmp << 32;
1303 DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
1304 result |= tmp << 48;
1305 return result;
1306 }
1307
1308 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
1309 {
1310 uint64_t tmp;
1311 uint64_t result;
1312
1313 DO_MULL(result, a, b, int8_t, uint16_t);
1314 DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
1315 result |= tmp << 16;
1316 DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
1317 result |= tmp << 32;
1318 DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
1319 result |= tmp << 48;
1320 return result;
1321 }
1322
1323 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
1324 {
1325 uint64_t tmp;
1326 uint64_t result;
1327
1328 DO_MULL(result, a, b, uint16_t, uint32_t);
1329 DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
1330 return result | (tmp << 32);
1331 }
1332
1333 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
1334 {
1335 uint64_t tmp;
1336 uint64_t result;
1337
1338 DO_MULL(result, a, b, int16_t, uint32_t);
1339 DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
1340 return result | (tmp << 32);
1341 }
1342
1343 uint64_t HELPER(neon_negl_u16)(uint64_t x)
1344 {
1345 uint16_t tmp;
1346 uint64_t result;
1347 result = (uint16_t)-x;
1348 tmp = -(x >> 16);
1349 result |= (uint64_t)tmp << 16;
1350 tmp = -(x >> 32);
1351 result |= (uint64_t)tmp << 32;
1352 tmp = -(x >> 48);
1353 result |= (uint64_t)tmp << 48;
1354 return result;
1355 }
1356
1357 uint64_t HELPER(neon_negl_u32)(uint64_t x)
1358 {
1359 uint32_t low = -x;
1360 uint32_t high = -(x >> 32);
1361 return low | ((uint64_t)high << 32);
1362 }
1363
1364 /* Saturating sign manipulation. */
1365 /* ??? Make these use NEON_VOP1 */
1366 #define DO_QABS8(x) do { \
1367 if (x == (int8_t)0x80) { \
1368 x = 0x7f; \
1369 SET_QC(); \
1370 } else if (x < 0) { \
1371 x = -x; \
1372 }} while (0)
1373 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
1374 {
1375 neon_s8 vec;
1376 NEON_UNPACK(neon_s8, vec, x);
1377 DO_QABS8(vec.v1);
1378 DO_QABS8(vec.v2);
1379 DO_QABS8(vec.v3);
1380 DO_QABS8(vec.v4);
1381 NEON_PACK(neon_s8, x, vec);
1382 return x;
1383 }
1384 #undef DO_QABS8
1385
1386 #define DO_QNEG8(x) do { \
1387 if (x == (int8_t)0x80) { \
1388 x = 0x7f; \
1389 SET_QC(); \
1390 } else { \
1391 x = -x; \
1392 }} while (0)
1393 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
1394 {
1395 neon_s8 vec;
1396 NEON_UNPACK(neon_s8, vec, x);
1397 DO_QNEG8(vec.v1);
1398 DO_QNEG8(vec.v2);
1399 DO_QNEG8(vec.v3);
1400 DO_QNEG8(vec.v4);
1401 NEON_PACK(neon_s8, x, vec);
1402 return x;
1403 }
1404 #undef DO_QNEG8
1405
1406 #define DO_QABS16(x) do { \
1407 if (x == (int16_t)0x8000) { \
1408 x = 0x7fff; \
1409 SET_QC(); \
1410 } else if (x < 0) { \
1411 x = -x; \
1412 }} while (0)
1413 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
1414 {
1415 neon_s16 vec;
1416 NEON_UNPACK(neon_s16, vec, x);
1417 DO_QABS16(vec.v1);
1418 DO_QABS16(vec.v2);
1419 NEON_PACK(neon_s16, x, vec);
1420 return x;
1421 }
1422 #undef DO_QABS16
1423
1424 #define DO_QNEG16(x) do { \
1425 if (x == (int16_t)0x8000) { \
1426 x = 0x7fff; \
1427 SET_QC(); \
1428 } else { \
1429 x = -x; \
1430 }} while (0)
1431 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
1432 {
1433 neon_s16 vec;
1434 NEON_UNPACK(neon_s16, vec, x);
1435 DO_QNEG16(vec.v1);
1436 DO_QNEG16(vec.v2);
1437 NEON_PACK(neon_s16, x, vec);
1438 return x;
1439 }
1440 #undef DO_QNEG16
1441
1442 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
1443 {
1444 if (x == SIGNBIT) {
1445 SET_QC();
1446 x = ~SIGNBIT;
1447 } else if ((int32_t)x < 0) {
1448 x = -x;
1449 }
1450 return x;
1451 }
1452
1453 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
1454 {
1455 if (x == SIGNBIT) {
1456 SET_QC();
1457 x = ~SIGNBIT;
1458 } else {
1459 x = -x;
1460 }
1461 return x;
1462 }
1463
1464 uint64_t HELPER(neon_qabs_s64)(CPUARMState *env, uint64_t x)
1465 {
1466 if (x == SIGNBIT64) {
1467 SET_QC();
1468 x = ~SIGNBIT64;
1469 } else if ((int64_t)x < 0) {
1470 x = -x;
1471 }
1472 return x;
1473 }
1474
1475 uint64_t HELPER(neon_qneg_s64)(CPUARMState *env, uint64_t x)
1476 {
1477 if (x == SIGNBIT64) {
1478 SET_QC();
1479 x = ~SIGNBIT64;
1480 } else {
1481 x = -x;
1482 }
1483 return x;
1484 }
1485
1486 /* NEON Float helpers. */
1487
1488 /* Floating point comparisons produce an integer result.
1489 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
1490 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
1491 */
1492 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
1493 {
1494 float_status *fpst = fpstp;
1495 return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
1496 }
1497
1498 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
1499 {
1500 float_status *fpst = fpstp;
1501 return -float32_le(make_float32(b), make_float32(a), fpst);
1502 }
1503
1504 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1505 {
1506 float_status *fpst = fpstp;
1507 return -float32_lt(make_float32(b), make_float32(a), fpst);
1508 }
1509
1510 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
1511 {
1512 float_status *fpst = fpstp;
1513 float32 f0 = float32_abs(make_float32(a));
1514 float32 f1 = float32_abs(make_float32(b));
1515 return -float32_le(f1, f0, fpst);
1516 }
1517
1518 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
1519 {
1520 float_status *fpst = fpstp;
1521 float32 f0 = float32_abs(make_float32(a));
1522 float32 f1 = float32_abs(make_float32(b));
1523 return -float32_lt(f1, f0, fpst);
1524 }
1525
1526 uint64_t HELPER(neon_acge_f64)(uint64_t a, uint64_t b, void *fpstp)
1527 {
1528 float_status *fpst = fpstp;
1529 float64 f0 = float64_abs(make_float64(a));
1530 float64 f1 = float64_abs(make_float64(b));
1531 return -float64_le(f1, f0, fpst);
1532 }
1533
1534 uint64_t HELPER(neon_acgt_f64)(uint64_t a, uint64_t b, void *fpstp)
1535 {
1536 float_status *fpst = fpstp;
1537 float64 f0 = float64_abs(make_float64(a));
1538 float64 f1 = float64_abs(make_float64(b));
1539 return -float64_lt(f1, f0, fpst);
1540 }
1541
/* Extract element N of size SIZE bits from the 64-bit value V. */
#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
1543
1544 void HELPER(neon_qunzip8)(void *vd, void *vm)
1545 {
1546 uint64_t *rd = vd, *rm = vm;
1547 uint64_t zd0 = rd[0], zd1 = rd[1];
1548 uint64_t zm0 = rm[0], zm1 = rm[1];
1549
1550 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
1551 | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
1552 | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
1553 | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
1554 uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
1555 | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
1556 | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1557 | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
1558 uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
1559 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
1560 | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
1561 | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
1562 uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
1563 | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
1564 | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
1565 | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1566
1567 rm[0] = m0;
1568 rm[1] = m1;
1569 rd[0] = d0;
1570 rd[1] = d1;
1571 }
1572
1573 void HELPER(neon_qunzip16)(void *vd, void *vm)
1574 {
1575 uint64_t *rd = vd, *rm = vm;
1576 uint64_t zd0 = rd[0], zd1 = rd[1];
1577 uint64_t zm0 = rm[0], zm1 = rm[1];
1578
1579 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
1580 | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
1581 uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
1582 | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
1583 uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
1584 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
1585 uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
1586 | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1587
1588 rm[0] = m0;
1589 rm[1] = m1;
1590 rd[0] = d0;
1591 rd[1] = d1;
1592 }
1593
1594 void HELPER(neon_qunzip32)(void *vd, void *vm)
1595 {
1596 uint64_t *rd = vd, *rm = vm;
1597 uint64_t zd0 = rd[0], zd1 = rd[1];
1598 uint64_t zm0 = rm[0], zm1 = rm[1];
1599
1600 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
1601 uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1602 uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
1603 uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1604
1605 rm[0] = m0;
1606 rm[1] = m1;
1607 rd[0] = d0;
1608 rd[1] = d1;
1609 }
1610
1611 void HELPER(neon_unzip8)(void *vd, void *vm)
1612 {
1613 uint64_t *rd = vd, *rm = vm;
1614 uint64_t zd = rd[0], zm = rm[0];
1615
1616 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
1617 | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
1618 | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1619 | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
1620 uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
1621 | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
1622 | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
1623 | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1624
1625 rm[0] = m0;
1626 rd[0] = d0;
1627 }
1628
1629 void HELPER(neon_unzip16)(void *vd, void *vm)
1630 {
1631 uint64_t *rd = vd, *rm = vm;
1632 uint64_t zd = rd[0], zm = rm[0];
1633
1634 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
1635 | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
1636 uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
1637 | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1638
1639 rm[0] = m0;
1640 rd[0] = d0;
1641 }
1642
1643 void HELPER(neon_qzip8)(void *vd, void *vm)
1644 {
1645 uint64_t *rd = vd, *rm = vm;
1646 uint64_t zd0 = rd[0], zd1 = rd[1];
1647 uint64_t zm0 = rm[0], zm1 = rm[1];
1648
1649 uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
1650 | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
1651 | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
1652 | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
1653 uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
1654 | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
1655 | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
1656 | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
1657 uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
1658 | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
1659 | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
1660 | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
1661 uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
1662 | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
1663 | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
1664 | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
1665
1666 rm[0] = m0;
1667 rm[1] = m1;
1668 rd[0] = d0;
1669 rd[1] = d1;
1670 }
1671
1672 void HELPER(neon_qzip16)(void *vd, void *vm)
1673 {
1674 uint64_t *rd = vd, *rm = vm;
1675 uint64_t zd0 = rd[0], zd1 = rd[1];
1676 uint64_t zm0 = rm[0], zm1 = rm[1];
1677
1678 uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
1679 | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
1680 uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
1681 | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
1682 uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
1683 | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
1684 uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
1685 | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
1686
1687 rm[0] = m0;
1688 rm[1] = m1;
1689 rd[0] = d0;
1690 rd[1] = d1;
1691 }
1692
1693 void HELPER(neon_qzip32)(void *vd, void *vm)
1694 {
1695 uint64_t *rd = vd, *rm = vm;
1696 uint64_t zd0 = rd[0], zd1 = rd[1];
1697 uint64_t zm0 = rm[0], zm1 = rm[1];
1698
1699 uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
1700 uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
1701 uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
1702 uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
1703
1704 rm[0] = m0;
1705 rm[1] = m1;
1706 rd[0] = d0;
1707 rd[1] = d1;
1708 }
1709
1710 void HELPER(neon_zip8)(void *vd, void *vm)
1711 {
1712 uint64_t *rd = vd, *rm = vm;
1713 uint64_t zd = rd[0], zm = rm[0];
1714
1715 uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
1716 | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
1717 | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
1718 | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
1719 uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
1720 | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
1721 | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
1722 | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
1723
1724 rm[0] = m0;
1725 rd[0] = d0;
1726 }
1727
1728 void HELPER(neon_zip16)(void *vd, void *vm)
1729 {
1730 uint64_t *rd = vd, *rm = vm;
1731 uint64_t zd = rd[0], zm = rm[0];
1732
1733 uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
1734 | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
1735 uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
1736 | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
1737
1738 rm[0] = m0;
1739 rd[0] = d0;
1740 }