]> git.proxmox.com Git - mirror_qemu.git/blob - target-arm/op_neon.h
ARMv7 support.
[mirror_qemu.git] / target-arm / op_neon.h
1 /*
2 * ARM NEON vector operations.
3 *
4 * Copyright (c) 2007 CodeSourcery.
5 * Written by Paul Brook
6 *
7 * This code is licenced under the GPL.
8 */
/* Note that for NEON an "l" prefix means it is a wide operation, unlike
   scalar arm ops where it means a word size operation. */

/* ??? NEON ops should probably have their own float status. */
/* Float status used by all NEON float ops below (shared with VFP). */
#define NFS &env->vfp.fp_status
/* Declare a dyngen micro-op: a void function named op_neon_<name>. */
#define NEON_OP(name) void OPPROTO op_neon_##name (void)
15
/* Load T0/T1/T2 from the CPU state field at byte offset PARAM1
   (the offset is baked in at translate time). */
NEON_OP(getreg_T0)
{
    T0 = *(uint32_t *)((char *) env + PARAM1);
}

NEON_OP(getreg_T1)
{
    T1 = *(uint32_t *)((char *) env + PARAM1);
}

NEON_OP(getreg_T2)
{
    T2 = *(uint32_t *)((char *) env + PARAM1);
}

/* Store T0/T1/T2 to the CPU state field at byte offset PARAM1. */
NEON_OP(setreg_T0)
{
    *(uint32_t *)((char *) env + PARAM1) = T0;
}

NEON_OP(setreg_T1)
{
    *(uint32_t *)((char *) env + PARAM1) = T1;
}

NEON_OP(setreg_T2)
{
    *(uint32_t *)((char *) env + PARAM1) = T2;
}
45
/* Define a struct holding one 32-bit "segment" of a NEON vector as
   1, 2 or 4 lanes.  Field v1 is always the least significant lane,
   so lane order in memory depends on host endianness. */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#ifdef WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
92
/* Copy from a uint32_t to a vector structure type.  Type punning via a
   union, which avoids strict-aliasing problems. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

/* Apply NEON_FN to each lane pair of vsrc1/vsrc2, writing vdest. */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

/* Define a binary elementwise op: T0 = NEON_FN(T0, T1) per lane,
   for an n-lane vtype segment. */
#define NEON_VOP(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_UNPACK(vtype, vsrc2, T1); \
    NEON_DO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}

/* Same, but unary: only T0 is consumed. */
#define NEON_VOP1(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_DO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}

/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \

/* Define a pairwise op: adjacent lanes of T0, then of T1, combined. */
#define NEON_POP(name, vtype, n) \
NEON_OP(name) \
{ \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, T0); \
    NEON_UNPACK(vtype, vsrc2, T1); \
    NEON_PDO##n; \
    NEON_PACK(vtype, T0, vdest); \
    FORCE_RET(); \
}
172
/* Halving add: (a + b) >> 1.  Safe for 8/16-bit lanes because the
   addition is done after promotion to int, so it cannot overflow. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving add cannot widen, so average the halves and carry
   in the bit lost when both low bits are set. */
NEON_OP(hadd_s32)
{
    int32_t src1 = T0;
    int32_t src2 = T1;
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    T0 = dest;
    FORCE_RET();
}

NEON_OP(hadd_u32)
{
    uint32_t src1 = T0;
    uint32_t src2 = T1;
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    T0 = dest;
    FORCE_RET();
}

/* Rounding halving add: (a + b + 1) >> 1. */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit rounding halving add: carry in when either low bit is set. */
NEON_OP(rhadd_s32)
{
    int32_t src1 = T0;
    int32_t src2 = T1;
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    T0 = dest;
    FORCE_RET();
}

NEON_OP(rhadd_u32)
{
    uint32_t src1 = T0;
    uint32_t src2 = T1;
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    T0 = dest;
    FORCE_RET();
}

/* Halving subtract: (a - b) >> 1. */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

/* 32-bit halving subtract: borrow when src1 even and src2 odd. */
NEON_OP(hsub_s32)
{
    int32_t src1 = T0;
    int32_t src2 = T1;
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    T0 = dest;
    FORCE_RET();
}

NEON_OP(hsub_u32)
{
    uint32_t src1 = T0;
    uint32_t src2 = T1;
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    T0 = dest;
    FORCE_RET();
}
271
/* ??? bsl, bif and bit are all the same op, just with the operands in a
   different order.  It's currently easier to have 3 different ops than
   rearrange the operands. */

/* Bitwise Select: T2 is the mask; take T0 bits where mask set, else T1. */
NEON_OP(bsl)
{
    T0 = (T0 & T2) | (T1 & ~T2);
}

/* Bitwise Insert If True: T1 is the mask. */
NEON_OP(bit)
{
    T0 = (T0 & T1) | (T2 & ~T1);
}

/* Bitwise Insert If False: T1 is the mask, inverted sense. */
NEON_OP(bif)
{
    T0 = (T2 & T1) | (T0 & ~T1);
}
293
/* Unsigned saturating add: compute in 32 bits, clamp to the lane's
   max and set the sticky saturation flag QF on overflow. */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        env->QF = 1; \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

/* Signed saturating add: clamp to the lane's min/max depending on the
   sign of src2, and set QF. */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        env->QF = 1; \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

/* Unsigned saturating subtract: clamp to 0 on underflow, set QF. */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        env->QF = 1; \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

/* Signed saturating subtract: clamp to min/max depending on the sign
   of src2, set QF. */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        env->QF = 1; \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

/* Compare greater-than: all-ones when true, zero when false. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
NEON_VOP(cgt_s8, neon_s8, 4)
NEON_VOP(cgt_u8, neon_u8, 4)
NEON_VOP(cgt_s16, neon_s16, 2)
NEON_VOP(cgt_u16, neon_u16, 2)
NEON_VOP(cgt_s32, neon_s32, 1)
NEON_VOP(cgt_u32, neon_u32, 1)
#undef NEON_FN

/* Compare greater-than-or-equal. */
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
NEON_VOP(cge_s8, neon_s8, 4)
NEON_VOP(cge_u8, neon_u8, 4)
NEON_VOP(cge_s16, neon_s16, 2)
NEON_VOP(cge_u16, neon_u16, 2)
NEON_VOP(cge_s32, neon_s32, 1)
NEON_VOP(cge_u32, neon_u32, 1)
#undef NEON_FN
383
/* Variable shift (VSHL): src2 holds the signed per-lane shift count;
   negative counts shift right, non-negative shift left.
   NOTE(review): a count whose magnitude is >= the lane width is
   undefined behavior in C -- assumed not generated by the
   translator; TODO confirm against the decoder. */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(shl_s8, neon_s8, 4)
NEON_VOP(shl_u8, neon_u8, 4)
NEON_VOP(shl_s16, neon_s16, 2)
NEON_VOP(shl_u16, neon_u16, 2)
NEON_VOP(shl_s32, neon_s32, 1)
NEON_VOP(shl_u32, neon_u32, 1)
#undef NEON_FN
399
/* 64-bit variable shift; the operand is split across T0 (low) and
   T1 (high), with the shift count in T2. */
NEON_OP(shl_u64)
{
    int8_t shift = T2;
    uint64_t val = T0 | ((uint64_t)T1 << 32);
    if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    T0 = val;
    T1 = val >> 32;
    FORCE_RET();
}

/* Signed variant: right shifts are arithmetic. */
NEON_OP(shl_s64)
{
    int8_t shift = T2;
    int64_t val = T0 | ((uint64_t)T1 << 32);
    if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    T0 = val;
    T1 = val >> 32;
    FORCE_RET();
}
427
/* Rounding variable shift (VRSHL): shift count in src1, value in src2.
   Right shifts add half the rounding increment (1 << (-1 - count))
   before shifting. */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src1; \
    if (tmp < 0) { \
        dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src2 << tmp; \
    }} while (0)

NEON_VOP(rshl_s8, neon_s8, 4)
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_VOP(rshl_s16, neon_s16, 2)
NEON_VOP(rshl_u16, neon_u16, 2)
NEON_VOP(rshl_s32, neon_s32, 1)
NEON_VOP(rshl_u32, neon_u32, 1)
#undef NEON_FN
444
445 NEON_OP(rshl_u64)
446 {
447 int8_t shift = T2;
448 uint64_t val = T0 | ((uint64_t)T1 << 32);
449 if (shift < 0) {
450 val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
451 val >>= -shift;
452 } else {
453 val <<= shift;
454 }
455 T0 = val;
456 T1 = val >> 32;
457 FORCE_RET();
458 }
459
/* 64-bit signed rounding shift: value in T0/T1, count in T2.
   Right shifts add the rounding increment before the single shift. */
NEON_OP(rshl_s64)
{
    int8_t shift = T2;
    int64_t val = T0 | ((uint64_t)T1 << 32);
    if (shift < 0) {
        val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
    } else {
        val <<= shift;
    }
    T0 = val;
    T1 = val >> 32;
    FORCE_RET();
}
473
474 #define NEON_FN(dest, src1, src2) do { \
475 int8_t tmp; \
476 tmp = (int8_t)src1; \
477 if (tmp < 0) { \
478 dest = src2 >> -tmp; \
479 } else { \
480 dest = src2 << tmp; \
481 if ((dest >> tmp) != src2) { \
482 env->QF = 1; \
483 dest = ~0; \
484 } \
485 }} while (0)
486 NEON_VOP(qshl_s8, neon_s8, 4)
487 NEON_VOP(qshl_s16, neon_s16, 2)
488 NEON_VOP(qshl_s32, neon_s32, 1)
489 #undef NEON_FN
490
/* 64-bit signed saturating shift: value in T0/T1, count in T2.
   On left-shift overflow, saturate to INT64_MIN/INT64_MAX (sign bit
   of the original value selects which) and set QF. */
NEON_OP(qshl_s64)
{
    int8_t shift = T2;
    int64_t val = T0 | ((uint64_t)T1 << 32);
    if (shift < 0) {
        val >>= -shift;
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            env->QF = 1;
            /* (tmp >> 63) is 0 or -1; XOR with INT64_MAX gives
               INT64_MAX or INT64_MIN respectively. */
            val = (tmp >> 63) ^ 0x7fffffffffffffffULL;
        }
    }
    T0 = val;
    T1 = val >> 32;
    FORCE_RET();
}
509
510 #define NEON_FN(dest, src1, src2) do { \
511 int8_t tmp; \
512 tmp = (int8_t)src1; \
513 if (tmp < 0) { \
514 dest = src2 >> -tmp; \
515 } else { \
516 dest = src2 << tmp; \
517 if ((dest >> tmp) != src2) { \
518 env->QF = 1; \
519 dest = src2 >> 31; \
520 } \
521 }} while (0)
522 NEON_VOP(qshl_u8, neon_u8, 4)
523 NEON_VOP(qshl_u16, neon_u16, 2)
524 NEON_VOP(qshl_u32, neon_u32, 1)
525 #undef NEON_FN
526
/* 64-bit unsigned saturating shift: value in T0/T1, count in T2.
   On left-shift overflow, saturate to UINT64_MAX and set QF. */
NEON_OP(qshl_u64)
{
    int8_t shift = T2;
    uint64_t val = T0 | ((uint64_t)T1 << 32);
    if (shift < 0) {
        val >>= -shift;
    } else {
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            env->QF = 1;
            val = ~(uint64_t)0;
        }
    }
    T0 = val;
    T1 = val >> 32;
    FORCE_RET();
}
545
546 #define NEON_FN(dest, src1, src2) do { \
547 int8_t tmp; \
548 tmp = (int8_t)src1; \
549 if (tmp < 0) { \
550 dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
551 } else { \
552 dest = src2 << tmp; \
553 if ((dest >> tmp) != src2) { \
554 dest = ~0; \
555 } \
556 }} while (0)
557 NEON_VOP(qrshl_s8, neon_s8, 4)
558 NEON_VOP(qrshl_s16, neon_s16, 2)
559 NEON_VOP(qrshl_s32, neon_s32, 1)
560 #undef NEON_FN
561
562 #define NEON_FN(dest, src1, src2) do { \
563 int8_t tmp; \
564 tmp = (int8_t)src1; \
565 if (tmp < 0) { \
566 dest = (src2 + (1 << (-1 - tmp))) >> -tmp; \
567 } else { \
568 dest = src2 << tmp; \
569 if ((dest >> tmp) != src2) { \
570 env->QF = 1; \
571 dest = src2 >> 31; \
572 } \
573 }} while (0)
574 NEON_VOP(qrshl_u8, neon_u8, 4)
575 NEON_VOP(qrshl_u16, neon_u16, 2)
576 NEON_VOP(qrshl_u32, neon_u32, 1)
577 #undef NEON_FN
578
/* Elementwise and pairwise maximum. */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_VOP(max_s8, neon_s8, 4)
NEON_VOP(max_u8, neon_u8, 4)
NEON_VOP(max_s16, neon_s16, 2)
NEON_VOP(max_u16, neon_u16, 2)
NEON_VOP(max_s32, neon_s32, 1)
NEON_VOP(max_u32, neon_u32, 1)
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

/* Float maximum via softfloat compare (returns 1 for "greater"). */
NEON_OP(max_f32)
{
    float32 f0 = vfp_itos(T0);
    float32 f1 = vfp_itos(T1);
    T0 = (float32_compare_quiet(f0, f1, NFS) == 1) ? T0 : T1;
    FORCE_RET();
}

/* Elementwise and pairwise minimum. */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_VOP(min_s8, neon_s8, 4)
NEON_VOP(min_u8, neon_u8, 4)
NEON_VOP(min_s16, neon_s16, 2)
NEON_VOP(min_u16, neon_u16, 2)
NEON_VOP(min_s32, neon_s32, 1)
NEON_VOP(min_u32, neon_u32, 1)
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

/* Float minimum (-1 means "less" from the softfloat compare). */
NEON_OP(min_f32)
{
    float32 f0 = vfp_itos(T0);
    float32 f1 = vfp_itos(T1);
    T0 = (float32_compare_quiet(f0, f1, NFS) == -1) ? T0 : T1;
    FORCE_RET();
}

/* Absolute difference: |a - b|. */
#define NEON_FN(dest, src1, src2) \
  dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
NEON_VOP(abd_s8, neon_s8, 4)
NEON_VOP(abd_u8, neon_u8, 4)
NEON_VOP(abd_s16, neon_s16, 2)
NEON_VOP(abd_u16, neon_u16, 2)
NEON_VOP(abd_s32, neon_s32, 1)
NEON_VOP(abd_u32, neon_u32, 1)
#undef NEON_FN

/* Float absolute difference: subtract in whichever order is positive. */
NEON_OP(abd_f32)
{
    float32 f0 = vfp_itos(T0);
    float32 f1 = vfp_itos(T1);
    T0 = vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
                  ? float32_sub(f0, f1, NFS)
                  : float32_sub(f1, f0, NFS));
    FORCE_RET();
}

/* Lane-wise add (same op serves signed and unsigned). */
#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_VOP(add_u8, neon_u8, 4)
NEON_VOP(add_u16, neon_u16, 2)
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(add_f32)
{
    T0 = vfp_stoi(float32_add(vfp_itos(T0), vfp_itos(T1), NFS));
    FORCE_RET();
}

/* Lane-wise subtract. */
#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(sub_f32)
{
    T0 = vfp_stoi(float32_sub(vfp_itos(T0), vfp_itos(T1), NFS));
    FORCE_RET();
}

/* Reverse subtract: b - a. */
#define NEON_FN(dest, src1, src2) dest = src2 - src1
NEON_VOP(rsb_u8, neon_u8, 4)
NEON_VOP(rsb_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(rsb_f32)
{
    T0 = vfp_stoi(float32_sub(vfp_itos(T1), vfp_itos(T0), NFS));
    FORCE_RET();
}

/* Lane-wise multiply (low half of the product). */
#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

NEON_OP(mul_f32)
{
    T0 = vfp_stoi(float32_mul(vfp_itos(T0), vfp_itos(T1), NFS));
    FORCE_RET();
}

/* Polynomial (carry-less GF(2)) multiply, done in a C helper. */
NEON_OP(mul_p8)
{
    T0 = helper_neon_mul_p8(T0, T1);
}

/* Test bits: all-ones when (a & b) nonzero. */
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

/* Compare equal: all-ones when equal. */
#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
NEON_VOP(ceq_u8, neon_u8, 4)
NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN
703
/* Signed saturating doubling multiply returning high half (VQDMULH /
   VQRDMULH, 16-bit lanes).  The double of the product overflows only
   for -32768 * -32768; detect via the top two bits differing.  When
   "round" is set, add the rounding constant and saturate again if
   that carries past the sign bit.  QF is sticky. */
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        env->QF = 1; \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } \
    tmp <<= 1; \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            env->QF = 1; \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define SIGNBIT64 ((uint64_t)1 << 63)
/* Same as above for 32-bit lanes, using a 64-bit intermediate. */
#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        env->QF = 1; \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            env->QF = 1; \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

/* Newton-Raphson reciprocal / reciprocal-sqrt steps (VRECPS/VRSQRTS),
   implemented by C helpers. */
NEON_OP(recps_f32)
{
    T0 = vfp_stoi(helper_recps_f32(vfp_itos(T0), vfp_itos(T1)));
    FORCE_RET();
}

NEON_OP(rsqrts_f32)
{
    T0 = vfp_stoi(helper_rsqrts_f32(vfp_itos(T0), vfp_itos(T1)));
    FORCE_RET();
}

/* Floating point comparisons produce an integer result (all-ones for
   true, zero for false). */
#define NEON_VOP_FCMP(name, cmp) \
NEON_OP(name) \
{ \
    if (float32_compare_quiet(vfp_itos(T0), vfp_itos(T1), NFS) cmp 0) \
        T0 = -1; \
    else \
        T0 = 0; \
    FORCE_RET(); \
}

NEON_VOP_FCMP(ceq_f32, ==)
NEON_VOP_FCMP(cge_f32, >=)
NEON_VOP_FCMP(cgt_f32, >)

/* Absolute compares (VACGE/VACGT): compare magnitudes. */
NEON_OP(acge_f32)
{
    float32 f0 = float32_abs(vfp_itos(T0));
    float32 f1 = float32_abs(vfp_itos(T1));
    T0 = (float32_compare_quiet(f0, f1,NFS) >= 0) ? -1 : 0;
    FORCE_RET();
}

NEON_OP(acgt_f32)
{
    float32 f0 = float32_abs(vfp_itos(T0));
    float32 f1 = float32_abs(vfp_itos(T1));
    T0 = (float32_compare_quiet(f0, f1, NFS) > 0) ? -1 : 0;
    FORCE_RET();
}
798
/* Narrowing instructions.  The named type is the destination type. */
/* Take the low byte of each 16-bit lane of T0:T1, packing four bytes
   into T0. */
NEON_OP(narrow_u8)
{
    T0 = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
         | ((T1 << 16) & 0xff0000) | (T1 << 24);
    FORCE_RET();
}

/* Unsigned saturating narrow u16 -> u8: clamp each lane to 0xff and
   set QF on saturation. */
NEON_OP(narrow_sat_u8)
{
    neon_u16 src;
    neon_u8 dest;
#define SAT8(d, s) \
    if (s > 0xff) { \
        d = 0xff; \
        env->QF = 1; \
    } else  { \
        d = s; \
    }

    NEON_UNPACK(neon_u16, src, T0);
    SAT8(dest.v1, src.v1);
    SAT8(dest.v2, src.v2);
    NEON_UNPACK(neon_u16, src, T1);
    SAT8(dest.v3, src.v1);
    SAT8(dest.v4, src.v2);
    NEON_PACK(neon_u8, T0, dest);
    FORCE_RET();
#undef SAT8
}
829
830 NEON_OP(narrow_sat_s8)
831 {
832 neon_s16 src;
833 neon_s8 dest;
834 #define SAT8(d, s) \
835 if (s != (uint8_t)s) { \
836 d = (s >> 15) ^ 0x7f; \
837 env->QF = 1; \
838 } else { \
839 d = s; \
840 }
841
842 NEON_UNPACK(neon_s16, src, T0);
843 SAT8(dest.v1, src.v1);
844 SAT8(dest.v2, src.v2);
845 NEON_UNPACK(neon_s16, src, T1);
846 SAT8(dest.v3, src.v1);
847 SAT8(dest.v4, src.v2);
848 NEON_PACK(neon_s8, T0, dest);
849 FORCE_RET();
850 #undef SAT8
851 }
852
/* Take the low halfword of T0 and T1. */
NEON_OP(narrow_u16)
{
    T0 = (T0 & 0xffff) | (T1 << 16);
}

/* Unsigned saturating narrow u32 -> u16, two lanes in T0/T1. */
NEON_OP(narrow_sat_u16)
{
    if (T0 > 0xffff) {
        T0 = 0xffff;
        env->QF = 1;
    }
    if (T1 > 0xffff) {
        T1 = 0xffff;
        env->QF = 1;
    }
    T0 |= T1 << 16;
    FORCE_RET();
}

/* Signed saturating narrow s32 -> s16. */
NEON_OP(narrow_sat_s16)
{
    if ((int32_t)T0 != (int16_t)T0) {
        T0 = ((int32_t)T0 >> 31) ^ 0x7fff;
        env->QF = 1;
    }
    if ((int32_t)T1 != (int16_t) T1) {
        T1 = ((int32_t)T1 >> 31) ^ 0x7fff;
        env->QF = 1;
    }
    T0 = (uint16_t)T0 | (T1 << 16);
    FORCE_RET();
}

/* Unsigned saturating narrow u64 -> u32: any set bit in the high word
   (T1) means the value exceeds 32 bits. */
NEON_OP(narrow_sat_u32)
{
    if (T1) {
        T0 = 0xffffffffu;
        env->QF = 1;
    }
    FORCE_RET();
}

/* Signed saturating narrow s64 -> s32: the high word must equal the
   sign extension of the low word's sign bit.
   NOTE(review): the test compares T1 against the sign of T1 itself,
   not of T0 -- this accepts e.g. 0x00000000_80000000 unsaturated;
   presumably it should use (int32_t)T0 >> 31 -- TODO confirm. */
NEON_OP(narrow_sat_s32)
{
    int32_t sign = (int32_t)T1 >> 31;

    if ((int32_t)T1 != sign) {
        T0 = sign ^ 0x7fffffff;
        env->QF = 1;
    }
    FORCE_RET();
}

/* Narrowing instructions.  Named type is the narrow type. */
/* Take the high byte of each 16-bit lane. */
NEON_OP(narrow_high_u8)
{
    T0 = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
         | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
    FORCE_RET();
}

/* Take the high halfword of each 32-bit lane. */
NEON_OP(narrow_high_u16)
{
    T0 = (T0 >> 16) | (T1 & 0xffff0000);
    FORCE_RET();
}

/* Rounding variants: add half a ULP of the kept bits before taking
   the high part.
   NOTE(review): the per-lane rounding adds are done on the packed
   word, so a carry can propagate into the adjacent lane -- TODO
   confirm this is acceptable. */
NEON_OP(narrow_high_round_u8)
{
    T0 = (((T0 + 0x80) >> 8) & 0xff) | (((T0 + 0x800000) >> 16) & 0xff00)
         | (((T1 + 0x80) << 8) & 0xff0000) | ((T1 + 0x800000) & 0xff000000);
    FORCE_RET();
}

NEON_OP(narrow_high_round_u16)
{
    T0 = ((T0 + 0x8000) >> 16) | ((T1 + 0x8000) & 0xffff0000);
    FORCE_RET();
}

/* 64 -> 32 rounding narrow: low word T0 supplies the rounding carry. */
NEON_OP(narrow_high_round_u32)
{
    if (T0 >= 0x80000000u)
        T0 = T1 + 1;
    else
        T0 = T1;
    FORCE_RET();
}
941
/* Widening instructions.  Named type is source type.  Four 8-bit or
   two 16-bit lanes in T0 become 16/32-bit lanes spread over T0:T1. */
NEON_OP(widen_s8)
{
    uint32_t src;

    src = T0;
    T0 = (uint16_t)(int8_t)src | ((int8_t)(src >> 8) << 16);
    T1 = (uint16_t)(int8_t)(src >> 16) | ((int8_t)(src >> 24) << 16);
}

NEON_OP(widen_u8)
{
    T1 = ((T0 >> 8) & 0xff0000) | ((T0 >> 16) & 0xff);
    T0 = ((T0 << 8) & 0xff0000) | (T0 & 0xff);
}

NEON_OP(widen_s16)
{
    int32_t src;

    src = T0;
    T0 = (int16_t)src;
    T1 = src >> 16;
}

NEON_OP(widen_u16)
{
    T1 = T0 >> 16;
    T0 &= 0xffff;
}

/* Sign-extend a 32-bit value to 64 bits (T1 gets the sign word). */
NEON_OP(widen_s32)
{
    T1 = (int32_t)T0 >> 31;
    FORCE_RET();
}

/* Widen into the HIGH half of each destination lane (used by shifted
   widening forms). */
NEON_OP(widen_high_u8)
{
    T1 = (T0 & 0xff000000) | ((T0 >> 8) & 0xff00);
    T0 = ((T0 << 16) & 0xff000000) | ((T0 << 8) & 0xff00);
}

NEON_OP(widen_high_u16)
{
    T1 = T0 & 0xffff0000;
    T0 <<= 16;
}

/* Long operations.  The type is the wide type. */
/* Shift two 16-bit lanes in each of T0/T1 left by PARAM1, masking off
   bits that would cross a lane boundary. */
NEON_OP(shll_u16)
{
    int shift = PARAM1;
    uint32_t mask;

    mask = 0xffff >> (16 - shift);
    mask |= mask << 16;
    mask = ~mask;

    T0 = (T0 << shift) & mask;
    T1 = (T1 << shift) & mask;
    FORCE_RET();
}

/* Shift the 64-bit T1:T0 pair left by PARAM1.
   NOTE(review): PARAM1 == 0 would make "T0 >> 32" undefined; assumed
   the translator only emits counts in 1..31 -- TODO confirm. */
NEON_OP(shll_u64)
{
    int shift = PARAM1;

    T1 <<= shift;
    T1 |= T0 >> (32 - shift);
    T0 <<= shift;
    FORCE_RET();
}
1015
/* Long add: T0/T1 plus the pair stashed in env->vfp.scratch[0..1].
   The 16-bit variant adds the two halfword lanes of each word
   independently (no carry across the lane boundary). */
NEON_OP(addl_u16)
{
    uint32_t tmp;
    uint32_t high;

    tmp = env->vfp.scratch[0];
    high = (T0 >> 16) + (tmp >> 16);
    T0 = (uint16_t)(T0 + tmp);
    T0 |= (high << 16);
    tmp = env->vfp.scratch[1];
    high = (T1 >> 16) + (tmp >> 16);
    T1 = (uint16_t)(T1 + tmp);
    T1 |= (high << 16);
    FORCE_RET();
}

NEON_OP(addl_u32)
{
    T0 += env->vfp.scratch[0];
    T1 += env->vfp.scratch[1];
    FORCE_RET();
}

/* 64-bit add across the T1:T0 pair. */
NEON_OP(addl_u64)
{
    uint64_t tmp;
    tmp = T0 | ((uint64_t)T1 << 32);
    tmp += env->vfp.scratch[0];
    tmp += (uint64_t)env->vfp.scratch[1] << 32;
    T0 = tmp;
    T1 = tmp >> 32;
    FORCE_RET();
}

/* Long subtract: same lane structure as addl. */
NEON_OP(subl_u16)
{
    uint32_t tmp;
    uint32_t high;

    tmp = env->vfp.scratch[0];
    high = (T0 >> 16) - (tmp >> 16);
    T0 = (uint16_t)(T0 - tmp);
    T0 |= (high << 16);
    tmp = env->vfp.scratch[1];
    high = (T1 >> 16) - (tmp >> 16);
    T1 = (uint16_t)(T1 - tmp);
    T1 |= (high << 16);
    FORCE_RET();
}

NEON_OP(subl_u32)
{
    T0 -= env->vfp.scratch[0];
    T1 -= env->vfp.scratch[1];
    FORCE_RET();
}

NEON_OP(subl_u64)
{
    uint64_t tmp;
    tmp = T0 | ((uint64_t)T1 << 32);
    tmp -= env->vfp.scratch[0];
    tmp -= (uint64_t)env->vfp.scratch[1] << 32;
    T0 = tmp;
    T1 = tmp >> 32;
    FORCE_RET();
}

/* Absolute difference of two values after truncating them to "type". */
#define DO_ABD(dest, x, y, type) do { \
    type tmp_x = x; \
    type tmp_y = y; \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

/* Widening absolute difference: 8-bit lanes of T0/T1 produce 16-bit
   results spread across T0 (low pair) and T1 (high pair). */
NEON_OP(abdl_u16)
{
    uint32_t tmp;
    uint32_t low;
    uint32_t high;

    DO_ABD(low, T0, T1, uint8_t);
    DO_ABD(tmp, T0 >> 8, T1 >> 8, uint8_t);
    low |= tmp << 16;
    DO_ABD(high, T0 >> 16, T1 >> 16, uint8_t);
    DO_ABD(tmp, T0 >> 24, T1 >> 24, uint8_t);
    high |= tmp << 16;
    T0 = low;
    T1 = high;
    FORCE_RET();
}

NEON_OP(abdl_s16)
{
    uint32_t tmp;
    uint32_t low;
    uint32_t high;

    DO_ABD(low, T0, T1, int8_t);
    DO_ABD(tmp, T0 >> 8, T1 >> 8, int8_t);
    low |= tmp << 16;
    DO_ABD(high, T0 >> 16, T1 >> 16, int8_t);
    DO_ABD(tmp, T0 >> 24, T1 >> 24, int8_t);
    high |= tmp << 16;
    T0 = low;
    T1 = high;
    FORCE_RET();
}

NEON_OP(abdl_u32)
{
    uint32_t low;
    uint32_t high;

    DO_ABD(low, T0, T1, uint16_t);
    DO_ABD(high, T0 >> 16, T1 >> 16, uint16_t);
    T0 = low;
    T1 = high;
    FORCE_RET();
}

NEON_OP(abdl_s32)
{
    uint32_t low;
    uint32_t high;

    DO_ABD(low, T0, T1, int16_t);
    DO_ABD(high, T0 >> 16, T1 >> 16, int16_t);
    T0 = low;
    T1 = high;
    FORCE_RET();
}

/* 32 -> 64 bit: the difference always fits in 32 bits, so the high
   word is zero. */
NEON_OP(abdl_u64)
{
    DO_ABD(T0, T0, T1, uint32_t);
    T1 = 0;
}

NEON_OP(abdl_s64)
{
    DO_ABD(T0, T0, T1, int32_t);
    T1 = 0;
}
#undef DO_ABD
1160
/* Widening multiply.  Named type is the source type: lanes are
   truncated to type1, multiplied, and the type2-wide product stored. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

/* 8x8 -> 16 multiply: four products spread over T0 (low two) and T1. */
NEON_OP(mull_u8)
{
    uint32_t tmp;
    uint32_t low;
    uint32_t high;

    DO_MULL(low, T0, T1, uint8_t, uint16_t);
    DO_MULL(tmp, T0 >> 8, T1 >> 8, uint8_t, uint16_t);
    low |= tmp << 16;
    DO_MULL(high, T0 >> 16, T1 >> 16, uint8_t, uint16_t);
    DO_MULL(tmp, T0 >> 24, T1 >> 24, uint8_t, uint16_t);
    high |= tmp << 16;
    T0 = low;
    T1 = high;
    FORCE_RET();
}

NEON_OP(mull_s8)
{
    uint32_t tmp;
    uint32_t low;
    uint32_t high;

    DO_MULL(low, T0, T1, int8_t, uint16_t);
    DO_MULL(tmp, T0 >> 8, T1 >> 8, int8_t, uint16_t);
    low |= tmp << 16;
    DO_MULL(high, T0 >> 16, T1 >> 16, int8_t, uint16_t);
    DO_MULL(tmp, T0 >> 24, T1 >> 24, int8_t, uint16_t);
    high |= tmp << 16;
    T0 = low;
    T1 = high;
    FORCE_RET();
}

/* 16x16 -> 32 multiply: two products in T0/T1. */
NEON_OP(mull_u16)
{
    uint32_t low;
    uint32_t high;

    DO_MULL(low, T0, T1, uint16_t, uint32_t);
    DO_MULL(high, T0 >> 16, T1 >> 16, uint16_t, uint32_t);
    T0 = low;
    T1 = high;
    FORCE_RET();
}

NEON_OP(mull_s16)
{
    uint32_t low;
    uint32_t high;

    DO_MULL(low, T0, T1, int16_t, uint32_t);
    DO_MULL(high, T0 >> 16, T1 >> 16, int16_t, uint32_t);
    T0 = low;
    T1 = high;
    FORCE_RET();
}

/* Saturating long add: T0 and T1 are independent 32-bit lanes added
   to env->vfp.scratch[0..1].  Signed overflow occurs when the
   operands share a sign and the result's sign differs. */
NEON_OP(addl_saturate_s32)
{
    uint32_t tmp;
    uint32_t res;

    tmp = env->vfp.scratch[0];
    res = T0 + tmp;
    if (((res ^ T0) & SIGNBIT) && !((T0 ^ tmp) & SIGNBIT)) {
        env->QF = 1;
        T0 = (T0 >> 31) ^ 0x7fffffff;
    } else {
        T0 = res;
    }
    tmp = env->vfp.scratch[1];
    res = T1 + tmp;
    if (((res ^ T1) & SIGNBIT) && !((T1 ^ tmp) & SIGNBIT)) {
        env->QF = 1;
        T1 = (T1 >> 31) ^ 0x7fffffff;
    } else {
        T1 = res;
    }
    FORCE_RET();
}

/* 64-bit signed saturating add over the T1:T0 pair: saturate to
   INT64_MAX/INT64_MIN according to the operands' sign and set QF. */
NEON_OP(addl_saturate_s64)
{
    uint64_t src1;
    uint64_t src2;
    uint64_t res;

    src1 = T0 + ((uint64_t)T1 << 32);
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        env->QF = 1;
        T0 = ~(int64_t)src1 >> 63;
        T1 = T0 ^ 0x80000000;
    } else {
        T0 = res;
        T1 = res >> 32;
    }
    FORCE_RET();
}

/* 64-bit unsigned saturating add: wraparound means overflow. */
NEON_OP(addl_saturate_u64)
{
    uint64_t src1;
    uint64_t src2;
    uint64_t res;

    src1 = T0 + ((uint64_t)T1 << 32);
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
    res = src1 + src2;
    if (res < src1) {
        env->QF = 1;
        T0 = 0xffffffff;
        T1 = 0xffffffff;
    } else {
        T0 = res;
        T1 = res >> 32;
    }
    FORCE_RET();
}

/* 64-bit signed saturating subtract: overflow when operand signs
   differ and the result's sign differs from src1's. */
NEON_OP(subl_saturate_s64)
{
    uint64_t src1;
    uint64_t src2;
    uint64_t res;

    src1 = T0 + ((uint64_t)T1 << 32);
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        env->QF = 1;
        T0 = ~(int64_t)src1 >> 63;
        T1 = T0 ^ 0x80000000;
    } else {
        T0 = res;
        T1 = res >> 32;
    }
    FORCE_RET();
}

/* 64-bit unsigned saturating subtract: clamp to zero on underflow. */
NEON_OP(subl_saturate_u64)
{
    uint64_t src1;
    uint64_t src2;
    uint64_t res;

    src1 = T0 + ((uint64_t)T1 << 32);
    src2 = env->vfp.scratch[0] + ((uint64_t)env->vfp.scratch[1] << 32);
    if (src1 < src2) {
        env->QF = 1;
        T0 = 0;
        T1 = 0;
    } else {
        res = src1 - src2;
        T0 = res;
        T1 = res >> 32;
    }
    FORCE_RET();
}
1329
/* Negate each 16-bit lane of the T1:T0 quad independently. */
NEON_OP(negl_u16)
{
    uint32_t tmp;
    tmp = T0 >> 16;
    tmp = -tmp;
    T0 = (-T0 & 0xffff) | (tmp << 16);
    tmp = T1 >> 16;
    tmp = -tmp;
    T1 = (-T1 & 0xffff) | (tmp << 16);
    FORCE_RET();
}

/* Negate the two 32-bit lanes. */
NEON_OP(negl_u32)
{
    T0 = -T0;
    T1 = -T1;
    FORCE_RET();
}

/* Negate the 64-bit T1:T0 pair. */
NEON_OP(negl_u64)
{
    uint64_t val;

    val = T0 | ((uint64_t)T1 << 32);
    val = -val;
    T0 = val;
    T1 = val >> 32;
    FORCE_RET();
}

/* Scalar operations. */
/* Duplicate the low halfword into both halves of T0. */
NEON_OP(dup_low16)
{
    T0 = (T0 & 0xffff) | (T0 << 16);
    FORCE_RET();
}

/* Duplicate the high halfword into both halves of T0. */
NEON_OP(dup_high16)
{
    T0 = (T0 >> 16) | (T0 & 0xffff0000);
    FORCE_RET();
}

/* Helper for VEXT: extract 32 bits straddling T0/T1 at byte offset
   PARAM1*8.
   NOTE(review): shift == 0 would make "T1 << 32" undefined; assumed
   the translator never emits a zero shift here -- TODO confirm. */
NEON_OP(extract)
{
    int shift = PARAM1;
    T0 = (T0 >> shift) | (T1 << (32 - shift));
    FORCE_RET();
}

/* Pairwise add long.  Named type is source type. */
NEON_OP(paddl_s8)
{
    int8_t src1;
    int8_t src2;
    uint16_t result;
    src1 = T0 >> 24;
    src2 = T0 >> 16;
    result = (uint16_t)src1 + src2;
    src1 = T0 >> 8;
    src2 = T0;
    T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
    FORCE_RET();
}

NEON_OP(paddl_u8)
{
    uint8_t src1;
    uint8_t src2;
    uint16_t result;
    src1 = T0 >> 24;
    src2 = T0 >> 16;
    result = (uint16_t)src1 + src2;
    src1 = T0 >> 8;
    src2 = T0;
    T0 = (uint16_t)((uint16_t)src1 + src2) | ((uint32_t)result << 16);
    FORCE_RET();
}

NEON_OP(paddl_s16)
{
    T0 = (uint32_t)(int16_t)T0 + (uint32_t)(int16_t)(T0 >> 16);
    FORCE_RET();
}

NEON_OP(paddl_u16)
{
    T0 = (uint32_t)(uint16_t)T0 + (uint32_t)(uint16_t)(T0 >> 16);
    FORCE_RET();
}

/* 32-bit lanes widen to a 64-bit result in T1:T0. */
NEON_OP(paddl_s32)
{
    int64_t tmp;
    tmp = (int64_t)(int32_t)T0 + (int64_t)(int32_t)T1;
    T0 = tmp;
    T1 = tmp >> 32;
    FORCE_RET();
}

NEON_OP(paddl_u32)
{
    uint64_t tmp;
    tmp = (uint64_t)T0 + (uint64_t)T1;
    T0 = tmp;
    T1 = tmp >> 32;
    FORCE_RET();
}
1439
/* Count Leading Sign/Zero Bits. */
/* Leading-zero count of an 8-bit value; returns 8 for x == 0.  */
static inline int do_clz8(uint8_t x)
{
    int n = 8;
    while (x) {
        x >>= 1;
        n--;
    }
    return n;
}
1448
/* Leading-zero count of a 16-bit value; returns 16 for x == 0.  */
static inline int do_clz16(uint16_t x)
{
    int n = 16;
    while (x) {
        x >>= 1;
        n--;
    }
    return n;
}
1456
1457 NEON_OP(clz_u8)
1458 {
1459 uint32_t result;
1460 uint32_t tmp;
1461
1462 tmp = T0;
1463 result = do_clz8(tmp);
1464 result |= do_clz8(tmp >> 8) << 8;
1465 result |= do_clz8(tmp >> 16) << 16;
1466 result |= do_clz8(tmp >> 24) << 24;
1467 T0 = result;
1468 FORCE_RET();
1469 }
1470
1471 NEON_OP(clz_u16)
1472 {
1473 uint32_t result;
1474 uint32_t tmp;
1475 tmp = T0;
1476 result = do_clz16(tmp);
1477 result |= do_clz16(tmp >> 16) << 16;
1478 T0 = result;
1479 FORCE_RET();
1480 }
1481
1482 NEON_OP(cls_s8)
1483 {
1484 uint32_t result;
1485 int8_t tmp;
1486 tmp = T0;
1487 result = do_clz8((tmp < 0) ? ~tmp : tmp) - 1;
1488 tmp = T0 >> 8;
1489 result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 8;
1490 tmp = T0 >> 16;
1491 result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1492 tmp = T0 >> 24;
1493 result |= (do_clz8((tmp < 0) ? ~tmp : tmp) - 1) << 24;
1494 T0 = result;
1495 FORCE_RET();
1496 }
1497
1498 NEON_OP(cls_s16)
1499 {
1500 uint32_t result;
1501 int16_t tmp;
1502 tmp = T0;
1503 result = do_clz16((tmp < 0) ? ~tmp : tmp) - 1;
1504 tmp = T0 >> 16;
1505 result |= (do_clz16((tmp < 0) ? ~tmp : tmp) - 1) << 16;
1506 T0 = result;
1507 FORCE_RET();
1508 }
1509
1510 NEON_OP(cls_s32)
1511 {
1512 int count;
1513 if ((int32_t)T0 < 0)
1514 T0 = ~T0;
1515 for (count = 32; T0 > 0; count--)
1516 T0 = T0 >> 1;
1517 T0 = count - 1;
1518 FORCE_RET();
1519 }
1520
1521 /* Bit count. */
1522 NEON_OP(cnt_u8)
1523 {
1524 T0 = (T0 & 0x55555555) + ((T0 >> 1) & 0x55555555);
1525 T0 = (T0 & 0x33333333) + ((T0 >> 2) & 0x33333333);
1526 T0 = (T0 & 0x0f0f0f0f) + ((T0 >> 4) & 0x0f0f0f0f);
1527 FORCE_RET();
1528 }
1529
1530 /* Saturnating negation. */
1531 /* ??? Make these use NEON_VOP1 */
1532 #define DO_QABS8(x) do { \
1533 if (x == (int8_t)0x80) { \
1534 x = 0x7f; \
1535 env->QF = 1; \
1536 } else if (x < 0) { \
1537 x = -x; \
1538 }} while (0)
1539 NEON_OP(qabs_s8)
1540 {
1541 neon_s8 vec;
1542 NEON_UNPACK(neon_s8, vec, T0);
1543 DO_QABS8(vec.v1);
1544 DO_QABS8(vec.v2);
1545 DO_QABS8(vec.v3);
1546 DO_QABS8(vec.v4);
1547 NEON_PACK(neon_s8, T0, vec);
1548 FORCE_RET();
1549 }
1550 #undef DO_QABS8
1551
1552 #define DO_QNEG8(x) do { \
1553 if (x == (int8_t)0x80) { \
1554 x = 0x7f; \
1555 env->QF = 1; \
1556 } else { \
1557 x = -x; \
1558 }} while (0)
1559 NEON_OP(qneg_s8)
1560 {
1561 neon_s8 vec;
1562 NEON_UNPACK(neon_s8, vec, T0);
1563 DO_QNEG8(vec.v1);
1564 DO_QNEG8(vec.v2);
1565 DO_QNEG8(vec.v3);
1566 DO_QNEG8(vec.v4);
1567 NEON_PACK(neon_s8, T0, vec);
1568 FORCE_RET();
1569 }
1570 #undef DO_QNEG8
1571
1572 #define DO_QABS16(x) do { \
1573 if (x == (int16_t)0x8000) { \
1574 x = 0x7fff; \
1575 env->QF = 1; \
1576 } else if (x < 0) { \
1577 x = -x; \
1578 }} while (0)
1579 NEON_OP(qabs_s16)
1580 {
1581 neon_s16 vec;
1582 NEON_UNPACK(neon_s16, vec, T0);
1583 DO_QABS16(vec.v1);
1584 DO_QABS16(vec.v2);
1585 NEON_PACK(neon_s16, T0, vec);
1586 FORCE_RET();
1587 }
1588 #undef DO_QABS16
1589
1590 #define DO_QNEG16(x) do { \
1591 if (x == (int16_t)0x8000) { \
1592 x = 0x7fff; \
1593 env->QF = 1; \
1594 } else { \
1595 x = -x; \
1596 }} while (0)
1597 NEON_OP(qneg_s16)
1598 {
1599 neon_s16 vec;
1600 NEON_UNPACK(neon_s16, vec, T0);
1601 DO_QNEG16(vec.v1);
1602 DO_QNEG16(vec.v2);
1603 NEON_PACK(neon_s16, T0, vec);
1604 FORCE_RET();
1605 }
1606 #undef DO_QNEG16
1607
1608 NEON_OP(qabs_s32)
1609 {
1610 if (T0 == 0x80000000) {
1611 T0 = 0x7fffffff;
1612 env->QF = 1;
1613 } else if ((int32_t)T0 < 0) {
1614 T0 = -T0;
1615 }
1616 FORCE_RET();
1617 }
1618
1619 NEON_OP(qneg_s32)
1620 {
1621 if (T0 == 0x80000000) {
1622 T0 = 0x7fffffff;
1623 env->QF = 1;
1624 } else {
1625 T0 = -T0;
1626 }
1627 FORCE_RET();
1628 }
1629
/* Unary operations. */
/* NEON_FN is the per-element body consumed by the NEON_VOP1 expander:
   non-saturating absolute value (note: behavior for the most negative
   value wraps, unlike the VQABS ops above). */
#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
NEON_VOP1(abs_s8, neon_s8, 4)
NEON_VOP1(abs_s16, neon_s16, 2)
/* 32-bit case operates on T0 directly: one element per word. */
NEON_OP(abs_s32)
{
    if ((int32_t)T0 < 0)
        T0 = -T0;
    FORCE_RET();
}
#undef NEON_FN
1641
1642 /* Transpose. Argument order is rather strange to avoid special casing
1643 the tranlation code.
1644 On input T0 = rm, T1 = rd. On output T0 = rd, T1 = rm */
1645 NEON_OP(trn_u8)
1646 {
1647 uint32_t rd;
1648 uint32_t rm;
1649 rd = ((T0 & 0x00ff00ff) << 8) | (T1 & 0x00ff00ff);
1650 rm = ((T1 & 0xff00ff00) >> 8) | (T0 & 0xff00ff00);
1651 T0 = rd;
1652 T1 = rm;
1653 FORCE_RET();
1654 }
1655
1656 NEON_OP(trn_u16)
1657 {
1658 uint32_t rd;
1659 uint32_t rm;
1660 rd = (T0 << 16) | (T1 & 0xffff);
1661 rm = (T1 >> 16) | (T0 & 0xffff0000);
1662 T0 = rd;
1663 T1 = rm;
1664 FORCE_RET();
1665 }
1666
1667 /* Worker routines for zip and unzip. */
1668 NEON_OP(unzip_u8)
1669 {
1670 uint32_t rd;
1671 uint32_t rm;
1672 rd = (T0 & 0xff) | ((T0 >> 8) & 0xff00)
1673 | ((T1 << 16) & 0xff0000) | ((T1 << 8) & 0xff000000);
1674 rm = ((T0 >> 8) & 0xff) | ((T0 >> 16) & 0xff00)
1675 | ((T1 << 8) & 0xff0000) | (T1 & 0xff000000);
1676 T0 = rd;
1677 T1 = rm;
1678 FORCE_RET();
1679 }
1680
1681 NEON_OP(zip_u8)
1682 {
1683 uint32_t rd;
1684 uint32_t rm;
1685 rd = (T0 & 0xff) | ((T1 << 8) & 0xff00)
1686 | ((T0 << 16) & 0xff0000) | ((T1 << 24) & 0xff000000);
1687 rm = ((T0 >> 16) & 0xff) | ((T1 >> 8) & 0xff00)
1688 | ((T0 >> 8) & 0xff0000) | (T1 & 0xff000000);
1689 T0 = rd;
1690 T1 = rm;
1691 FORCE_RET();
1692 }
1693
1694 NEON_OP(zip_u16)
1695 {
1696 uint32_t tmp;
1697
1698 tmp = (T0 & 0xffff) | (T1 << 16);
1699 T1 = (T1 & 0xffff0000) | (T0 >> 16);
1700 T0 = tmp;
1701 FORCE_RET();
1702 }
1703
/* Reciprocal/root estimate. */
NEON_OP(recpe_u32)
{
    /* Integer reciprocal estimate; computed by an out-of-line helper. */
    T0 = helper_recpe_u32(T0);
}
1709
NEON_OP(rsqrte_u32)
{
    /* Integer reciprocal square-root estimate via out-of-line helper. */
    T0 = helper_rsqrte_u32(T0);
}
1714
NEON_OP(recpe_f32)
{
    /* Single-precision reciprocal estimate, in place on FT0s. */
    FT0s = helper_recpe_f32(FT0s);
}
1719
NEON_OP(rsqrte_f32)
{
    /* Single-precision reciprocal square-root estimate, in place on FT0s. */
    FT0s = helper_rsqrte_f32(FT0s);
}
1724
/* Table lookup.  This accesses the register file directly. */
NEON_OP(tbl)
{
    /* PARAM1/PARAM2 are operands for the out-of-line table-lookup helper. */
    helper_neon_tbl(PARAM1, PARAM2);
}
1730
1731 NEON_OP(dup_u8)
1732 {
1733 T0 = (T0 >> PARAM1) & 0xff;
1734 T0 |= T0 << 8;
1735 T0 |= T0 << 16;
1736 FORCE_RET();
1737 }
1738
1739 /* Helpers for element load/store. */
1740 NEON_OP(insert_elt)
1741 {
1742 int shift = PARAM1;
1743 uint32_t mask = PARAM2;
1744 T2 = (T2 & mask) | (T0 << shift);
1745 FORCE_RET();
1746 }
1747
1748 NEON_OP(extract_elt)
1749 {
1750 int shift = PARAM1;
1751 uint32_t mask = PARAM2;
1752 T0 = (T2 & mask) >> shift;
1753 FORCE_RET();
1754 }