]> git.proxmox.com Git - mirror_qemu.git/blame - target/i386/ops_sse.h
target/i386: reimplement 0x0f 0xd0-0xd7, 0xe0-0xe7, 0xf0-0xf7, add AVX
[mirror_qemu.git] / target / i386 / ops_sse.h
CommitLineData
664e0f19 1/*
222a3336 2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
5fafdf24 3 *
664e0f19 4 * Copyright (c) 2005 Fabrice Bellard
222a3336 5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com>
664e0f19
FB
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
d9ff33ad 10 * version 2.1 of the License, or (at your option) any later version.
664e0f19
FB
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
8167ee88 18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
664e0f19 19 */
04af534d 20
6f2945cd 21#include "crypto/aes.h"
04af534d 22
664e0f19
FB
/*
 * Width-dependent configuration.  SHIFT == 0 selects MMXReg and the MMX_*
 * element accessors; any other value selects ZMMReg and the ZMM_* accessors.
 */
#if SHIFT == 0
#define Reg MMXReg
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
#define Q(n) MMX_Q(n)
/* XMM_ONLY() drops its arguments in the MMX configuration. */
#define XMM_ONLY(...)
#else
#define Reg ZMMReg
#define B(n) ZMM_B(n)
#define W(n) ZMM_W(n)
#define L(n) ZMM_L(n)
#define Q(n) ZMM_Q(n)
/* XMM_ONLY() passes its arguments through unchanged. */
#define XMM_ONLY(...) __VA_ARGS__
#endif

/* Suffix appended to the helper names built with glue(). */
#if SHIFT == 0
#define SUFFIX _mmx
#elif SHIFT == 1
#define SUFFIX _xmm
#else
#define SUFFIX _ymm
#endif

/* Lane size in bytes: 8 when SHIFT is 0, otherwise 16. */
#define LANE_WIDTH (SHIFT ? 16 : 8)
#define PACK_WIDTH (LANE_WIDTH / 2)
18592d2e 47
18592d2e
PB
#if SHIFT == 0
/*
 * Per-element shift primitives: shift @x by @c bits.
 *   FPSRL  - right shift of @x at its own (promoted) type
 *   FPSRAW - right shift after reinterpreting @x as int16_t
 *   FPSRAL - right shift after reinterpreting @x as int32_t
 *   FPSLL  - left shift
 * The count is taken from the macro argument rather than from whatever
 * variable named "shift" happens to be in scope at the expansion site.
 */
#define FPSRL(x, c) ((x) >> (c))
#define FPSRAW(x, c) ((int16_t)(x) >> (c))
#define FPSRAL(x, c) ((int32_t)(x) >> (c))
#define FPSLL(x, c) ((x) << (c))
#endif
54
f05f9789 55void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 56{
664e0f19 57 int shift;
18592d2e
PB
58 if (c->Q(0) > 15) {
59 for (int i = 0; i < 1 << SHIFT; i++) {
60 d->Q(i) = 0;
61 }
664e0f19 62 } else {
18592d2e
PB
63 shift = c->B(0);
64 for (int i = 0; i < 4 << SHIFT; i++) {
65 d->W(i) = FPSRL(s->W(i), shift);
66 }
664e0f19
FB
67 }
68}
69
f05f9789 70void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 71{
664e0f19 72 int shift;
18592d2e
PB
73 if (c->Q(0) > 15) {
74 for (int i = 0; i < 1 << SHIFT; i++) {
75 d->Q(i) = 0;
76 }
664e0f19 77 } else {
18592d2e
PB
78 shift = c->B(0);
79 for (int i = 0; i < 4 << SHIFT; i++) {
80 d->W(i) = FPSLL(s->W(i), shift);
81 }
664e0f19 82 }
664e0f19
FB
83}
84
f05f9789 85void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 86{
664e0f19 87 int shift;
18592d2e
PB
88 if (c->Q(0) > 15) {
89 shift = 15;
664e0f19 90 } else {
18592d2e
PB
91 shift = c->B(0);
92 }
93 for (int i = 0; i < 4 << SHIFT; i++) {
94 d->W(i) = FPSRAW(s->W(i), shift);
664e0f19
FB
95 }
96}
97
f05f9789 98void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 99{
664e0f19 100 int shift;
18592d2e
PB
101 if (c->Q(0) > 31) {
102 for (int i = 0; i < 1 << SHIFT; i++) {
103 d->Q(i) = 0;
104 }
664e0f19 105 } else {
18592d2e
PB
106 shift = c->B(0);
107 for (int i = 0; i < 2 << SHIFT; i++) {
108 d->L(i) = FPSRL(s->L(i), shift);
109 }
664e0f19
FB
110 }
111}
112
f05f9789 113void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 114{
664e0f19 115 int shift;
18592d2e
PB
116 if (c->Q(0) > 31) {
117 for (int i = 0; i < 1 << SHIFT; i++) {
118 d->Q(i) = 0;
119 }
664e0f19 120 } else {
18592d2e
PB
121 shift = c->B(0);
122 for (int i = 0; i < 2 << SHIFT; i++) {
123 d->L(i) = FPSLL(s->L(i), shift);
124 }
664e0f19 125 }
664e0f19
FB
126}
127
f05f9789 128void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 129{
664e0f19 130 int shift;
18592d2e
PB
131 if (c->Q(0) > 31) {
132 shift = 31;
664e0f19 133 } else {
18592d2e
PB
134 shift = c->B(0);
135 }
136 for (int i = 0; i < 2 << SHIFT; i++) {
137 d->L(i) = FPSRAL(s->L(i), shift);
664e0f19
FB
138 }
139}
140
f05f9789 141void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 142{
664e0f19 143 int shift;
18592d2e
PB
144 if (c->Q(0) > 63) {
145 for (int i = 0; i < 1 << SHIFT; i++) {
146 d->Q(i) = 0;
147 }
664e0f19 148 } else {
18592d2e
PB
149 shift = c->B(0);
150 for (int i = 0; i < 1 << SHIFT; i++) {
151 d->Q(i) = FPSRL(s->Q(i), shift);
152 }
664e0f19
FB
153 }
154}
155
f05f9789 156void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 157{
664e0f19 158 int shift;
18592d2e
PB
159 if (c->Q(0) > 63) {
160 for (int i = 0; i < 1 << SHIFT; i++) {
161 d->Q(i) = 0;
162 }
664e0f19 163 } else {
18592d2e
PB
164 shift = c->B(0);
165 for (int i = 0; i < 1 << SHIFT; i++) {
166 d->Q(i) = FPSLL(s->Q(i), shift);
167 }
664e0f19
FB
168 }
169}
170
18592d2e 171#if SHIFT >= 1
f05f9789 172void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 173{
18592d2e 174 int shift, i, j;
664e0f19 175
18592d2e 176 shift = c->L(0);
e01d9d31 177 if (shift > 16) {
664e0f19 178 shift = 16;
e01d9d31 179 }
18592d2e
PB
180 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
181 for (i = 0; i < 16 - shift; i++) {
182 d->B(j + i) = s->B(j + i + shift);
183 }
184 for (i = 16 - shift; i < 16; i++) {
185 d->B(j + i) = 0;
186 }
e01d9d31 187 }
664e0f19
FB
188}
189
f05f9789 190void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, Reg *c)
664e0f19 191{
18592d2e 192 int shift, i, j;
664e0f19 193
18592d2e 194 shift = c->L(0);
e01d9d31 195 if (shift > 16) {
664e0f19 196 shift = 16;
e01d9d31 197 }
18592d2e
PB
198 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
199 for (i = 15; i >= shift; i--) {
200 d->B(j + i) = s->B(j + i - shift);
201 }
202 for (i = 0; i < shift; i++) {
203 d->B(j + i) = 0;
204 }
e01d9d31 205 }
664e0f19
FB
206}
207#endif
208
/*
 * Define a one-source helper: apply F to each of the first @num elements
 * of @s (accessed through @elem) and store the results in @d.
 */
#define SSE_HELPER_1(name, elem, num, F)                                   \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)              \
    {                                                                      \
        int n = (num);                                                     \
        int i;                                                             \
        for (i = 0; i < n; i++) {                                          \
            d->elem(i) = F(s->elem(i));                                    \
        }                                                                  \
    }
e01d9d31 217
/*
 * Define a two-source helper: for each of the first @num elements
 * (accessed through @elem), store F(v[i], s[i]) in d[i].
 */
#define SSE_HELPER_2(name, elem, num, F)                                   \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)      \
    {                                                                      \
        int n = (num);                                                     \
        int i;                                                             \
        for (i = 0; i < n; i++) {                                          \
            d->elem(i) = F(v->elem(i), s->elem(i));                        \
        }                                                                  \
    }
226
/*
 * Element-size shorthands for SSE_HELPER_2.  The element count doubles
 * with each increment of SHIFT: bytes 8, words 4, longs 2, quads 1 at
 * SHIFT == 0.
 */
#define SSE_HELPER_B(name, F) SSE_HELPER_2(name, B, 8 << SHIFT, F)
#define SSE_HELPER_W(name, F) SSE_HELPER_2(name, W, 4 << SHIFT, F)
#define SSE_HELPER_L(name, F) SSE_HELPER_2(name, L, 2 << SHIFT, F)
#define SSE_HELPER_Q(name, F) SSE_HELPER_2(name, Q, 1 << SHIFT, F)
664e0f19
FB
238
239#if SHIFT == 0
/* Saturate @x to the unsigned 8-bit range [0, 255]. */
static inline int satub(int x)
{
    if (x < 0) {
        return 0;
    }
    return x > 255 ? 255 : x;
}
250
/* Saturate @x to the unsigned 16-bit range [0, 65535]. */
static inline int satuw(int x)
{
    if (x < 0) {
        return 0;
    }
    return x > 65535 ? 65535 : x;
}
261
/* Saturate @x to the signed 8-bit range [-128, 127]. */
static inline int satsb(int x)
{
    if (x > 127) {
        return 127;
    }
    return x < -128 ? -128 : x;
}
272
/* Saturate @x to the signed 16-bit range [-32768, 32767]. */
static inline int satsw(int x)
{
    if (x > 32767) {
        return 32767;
    }
    return x < -32768 ? -32768 : x;
}
283
/*
 * Per-element operations plugged into the SSE_HELPER_* generators.
 * Arguments are element values; the result is truncated to the element
 * width when the generator stores it.
 */

/* Wrapping and saturating add. */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

/* Wrapping and saturating subtract. */
#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))

/*
 * Min/max.  The whole conditional is parenthesized so the macros stay
 * correct when used inside a larger expression.
 */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

/* Bitwise logic; FANDN complements the first operand. */
#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))

/* Comparisons yield all-ones (-1) when true, zero otherwise. */
#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)

/*
 * 16-bit multiplies.  The unsigned forms multiply as uint32_t: two
 * uint16_t operands promote to int, and 0xffff * 0xffff does not fit in
 * a 32-bit int.  The signed forms cannot overflow (|product| <= 2^30).
 */
#define FMULLW(a, b) ((uint32_t)(a) * (uint32_t)(b))
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) (((uint32_t)(a) * (uint32_t)(b)) >> 16)
#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)

/* Average rounded up. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
664e0f19
FB
316#endif
317
5af45186
FB
318SSE_HELPER_B(helper_paddb, FADD)
319SSE_HELPER_W(helper_paddw, FADD)
320SSE_HELPER_L(helper_paddl, FADD)
321SSE_HELPER_Q(helper_paddq, FADD)
664e0f19 322
5af45186
FB
323SSE_HELPER_B(helper_psubb, FSUB)
324SSE_HELPER_W(helper_psubw, FSUB)
325SSE_HELPER_L(helper_psubl, FSUB)
326SSE_HELPER_Q(helper_psubq, FSUB)
664e0f19 327
5af45186
FB
328SSE_HELPER_B(helper_paddusb, FADDUB)
329SSE_HELPER_B(helper_paddsb, FADDSB)
330SSE_HELPER_B(helper_psubusb, FSUBUB)
331SSE_HELPER_B(helper_psubsb, FSUBSB)
664e0f19 332
5af45186
FB
333SSE_HELPER_W(helper_paddusw, FADDUW)
334SSE_HELPER_W(helper_paddsw, FADDSW)
335SSE_HELPER_W(helper_psubusw, FSUBUW)
336SSE_HELPER_W(helper_psubsw, FSUBSW)
664e0f19 337
5af45186
FB
338SSE_HELPER_B(helper_pminub, FMINUB)
339SSE_HELPER_B(helper_pmaxub, FMAXUB)
664e0f19 340
5af45186
FB
341SSE_HELPER_W(helper_pminsw, FMINSW)
342SSE_HELPER_W(helper_pmaxsw, FMAXSW)
664e0f19 343
5af45186
FB
344SSE_HELPER_Q(helper_pand, FAND)
345SSE_HELPER_Q(helper_pandn, FANDN)
346SSE_HELPER_Q(helper_por, FOR)
347SSE_HELPER_Q(helper_pxor, FXOR)
664e0f19 348
5af45186
FB
349SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
350SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
351SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
664e0f19 352
5af45186
FB
353SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
354SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
355SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
664e0f19 356
5af45186 357SSE_HELPER_W(helper_pmullw, FMULLW)
5af45186
FB
358SSE_HELPER_W(helper_pmulhuw, FMULHUW)
359SSE_HELPER_W(helper_pmulhw, FMULHW)
664e0f19 360
f05f9789
PB
361#if SHIFT == 0
362void glue(helper_pmulhrw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
363{
364 d->W(0) = FMULHRW(d->W(0), s->W(0));
365 d->W(1) = FMULHRW(d->W(1), s->W(1));
366 d->W(2) = FMULHRW(d->W(2), s->W(2));
367 d->W(3) = FMULHRW(d->W(3), s->W(3));
368}
369#endif
370
5af45186
FB
371SSE_HELPER_B(helper_pavgb, FAVG)
372SSE_HELPER_W(helper_pavgw, FAVG)
664e0f19 373
f05f9789 374void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
664e0f19 375{
e894bae8
PB
376 int i;
377
378 for (i = 0; i < (1 << SHIFT); i++) {
379 d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2);
380 }
664e0f19
FB
381}
382
f05f9789 383void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
664e0f19
FB
384{
385 int i;
664e0f19 386
e01d9d31 387 for (i = 0; i < (2 << SHIFT); i++) {
e894bae8
PB
388 d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) +
389 (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1);
664e0f19
FB
390 }
391}
392
#if SHIFT == 0
/* Absolute value of @a. */
static inline int abs1(int a)
{
    return a < 0 ? -a : a;
}
#endif
f05f9789 403void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
664e0f19 404{
e894bae8 405 int i;
664e0f19 406
e894bae8
PB
407 for (i = 0; i < (1 << SHIFT); i++) {
408 unsigned int val = 0;
409 val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0));
410 val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1));
411 val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2));
412 val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3));
413 val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4));
414 val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5));
415 val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6));
416 val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7));
417 d->Q(i) = val;
418 }
664e0f19
FB
419}
420
fd17264a 421#if SHIFT < 2
d3eb5eae
BS
422void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
423 target_ulong a0)
664e0f19
FB
424{
425 int i;
e01d9d31
BS
426
427 for (i = 0; i < (8 << SHIFT); i++) {
428 if (s->B(i) & 0x80) {
4054cdec 429 cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
e01d9d31 430 }
664e0f19
FB
431 }
432}
fd17264a 433#endif
664e0f19 434
e01d9d31 435void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
664e0f19 436{
e894bae8
PB
437 int i;
438
5af45186 439 d->L(0) = val;
664e0f19 440 d->L(1) = 0;
e894bae8
PB
441 for (i = 1; i < (1 << SHIFT); i++) {
442 d->Q(i) = 0;
443 }
664e0f19
FB
444}
445
#ifdef TARGET_X86_64
/* Load @val into the low 64 bits of @d and zero the rest of the register. */
void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
{
    int q;

    d->Q(0) = val;
    for (q = (1 << SHIFT) - 1; q >= 1; q--) {
        d->Q(q) = 0;
    }
}
#endif
457
d45b0de6
PB
/*
 * Four-element shuffle of the group starting at @offset.  Each 2-bit
 * field of "order" picks an element: the low two fields select from @a,
 * the high two from @b.  All four are read into r0..r3 before d is
 * written.  The expansion site must provide "order", "d" and r0..r3.
 */
#define SHUFFLE4(F, a, b, offset) do {                  \
        r0 = (a)->F((order & 3) + (offset));            \
        r1 = (a)->F(((order >> 2) & 3) + (offset));     \
        r2 = (b)->F(((order >> 4) & 3) + (offset));     \
        r3 = (b)->F(((order >> 6) & 3) + (offset));     \
        d->F((offset)) = r0;                            \
        d->F((offset) + 1) = r1;                        \
        d->F((offset) + 2) = r2;                        \
        d->F((offset) + 3) = r3;                        \
    } while (0)
468
664e0f19 469#if SHIFT == 0
e01d9d31 470void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 471{
d45b0de6 472 uint16_t r0, r1, r2, r3;
e01d9d31 473
d45b0de6 474 SHUFFLE4(W, s, s, 0);
664e0f19
FB
475}
476#else
f05f9789 477void glue(helper_shufps, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
d52cf7a6 478{
d45b0de6
PB
479 uint32_t r0, r1, r2, r3;
480 int i;
e01d9d31 481
d45b0de6
PB
482 for (i = 0; i < 2 << SHIFT; i += 4) {
483 SHUFFLE4(L, v, s, i);
484 }
d52cf7a6
FB
485}
486
f05f9789 487void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *v, Reg *s, int order)
664e0f19 488{
d45b0de6
PB
489 uint64_t r0, r1;
490 int i;
e01d9d31 491
d45b0de6
PB
492 for (i = 0; i < 1 << SHIFT; i += 2) {
493 r0 = v->Q(((order & 1) & 1) + i);
494 r1 = s->Q(((order >> 1) & 1) + i);
495 d->Q(i) = r0;
496 d->Q(i + 1) = r1;
497 order >>= 2;
498 }
664e0f19
FB
499}
500
e01d9d31 501void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 502{
d45b0de6
PB
503 uint32_t r0, r1, r2, r3;
504 int i;
e01d9d31 505
d45b0de6
PB
506 for (i = 0; i < 2 << SHIFT; i += 4) {
507 SHUFFLE4(L, s, s, i);
508 }
664e0f19
FB
509}
510
e01d9d31 511void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 512{
d45b0de6
PB
513 uint16_t r0, r1, r2, r3;
514 int i, j;
e01d9d31 515
d45b0de6
PB
516 for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
517 SHUFFLE4(W, s, s, i);
518 d->Q(j) = s->Q(j);
519 }
664e0f19
FB
520}
521
e01d9d31 522void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 523{
d45b0de6
PB
524 uint16_t r0, r1, r2, r3;
525 int i, j;
e01d9d31 526
d45b0de6
PB
527 for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
528 d->Q(j) = s->Q(j);
529 SHUFFLE4(W, s, s, i);
530 }
664e0f19
FB
531}
532#endif
533
3403cafe 534#if SHIFT >= 1
664e0f19
FB
535/* FPU ops */
536/* XXX: not accurate */
537
3403cafe
PB
/*
 * Define the packed single ("ps") and packed double ("pd") helpers for
 * one operation.  F is invoked as F(size, a, b) with size 32 or 64.
 */
#define SSE_HELPER_P(name, F)                                           \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,          \
            Reg *d, Reg *v, Reg *s)                                     \
    {                                                                   \
        int i = 0;                                                      \
        while (i < 2 << SHIFT) {                                        \
            d->ZMM_S(i) = F(32, v->ZMM_S(i), s->ZMM_S(i));              \
            i++;                                                        \
        }                                                               \
    }                                                                   \
                                                                        \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,          \
            Reg *d, Reg *v, Reg *s)                                     \
    {                                                                   \
        int i = 0;                                                      \
        while (i < 1 << SHIFT) {                                        \
            d->ZMM_D(i) = F(64, v->ZMM_D(i), s->ZMM_D(i));              \
            i++;                                                        \
        }                                                               \
    }
556
#if SHIFT == 1

/*
 * With SHIFT == 1, also define the scalar "ss" and "sd" helpers next to
 * the packed ones.  The scalar forms compute element 0 from @v and @s
 * and copy the remaining elements from @v.
 */
#define SSE_HELPER_S(name, F)                                           \
    SSE_HELPER_P(name, F)                                               \
                                                                        \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    {                                                                   \
        int i = 1;                                                      \
        d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0));                  \
        while (i < 2 << SHIFT) {                                        \
            d->ZMM_L(i) = v->ZMM_L(i);                                  \
            i++;                                                        \
        }                                                               \
    }                                                                   \
                                                                        \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)\
    {                                                                   \
        int i = 1;                                                      \
        d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0));                  \
        while (i < 1 << SHIFT) {                                        \
            d->ZMM_Q(i) = v->ZMM_Q(i);                                  \
            i++;                                                        \
        }                                                               \
    }

#else

/* For other widths only the packed helpers are generated. */
#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F)

#endif
585
7a0e1f41
FB
586#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
587#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
588#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
589#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
664e0f19 590
a4d1f142
AJ
591/* Note that the choice of comparison op here is important to get the
592 * special cases right: for min and max Intel specifies that (-0,0),
593 * (NaN, anything) and (anything, NaN) return the second argument.
594 */
e01d9d31
BS
595#define FPU_MIN(size, a, b) \
596 (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
597#define FPU_MAX(size, a, b) \
598 (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))
a4d1f142 599
5af45186
FB
600SSE_HELPER_S(add, FPU_ADD)
601SSE_HELPER_S(sub, FPU_SUB)
602SSE_HELPER_S(mul, FPU_MUL)
603SSE_HELPER_S(div, FPU_DIV)
604SSE_HELPER_S(min, FPU_MIN)
605SSE_HELPER_S(max, FPU_MAX)
664e0f19 606
3403cafe
PB
607void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
608{
609 int i;
610 for (i = 0; i < 2 << SHIFT; i++) {
611 d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status);
612 }
613}
614
615void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
616{
617 int i;
618 for (i = 0; i < 1 << SHIFT; i++) {
619 d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status);
620 }
621}
622
#if SHIFT == 1
/*
 * Scalar single-precision square root: element 0 comes from @s, the
 * remaining 32-bit elements are copied from @v.
 */
void helper_sqrtss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int k = 1;

    d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status);
    while (k < 2 << SHIFT) {
        d->ZMM_L(k) = v->ZMM_L(k);
        k++;
    }
}

/*
 * Scalar double-precision square root: element 0 comes from @s, the
 * remaining 64-bit elements are copied from @v.
 */
void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int k = 1;

    d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status);
    while (k < 1 << SHIFT) {
        d->ZMM_Q(k) = v->ZMM_Q(k);
        k++;
    }
}
#endif
664e0f19
FB
642
643/* float to float conversions */
ce4fa29f 644void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 645{
fd17264a
PB
646 int i;
647 for (i = 1 << SHIFT; --i >= 0; ) {
648 d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status);
649 }
664e0f19
FB
650}
651
ce4fa29f 652void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 653{
fd17264a
PB
654 int i;
655 for (i = 0; i < 1 << SHIFT; i++) {
656 d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status);
657 }
658 for (i >>= 1; i < 1 << SHIFT; i++) {
659 d->Q(i) = 0;
660 }
664e0f19
FB
661}
662
#if SHIFT == 1
/*
 * Convert the low single of @s to double in element 0 of @d; the other
 * 64-bit elements are copied from @v.
 */
void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int k = 1;

    d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
    while (k < 1 << SHIFT) {
        d->ZMM_Q(k) = v->ZMM_Q(k);
        k++;
    }
}

/*
 * Convert the low double of @s to single in element 0 of @d; the other
 * 32-bit elements are copied from @v.
 */
void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)
{
    int k = 1;

    d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
    while (k < 2 << SHIFT) {
        d->ZMM_L(k) = v->ZMM_L(k);
        k++;
    }
}
#endif
664e0f19
FB
682
683/* integer to float */
ce4fa29f 684void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 685{
fd17264a
PB
686 int i;
687 for (i = 0; i < 2 << SHIFT; i++) {
688 d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status);
689 }
664e0f19
FB
690}
691
ce4fa29f 692void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 693{
fd17264a
PB
694 int i;
695 for (i = 1 << SHIFT; --i >= 0; ) {
696 int32_t l = s->ZMM_L(i);
697 d->ZMM_D(i) = int32_to_float64(l, &env->sse_status);
698 }
664e0f19
FB
699}
700
#if SHIFT == 1
/* Convert the two 32-bit integers of MMX @s to singles in d[0..1]. */
void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
{
    int k;

    for (k = 0; k < 2; k++) {
        d->ZMM_S(k) = int32_to_float32(s->MMX_L(k), &env->sse_status);
    }
}

/* Convert the two 32-bit integers of MMX @s to doubles in d[0..1]. */
void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
{
    int k;

    for (k = 0; k < 2; k++) {
        d->ZMM_D(k) = int32_to_float64(s->MMX_L(k), &env->sse_status);
    }
}

/* Convert @val with int32_to_float32 and store it in element 0 of @d. */
void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
{
    float_status *st = &env->sse_status;

    d->ZMM_S(0) = int32_to_float32(val, st);
}

/* Convert @val with int32_to_float64 and store it in element 0 of @d. */
void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
{
    float_status *st = &env->sse_status;

    d->ZMM_D(0) = int32_to_float64(val, st);
}

#ifdef TARGET_X86_64
/* Convert @val with int64_to_float32 and store it in element 0 of @d. */
void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
{
    float_status *st = &env->sse_status;

    d->ZMM_S(0) = int64_to_float32(val, st);
}

/* Convert @val with int64_to_float64 and store it in element 0 of @d. */
void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
{
    float_status *st = &env->sse_status;

    d->ZMM_D(0) = int64_to_float64(val, st);
}
#endif

#endif
737
664e0f19 738/* float to integer */
1e8a98b5 739
#if SHIFT == 1
/*
 * x86 requires the indefinite integer value as the result of any
 * float-to-integer conversion that raises the 'invalid' exception.
 * Each wrapper runs the softfloat conversion with the flags cleared,
 * substitutes INDEFVALUE if 'invalid' was raised, and then merges the
 * new flags back into the ones that were set on entry.
 */
#define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE)              \
    static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s)        \
    {                                                                   \
        int saved = get_float_exception_flags(s);                       \
        int raised;                                                     \
        RETTYPE result;                                                 \
                                                                        \
        set_float_exception_flags(0, s);                                \
        result = FN(a, s);                                              \
        raised = get_float_exception_flags(s);                          \
        if (raised & float_flag_invalid) {                              \
            result = INDEFVALUE;                                        \
        }                                                               \
        set_float_exception_flags(raised | saved, s);                   \
        return result;                                                  \
    }

WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
#endif
1e8a98b5 772
ce4fa29f 773void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 774{
fd17264a
PB
775 int i;
776 for (i = 0; i < 2 << SHIFT; i++) {
777 d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status);
778 }
664e0f19
FB
779}
780
ce4fa29f 781void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 782{
fd17264a
PB
783 int i;
784 for (i = 0; i < 1 << SHIFT; i++) {
785 d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status);
786 }
787 for (i >>= 1; i < 1 << SHIFT; i++) {
788 d->Q(i) = 0;
789 }
664e0f19
FB
790}
791
#if SHIFT == 1
/* Convert the two low singles of @s to 32-bit integers in MMX @d. */
void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    int k;

    for (k = 0; k < 2; k++) {
        d->MMX_L(k) = x86_float32_to_int32(s->ZMM_S(k), &env->sse_status);
    }
}

/* Convert the two low doubles of @s to 32-bit integers in MMX @d. */
void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    int k;

    for (k = 0; k < 2; k++) {
        d->MMX_L(k) = x86_float64_to_int32(s->ZMM_D(k), &env->sse_status);
    }
}

/* Convert the low single of @s to a 32-bit integer. */
int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
{
    float_status *st = &env->sse_status;

    return x86_float32_to_int32(s->ZMM_S(0), st);
}

/* Convert the low double of @s to a 32-bit integer. */
int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
{
    float_status *st = &env->sse_status;

    return x86_float64_to_int32(s->ZMM_D(0), st);
}

#ifdef TARGET_X86_64
/* Convert the low single of @s to a 64-bit integer. */
int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
{
    float_status *st = &env->sse_status;

    return x86_float32_to_int64(s->ZMM_S(0), st);
}

/* Convert the low double of @s to a 64-bit integer. */
int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
{
    float_status *st = &env->sse_status;

    return x86_float64_to_int64(s->ZMM_D(0), st);
}
#endif
#endif
664e0f19
FB
827
828/* float to integer truncated */
ce4fa29f 829void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 830{
fd17264a
PB
831 int i;
832 for (i = 0; i < 2 << SHIFT; i++) {
833 d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i),
834 &env->sse_status);
835 }
664e0f19
FB
836}
837
ce4fa29f 838void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 839{
fd17264a
PB
840 int i;
841 for (i = 0; i < 1 << SHIFT; i++) {
842 d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i),
843 &env->sse_status);
844 }
845 for (i >>= 1; i < 1 << SHIFT; i++) {
846 d->Q(i) = 0;
847 }
664e0f19
FB
848}
849
#if SHIFT == 1
/* Truncating conversion of the two low singles of @s into MMX @d. */
void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    int k;

    for (k = 0; k < 2; k++) {
        d->MMX_L(k) = x86_float32_to_int32_round_to_zero(s->ZMM_S(k),
                                                         &env->sse_status);
    }
}

/* Truncating conversion of the two low doubles of @s into MMX @d. */
void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    int k;

    for (k = 0; k < 2; k++) {
        d->MMX_L(k) = x86_float64_to_int32_round_to_zero(s->ZMM_D(k),
                                                         &env->sse_status);
    }
}

/* Truncating conversion of the low single of @s to a 32-bit integer. */
int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
{
    float_status *st = &env->sse_status;

    return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), st);
}

/* Truncating conversion of the low double of @s to a 32-bit integer. */
int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
{
    float_status *st = &env->sse_status;

    return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), st);
}

#ifdef TARGET_X86_64
/* Truncating conversion of the low single of @s to a 64-bit integer. */
int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
{
    float_status *st = &env->sse_status;

    return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), st);
}

/* Truncating conversion of the low double of @s to a 64-bit integer. */
int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
{
    float_status *st = &env->sse_status;

    return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), st);
}
#endif
#endif
664e0f19 885
ce4fa29f 886void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 887{
418b0f93 888 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
3403cafe
PB
889 int i;
890 for (i = 0; i < 2 << SHIFT; i++) {
891 d->ZMM_S(i) = float32_div(float32_one,
892 float32_sqrt(s->ZMM_S(i), &env->sse_status),
893 &env->sse_status);
894 }
418b0f93 895 set_float_exception_flags(old_flags, &env->sse_status);
664e0f19
FB
896}
897
#if SHIFT == 1
/*
 * Scalar reciprocal square root: element 0 is 1 / sqrt(s[0]), the
 * remaining 32-bit elements are copied from @v.  The exception flags are
 * restored to their value on entry.
 */
void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
{
    /*
     * Saved as int, the type WRAP_FLOATCONV in this file uses for the
     * same accessor, so that no flag bit can be lost to narrowing.
     * NOTE(review): the width of the flag word is not visible here;
     * confirm against the softfloat headers.
     */
    int old_flags = get_float_exception_flags(&env->sse_status);
    int i;

    d->ZMM_S(0) = float32_div(float32_one,
                              float32_sqrt(s->ZMM_S(0), &env->sse_status),
                              &env->sse_status);
    set_float_exception_flags(old_flags, &env->sse_status);
    for (i = 1; i < 2 << SHIFT; i++) {
        d->ZMM_L(i) = v->ZMM_L(i);
    }
}
#endif
664e0f19 912
ce4fa29f 913void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 914{
418b0f93 915 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
3403cafe
PB
916 int i;
917 for (i = 0; i < 2 << SHIFT; i++) {
918 d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status);
919 }
418b0f93 920 set_float_exception_flags(old_flags, &env->sse_status);
664e0f19
FB
921}
922
#if SHIFT == 1
/*
 * Scalar reciprocal: element 0 is 1 / s[0], the remaining 32-bit
 * elements are copied from @v.  The exception flags are restored to
 * their value on entry.
 */
void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *v, ZMMReg *s)
{
    /*
     * Saved as int, the type WRAP_FLOATCONV in this file uses for the
     * same accessor, so that no flag bit can be lost to narrowing.
     * NOTE(review): the width of the flag word is not visible here;
     * confirm against the softfloat headers.
     */
    int old_flags = get_float_exception_flags(&env->sse_status);
    int i;

    d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
    for (i = 1; i < 2 << SHIFT; i++) {
        d->ZMM_L(i) = v->ZMM_L(i);
    }
    set_float_exception_flags(old_flags, &env->sse_status);
}
#endif
664e0f19 935
fd17264a 936#if SHIFT == 1
d9f4bb27
AP
/*
 * Extract a bit field from @src: shift right by @shift and keep the low
 * @len bits.  A @len of zero keeps all 64 bits.
 */
static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
{
    uint64_t mask = ~(uint64_t)0;

    if (len != 0) {
        mask = (1ULL << len) - 1;
    }
    return (src >> shift) & mask;
}
948
fa451874 949void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
d9f4bb27 950{
034668c3 951 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1) & 63, s->ZMM_B(0) & 63);
d9f4bb27
AP
952}
953
fa451874 954void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
d9f4bb27 955{
19cbd87c 956 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
d9f4bb27
AP
957}
958
/*
 * Insert the low @len bits of @src into @dest at bit position @shift and
 * return the result; other bits of @dest are kept.  A @len of zero
 * selects all 64 bits.
 */
static inline uint64_t helper_insertq(uint64_t dest, uint64_t src, int shift, int len)
{
    uint64_t mask = ~0ULL;
    uint64_t kept;
    uint64_t field;

    if (len != 0) {
        mask = (1ULL << len) - 1;
    }
    kept = dest & ~(mask << shift);
    field = (src & mask) << shift;
    return kept | field;
}
970
fa451874 971void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
d9f4bb27 972{
ca4b1b43 973 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), s->ZMM_B(9) & 63, s->ZMM_B(8) & 63);
d9f4bb27
AP
974}
975
ca4b1b43 976void helper_insertq_i(CPUX86State *env, ZMMReg *d, ZMMReg *s, int index, int length)
d9f4bb27 977{
ca4b1b43 978 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), s->ZMM_Q(0), index, length);
d9f4bb27 979}
fd17264a 980#endif
d9f4bb27 981
6567ffb4 982#define SSE_HELPER_HPS(name, F) \
f05f9789 983void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
6567ffb4 984{ \
6567ffb4
PB
985 float32 r[2 << SHIFT]; \
986 int i, j, k; \
987 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
988 for (i = j = 0; j < 4; i++, j += 2) { \
989 r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \
990 } \
991 for (j = 0; j < 4; i++, j += 2) { \
992 r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \
993 } \
994 } \
995 for (i = 0; i < 2 << SHIFT; i++) { \
996 d->ZMM_S(i) = r[i]; \
997 } \
998}
999
1000SSE_HELPER_HPS(haddps, float32_add)
1001SSE_HELPER_HPS(hsubps, float32_sub)
1002
1003#define SSE_HELPER_HPD(name, F) \
f05f9789 1004void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
6567ffb4 1005{ \
6567ffb4
PB
1006 float64 r[1 << SHIFT]; \
1007 int i, j, k; \
1008 for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \
1009 for (i = j = 0; j < 2; i++, j += 2) { \
1010 r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \
1011 } \
1012 for (j = 0; j < 2; i++, j += 2) { \
1013 r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \
1014 } \
1015 } \
1016 for (i = 0; i < 1 << SHIFT; i++) { \
1017 d->ZMM_D(i) = r[i]; \
1018 } \
1019}
1020
1021SSE_HELPER_HPD(haddpd, float64_add)
1022SSE_HELPER_HPD(hsubpd, float64_sub)
664e0f19 1023
f05f9789 1024void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
664e0f19 1025{
3403cafe
PB
1026 int i;
1027 for (i = 0; i < 2 << SHIFT; i += 2) {
1028 d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
1029 d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
1030 }
664e0f19
FB
1031}
1032
f05f9789 1033void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
664e0f19 1034{
3403cafe
PB
1035 int i;
1036 for (i = 0; i < 1 << SHIFT; i += 2) {
1037 d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status);
1038 d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status);
1039 }
664e0f19
FB
1040}
1041
cbf4ad54
PB
/*
 * Packed floating-point compare helpers (ps and pd forms).
 *
 * F(size, a, b) performs the comparison and C(...) turns its result into
 * a truth value; each destination element is set to all ones when the
 * predicate holds and to zero otherwise.
 */
#define SSE_HELPER_CMP_P(name, F, C)                                        \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,              \
                                             Reg *d, Reg *v, Reg *s)        \
    {                                                                       \
        int n = 0;                                                          \
        while (n < 2 << SHIFT) {                                            \
            d->ZMM_L(n) = C(F(32, v->ZMM_S(n), s->ZMM_S(n))) ? -1 : 0;      \
            n++;                                                            \
        }                                                                   \
    }                                                                       \
                                                                            \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,              \
                                             Reg *d, Reg *v, Reg *s)        \
    {                                                                       \
        int n = 0;                                                          \
        while (n < 1 << SHIFT) {                                            \
            d->ZMM_Q(n) = C(F(64, v->ZMM_D(n), s->ZMM_D(n))) ? -1 : 0;      \
            n++;                                                            \
        }                                                                   \
    }
1060
#if SHIFT == 1
/*
 * For SHIFT == 1 the packed helpers are accompanied by scalar ss/sd
 * forms: element 0 holds the compare result (all ones or zero) and the
 * remaining elements are copied from v.
 */
#define SSE_HELPER_CMP(name, F, C)                                          \
    SSE_HELPER_CMP_P(name, F, C)                                            \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *v, Reg *s)    \
    {                                                                       \
        int n;                                                              \
        if (C(F(32, v->ZMM_S(0), s->ZMM_S(0)))) {                           \
            d->ZMM_L(0) = -1;                                               \
        } else {                                                            \
            d->ZMM_L(0) = 0;                                                \
        }                                                                   \
        for (n = 1; n < 2 << SHIFT; n++) {                                  \
            d->ZMM_L(n) = v->ZMM_L(n);                                      \
        }                                                                   \
    }                                                                       \
                                                                            \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *v, Reg *s)    \
    {                                                                       \
        int n;                                                              \
        if (C(F(64, v->ZMM_D(0), s->ZMM_D(0)))) {                           \
            d->ZMM_Q(0) = -1;                                               \
        } else {                                                            \
            d->ZMM_Q(0) = 0;                                                \
        }                                                                   \
        for (n = 1; n < 1 << SHIFT; n++) {                                  \
            d->ZMM_Q(n) = v->ZMM_Q(n);                                      \
        }                                                                   \
    }

/* True when the relation is "equal" or "unordered". */
static inline bool FPU_EQU(FloatRelation x)
{
    switch (x) {
    case float_relation_equal:
    case float_relation_unordered:
        return true;
    default:
        return false;
    }
}

/* True when the relation is "equal" or "greater". */
static inline bool FPU_GE(FloatRelation x)
{
    switch (x) {
    case float_relation_equal:
    case float_relation_greater:
        return true;
    default:
        return false;
    }
}

/* Predicates over the relation returned by FPU_CMPQ / FPU_CMPS. */
#define FPU_EQ(x) ((x) == float_relation_equal)
#define FPU_LT(x) ((x) == float_relation_less)
#define FPU_LE(x) ((x) <= float_relation_equal)
#define FPU_GT(x) ((x) == float_relation_greater)
#define FPU_UNORD(x) ((x) == float_relation_unordered)
/* We must make sure we evaluate the argument in case it is a signalling NAN */
#define FPU_FALSE(x) ((x) == float_relation_equal && 0)

/* FPU_CMPQ uses the quiet compare routine, FPU_CMPS the plain one. */
#define FPU_CMPQ(size, a, b) \
    float ## size ## _compare_quiet(a, b, &env->sse_status)
#define FPU_CMPS(size, a, b) \
    float ## size ## _compare(a, b, &env->sse_status)

#else
/* Other widths only get the packed forms. */
#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C)
#endif
1106
1107SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ)
1108SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT)
1109SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE)
1110SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD)
1111SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ)
1112SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT)
1113SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE)
1114SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD)
1115
6e0cac78
PB
1116SSE_HELPER_CMP(cmpequ, FPU_CMPQ, FPU_EQU)
1117SSE_HELPER_CMP(cmpnge, FPU_CMPS, !FPU_GE)
1118SSE_HELPER_CMP(cmpngt, FPU_CMPS, !FPU_GT)
1119SSE_HELPER_CMP(cmpfalse, FPU_CMPQ, FPU_FALSE)
1120SSE_HELPER_CMP(cmpnequ, FPU_CMPQ, !FPU_EQU)
1121SSE_HELPER_CMP(cmpge, FPU_CMPS, FPU_GE)
1122SSE_HELPER_CMP(cmpgt, FPU_CMPS, FPU_GT)
1123SSE_HELPER_CMP(cmptrue, FPU_CMPQ, !FPU_FALSE)
1124
1125SSE_HELPER_CMP(cmpeqs, FPU_CMPS, FPU_EQ)
1126SSE_HELPER_CMP(cmpltq, FPU_CMPQ, FPU_LT)
1127SSE_HELPER_CMP(cmpleq, FPU_CMPQ, FPU_LE)
1128SSE_HELPER_CMP(cmpunords, FPU_CMPS, FPU_UNORD)
1129SSE_HELPER_CMP(cmpneqq, FPU_CMPS, !FPU_EQ)
1130SSE_HELPER_CMP(cmpnltq, FPU_CMPQ, !FPU_LT)
1131SSE_HELPER_CMP(cmpnleq, FPU_CMPQ, !FPU_LE)
1132SSE_HELPER_CMP(cmpords, FPU_CMPS, !FPU_UNORD)
1133
1134SSE_HELPER_CMP(cmpequs, FPU_CMPS, FPU_EQU)
1135SSE_HELPER_CMP(cmpngeq, FPU_CMPQ, !FPU_GE)
1136SSE_HELPER_CMP(cmpngtq, FPU_CMPQ, !FPU_GT)
1137SSE_HELPER_CMP(cmpfalses, FPU_CMPS, FPU_FALSE)
1138SSE_HELPER_CMP(cmpnequs, FPU_CMPS, !FPU_EQU)
1139SSE_HELPER_CMP(cmpgeq, FPU_CMPQ, FPU_GE)
1140SSE_HELPER_CMP(cmpgtq, FPU_CMPQ, FPU_GT)
1141SSE_HELPER_CMP(cmptrues, FPU_CMPS, !FPU_FALSE)
1142
cbf4ad54 1143#undef SSE_HELPER_CMP
664e0f19 1144
fd17264a 1145#if SHIFT == 1
1e6eec8b 1146static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
43fb823b 1147
d3eb5eae 1148void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1149{
71bfd65c 1150 FloatRelation ret;
8422b113 1151 float32 s0, s1;
664e0f19 1152
19cbd87c
EH
1153 s0 = d->ZMM_S(0);
1154 s1 = s->ZMM_S(0);
43fb823b
FB
1155 ret = float32_compare_quiet(s0, s1, &env->sse_status);
1156 CC_SRC = comis_eflags[ret + 1];
664e0f19
FB
1157}
1158
d3eb5eae 1159void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1160{
71bfd65c 1161 FloatRelation ret;
8422b113 1162 float32 s0, s1;
664e0f19 1163
19cbd87c
EH
1164 s0 = d->ZMM_S(0);
1165 s1 = s->ZMM_S(0);
43fb823b
FB
1166 ret = float32_compare(s0, s1, &env->sse_status);
1167 CC_SRC = comis_eflags[ret + 1];
664e0f19
FB
1168}
1169
d3eb5eae 1170void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1171{
71bfd65c 1172 FloatRelation ret;
8422b113 1173 float64 d0, d1;
664e0f19 1174
19cbd87c
EH
1175 d0 = d->ZMM_D(0);
1176 d1 = s->ZMM_D(0);
43fb823b
FB
1177 ret = float64_compare_quiet(d0, d1, &env->sse_status);
1178 CC_SRC = comis_eflags[ret + 1];
664e0f19
FB
1179}
1180
d3eb5eae 1181void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1182{
71bfd65c 1183 FloatRelation ret;
8422b113 1184 float64 d0, d1;
664e0f19 1185
19cbd87c
EH
1186 d0 = d->ZMM_D(0);
1187 d1 = s->ZMM_D(0);
43fb823b
FB
1188 ret = float64_compare(d0, d1, &env->sse_status);
1189 CC_SRC = comis_eflags[ret + 1];
664e0f19 1190}
fd17264a 1191#endif
664e0f19 1192
ce4fa29f 1193uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s)
664e0f19 1194{
fd17264a
PB
1195 uint32_t mask;
1196 int i;
e01d9d31 1197
fd17264a
PB
1198 mask = 0;
1199 for (i = 0; i < 2 << SHIFT; i++) {
1200 mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i);
1201 }
1202 return mask;
664e0f19
FB
1203}
1204
ce4fa29f 1205uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s)
664e0f19 1206{
fd17264a
PB
1207 uint32_t mask;
1208 int i;
e01d9d31 1209
fd17264a
PB
1210 mask = 0;
1211 for (i = 0; i < 1 << SHIFT; i++) {
1212 mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i);
1213 }
1214 return mask;
664e0f19
FB
1215}
1216
1217#endif
1218
d3eb5eae 1219uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
5af45186
FB
1220{
1221 uint32_t val;
e894bae8 1222 int i;
e01d9d31 1223
5af45186 1224 val = 0;
e894bae8
PB
1225 for (i = 0; i < (1 << SHIFT); i++) {
1226 uint8_t byte = 0;
1227 byte |= (s->B(8 * i + 0) >> 7);
1228 byte |= (s->B(8 * i + 1) >> 6) & 0x02;
1229 byte |= (s->B(8 * i + 2) >> 5) & 0x04;
1230 byte |= (s->B(8 * i + 3) >> 4) & 0x08;
1231 byte |= (s->B(8 * i + 4) >> 3) & 0x10;
1232 byte |= (s->B(8 * i + 5) >> 2) & 0x20;
1233 byte |= (s->B(8 * i + 6) >> 1) & 0x40;
1234 byte |= (s->B(8 * i + 7)) & 0x80;
1235 val |= byte << (8 * i);
1236 }
5af45186 1237 return val;
664e0f19
FB
1238}
1239
d45b0de6
PB
1240#define PACK_HELPER_B(name, F) \
1241void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
f05f9789 1242 Reg *d, Reg *v, Reg *s) \
d45b0de6 1243{ \
d45b0de6
PB
1244 uint8_t r[PACK_WIDTH * 2]; \
1245 int j, k; \
1246 for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \
1247 for (k = 0; k < PACK_WIDTH; k++) { \
1248 r[k] = F((int16_t)v->W(j + k)); \
1249 } \
1250 for (k = 0; k < PACK_WIDTH; k++) { \
1251 r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \
1252 } \
1253 for (k = 0; k < PACK_WIDTH * 2; k++) { \
1254 d->B(2 * j + k) = r[k]; \
1255 } \
1256 } \
1257}
1258
1259PACK_HELPER_B(sswb, satsb)
1260PACK_HELPER_B(uswb, satub)
664e0f19 1261
f05f9789 1262void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
664e0f19 1263{
d45b0de6
PB
1264 uint16_t r[PACK_WIDTH];
1265 int j, k;
664e0f19 1266
d45b0de6
PB
1267 for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
1268 for (k = 0; k < PACK_WIDTH / 2; k++) {
1269 r[k] = satsw(v->L(j + k));
1270 }
1271 for (k = 0; k < PACK_WIDTH / 2; k++) {
1272 r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
1273 }
1274 for (k = 0; k < PACK_WIDTH; k++) {
1275 d->W(2 * j + k) = r[k];
1276 }
1277 }
664e0f19
FB
1278}
1279
e01d9d31
BS
1280#define UNPCK_OP(base_name, base) \
1281 \
d3eb5eae 1282 void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
f05f9789 1283 Reg *d, Reg *v, Reg *s) \
e01d9d31 1284 { \
d45b0de6
PB
1285 uint8_t r[PACK_WIDTH * 2]; \
1286 int j, i; \
e01d9d31 1287 \
d45b0de6
PB
1288 for (j = 0; j < 8 << SHIFT; ) { \
1289 int k = j + base * PACK_WIDTH; \
1290 for (i = 0; i < PACK_WIDTH; i++) { \
1291 r[2 * i] = v->B(k + i); \
1292 r[2 * i + 1] = s->B(k + i); \
1293 } \
1294 for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \
1295 d->B(j) = r[i]; \
1296 } \
1297 } \
e01d9d31
BS
1298 } \
1299 \
d3eb5eae 1300 void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
f05f9789 1301 Reg *d, Reg *v, Reg *s) \
e01d9d31 1302 { \
d45b0de6
PB
1303 uint16_t r[PACK_WIDTH]; \
1304 int j, i; \
e01d9d31 1305 \
d45b0de6
PB
1306 for (j = 0; j < 4 << SHIFT; ) { \
1307 int k = j + base * PACK_WIDTH / 2; \
1308 for (i = 0; i < PACK_WIDTH / 2; i++) { \
1309 r[2 * i] = v->W(k + i); \
1310 r[2 * i + 1] = s->W(k + i); \
1311 } \
1312 for (i = 0; i < PACK_WIDTH; i++, j++) { \
1313 d->W(j) = r[i]; \
1314 } \
1315 } \
e01d9d31
BS
1316 } \
1317 \
d3eb5eae 1318 void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
f05f9789 1319 Reg *d, Reg *v, Reg *s) \
e01d9d31 1320 { \
d45b0de6
PB
1321 uint32_t r[PACK_WIDTH / 2]; \
1322 int j, i; \
e01d9d31 1323 \
d45b0de6
PB
1324 for (j = 0; j < 2 << SHIFT; ) { \
1325 int k = j + base * PACK_WIDTH / 4; \
1326 for (i = 0; i < PACK_WIDTH / 4; i++) { \
1327 r[2 * i] = v->L(k + i); \
1328 r[2 * i + 1] = s->L(k + i); \
1329 } \
1330 for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \
1331 d->L(j) = r[i]; \
1332 } \
1333 } \
e01d9d31
BS
1334 } \
1335 \
1336 XMM_ONLY( \
d45b0de6 1337 void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \
f05f9789 1338 CPUX86State *env, Reg *d, Reg *v, Reg *s) \
e01d9d31 1339 { \
d45b0de6
PB
1340 uint64_t r[2]; \
1341 int i; \
e01d9d31 1342 \
d45b0de6
PB
1343 for (i = 0; i < 1 << SHIFT; i += 2) { \
1344 r[0] = v->Q(base + i); \
1345 r[1] = s->Q(base + i); \
1346 d->Q(i) = r[0]; \
1347 d->Q(i + 1) = r[1]; \
1348 } \
e01d9d31
BS
1349 } \
1350 )
664e0f19
FB
1351
1352UNPCK_OP(l, 0)
1353UNPCK_OP(h, 1)
1354
d45b0de6
PB
1355#undef PACK_WIDTH
1356#undef PACK_HELPER_B
1357#undef UNPCK_OP
1358
1359
a35f3ec7
AJ
1360/* 3DNow! float ops */
1361#if SHIFT == 0
d3eb5eae 1362void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1363{
a35f3ec7
AJ
1364 d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
1365 d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
1366}
1367
d3eb5eae 1368void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1369{
a35f3ec7
AJ
1370 d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
1371 d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
1372}
1373
d3eb5eae 1374void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1375{
a35f3ec7
AJ
1376 d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
1377 d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
1378}
1379
d3eb5eae 1380void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1381{
e01d9d31
BS
1382 d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
1383 &env->mmx_status));
1384 d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
1385 &env->mmx_status));
a35f3ec7
AJ
1386}
1387
d3eb5eae 1388void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1389{
25bdec79 1390 float32 r;
e01d9d31 1391
25bdec79
PB
1392 r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1393 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1394 d->MMX_S(0) = r;
a35f3ec7
AJ
1395}
1396
d3eb5eae 1397void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1398{
a35f3ec7
AJ
1399 d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1400 d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1401}
1402
d3eb5eae 1403void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1404{
e01d9d31
BS
1405 d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
1406 &env->mmx_status) ? -1 : 0;
1407 d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
1408 &env->mmx_status) ? -1 : 0;
a35f3ec7
AJ
1409}
1410
d3eb5eae 1411void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1412{
e01d9d31
BS
1413 d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
1414 &env->mmx_status) ? -1 : 0;
1415 d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
1416 &env->mmx_status) ? -1 : 0;
a35f3ec7
AJ
1417}
1418
d3eb5eae 1419void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1420{
e01d9d31
BS
1421 d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
1422 &env->mmx_status) ? -1 : 0;
1423 d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
1424 &env->mmx_status) ? -1 : 0;
a35f3ec7
AJ
1425}
1426
d3eb5eae 1427void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1428{
e01d9d31 1429 if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
a35f3ec7 1430 d->MMX_S(0) = s->MMX_S(0);
e01d9d31
BS
1431 }
1432 if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
a35f3ec7 1433 d->MMX_S(1) = s->MMX_S(1);
e01d9d31 1434 }
a35f3ec7
AJ
1435}
1436
d3eb5eae 1437void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1438{
e01d9d31 1439 if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
a35f3ec7 1440 d->MMX_S(0) = s->MMX_S(0);
e01d9d31
BS
1441 }
1442 if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
a35f3ec7 1443 d->MMX_S(1) = s->MMX_S(1);
e01d9d31 1444 }
a35f3ec7
AJ
1445}
1446
d3eb5eae 1447void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1448{
a35f3ec7
AJ
1449 d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1450 d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1451}
1452
d3eb5eae 1453void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1454{
25bdec79 1455 float32 r;
e01d9d31 1456
25bdec79
PB
1457 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1458 d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1459 d->MMX_S(0) = r;
a35f3ec7
AJ
1460}
1461
d3eb5eae 1462void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1463{
25bdec79 1464 float32 r;
e01d9d31 1465
25bdec79
PB
1466 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1467 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1468 d->MMX_S(0) = r;
a35f3ec7
AJ
1469}
1470
d3eb5eae 1471void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1472{
c2ef9a83 1473 d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
a35f3ec7
AJ
1474 d->MMX_S(1) = d->MMX_S(0);
1475}
1476
d3eb5eae 1477void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1478{
a35f3ec7 1479 d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
c2ef9a83
AJ
1480 d->MMX_S(1) = float32_div(float32_one,
1481 float32_sqrt(d->MMX_S(1), &env->mmx_status),
1482 &env->mmx_status);
a35f3ec7
AJ
1483 d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
1484 d->MMX_L(0) = d->MMX_L(1);
1485}
1486
d3eb5eae 1487void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1488{
a35f3ec7
AJ
1489 d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1490 d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1491}
1492
d3eb5eae 1493void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1494{
a35f3ec7
AJ
1495 d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
1496 d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
1497}
1498
d3eb5eae 1499void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1500{
25bdec79 1501 uint32_t r;
e01d9d31 1502
25bdec79
PB
1503 r = s->MMX_L(0);
1504 d->MMX_L(0) = s->MMX_L(1);
1505 d->MMX_L(1) = r;
a35f3ec7
AJ
1506}
1507#endif
1508
4242b1bd 1509/* SSSE3 op helpers */
f05f9789 1510void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
4242b1bd
AZ
1511{
1512 int i;
d45b0de6
PB
1513#if SHIFT == 0
1514 uint8_t r[8];
4242b1bd 1515
d45b0de6
PB
1516 for (i = 0; i < 8; i++) {
1517 r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
e01d9d31 1518 }
d45b0de6
PB
1519 for (i = 0; i < 8; i++) {
1520 d->B(i) = r[i];
1521 }
1522#else
1523 uint8_t r[8 << SHIFT];
4242b1bd 1524
d45b0de6
PB
1525 for (i = 0; i < 8 << SHIFT; i++) {
1526 int j = i & ~0xf;
1527 r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
1528 }
1529 for (i = 0; i < 8 << SHIFT; i++) {
1530 d->B(i) = r[i];
1531 }
4242b1bd
AZ
1532#endif
1533}
1534
d45b0de6 1535#define SSE_HELPER_HW(name, F) \
f05f9789 1536void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
d45b0de6 1537{ \
d45b0de6
PB
1538 uint16_t r[4 << SHIFT]; \
1539 int i, j, k; \
1540 for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \
1541 for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
1542 r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \
1543 } \
1544 for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
1545 r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \
1546 } \
1547 } \
1548 for (i = 0; i < 4 << SHIFT; i++) { \
1549 d->W(i) = r[i]; \
1550 } \
1551}
1552
1553#define SSE_HELPER_HL(name, F) \
f05f9789 1554void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s) \
d45b0de6 1555{ \
d45b0de6
PB
1556 uint32_t r[2 << SHIFT]; \
1557 int i, j, k; \
1558 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
1559 for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
1560 r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \
1561 } \
1562 for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
1563 r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \
1564 } \
1565 } \
1566 for (i = 0; i < 2 << SHIFT; i++) { \
1567 d->L(i) = r[i]; \
1568 } \
1569}
1570
1571SSE_HELPER_HW(phaddw, FADD)
1572SSE_HELPER_HW(phsubw, FSUB)
1573SSE_HELPER_HW(phaddsw, FADDSW)
1574SSE_HELPER_HW(phsubsw, FSUBSW)
1575SSE_HELPER_HL(phaddd, FADD)
1576SSE_HELPER_HL(phsubd, FSUB)
1577
1578#undef SSE_HELPER_HW
1579#undef SSE_HELPER_HL
4242b1bd 1580
f05f9789 1581void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
4242b1bd 1582{
d45b0de6
PB
1583 int i;
1584 for (i = 0; i < 4 << SHIFT; i++) {
1585 d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
1586 (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
1587 }
4242b1bd
AZ
1588}
1589
ee04a3c8
PB
1590#define FABSB(x) (x > INT8_MAX ? -(int8_t)x : x)
1591#define FABSW(x) (x > INT16_MAX ? -(int16_t)x : x)
1592#define FABSL(x) (x > INT32_MAX ? -(int32_t)x : x)
1593SSE_HELPER_1(helper_pabsb, B, 8 << SHIFT, FABSB)
1594SSE_HELPER_1(helper_pabsw, W, 4 << SHIFT, FABSW)
1595SSE_HELPER_1(helper_pabsd, L, 2 << SHIFT, FABSL)
4242b1bd 1596
e01d9d31 1597#define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15)
4242b1bd
AZ
1598SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
1599
e01d9d31
BS
1600#define FSIGNB(d, s) (s <= INT8_MAX ? s ? d : 0 : -(int8_t)d)
1601#define FSIGNW(d, s) (s <= INT16_MAX ? s ? d : 0 : -(int16_t)d)
1602#define FSIGNL(d, s) (s <= INT32_MAX ? s ? d : 0 : -(int32_t)d)
4242b1bd
AZ
1603SSE_HELPER_B(helper_psignb, FSIGNB)
1604SSE_HELPER_W(helper_psignw, FSIGNW)
1605SSE_HELPER_L(helper_psignd, FSIGNL)
1606
f05f9789 1607void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
d3eb5eae 1608 int32_t shift)
4242b1bd 1609{
d45b0de6 1610 int i;
4242b1bd
AZ
1611
1612 /* XXX could be checked during translation */
d45b0de6
PB
1613 if (shift >= (SHIFT ? 32 : 16)) {
1614 for (i = 0; i < (1 << SHIFT); i++) {
1615 d->Q(i) = 0;
1616 }
4242b1bd
AZ
1617 } else {
1618 shift <<= 3;
1619#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
1620#if SHIFT == 0
d45b0de6
PB
1621 d->Q(0) = SHR(s->Q(0), shift - 0) |
1622 SHR(v->Q(0), shift - 64);
4242b1bd 1623#else
d45b0de6
PB
1624 for (i = 0; i < (1 << SHIFT); i += 2) {
1625 uint64_t r0, r1;
1626
1627 r0 = SHR(s->Q(i), shift - 0) |
1628 SHR(s->Q(i + 1), shift - 64) |
1629 SHR(v->Q(i), shift - 128) |
1630 SHR(v->Q(i + 1), shift - 192);
1631 r1 = SHR(s->Q(i), shift + 64) |
1632 SHR(s->Q(i + 1), shift - 0) |
1633 SHR(v->Q(i), shift - 64) |
1634 SHR(v->Q(i + 1), shift - 128);
1635 d->Q(i) = r0;
1636 d->Q(i + 1) = r1;
1637 }
4242b1bd
AZ
1638#endif
1639#undef SHR
1640 }
4242b1bd
AZ
1641}
1642
0e29cea5 1643#if SHIFT >= 1
222a3336 1644
/*
 * Element-wise three-input helper: d[n] = F(v[n], s[n], m[n]) for 'num'
 * elements accessed through 'elem'.
 */
#define SSE_HELPER_V(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,   \
                            Reg *m)                                     \
    {                                                                   \
        int n = 0;                                                      \
        while (n < num) {                                               \
            d->elem(n) = F(v->elem(n), s->elem(n), m->elem(n));         \
            n++;                                                        \
        }                                                               \
    }
1654
/*
 * Element-wise helper driven by an immediate: d[n] = F(v[n], s[n], bit),
 * where bit is bit (n mod 8) of imm.
 */
#define SSE_HELPER_I(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,   \
                            uint32_t imm)                               \
    {                                                                   \
        int n = 0;                                                      \
        while (n < num) {                                               \
            d->elem(n) = F(v->elem(n), s->elem(n), (imm >> (n & 7)) & 1); \
            n++;                                                        \
        }                                                               \
    }
222a3336
AZ
1665
1666/* SSE4.1 op helpers */
0e29cea5
PB
1667#define FBLENDVB(v, s, m) ((m & 0x80) ? s : v)
1668#define FBLENDVPS(v, s, m) ((m & 0x80000000) ? s : v)
1669#define FBLENDVPD(v, s, m) ((m & 0x8000000000000000LL) ? s : v)
1670SSE_HELPER_V(helper_pblendvb, B, 8 << SHIFT, FBLENDVB)
1671SSE_HELPER_V(helper_blendvps, L, 2 << SHIFT, FBLENDVPS)
1672SSE_HELPER_V(helper_blendvpd, Q, 1 << SHIFT, FBLENDVPD)
222a3336 1673
d3eb5eae 1674void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
222a3336 1675{
e894bae8
PB
1676 uint64_t zf = 0, cf = 0;
1677 int i;
222a3336 1678
e894bae8
PB
1679 for (i = 0; i < 1 << SHIFT; i++) {
1680 zf |= (s->Q(i) & d->Q(i));
1681 cf |= (s->Q(i) & ~d->Q(i));
1682 }
222a3336
AZ
1683 CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
1684}
1685
e894bae8
PB
/*
 * Widening move helper: d[n] = F(n) for 'num' elements.  Elements are
 * processed from the highest index down to zero.
 */
#define SSE_HELPER_F(name, elem, num, F)                        \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)   \
    {                                                           \
        int idx = num;                                          \
        while (idx-- > 0) {                                     \
            d->elem(idx) = F(idx);                              \
        }                                                       \
    }

#if SHIFT > 0
/* Sign-extending variants: the source element is cast to a signed type. */
SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L)
/* Zero-extending variants: the source element is used as is. */
SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L)
#endif
222a3336 1709
f05f9789 1710void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
222a3336 1711{
e894bae8
PB
1712 int i;
1713
1714 for (i = 0; i < 1 << SHIFT; i++) {
1715 d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i);
1716 }
222a3336
AZ
1717}
1718
e01d9d31 1719#define FCMPEQQ(d, s) (d == s ? -1 : 0)
222a3336
AZ
1720SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
1721
f05f9789 1722void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
222a3336 1723{
d45b0de6
PB
1724 uint16_t r[8];
1725 int i, j, k;
1726
1727 for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
1728 r[0] = satuw(v->L(j));
1729 r[1] = satuw(v->L(j + 1));
1730 r[2] = satuw(v->L(j + 2));
1731 r[3] = satuw(v->L(j + 3));
1732 r[4] = satuw(s->L(j));
1733 r[5] = satuw(s->L(j + 1));
1734 r[6] = satuw(s->L(j + 2));
1735 r[7] = satuw(s->L(j + 3));
1736 for (k = 0; k < 8; k++) {
1737 d->W(i + k) = r[k];
1738 }
1739 }
222a3336
AZ
1740}
1741
e01d9d31
BS
1742#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
1743#define FMINSD(d, s) MIN((int32_t)d, (int32_t)s)
1744#define FMAXSB(d, s) MAX((int8_t)d, (int8_t)s)
1745#define FMAXSD(d, s) MAX((int32_t)d, (int32_t)s)
222a3336
AZ
1746SSE_HELPER_B(helper_pminsb, FMINSB)
1747SSE_HELPER_L(helper_pminsd, FMINSD)
1748SSE_HELPER_W(helper_pminuw, MIN)
1749SSE_HELPER_L(helper_pminud, MIN)
1750SSE_HELPER_B(helper_pmaxsb, FMAXSB)
1751SSE_HELPER_L(helper_pmaxsd, FMAXSD)
1752SSE_HELPER_W(helper_pmaxuw, MAX)
1753SSE_HELPER_L(helper_pmaxud, MAX)
1754
e01d9d31 1755#define FMULLD(d, s) ((int32_t)d * (int32_t)s)
222a3336
AZ
1756SSE_HELPER_L(helper_pmulld, FMULLD)
1757
fd17264a 1758#if SHIFT == 1
d3eb5eae 1759void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
222a3336
AZ
1760{
1761 int idx = 0;
1762
e01d9d31 1763 if (s->W(1) < s->W(idx)) {
222a3336 1764 idx = 1;
e01d9d31
BS
1765 }
1766 if (s->W(2) < s->W(idx)) {
222a3336 1767 idx = 2;
e01d9d31
BS
1768 }
1769 if (s->W(3) < s->W(idx)) {
222a3336 1770 idx = 3;
e01d9d31
BS
1771 }
1772 if (s->W(4) < s->W(idx)) {
222a3336 1773 idx = 4;
e01d9d31
BS
1774 }
1775 if (s->W(5) < s->W(idx)) {
222a3336 1776 idx = 5;
e01d9d31
BS
1777 }
1778 if (s->W(6) < s->W(idx)) {
222a3336 1779 idx = 6;
e01d9d31
BS
1780 }
1781 if (s->W(7) < s->W(idx)) {
222a3336 1782 idx = 7;
e01d9d31 1783 }
222a3336 1784
222a3336 1785 d->W(0) = s->W(idx);
aa406fea
JM
1786 d->W(1) = idx;
1787 d->L(1) = 0;
1788 d->Q(1) = 0;
222a3336 1789}
fd17264a 1790#endif
222a3336 1791
d3eb5eae
BS
1792void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1793 uint32_t mode)
222a3336 1794{
418b0f93 1795 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336 1796 signed char prev_rounding_mode;
fd17264a 1797 int i;
222a3336
AZ
1798
1799 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1800 if (!(mode & (1 << 2))) {
222a3336
AZ
1801 switch (mode & 3) {
1802 case 0:
1803 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1804 break;
1805 case 1:
1806 set_float_rounding_mode(float_round_down, &env->sse_status);
1807 break;
1808 case 2:
1809 set_float_rounding_mode(float_round_up, &env->sse_status);
1810 break;
1811 case 3:
1812 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1813 break;
1814 }
e01d9d31 1815 }
222a3336 1816
fd17264a
PB
1817 for (i = 0; i < 2 << SHIFT; i++) {
1818 d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status);
1819 }
222a3336 1820
418b0f93 1821 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1822 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1823 ~float_flag_inexact,
1824 &env->sse_status);
1825 }
222a3336
AZ
1826 env->sse_status.float_rounding_mode = prev_rounding_mode;
1827}
1828
d3eb5eae
BS
1829void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1830 uint32_t mode)
222a3336 1831{
418b0f93 1832 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336 1833 signed char prev_rounding_mode;
fd17264a 1834 int i;
222a3336
AZ
1835
1836 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1837 if (!(mode & (1 << 2))) {
222a3336
AZ
1838 switch (mode & 3) {
1839 case 0:
1840 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1841 break;
1842 case 1:
1843 set_float_rounding_mode(float_round_down, &env->sse_status);
1844 break;
1845 case 2:
1846 set_float_rounding_mode(float_round_up, &env->sse_status);
1847 break;
1848 case 3:
1849 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1850 break;
1851 }
e01d9d31 1852 }
222a3336 1853
fd17264a
PB
1854 for (i = 0; i < 1 << SHIFT; i++) {
1855 d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status);
1856 }
222a3336 1857
418b0f93 1858 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1859 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1860 ~float_flag_inexact,
1861 &env->sse_status);
1862 }
222a3336
AZ
1863 env->sse_status.float_rounding_mode = prev_rounding_mode;
1864}
1865
fd17264a 1866#if SHIFT == 1
620f7556 1867void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
d3eb5eae 1868 uint32_t mode)
222a3336 1869{
418b0f93 1870 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336 1871 signed char prev_rounding_mode;
620f7556 1872 int i;
222a3336
AZ
1873
1874 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1875 if (!(mode & (1 << 2))) {
222a3336
AZ
1876 switch (mode & 3) {
1877 case 0:
1878 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1879 break;
1880 case 1:
1881 set_float_rounding_mode(float_round_down, &env->sse_status);
1882 break;
1883 case 2:
1884 set_float_rounding_mode(float_round_up, &env->sse_status);
1885 break;
1886 case 3:
1887 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1888 break;
1889 }
e01d9d31 1890 }
222a3336 1891
19cbd87c 1892 d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
620f7556
PB
1893 for (i = 1; i < 2 << SHIFT; i++) {
1894 d->ZMM_L(i) = v->ZMM_L(i);
1895 }
222a3336 1896
418b0f93 1897 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1898 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1899 ~float_flag_inexact,
1900 &env->sse_status);
1901 }
222a3336
AZ
1902 env->sse_status.float_rounding_mode = prev_rounding_mode;
1903}
1904
620f7556 1905void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
d3eb5eae 1906 uint32_t mode)
222a3336 1907{
418b0f93 1908 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336 1909 signed char prev_rounding_mode;
620f7556 1910 int i;
222a3336
AZ
1911
1912 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1913 if (!(mode & (1 << 2))) {
222a3336
AZ
1914 switch (mode & 3) {
1915 case 0:
1916 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1917 break;
1918 case 1:
1919 set_float_rounding_mode(float_round_down, &env->sse_status);
1920 break;
1921 case 2:
1922 set_float_rounding_mode(float_round_up, &env->sse_status);
1923 break;
1924 case 3:
1925 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1926 break;
1927 }
e01d9d31 1928 }
222a3336 1929
19cbd87c 1930 d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
620f7556
PB
1931 for (i = 1; i < 1 << SHIFT; i++) {
1932 d->ZMM_Q(i) = v->ZMM_Q(i);
1933 }
222a3336 1934
418b0f93 1935 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1936 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1937 ~float_flag_inexact,
1938 &env->sse_status);
1939 }
222a3336
AZ
1940 env->sse_status.float_rounding_mode = prev_rounding_mode;
1941}
fd17264a 1942#endif
222a3336 1943
0e29cea5
PB
1944#define FBLENDP(v, s, m) (m ? s : v)
1945SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP)
1946SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP)
1947SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP)
222a3336 1948
f05f9789 1949void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
6f218d6e 1950 uint32_t mask)
222a3336 1951{
bf30ad8c 1952 float32 prod1, prod2, temp2, temp3, temp4;
6f218d6e 1953 int i;
222a3336 1954
6f218d6e
PB
1955 for (i = 0; i < 2 << SHIFT; i += 4) {
1956 /*
1957 * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D
1958 * to correctly round the intermediate results
1959 */
1960 if (mask & (1 << 4)) {
1961 prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
1962 } else {
1963 prod1 = float32_zero;
1964 }
1965 if (mask & (1 << 5)) {
1966 prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
1967 } else {
1968 prod2 = float32_zero;
1969 }
1970 temp2 = float32_add(prod1, prod2, &env->sse_status);
1971 if (mask & (1 << 6)) {
1972 prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status);
1973 } else {
1974 prod1 = float32_zero;
1975 }
1976 if (mask & (1 << 7)) {
1977 prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status);
1978 } else {
1979 prod2 = float32_zero;
1980 }
1981 temp3 = float32_add(prod1, prod2, &env->sse_status);
1982 temp4 = float32_add(temp2, temp3, &env->sse_status);
bf30ad8c 1983
6f218d6e
PB
1984 d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero;
1985 d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero;
1986 d->ZMM_S(i+2) = (mask & (1 << 2)) ? temp4 : float32_zero;
1987 d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero;
1988 }
222a3336
AZ
1989}
1990
6f218d6e
PB
1991#if SHIFT == 1
1992/* Oddly, there is no ymm version of dppd */
1993void glue(helper_dppd, SUFFIX)(CPUX86State *env,
f05f9789 1994 Reg *d, Reg *v, Reg *s, uint32_t mask)
222a3336 1995{
bf30ad8c 1996 float64 prod1, prod2, temp2;
222a3336 1997
e01d9d31 1998 if (mask & (1 << 4)) {
6f218d6e 1999 prod1 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
bf30ad8c
PB
2000 } else {
2001 prod1 = float64_zero;
e01d9d31
BS
2002 }
2003 if (mask & (1 << 5)) {
6f218d6e 2004 prod2 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
bf30ad8c
PB
2005 } else {
2006 prod2 = float64_zero;
e01d9d31 2007 }
bf30ad8c
PB
2008 temp2 = float64_add(prod1, prod2, &env->sse_status);
2009 d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero;
2010 d->ZMM_D(1) = (mask & (1 << 1)) ? temp2 : float64_zero;
222a3336 2011}
6f218d6e 2012#endif
222a3336 2013
f05f9789 2014void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
d3eb5eae 2015 uint32_t offset)
222a3336 2016{
d45b0de6
PB
2017 int i, j;
2018 uint16_t r[8];
2019
2020 for (j = 0; j < 4 << SHIFT; ) {
2021 int s0 = (j * 2) + ((offset & 3) << 2);
2022 int d0 = (j * 2) + ((offset & 4) << 0);
2023 for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
2024 r[i] = 0;
2025 r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
2026 r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
2027 r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
2028 r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
2029 }
2030 for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
2031 d->W(j) = r[i];
2032 }
2033 offset >>= 3;
222a3336 2034 }
222a3336
AZ
2035}
2036
2037/* SSE4.2 op helpers */
da5156cd 2038#define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0)
222a3336
AZ
2039SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
2040
fd17264a 2041#if SHIFT == 1
d3eb5eae 2042static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
222a3336 2043{
d1da229f 2044 target_long val, limit;
222a3336
AZ
2045
2046 /* Presence of REX.W is indicated by a bit higher than 7 set */
e01d9d31 2047 if (ctrl >> 8) {
d1da229f 2048 val = (target_long)env->regs[reg];
e01d9d31 2049 } else {
d1da229f 2050 val = (int32_t)env->regs[reg];
e01d9d31 2051 }
222a3336 2052 if (ctrl & 1) {
d1da229f 2053 limit = 8;
e01d9d31 2054 } else {
d1da229f 2055 limit = 16;
e01d9d31 2056 }
d1da229f
PB
2057 if ((val > limit) || (val < -limit)) {
2058 return limit;
2059 }
2060 return abs1(val);
222a3336
AZ
2061}
2062
2063static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
2064{
2065 int val = 0;
2066
2067 if (ctrl & 1) {
e01d9d31 2068 while (val < 8 && r->W(val)) {
222a3336 2069 val++;
e01d9d31
BS
2070 }
2071 } else {
2072 while (val < 16 && r->B(val)) {
222a3336 2073 val++;
e01d9d31
BS
2074 }
2075 }
222a3336
AZ
2076
2077 return val;
2078}
2079
2080static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
2081{
2082 switch ((ctrl >> 0) & 3) {
2083 case 0:
2084 return r->B(i);
2085 case 1:
2086 return r->W(i);
2087 case 2:
e01d9d31 2088 return (int8_t)r->B(i);
222a3336
AZ
2089 case 3:
2090 default:
e01d9d31 2091 return (int16_t)r->W(i);
222a3336
AZ
2092 }
2093}
2094
d3eb5eae 2095static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
e01d9d31 2096 int8_t ctrl, int valids, int validd)
222a3336
AZ
2097{
2098 unsigned int res = 0;
2099 int v;
2100 int j, i;
2101 int upper = (ctrl & 1) ? 7 : 15;
2102
2103 valids--;
2104 validd--;
2105
2106 CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
2107
2108 switch ((ctrl >> 2) & 3) {
2109 case 0:
2110 for (j = valids; j >= 0; j--) {
2111 res <<= 1;
2112 v = pcmp_val(s, ctrl, j);
e01d9d31 2113 for (i = validd; i >= 0; i--) {
222a3336 2114 res |= (v == pcmp_val(d, ctrl, i));
e01d9d31 2115 }
222a3336
AZ
2116 }
2117 break;
2118 case 1:
2119 for (j = valids; j >= 0; j--) {
2120 res <<= 1;
2121 v = pcmp_val(s, ctrl, j);
e01d9d31 2122 for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
649ad05e
AJ
2123 res |= (pcmp_val(d, ctrl, i - 0) >= v &&
2124 pcmp_val(d, ctrl, i - 1) <= v);
e01d9d31 2125 }
222a3336
AZ
2126 }
2127 break;
2128 case 2:
b27a6cac 2129 res = (1 << (upper - MAX(valids, validd))) - 1;
222a3336
AZ
2130 res <<= MAX(valids, validd) - MIN(valids, validd);
2131 for (i = MIN(valids, validd); i >= 0; i--) {
2132 res <<= 1;
2133 v = pcmp_val(s, ctrl, i);
2134 res |= (v == pcmp_val(d, ctrl, i));
2135 }
2136 break;
2137 case 3:
ae35eea7
JM
2138 if (validd == -1) {
2139 res = (2 << upper) - 1;
2140 break;
2141 }
bc921b27 2142 for (j = valids == upper ? valids : valids - validd; j >= 0; j--) {
222a3336 2143 res <<= 1;
75c9527e 2144 v = 1;
bc921b27 2145 for (i = MIN(valids - j, validd); i >= 0; i--) {
75c9527e 2146 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
e01d9d31 2147 }
75c9527e 2148 res |= v;
222a3336
AZ
2149 }
2150 break;
2151 }
2152
2153 switch ((ctrl >> 4) & 3) {
2154 case 1:
2155 res ^= (2 << upper) - 1;
2156 break;
2157 case 3:
e4eba27e 2158 res ^= (1 << (valids + 1)) - 1;
222a3336
AZ
2159 break;
2160 }
2161
e01d9d31
BS
2162 if (res) {
2163 CC_SRC |= CC_C;
2164 }
2165 if (res & 1) {
2166 CC_SRC |= CC_O;
2167 }
222a3336
AZ
2168
2169 return res;
2170}
2171
d3eb5eae
BS
2172void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2173 uint32_t ctrl)
222a3336 2174{
d3eb5eae
BS
2175 unsigned int res = pcmpxstrx(env, d, s, ctrl,
2176 pcmp_elen(env, R_EDX, ctrl),
2177 pcmp_elen(env, R_EAX, ctrl));
222a3336 2178
e01d9d31 2179 if (res) {
c334a388 2180 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
e01d9d31 2181 } else {
222a3336 2182 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
e01d9d31 2183 }
222a3336
AZ
2184}
2185
d3eb5eae
BS
2186void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2187 uint32_t ctrl)
222a3336
AZ
2188{
2189 int i;
d3eb5eae
BS
2190 unsigned int res = pcmpxstrx(env, d, s, ctrl,
2191 pcmp_elen(env, R_EDX, ctrl),
2192 pcmp_elen(env, R_EAX, ctrl));
222a3336
AZ
2193
2194 if ((ctrl >> 6) & 1) {
e01d9d31 2195 if (ctrl & 1) {
bc426899 2196 for (i = 0; i < 8; i++, res >>= 1) {
2b8d7e9d 2197 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
bc426899 2198 }
e01d9d31 2199 } else {
bc426899 2200 for (i = 0; i < 16; i++, res >>= 1) {
2b8d7e9d 2201 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
bc426899 2202 }
e01d9d31 2203 }
222a3336 2204 } else {
2b8d7e9d
AJ
2205 env->xmm_regs[0].Q(1) = 0;
2206 env->xmm_regs[0].Q(0) = res;
222a3336
AZ
2207 }
2208}
2209
d3eb5eae
BS
2210void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2211 uint32_t ctrl)
222a3336 2212{
d3eb5eae 2213 unsigned int res = pcmpxstrx(env, d, s, ctrl,
e01d9d31
BS
2214 pcmp_ilen(s, ctrl),
2215 pcmp_ilen(d, ctrl));
222a3336 2216
e01d9d31 2217 if (res) {
c334a388 2218 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
e01d9d31 2219 } else {
222a3336 2220 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
e01d9d31 2221 }
222a3336
AZ
2222}
2223
d3eb5eae
BS
2224void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2225 uint32_t ctrl)
222a3336
AZ
2226{
2227 int i;
d3eb5eae 2228 unsigned int res = pcmpxstrx(env, d, s, ctrl,
e01d9d31
BS
2229 pcmp_ilen(s, ctrl),
2230 pcmp_ilen(d, ctrl));
222a3336
AZ
2231
2232 if ((ctrl >> 6) & 1) {
e01d9d31 2233 if (ctrl & 1) {
bc426899 2234 for (i = 0; i < 8; i++, res >>= 1) {
2b8d7e9d 2235 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
bc426899 2236 }
e01d9d31 2237 } else {
bc426899 2238 for (i = 0; i < 16; i++, res >>= 1) {
2b8d7e9d 2239 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
bc426899 2240 }
e01d9d31 2241 }
222a3336 2242 } else {
2b8d7e9d
AJ
2243 env->xmm_regs[0].Q(1) = 0;
2244 env->xmm_regs[0].Q(0) = res;
222a3336
AZ
2245 }
2246}
2247
2248#define CRCPOLY 0x1edc6f41
2249#define CRCPOLY_BITREV 0x82f63b78
2250target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
2251{
2252 target_ulong crc = (msg & ((target_ulong) -1 >>
e01d9d31 2253 (TARGET_LONG_BITS - len))) ^ crc1;
222a3336 2254
e01d9d31 2255 while (len--) {
222a3336 2256 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
e01d9d31 2257 }
222a3336
AZ
2258
2259 return crc;
2260}
2261
fd17264a
PB
2262#endif
2263
5a09df21
PB
2264#if SHIFT == 1
/*
 * Carry-less (XOR-accumulating) multiplication of two 64-bit values.
 * The 128-bit product is returned as *dest_l (low half) and *dest_h (high
 * half).
 */
static void clmulq(uint64_t *dest_l, uint64_t *dest_h,
                   uint64_t a, uint64_t b)
{
    uint64_t lo = 0;
    uint64_t hi = 0;
    int bit;

    /* For every set bit of b, XOR in a copy of a shifted to that position. */
    for (bit = 0; b != 0; bit++, b >>= 1) {
        if (!(b & 1)) {
            continue;
        }
        lo ^= a << bit;
        if (bit != 0) {
            /* Bits of a that were shifted out of the low half. */
            hi ^= a >> (64 - bit);
        }
    }

    *dest_l = lo;
    *dest_h = hi;
}
2287#endif
2288
f05f9789 2289void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s,
5a09df21
PB
2290 uint32_t ctrl)
2291{
5a09df21
PB
2292 uint64_t a, b;
2293 int i;
2294
2295 for (i = 0; i < 1 << SHIFT; i += 2) {
2296 a = v->Q(((ctrl & 1) != 0) + i);
2297 b = s->Q(((ctrl & 16) != 0) + i);
2298 clmulq(&d->Q(i), &d->Q(i + 1), a, b);
2299 }
e71827bc 2300}
d640045a 2301
f05f9789 2302void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
d640045a
AJ
2303{
2304 int i;
f05f9789 2305 Reg st = *v;
d640045a
AJ
2306 Reg rk = *s;
2307
a64fc269
PB
2308 for (i = 0 ; i < 2 << SHIFT ; i++) {
2309 int j = i & 3;
2310 d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4 * j + 0])] ^
2311 AES_Td1[st.B(AES_ishifts[4 * j + 1])] ^
2312 AES_Td2[st.B(AES_ishifts[4 * j + 2])] ^
2313 AES_Td3[st.B(AES_ishifts[4 * j + 3])]);
d640045a
AJ
2314 }
2315}
2316
f05f9789 2317void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
d640045a
AJ
2318{
2319 int i;
f05f9789 2320 Reg st = *v;
d640045a
AJ
2321 Reg rk = *s;
2322
a64fc269
PB
2323 for (i = 0; i < 8 << SHIFT; i++) {
2324 d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i & 15] + (i & ~15))]);
d640045a
AJ
2325 }
2326}
2327
f05f9789 2328void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
d640045a
AJ
2329{
2330 int i;
f05f9789 2331 Reg st = *v;
d640045a
AJ
2332 Reg rk = *s;
2333
a64fc269
PB
2334 for (i = 0 ; i < 2 << SHIFT ; i++) {
2335 int j = i & 3;
2336 d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4 * j + 0])] ^
2337 AES_Te1[st.B(AES_shifts[4 * j + 1])] ^
2338 AES_Te2[st.B(AES_shifts[4 * j + 2])] ^
2339 AES_Te3[st.B(AES_shifts[4 * j + 3])]);
d640045a
AJ
2340 }
2341}
2342
f05f9789 2343void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *v, Reg *s)
d640045a
AJ
2344{
2345 int i;
f05f9789 2346 Reg st = *v;
d640045a
AJ
2347 Reg rk = *s;
2348
a64fc269
PB
2349 for (i = 0; i < 8 << SHIFT; i++) {
2350 d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i & 15] + (i & ~15))]);
d640045a 2351 }
d640045a
AJ
2352}
2353
a64fc269 2354#if SHIFT == 1
d640045a
AJ
2355void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2356{
2357 int i;
2358 Reg tmp = *s;
2359
2360 for (i = 0 ; i < 4 ; i++) {
a64fc269
PB
2361 d->L(i) = bswap32(AES_imc[tmp.B(4 * i + 0)][0] ^
2362 AES_imc[tmp.B(4 * i + 1)][1] ^
2363 AES_imc[tmp.B(4 * i + 2)][2] ^
2364 AES_imc[tmp.B(4 * i + 3)][3]);
d640045a
AJ
2365 }
2366}
2367
2368void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2369 uint32_t ctrl)
2370{
2371 int i;
2372 Reg tmp = *s;
2373
2374 for (i = 0 ; i < 4 ; i++) {
9551ea69
AJ
2375 d->B(i) = AES_sbox[tmp.B(i + 4)];
2376 d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
d640045a
AJ
2377 }
2378 d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
2379 d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
2380}
222a3336 2381#endif
a64fc269 2382#endif
222a3336 2383
3403cafe
PB
2384#undef SSE_HELPER_S
2385
b98f886c 2386#undef LANE_WIDTH
664e0f19
FB
2387#undef SHIFT
2388#undef XMM_ONLY
2389#undef Reg
2390#undef B
2391#undef W
2392#undef L
2393#undef Q
2394#undef SUFFIX