]> git.proxmox.com Git - mirror_qemu.git/blame - target/i386/ops_sse.h
target/i386: Misc AVX helper prep
[mirror_qemu.git] / target / i386 / ops_sse.h
CommitLineData
664e0f19 1/*
222a3336 2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
5fafdf24 3 *
664e0f19 4 * Copyright (c) 2005 Fabrice Bellard
222a3336 5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com>
664e0f19
FB
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
d9ff33ad 10 * version 2.1 of the License, or (at your option) any later version.
664e0f19
FB
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
8167ee88 18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
664e0f19 19 */
04af534d 20
6f2945cd 21#include "crypto/aes.h"
04af534d 22
664e0f19
FB
/*
 * Register view for the current SHIFT value.
 *
 * SHIFT == 0 maps Reg and the B/W/L/Q element accessors onto the MMX_*
 * forms and gives helpers the _mmx suffix; any other SHIFT maps them onto
 * the ZMM_* forms with the _xmm suffix.  XMM_ONLY() discards its arguments
 * in the MMX case and passes them through unchanged otherwise.
 */
#if SHIFT == 0
#define Reg MMXReg
#define XMM_ONLY(...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
#define Q(n) MMX_Q(n)
#define SUFFIX _mmx
#else
#define Reg ZMMReg
#define XMM_ONLY(...) __VA_ARGS__
#define B(n) ZMM_B(n)
#define W(n) ZMM_W(n)
#define L(n) ZMM_L(n)
#define Q(n) ZMM_Q(n)
#define SUFFIX _xmm
#endif

/* LANE_WIDTH is 8 when SHIFT is 0 and 16 otherwise; PACK_WIDTH is half that. */
#define LANE_WIDTH (SHIFT ? 16 : 8)
#define PACK_WIDTH (LANE_WIDTH / 2)
18592d2e 43
18592d2e
PB
#if SHIFT == 0
/*
 * Per-element shift primitives: shift x by the count c.
 *
 * FPSRAW/FPSRAL first reinterpret x as a signed 16/32-bit value so that the
 * right shift replicates the sign bit (this relies on >> of a negative
 * value being arithmetic on the host compiler).
 *
 * The count parameter is now actually used; previously these macros ignored
 * it and silently captured a caller variable that had to be named "shift".
 * They are only defined in the SHIFT == 0 pass.
 */
#define FPSRL(x, c) ((x) >> (c))
#define FPSRAW(x, c) ((int16_t)(x) >> (c))
#define FPSRAL(x, c) ((int32_t)(x) >> (c))
#define FPSLL(x, c) ((x) << (c))
#endif
50
51void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 52{
18592d2e 53 Reg *s = d;
664e0f19 54 int shift;
18592d2e
PB
55 if (c->Q(0) > 15) {
56 for (int i = 0; i < 1 << SHIFT; i++) {
57 d->Q(i) = 0;
58 }
664e0f19 59 } else {
18592d2e
PB
60 shift = c->B(0);
61 for (int i = 0; i < 4 << SHIFT; i++) {
62 d->W(i) = FPSRL(s->W(i), shift);
63 }
664e0f19
FB
64 }
65}
66
18592d2e 67void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 68{
18592d2e 69 Reg *s = d;
664e0f19 70 int shift;
18592d2e
PB
71 if (c->Q(0) > 15) {
72 for (int i = 0; i < 1 << SHIFT; i++) {
73 d->Q(i) = 0;
74 }
664e0f19 75 } else {
18592d2e
PB
76 shift = c->B(0);
77 for (int i = 0; i < 4 << SHIFT; i++) {
78 d->W(i) = FPSLL(s->W(i), shift);
79 }
664e0f19 80 }
664e0f19
FB
81}
82
18592d2e 83void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 84{
18592d2e 85 Reg *s = d;
664e0f19 86 int shift;
18592d2e
PB
87 if (c->Q(0) > 15) {
88 shift = 15;
664e0f19 89 } else {
18592d2e
PB
90 shift = c->B(0);
91 }
92 for (int i = 0; i < 4 << SHIFT; i++) {
93 d->W(i) = FPSRAW(s->W(i), shift);
664e0f19
FB
94 }
95}
96
18592d2e 97void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 98{
18592d2e 99 Reg *s = d;
664e0f19 100 int shift;
18592d2e
PB
101 if (c->Q(0) > 31) {
102 for (int i = 0; i < 1 << SHIFT; i++) {
103 d->Q(i) = 0;
104 }
664e0f19 105 } else {
18592d2e
PB
106 shift = c->B(0);
107 for (int i = 0; i < 2 << SHIFT; i++) {
108 d->L(i) = FPSRL(s->L(i), shift);
109 }
664e0f19
FB
110 }
111}
112
18592d2e 113void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 114{
18592d2e 115 Reg *s = d;
664e0f19 116 int shift;
18592d2e
PB
117 if (c->Q(0) > 31) {
118 for (int i = 0; i < 1 << SHIFT; i++) {
119 d->Q(i) = 0;
120 }
664e0f19 121 } else {
18592d2e
PB
122 shift = c->B(0);
123 for (int i = 0; i < 2 << SHIFT; i++) {
124 d->L(i) = FPSLL(s->L(i), shift);
125 }
664e0f19 126 }
664e0f19
FB
127}
128
18592d2e 129void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 130{
18592d2e 131 Reg *s = d;
664e0f19 132 int shift;
18592d2e
PB
133 if (c->Q(0) > 31) {
134 shift = 31;
664e0f19 135 } else {
18592d2e
PB
136 shift = c->B(0);
137 }
138 for (int i = 0; i < 2 << SHIFT; i++) {
139 d->L(i) = FPSRAL(s->L(i), shift);
664e0f19
FB
140 }
141}
142
18592d2e 143void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 144{
18592d2e 145 Reg *s = d;
664e0f19 146 int shift;
18592d2e
PB
147 if (c->Q(0) > 63) {
148 for (int i = 0; i < 1 << SHIFT; i++) {
149 d->Q(i) = 0;
150 }
664e0f19 151 } else {
18592d2e
PB
152 shift = c->B(0);
153 for (int i = 0; i < 1 << SHIFT; i++) {
154 d->Q(i) = FPSRL(s->Q(i), shift);
155 }
664e0f19
FB
156 }
157}
158
18592d2e 159void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 160{
18592d2e 161 Reg *s = d;
664e0f19 162 int shift;
18592d2e
PB
163 if (c->Q(0) > 63) {
164 for (int i = 0; i < 1 << SHIFT; i++) {
165 d->Q(i) = 0;
166 }
664e0f19 167 } else {
18592d2e
PB
168 shift = c->B(0);
169 for (int i = 0; i < 1 << SHIFT; i++) {
170 d->Q(i) = FPSLL(s->Q(i), shift);
171 }
664e0f19
FB
172 }
173}
174
18592d2e
PB
175#if SHIFT >= 1
176void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 177{
18592d2e
PB
178 Reg *s = d;
179 int shift, i, j;
664e0f19 180
18592d2e 181 shift = c->L(0);
e01d9d31 182 if (shift > 16) {
664e0f19 183 shift = 16;
e01d9d31 184 }
18592d2e
PB
185 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
186 for (i = 0; i < 16 - shift; i++) {
187 d->B(j + i) = s->B(j + i + shift);
188 }
189 for (i = 16 - shift; i < 16; i++) {
190 d->B(j + i) = 0;
191 }
e01d9d31 192 }
664e0f19
FB
193}
194
18592d2e 195void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 196{
18592d2e
PB
197 Reg *s = d;
198 int shift, i, j;
664e0f19 199
18592d2e 200 shift = c->L(0);
e01d9d31 201 if (shift > 16) {
664e0f19 202 shift = 16;
e01d9d31 203 }
18592d2e
PB
204 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
205 for (i = 15; i >= shift; i--) {
206 d->B(j + i) = s->B(j + i - shift);
207 }
208 for (i = 0; i < shift; i++) {
209 d->B(j + i) = 0;
210 }
e01d9d31 211 }
664e0f19
FB
212}
213#endif
214
/*
 * Generators for element-wise helpers.
 *
 * SSE_HELPER_1 defines name##SUFFIX(env, d, s) that stores F(s[i]) into
 * d[i] for the first "num" elements accessed through "elem".
 * SSE_HELPER_2 does the same with two inputs, F(d[i], s[i]); the first
 * input is the old value of the destination.
 * SSE_HELPER_B/W/L/Q instantiate SSE_HELPER_2 for byte, word, long and
 * quad elements with the element count scaled by SHIFT.
 */
#define SSE_HELPER_1(name, elem, num, F)                          \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)     \
    {                                                             \
        int n = num;                                              \
        int i;                                                    \
        for (i = 0; i < n; i++) {                                 \
            d->elem(i) = F(s->elem(i));                           \
        }                                                         \
    }

#define SSE_HELPER_2(name, elem, num, F)                          \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)     \
    {                                                             \
        Reg *v = d;                                               \
        int n = num;                                              \
        int i;                                                    \
        for (i = 0; i < n; i++) {                                 \
            d->elem(i) = F(v->elem(i), s->elem(i));               \
        }                                                         \
    }

#define SSE_HELPER_B(name, F) SSE_HELPER_2(name, B, 8 << SHIFT, F)

#define SSE_HELPER_W(name, F) SSE_HELPER_2(name, W, 4 << SHIFT, F)

#define SSE_HELPER_L(name, F) SSE_HELPER_2(name, L, 2 << SHIFT, F)

#define SSE_HELPER_Q(name, F) SSE_HELPER_2(name, Q, 1 << SHIFT, F)
664e0f19
FB
245
246#if SHIFT == 0
/* Clamp x to the unsigned 8-bit range [0, 255]. */
static inline int satub(int x)
{
    return x < 0 ? 0 : (x > 255 ? 255 : x);
}
257
/* Clamp x to the unsigned 16-bit range [0, 65535]. */
static inline int satuw(int x)
{
    return x < 0 ? 0 : (x > 65535 ? 65535 : x);
}
268
/* Clamp x to the signed 8-bit range [-128, 127]. */
static inline int satsb(int x)
{
    return x < -128 ? -128 : (x > 127 ? 127 : x);
}
279
/* Clamp x to the signed 16-bit range [-32768, 32767]. */
static inline int satsw(int x)
{
    return x < -32768 ? -32768 : (x > 32767 ? 32767 : x);
}
290
/*
 * Element operations plugged into the SSE_HELPER_* generators.
 * Each takes the old destination element a and the source element b.
 */

/* Wrapping and saturating add. */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

/* Wrapping and saturating subtract. */
#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))

/*
 * Min/max.  The whole conditional is parenthesised so the macros remain a
 * single operand when used inside a larger expression.
 */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

/* Bitwise operations; FANDN complements the first operand. */
#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))

/* Comparisons yield -1 (all bits set once stored) or 0. */
#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)

/*
 * 16-bit multiplies.  The unsigned forms multiply as uint32_t: two 16-bit
 * operands promoted to int can overflow (0xFFFF * 0xFFFF), which is
 * undefined for signed arithmetic.  The value stored in a 16-bit element is
 * the same as with wrapping int arithmetic.
 */
#define FMULLW(a, b) ((uint32_t)(a) * (uint32_t)(b))
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) (((uint32_t)(a) * (uint32_t)(b)) >> 16)
#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)

/* Average rounded up. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
664e0f19
FB
323#endif
324
5af45186
FB
325SSE_HELPER_B(helper_paddb, FADD)
326SSE_HELPER_W(helper_paddw, FADD)
327SSE_HELPER_L(helper_paddl, FADD)
328SSE_HELPER_Q(helper_paddq, FADD)
664e0f19 329
5af45186
FB
330SSE_HELPER_B(helper_psubb, FSUB)
331SSE_HELPER_W(helper_psubw, FSUB)
332SSE_HELPER_L(helper_psubl, FSUB)
333SSE_HELPER_Q(helper_psubq, FSUB)
664e0f19 334
5af45186
FB
335SSE_HELPER_B(helper_paddusb, FADDUB)
336SSE_HELPER_B(helper_paddsb, FADDSB)
337SSE_HELPER_B(helper_psubusb, FSUBUB)
338SSE_HELPER_B(helper_psubsb, FSUBSB)
664e0f19 339
5af45186
FB
340SSE_HELPER_W(helper_paddusw, FADDUW)
341SSE_HELPER_W(helper_paddsw, FADDSW)
342SSE_HELPER_W(helper_psubusw, FSUBUW)
343SSE_HELPER_W(helper_psubsw, FSUBSW)
664e0f19 344
5af45186
FB
345SSE_HELPER_B(helper_pminub, FMINUB)
346SSE_HELPER_B(helper_pmaxub, FMAXUB)
664e0f19 347
5af45186
FB
348SSE_HELPER_W(helper_pminsw, FMINSW)
349SSE_HELPER_W(helper_pmaxsw, FMAXSW)
664e0f19 350
5af45186
FB
351SSE_HELPER_Q(helper_pand, FAND)
352SSE_HELPER_Q(helper_pandn, FANDN)
353SSE_HELPER_Q(helper_por, FOR)
354SSE_HELPER_Q(helper_pxor, FXOR)
664e0f19 355
5af45186
FB
356SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
357SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
358SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
664e0f19 359
5af45186
FB
360SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
361SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
362SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
664e0f19 363
5af45186 364SSE_HELPER_W(helper_pmullw, FMULLW)
a35f3ec7 365#if SHIFT == 0
5af45186 366SSE_HELPER_W(helper_pmulhrw, FMULHRW)
a35f3ec7 367#endif
5af45186
FB
368SSE_HELPER_W(helper_pmulhuw, FMULHUW)
369SSE_HELPER_W(helper_pmulhw, FMULHW)
664e0f19 370
5af45186
FB
371SSE_HELPER_B(helper_pavgb, FAVG)
372SSE_HELPER_W(helper_pavgw, FAVG)
664e0f19 373
d3eb5eae 374void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 375{
e894bae8
PB
376 Reg *v = d;
377 int i;
378
379 for (i = 0; i < (1 << SHIFT); i++) {
380 d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2);
381 }
664e0f19
FB
382}
383
d3eb5eae 384void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 385{
e894bae8 386 Reg *v = d;
664e0f19 387 int i;
664e0f19 388
e01d9d31 389 for (i = 0; i < (2 << SHIFT); i++) {
e894bae8
PB
390 d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) +
391 (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1);
664e0f19
FB
392 }
393}
394
395#if SHIFT == 0
/* Absolute value of a (negates a when it is below zero). */
static inline int abs1(int a)
{
    return a < 0 ? -a : a;
}
404#endif
e894bae8 405
d3eb5eae 406void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 407{
e894bae8
PB
408 Reg *v = d;
409 int i;
664e0f19 410
e894bae8
PB
411 for (i = 0; i < (1 << SHIFT); i++) {
412 unsigned int val = 0;
413 val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0));
414 val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1));
415 val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2));
416 val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3));
417 val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4));
418 val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5));
419 val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6));
420 val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7));
421 d->Q(i) = val;
422 }
664e0f19
FB
423}
424
fd17264a 425#if SHIFT < 2
d3eb5eae
BS
426void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
427 target_ulong a0)
664e0f19
FB
428{
429 int i;
e01d9d31
BS
430
431 for (i = 0; i < (8 << SHIFT); i++) {
432 if (s->B(i) & 0x80) {
4054cdec 433 cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
e01d9d31 434 }
664e0f19
FB
435 }
436}
fd17264a 437#endif
664e0f19 438
e01d9d31 439void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
664e0f19 440{
e894bae8
PB
441 int i;
442
5af45186 443 d->L(0) = val;
664e0f19 444 d->L(1) = 0;
e894bae8
PB
445 for (i = 1; i < (1 << SHIFT); i++) {
446 d->Q(i) = 0;
447 }
664e0f19
FB
448}
449
dabd98dd 450#ifdef TARGET_X86_64
e01d9d31 451void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
dabd98dd 452{
e894bae8
PB
453 int i;
454
5af45186 455 d->Q(0) = val;
e894bae8
PB
456 for (i = 1; i < (1 << SHIFT); i++) {
457 d->Q(i) = 0;
458 }
dabd98dd 459}
dabd98dd
FB
460#endif
461
d45b0de6
PB
/*
 * Four-element shuffle controlled by the caller's "order" value.
 *
 * Using element accessor F, picks two elements from a (selected by bits 1:0
 * and 3:2 of order) and two from b (bits 5:4 and 7:6), each relative to
 * "offset", and writes them to d at offset .. offset + 3.  All four reads
 * happen before the first write, so a or b may be d itself.
 *
 * Expects r0..r3, order and d to be declared by the caller.
 */
#define SHUFFLE4(F, a, b, offset) do {                  \
        r0 = (a)->F((order & 3) + (offset));            \
        r1 = (a)->F(((order >> 2) & 3) + (offset));     \
        r2 = (b)->F(((order >> 4) & 3) + (offset));     \
        r3 = (b)->F(((order >> 6) & 3) + (offset));     \
        d->F(offset) = r0;                              \
        d->F((offset) + 1) = r1;                        \
        d->F((offset) + 2) = r2;                        \
        d->F((offset) + 3) = r3;                        \
    } while (0)
472
664e0f19 473#if SHIFT == 0
e01d9d31 474void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 475{
d45b0de6 476 uint16_t r0, r1, r2, r3;
e01d9d31 477
d45b0de6 478 SHUFFLE4(W, s, s, 0);
664e0f19
FB
479}
480#else
ce4fa29f 481void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order)
d52cf7a6 482{
d45b0de6
PB
483 Reg *v = d;
484 uint32_t r0, r1, r2, r3;
485 int i;
e01d9d31 486
d45b0de6
PB
487 for (i = 0; i < 2 << SHIFT; i += 4) {
488 SHUFFLE4(L, v, s, i);
489 }
d52cf7a6
FB
490}
491
ce4fa29f 492void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 493{
d45b0de6
PB
494 Reg *v = d;
495 uint64_t r0, r1;
496 int i;
e01d9d31 497
d45b0de6
PB
498 for (i = 0; i < 1 << SHIFT; i += 2) {
499 r0 = v->Q(((order & 1) & 1) + i);
500 r1 = s->Q(((order >> 1) & 1) + i);
501 d->Q(i) = r0;
502 d->Q(i + 1) = r1;
503 order >>= 2;
504 }
664e0f19
FB
505}
506
e01d9d31 507void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 508{
d45b0de6
PB
509 uint32_t r0, r1, r2, r3;
510 int i;
e01d9d31 511
d45b0de6
PB
512 for (i = 0; i < 2 << SHIFT; i += 4) {
513 SHUFFLE4(L, s, s, i);
514 }
664e0f19
FB
515}
516
e01d9d31 517void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 518{
d45b0de6
PB
519 uint16_t r0, r1, r2, r3;
520 int i, j;
e01d9d31 521
d45b0de6
PB
522 for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
523 SHUFFLE4(W, s, s, i);
524 d->Q(j) = s->Q(j);
525 }
664e0f19
FB
526}
527
e01d9d31 528void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 529{
d45b0de6
PB
530 uint16_t r0, r1, r2, r3;
531 int i, j;
e01d9d31 532
d45b0de6
PB
533 for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
534 d->Q(j) = s->Q(j);
535 SHUFFLE4(W, s, s, i);
536 }
664e0f19
FB
537}
538#endif
539
3403cafe 540#if SHIFT >= 1
664e0f19
FB
541/* FPU ops */
542/* XXX: not accurate */
543
3403cafe
PB
/*
 * Generate the packed single (helper_<name>ps) and packed double
 * (helper_<name>pd) forms of a two-operand floating-point operation.
 * F is invoked as F(size, old_d_element, s_element) for each element, with
 * size 32 or 64, and the result is stored back into d.
 */
#define SSE_HELPER_P(name, F)                                             \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,            \
                                             Reg *d, Reg *s)              \
    {                                                                     \
        Reg *v = d;                                                       \
        int n;                                                            \
        for (n = 0; n < 2 << SHIFT; n++) {                                \
            d->ZMM_S(n) = F(32, v->ZMM_S(n), s->ZMM_S(n));                \
        }                                                                 \
    }                                                                     \
                                                                          \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,            \
                                             Reg *d, Reg *s)              \
    {                                                                     \
        Reg *v = d;                                                       \
        int n;                                                            \
        for (n = 0; n < 1 << SHIFT; n++) {                                \
            d->ZMM_D(n) = F(64, v->ZMM_D(n), s->ZMM_D(n));                \
        }                                                                 \
    }
564
/*
 * SSE_HELPER_S generates the packed forms via SSE_HELPER_P and, in the
 * SHIFT == 1 pass only, also the scalar forms helper_<name>ss and
 * helper_<name>sd, which operate on element 0 and leave the other elements
 * of d untouched.
 */
#if SHIFT == 1

#define SSE_HELPER_S(name, F)                                             \
    SSE_HELPER_P(name, F)                                                 \
                                                                          \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)          \
    {                                                                     \
        Reg *v = d;                                                       \
        d->ZMM_S(0) = F(32, v->ZMM_S(0), s->ZMM_S(0));                    \
    }                                                                     \
                                                                          \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)          \
    {                                                                     \
        Reg *v = d;                                                       \
        d->ZMM_D(0) = F(64, v->ZMM_D(0), s->ZMM_D(0));                    \
    }

#else

#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F)

#endif
587
7a0e1f41
FB
588#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
589#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
590#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
591#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
664e0f19 592
a4d1f142
AJ
593/* Note that the choice of comparison op here is important to get the
594 * special cases right: for min and max Intel specifies that (-0,0),
595 * (NaN, anything) and (anything, NaN) return the second argument.
596 */
e01d9d31
BS
597#define FPU_MIN(size, a, b) \
598 (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
599#define FPU_MAX(size, a, b) \
600 (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))
a4d1f142 601
5af45186
FB
602SSE_HELPER_S(add, FPU_ADD)
603SSE_HELPER_S(sub, FPU_SUB)
604SSE_HELPER_S(mul, FPU_MUL)
605SSE_HELPER_S(div, FPU_DIV)
606SSE_HELPER_S(min, FPU_MIN)
607SSE_HELPER_S(max, FPU_MAX)
664e0f19 608
3403cafe
PB
609void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
610{
611 int i;
612 for (i = 0; i < 2 << SHIFT; i++) {
613 d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status);
614 }
615}
616
617void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
618{
619 int i;
620 for (i = 0; i < 1 << SHIFT; i++) {
621 d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status);
622 }
623}
624
#if SHIFT == 1
/* Scalar single-precision square root: only element 0 of d is written. */
void helper_sqrtss(CPUX86State *env, Reg *d, Reg *s)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), fpst);
}

/* Scalar double-precision square root: only element 0 of d is written. */
void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *s)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), fpst);
}
#endif
664e0f19
FB
636
637/* float to float conversions */
ce4fa29f 638void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 639{
fd17264a
PB
640 int i;
641 for (i = 1 << SHIFT; --i >= 0; ) {
642 d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status);
643 }
664e0f19
FB
644}
645
ce4fa29f 646void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 647{
fd17264a
PB
648 int i;
649 for (i = 0; i < 1 << SHIFT; i++) {
650 d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status);
651 }
652 for (i >>= 1; i < 1 << SHIFT; i++) {
653 d->Q(i) = 0;
654 }
664e0f19
FB
655}
656
#if SHIFT == 1
/* Convert single-precision element 0 of s to double-precision element 0 of d. */
void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), fpst);
}

/* Convert double-precision element 0 of s to single-precision element 0 of d. */
void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), fpst);
}
#endif
664e0f19
FB
668
669/* integer to float */
ce4fa29f 670void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 671{
fd17264a
PB
672 int i;
673 for (i = 0; i < 2 << SHIFT; i++) {
674 d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status);
675 }
664e0f19
FB
676}
677
ce4fa29f 678void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 679{
fd17264a
PB
680 int i;
681 for (i = 1 << SHIFT; --i >= 0; ) {
682 int32_t l = s->ZMM_L(i);
683 d->ZMM_D(i) = int32_to_float64(l, &env->sse_status);
684 }
664e0f19
FB
685}
686
#if SHIFT == 1
/* Convert the two 32-bit integers of MMX register s to singles 0 and 1 of d. */
void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), fpst);
    d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), fpst);
}

/* Convert the two 32-bit integers of MMX register s to doubles 0 and 1 of d. */
void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), fpst);
    d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), fpst);
}

/* Convert a 32-bit integer to single precision in element 0 of d. */
void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_S(0) = int32_to_float32(val, fpst);
}

/* Convert a 32-bit integer to double precision in element 0 of d. */
void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_D(0) = int32_to_float64(val, fpst);
}

#ifdef TARGET_X86_64
/* Convert a 64-bit integer to single precision in element 0 of d. */
void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_S(0) = int64_to_float32(val, fpst);
}

/* Convert a 64-bit integer to double precision in element 0 of d. */
void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
{
    float_status *fpst = &env->sse_status;

    d->ZMM_D(0) = int64_to_float64(val, fpst);
}
#endif

#endif
723
664e0f19 724/* float to integer */
1e8a98b5 725
#if SHIFT == 1
/*
 * x86 requires every float-to-integer conversion that raises the 'invalid'
 * exception to return the "indefinite integer" value.  WRAP_FLOATCONV
 * defines x86_<FN>(), a wrapper around softfloat's FN that:
 *   - runs FN with the exception flags temporarily cleared, so it can tell
 *     whether this particular conversion raised 'invalid';
 *   - substitutes INDEFVALUE for the result if it did;
 *   - leaves the status holding the union of the earlier flags and the
 *     flags raised by this conversion.
 */
#define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE)                \
    static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s)          \
    {                                                                     \
        int saved = get_float_exception_flags(s);                         \
        int raised;                                                       \
        RETTYPE result;                                                   \
                                                                          \
        set_float_exception_flags(0, s);                                  \
        result = FN(a, s);                                                \
        raised = get_float_exception_flags(s);                            \
        set_float_exception_flags(raised | saved, s);                     \
        return (raised & float_flag_invalid) ? INDEFVALUE : result;       \
    }

WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
#endif
1e8a98b5 758
ce4fa29f 759void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 760{
fd17264a
PB
761 int i;
762 for (i = 0; i < 2 << SHIFT; i++) {
763 d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status);
764 }
664e0f19
FB
765}
766
ce4fa29f 767void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 768{
fd17264a
PB
769 int i;
770 for (i = 0; i < 1 << SHIFT; i++) {
771 d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status);
772 }
773 for (i >>= 1; i < 1 << SHIFT; i++) {
774 d->Q(i) = 0;
775 }
664e0f19
FB
776}
777
#if SHIFT == 1
/* Convert singles 0 and 1 of s to the two 32-bit integers of MMX register d. */
void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), fpst);
    d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), fpst);
}

/* Convert doubles 0 and 1 of s to the two 32-bit integers of MMX register d. */
void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), fpst);
    d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), fpst);
}

/* Return single-precision element 0 of s converted to a 32-bit integer. */
int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    return x86_float32_to_int32(s->ZMM_S(0), fpst);
}

/* Return double-precision element 0 of s converted to a 32-bit integer. */
int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    return x86_float64_to_int32(s->ZMM_D(0), fpst);
}

#ifdef TARGET_X86_64
/* Return single-precision element 0 of s converted to a 64-bit integer. */
int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    return x86_float32_to_int64(s->ZMM_S(0), fpst);
}

/* Return double-precision element 0 of s converted to a 64-bit integer. */
int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    return x86_float64_to_int64(s->ZMM_D(0), fpst);
}
#endif
#endif
664e0f19
FB
813
814/* float to integer truncated */
ce4fa29f 815void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 816{
fd17264a
PB
817 int i;
818 for (i = 0; i < 2 << SHIFT; i++) {
819 d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i),
820 &env->sse_status);
821 }
664e0f19
FB
822}
823
ce4fa29f 824void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 825{
fd17264a
PB
826 int i;
827 for (i = 0; i < 1 << SHIFT; i++) {
828 d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i),
829 &env->sse_status);
830 }
831 for (i >>= 1; i < 1 << SHIFT; i++) {
832 d->Q(i) = 0;
833 }
664e0f19
FB
834}
835
#if SHIFT == 1
/* Truncating conversion of singles 0 and 1 of s to the integers of MMX d. */
void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), fpst);
    d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), fpst);
}

/* Truncating conversion of doubles 0 and 1 of s to the integers of MMX d. */
void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), fpst);
    d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), fpst);
}

/* Return single-precision element 0 of s truncated to a 32-bit integer. */
int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), fpst);
}

/* Return double-precision element 0 of s truncated to a 32-bit integer. */
int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), fpst);
}

#ifdef TARGET_X86_64
/* Return single-precision element 0 of s truncated to a 64-bit integer. */
int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), fpst);
}

/* Return double-precision element 0 of s truncated to a 64-bit integer. */
int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
{
    float_status *fpst = &env->sse_status;

    return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), fpst);
}
#endif
#endif
664e0f19 871
ce4fa29f 872void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 873{
418b0f93 874 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
3403cafe
PB
875 int i;
876 for (i = 0; i < 2 << SHIFT; i++) {
877 d->ZMM_S(i) = float32_div(float32_one,
878 float32_sqrt(s->ZMM_S(i), &env->sse_status),
879 &env->sse_status);
880 }
418b0f93 881 set_float_exception_flags(old_flags, &env->sse_status);
664e0f19
FB
882}
883
#if SHIFT == 1
/*
 * Scalar reciprocal square root: element 0 of d becomes 1 / sqrt(s[0]).
 * The exception flags are saved beforehand and restored afterwards, so
 * flags raised by the computation are discarded.
 *
 * The saved flags are kept in an int, matching how the float-conversion
 * wrappers in this file hold them; a uint8_t would drop any flag bit above
 * bit 7 on restore.
 */
void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    int old_flags = get_float_exception_flags(&env->sse_status);

    d->ZMM_S(0) = float32_div(float32_one,
                              float32_sqrt(s->ZMM_S(0), &env->sse_status),
                              &env->sse_status);
    set_float_exception_flags(old_flags, &env->sse_status);
}
#endif
664e0f19 894
ce4fa29f 895void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 896{
418b0f93 897 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
3403cafe
PB
898 int i;
899 for (i = 0; i < 2 << SHIFT; i++) {
900 d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status);
901 }
418b0f93 902 set_float_exception_flags(old_flags, &env->sse_status);
664e0f19
FB
903}
904
#if SHIFT == 1
/*
 * Scalar reciprocal: element 0 of d becomes 1 / s[0].  The exception flags
 * are saved beforehand and restored afterwards, so flags raised by the
 * computation are discarded.
 *
 * The saved flags are kept in an int, matching how the float-conversion
 * wrappers in this file hold them; a uint8_t would drop any flag bit above
 * bit 7 on restore.
 */
void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
{
    int old_flags = get_float_exception_flags(&env->sse_status);

    d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
    set_float_exception_flags(old_flags, &env->sse_status);
}
#endif
664e0f19 913
fd17264a 914#if SHIFT == 1
d9f4bb27
AP
/*
 * Extract a bit field from src: returns the "len" bits starting at bit
 * "shift", right-aligned.  A length of 0 selects all 64 bits.
 *
 * Only the low six bits of shift and len are used.  This keeps both shift
 * operations below the 64-bit operand width (a larger count is undefined
 * behaviour in C) even when the caller passes a raw register byte.
 */
static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
{
    uint64_t mask;

    shift &= 63;
    len &= 63;

    if (len == 0) {
        mask = ~0ULL;
    } else {
        mask = (1ULL << len) - 1;
    }
    return (src >> shift) & mask;
}
926
fa451874 927void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
d9f4bb27 928{
19cbd87c 929 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1), s->ZMM_B(0));
d9f4bb27
AP
930}
931
fa451874 932void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
d9f4bb27 933{
19cbd87c 934 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
d9f4bb27
AP
935}
936
/*
 * Copy the low "len" bits of src into the field of src that starts at bit
 * "shift", leaving the bits outside that field as they are in src.  A
 * length of 0 selects all 64 bits.
 *
 * Only the low six bits of shift and len are used.  This keeps every shift
 * operation below the 64-bit operand width (a larger count is undefined
 * behaviour in C).
 */
static inline uint64_t helper_insertq(uint64_t src, int shift, int len)
{
    uint64_t mask;

    shift &= 63;
    len &= 63;

    if (len == 0) {
        mask = ~0ULL;
    } else {
        mask = (1ULL << len) - 1;
    }
    return (src & ~(mask << shift)) | ((src & mask) << shift);
}
948
fa451874 949void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
d9f4bb27 950{
19cbd87c 951 d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8));
d9f4bb27
AP
952}
953
fa451874 954void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length)
d9f4bb27 955{
19cbd87c 956 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length);
d9f4bb27 957}
fd17264a 958#endif
d9f4bb27 959
6567ffb4
PB
960#define SSE_HELPER_HPS(name, F) \
961void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
962{ \
963 Reg *v = d; \
964 float32 r[2 << SHIFT]; \
965 int i, j, k; \
966 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
967 for (i = j = 0; j < 4; i++, j += 2) { \
968 r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \
969 } \
970 for (j = 0; j < 4; i++, j += 2) { \
971 r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \
972 } \
973 } \
974 for (i = 0; i < 2 << SHIFT; i++) { \
975 d->ZMM_S(i) = r[i]; \
976 } \
977}
978
979SSE_HELPER_HPS(haddps, float32_add)
980SSE_HELPER_HPS(hsubps, float32_sub)
981
982#define SSE_HELPER_HPD(name, F) \
983void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
984{ \
985 Reg *v = d; \
986 float64 r[1 << SHIFT]; \
987 int i, j, k; \
988 for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \
989 for (i = j = 0; j < 2; i++, j += 2) { \
990 r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \
991 } \
992 for (j = 0; j < 2; i++, j += 2) { \
993 r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \
994 } \
995 } \
996 for (i = 0; i < 1 << SHIFT; i++) { \
997 d->ZMM_D(i) = r[i]; \
998 } \
999}
1000
1001SSE_HELPER_HPD(haddpd, float64_add)
1002SSE_HELPER_HPD(hsubpd, float64_sub)
664e0f19 1003
3403cafe 1004void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1005{
3403cafe
PB
1006 Reg *v = d;
1007 int i;
1008 for (i = 0; i < 2 << SHIFT; i += 2) {
1009 d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
1010 d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
1011 }
664e0f19
FB
1012}
1013
3403cafe 1014void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1015{
3403cafe
PB
1016 Reg *v = d;
1017 int i;
1018 for (i = 0; i < 1 << SHIFT; i += 2) {
1019 d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status);
1020 d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status);
1021 }
664e0f19
FB
1022}
1023
/*
 * Packed floating-point compares.  Emits helper_<name>ps and
 * helper_<name>pd.  F(size, a, b) yields the relation between two
 * elements and C turns that relation into a truth value; a result
 * element becomes all-ones when C holds and zero otherwise.
 */
#define SSE_HELPER_CMP_P(name, F, C) \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \
                                             Reg *d, Reg *s) \
    { \
        Reg *v = d; \
        int n; \
        for (n = 0; n < 2 << SHIFT; n++) { \
            if (C(F(32, v->ZMM_S(n), s->ZMM_S(n)))) { \
                d->ZMM_L(n) = -1; \
            } else { \
                d->ZMM_L(n) = 0; \
            } \
        } \
    } \
    \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \
                                             Reg *d, Reg *s) \
    { \
        Reg *v = d; \
        int n; \
        for (n = 0; n < 1 << SHIFT; n++) { \
            if (C(F(64, v->ZMM_D(n), s->ZMM_D(n)))) { \
                d->ZMM_Q(n) = -1; \
            } else { \
                d->ZMM_Q(n) = 0; \
            } \
        } \
    }
1044
1045#if SHIFT == 1
1046#define SSE_HELPER_CMP(name, F, C) \
1047 SSE_HELPER_CMP_P(name, F, C) \
1048 void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \
1049 { \
1050 Reg *v = d; \
1051 d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0; \
1052 } \
1053 \
1054 void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \
1055 { \
1056 Reg *v = d; \
1057 d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? -1 : 0; \
e01d9d31
BS
1058 }
1059
cbf4ad54
PB
1060#define FPU_EQ(x) (x == float_relation_equal)
1061#define FPU_LT(x) (x == float_relation_less)
1062#define FPU_LE(x) (x <= float_relation_equal)
1063#define FPU_UNORD(x) (x == float_relation_unordered)
1064
1065#define FPU_CMPQ(size, a, b) \
1066 float ## size ## _compare_quiet(a, b, &env->sse_status)
1067#define FPU_CMPS(size, a, b) \
1068 float ## size ## _compare(a, b, &env->sse_status)
1069
1070#else
1071#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C)
1072#endif
1073
1074SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ)
1075SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT)
1076SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE)
1077SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD)
1078SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ)
1079SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT)
1080SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE)
1081SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD)
1082
1083#undef SSE_HELPER_CMP
664e0f19 1084
#if SHIFT == 1
/*
 * EFLAGS value for each compare outcome, indexed by (relation + 1).
 * NOTE(review): assumes the FloatRelation values run from -1 to 2 in the
 * order less, equal, greater, unordered -- confirm against softfloat.
 */
static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};

/* UCOMISS: quiet compare of the low float32 of d and s; sets CC_SRC. */
void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation rel;

    rel = float32_compare_quiet(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status);
    CC_SRC = comis_eflags[rel + 1];
}

/* COMISS: like UCOMISS but uses float32_compare instead of the quiet one. */
void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation rel;

    rel = float32_compare(d->ZMM_S(0), s->ZMM_S(0), &env->sse_status);
    CC_SRC = comis_eflags[rel + 1];
}

/* UCOMISD: quiet compare of the low float64 of d and s; sets CC_SRC. */
void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation rel;

    rel = float64_compare_quiet(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
    CC_SRC = comis_eflags[rel + 1];
}

/* COMISD: like UCOMISD but uses float64_compare instead of the quiet one. */
void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation rel;

    rel = float64_compare(d->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
    CC_SRC = comis_eflags[rel + 1];
}
#endif
664e0f19 1132
ce4fa29f 1133uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s)
664e0f19 1134{
fd17264a
PB
1135 uint32_t mask;
1136 int i;
e01d9d31 1137
fd17264a
PB
1138 mask = 0;
1139 for (i = 0; i < 2 << SHIFT; i++) {
1140 mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i);
1141 }
1142 return mask;
664e0f19
FB
1143}
1144
ce4fa29f 1145uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s)
664e0f19 1146{
fd17264a
PB
1147 uint32_t mask;
1148 int i;
e01d9d31 1149
fd17264a
PB
1150 mask = 0;
1151 for (i = 0; i < 1 << SHIFT; i++) {
1152 mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i);
1153 }
1154 return mask;
664e0f19
FB
1155}
1156
1157#endif
1158
d3eb5eae 1159uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
5af45186
FB
1160{
1161 uint32_t val;
e894bae8 1162 int i;
e01d9d31 1163
5af45186 1164 val = 0;
e894bae8
PB
1165 for (i = 0; i < (1 << SHIFT); i++) {
1166 uint8_t byte = 0;
1167 byte |= (s->B(8 * i + 0) >> 7);
1168 byte |= (s->B(8 * i + 1) >> 6) & 0x02;
1169 byte |= (s->B(8 * i + 2) >> 5) & 0x04;
1170 byte |= (s->B(8 * i + 3) >> 4) & 0x08;
1171 byte |= (s->B(8 * i + 4) >> 3) & 0x10;
1172 byte |= (s->B(8 * i + 5) >> 2) & 0x20;
1173 byte |= (s->B(8 * i + 6) >> 1) & 0x40;
1174 byte |= (s->B(8 * i + 7)) & 0x80;
1175 val |= byte << (8 * i);
1176 }
5af45186 1177 return val;
664e0f19
FB
1178}
1179
d45b0de6
PB
1180#define PACK_HELPER_B(name, F) \
1181void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
1182 Reg *d, Reg *s) \
1183{ \
1184 Reg *v = d; \
1185 uint8_t r[PACK_WIDTH * 2]; \
1186 int j, k; \
1187 for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \
1188 for (k = 0; k < PACK_WIDTH; k++) { \
1189 r[k] = F((int16_t)v->W(j + k)); \
1190 } \
1191 for (k = 0; k < PACK_WIDTH; k++) { \
1192 r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \
1193 } \
1194 for (k = 0; k < PACK_WIDTH * 2; k++) { \
1195 d->B(2 * j + k) = r[k]; \
1196 } \
1197 } \
1198}
1199
1200PACK_HELPER_B(sswb, satsb)
1201PACK_HELPER_B(uswb, satub)
664e0f19 1202
d3eb5eae 1203void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1204{
d45b0de6
PB
1205 Reg *v = d;
1206 uint16_t r[PACK_WIDTH];
1207 int j, k;
664e0f19 1208
d45b0de6
PB
1209 for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
1210 for (k = 0; k < PACK_WIDTH / 2; k++) {
1211 r[k] = satsw(v->L(j + k));
1212 }
1213 for (k = 0; k < PACK_WIDTH / 2; k++) {
1214 r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
1215 }
1216 for (k = 0; k < PACK_WIDTH; k++) {
1217 d->W(2 * j + k) = r[k];
1218 }
1219 }
664e0f19
FB
1220}
1221
e01d9d31
BS
1222#define UNPCK_OP(base_name, base) \
1223 \
d3eb5eae 1224 void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
d45b0de6 1225 Reg *d, Reg *s) \
e01d9d31 1226 { \
d45b0de6
PB
1227 Reg *v = d; \
1228 uint8_t r[PACK_WIDTH * 2]; \
1229 int j, i; \
e01d9d31 1230 \
d45b0de6
PB
1231 for (j = 0; j < 8 << SHIFT; ) { \
1232 int k = j + base * PACK_WIDTH; \
1233 for (i = 0; i < PACK_WIDTH; i++) { \
1234 r[2 * i] = v->B(k + i); \
1235 r[2 * i + 1] = s->B(k + i); \
1236 } \
1237 for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \
1238 d->B(j) = r[i]; \
1239 } \
1240 } \
e01d9d31
BS
1241 } \
1242 \
d3eb5eae 1243 void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
d45b0de6 1244 Reg *d, Reg *s) \
e01d9d31 1245 { \
d45b0de6
PB
1246 Reg *v = d; \
1247 uint16_t r[PACK_WIDTH]; \
1248 int j, i; \
e01d9d31 1249 \
d45b0de6
PB
1250 for (j = 0; j < 4 << SHIFT; ) { \
1251 int k = j + base * PACK_WIDTH / 2; \
1252 for (i = 0; i < PACK_WIDTH / 2; i++) { \
1253 r[2 * i] = v->W(k + i); \
1254 r[2 * i + 1] = s->W(k + i); \
1255 } \
1256 for (i = 0; i < PACK_WIDTH; i++, j++) { \
1257 d->W(j) = r[i]; \
1258 } \
1259 } \
e01d9d31
BS
1260 } \
1261 \
d3eb5eae 1262 void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
d45b0de6 1263 Reg *d, Reg *s) \
e01d9d31 1264 { \
d45b0de6
PB
1265 Reg *v = d; \
1266 uint32_t r[PACK_WIDTH / 2]; \
1267 int j, i; \
e01d9d31 1268 \
d45b0de6
PB
1269 for (j = 0; j < 2 << SHIFT; ) { \
1270 int k = j + base * PACK_WIDTH / 4; \
1271 for (i = 0; i < PACK_WIDTH / 4; i++) { \
1272 r[2 * i] = v->L(k + i); \
1273 r[2 * i + 1] = s->L(k + i); \
1274 } \
1275 for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \
1276 d->L(j) = r[i]; \
1277 } \
1278 } \
e01d9d31
BS
1279 } \
1280 \
1281 XMM_ONLY( \
d45b0de6
PB
1282 void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \
1283 CPUX86State *env, Reg *d, Reg *s) \
e01d9d31 1284 { \
d45b0de6
PB
1285 Reg *v = d; \
1286 uint64_t r[2]; \
1287 int i; \
e01d9d31 1288 \
d45b0de6
PB
1289 for (i = 0; i < 1 << SHIFT; i += 2) { \
1290 r[0] = v->Q(base + i); \
1291 r[1] = s->Q(base + i); \
1292 d->Q(i) = r[0]; \
1293 d->Q(i + 1) = r[1]; \
1294 } \
e01d9d31
BS
1295 } \
1296 )
664e0f19
FB
1297
1298UNPCK_OP(l, 0)
1299UNPCK_OP(h, 1)
1300
d45b0de6
PB
1301#undef PACK_WIDTH
1302#undef PACK_HELPER_B
1303#undef UNPCK_OP
1304
1305
a35f3ec7
AJ
1306/* 3DNow! float ops */
1307#if SHIFT == 0
d3eb5eae 1308void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1309{
a35f3ec7
AJ
1310 d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
1311 d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
1312}
1313
d3eb5eae 1314void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1315{
a35f3ec7
AJ
1316 d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
1317 d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
1318}
1319
d3eb5eae 1320void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1321{
a35f3ec7
AJ
1322 d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
1323 d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
1324}
1325
d3eb5eae 1326void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1327{
e01d9d31
BS
1328 d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
1329 &env->mmx_status));
1330 d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
1331 &env->mmx_status));
a35f3ec7
AJ
1332}
1333
d3eb5eae 1334void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1335{
25bdec79 1336 float32 r;
e01d9d31 1337
25bdec79
PB
1338 r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1339 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1340 d->MMX_S(0) = r;
a35f3ec7
AJ
1341}
1342
d3eb5eae 1343void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1344{
a35f3ec7
AJ
1345 d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1346 d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1347}
1348
d3eb5eae 1349void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1350{
e01d9d31
BS
1351 d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
1352 &env->mmx_status) ? -1 : 0;
1353 d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
1354 &env->mmx_status) ? -1 : 0;
a35f3ec7
AJ
1355}
1356
d3eb5eae 1357void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1358{
e01d9d31
BS
1359 d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
1360 &env->mmx_status) ? -1 : 0;
1361 d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
1362 &env->mmx_status) ? -1 : 0;
a35f3ec7
AJ
1363}
1364
d3eb5eae 1365void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1366{
e01d9d31
BS
1367 d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
1368 &env->mmx_status) ? -1 : 0;
1369 d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
1370 &env->mmx_status) ? -1 : 0;
a35f3ec7
AJ
1371}
1372
d3eb5eae 1373void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1374{
e01d9d31 1375 if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
a35f3ec7 1376 d->MMX_S(0) = s->MMX_S(0);
e01d9d31
BS
1377 }
1378 if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
a35f3ec7 1379 d->MMX_S(1) = s->MMX_S(1);
e01d9d31 1380 }
a35f3ec7
AJ
1381}
1382
d3eb5eae 1383void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1384{
e01d9d31 1385 if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
a35f3ec7 1386 d->MMX_S(0) = s->MMX_S(0);
e01d9d31
BS
1387 }
1388 if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
a35f3ec7 1389 d->MMX_S(1) = s->MMX_S(1);
e01d9d31 1390 }
a35f3ec7
AJ
1391}
1392
d3eb5eae 1393void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1394{
a35f3ec7
AJ
1395 d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1396 d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1397}
1398
d3eb5eae 1399void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1400{
25bdec79 1401 float32 r;
e01d9d31 1402
25bdec79
PB
1403 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1404 d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1405 d->MMX_S(0) = r;
a35f3ec7
AJ
1406}
1407
d3eb5eae 1408void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1409{
25bdec79 1410 float32 r;
e01d9d31 1411
25bdec79
PB
1412 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1413 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1414 d->MMX_S(0) = r;
a35f3ec7
AJ
1415}
1416
d3eb5eae 1417void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1418{
c2ef9a83 1419 d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
a35f3ec7
AJ
1420 d->MMX_S(1) = d->MMX_S(0);
1421}
1422
d3eb5eae 1423void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1424{
a35f3ec7 1425 d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
c2ef9a83
AJ
1426 d->MMX_S(1) = float32_div(float32_one,
1427 float32_sqrt(d->MMX_S(1), &env->mmx_status),
1428 &env->mmx_status);
a35f3ec7
AJ
1429 d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
1430 d->MMX_L(0) = d->MMX_L(1);
1431}
1432
d3eb5eae 1433void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1434{
a35f3ec7
AJ
1435 d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1436 d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1437}
1438
d3eb5eae 1439void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1440{
a35f3ec7
AJ
1441 d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
1442 d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
1443}
1444
d3eb5eae 1445void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1446{
25bdec79 1447 uint32_t r;
e01d9d31 1448
25bdec79
PB
1449 r = s->MMX_L(0);
1450 d->MMX_L(0) = s->MMX_L(1);
1451 d->MMX_L(1) = r;
a35f3ec7
AJ
1452}
1453#endif
1454
4242b1bd 1455/* SSSE3 op helpers */
d3eb5eae 1456void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
4242b1bd 1457{
d45b0de6 1458 Reg *v = d;
4242b1bd 1459 int i;
d45b0de6
PB
1460#if SHIFT == 0
1461 uint8_t r[8];
4242b1bd 1462
d45b0de6
PB
1463 for (i = 0; i < 8; i++) {
1464 r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
e01d9d31 1465 }
d45b0de6
PB
1466 for (i = 0; i < 8; i++) {
1467 d->B(i) = r[i];
1468 }
1469#else
1470 uint8_t r[8 << SHIFT];
4242b1bd 1471
d45b0de6
PB
1472 for (i = 0; i < 8 << SHIFT; i++) {
1473 int j = i & ~0xf;
1474 r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
1475 }
1476 for (i = 0; i < 8 << SHIFT; i++) {
1477 d->B(i) = r[i];
1478 }
4242b1bd
AZ
1479#endif
1480}
1481
d45b0de6
PB
1482#define SSE_HELPER_HW(name, F) \
1483void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
1484{ \
1485 Reg *v = d; \
1486 uint16_t r[4 << SHIFT]; \
1487 int i, j, k; \
1488 for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \
1489 for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
1490 r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \
1491 } \
1492 for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
1493 r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \
1494 } \
1495 } \
1496 for (i = 0; i < 4 << SHIFT; i++) { \
1497 d->W(i) = r[i]; \
1498 } \
1499}
1500
1501#define SSE_HELPER_HL(name, F) \
1502void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
1503{ \
1504 Reg *v = d; \
1505 uint32_t r[2 << SHIFT]; \
1506 int i, j, k; \
1507 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
1508 for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
1509 r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \
1510 } \
1511 for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
1512 r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \
1513 } \
1514 } \
1515 for (i = 0; i < 2 << SHIFT; i++) { \
1516 d->L(i) = r[i]; \
1517 } \
1518}
1519
1520SSE_HELPER_HW(phaddw, FADD)
1521SSE_HELPER_HW(phsubw, FSUB)
1522SSE_HELPER_HW(phaddsw, FADDSW)
1523SSE_HELPER_HW(phsubsw, FSUBSW)
1524SSE_HELPER_HL(phaddd, FADD)
1525SSE_HELPER_HL(phsubd, FSUB)
1526
1527#undef SSE_HELPER_HW
1528#undef SSE_HELPER_HL
4242b1bd 1529
d45b0de6 1530void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
4242b1bd 1531{
d45b0de6
PB
1532 Reg *v = d;
1533 int i;
1534 for (i = 0; i < 4 << SHIFT; i++) {
1535 d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
1536 (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
1537 }
4242b1bd
AZ
1538}
1539
/*
 * Absolute value of an unsigned element holding a two's-complement
 * number: values above the signed maximum are negated.  The argument is
 * parenthesized so any expression can be passed safely.
 */
#define FABSB(x) ((x) > INT8_MAX  ? -(int8_t)(x) : (x))
#define FABSW(x) ((x) > INT16_MAX ? -(int16_t)(x) : (x))
#define FABSL(x) ((x) > INT32_MAX ? -(int32_t)(x) : (x))
1543SSE_HELPER_1(helper_pabsb, B, 8 << SHIFT, FABSB)
1544SSE_HELPER_1(helper_pabsw, W, 4 << SHIFT, FABSW)
1545SSE_HELPER_1(helper_pabsd, L, 2 << SHIFT, FABSL)
4242b1bd 1546
/*
 * Signed 16x16 multiply, rounded: adds 0x4000 and shifts right by 15.
 * Arguments are parenthesized so any expression can be passed safely.
 */
#define FMULHRSW(d, s) (((int16_t)(d) * (int16_t)(s) + 0x4000) >> 15)
4242b1bd
AZ
1548SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
1549
/*
 * PSIGN element operation: result is d when s is positive, 0 when s is
 * zero and -d when s (an unsigned element holding a two's-complement
 * value) is above the signed maximum, i.e. negative.  Arguments are
 * parenthesized so any expression can be passed safely.
 */
#define FSIGNB(d, s) ((s) <= INT8_MAX  ? (s) ? (d) : 0 : -(int8_t)(d))
#define FSIGNW(d, s) ((s) <= INT16_MAX ? (s) ? (d) : 0 : -(int16_t)(d))
#define FSIGNL(d, s) ((s) <= INT32_MAX ? (s) ? (d) : 0 : -(int32_t)(d))
4242b1bd
AZ
1553SSE_HELPER_B(helper_psignb, FSIGNB)
1554SSE_HELPER_W(helper_psignw, FSIGNW)
1555SSE_HELPER_L(helper_psignd, FSIGNL)
1556
d3eb5eae
BS
1557void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1558 int32_t shift)
4242b1bd 1559{
d45b0de6
PB
1560 Reg *v = d;
1561 int i;
4242b1bd
AZ
1562
1563 /* XXX could be checked during translation */
d45b0de6
PB
1564 if (shift >= (SHIFT ? 32 : 16)) {
1565 for (i = 0; i < (1 << SHIFT); i++) {
1566 d->Q(i) = 0;
1567 }
4242b1bd
AZ
1568 } else {
1569 shift <<= 3;
1570#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
1571#if SHIFT == 0
d45b0de6
PB
1572 d->Q(0) = SHR(s->Q(0), shift - 0) |
1573 SHR(v->Q(0), shift - 64);
4242b1bd 1574#else
d45b0de6
PB
1575 for (i = 0; i < (1 << SHIFT); i += 2) {
1576 uint64_t r0, r1;
1577
1578 r0 = SHR(s->Q(i), shift - 0) |
1579 SHR(s->Q(i + 1), shift - 64) |
1580 SHR(v->Q(i), shift - 128) |
1581 SHR(v->Q(i + 1), shift - 192);
1582 r1 = SHR(s->Q(i), shift + 64) |
1583 SHR(s->Q(i + 1), shift - 0) |
1584 SHR(v->Q(i), shift - 64) |
1585 SHR(v->Q(i + 1), shift - 128);
1586 d->Q(i) = r0;
1587 d->Q(i + 1) = r1;
1588 }
4242b1bd
AZ
1589#endif
1590#undef SHR
1591 }
4242b1bd
AZ
1592}
1593
e01d9d31 1594#define XMM0 (env->xmm_regs[0])
222a3336
AZ
1595
1596#if SHIFT == 1
/*
 * Variable-blend helper: for each of the `num` elements,
 * d[i] = F(d[i], s[i], XMM0[i]), with XMM0 supplying the per-element
 * selector.
 *
 * Written as a loop instead of the former hand-unrolled ladder, which
 * only handled num of 2, 4, 8 or 16 and ignored elements beyond 16.
 * Elements are still processed in ascending order and each iteration
 * touches index i only.
 */
#define SSE_HELPER_V(name, elem, num, F) \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
    { \
        int i; \
        for (i = 0; i < (num); i++) { \
            d->elem(i) = F(d->elem(i), s->elem(i), XMM0.elem(i)); \
        } \
    }
1623
/*
 * Immediate-blend helper: for each of the `num` elements,
 * d[i] = F(d[i], s[i], bit i of imm).
 *
 * Written as a loop instead of the former hand-unrolled ladder, which
 * only handled num of 2, 4, 8 or 16.  Elements are still processed in
 * ascending order and each iteration touches index i only.
 */
#define SSE_HELPER_I(name, elem, num, F) \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s, uint32_t imm) \
    { \
        int i; \
        for (i = 0; i < (num); i++) { \
            d->elem(i) = F(d->elem(i), s->elem(i), ((imm >> i) & 1)); \
        } \
    }
222a3336
AZ
1656
1657/* SSE4.1 op helpers */
/*
 * Variable blend: pick s when the top bit of the mask element m is set,
 * d otherwise.  Arguments are parenthesized so any expression can be
 * passed safely.
 */
#define FBLENDVB(d, s, m) (((m) & 0x80) ? (s) : (d))
#define FBLENDVPS(d, s, m) (((m) & 0x80000000) ? (s) : (d))
#define FBLENDVPD(d, s, m) (((m) & 0x8000000000000000LL) ? (s) : (d))
222a3336
AZ
1661SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB)
1662SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS)
1663SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD)
1664
d3eb5eae 1665void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
222a3336 1666{
e894bae8
PB
1667 uint64_t zf = 0, cf = 0;
1668 int i;
222a3336 1669
e894bae8
PB
1670 for (i = 0; i < 1 << SHIFT; i++) {
1671 zf |= (s->Q(i) & d->Q(i));
1672 cf |= (s->Q(i) & ~d->Q(i));
1673 }
222a3336
AZ
1674 CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
1675}
1676
/*
 * Element-widening helper: d[i] = F(i) for i from num - 1 down to 0.
 * NOTE(review): the descending order presumably lets d alias s, since a
 * widened element at index i never lies below narrow source element i.
 */
#define SSE_HELPER_F(name, elem, num, F) \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
    { \
        int i = num; \
        while (i-- > 0) { \
            d->elem(i) = F(i); \
        } \
    }

#if SHIFT > 0
/* Sign-extending moves: the cast makes the source element signed. */
SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L)
/* Zero-extending moves: the source element is used as is. */
SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L)
#endif
222a3336 1700
d3eb5eae 1701void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
222a3336 1702{
e894bae8
PB
1703 Reg *v = d;
1704 int i;
1705
1706 for (i = 0; i < 1 << SHIFT; i++) {
1707 d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i);
1708 }
222a3336
AZ
1709}
1710
/*
 * 64-bit equality: all-ones when d == s, zero otherwise.  Arguments are
 * parenthesized so any expression can be passed safely.
 */
#define FCMPEQQ(d, s) ((d) == (s) ? -1 : 0)
222a3336
AZ
1712SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
1713
d3eb5eae 1714void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
222a3336 1715{
d45b0de6
PB
1716 Reg *v = d;
1717 uint16_t r[8];
1718 int i, j, k;
1719
1720 for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
1721 r[0] = satuw(v->L(j));
1722 r[1] = satuw(v->L(j + 1));
1723 r[2] = satuw(v->L(j + 2));
1724 r[3] = satuw(v->L(j + 3));
1725 r[4] = satuw(s->L(j));
1726 r[5] = satuw(s->L(j + 1));
1727 r[6] = satuw(s->L(j + 2));
1728 r[7] = satuw(s->L(j + 3));
1729 for (k = 0; k < 8; k++) {
1730 d->W(i + k) = r[k];
1731 }
1732 }
222a3336
AZ
1733}
1734
/*
 * Signed minimum/maximum of byte and dword elements: both operands are
 * reinterpreted as signed before MIN/MAX is applied.  Arguments are
 * parenthesized so any expression can be passed safely.
 */
#define FMINSB(d, s) MIN((int8_t)(d), (int8_t)(s))
#define FMINSD(d, s) MIN((int32_t)(d), (int32_t)(s))
#define FMAXSB(d, s) MAX((int8_t)(d), (int8_t)(s))
#define FMAXSD(d, s) MAX((int32_t)(d), (int32_t)(s))
222a3336
AZ
1739SSE_HELPER_B(helper_pminsb, FMINSB)
1740SSE_HELPER_L(helper_pminsd, FMINSD)
1741SSE_HELPER_W(helper_pminuw, MIN)
1742SSE_HELPER_L(helper_pminud, MIN)
1743SSE_HELPER_B(helper_pmaxsb, FMAXSB)
1744SSE_HELPER_L(helper_pmaxsd, FMAXSD)
1745SSE_HELPER_W(helper_pmaxuw, MAX)
1746SSE_HELPER_L(helper_pmaxud, MAX)
1747
/*
 * Low 32 bits of a 32x32 multiply.  The product is formed in unsigned
 * arithmetic: the low 32 bits are the same for signed and unsigned
 * operands, and unsigned wrap-around is well defined whereas signed
 * overflow is not.
 */
#define FMULLD(d, s) ((uint32_t)(d) * (uint32_t)(s))
222a3336
AZ
1749SSE_HELPER_L(helper_pmulld, FMULLD)
1750
fd17264a 1751#if SHIFT == 1
d3eb5eae 1752void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
222a3336
AZ
1753{
1754 int idx = 0;
1755
e01d9d31 1756 if (s->W(1) < s->W(idx)) {
222a3336 1757 idx = 1;
e01d9d31
BS
1758 }
1759 if (s->W(2) < s->W(idx)) {
222a3336 1760 idx = 2;
e01d9d31
BS
1761 }
1762 if (s->W(3) < s->W(idx)) {
222a3336 1763 idx = 3;
e01d9d31
BS
1764 }
1765 if (s->W(4) < s->W(idx)) {
222a3336 1766 idx = 4;
e01d9d31
BS
1767 }
1768 if (s->W(5) < s->W(idx)) {
222a3336 1769 idx = 5;
e01d9d31
BS
1770 }
1771 if (s->W(6) < s->W(idx)) {
222a3336 1772 idx = 6;
e01d9d31
BS
1773 }
1774 if (s->W(7) < s->W(idx)) {
222a3336 1775 idx = 7;
e01d9d31 1776 }
222a3336 1777
222a3336 1778 d->W(0) = s->W(idx);
aa406fea
JM
1779 d->W(1) = idx;
1780 d->L(1) = 0;
1781 d->Q(1) = 0;
222a3336 1782}
fd17264a 1783#endif
222a3336 1784
d3eb5eae
BS
1785void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1786 uint32_t mode)
222a3336 1787{
418b0f93 1788 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336 1789 signed char prev_rounding_mode;
fd17264a 1790 int i;
222a3336
AZ
1791
1792 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1793 if (!(mode & (1 << 2))) {
222a3336
AZ
1794 switch (mode & 3) {
1795 case 0:
1796 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1797 break;
1798 case 1:
1799 set_float_rounding_mode(float_round_down, &env->sse_status);
1800 break;
1801 case 2:
1802 set_float_rounding_mode(float_round_up, &env->sse_status);
1803 break;
1804 case 3:
1805 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1806 break;
1807 }
e01d9d31 1808 }
222a3336 1809
fd17264a
PB
1810 for (i = 0; i < 2 << SHIFT; i++) {
1811 d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status);
1812 }
222a3336 1813
418b0f93 1814 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1815 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1816 ~float_flag_inexact,
1817 &env->sse_status);
1818 }
222a3336
AZ
1819 env->sse_status.float_rounding_mode = prev_rounding_mode;
1820}
1821
d3eb5eae
BS
1822void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1823 uint32_t mode)
222a3336 1824{
418b0f93 1825 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336 1826 signed char prev_rounding_mode;
fd17264a 1827 int i;
222a3336
AZ
1828
1829 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1830 if (!(mode & (1 << 2))) {
222a3336
AZ
1831 switch (mode & 3) {
1832 case 0:
1833 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1834 break;
1835 case 1:
1836 set_float_rounding_mode(float_round_down, &env->sse_status);
1837 break;
1838 case 2:
1839 set_float_rounding_mode(float_round_up, &env->sse_status);
1840 break;
1841 case 3:
1842 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1843 break;
1844 }
e01d9d31 1845 }
222a3336 1846
fd17264a
PB
1847 for (i = 0; i < 1 << SHIFT; i++) {
1848 d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status);
1849 }
222a3336 1850
418b0f93 1851 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1852 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1853 ~float_flag_inexact,
1854 &env->sse_status);
1855 }
222a3336
AZ
1856 env->sse_status.float_rounding_mode = prev_rounding_mode;
1857}
1858
fd17264a 1859#if SHIFT == 1
d3eb5eae
BS
1860void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1861 uint32_t mode)
222a3336 1862{
418b0f93 1863 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336
AZ
1864 signed char prev_rounding_mode;
1865
1866 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1867 if (!(mode & (1 << 2))) {
222a3336
AZ
1868 switch (mode & 3) {
1869 case 0:
1870 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1871 break;
1872 case 1:
1873 set_float_rounding_mode(float_round_down, &env->sse_status);
1874 break;
1875 case 2:
1876 set_float_rounding_mode(float_round_up, &env->sse_status);
1877 break;
1878 case 3:
1879 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1880 break;
1881 }
e01d9d31 1882 }
222a3336 1883
19cbd87c 1884 d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);
222a3336 1885
418b0f93 1886 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1887 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1888 ~float_flag_inexact,
1889 &env->sse_status);
1890 }
222a3336
AZ
1891 env->sse_status.float_rounding_mode = prev_rounding_mode;
1892}
1893
d3eb5eae
BS
1894void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1895 uint32_t mode)
222a3336 1896{
418b0f93 1897 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336
AZ
1898 signed char prev_rounding_mode;
1899
1900 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1901 if (!(mode & (1 << 2))) {
222a3336
AZ
1902 switch (mode & 3) {
1903 case 0:
1904 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1905 break;
1906 case 1:
1907 set_float_rounding_mode(float_round_down, &env->sse_status);
1908 break;
1909 case 2:
1910 set_float_rounding_mode(float_round_up, &env->sse_status);
1911 break;
1912 case 3:
1913 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1914 break;
1915 }
e01d9d31 1916 }
222a3336 1917
19cbd87c 1918 d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);
222a3336 1919
418b0f93 1920 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1921 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1922 ~float_flag_inexact,
1923 &env->sse_status);
1924 }
222a3336
AZ
1925 env->sse_status.float_rounding_mode = prev_rounding_mode;
1926}
fd17264a 1927#endif
222a3336 1928
/*
 * Immediate blend: pick s when the selector m is non-zero, d otherwise.
 * Arguments are parenthesized so any expression can be passed safely.
 */
#define FBLENDP(d, s, m) ((m) ? (s) : (d))
222a3336
AZ
1930SSE_HELPER_I(helper_blendps, L, 4, FBLENDP)
1931SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP)
1932SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP)
1933
6f218d6e
PB
1934void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1935 uint32_t mask)
222a3336 1936{
6f218d6e 1937 Reg *v = d;
bf30ad8c 1938 float32 prod1, prod2, temp2, temp3, temp4;
6f218d6e 1939 int i;
222a3336 1940
6f218d6e
PB
1941 for (i = 0; i < 2 << SHIFT; i += 4) {
1942 /*
1943 * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D
1944 * to correctly round the intermediate results
1945 */
1946 if (mask & (1 << 4)) {
1947 prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
1948 } else {
1949 prod1 = float32_zero;
1950 }
1951 if (mask & (1 << 5)) {
1952 prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
1953 } else {
1954 prod2 = float32_zero;
1955 }
1956 temp2 = float32_add(prod1, prod2, &env->sse_status);
1957 if (mask & (1 << 6)) {
1958 prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status);
1959 } else {
1960 prod1 = float32_zero;
1961 }
1962 if (mask & (1 << 7)) {
1963 prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status);
1964 } else {
1965 prod2 = float32_zero;
1966 }
1967 temp3 = float32_add(prod1, prod2, &env->sse_status);
1968 temp4 = float32_add(temp2, temp3, &env->sse_status);
bf30ad8c 1969
6f218d6e
PB
1970 d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero;
1971 d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero;
1972 d->ZMM_S(i+2) = (mask & (1 << 2)) ? temp4 : float32_zero;
1973 d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero;
1974 }
222a3336
AZ
1975}
1976
6f218d6e
PB
1977#if SHIFT == 1
1978/* Oddly, there is no ymm version of dppd */
1979void glue(helper_dppd, SUFFIX)(CPUX86State *env,
1980 Reg *d, Reg *s, uint32_t mask)
222a3336 1981{
6f218d6e 1982 Reg *v = d;
bf30ad8c 1983 float64 prod1, prod2, temp2;
222a3336 1984
e01d9d31 1985 if (mask & (1 << 4)) {
6f218d6e 1986 prod1 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
bf30ad8c
PB
1987 } else {
1988 prod1 = float64_zero;
e01d9d31
BS
1989 }
1990 if (mask & (1 << 5)) {
6f218d6e 1991 prod2 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
bf30ad8c
PB
1992 } else {
1993 prod2 = float64_zero;
e01d9d31 1994 }
bf30ad8c
PB
1995 temp2 = float64_add(prod1, prod2, &env->sse_status);
1996 d->ZMM_D(0) = (mask & (1 << 0)) ? temp2 : float64_zero;
1997 d->ZMM_D(1) = (mask & (1 << 1)) ? temp2 : float64_zero;
222a3336 1998}
6f218d6e 1999#endif
222a3336 2000
d3eb5eae
BS
2001void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2002 uint32_t offset)
222a3336 2003{
d45b0de6
PB
2004 Reg *v = d;
2005 int i, j;
2006 uint16_t r[8];
2007
2008 for (j = 0; j < 4 << SHIFT; ) {
2009 int s0 = (j * 2) + ((offset & 3) << 2);
2010 int d0 = (j * 2) + ((offset & 4) << 0);
2011 for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
2012 r[i] = 0;
2013 r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
2014 r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
2015 r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
2016 r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
2017 }
2018 for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
2019 d->W(j) = r[i];
2020 }
2021 offset >>= 3;
222a3336 2022 }
222a3336
AZ
2023}
2024
2025/* SSE4.2 op helpers */
/*
 * Signed 64-bit greater-than: all-ones (-1) when d > s, else 0.
 * The arguments are parenthesized so the int64_t casts apply to the whole
 * argument expression rather than only to its first operand.
 */
#define FCMPGTQ(d, s) ((int64_t)(d) > (int64_t)(s) ? -1 : 0)
222a3336
AZ
/*
 * Instantiate helper_pcmpgtq through SSE_HELPER_Q (defined outside this
 * excerpt) with FCMPGTQ as the per-element operation.
 */
2027SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
2028
fd17264a 2029#if SHIFT == 1
d3eb5eae 2030static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
222a3336 2031{
d1da229f 2032 target_long val, limit;
222a3336
AZ
2033
2034 /* Presence of REX.W is indicated by a bit higher than 7 set */
e01d9d31 2035 if (ctrl >> 8) {
d1da229f 2036 val = (target_long)env->regs[reg];
e01d9d31 2037 } else {
d1da229f 2038 val = (int32_t)env->regs[reg];
e01d9d31 2039 }
222a3336 2040 if (ctrl & 1) {
d1da229f 2041 limit = 8;
e01d9d31 2042 } else {
d1da229f 2043 limit = 16;
e01d9d31 2044 }
d1da229f
PB
2045 if ((val > limit) || (val < -limit)) {
2046 return limit;
2047 }
2048 return abs1(val);
222a3336
AZ
2049}
2050
2051static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
2052{
2053 int val = 0;
2054
2055 if (ctrl & 1) {
e01d9d31 2056 while (val < 8 && r->W(val)) {
222a3336 2057 val++;
e01d9d31
BS
2058 }
2059 } else {
2060 while (val < 16 && r->B(val)) {
222a3336 2061 val++;
e01d9d31
BS
2062 }
2063 }
222a3336
AZ
2064
2065 return val;
2066}
2067
2068static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
2069{
2070 switch ((ctrl >> 0) & 3) {
2071 case 0:
2072 return r->B(i);
2073 case 1:
2074 return r->W(i);
2075 case 2:
e01d9d31 2076 return (int8_t)r->B(i);
222a3336
AZ
2077 case 3:
2078 default:
e01d9d31 2079 return (int16_t)r->W(i);
222a3336
AZ
2080 }
2081}
2082
d3eb5eae 2083static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
e01d9d31 2084 int8_t ctrl, int valids, int validd)
222a3336
AZ
2085{
2086 unsigned int res = 0;
2087 int v;
2088 int j, i;
2089 int upper = (ctrl & 1) ? 7 : 15;
2090
2091 valids--;
2092 validd--;
2093
2094 CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
2095
2096 switch ((ctrl >> 2) & 3) {
2097 case 0:
2098 for (j = valids; j >= 0; j--) {
2099 res <<= 1;
2100 v = pcmp_val(s, ctrl, j);
e01d9d31 2101 for (i = validd; i >= 0; i--) {
222a3336 2102 res |= (v == pcmp_val(d, ctrl, i));
e01d9d31 2103 }
222a3336
AZ
2104 }
2105 break;
2106 case 1:
2107 for (j = valids; j >= 0; j--) {
2108 res <<= 1;
2109 v = pcmp_val(s, ctrl, j);
e01d9d31 2110 for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
649ad05e
AJ
2111 res |= (pcmp_val(d, ctrl, i - 0) >= v &&
2112 pcmp_val(d, ctrl, i - 1) <= v);
e01d9d31 2113 }
222a3336
AZ
2114 }
2115 break;
2116 case 2:
b27a6cac 2117 res = (1 << (upper - MAX(valids, validd))) - 1;
222a3336
AZ
2118 res <<= MAX(valids, validd) - MIN(valids, validd);
2119 for (i = MIN(valids, validd); i >= 0; i--) {
2120 res <<= 1;
2121 v = pcmp_val(s, ctrl, i);
2122 res |= (v == pcmp_val(d, ctrl, i));
2123 }
2124 break;
2125 case 3:
ae35eea7
JM
2126 if (validd == -1) {
2127 res = (2 << upper) - 1;
2128 break;
2129 }
bc921b27 2130 for (j = valids == upper ? valids : valids - validd; j >= 0; j--) {
222a3336 2131 res <<= 1;
75c9527e 2132 v = 1;
bc921b27 2133 for (i = MIN(valids - j, validd); i >= 0; i--) {
75c9527e 2134 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
e01d9d31 2135 }
75c9527e 2136 res |= v;
222a3336
AZ
2137 }
2138 break;
2139 }
2140
2141 switch ((ctrl >> 4) & 3) {
2142 case 1:
2143 res ^= (2 << upper) - 1;
2144 break;
2145 case 3:
e4eba27e 2146 res ^= (1 << (valids + 1)) - 1;
222a3336
AZ
2147 break;
2148 }
2149
e01d9d31
BS
2150 if (res) {
2151 CC_SRC |= CC_C;
2152 }
2153 if (res & 1) {
2154 CC_SRC |= CC_O;
2155 }
222a3336
AZ
2156
2157 return res;
2158}
2159
d3eb5eae
BS
2160void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2161 uint32_t ctrl)
222a3336 2162{
d3eb5eae
BS
2163 unsigned int res = pcmpxstrx(env, d, s, ctrl,
2164 pcmp_elen(env, R_EDX, ctrl),
2165 pcmp_elen(env, R_EAX, ctrl));
222a3336 2166
e01d9d31 2167 if (res) {
c334a388 2168 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
e01d9d31 2169 } else {
222a3336 2170 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
e01d9d31 2171 }
222a3336
AZ
2172}
2173
d3eb5eae
BS
2174void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2175 uint32_t ctrl)
222a3336
AZ
2176{
2177 int i;
d3eb5eae
BS
2178 unsigned int res = pcmpxstrx(env, d, s, ctrl,
2179 pcmp_elen(env, R_EDX, ctrl),
2180 pcmp_elen(env, R_EAX, ctrl));
222a3336
AZ
2181
2182 if ((ctrl >> 6) & 1) {
e01d9d31 2183 if (ctrl & 1) {
bc426899 2184 for (i = 0; i < 8; i++, res >>= 1) {
2b8d7e9d 2185 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
bc426899 2186 }
e01d9d31 2187 } else {
bc426899 2188 for (i = 0; i < 16; i++, res >>= 1) {
2b8d7e9d 2189 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
bc426899 2190 }
e01d9d31 2191 }
222a3336 2192 } else {
2b8d7e9d
AJ
2193 env->xmm_regs[0].Q(1) = 0;
2194 env->xmm_regs[0].Q(0) = res;
222a3336
AZ
2195 }
2196}
2197
d3eb5eae
BS
2198void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2199 uint32_t ctrl)
222a3336 2200{
d3eb5eae 2201 unsigned int res = pcmpxstrx(env, d, s, ctrl,
e01d9d31
BS
2202 pcmp_ilen(s, ctrl),
2203 pcmp_ilen(d, ctrl));
222a3336 2204
e01d9d31 2205 if (res) {
c334a388 2206 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
e01d9d31 2207 } else {
222a3336 2208 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
e01d9d31 2209 }
222a3336
AZ
2210}
2211
d3eb5eae
BS
2212void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2213 uint32_t ctrl)
222a3336
AZ
2214{
2215 int i;
d3eb5eae 2216 unsigned int res = pcmpxstrx(env, d, s, ctrl,
e01d9d31
BS
2217 pcmp_ilen(s, ctrl),
2218 pcmp_ilen(d, ctrl));
222a3336
AZ
2219
2220 if ((ctrl >> 6) & 1) {
e01d9d31 2221 if (ctrl & 1) {
bc426899 2222 for (i = 0; i < 8; i++, res >>= 1) {
2b8d7e9d 2223 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
bc426899 2224 }
e01d9d31 2225 } else {
bc426899 2226 for (i = 0; i < 16; i++, res >>= 1) {
2b8d7e9d 2227 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
bc426899 2228 }
e01d9d31 2229 }
222a3336 2230 } else {
2b8d7e9d
AJ
2231 env->xmm_regs[0].Q(1) = 0;
2232 env->xmm_regs[0].Q(0) = res;
222a3336
AZ
2233 }
2234}
2235
2236#define CRCPOLY 0x1edc6f41
2237#define CRCPOLY_BITREV 0x82f63b78
2238target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
2239{
2240 target_ulong crc = (msg & ((target_ulong) -1 >>
e01d9d31 2241 (TARGET_LONG_BITS - len))) ^ crc1;
222a3336 2242
e01d9d31 2243 while (len--) {
222a3336 2244 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
e01d9d31 2245 }
222a3336
AZ
2246
2247 return crc;
2248}
2249
fd17264a
PB
2250#endif
2251
e71827bc
AJ
2252void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2253 uint32_t ctrl)
2254{
2255 uint64_t ah, al, b, resh, resl;
2256
2257 ah = 0;
2258 al = d->Q((ctrl & 1) != 0);
2259 b = s->Q((ctrl & 16) != 0);
2260 resh = resl = 0;
2261
2262 while (b) {
2263 if (b & 1) {
2264 resl ^= al;
2265 resh ^= ah;
2266 }
2267 ah = (ah << 1) | (al >> 63);
2268 al <<= 1;
2269 b >>= 1;
2270 }
2271
2272 d->Q(0) = resl;
2273 d->Q(1) = resh;
2274}
d640045a 2275
d640045a
AJ
2276void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2277{
2278 int i;
2279 Reg st = *d;
2280 Reg rk = *s;
2281
2282 for (i = 0 ; i < 4 ; i++) {
04af534d
TM
2283 d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^
2284 AES_Td1[st.B(AES_ishifts[4*i+1])] ^
2285 AES_Td2[st.B(AES_ishifts[4*i+2])] ^
2286 AES_Td3[st.B(AES_ishifts[4*i+3])]);
d640045a
AJ
2287 }
2288}
2289
2290void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2291{
2292 int i;
2293 Reg st = *d;
2294 Reg rk = *s;
2295
2296 for (i = 0; i < 16; i++) {
9551ea69 2297 d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]);
d640045a
AJ
2298 }
2299}
2300
2301void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2302{
2303 int i;
2304 Reg st = *d;
2305 Reg rk = *s;
2306
2307 for (i = 0 ; i < 4 ; i++) {
04af534d
TM
2308 d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^
2309 AES_Te1[st.B(AES_shifts[4*i+1])] ^
2310 AES_Te2[st.B(AES_shifts[4*i+2])] ^
2311 AES_Te3[st.B(AES_shifts[4*i+3])]);
d640045a
AJ
2312 }
2313}
2314
2315void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2316{
2317 int i;
2318 Reg st = *d;
2319 Reg rk = *s;
2320
2321 for (i = 0; i < 16; i++) {
9551ea69 2322 d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]);
d640045a
AJ
2323 }
2324
2325}
2326
2327void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2328{
2329 int i;
2330 Reg tmp = *s;
2331
2332 for (i = 0 ; i < 4 ; i++) {
9551ea69
AJ
2333 d->L(i) = bswap32(AES_imc[tmp.B(4*i+0)][0] ^
2334 AES_imc[tmp.B(4*i+1)][1] ^
2335 AES_imc[tmp.B(4*i+2)][2] ^
2336 AES_imc[tmp.B(4*i+3)][3]);
d640045a
AJ
2337 }
2338}
2339
2340void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2341 uint32_t ctrl)
2342{
2343 int i;
2344 Reg tmp = *s;
2345
2346 for (i = 0 ; i < 4 ; i++) {
9551ea69
AJ
2347 d->B(i) = AES_sbox[tmp.B(i + 4)];
2348 d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
d640045a
AJ
2349 }
2350 d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
2351 d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
2352}
222a3336
AZ
2353#endif
2354
3403cafe
PB
2355#undef SSE_HELPER_S
2356
664e0f19
FB
2357#undef SHIFT
2358#undef XMM_ONLY
2359#undef Reg
2360#undef B
2361#undef W
2362#undef L
2363#undef Q
2364#undef SUFFIX