]> git.proxmox.com Git - mirror_qemu.git/blame - target/i386/ops_sse.h
target/i386: Rewrite blendv helpers
[mirror_qemu.git] / target / i386 / ops_sse.h
CommitLineData
664e0f19 1/*
222a3336 2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support
5fafdf24 3 *
664e0f19 4 * Copyright (c) 2005 Fabrice Bellard
222a3336 5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com>
664e0f19
FB
6 *
7 * This library is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
d9ff33ad 10 * version 2.1 of the License, or (at your option) any later version.
664e0f19
FB
11 *
12 * This library is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public
8167ee88 18 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
664e0f19 19 */
04af534d 20
6f2945cd 21#include "crypto/aes.h"
04af534d 22
664e0f19
FB
/*
 * Width-dependent configuration.  SHIFT == 0 selects the MMX register type
 * and accessors; any other SHIFT selects the ZMM ones.  SUFFIX is pasted
 * onto helper names via glue().
 */
#if SHIFT != 0
#define Reg ZMMReg
#define SUFFIX _xmm
/* Keeps its arguments: the text is only wanted for the non-MMX build. */
#define XMM_ONLY(...) __VA_ARGS__
#define B(n) ZMM_B(n)
#define W(n) ZMM_W(n)
#define L(n) ZMM_L(n)
#define Q(n) ZMM_Q(n)
#else
#define Reg MMXReg
#define SUFFIX _mmx
/* Drops its arguments in the MMX build. */
#define XMM_ONLY(...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
#define Q(n) MMX_Q(n)
#endif

/* 16 when SHIFT is non-zero, 8 otherwise; PACK_WIDTH is half of that. */
#define LANE_WIDTH (SHIFT ? 16 : 8)
#define PACK_WIDTH (LANE_WIDTH / 2)
18592d2e 43
18592d2e
PB
#if SHIFT == 0
/*
 * Per-element shift primitives used by the shift helpers.  x is the element
 * value and c the shift count.  FPSRAW/FPSRAL first convert x to a signed
 * 16-/32-bit value so that the shift is performed on a signed operand.
 *
 * The count is taken from the macro argument rather than from whatever
 * variable named "shift" happens to be in scope at the call site.
 * Defined only under SHIFT == 0.
 */
#define FPSRL(x, c) ((x) >> (c))
#define FPSRAW(x, c) ((int16_t)(x) >> (c))
#define FPSRAL(x, c) ((int32_t)(x) >> (c))
#define FPSLL(x, c) ((x) << (c))
#endif
50
51void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 52{
18592d2e 53 Reg *s = d;
664e0f19 54 int shift;
18592d2e
PB
55 if (c->Q(0) > 15) {
56 for (int i = 0; i < 1 << SHIFT; i++) {
57 d->Q(i) = 0;
58 }
664e0f19 59 } else {
18592d2e
PB
60 shift = c->B(0);
61 for (int i = 0; i < 4 << SHIFT; i++) {
62 d->W(i) = FPSRL(s->W(i), shift);
63 }
664e0f19
FB
64 }
65}
66
18592d2e 67void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 68{
18592d2e 69 Reg *s = d;
664e0f19 70 int shift;
18592d2e
PB
71 if (c->Q(0) > 15) {
72 for (int i = 0; i < 1 << SHIFT; i++) {
73 d->Q(i) = 0;
74 }
664e0f19 75 } else {
18592d2e
PB
76 shift = c->B(0);
77 for (int i = 0; i < 4 << SHIFT; i++) {
78 d->W(i) = FPSLL(s->W(i), shift);
79 }
664e0f19 80 }
664e0f19
FB
81}
82
18592d2e 83void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 84{
18592d2e 85 Reg *s = d;
664e0f19 86 int shift;
18592d2e
PB
87 if (c->Q(0) > 15) {
88 shift = 15;
664e0f19 89 } else {
18592d2e
PB
90 shift = c->B(0);
91 }
92 for (int i = 0; i < 4 << SHIFT; i++) {
93 d->W(i) = FPSRAW(s->W(i), shift);
664e0f19
FB
94 }
95}
96
18592d2e 97void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 98{
18592d2e 99 Reg *s = d;
664e0f19 100 int shift;
18592d2e
PB
101 if (c->Q(0) > 31) {
102 for (int i = 0; i < 1 << SHIFT; i++) {
103 d->Q(i) = 0;
104 }
664e0f19 105 } else {
18592d2e
PB
106 shift = c->B(0);
107 for (int i = 0; i < 2 << SHIFT; i++) {
108 d->L(i) = FPSRL(s->L(i), shift);
109 }
664e0f19
FB
110 }
111}
112
18592d2e 113void glue(helper_pslld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 114{
18592d2e 115 Reg *s = d;
664e0f19 116 int shift;
18592d2e
PB
117 if (c->Q(0) > 31) {
118 for (int i = 0; i < 1 << SHIFT; i++) {
119 d->Q(i) = 0;
120 }
664e0f19 121 } else {
18592d2e
PB
122 shift = c->B(0);
123 for (int i = 0; i < 2 << SHIFT; i++) {
124 d->L(i) = FPSLL(s->L(i), shift);
125 }
664e0f19 126 }
664e0f19
FB
127}
128
18592d2e 129void glue(helper_psrad, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 130{
18592d2e 131 Reg *s = d;
664e0f19 132 int shift;
18592d2e
PB
133 if (c->Q(0) > 31) {
134 shift = 31;
664e0f19 135 } else {
18592d2e
PB
136 shift = c->B(0);
137 }
138 for (int i = 0; i < 2 << SHIFT; i++) {
139 d->L(i) = FPSRAL(s->L(i), shift);
664e0f19
FB
140 }
141}
142
18592d2e 143void glue(helper_psrlq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 144{
18592d2e 145 Reg *s = d;
664e0f19 146 int shift;
18592d2e
PB
147 if (c->Q(0) > 63) {
148 for (int i = 0; i < 1 << SHIFT; i++) {
149 d->Q(i) = 0;
150 }
664e0f19 151 } else {
18592d2e
PB
152 shift = c->B(0);
153 for (int i = 0; i < 1 << SHIFT; i++) {
154 d->Q(i) = FPSRL(s->Q(i), shift);
155 }
664e0f19
FB
156 }
157}
158
18592d2e 159void glue(helper_psllq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 160{
18592d2e 161 Reg *s = d;
664e0f19 162 int shift;
18592d2e
PB
163 if (c->Q(0) > 63) {
164 for (int i = 0; i < 1 << SHIFT; i++) {
165 d->Q(i) = 0;
166 }
664e0f19 167 } else {
18592d2e
PB
168 shift = c->B(0);
169 for (int i = 0; i < 1 << SHIFT; i++) {
170 d->Q(i) = FPSLL(s->Q(i), shift);
171 }
664e0f19
FB
172 }
173}
174
18592d2e
PB
175#if SHIFT >= 1
176void glue(helper_psrldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 177{
18592d2e
PB
178 Reg *s = d;
179 int shift, i, j;
664e0f19 180
18592d2e 181 shift = c->L(0);
e01d9d31 182 if (shift > 16) {
664e0f19 183 shift = 16;
e01d9d31 184 }
18592d2e
PB
185 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
186 for (i = 0; i < 16 - shift; i++) {
187 d->B(j + i) = s->B(j + i + shift);
188 }
189 for (i = 16 - shift; i < 16; i++) {
190 d->B(j + i) = 0;
191 }
e01d9d31 192 }
664e0f19
FB
193}
194
18592d2e 195void glue(helper_pslldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
664e0f19 196{
18592d2e
PB
197 Reg *s = d;
198 int shift, i, j;
664e0f19 199
18592d2e 200 shift = c->L(0);
e01d9d31 201 if (shift > 16) {
664e0f19 202 shift = 16;
e01d9d31 203 }
18592d2e
PB
204 for (j = 0; j < 8 << SHIFT; j += LANE_WIDTH) {
205 for (i = 15; i >= shift; i--) {
206 d->B(j + i) = s->B(j + i - shift);
207 }
208 for (i = 0; i < shift; i++) {
209 d->B(j + i) = 0;
210 }
e01d9d31 211 }
664e0f19
FB
212}
213#endif
214
/*
 * SSE_HELPER_1(name, elem, num, F): define helper "name" + SUFFIX, which
 * stores F(s->elem(k)) into d->elem(k) for k in [0, num).
 */
#define SSE_HELPER_1(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)           \
    {                                                                   \
        const int count = num;                                          \
        int idx;                                                        \
        for (idx = 0; idx < count; idx++) {                             \
            d->elem(idx) = F(s->elem(idx));                             \
        }                                                               \
    }

/*
 * SSE_HELPER_2(name, elem, num, F): define helper "name" + SUFFIX, which
 * replaces d->elem(k) with F(d->elem(k), s->elem(k)) for k in [0, num).
 */
#define SSE_HELPER_2(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)           \
    {                                                                   \
        const int count = num;                                          \
        int idx;                                                        \
        for (idx = 0; idx < count; idx++) {                             \
            d->elem(idx) = F(d->elem(idx), s->elem(idx));               \
        }                                                               \
    }

/* Two-operand helpers over all B, W, L or Q elements of the register. */
#define SSE_HELPER_B(name, F)                   \
    SSE_HELPER_2(name, B, 8 << SHIFT, F)

#define SSE_HELPER_W(name, F)                   \
    SSE_HELPER_2(name, W, 4 << SHIFT, F)

#define SSE_HELPER_L(name, F)                   \
    SSE_HELPER_2(name, L, 2 << SHIFT, F)

#define SSE_HELPER_Q(name, F)                   \
    SSE_HELPER_2(name, Q, 1 << SHIFT, F)
664e0f19
FB
245
246#if SHIFT == 0
/* Clamp x to the range [0, 255]. */
static inline int satub(int x)
{
    const int lo = 0;
    const int hi = 255;

    return x < lo ? lo : (x > hi ? hi : x);
}
257
/* Clamp x to the range [0, 65535]. */
static inline int satuw(int x)
{
    const int lo = 0;
    const int hi = 65535;

    return x < lo ? lo : (x > hi ? hi : x);
}
268
/* Clamp x to the range [-128, 127]. */
static inline int satsb(int x)
{
    const int lo = -128;
    const int hi = 127;

    return x < lo ? lo : (x > hi ? hi : x);
}
279
/* Clamp x to the range [-32768, 32767]. */
static inline int satsw(int x)
{
    const int lo = -32768;
    const int hi = 32767;

    return x < lo ? lo : (x > hi ? hi : x);
}
290
/*
 * Element-wise operations plugged into the SSE_HELPER_* generators.
 * a and b are element values; the result is truncated to the element width
 * when it is stored.  Every expansion is fully parenthesized so it can be
 * used inside a larger expression.
 */

/* Addition: wrapping, unsigned-saturating and signed-saturating forms. */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

/* Subtraction: same variants as addition. */
#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))

/* Minimum / maximum; the SW forms compare as signed 16-bit values. */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

/* Bitwise logic; FANDN complements its first operand. */
#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))

/* Comparisons yield -1 (all bits set once stored) when true, 0 otherwise. */
#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)

/*
 * 16-bit multiplies.  The unsigned forms multiply as uint32_t: two 16-bit
 * operands promoted to int can overflow a signed int (0xffff * 0xffff),
 * whereas the unsigned product always fits.
 */
#define FMULLW(a, b) ((uint32_t)(a) * (uint32_t)(b))
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) (((uint32_t)(a) * (uint32_t)(b)) >> 16)
#define FMULHW(a, b) ((int16_t)(a) * (int16_t)(b) >> 16)

/* Average, rounding up. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
664e0f19
FB
323#endif
324
5af45186
FB
325SSE_HELPER_B(helper_paddb, FADD)
326SSE_HELPER_W(helper_paddw, FADD)
327SSE_HELPER_L(helper_paddl, FADD)
328SSE_HELPER_Q(helper_paddq, FADD)
664e0f19 329
5af45186
FB
330SSE_HELPER_B(helper_psubb, FSUB)
331SSE_HELPER_W(helper_psubw, FSUB)
332SSE_HELPER_L(helper_psubl, FSUB)
333SSE_HELPER_Q(helper_psubq, FSUB)
664e0f19 334
5af45186
FB
335SSE_HELPER_B(helper_paddusb, FADDUB)
336SSE_HELPER_B(helper_paddsb, FADDSB)
337SSE_HELPER_B(helper_psubusb, FSUBUB)
338SSE_HELPER_B(helper_psubsb, FSUBSB)
664e0f19 339
5af45186
FB
340SSE_HELPER_W(helper_paddusw, FADDUW)
341SSE_HELPER_W(helper_paddsw, FADDSW)
342SSE_HELPER_W(helper_psubusw, FSUBUW)
343SSE_HELPER_W(helper_psubsw, FSUBSW)
664e0f19 344
5af45186
FB
345SSE_HELPER_B(helper_pminub, FMINUB)
346SSE_HELPER_B(helper_pmaxub, FMAXUB)
664e0f19 347
5af45186
FB
348SSE_HELPER_W(helper_pminsw, FMINSW)
349SSE_HELPER_W(helper_pmaxsw, FMAXSW)
664e0f19 350
5af45186
FB
351SSE_HELPER_Q(helper_pand, FAND)
352SSE_HELPER_Q(helper_pandn, FANDN)
353SSE_HELPER_Q(helper_por, FOR)
354SSE_HELPER_Q(helper_pxor, FXOR)
664e0f19 355
5af45186
FB
356SSE_HELPER_B(helper_pcmpgtb, FCMPGTB)
357SSE_HELPER_W(helper_pcmpgtw, FCMPGTW)
358SSE_HELPER_L(helper_pcmpgtl, FCMPGTL)
664e0f19 359
5af45186
FB
360SSE_HELPER_B(helper_pcmpeqb, FCMPEQ)
361SSE_HELPER_W(helper_pcmpeqw, FCMPEQ)
362SSE_HELPER_L(helper_pcmpeql, FCMPEQ)
664e0f19 363
5af45186 364SSE_HELPER_W(helper_pmullw, FMULLW)
a35f3ec7 365#if SHIFT == 0
5af45186 366SSE_HELPER_W(helper_pmulhrw, FMULHRW)
a35f3ec7 367#endif
5af45186
FB
368SSE_HELPER_W(helper_pmulhuw, FMULHUW)
369SSE_HELPER_W(helper_pmulhw, FMULHW)
664e0f19 370
5af45186
FB
371SSE_HELPER_B(helper_pavgb, FAVG)
372SSE_HELPER_W(helper_pavgw, FAVG)
664e0f19 373
d3eb5eae 374void glue(helper_pmuludq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 375{
e894bae8
PB
376 Reg *v = d;
377 int i;
378
379 for (i = 0; i < (1 << SHIFT); i++) {
380 d->Q(i) = (uint64_t)s->L(i * 2) * (uint64_t)v->L(i * 2);
381 }
664e0f19
FB
382}
383
d3eb5eae 384void glue(helper_pmaddwd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 385{
e894bae8 386 Reg *v = d;
664e0f19 387 int i;
664e0f19 388
e01d9d31 389 for (i = 0; i < (2 << SHIFT); i++) {
e894bae8
PB
390 d->L(i) = (int16_t)s->W(2 * i) * (int16_t)v->W(2 * i) +
391 (int16_t)s->W(2 * i + 1) * (int16_t)v->W(2 * i + 1);
664e0f19
FB
392 }
393}
394
395#if SHIFT == 0
/* Absolute value of a, computed by negating negative inputs. */
static inline int abs1(int a)
{
    return (a < 0) ? -a : a;
}
404#endif
e894bae8 405
d3eb5eae 406void glue(helper_psadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 407{
e894bae8
PB
408 Reg *v = d;
409 int i;
664e0f19 410
e894bae8
PB
411 for (i = 0; i < (1 << SHIFT); i++) {
412 unsigned int val = 0;
413 val += abs1(v->B(8 * i + 0) - s->B(8 * i + 0));
414 val += abs1(v->B(8 * i + 1) - s->B(8 * i + 1));
415 val += abs1(v->B(8 * i + 2) - s->B(8 * i + 2));
416 val += abs1(v->B(8 * i + 3) - s->B(8 * i + 3));
417 val += abs1(v->B(8 * i + 4) - s->B(8 * i + 4));
418 val += abs1(v->B(8 * i + 5) - s->B(8 * i + 5));
419 val += abs1(v->B(8 * i + 6) - s->B(8 * i + 6));
420 val += abs1(v->B(8 * i + 7) - s->B(8 * i + 7));
421 d->Q(i) = val;
422 }
664e0f19
FB
423}
424
fd17264a 425#if SHIFT < 2
d3eb5eae
BS
426void glue(helper_maskmov, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
427 target_ulong a0)
664e0f19
FB
428{
429 int i;
e01d9d31
BS
430
431 for (i = 0; i < (8 << SHIFT); i++) {
432 if (s->B(i) & 0x80) {
4054cdec 433 cpu_stb_data_ra(env, a0 + i, d->B(i), GETPC());
e01d9d31 434 }
664e0f19
FB
435 }
436}
fd17264a 437#endif
664e0f19 438
e01d9d31 439void glue(helper_movl_mm_T0, SUFFIX)(Reg *d, uint32_t val)
664e0f19 440{
e894bae8
PB
441 int i;
442
5af45186 443 d->L(0) = val;
664e0f19 444 d->L(1) = 0;
e894bae8
PB
445 for (i = 1; i < (1 << SHIFT); i++) {
446 d->Q(i) = 0;
447 }
664e0f19
FB
448}
449
dabd98dd 450#ifdef TARGET_X86_64
e01d9d31 451void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
dabd98dd 452{
e894bae8
PB
453 int i;
454
5af45186 455 d->Q(0) = val;
e894bae8
PB
456 for (i = 1; i < (1 << SHIFT); i++) {
457 d->Q(i) = 0;
458 }
dabd98dd 459}
dabd98dd
FB
460#endif
461
d45b0de6
PB
/*
 * SHUFFLE4(F, a, b, offset): four-element shuffle within one group.
 *
 * F is the element accessor, a and b are register pointers, and offset is
 * the index of the group's first element.  Each 2-bit field of "order"
 * selects an element of the group: fields 0 and 1 pick from a, fields 2
 * and 3 pick from b.  All four values are read before any is stored, so d
 * may alias a or b.
 *
 * The expansion uses these names from the calling scope: d (destination
 * pointer), order, and the temporaries r0..r3.  The arguments a, b and
 * offset are parenthesized so that arbitrary expressions can be passed.
 */
#define SHUFFLE4(F, a, b, offset) do {                      \
        r0 = (a)->F((order & 3) + (offset));                \
        r1 = (a)->F(((order >> 2) & 3) + (offset));         \
        r2 = (b)->F(((order >> 4) & 3) + (offset));         \
        r3 = (b)->F(((order >> 6) & 3) + (offset));         \
        d->F((offset)) = r0;                                \
        d->F((offset) + 1) = r1;                            \
        d->F((offset) + 2) = r2;                            \
        d->F((offset) + 3) = r3;                            \
    } while (0)
472
664e0f19 473#if SHIFT == 0
e01d9d31 474void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 475{
d45b0de6 476 uint16_t r0, r1, r2, r3;
e01d9d31 477
d45b0de6 478 SHUFFLE4(W, s, s, 0);
664e0f19
FB
479}
480#else
ce4fa29f 481void glue(helper_shufps, SUFFIX)(Reg *d, Reg *s, int order)
d52cf7a6 482{
d45b0de6
PB
483 Reg *v = d;
484 uint32_t r0, r1, r2, r3;
485 int i;
e01d9d31 486
d45b0de6
PB
487 for (i = 0; i < 2 << SHIFT; i += 4) {
488 SHUFFLE4(L, v, s, i);
489 }
d52cf7a6
FB
490}
491
ce4fa29f 492void glue(helper_shufpd, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 493{
d45b0de6
PB
494 Reg *v = d;
495 uint64_t r0, r1;
496 int i;
e01d9d31 497
d45b0de6
PB
498 for (i = 0; i < 1 << SHIFT; i += 2) {
499 r0 = v->Q(((order & 1) & 1) + i);
500 r1 = s->Q(((order >> 1) & 1) + i);
501 d->Q(i) = r0;
502 d->Q(i + 1) = r1;
503 order >>= 2;
504 }
664e0f19
FB
505}
506
e01d9d31 507void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 508{
d45b0de6
PB
509 uint32_t r0, r1, r2, r3;
510 int i;
e01d9d31 511
d45b0de6
PB
512 for (i = 0; i < 2 << SHIFT; i += 4) {
513 SHUFFLE4(L, s, s, i);
514 }
664e0f19
FB
515}
516
e01d9d31 517void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 518{
d45b0de6
PB
519 uint16_t r0, r1, r2, r3;
520 int i, j;
e01d9d31 521
d45b0de6
PB
522 for (i = 0, j = 1; j < 1 << SHIFT; i += 8, j += 2) {
523 SHUFFLE4(W, s, s, i);
524 d->Q(j) = s->Q(j);
525 }
664e0f19
FB
526}
527
e01d9d31 528void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
664e0f19 529{
d45b0de6
PB
530 uint16_t r0, r1, r2, r3;
531 int i, j;
e01d9d31 532
d45b0de6
PB
533 for (i = 4, j = 0; j < 1 << SHIFT; i += 8, j += 2) {
534 d->Q(j) = s->Q(j);
535 SHUFFLE4(W, s, s, i);
536 }
664e0f19
FB
537}
538#endif
539
3403cafe 540#if SHIFT >= 1
664e0f19
FB
541/* FPU ops */
542/* XXX: not accurate */
543
3403cafe
PB
/*
 * SSE_HELPER_P(name, F): define the packed helpers helper_<name>ps and
 * helper_<name>pd.  Each replaces every ZMM_S (resp. ZMM_D) element of *d
 * with F(32 or 64, d element, s element).
 */
#define SSE_HELPER_P(name, F)                                           \
    void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env,          \
            Reg *d, Reg *s)                                             \
    {                                                                   \
        int n = 0;                                                      \
        while (n < 2 << SHIFT) {                                        \
            d->ZMM_S(n) = F(32, d->ZMM_S(n), s->ZMM_S(n));              \
            n++;                                                        \
        }                                                               \
    }                                                                   \
                                                                        \
    void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env,          \
            Reg *d, Reg *s)                                             \
    {                                                                   \
        int n = 0;                                                      \
        while (n < 1 << SHIFT) {                                        \
            d->ZMM_D(n) = F(64, d->ZMM_D(n), s->ZMM_D(n));              \
            n++;                                                        \
        }                                                               \
    }
564
/*
 * SSE_HELPER_S(name, F): the packed helpers from SSE_HELPER_P, plus, when
 * SHIFT == 1 only, the scalar helpers helper_<name>ss and helper_<name>sd
 * which operate on element 0 alone.
 */
#if SHIFT != 1

#define SSE_HELPER_S(name, F) SSE_HELPER_P(name, F)

#else

#define SSE_HELPER_S(name, F)                                           \
    SSE_HELPER_P(name, F)                                               \
                                                                        \
    void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_S(0) = F(32, d->ZMM_S(0), s->ZMM_S(0));                  \
    }                                                                   \
                                                                        \
    void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s)        \
    {                                                                   \
        d->ZMM_D(0) = F(64, d->ZMM_D(0), s->ZMM_D(0));                  \
    }

#endif
587
7a0e1f41
FB
588#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
589#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
590#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
591#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
664e0f19 592
a4d1f142
AJ
593/* Note that the choice of comparison op here is important to get the
594 * special cases right: for min and max Intel specifies that (-0,0),
595 * (NaN, anything) and (anything, NaN) return the second argument.
596 */
e01d9d31
BS
597#define FPU_MIN(size, a, b) \
598 (float ## size ## _lt(a, b, &env->sse_status) ? (a) : (b))
599#define FPU_MAX(size, a, b) \
600 (float ## size ## _lt(b, a, &env->sse_status) ? (a) : (b))
a4d1f142 601
5af45186
FB
602SSE_HELPER_S(add, FPU_ADD)
603SSE_HELPER_S(sub, FPU_SUB)
604SSE_HELPER_S(mul, FPU_MUL)
605SSE_HELPER_S(div, FPU_DIV)
606SSE_HELPER_S(min, FPU_MIN)
607SSE_HELPER_S(max, FPU_MAX)
664e0f19 608
3403cafe
PB
609void glue(helper_sqrtps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
610{
611 int i;
612 for (i = 0; i < 2 << SHIFT; i++) {
613 d->ZMM_S(i) = float32_sqrt(s->ZMM_S(i), &env->sse_status);
614 }
615}
616
617void glue(helper_sqrtpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
618{
619 int i;
620 for (i = 0; i < 1 << SHIFT; i++) {
621 d->ZMM_D(i) = float64_sqrt(s->ZMM_D(i), &env->sse_status);
622 }
623}
624
625#if SHIFT == 1
626void helper_sqrtss(CPUX86State *env, Reg *d, Reg *s)
627{
628 d->ZMM_S(0) = float32_sqrt(s->ZMM_S(0), &env->sse_status);
629}
630
631void helper_sqrtsd(CPUX86State *env, Reg *d, Reg *s)
632{
633 d->ZMM_D(0) = float64_sqrt(s->ZMM_D(0), &env->sse_status);
634}
635#endif
664e0f19
FB
636
637/* float to float conversions */
ce4fa29f 638void glue(helper_cvtps2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 639{
fd17264a
PB
640 int i;
641 for (i = 1 << SHIFT; --i >= 0; ) {
642 d->ZMM_D(i) = float32_to_float64(s->ZMM_S(i), &env->sse_status);
643 }
664e0f19
FB
644}
645
ce4fa29f 646void glue(helper_cvtpd2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 647{
fd17264a
PB
648 int i;
649 for (i = 0; i < 1 << SHIFT; i++) {
650 d->ZMM_S(i) = float64_to_float32(s->ZMM_D(i), &env->sse_status);
651 }
652 for (i >>= 1; i < 1 << SHIFT; i++) {
653 d->Q(i) = 0;
654 }
664e0f19
FB
655}
656
fd17264a 657#if SHIFT == 1
d3eb5eae 658void helper_cvtss2sd(CPUX86State *env, Reg *d, Reg *s)
664e0f19 659{
19cbd87c 660 d->ZMM_D(0) = float32_to_float64(s->ZMM_S(0), &env->sse_status);
664e0f19
FB
661}
662
d3eb5eae 663void helper_cvtsd2ss(CPUX86State *env, Reg *d, Reg *s)
664e0f19 664{
19cbd87c 665 d->ZMM_S(0) = float64_to_float32(s->ZMM_D(0), &env->sse_status);
664e0f19 666}
fd17264a 667#endif
664e0f19
FB
668
669/* integer to float */
ce4fa29f 670void glue(helper_cvtdq2ps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 671{
fd17264a
PB
672 int i;
673 for (i = 0; i < 2 << SHIFT; i++) {
674 d->ZMM_S(i) = int32_to_float32(s->ZMM_L(i), &env->sse_status);
675 }
664e0f19
FB
676}
677
ce4fa29f 678void glue(helper_cvtdq2pd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 679{
fd17264a
PB
680 int i;
681 for (i = 1 << SHIFT; --i >= 0; ) {
682 int32_t l = s->ZMM_L(i);
683 d->ZMM_D(i) = int32_to_float64(l, &env->sse_status);
684 }
664e0f19
FB
685}
686
fd17264a 687#if SHIFT == 1
fa451874 688void helper_cvtpi2ps(CPUX86State *env, ZMMReg *d, MMXReg *s)
664e0f19 689{
19cbd87c
EH
690 d->ZMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status);
691 d->ZMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status);
664e0f19
FB
692}
693
fa451874 694void helper_cvtpi2pd(CPUX86State *env, ZMMReg *d, MMXReg *s)
664e0f19 695{
19cbd87c
EH
696 d->ZMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status);
697 d->ZMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status);
664e0f19
FB
698}
699
fa451874 700void helper_cvtsi2ss(CPUX86State *env, ZMMReg *d, uint32_t val)
664e0f19 701{
19cbd87c 702 d->ZMM_S(0) = int32_to_float32(val, &env->sse_status);
664e0f19
FB
703}
704
fa451874 705void helper_cvtsi2sd(CPUX86State *env, ZMMReg *d, uint32_t val)
664e0f19 706{
19cbd87c 707 d->ZMM_D(0) = int32_to_float64(val, &env->sse_status);
664e0f19
FB
708}
709
710#ifdef TARGET_X86_64
fa451874 711void helper_cvtsq2ss(CPUX86State *env, ZMMReg *d, uint64_t val)
664e0f19 712{
19cbd87c 713 d->ZMM_S(0) = int64_to_float32(val, &env->sse_status);
664e0f19
FB
714}
715
fa451874 716void helper_cvtsq2sd(CPUX86State *env, ZMMReg *d, uint64_t val)
664e0f19 717{
19cbd87c 718 d->ZMM_D(0) = int64_to_float64(val, &env->sse_status);
664e0f19
FB
719}
720#endif
721
fd17264a
PB
722#endif
723
664e0f19 724/* float to integer */
1e8a98b5 725
fd17264a 726#if SHIFT == 1
1e8a98b5
PM
727/*
728 * x86 mandates that we return the indefinite integer value for the result
729 * of any float-to-integer conversion that raises the 'invalid' exception.
730 * Wrap the softfloat functions to get this behaviour.
731 */
732#define WRAP_FLOATCONV(RETTYPE, FN, FLOATTYPE, INDEFVALUE) \
733 static inline RETTYPE x86_##FN(FLOATTYPE a, float_status *s) \
734 { \
735 int oldflags, newflags; \
736 RETTYPE r; \
737 \
738 oldflags = get_float_exception_flags(s); \
739 set_float_exception_flags(0, s); \
740 r = FN(a, s); \
741 newflags = get_float_exception_flags(s); \
742 if (newflags & float_flag_invalid) { \
743 r = INDEFVALUE; \
744 } \
745 set_float_exception_flags(newflags | oldflags, s); \
746 return r; \
747 }
748
749WRAP_FLOATCONV(int32_t, float32_to_int32, float32, INT32_MIN)
750WRAP_FLOATCONV(int32_t, float32_to_int32_round_to_zero, float32, INT32_MIN)
751WRAP_FLOATCONV(int32_t, float64_to_int32, float64, INT32_MIN)
752WRAP_FLOATCONV(int32_t, float64_to_int32_round_to_zero, float64, INT32_MIN)
753WRAP_FLOATCONV(int64_t, float32_to_int64, float32, INT64_MIN)
754WRAP_FLOATCONV(int64_t, float32_to_int64_round_to_zero, float32, INT64_MIN)
755WRAP_FLOATCONV(int64_t, float64_to_int64, float64, INT64_MIN)
756WRAP_FLOATCONV(int64_t, float64_to_int64_round_to_zero, float64, INT64_MIN)
fd17264a 757#endif
1e8a98b5 758
ce4fa29f 759void glue(helper_cvtps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 760{
fd17264a
PB
761 int i;
762 for (i = 0; i < 2 << SHIFT; i++) {
763 d->ZMM_L(i) = x86_float32_to_int32(s->ZMM_S(i), &env->sse_status);
764 }
664e0f19
FB
765}
766
ce4fa29f 767void glue(helper_cvtpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 768{
fd17264a
PB
769 int i;
770 for (i = 0; i < 1 << SHIFT; i++) {
771 d->ZMM_L(i) = x86_float64_to_int32(s->ZMM_D(i), &env->sse_status);
772 }
773 for (i >>= 1; i < 1 << SHIFT; i++) {
774 d->Q(i) = 0;
775 }
664e0f19
FB
776}
777
fd17264a 778#if SHIFT == 1
fa451874 779void helper_cvtps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
664e0f19 780{
1e8a98b5
PM
781 d->MMX_L(0) = x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
782 d->MMX_L(1) = x86_float32_to_int32(s->ZMM_S(1), &env->sse_status);
664e0f19
FB
783}
784
fa451874 785void helper_cvtpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
664e0f19 786{
1e8a98b5
PM
787 d->MMX_L(0) = x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
788 d->MMX_L(1) = x86_float64_to_int32(s->ZMM_D(1), &env->sse_status);
664e0f19
FB
789}
790
fa451874 791int32_t helper_cvtss2si(CPUX86State *env, ZMMReg *s)
664e0f19 792{
1e8a98b5 793 return x86_float32_to_int32(s->ZMM_S(0), &env->sse_status);
664e0f19
FB
794}
795
fa451874 796int32_t helper_cvtsd2si(CPUX86State *env, ZMMReg *s)
664e0f19 797{
1e8a98b5 798 return x86_float64_to_int32(s->ZMM_D(0), &env->sse_status);
664e0f19
FB
799}
800
801#ifdef TARGET_X86_64
fa451874 802int64_t helper_cvtss2sq(CPUX86State *env, ZMMReg *s)
664e0f19 803{
1e8a98b5 804 return x86_float32_to_int64(s->ZMM_S(0), &env->sse_status);
664e0f19
FB
805}
806
fa451874 807int64_t helper_cvtsd2sq(CPUX86State *env, ZMMReg *s)
664e0f19 808{
1e8a98b5 809 return x86_float64_to_int64(s->ZMM_D(0), &env->sse_status);
664e0f19
FB
810}
811#endif
fd17264a 812#endif
664e0f19
FB
813
814/* float to integer truncated */
ce4fa29f 815void glue(helper_cvttps2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 816{
fd17264a
PB
817 int i;
818 for (i = 0; i < 2 << SHIFT; i++) {
819 d->ZMM_L(i) = x86_float32_to_int32_round_to_zero(s->ZMM_S(i),
820 &env->sse_status);
821 }
664e0f19
FB
822}
823
ce4fa29f 824void glue(helper_cvttpd2dq, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 825{
fd17264a
PB
826 int i;
827 for (i = 0; i < 1 << SHIFT; i++) {
828 d->ZMM_L(i) = x86_float64_to_int32_round_to_zero(s->ZMM_D(i),
829 &env->sse_status);
830 }
831 for (i >>= 1; i < 1 << SHIFT; i++) {
832 d->Q(i) = 0;
833 }
664e0f19
FB
834}
835
fd17264a 836#if SHIFT == 1
fa451874 837void helper_cvttps2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
664e0f19 838{
1e8a98b5
PM
839 d->MMX_L(0) = x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
840 d->MMX_L(1) = x86_float32_to_int32_round_to_zero(s->ZMM_S(1), &env->sse_status);
664e0f19
FB
841}
842
fa451874 843void helper_cvttpd2pi(CPUX86State *env, MMXReg *d, ZMMReg *s)
664e0f19 844{
1e8a98b5
PM
845 d->MMX_L(0) = x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
846 d->MMX_L(1) = x86_float64_to_int32_round_to_zero(s->ZMM_D(1), &env->sse_status);
664e0f19
FB
847}
848
fa451874 849int32_t helper_cvttss2si(CPUX86State *env, ZMMReg *s)
664e0f19 850{
1e8a98b5 851 return x86_float32_to_int32_round_to_zero(s->ZMM_S(0), &env->sse_status);
664e0f19
FB
852}
853
fa451874 854int32_t helper_cvttsd2si(CPUX86State *env, ZMMReg *s)
664e0f19 855{
1e8a98b5 856 return x86_float64_to_int32_round_to_zero(s->ZMM_D(0), &env->sse_status);
664e0f19
FB
857}
858
859#ifdef TARGET_X86_64
fa451874 860int64_t helper_cvttss2sq(CPUX86State *env, ZMMReg *s)
664e0f19 861{
1e8a98b5 862 return x86_float32_to_int64_round_to_zero(s->ZMM_S(0), &env->sse_status);
664e0f19
FB
863}
864
fa451874 865int64_t helper_cvttsd2sq(CPUX86State *env, ZMMReg *s)
664e0f19 866{
1e8a98b5 867 return x86_float64_to_int64_round_to_zero(s->ZMM_D(0), &env->sse_status);
664e0f19
FB
868}
869#endif
fd17264a 870#endif
664e0f19 871
ce4fa29f 872void glue(helper_rsqrtps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 873{
418b0f93 874 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
3403cafe
PB
875 int i;
876 for (i = 0; i < 2 << SHIFT; i++) {
877 d->ZMM_S(i) = float32_div(float32_one,
878 float32_sqrt(s->ZMM_S(i), &env->sse_status),
879 &env->sse_status);
880 }
418b0f93 881 set_float_exception_flags(old_flags, &env->sse_status);
664e0f19
FB
882}
883
fd17264a 884#if SHIFT == 1
fa451874 885void helper_rsqrtss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 886{
418b0f93 887 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
19cbd87c
EH
888 d->ZMM_S(0) = float32_div(float32_one,
889 float32_sqrt(s->ZMM_S(0), &env->sse_status),
c2ef9a83 890 &env->sse_status);
418b0f93 891 set_float_exception_flags(old_flags, &env->sse_status);
664e0f19 892}
fd17264a 893#endif
664e0f19 894
ce4fa29f 895void glue(helper_rcpps, SUFFIX)(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 896{
418b0f93 897 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
3403cafe
PB
898 int i;
899 for (i = 0; i < 2 << SHIFT; i++) {
900 d->ZMM_S(i) = float32_div(float32_one, s->ZMM_S(i), &env->sse_status);
901 }
418b0f93 902 set_float_exception_flags(old_flags, &env->sse_status);
664e0f19
FB
903}
904
fd17264a 905#if SHIFT == 1
fa451874 906void helper_rcpss(CPUX86State *env, ZMMReg *d, ZMMReg *s)
664e0f19 907{
418b0f93 908 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
19cbd87c 909 d->ZMM_S(0) = float32_div(float32_one, s->ZMM_S(0), &env->sse_status);
418b0f93 910 set_float_exception_flags(old_flags, &env->sse_status);
664e0f19 911}
fd17264a 912#endif
664e0f19 913
fd17264a 914#if SHIFT == 1
d9f4bb27
AP
/*
 * Extract a bit field from src: shift right by "shift" and keep the low
 * "len" bits.  A length of 0 keeps all 64 bits.
 *
 * Only the low six bits of shift and len are used.  This keeps both shifts
 * below the 64-bit operand width even when a caller passes a raw byte
 * (0..255) taken straight from a register.
 */
static inline uint64_t helper_extrq(uint64_t src, int shift, int len)
{
    uint64_t mask;

    shift &= 63;
    len &= 63;

    if (len == 0) {
        mask = ~0ULL;
    } else {
        mask = (1ULL << len) - 1;
    }
    return (src >> shift) & mask;
}
926
fa451874 927void helper_extrq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
d9f4bb27 928{
19cbd87c 929 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), s->ZMM_B(1), s->ZMM_B(0));
d9f4bb27
AP
930}
931
fa451874 932void helper_extrq_i(CPUX86State *env, ZMMReg *d, int index, int length)
d9f4bb27 933{
19cbd87c 934 d->ZMM_Q(0) = helper_extrq(d->ZMM_Q(0), index, length);
d9f4bb27
AP
935}
936
937static inline uint64_t helper_insertq(uint64_t src, int shift, int len)
938{
939 uint64_t mask;
940
941 if (len == 0) {
942 mask = ~0ULL;
943 } else {
944 mask = (1ULL << len) - 1;
945 }
946 return (src & ~(mask << shift)) | ((src & mask) << shift);
947}
948
fa451874 949void helper_insertq_r(CPUX86State *env, ZMMReg *d, ZMMReg *s)
d9f4bb27 950{
19cbd87c 951 d->ZMM_Q(0) = helper_insertq(s->ZMM_Q(0), s->ZMM_B(9), s->ZMM_B(8));
d9f4bb27
AP
952}
953
fa451874 954void helper_insertq_i(CPUX86State *env, ZMMReg *d, int index, int length)
d9f4bb27 955{
19cbd87c 956 d->ZMM_Q(0) = helper_insertq(d->ZMM_Q(0), index, length);
d9f4bb27 957}
fd17264a 958#endif
d9f4bb27 959
6567ffb4
PB
960#define SSE_HELPER_HPS(name, F) \
961void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
962{ \
963 Reg *v = d; \
964 float32 r[2 << SHIFT]; \
965 int i, j, k; \
966 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
967 for (i = j = 0; j < 4; i++, j += 2) { \
968 r[i + k] = F(v->ZMM_S(j + k), v->ZMM_S(j + k + 1), &env->sse_status); \
969 } \
970 for (j = 0; j < 4; i++, j += 2) { \
971 r[i + k] = F(s->ZMM_S(j + k), s->ZMM_S(j + k + 1), &env->sse_status); \
972 } \
973 } \
974 for (i = 0; i < 2 << SHIFT; i++) { \
975 d->ZMM_S(i) = r[i]; \
976 } \
977}
978
979SSE_HELPER_HPS(haddps, float32_add)
980SSE_HELPER_HPS(hsubps, float32_sub)
981
982#define SSE_HELPER_HPD(name, F) \
983void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
984{ \
985 Reg *v = d; \
986 float64 r[1 << SHIFT]; \
987 int i, j, k; \
988 for (k = 0; k < 1 << SHIFT; k += LANE_WIDTH / 8) { \
989 for (i = j = 0; j < 2; i++, j += 2) { \
990 r[i + k] = F(v->ZMM_D(j + k), v->ZMM_D(j + k + 1), &env->sse_status); \
991 } \
992 for (j = 0; j < 2; i++, j += 2) { \
993 r[i + k] = F(s->ZMM_D(j + k), s->ZMM_D(j + k + 1), &env->sse_status); \
994 } \
995 } \
996 for (i = 0; i < 1 << SHIFT; i++) { \
997 d->ZMM_D(i) = r[i]; \
998 } \
999}
1000
1001SSE_HELPER_HPD(haddpd, float64_add)
1002SSE_HELPER_HPD(hsubpd, float64_sub)
664e0f19 1003
3403cafe 1004void glue(helper_addsubps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1005{
3403cafe
PB
1006 Reg *v = d;
1007 int i;
1008 for (i = 0; i < 2 << SHIFT; i += 2) {
1009 d->ZMM_S(i) = float32_sub(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
1010 d->ZMM_S(i+1) = float32_add(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
1011 }
664e0f19
FB
1012}
1013
3403cafe 1014void glue(helper_addsubpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1015{
3403cafe
PB
1016 Reg *v = d;
1017 int i;
1018 for (i = 0; i < 1 << SHIFT; i += 2) {
1019 d->ZMM_D(i) = float64_sub(v->ZMM_D(i), s->ZMM_D(i), &env->sse_status);
1020 d->ZMM_D(i+1) = float64_add(v->ZMM_D(i+1), s->ZMM_D(i+1), &env->sse_status);
1021 }
664e0f19
FB
1022}
1023
cbf4ad54
PB
1024#define SSE_HELPER_CMP_P(name, F, C) \
1025 void glue(helper_ ## name ## ps, SUFFIX)(CPUX86State *env, \
1026 Reg *d, Reg *s) \
d3eb5eae 1027 { \
cbf4ad54
PB
1028 Reg *v = d; \
1029 int i; \
1030 for (i = 0; i < 2 << SHIFT; i++) { \
1031 d->ZMM_L(i) = C(F(32, v->ZMM_S(i), s->ZMM_S(i))) ? -1 : 0; \
1032 } \
d3eb5eae
BS
1033 } \
1034 \
cbf4ad54
PB
1035 void glue(helper_ ## name ## pd, SUFFIX)(CPUX86State *env, \
1036 Reg *d, Reg *s) \
d3eb5eae 1037 { \
cbf4ad54
PB
1038 Reg *v = d; \
1039 int i; \
1040 for (i = 0; i < 1 << SHIFT; i++) { \
1041 d->ZMM_Q(i) = C(F(64, v->ZMM_D(i), s->ZMM_D(i))) ? -1 : 0; \
1042 } \
1043 }
1044
1045#if SHIFT == 1
1046#define SSE_HELPER_CMP(name, F, C) \
1047 SSE_HELPER_CMP_P(name, F, C) \
1048 void helper_ ## name ## ss(CPUX86State *env, Reg *d, Reg *s) \
1049 { \
1050 Reg *v = d; \
1051 d->ZMM_L(0) = C(F(32, v->ZMM_S(0), s->ZMM_S(0))) ? -1 : 0; \
1052 } \
1053 \
1054 void helper_ ## name ## sd(CPUX86State *env, Reg *d, Reg *s) \
1055 { \
1056 Reg *v = d; \
1057 d->ZMM_Q(0) = C(F(64, v->ZMM_D(0), s->ZMM_D(0))) ? -1 : 0; \
e01d9d31
BS
1058 }
1059
cbf4ad54
PB
1060#define FPU_EQ(x) (x == float_relation_equal)
1061#define FPU_LT(x) (x == float_relation_less)
1062#define FPU_LE(x) (x <= float_relation_equal)
1063#define FPU_UNORD(x) (x == float_relation_unordered)
1064
1065#define FPU_CMPQ(size, a, b) \
1066 float ## size ## _compare_quiet(a, b, &env->sse_status)
1067#define FPU_CMPS(size, a, b) \
1068 float ## size ## _compare(a, b, &env->sse_status)
1069
1070#else
1071#define SSE_HELPER_CMP(name, F, C) SSE_HELPER_CMP_P(name, F, C)
1072#endif
1073
1074SSE_HELPER_CMP(cmpeq, FPU_CMPQ, FPU_EQ)
1075SSE_HELPER_CMP(cmplt, FPU_CMPS, FPU_LT)
1076SSE_HELPER_CMP(cmple, FPU_CMPS, FPU_LE)
1077SSE_HELPER_CMP(cmpunord, FPU_CMPQ, FPU_UNORD)
1078SSE_HELPER_CMP(cmpneq, FPU_CMPQ, !FPU_EQ)
1079SSE_HELPER_CMP(cmpnlt, FPU_CMPS, !FPU_LT)
1080SSE_HELPER_CMP(cmpnle, FPU_CMPS, !FPU_LE)
1081SSE_HELPER_CMP(cmpord, FPU_CMPQ, !FPU_UNORD)
1082
1083#undef SSE_HELPER_CMP
664e0f19 1084
#if SHIFT == 1
/*
 * EFLAGS value for each comparison outcome, indexed by (relation + 1).
 * NOTE(review): the order (less, equal, greater, unordered) assumes the
 * FloatRelation values run from -1 to 2 -- confirm against softfloat.
 */
static const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};

/* UCOMISS: compare the low float32 elements with the quiet comparison. */
void helper_ucomiss(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation rel = float32_compare_quiet(d->ZMM_S(0), s->ZMM_S(0),
                                              &env->sse_status);

    CC_SRC = comis_eflags[rel + 1];
}

/* COMISS: compare the low float32 elements with float32_compare(). */
void helper_comiss(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation rel = float32_compare(d->ZMM_S(0), s->ZMM_S(0),
                                        &env->sse_status);

    CC_SRC = comis_eflags[rel + 1];
}

/* UCOMISD: compare the low float64 elements with the quiet comparison. */
void helper_ucomisd(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation rel = float64_compare_quiet(d->ZMM_D(0), s->ZMM_D(0),
                                              &env->sse_status);

    CC_SRC = comis_eflags[rel + 1];
}

/* COMISD: compare the low float64 elements with float64_compare(). */
void helper_comisd(CPUX86State *env, Reg *d, Reg *s)
{
    FloatRelation rel = float64_compare(d->ZMM_D(0), s->ZMM_D(0),
                                        &env->sse_status);

    CC_SRC = comis_eflags[rel + 1];
}
#endif
664e0f19 1132
ce4fa29f 1133uint32_t glue(helper_movmskps, SUFFIX)(CPUX86State *env, Reg *s)
664e0f19 1134{
fd17264a
PB
1135 uint32_t mask;
1136 int i;
e01d9d31 1137
fd17264a
PB
1138 mask = 0;
1139 for (i = 0; i < 2 << SHIFT; i++) {
1140 mask |= (s->ZMM_L(i) >> (31 - i)) & (1 << i);
1141 }
1142 return mask;
664e0f19
FB
1143}
1144
ce4fa29f 1145uint32_t glue(helper_movmskpd, SUFFIX)(CPUX86State *env, Reg *s)
664e0f19 1146{
fd17264a
PB
1147 uint32_t mask;
1148 int i;
e01d9d31 1149
fd17264a
PB
1150 mask = 0;
1151 for (i = 0; i < 1 << SHIFT; i++) {
1152 mask |= (s->ZMM_Q(i) >> (63 - i)) & (1 << i);
1153 }
1154 return mask;
664e0f19
FB
1155}
1156
1157#endif
1158
d3eb5eae 1159uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
5af45186
FB
1160{
1161 uint32_t val;
e894bae8 1162 int i;
e01d9d31 1163
5af45186 1164 val = 0;
e894bae8
PB
1165 for (i = 0; i < (1 << SHIFT); i++) {
1166 uint8_t byte = 0;
1167 byte |= (s->B(8 * i + 0) >> 7);
1168 byte |= (s->B(8 * i + 1) >> 6) & 0x02;
1169 byte |= (s->B(8 * i + 2) >> 5) & 0x04;
1170 byte |= (s->B(8 * i + 3) >> 4) & 0x08;
1171 byte |= (s->B(8 * i + 4) >> 3) & 0x10;
1172 byte |= (s->B(8 * i + 5) >> 2) & 0x20;
1173 byte |= (s->B(8 * i + 6) >> 1) & 0x40;
1174 byte |= (s->B(8 * i + 7)) & 0x80;
1175 val |= byte << (8 * i);
1176 }
5af45186 1177 return val;
664e0f19
FB
1178}
1179
d45b0de6
PB
1180#define PACK_HELPER_B(name, F) \
1181void glue(helper_pack ## name, SUFFIX)(CPUX86State *env, \
1182 Reg *d, Reg *s) \
1183{ \
1184 Reg *v = d; \
1185 uint8_t r[PACK_WIDTH * 2]; \
1186 int j, k; \
1187 for (j = 0; j < 4 << SHIFT; j += PACK_WIDTH) { \
1188 for (k = 0; k < PACK_WIDTH; k++) { \
1189 r[k] = F((int16_t)v->W(j + k)); \
1190 } \
1191 for (k = 0; k < PACK_WIDTH; k++) { \
1192 r[PACK_WIDTH + k] = F((int16_t)s->W(j + k)); \
1193 } \
1194 for (k = 0; k < PACK_WIDTH * 2; k++) { \
1195 d->B(2 * j + k) = r[k]; \
1196 } \
1197 } \
1198}
1199
1200PACK_HELPER_B(sswb, satsb)
1201PACK_HELPER_B(uswb, satub)
664e0f19 1202
d3eb5eae 1203void glue(helper_packssdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
664e0f19 1204{
d45b0de6
PB
1205 Reg *v = d;
1206 uint16_t r[PACK_WIDTH];
1207 int j, k;
664e0f19 1208
d45b0de6
PB
1209 for (j = 0; j < 2 << SHIFT; j += PACK_WIDTH / 2) {
1210 for (k = 0; k < PACK_WIDTH / 2; k++) {
1211 r[k] = satsw(v->L(j + k));
1212 }
1213 for (k = 0; k < PACK_WIDTH / 2; k++) {
1214 r[PACK_WIDTH / 2 + k] = satsw(s->L(j + k));
1215 }
1216 for (k = 0; k < PACK_WIDTH; k++) {
1217 d->W(2 * j + k) = r[k];
1218 }
1219 }
664e0f19
FB
1220}
1221
e01d9d31
BS
1222#define UNPCK_OP(base_name, base) \
1223 \
d3eb5eae 1224 void glue(helper_punpck ## base_name ## bw, SUFFIX)(CPUX86State *env,\
d45b0de6 1225 Reg *d, Reg *s) \
e01d9d31 1226 { \
d45b0de6
PB
1227 Reg *v = d; \
1228 uint8_t r[PACK_WIDTH * 2]; \
1229 int j, i; \
e01d9d31 1230 \
d45b0de6
PB
1231 for (j = 0; j < 8 << SHIFT; ) { \
1232 int k = j + base * PACK_WIDTH; \
1233 for (i = 0; i < PACK_WIDTH; i++) { \
1234 r[2 * i] = v->B(k + i); \
1235 r[2 * i + 1] = s->B(k + i); \
1236 } \
1237 for (i = 0; i < PACK_WIDTH * 2; i++, j++) { \
1238 d->B(j) = r[i]; \
1239 } \
1240 } \
e01d9d31
BS
1241 } \
1242 \
d3eb5eae 1243 void glue(helper_punpck ## base_name ## wd, SUFFIX)(CPUX86State *env,\
d45b0de6 1244 Reg *d, Reg *s) \
e01d9d31 1245 { \
d45b0de6
PB
1246 Reg *v = d; \
1247 uint16_t r[PACK_WIDTH]; \
1248 int j, i; \
e01d9d31 1249 \
d45b0de6
PB
1250 for (j = 0; j < 4 << SHIFT; ) { \
1251 int k = j + base * PACK_WIDTH / 2; \
1252 for (i = 0; i < PACK_WIDTH / 2; i++) { \
1253 r[2 * i] = v->W(k + i); \
1254 r[2 * i + 1] = s->W(k + i); \
1255 } \
1256 for (i = 0; i < PACK_WIDTH; i++, j++) { \
1257 d->W(j) = r[i]; \
1258 } \
1259 } \
e01d9d31
BS
1260 } \
1261 \
d3eb5eae 1262 void glue(helper_punpck ## base_name ## dq, SUFFIX)(CPUX86State *env,\
d45b0de6 1263 Reg *d, Reg *s) \
e01d9d31 1264 { \
d45b0de6
PB
1265 Reg *v = d; \
1266 uint32_t r[PACK_WIDTH / 2]; \
1267 int j, i; \
e01d9d31 1268 \
d45b0de6
PB
1269 for (j = 0; j < 2 << SHIFT; ) { \
1270 int k = j + base * PACK_WIDTH / 4; \
1271 for (i = 0; i < PACK_WIDTH / 4; i++) { \
1272 r[2 * i] = v->L(k + i); \
1273 r[2 * i + 1] = s->L(k + i); \
1274 } \
1275 for (i = 0; i < PACK_WIDTH / 2; i++, j++) { \
1276 d->L(j) = r[i]; \
1277 } \
1278 } \
e01d9d31
BS
1279 } \
1280 \
1281 XMM_ONLY( \
d45b0de6
PB
1282 void glue(helper_punpck ## base_name ## qdq, SUFFIX)( \
1283 CPUX86State *env, Reg *d, Reg *s) \
e01d9d31 1284 { \
d45b0de6
PB
1285 Reg *v = d; \
1286 uint64_t r[2]; \
1287 int i; \
e01d9d31 1288 \
d45b0de6
PB
1289 for (i = 0; i < 1 << SHIFT; i += 2) { \
1290 r[0] = v->Q(base + i); \
1291 r[1] = s->Q(base + i); \
1292 d->Q(i) = r[0]; \
1293 d->Q(i + 1) = r[1]; \
1294 } \
e01d9d31
BS
1295 } \
1296 )
664e0f19
FB
1297
1298UNPCK_OP(l, 0)
1299UNPCK_OP(h, 1)
1300
d45b0de6
PB
1301#undef PACK_WIDTH
1302#undef PACK_HELPER_B
1303#undef UNPCK_OP
1304
1305
a35f3ec7
AJ
1306/* 3DNow! float ops */
1307#if SHIFT == 0
d3eb5eae 1308void helper_pi2fd(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1309{
a35f3ec7
AJ
1310 d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status);
1311 d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status);
1312}
1313
d3eb5eae 1314void helper_pi2fw(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1315{
a35f3ec7
AJ
1316 d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status);
1317 d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status);
1318}
1319
d3eb5eae 1320void helper_pf2id(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1321{
a35f3ec7
AJ
1322 d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status);
1323 d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status);
1324}
1325
d3eb5eae 1326void helper_pf2iw(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1327{
e01d9d31
BS
1328 d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0),
1329 &env->mmx_status));
1330 d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1),
1331 &env->mmx_status));
a35f3ec7
AJ
1332}
1333
d3eb5eae 1334void helper_pfacc(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1335{
25bdec79 1336 float32 r;
e01d9d31 1337
25bdec79
PB
1338 r = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1339 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1340 d->MMX_S(0) = r;
a35f3ec7
AJ
1341}
1342
d3eb5eae 1343void helper_pfadd(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1344{
a35f3ec7
AJ
1345 d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1346 d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1347}
1348
d3eb5eae 1349void helper_pfcmpeq(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1350{
e01d9d31
BS
1351 d->MMX_L(0) = float32_eq_quiet(d->MMX_S(0), s->MMX_S(0),
1352 &env->mmx_status) ? -1 : 0;
1353 d->MMX_L(1) = float32_eq_quiet(d->MMX_S(1), s->MMX_S(1),
1354 &env->mmx_status) ? -1 : 0;
a35f3ec7
AJ
1355}
1356
d3eb5eae 1357void helper_pfcmpge(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1358{
e01d9d31
BS
1359 d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0),
1360 &env->mmx_status) ? -1 : 0;
1361 d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1),
1362 &env->mmx_status) ? -1 : 0;
a35f3ec7
AJ
1363}
1364
d3eb5eae 1365void helper_pfcmpgt(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1366{
e01d9d31
BS
1367 d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0),
1368 &env->mmx_status) ? -1 : 0;
1369 d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1),
1370 &env->mmx_status) ? -1 : 0;
a35f3ec7
AJ
1371}
1372
d3eb5eae 1373void helper_pfmax(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1374{
e01d9d31 1375 if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) {
a35f3ec7 1376 d->MMX_S(0) = s->MMX_S(0);
e01d9d31
BS
1377 }
1378 if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) {
a35f3ec7 1379 d->MMX_S(1) = s->MMX_S(1);
e01d9d31 1380 }
a35f3ec7
AJ
1381}
1382
d3eb5eae 1383void helper_pfmin(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1384{
e01d9d31 1385 if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) {
a35f3ec7 1386 d->MMX_S(0) = s->MMX_S(0);
e01d9d31
BS
1387 }
1388 if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) {
a35f3ec7 1389 d->MMX_S(1) = s->MMX_S(1);
e01d9d31 1390 }
a35f3ec7
AJ
1391}
1392
d3eb5eae 1393void helper_pfmul(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1394{
a35f3ec7
AJ
1395 d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1396 d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1397}
1398
d3eb5eae 1399void helper_pfnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1400{
25bdec79 1401 float32 r;
e01d9d31 1402
25bdec79
PB
1403 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1404 d->MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1405 d->MMX_S(0) = r;
a35f3ec7
AJ
1406}
1407
d3eb5eae 1408void helper_pfpnacc(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1409{
25bdec79 1410 float32 r;
e01d9d31 1411
25bdec79
PB
1412 r = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status);
1413 d->MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status);
1414 d->MMX_S(0) = r;
a35f3ec7
AJ
1415}
1416
d3eb5eae 1417void helper_pfrcp(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1418{
c2ef9a83 1419 d->MMX_S(0) = float32_div(float32_one, s->MMX_S(0), &env->mmx_status);
a35f3ec7
AJ
1420 d->MMX_S(1) = d->MMX_S(0);
1421}
1422
d3eb5eae 1423void helper_pfrsqrt(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1424{
a35f3ec7 1425 d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff;
c2ef9a83
AJ
1426 d->MMX_S(1) = float32_div(float32_one,
1427 float32_sqrt(d->MMX_S(1), &env->mmx_status),
1428 &env->mmx_status);
a35f3ec7
AJ
1429 d->MMX_L(1) |= s->MMX_L(0) & 0x80000000;
1430 d->MMX_L(0) = d->MMX_L(1);
1431}
1432
d3eb5eae 1433void helper_pfsub(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1434{
a35f3ec7
AJ
1435 d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status);
1436 d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status);
1437}
1438
d3eb5eae 1439void helper_pfsubr(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1440{
a35f3ec7
AJ
1441 d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status);
1442 d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status);
1443}
1444
d3eb5eae 1445void helper_pswapd(CPUX86State *env, MMXReg *d, MMXReg *s)
a35f3ec7 1446{
25bdec79 1447 uint32_t r;
e01d9d31 1448
25bdec79
PB
1449 r = s->MMX_L(0);
1450 d->MMX_L(0) = s->MMX_L(1);
1451 d->MMX_L(1) = r;
a35f3ec7
AJ
1452}
1453#endif
1454
4242b1bd 1455/* SSSE3 op helpers */
d3eb5eae 1456void glue(helper_pshufb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
4242b1bd 1457{
d45b0de6 1458 Reg *v = d;
4242b1bd 1459 int i;
d45b0de6
PB
1460#if SHIFT == 0
1461 uint8_t r[8];
4242b1bd 1462
d45b0de6
PB
1463 for (i = 0; i < 8; i++) {
1464 r[i] = (s->B(i) & 0x80) ? 0 : (v->B(s->B(i) & 7));
e01d9d31 1465 }
d45b0de6
PB
1466 for (i = 0; i < 8; i++) {
1467 d->B(i) = r[i];
1468 }
1469#else
1470 uint8_t r[8 << SHIFT];
4242b1bd 1471
d45b0de6
PB
1472 for (i = 0; i < 8 << SHIFT; i++) {
1473 int j = i & ~0xf;
1474 r[i] = (s->B(i) & 0x80) ? 0 : v->B(j | (s->B(i) & 0xf));
1475 }
1476 for (i = 0; i < 8 << SHIFT; i++) {
1477 d->B(i) = r[i];
1478 }
4242b1bd
AZ
1479#endif
1480}
1481
d45b0de6
PB
1482#define SSE_HELPER_HW(name, F) \
1483void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
1484{ \
1485 Reg *v = d; \
1486 uint16_t r[4 << SHIFT]; \
1487 int i, j, k; \
1488 for (k = 0; k < 4 << SHIFT; k += LANE_WIDTH / 2) { \
1489 for (i = j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
1490 r[i + k] = F(v->W(j + k), v->W(j + k + 1)); \
1491 } \
1492 for (j = 0; j < LANE_WIDTH / 2; i++, j += 2) { \
1493 r[i + k] = F(s->W(j + k), s->W(j + k + 1)); \
1494 } \
1495 } \
1496 for (i = 0; i < 4 << SHIFT; i++) { \
1497 d->W(i) = r[i]; \
1498 } \
1499}
1500
1501#define SSE_HELPER_HL(name, F) \
1502void glue(helper_ ## name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s) \
1503{ \
1504 Reg *v = d; \
1505 uint32_t r[2 << SHIFT]; \
1506 int i, j, k; \
1507 for (k = 0; k < 2 << SHIFT; k += LANE_WIDTH / 4) { \
1508 for (i = j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
1509 r[i + k] = F(v->L(j + k), v->L(j + k + 1)); \
1510 } \
1511 for (j = 0; j < LANE_WIDTH / 4; i++, j += 2) { \
1512 r[i + k] = F(s->L(j + k), s->L(j + k + 1)); \
1513 } \
1514 } \
1515 for (i = 0; i < 2 << SHIFT; i++) { \
1516 d->L(i) = r[i]; \
1517 } \
1518}
1519
1520SSE_HELPER_HW(phaddw, FADD)
1521SSE_HELPER_HW(phsubw, FSUB)
1522SSE_HELPER_HW(phaddsw, FADDSW)
1523SSE_HELPER_HW(phsubsw, FSUBSW)
1524SSE_HELPER_HL(phaddd, FADD)
1525SSE_HELPER_HL(phsubd, FSUB)
1526
1527#undef SSE_HELPER_HW
1528#undef SSE_HELPER_HL
4242b1bd 1529
d45b0de6 1530void glue(helper_pmaddubsw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
4242b1bd 1531{
d45b0de6
PB
1532 Reg *v = d;
1533 int i;
1534 for (i = 0; i < 4 << SHIFT; i++) {
1535 d->W(i) = satsw((int8_t)s->B(i * 2) * (uint8_t)v->B(i * 2) +
1536 (int8_t)s->B(i * 2 + 1) * (uint8_t)v->B(i * 2 + 1));
1537 }
4242b1bd
AZ
1538}
1539
/*
 * Absolute value of an element held as an unsigned quantity: values above
 * the signed maximum represent negative numbers and are negated.
 *
 * Arguments are parenthesised so expression arguments expand correctly.
 * FABSL negates in unsigned arithmetic because negating INT32_MIN as a
 * signed value overflows; the low 32 bits of the result are unchanged.
 */
#define FABSB(x) ((x) > INT8_MAX ? -(int8_t)(x) : (x))
#define FABSW(x) ((x) > INT16_MAX ? -(int16_t)(x) : (x))
#define FABSL(x) ((x) > INT32_MAX ? 0 - (uint32_t)(x) : (x))
1543SSE_HELPER_1(helper_pabsb, B, 8 << SHIFT, FABSB)
1544SSE_HELPER_1(helper_pabsw, W, 4 << SHIFT, FABSW)
1545SSE_HELPER_1(helper_pabsd, L, 2 << SHIFT, FABSL)
4242b1bd 1546
e01d9d31 1547#define FMULHRSW(d, s) (((int16_t) d * (int16_t)s + 0x4000) >> 15)
4242b1bd
AZ
1548SSE_HELPER_W(helper_pmulhrsw, FMULHRSW)
1549
/*
 * PSIGN element operation.  s is held as an unsigned quantity: zero
 * yields 0, a value up to the signed maximum yields d, anything larger
 * (a negative s) yields -d.
 *
 * Arguments are parenthesised so expression arguments expand correctly.
 * FSIGNL negates in unsigned arithmetic because negating INT32_MIN as a
 * signed value overflows; the low 32 bits of the result are unchanged.
 */
#define FSIGNB(d, s) ((s) <= INT8_MAX ? ((s) ? (d) : 0) : -(int8_t)(d))
#define FSIGNW(d, s) ((s) <= INT16_MAX ? ((s) ? (d) : 0) : -(int16_t)(d))
#define FSIGNL(d, s) ((s) <= INT32_MAX ? ((s) ? (d) : 0) : 0 - (uint32_t)(d))
4242b1bd
AZ
1553SSE_HELPER_B(helper_psignb, FSIGNB)
1554SSE_HELPER_W(helper_psignw, FSIGNW)
1555SSE_HELPER_L(helper_psignd, FSIGNL)
1556
d3eb5eae
BS
1557void glue(helper_palignr, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1558 int32_t shift)
4242b1bd 1559{
d45b0de6
PB
1560 Reg *v = d;
1561 int i;
4242b1bd
AZ
1562
1563 /* XXX could be checked during translation */
d45b0de6
PB
1564 if (shift >= (SHIFT ? 32 : 16)) {
1565 for (i = 0; i < (1 << SHIFT); i++) {
1566 d->Q(i) = 0;
1567 }
4242b1bd
AZ
1568 } else {
1569 shift <<= 3;
1570#define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0)
1571#if SHIFT == 0
d45b0de6
PB
1572 d->Q(0) = SHR(s->Q(0), shift - 0) |
1573 SHR(v->Q(0), shift - 64);
4242b1bd 1574#else
d45b0de6
PB
1575 for (i = 0; i < (1 << SHIFT); i += 2) {
1576 uint64_t r0, r1;
1577
1578 r0 = SHR(s->Q(i), shift - 0) |
1579 SHR(s->Q(i + 1), shift - 64) |
1580 SHR(v->Q(i), shift - 128) |
1581 SHR(v->Q(i + 1), shift - 192);
1582 r1 = SHR(s->Q(i), shift + 64) |
1583 SHR(s->Q(i + 1), shift - 0) |
1584 SHR(v->Q(i), shift - 64) |
1585 SHR(v->Q(i + 1), shift - 128);
1586 d->Q(i) = r0;
1587 d->Q(i + 1) = r1;
1588 }
4242b1bd
AZ
1589#endif
1590#undef SHR
1591 }
4242b1bd
AZ
1592}
1593
0e29cea5 1594#if SHIFT >= 1
222a3336 1595
/*
 * Element-wise three-input helper: F(first operand, s, mask) where the
 * mask element comes from env->xmm_regs[0].
 */
#define SSE_HELPER_V(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)           \
    {                                                                   \
        Reg *v = d;                                                     \
        Reg *m = &env->xmm_regs[0];                                     \
        int n;                                                          \
        for (n = 0; n < num; n++) {                                     \
            d->elem(n) = F(v->elem(n), s->elem(n), m->elem(n));         \
        }                                                               \
    }
1606
/*
 * Element-wise helper controlled by an immediate: element n is combined
 * using bit (n & 7) of imm, so the 8-bit pattern repeats every eight
 * elements.
 */
#define SSE_HELPER_I(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,           \
                            uint32_t imm)                               \
    {                                                                   \
        Reg *v = d;                                                     \
        int n;                                                          \
        for (n = 0; n < num; n++) {                                     \
            int bit = n & 7;                                            \
            d->elem(n) = F(v->elem(n), s->elem(n), (imm >> bit) & 1);   \
        }                                                               \
    }
222a3336
AZ
1618
1619/* SSE4.1 op helpers */
0e29cea5
PB
1620#define FBLENDVB(v, s, m) ((m & 0x80) ? s : v)
1621#define FBLENDVPS(v, s, m) ((m & 0x80000000) ? s : v)
1622#define FBLENDVPD(v, s, m) ((m & 0x8000000000000000LL) ? s : v)
1623SSE_HELPER_V(helper_pblendvb, B, 8 << SHIFT, FBLENDVB)
1624SSE_HELPER_V(helper_blendvps, L, 2 << SHIFT, FBLENDVPS)
1625SSE_HELPER_V(helper_blendvpd, Q, 1 << SHIFT, FBLENDVPD)
222a3336 1626
d3eb5eae 1627void glue(helper_ptest, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
222a3336 1628{
e894bae8
PB
1629 uint64_t zf = 0, cf = 0;
1630 int i;
222a3336 1631
e894bae8
PB
1632 for (i = 0; i < 1 << SHIFT; i++) {
1633 zf |= (s->Q(i) & d->Q(i));
1634 cf |= (s->Q(i) & ~d->Q(i));
1635 }
222a3336
AZ
1636 CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C);
1637}
1638
/*
 * Widening move helper: d->elem(i) = F(i) for i from num - 1 down to 0.
 * NOTE(review): the descending order looks intended to keep not-yet-read
 * narrow source elements intact when d and s are the same register --
 * confirm against the callers.
 */
#define SSE_HELPER_F(name, elem, num, F)                                \
    void glue(name, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)           \
    {                                                                   \
        int i = num;                                                    \
        while (i-- > 0) {                                               \
            d->elem(i) = F(i);                                          \
        }                                                               \
    }

#if SHIFT > 0
/* PMOVSX*: the cast makes the widening sign-extend. */
SSE_HELPER_F(helper_pmovsxbw, W, 4 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbd, L, 2 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxbq, Q, 1 << SHIFT, (int8_t) s->B)
SSE_HELPER_F(helper_pmovsxwd, L, 2 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxwq, Q, 1 << SHIFT, (int16_t) s->W)
SSE_HELPER_F(helper_pmovsxdq, Q, 1 << SHIFT, (int32_t) s->L)
/* PMOVZX*: the source element is used as-is. */
SSE_HELPER_F(helper_pmovzxbw, W, 4 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbd, L, 2 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxbq, Q, 1 << SHIFT, s->B)
SSE_HELPER_F(helper_pmovzxwd, L, 2 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxwq, Q, 1 << SHIFT, s->W)
SSE_HELPER_F(helper_pmovzxdq, Q, 1 << SHIFT, s->L)
#endif
222a3336 1662
d3eb5eae 1663void glue(helper_pmuldq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
222a3336 1664{
e894bae8
PB
1665 Reg *v = d;
1666 int i;
1667
1668 for (i = 0; i < 1 << SHIFT; i++) {
1669 d->Q(i) = (int64_t)(int32_t) v->L(2 * i) * (int32_t) s->L(2 * i);
1670 }
222a3336
AZ
1671}
1672
e01d9d31 1673#define FCMPEQQ(d, s) (d == s ? -1 : 0)
222a3336
AZ
1674SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ)
1675
d3eb5eae 1676void glue(helper_packusdw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
222a3336 1677{
d45b0de6
PB
1678 Reg *v = d;
1679 uint16_t r[8];
1680 int i, j, k;
1681
1682 for (i = 0, j = 0; i <= 2 << SHIFT; i += 8, j += 4) {
1683 r[0] = satuw(v->L(j));
1684 r[1] = satuw(v->L(j + 1));
1685 r[2] = satuw(v->L(j + 2));
1686 r[3] = satuw(v->L(j + 3));
1687 r[4] = satuw(s->L(j));
1688 r[5] = satuw(s->L(j + 1));
1689 r[6] = satuw(s->L(j + 2));
1690 r[7] = satuw(s->L(j + 3));
1691 for (k = 0; k < 8; k++) {
1692 d->W(i + k) = r[k];
1693 }
1694 }
222a3336
AZ
1695}
1696
e01d9d31
BS
1697#define FMINSB(d, s) MIN((int8_t)d, (int8_t)s)
1698#define FMINSD(d, s) MIN((int32_t)d, (int32_t)s)
1699#define FMAXSB(d, s) MAX((int8_t)d, (int8_t)s)
1700#define FMAXSD(d, s) MAX((int32_t)d, (int32_t)s)
222a3336
AZ
1701SSE_HELPER_B(helper_pminsb, FMINSB)
1702SSE_HELPER_L(helper_pminsd, FMINSD)
1703SSE_HELPER_W(helper_pminuw, MIN)
1704SSE_HELPER_L(helper_pminud, MIN)
1705SSE_HELPER_B(helper_pmaxsb, FMAXSB)
1706SSE_HELPER_L(helper_pmaxsd, FMAXSD)
1707SSE_HELPER_W(helper_pmaxuw, MAX)
1708SSE_HELPER_L(helper_pmaxud, MAX)
1709
/*
 * PMULLD element operation: low 32 bits of the 32x32 product.  The
 * multiplication is done in unsigned arithmetic, which wraps, because a
 * signed int32 product can overflow; the low 32 bits are the same for
 * signed and unsigned operands.
 */
#define FMULLD(d, s) ((uint32_t)(d) * (uint32_t)(s))
222a3336
AZ
1711SSE_HELPER_L(helper_pmulld, FMULLD)
1712
#if SHIFT == 1
/*
 * PHMINPOSUW: find the smallest of the eight words of s.  Word 0 of d
 * receives the value and word 1 its index; the rest of d is cleared.  The
 * strict comparison keeps the lowest index when values tie.
 */
void glue(helper_phminposuw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
{
    int best = 0;
    int n;

    for (n = 1; n < 8; n++) {
        if (s->W(n) < s->W(best)) {
            best = n;
        }
    }

    d->W(0) = s->W(best);
    d->W(1) = best;
    d->L(1) = 0;
    d->Q(1) = 0;
}
#endif
222a3336 1746
d3eb5eae
BS
1747void glue(helper_roundps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1748 uint32_t mode)
222a3336 1749{
418b0f93 1750 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336 1751 signed char prev_rounding_mode;
fd17264a 1752 int i;
222a3336
AZ
1753
1754 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1755 if (!(mode & (1 << 2))) {
222a3336
AZ
1756 switch (mode & 3) {
1757 case 0:
1758 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1759 break;
1760 case 1:
1761 set_float_rounding_mode(float_round_down, &env->sse_status);
1762 break;
1763 case 2:
1764 set_float_rounding_mode(float_round_up, &env->sse_status);
1765 break;
1766 case 3:
1767 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1768 break;
1769 }
e01d9d31 1770 }
222a3336 1771
fd17264a
PB
1772 for (i = 0; i < 2 << SHIFT; i++) {
1773 d->ZMM_S(i) = float32_round_to_int(s->ZMM_S(i), &env->sse_status);
1774 }
222a3336 1775
418b0f93 1776 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1777 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1778 ~float_flag_inexact,
1779 &env->sse_status);
1780 }
222a3336
AZ
1781 env->sse_status.float_rounding_mode = prev_rounding_mode;
1782}
1783
d3eb5eae
BS
1784void glue(helper_roundpd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1785 uint32_t mode)
222a3336 1786{
418b0f93 1787 uint8_t old_flags = get_float_exception_flags(&env->sse_status);
222a3336 1788 signed char prev_rounding_mode;
fd17264a 1789 int i;
222a3336
AZ
1790
1791 prev_rounding_mode = env->sse_status.float_rounding_mode;
e01d9d31 1792 if (!(mode & (1 << 2))) {
222a3336
AZ
1793 switch (mode & 3) {
1794 case 0:
1795 set_float_rounding_mode(float_round_nearest_even, &env->sse_status);
1796 break;
1797 case 1:
1798 set_float_rounding_mode(float_round_down, &env->sse_status);
1799 break;
1800 case 2:
1801 set_float_rounding_mode(float_round_up, &env->sse_status);
1802 break;
1803 case 3:
1804 set_float_rounding_mode(float_round_to_zero, &env->sse_status);
1805 break;
1806 }
e01d9d31 1807 }
222a3336 1808
fd17264a
PB
1809 for (i = 0; i < 1 << SHIFT; i++) {
1810 d->ZMM_D(i) = float64_round_to_int(s->ZMM_D(i), &env->sse_status);
1811 }
222a3336 1812
418b0f93 1813 if (mode & (1 << 3) && !(old_flags & float_flag_inexact)) {
e01d9d31
BS
1814 set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
1815 ~float_flag_inexact,
1816 &env->sse_status);
1817 }
222a3336
AZ
1818 env->sse_status.float_rounding_mode = prev_rounding_mode;
1819}
1820
#if SHIFT == 1
/*
 * ROUNDSS: round the low float32 element of s to an integral value and
 * store it in the low element of d; other elements of d are not written.
 *
 * mode bit 2 clear: use the rounding mode selected by mode bits 1:0;
 *      bit 2 set:   keep the rounding mode already in env->sse_status.
 * mode bit 3 set:   do not let this operation raise the inexact flag
 *                   (it is cleared afterwards unless it was already set).
 * The rounding mode in env->sse_status is restored before returning.
 */
void glue(helper_roundss, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                  uint32_t mode)
{
    /* Rounding selected by mode bits 1:0. */
    static const signed char rc_mode[4] = {
        float_round_nearest_even,
        float_round_down,
        float_round_up,
        float_round_to_zero,
    };
    uint8_t saved_flags = get_float_exception_flags(&env->sse_status);
    signed char saved_rounding = env->sse_status.float_rounding_mode;

    if ((mode & (1 << 2)) == 0) {
        set_float_rounding_mode(rc_mode[mode & 3], &env->sse_status);
    }

    d->ZMM_S(0) = float32_round_to_int(s->ZMM_S(0), &env->sse_status);

    if ((mode & (1 << 3)) && !(saved_flags & float_flag_inexact)) {
        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
                                  ~float_flag_inexact,
                                  &env->sse_status);
    }
    env->sse_status.float_rounding_mode = saved_rounding;
}

/*
 * ROUNDSD: round the low float64 element of s to an integral value and
 * store it in the low element of d; other elements of d are not written.
 * The mode bits are interpreted exactly as for ROUNDSS.
 */
void glue(helper_roundsd, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
                                  uint32_t mode)
{
    /* Rounding selected by mode bits 1:0. */
    static const signed char rc_mode[4] = {
        float_round_nearest_even,
        float_round_down,
        float_round_up,
        float_round_to_zero,
    };
    uint8_t saved_flags = get_float_exception_flags(&env->sse_status);
    signed char saved_rounding = env->sse_status.float_rounding_mode;

    if ((mode & (1 << 2)) == 0) {
        set_float_rounding_mode(rc_mode[mode & 3], &env->sse_status);
    }

    d->ZMM_D(0) = float64_round_to_int(s->ZMM_D(0), &env->sse_status);

    if ((mode & (1 << 3)) && !(saved_flags & float_flag_inexact)) {
        set_float_exception_flags(get_float_exception_flags(&env->sse_status) &
                                  ~float_flag_inexact,
                                  &env->sse_status);
    }
    env->sse_status.float_rounding_mode = saved_rounding;
}
#endif
222a3336 1890
0e29cea5
PB
1891#define FBLENDP(v, s, m) (m ? s : v)
1892SSE_HELPER_I(helper_blendps, L, 2 << SHIFT, FBLENDP)
1893SSE_HELPER_I(helper_blendpd, Q, 1 << SHIFT, FBLENDP)
1894SSE_HELPER_I(helper_pblendw, W, 4 << SHIFT, FBLENDP)
222a3336 1895
6f218d6e
PB
1896void glue(helper_dpps, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1897 uint32_t mask)
222a3336 1898{
6f218d6e 1899 Reg *v = d;
bf30ad8c 1900 float32 prod1, prod2, temp2, temp3, temp4;
6f218d6e 1901 int i;
222a3336 1902
6f218d6e
PB
1903 for (i = 0; i < 2 << SHIFT; i += 4) {
1904 /*
1905 * We must evaluate (A+B)+(C+D), not ((A+B)+C)+D
1906 * to correctly round the intermediate results
1907 */
1908 if (mask & (1 << 4)) {
1909 prod1 = float32_mul(v->ZMM_S(i), s->ZMM_S(i), &env->sse_status);
1910 } else {
1911 prod1 = float32_zero;
1912 }
1913 if (mask & (1 << 5)) {
1914 prod2 = float32_mul(v->ZMM_S(i+1), s->ZMM_S(i+1), &env->sse_status);
1915 } else {
1916 prod2 = float32_zero;
1917 }
1918 temp2 = float32_add(prod1, prod2, &env->sse_status);
1919 if (mask & (1 << 6)) {
1920 prod1 = float32_mul(v->ZMM_S(i+2), s->ZMM_S(i+2), &env->sse_status);
1921 } else {
1922 prod1 = float32_zero;
1923 }
1924 if (mask & (1 << 7)) {
1925 prod2 = float32_mul(v->ZMM_S(i+3), s->ZMM_S(i+3), &env->sse_status);
1926 } else {
1927 prod2 = float32_zero;
1928 }
1929 temp3 = float32_add(prod1, prod2, &env->sse_status);
1930 temp4 = float32_add(temp2, temp3, &env->sse_status);
bf30ad8c 1931
6f218d6e
PB
1932 d->ZMM_S(i) = (mask & (1 << 0)) ? temp4 : float32_zero;
1933 d->ZMM_S(i+1) = (mask & (1 << 1)) ? temp4 : float32_zero;
1934 d->ZMM_S(i+2) = (mask & (1 << 2)) ? temp4 : float32_zero;
1935 d->ZMM_S(i+3) = (mask & (1 << 3)) ? temp4 : float32_zero;
1936 }
222a3336
AZ
1937}
1938
#if SHIFT == 1
/* Oddly, there is no ymm version of dppd */
/*
 * DPPD: masked dot product of the two float64 elements.  mask bits 5:4
 * select which products take part (unselected products count as zero);
 * mask bits 1:0 select which destination elements receive the sum (the
 * others are set to zero).
 */
void glue(helper_dppd, SUFFIX)(CPUX86State *env,
                               Reg *d, Reg *s, uint32_t mask)
{
    Reg *v = d;
    float64 p0 = float64_zero;
    float64 p1 = float64_zero;
    float64 sum;

    if (mask & (1 << 4)) {
        p0 = float64_mul(v->ZMM_D(0), s->ZMM_D(0), &env->sse_status);
    }
    if (mask & (1 << 5)) {
        p1 = float64_mul(v->ZMM_D(1), s->ZMM_D(1), &env->sse_status);
    }
    sum = float64_add(p0, p1, &env->sse_status);

    d->ZMM_D(0) = (mask & (1 << 0)) ? sum : float64_zero;
    d->ZMM_D(1) = (mask & (1 << 1)) ? sum : float64_zero;
}
#endif
222a3336 1962
d3eb5eae
BS
1963void glue(helper_mpsadbw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
1964 uint32_t offset)
222a3336 1965{
d45b0de6
PB
1966 Reg *v = d;
1967 int i, j;
1968 uint16_t r[8];
1969
1970 for (j = 0; j < 4 << SHIFT; ) {
1971 int s0 = (j * 2) + ((offset & 3) << 2);
1972 int d0 = (j * 2) + ((offset & 4) << 0);
1973 for (i = 0; i < LANE_WIDTH / 2; i++, d0++) {
1974 r[i] = 0;
1975 r[i] += abs1(v->B(d0 + 0) - s->B(s0 + 0));
1976 r[i] += abs1(v->B(d0 + 1) - s->B(s0 + 1));
1977 r[i] += abs1(v->B(d0 + 2) - s->B(s0 + 2));
1978 r[i] += abs1(v->B(d0 + 3) - s->B(s0 + 3));
1979 }
1980 for (i = 0; i < LANE_WIDTH / 2; i++, j++) {
1981 d->W(j) = r[i];
1982 }
1983 offset >>= 3;
222a3336 1984 }
222a3336
AZ
1985}
1986
1987/* SSE4.2 op helpers */
da5156cd 1988#define FCMPGTQ(d, s) ((int64_t)d > (int64_t)s ? -1 : 0)
222a3336
AZ
1989SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ)
1990
fd17264a 1991#if SHIFT == 1
d3eb5eae 1992static inline int pcmp_elen(CPUX86State *env, int reg, uint32_t ctrl)
222a3336 1993{
d1da229f 1994 target_long val, limit;
222a3336
AZ
1995
1996 /* Presence of REX.W is indicated by a bit higher than 7 set */
e01d9d31 1997 if (ctrl >> 8) {
d1da229f 1998 val = (target_long)env->regs[reg];
e01d9d31 1999 } else {
d1da229f 2000 val = (int32_t)env->regs[reg];
e01d9d31 2001 }
222a3336 2002 if (ctrl & 1) {
d1da229f 2003 limit = 8;
e01d9d31 2004 } else {
d1da229f 2005 limit = 16;
e01d9d31 2006 }
d1da229f
PB
2007 if ((val > limit) || (val < -limit)) {
2008 return limit;
2009 }
2010 return abs1(val);
222a3336
AZ
2011}
2012
2013static inline int pcmp_ilen(Reg *r, uint8_t ctrl)
2014{
2015 int val = 0;
2016
2017 if (ctrl & 1) {
e01d9d31 2018 while (val < 8 && r->W(val)) {
222a3336 2019 val++;
e01d9d31
BS
2020 }
2021 } else {
2022 while (val < 16 && r->B(val)) {
222a3336 2023 val++;
e01d9d31
BS
2024 }
2025 }
222a3336
AZ
2026
2027 return val;
2028}
2029
2030static inline int pcmp_val(Reg *r, uint8_t ctrl, int i)
2031{
2032 switch ((ctrl >> 0) & 3) {
2033 case 0:
2034 return r->B(i);
2035 case 1:
2036 return r->W(i);
2037 case 2:
e01d9d31 2038 return (int8_t)r->B(i);
222a3336
AZ
2039 case 3:
2040 default:
e01d9d31 2041 return (int16_t)r->W(i);
222a3336
AZ
2042 }
2043}
2044
d3eb5eae 2045static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
e01d9d31 2046 int8_t ctrl, int valids, int validd)
222a3336
AZ
2047{
2048 unsigned int res = 0;
2049 int v;
2050 int j, i;
2051 int upper = (ctrl & 1) ? 7 : 15;
2052
2053 valids--;
2054 validd--;
2055
2056 CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0);
2057
2058 switch ((ctrl >> 2) & 3) {
2059 case 0:
2060 for (j = valids; j >= 0; j--) {
2061 res <<= 1;
2062 v = pcmp_val(s, ctrl, j);
e01d9d31 2063 for (i = validd; i >= 0; i--) {
222a3336 2064 res |= (v == pcmp_val(d, ctrl, i));
e01d9d31 2065 }
222a3336
AZ
2066 }
2067 break;
2068 case 1:
2069 for (j = valids; j >= 0; j--) {
2070 res <<= 1;
2071 v = pcmp_val(s, ctrl, j);
e01d9d31 2072 for (i = ((validd - 1) | 1); i >= 0; i -= 2) {
649ad05e
AJ
2073 res |= (pcmp_val(d, ctrl, i - 0) >= v &&
2074 pcmp_val(d, ctrl, i - 1) <= v);
e01d9d31 2075 }
222a3336
AZ
2076 }
2077 break;
2078 case 2:
b27a6cac 2079 res = (1 << (upper - MAX(valids, validd))) - 1;
222a3336
AZ
2080 res <<= MAX(valids, validd) - MIN(valids, validd);
2081 for (i = MIN(valids, validd); i >= 0; i--) {
2082 res <<= 1;
2083 v = pcmp_val(s, ctrl, i);
2084 res |= (v == pcmp_val(d, ctrl, i));
2085 }
2086 break;
2087 case 3:
ae35eea7
JM
2088 if (validd == -1) {
2089 res = (2 << upper) - 1;
2090 break;
2091 }
bc921b27 2092 for (j = valids == upper ? valids : valids - validd; j >= 0; j--) {
222a3336 2093 res <<= 1;
75c9527e 2094 v = 1;
bc921b27 2095 for (i = MIN(valids - j, validd); i >= 0; i--) {
75c9527e 2096 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
e01d9d31 2097 }
75c9527e 2098 res |= v;
222a3336
AZ
2099 }
2100 break;
2101 }
2102
2103 switch ((ctrl >> 4) & 3) {
2104 case 1:
2105 res ^= (2 << upper) - 1;
2106 break;
2107 case 3:
e4eba27e 2108 res ^= (1 << (valids + 1)) - 1;
222a3336
AZ
2109 break;
2110 }
2111
e01d9d31
BS
2112 if (res) {
2113 CC_SRC |= CC_C;
2114 }
2115 if (res & 1) {
2116 CC_SRC |= CC_O;
2117 }
222a3336
AZ
2118
2119 return res;
2120}
2121
d3eb5eae
BS
2122void glue(helper_pcmpestri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2123 uint32_t ctrl)
222a3336 2124{
d3eb5eae
BS
2125 unsigned int res = pcmpxstrx(env, d, s, ctrl,
2126 pcmp_elen(env, R_EDX, ctrl),
2127 pcmp_elen(env, R_EAX, ctrl));
222a3336 2128
e01d9d31 2129 if (res) {
c334a388 2130 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
e01d9d31 2131 } else {
222a3336 2132 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
e01d9d31 2133 }
222a3336
AZ
2134}
2135
d3eb5eae
BS
2136void glue(helper_pcmpestrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2137 uint32_t ctrl)
222a3336
AZ
2138{
2139 int i;
d3eb5eae
BS
2140 unsigned int res = pcmpxstrx(env, d, s, ctrl,
2141 pcmp_elen(env, R_EDX, ctrl),
2142 pcmp_elen(env, R_EAX, ctrl));
222a3336
AZ
2143
2144 if ((ctrl >> 6) & 1) {
e01d9d31 2145 if (ctrl & 1) {
bc426899 2146 for (i = 0; i < 8; i++, res >>= 1) {
2b8d7e9d 2147 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
bc426899 2148 }
e01d9d31 2149 } else {
bc426899 2150 for (i = 0; i < 16; i++, res >>= 1) {
2b8d7e9d 2151 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
bc426899 2152 }
e01d9d31 2153 }
222a3336 2154 } else {
2b8d7e9d
AJ
2155 env->xmm_regs[0].Q(1) = 0;
2156 env->xmm_regs[0].Q(0) = res;
222a3336
AZ
2157 }
2158}
2159
d3eb5eae
BS
2160void glue(helper_pcmpistri, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2161 uint32_t ctrl)
222a3336 2162{
d3eb5eae 2163 unsigned int res = pcmpxstrx(env, d, s, ctrl,
e01d9d31
BS
2164 pcmp_ilen(s, ctrl),
2165 pcmp_ilen(d, ctrl));
222a3336 2166
e01d9d31 2167 if (res) {
c334a388 2168 env->regs[R_ECX] = (ctrl & (1 << 6)) ? 31 - clz32(res) : ctz32(res);
e01d9d31 2169 } else {
222a3336 2170 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0));
e01d9d31 2171 }
222a3336
AZ
2172}
2173
d3eb5eae
BS
2174void glue(helper_pcmpistrm, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2175 uint32_t ctrl)
222a3336
AZ
2176{
2177 int i;
d3eb5eae 2178 unsigned int res = pcmpxstrx(env, d, s, ctrl,
e01d9d31
BS
2179 pcmp_ilen(s, ctrl),
2180 pcmp_ilen(d, ctrl));
222a3336
AZ
2181
2182 if ((ctrl >> 6) & 1) {
e01d9d31 2183 if (ctrl & 1) {
bc426899 2184 for (i = 0; i < 8; i++, res >>= 1) {
2b8d7e9d 2185 env->xmm_regs[0].W(i) = (res & 1) ? ~0 : 0;
bc426899 2186 }
e01d9d31 2187 } else {
bc426899 2188 for (i = 0; i < 16; i++, res >>= 1) {
2b8d7e9d 2189 env->xmm_regs[0].B(i) = (res & 1) ? ~0 : 0;
bc426899 2190 }
e01d9d31 2191 }
222a3336 2192 } else {
2b8d7e9d
AJ
2193 env->xmm_regs[0].Q(1) = 0;
2194 env->xmm_regs[0].Q(0) = res;
222a3336
AZ
2195 }
2196}
2197
2198#define CRCPOLY 0x1edc6f41
2199#define CRCPOLY_BITREV 0x82f63b78
2200target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len)
2201{
2202 target_ulong crc = (msg & ((target_ulong) -1 >>
e01d9d31 2203 (TARGET_LONG_BITS - len))) ^ crc1;
222a3336 2204
e01d9d31 2205 while (len--) {
222a3336 2206 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0);
e01d9d31 2207 }
222a3336
AZ
2208
2209 return crc;
2210}
2211
fd17264a
PB
2212#endif
2213
e71827bc
AJ
2214void glue(helper_pclmulqdq, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2215 uint32_t ctrl)
2216{
2217 uint64_t ah, al, b, resh, resl;
2218
2219 ah = 0;
2220 al = d->Q((ctrl & 1) != 0);
2221 b = s->Q((ctrl & 16) != 0);
2222 resh = resl = 0;
2223
2224 while (b) {
2225 if (b & 1) {
2226 resl ^= al;
2227 resh ^= ah;
2228 }
2229 ah = (ah << 1) | (al >> 63);
2230 al <<= 1;
2231 b >>= 1;
2232 }
2233
2234 d->Q(0) = resl;
2235 d->Q(1) = resh;
2236}
d640045a 2237
d640045a
AJ
2238void glue(helper_aesdec, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2239{
2240 int i;
2241 Reg st = *d;
2242 Reg rk = *s;
2243
2244 for (i = 0 ; i < 4 ; i++) {
04af534d
TM
2245 d->L(i) = rk.L(i) ^ bswap32(AES_Td0[st.B(AES_ishifts[4*i+0])] ^
2246 AES_Td1[st.B(AES_ishifts[4*i+1])] ^
2247 AES_Td2[st.B(AES_ishifts[4*i+2])] ^
2248 AES_Td3[st.B(AES_ishifts[4*i+3])]);
d640045a
AJ
2249 }
2250}
2251
2252void glue(helper_aesdeclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2253{
2254 int i;
2255 Reg st = *d;
2256 Reg rk = *s;
2257
2258 for (i = 0; i < 16; i++) {
9551ea69 2259 d->B(i) = rk.B(i) ^ (AES_isbox[st.B(AES_ishifts[i])]);
d640045a
AJ
2260 }
2261}
2262
2263void glue(helper_aesenc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2264{
2265 int i;
2266 Reg st = *d;
2267 Reg rk = *s;
2268
2269 for (i = 0 ; i < 4 ; i++) {
04af534d
TM
2270 d->L(i) = rk.L(i) ^ bswap32(AES_Te0[st.B(AES_shifts[4*i+0])] ^
2271 AES_Te1[st.B(AES_shifts[4*i+1])] ^
2272 AES_Te2[st.B(AES_shifts[4*i+2])] ^
2273 AES_Te3[st.B(AES_shifts[4*i+3])]);
d640045a
AJ
2274 }
2275}
2276
2277void glue(helper_aesenclast, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2278{
2279 int i;
2280 Reg st = *d;
2281 Reg rk = *s;
2282
2283 for (i = 0; i < 16; i++) {
9551ea69 2284 d->B(i) = rk.B(i) ^ (AES_sbox[st.B(AES_shifts[i])]);
d640045a
AJ
2285 }
2286
2287}
2288
2289void glue(helper_aesimc, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
2290{
2291 int i;
2292 Reg tmp = *s;
2293
2294 for (i = 0 ; i < 4 ; i++) {
9551ea69
AJ
2295 d->L(i) = bswap32(AES_imc[tmp.B(4*i+0)][0] ^
2296 AES_imc[tmp.B(4*i+1)][1] ^
2297 AES_imc[tmp.B(4*i+2)][2] ^
2298 AES_imc[tmp.B(4*i+3)][3]);
d640045a
AJ
2299 }
2300}
2301
2302void glue(helper_aeskeygenassist, SUFFIX)(CPUX86State *env, Reg *d, Reg *s,
2303 uint32_t ctrl)
2304{
2305 int i;
2306 Reg tmp = *s;
2307
2308 for (i = 0 ; i < 4 ; i++) {
9551ea69
AJ
2309 d->B(i) = AES_sbox[tmp.B(i + 4)];
2310 d->B(i + 8) = AES_sbox[tmp.B(i + 12)];
d640045a
AJ
2311 }
2312 d->L(1) = (d->L(0) << 24 | d->L(0) >> 8) ^ ctrl;
2313 d->L(3) = (d->L(2) << 24 | d->L(2) >> 8) ^ ctrl;
2314}
222a3336
AZ
2315#endif
2316
3403cafe
PB
2317#undef SSE_HELPER_S
2318
664e0f19
FB
2319#undef SHIFT
2320#undef XMM_ONLY
2321#undef Reg
2322#undef B
2323#undef W
2324#undef L
2325#undef Q
2326#undef SUFFIX