target/i386/tcg/fpu_helper.c

   1 /*
   2  *  x86 FPU, MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI helpers
   3  *
   4  *  Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include <math.h>
  22 #include "cpu.h"
  23 #include "tcg-cpu.h"
  24 #include "exec/helper-proto.h"
  25 #include "fpu/softfloat.h"
  26 #include "fpu/softfloat-macros.h"
  27 #include "helper-tcg.h"
  28
/*
 * float macros
 *
 * Shorthand accessors for x87 state.  Each one expands to an expression
 * on a variable named `env`, so a CPU state pointer with that name must
 * be in scope wherever they are used.
 *
 *   FT0    - the env->ft0 temporary operand
 *   ST0    - the register currently indexed by env->fpstt (stack top)
 *   ST(n)  - the register n slots past the top; the index wraps modulo 8
 *   ST1    - ST(1)
 */
#define FT0    (env->ft0)
#define ST0    (env->fpregs[env->fpstt].d)
#define ST(n)  (env->fpregs[(env->fpstt + (n)) & 7].d)
#define ST1    ST(1)
  34
/*
 * Rounding-control field of the FPU control word (env->fpuc).
 * FPU_RC_MASK selects the field; the other values are the four
 * encodings that update_fp_status() maps to nearest-even, down, up
 * and round-to-zero respectively.
 */
#define FPU_RC_MASK         0xc00
#define FPU_RC_NEAR         0x000
#define FPU_RC_DOWN         0x400
#define FPU_RC_UP           0x800
#define FPU_RC_CHOP         0xc00
  40
/*
 * 2^63 as a double constant.
 * NOTE(review): presumably the argument-magnitude limit for the
 * trigonometric helpers; the use site is not in this part of the file.
 */
#define MAXTAN 9223372036854775808.0
  42
  43 /* the following deal with x86 long double-precision numbers */
  44 #define MAXEXPD 0x7fff
  45 #define EXPBIAS 16383
  46 #define EXPD(fp)        (fp.l.upper & 0x7fff)
  47 #define SIGND(fp)       ((fp.l.upper) & 0x8000)
  48 #define MANTD(fp)       (fp.l.lower)
  49 #define BIASEXPONENT(fp) fp.l.upper = (fp.l.upper & ~(0x7fff)) | EXPBIAS
  50
/*
 * Bits of the FPU status word (env->fpus).  Bits 0-5 are the exception
 * flags that merge_exception_flags() sets from the softfloat invalid,
 * input-denormal, divide-by-zero, overflow, underflow and inexact flags.
 * FPUS_SE and FPUS_B are set by fpu_set_exception() when any flagged
 * exception is not masked in the control word.
 */
#define FPUS_IE (1 << 0)
#define FPUS_DE (1 << 1)
#define FPUS_ZE (1 << 2)
#define FPUS_OE (1 << 3)
#define FPUS_UE (1 << 4)
#define FPUS_PE (1 << 5)
#define FPUS_SF (1 << 6)
#define FPUS_SE (1 << 7)
#define FPUS_B  (1 << 15)

/* Control-word (env->fpuc) bits that mask the six exceptions above. */
#define FPUC_EM 0x3f
  62
/*
 * Extended-precision constants loaded by the FLDxx helpers in this file.
 * A "_d" suffix marks the variant those helpers select when the rounding
 * control is FPU_RC_DOWN or FPU_RC_CHOP, and "_u" the variant selected
 * for FPU_RC_UP; the unsuffixed name is used for every other mode.
 * For the pairs defined here the two variants differ by one in the last
 * significand digit.  floatx80_ln2 and floatx80_pi (the unsuffixed
 * partners of the last two entries) are defined outside this file.
 */
#define floatx80_lg2 make_floatx80(0x3ffd, 0x9a209a84fbcff799LL)
#define floatx80_lg2_d make_floatx80(0x3ffd, 0x9a209a84fbcff798LL)
#define floatx80_l2e make_floatx80(0x3fff, 0xb8aa3b295c17f0bcLL)
#define floatx80_l2e_d make_floatx80(0x3fff, 0xb8aa3b295c17f0bbLL)
#define floatx80_l2t make_floatx80(0x4000, 0xd49a784bcd1b8afeLL)
#define floatx80_l2t_u make_floatx80(0x4000, 0xd49a784bcd1b8affLL)
#define floatx80_ln2_d make_floatx80(0x3ffe, 0xb17217f7d1cf79abLL)
#define floatx80_pi_d make_floatx80(0x4000, 0xc90fdaa22168c234LL)
  71
  72 static inline void fpush(CPUX86State *env)
  73 {
  74     env->fpstt = (env->fpstt - 1) & 7;
  75     env->fptags[env->fpstt] = 0; /* validate stack entry */
  76 }
  77
  78 static inline void fpop(CPUX86State *env)
  79 {
  80     env->fptags[env->fpstt] = 1; /* invalidate stack entry */
  81     env->fpstt = (env->fpstt + 1) & 7;
  82 }
  83
  84 static floatx80 do_fldt(CPUX86State *env, target_ulong ptr, uintptr_t retaddr)
  85 {
  86     CPU_LDoubleU temp;
  87
  88     temp.l.lower = cpu_ldq_data_ra(env, ptr, retaddr);
  89     temp.l.upper = cpu_lduw_data_ra(env, ptr + 8, retaddr);
  90     return temp.d;
  91 }
  92
  93 static void do_fstt(CPUX86State *env, floatx80 f, target_ulong ptr,
  94                     uintptr_t retaddr)
  95 {
  96     CPU_LDoubleU temp;
  97
  98     temp.d = f;
  99     cpu_stq_data_ra(env, ptr, temp.l.lower, retaddr);
 100     cpu_stw_data_ra(env, ptr + 8, temp.l.upper, retaddr);
 101 }
 102
 103 /* x87 FPU helpers */
 104
 105 static inline double floatx80_to_double(CPUX86State *env, floatx80 a)
 106 {
 107     union {
 108         float64 f64;
 109         double d;
 110     } u;
 111
 112     u.f64 = floatx80_to_float64(a, &env->fp_status);
 113     return u.d;
 114 }
 115
 116 static inline floatx80 double_to_floatx80(CPUX86State *env, double a)
 117 {
 118     union {
 119         float64 f64;
 120         double d;
 121     } u;
 122
 123     u.d = a;
 124     return float64_to_floatx80(u.f64, &env->fp_status);
 125 }
 126
 127 static void fpu_set_exception(CPUX86State *env, int mask)
 128 {
 129     env->fpus |= mask;
 130     if (env->fpus & (~env->fpuc & FPUC_EM)) {
 131         env->fpus |= FPUS_SE | FPUS_B;
 132     }
 133 }
 134
 135 static inline uint8_t save_exception_flags(CPUX86State *env)
 136 {
 137     uint8_t old_flags = get_float_exception_flags(&env->fp_status);
 138     set_float_exception_flags(0, &env->fp_status);
 139     return old_flags;
 140 }
 141
 142 static void merge_exception_flags(CPUX86State *env, uint8_t old_flags)
 143 {
 144     uint8_t new_flags = get_float_exception_flags(&env->fp_status);
 145     float_raise(old_flags, &env->fp_status);
 146     fpu_set_exception(env,
 147                       ((new_flags & float_flag_invalid ? FPUS_IE : 0) |
 148                        (new_flags & float_flag_divbyzero ? FPUS_ZE : 0) |
 149                        (new_flags & float_flag_overflow ? FPUS_OE : 0) |
 150                        (new_flags & float_flag_underflow ? FPUS_UE : 0) |
 151                        (new_flags & float_flag_inexact ? FPUS_PE : 0) |
 152                        (new_flags & float_flag_input_denormal ? FPUS_DE : 0)));
 153 }
 154
 155 static inline floatx80 helper_fdiv(CPUX86State *env, floatx80 a, floatx80 b)
 156 {
 157     uint8_t old_flags = save_exception_flags(env);
 158     floatx80 ret = floatx80_div(a, b, &env->fp_status);
 159     merge_exception_flags(env, old_flags);
 160     return ret;
 161 }
 162
 163 static void fpu_raise_exception(CPUX86State *env, uintptr_t retaddr)
 164 {
 165     if (env->cr[0] & CR0_NE_MASK) {
 166         raise_exception_ra(env, EXCP10_COPR, retaddr);
 167     }
 168 #if !defined(CONFIG_USER_ONLY)
 169     else {
 170         fpu_check_raise_ferr_irq(env);
 171     }
 172 #endif
 173 }
 174
 175 void helper_flds_FT0(CPUX86State *env, uint32_t val)
 176 {
 177     uint8_t old_flags = save_exception_flags(env);
 178     union {
 179         float32 f;
 180         uint32_t i;
 181     } u;
 182
 183     u.i = val;
 184     FT0 = float32_to_floatx80(u.f, &env->fp_status);
 185     merge_exception_flags(env, old_flags);
 186 }
 187
 188 void helper_fldl_FT0(CPUX86State *env, uint64_t val)
 189 {
 190     uint8_t old_flags = save_exception_flags(env);
 191     union {
 192         float64 f;
 193         uint64_t i;
 194     } u;
 195
 196     u.i = val;
 197     FT0 = float64_to_floatx80(u.f, &env->fp_status);
 198     merge_exception_flags(env, old_flags);
 199 }
 200
 201 void helper_fildl_FT0(CPUX86State *env, int32_t val)
 202 {
 203     FT0 = int32_to_floatx80(val, &env->fp_status);
 204 }
 205
 206 void helper_flds_ST0(CPUX86State *env, uint32_t val)
 207 {
 208     uint8_t old_flags = save_exception_flags(env);
 209     int new_fpstt;
 210     union {
 211         float32 f;
 212         uint32_t i;
 213     } u;
 214
 215     new_fpstt = (env->fpstt - 1) & 7;
 216     u.i = val;
 217     env->fpregs[new_fpstt].d = float32_to_floatx80(u.f, &env->fp_status);
 218     env->fpstt = new_fpstt;
 219     env->fptags[new_fpstt] = 0; /* validate stack entry */
 220     merge_exception_flags(env, old_flags);
 221 }
 222
 223 void helper_fldl_ST0(CPUX86State *env, uint64_t val)
 224 {
 225     uint8_t old_flags = save_exception_flags(env);
 226     int new_fpstt;
 227     union {
 228         float64 f;
 229         uint64_t i;
 230     } u;
 231
 232     new_fpstt = (env->fpstt - 1) & 7;
 233     u.i = val;
 234     env->fpregs[new_fpstt].d = float64_to_floatx80(u.f, &env->fp_status);
 235     env->fpstt = new_fpstt;
 236     env->fptags[new_fpstt] = 0; /* validate stack entry */
 237     merge_exception_flags(env, old_flags);
 238 }
 239
 240 static FloatX80RoundPrec tmp_maximise_precision(float_status *st)
 241 {
 242     FloatX80RoundPrec old = get_floatx80_rounding_precision(st);
 243     set_floatx80_rounding_precision(floatx80_precision_x, st);
 244     return old;
 245 }
 246
 247 void helper_fildl_ST0(CPUX86State *env, int32_t val)
 248 {
 249     int new_fpstt;
 250     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
 251
 252     new_fpstt = (env->fpstt - 1) & 7;
 253     env->fpregs[new_fpstt].d = int32_to_floatx80(val, &env->fp_status);
 254     env->fpstt = new_fpstt;
 255     env->fptags[new_fpstt] = 0; /* validate stack entry */
 256
 257     set_floatx80_rounding_precision(old, &env->fp_status);
 258 }
 259
 260 void helper_fildll_ST0(CPUX86State *env, int64_t val)
 261 {
 262     int new_fpstt;
 263     FloatX80RoundPrec old = tmp_maximise_precision(&env->fp_status);
 264
 265     new_fpstt = (env->fpstt - 1) & 7;
 266     env->fpregs[new_fpstt].d = int64_to_floatx80(val, &env->fp_status);
 267     env->fpstt = new_fpstt;
 268     env->fptags[new_fpstt] = 0; /* validate stack entry */
 269
 270     set_floatx80_rounding_precision(old, &env->fp_status);
 271 }
 272
 273 uint32_t helper_fsts_ST0(CPUX86State *env)
 274 {
 275     uint8_t old_flags = save_exception_flags(env);
 276     union {
 277         float32 f;
 278         uint32_t i;
 279     } u;
 280
 281     u.f = floatx80_to_float32(ST0, &env->fp_status);
 282     merge_exception_flags(env, old_flags);
 283     return u.i;
 284 }
 285
 286 uint64_t helper_fstl_ST0(CPUX86State *env)
 287 {
 288     uint8_t old_flags = save_exception_flags(env);
 289     union {
 290         float64 f;
 291         uint64_t i;
 292     } u;
 293
 294     u.f = floatx80_to_float64(ST0, &env->fp_status);
 295     merge_exception_flags(env, old_flags);
 296     return u.i;
 297 }
 298
 299 int32_t helper_fist_ST0(CPUX86State *env)
 300 {
 301     uint8_t old_flags = save_exception_flags(env);
 302     int32_t val;
 303
 304     val = floatx80_to_int32(ST0, &env->fp_status);
 305     if (val != (int16_t)val) {
 306         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 307         val = -32768;
 308     }
 309     merge_exception_flags(env, old_flags);
 310     return val;
 311 }
 312
 313 int32_t helper_fistl_ST0(CPUX86State *env)
 314 {
 315     uint8_t old_flags = save_exception_flags(env);
 316     int32_t val;
 317
 318     val = floatx80_to_int32(ST0, &env->fp_status);
 319     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 320         val = 0x80000000;
 321     }
 322     merge_exception_flags(env, old_flags);
 323     return val;
 324 }
 325
 326 int64_t helper_fistll_ST0(CPUX86State *env)
 327 {
 328     uint8_t old_flags = save_exception_flags(env);
 329     int64_t val;
 330
 331     val = floatx80_to_int64(ST0, &env->fp_status);
 332     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 333         val = 0x8000000000000000ULL;
 334     }
 335     merge_exception_flags(env, old_flags);
 336     return val;
 337 }
 338
 339 int32_t helper_fistt_ST0(CPUX86State *env)
 340 {
 341     uint8_t old_flags = save_exception_flags(env);
 342     int32_t val;
 343
 344     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 345     if (val != (int16_t)val) {
 346         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 347         val = -32768;
 348     }
 349     merge_exception_flags(env, old_flags);
 350     return val;
 351 }
 352
 353 int32_t helper_fisttl_ST0(CPUX86State *env)
 354 {
 355     uint8_t old_flags = save_exception_flags(env);
 356     int32_t val;
 357
 358     val = floatx80_to_int32_round_to_zero(ST0, &env->fp_status);
 359     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 360         val = 0x80000000;
 361     }
 362     merge_exception_flags(env, old_flags);
 363     return val;
 364 }
 365
 366 int64_t helper_fisttll_ST0(CPUX86State *env)
 367 {
 368     uint8_t old_flags = save_exception_flags(env);
 369     int64_t val;
 370
 371     val = floatx80_to_int64_round_to_zero(ST0, &env->fp_status);
 372     if (get_float_exception_flags(&env->fp_status) & float_flag_invalid) {
 373         val = 0x8000000000000000ULL;
 374     }
 375     merge_exception_flags(env, old_flags);
 376     return val;
 377 }
 378
 379 void helper_fldt_ST0(CPUX86State *env, target_ulong ptr)
 380 {
 381     int new_fpstt;
 382
 383     new_fpstt = (env->fpstt - 1) & 7;
 384     env->fpregs[new_fpstt].d = do_fldt(env, ptr, GETPC());
 385     env->fpstt = new_fpstt;
 386     env->fptags[new_fpstt] = 0; /* validate stack entry */
 387 }
 388
 389 void helper_fstt_ST0(CPUX86State *env, target_ulong ptr)
 390 {
 391     do_fstt(env, ST0, ptr, GETPC());
 392 }
 393
 394 void helper_fpush(CPUX86State *env)
 395 {
 396     fpush(env);
 397 }
 398
 399 void helper_fpop(CPUX86State *env)
 400 {
 401     fpop(env);
 402 }
 403
 404 void helper_fdecstp(CPUX86State *env)
 405 {
 406     env->fpstt = (env->fpstt - 1) & 7;
 407     env->fpus &= ~0x4700;
 408 }
 409
 410 void helper_fincstp(CPUX86State *env)
 411 {
 412     env->fpstt = (env->fpstt + 1) & 7;
 413     env->fpus &= ~0x4700;
 414 }
 415
 416 /* FPU move */
 417
 418 void helper_ffree_STN(CPUX86State *env, int st_index)
 419 {
 420     env->fptags[(env->fpstt + st_index) & 7] = 1;
 421 }
 422
 423 void helper_fmov_ST0_FT0(CPUX86State *env)
 424 {
 425     ST0 = FT0;
 426 }
 427
 428 void helper_fmov_FT0_STN(CPUX86State *env, int st_index)
 429 {
 430     FT0 = ST(st_index);
 431 }
 432
 433 void helper_fmov_ST0_STN(CPUX86State *env, int st_index)
 434 {
 435     ST0 = ST(st_index);
 436 }
 437
 438 void helper_fmov_STN_ST0(CPUX86State *env, int st_index)
 439 {
 440     ST(st_index) = ST0;
 441 }
 442
 443 void helper_fxchg_ST0_STN(CPUX86State *env, int st_index)
 444 {
 445     floatx80 tmp;
 446
 447     tmp = ST(st_index);
 448     ST(st_index) = ST0;
 449     ST0 = tmp;
 450 }
 451
 452 /* FPU operations */
 453
 454 static const int fcom_ccval[4] = {0x0100, 0x4000, 0x0000, 0x4500};
 455
 456 void helper_fcom_ST0_FT0(CPUX86State *env)
 457 {
 458     uint8_t old_flags = save_exception_flags(env);
 459     FloatRelation ret;
 460
 461     ret = floatx80_compare(ST0, FT0, &env->fp_status);
 462     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 463     merge_exception_flags(env, old_flags);
 464 }
 465
 466 void helper_fucom_ST0_FT0(CPUX86State *env)
 467 {
 468     uint8_t old_flags = save_exception_flags(env);
 469     FloatRelation ret;
 470
 471     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 472     env->fpus = (env->fpus & ~0x4500) | fcom_ccval[ret + 1];
 473     merge_exception_flags(env, old_flags);
 474 }
 475
 476 static const int fcomi_ccval[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C};
 477
 478 void helper_fcomi_ST0_FT0(CPUX86State *env)
 479 {
 480     uint8_t old_flags = save_exception_flags(env);
 481     int eflags;
 482     FloatRelation ret;
 483
 484     ret = floatx80_compare(ST0, FT0, &env->fp_status);
 485     eflags = cpu_cc_compute_all(env, CC_OP);
 486     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 487     CC_SRC = eflags;
 488     merge_exception_flags(env, old_flags);
 489 }
 490
 491 void helper_fucomi_ST0_FT0(CPUX86State *env)
 492 {
 493     uint8_t old_flags = save_exception_flags(env);
 494     int eflags;
 495     FloatRelation ret;
 496
 497     ret = floatx80_compare_quiet(ST0, FT0, &env->fp_status);
 498     eflags = cpu_cc_compute_all(env, CC_OP);
 499     eflags = (eflags & ~(CC_Z | CC_P | CC_C)) | fcomi_ccval[ret + 1];
 500     CC_SRC = eflags;
 501     merge_exception_flags(env, old_flags);
 502 }
 503
 504 void helper_fadd_ST0_FT0(CPUX86State *env)
 505 {
 506     uint8_t old_flags = save_exception_flags(env);
 507     ST0 = floatx80_add(ST0, FT0, &env->fp_status);
 508     merge_exception_flags(env, old_flags);
 509 }
 510
 511 void helper_fmul_ST0_FT0(CPUX86State *env)
 512 {
 513     uint8_t old_flags = save_exception_flags(env);
 514     ST0 = floatx80_mul(ST0, FT0, &env->fp_status);
 515     merge_exception_flags(env, old_flags);
 516 }
 517
 518 void helper_fsub_ST0_FT0(CPUX86State *env)
 519 {
 520     uint8_t old_flags = save_exception_flags(env);
 521     ST0 = floatx80_sub(ST0, FT0, &env->fp_status);
 522     merge_exception_flags(env, old_flags);
 523 }
 524
 525 void helper_fsubr_ST0_FT0(CPUX86State *env)
 526 {
 527     uint8_t old_flags = save_exception_flags(env);
 528     ST0 = floatx80_sub(FT0, ST0, &env->fp_status);
 529     merge_exception_flags(env, old_flags);
 530 }
 531
 532 void helper_fdiv_ST0_FT0(CPUX86State *env)
 533 {
 534     ST0 = helper_fdiv(env, ST0, FT0);
 535 }
 536
 537 void helper_fdivr_ST0_FT0(CPUX86State *env)
 538 {
 539     ST0 = helper_fdiv(env, FT0, ST0);
 540 }
 541
 542 /* fp operations between STN and ST0 */
 543
 544 void helper_fadd_STN_ST0(CPUX86State *env, int st_index)
 545 {
 546     uint8_t old_flags = save_exception_flags(env);
 547     ST(st_index) = floatx80_add(ST(st_index), ST0, &env->fp_status);
 548     merge_exception_flags(env, old_flags);
 549 }
 550
 551 void helper_fmul_STN_ST0(CPUX86State *env, int st_index)
 552 {
 553     uint8_t old_flags = save_exception_flags(env);
 554     ST(st_index) = floatx80_mul(ST(st_index), ST0, &env->fp_status);
 555     merge_exception_flags(env, old_flags);
 556 }
 557
 558 void helper_fsub_STN_ST0(CPUX86State *env, int st_index)
 559 {
 560     uint8_t old_flags = save_exception_flags(env);
 561     ST(st_index) = floatx80_sub(ST(st_index), ST0, &env->fp_status);
 562     merge_exception_flags(env, old_flags);
 563 }
 564
 565 void helper_fsubr_STN_ST0(CPUX86State *env, int st_index)
 566 {
 567     uint8_t old_flags = save_exception_flags(env);
 568     ST(st_index) = floatx80_sub(ST0, ST(st_index), &env->fp_status);
 569     merge_exception_flags(env, old_flags);
 570 }
 571
 572 void helper_fdiv_STN_ST0(CPUX86State *env, int st_index)
 573 {
 574     floatx80 *p;
 575
 576     p = &ST(st_index);
 577     *p = helper_fdiv(env, *p, ST0);
 578 }
 579
 580 void helper_fdivr_STN_ST0(CPUX86State *env, int st_index)
 581 {
 582     floatx80 *p;
 583
 584     p = &ST(st_index);
 585     *p = helper_fdiv(env, ST0, *p);
 586 }
 587
 588 /* misc FPU operations */
 589 void helper_fchs_ST0(CPUX86State *env)
 590 {
 591     ST0 = floatx80_chs(ST0);
 592 }
 593
 594 void helper_fabs_ST0(CPUX86State *env)
 595 {
 596     ST0 = floatx80_abs(ST0);
 597 }
 598
 599 void helper_fld1_ST0(CPUX86State *env)
 600 {
 601     ST0 = floatx80_one;
 602 }
 603
 604 void helper_fldl2t_ST0(CPUX86State *env)
 605 {
 606     switch (env->fpuc & FPU_RC_MASK) {
 607     case FPU_RC_UP:
 608         ST0 = floatx80_l2t_u;
 609         break;
 610     default:
 611         ST0 = floatx80_l2t;
 612         break;
 613     }
 614 }
 615
 616 void helper_fldl2e_ST0(CPUX86State *env)
 617 {
 618     switch (env->fpuc & FPU_RC_MASK) {
 619     case FPU_RC_DOWN:
 620     case FPU_RC_CHOP:
 621         ST0 = floatx80_l2e_d;
 622         break;
 623     default:
 624         ST0 = floatx80_l2e;
 625         break;
 626     }
 627 }
 628
 629 void helper_fldpi_ST0(CPUX86State *env)
 630 {
 631     switch (env->fpuc & FPU_RC_MASK) {
 632     case FPU_RC_DOWN:
 633     case FPU_RC_CHOP:
 634         ST0 = floatx80_pi_d;
 635         break;
 636     default:
 637         ST0 = floatx80_pi;
 638         break;
 639     }
 640 }
 641
 642 void helper_fldlg2_ST0(CPUX86State *env)
 643 {
 644     switch (env->fpuc & FPU_RC_MASK) {
 645     case FPU_RC_DOWN:
 646     case FPU_RC_CHOP:
 647         ST0 = floatx80_lg2_d;
 648         break;
 649     default:
 650         ST0 = floatx80_lg2;
 651         break;
 652     }
 653 }
 654
 655 void helper_fldln2_ST0(CPUX86State *env)
 656 {
 657     switch (env->fpuc & FPU_RC_MASK) {
 658     case FPU_RC_DOWN:
 659     case FPU_RC_CHOP:
 660         ST0 = floatx80_ln2_d;
 661         break;
 662     default:
 663         ST0 = floatx80_ln2;
 664         break;
 665     }
 666 }
 667
 668 void helper_fldz_ST0(CPUX86State *env)
 669 {
 670     ST0 = floatx80_zero;
 671 }
 672
 673 void helper_fldz_FT0(CPUX86State *env)
 674 {
 675     FT0 = floatx80_zero;
 676 }
 677
 678 uint32_t helper_fnstsw(CPUX86State *env)
 679 {
 680     return (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
 681 }
 682
 683 uint32_t helper_fnstcw(CPUX86State *env)
 684 {
 685     return env->fpuc;
 686 }
 687
 688 void update_fp_status(CPUX86State *env)
 689 {
 690     FloatRoundMode rnd_mode;
 691     FloatX80RoundPrec rnd_prec;
 692
 693     /* set rounding mode */
 694     switch (env->fpuc & FPU_RC_MASK) {
 695     default:
 696     case FPU_RC_NEAR:
 697         rnd_mode = float_round_nearest_even;
 698         break;
 699     case FPU_RC_DOWN:
 700         rnd_mode = float_round_down;
 701         break;
 702     case FPU_RC_UP:
 703         rnd_mode = float_round_up;
 704         break;
 705     case FPU_RC_CHOP:
 706         rnd_mode = float_round_to_zero;
 707         break;
 708     }
 709     set_float_rounding_mode(rnd_mode, &env->fp_status);
 710
 711     switch ((env->fpuc >> 8) & 3) {
 712     case 0:
 713         rnd_prec = floatx80_precision_s;
 714         break;
 715     case 2:
 716         rnd_prec = floatx80_precision_d;
 717         break;
 718     case 3:
 719     default:
 720         rnd_prec = floatx80_precision_x;
 721         break;
 722     }
 723     set_floatx80_rounding_precision(rnd_prec, &env->fp_status);
 724 }
 725
 726 void helper_fldcw(CPUX86State *env, uint32_t val)
 727 {
 728     cpu_set_fpuc(env, val);
 729 }
 730
 731 void helper_fclex(CPUX86State *env)
 732 {
 733     env->fpus &= 0x7f00;
 734 }
 735
 736 void helper_fwait(CPUX86State *env)
 737 {
 738     if (env->fpus & FPUS_SE) {
 739         fpu_raise_exception(env, GETPC());
 740     }
 741 }
 742
 743 static void do_fninit(CPUX86State *env)
 744 {
 745     env->fpus = 0;
 746     env->fpstt = 0;
 747     env->fpcs = 0;
 748     env->fpds = 0;
 749     env->fpip = 0;
 750     env->fpdp = 0;
 751     cpu_set_fpuc(env, 0x37f);
 752     env->fptags[0] = 1;
 753     env->fptags[1] = 1;
 754     env->fptags[2] = 1;
 755     env->fptags[3] = 1;
 756     env->fptags[4] = 1;
 757     env->fptags[5] = 1;
 758     env->fptags[6] = 1;
 759     env->fptags[7] = 1;
 760 }
 761
 762 void helper_fninit(CPUX86State *env)
 763 {
 764     do_fninit(env);
 765 }
 766
 767 /* BCD ops */
 768
 769 void helper_fbld_ST0(CPUX86State *env, target_ulong ptr)
 770 {
 771     floatx80 tmp;
 772     uint64_t val;
 773     unsigned int v;
 774     int i;
 775
 776     val = 0;
 777     for (i = 8; i >= 0; i--) {
 778         v = cpu_ldub_data_ra(env, ptr + i, GETPC());
 779         val = (val * 100) + ((v >> 4) * 10) + (v & 0xf);
 780     }
 781     tmp = int64_to_floatx80(val, &env->fp_status);
 782     if (cpu_ldub_data_ra(env, ptr + 9, GETPC()) & 0x80) {
 783         tmp = floatx80_chs(tmp);
 784     }
 785     fpush(env);
 786     ST0 = tmp;
 787 }
 788
 789 void helper_fbst_ST0(CPUX86State *env, target_ulong ptr)
 790 {
 791     uint8_t old_flags = save_exception_flags(env);
 792     int v;
 793     target_ulong mem_ref, mem_end;
 794     int64_t val;
 795     CPU_LDoubleU temp;
 796
 797     temp.d = ST0;
 798
 799     val = floatx80_to_int64(ST0, &env->fp_status);
 800     mem_ref = ptr;
 801     if (val >= 1000000000000000000LL || val <= -1000000000000000000LL) {
 802         set_float_exception_flags(float_flag_invalid, &env->fp_status);
 803         while (mem_ref < ptr + 7) {
 804             cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 805         }
 806         cpu_stb_data_ra(env, mem_ref++, 0xc0, GETPC());
 807         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 808         cpu_stb_data_ra(env, mem_ref++, 0xff, GETPC());
 809         merge_exception_flags(env, old_flags);
 810         return;
 811     }
 812     mem_end = mem_ref + 9;
 813     if (SIGND(temp)) {
 814         cpu_stb_data_ra(env, mem_end, 0x80, GETPC());
 815         val = -val;
 816     } else {
 817         cpu_stb_data_ra(env, mem_end, 0x00, GETPC());
 818     }
 819     while (mem_ref < mem_end) {
 820         if (val == 0) {
 821             break;
 822         }
 823         v = val % 100;
 824         val = val / 100;
 825         v = ((v / 10) << 4) | (v % 10);
 826         cpu_stb_data_ra(env, mem_ref++, v, GETPC());
 827     }
 828     while (mem_ref < mem_end) {
 829         cpu_stb_data_ra(env, mem_ref++, 0, GETPC());
 830     }
 831     merge_exception_flags(env, old_flags);
 832 }
 833
/*
 * 128-bit significand of log(2), given as two 64-bit halves:
 * ln2_sig_high is the upper half (it equals the significand of
 * floatx80_ln2_d defined earlier in this file) and ln2_sig_low is the
 * lower half.
 */
#define ln2_sig_high 0xb17217f7d1cf79abULL
#define ln2_sig_low 0xc9e3b39803f2f6afULL
 837
/*
 * Polynomial coefficients for an approximation to (2^x - 1) / x, on
 * the interval [-1/64, 1/64].
 *
 * f2xm1_coeff_N is the coefficient of x^N.
 * NOTE(review): f2xm1_coeff_0_low has a far smaller exponent than
 * f2xm1_coeff_0 and is presumably a low-order correction term to it;
 * confirm against the code that evaluates the polynomial (not in this
 * part of the file).
 */
#define f2xm1_coeff_0 make_floatx80(0x3ffe, 0xb17217f7d1cf79acULL)
#define f2xm1_coeff_0_low make_floatx80(0xbfbc, 0xd87edabf495b3762ULL)
#define f2xm1_coeff_1 make_floatx80(0x3ffc, 0xf5fdeffc162c7543ULL)
#define f2xm1_coeff_2 make_floatx80(0x3ffa, 0xe35846b82505fcc7ULL)
#define f2xm1_coeff_3 make_floatx80(0x3ff8, 0x9d955b7dd273b899ULL)
#define f2xm1_coeff_4 make_floatx80(0x3ff5, 0xaec3ff3c4ef4ac0cULL)
#define f2xm1_coeff_5 make_floatx80(0x3ff2, 0xa184897c3a7f0de9ULL)
#define f2xm1_coeff_6 make_floatx80(0x3fee, 0xffe634d0ec30d504ULL)
#define f2xm1_coeff_7 make_floatx80(0x3feb, 0xb160111d2db515e4ULL)
 851
/*
 * One row of the f2xm1_table lookup table: a sample point t together
 * with precomputed values of 2^t and 2^t - 1 at that point.
 */
struct f2xm1_data {
    /*
     * A value very close to a multiple of 1/32, such that 2^t and 2^t - 1
     * are very close to exact floatx80 values.
     */
    floatx80 t;
    /* The value of 2^t.  */
    floatx80 exp2;
    /* The value of 2^t - 1.  */
    floatx80 exp2m1;
};
 863
 864 static const struct f2xm1_data f2xm1_table[65] = {
 865     { make_floatx80_init(0xbfff, 0x8000000000000000ULL),
 866       make_floatx80_init(0x3ffe, 0x8000000000000000ULL),
 867       make_floatx80_init(0xbffe, 0x8000000000000000ULL) },
 868     { make_floatx80_init(0xbffe, 0xf800000000002e7eULL),
 869       make_floatx80_init(0x3ffe, 0x82cd8698ac2b9160ULL),
 870       make_floatx80_init(0xbffd, 0xfa64f2cea7a8dd40ULL) },
 871     { make_floatx80_init(0xbffe, 0xefffffffffffe960ULL),
 872       make_floatx80_init(0x3ffe, 0x85aac367cc488345ULL),
 873       make_floatx80_init(0xbffd, 0xf4aa7930676ef976ULL) },
 874     { make_floatx80_init(0xbffe, 0xe800000000006f10ULL),
 875       make_floatx80_init(0x3ffe, 0x88980e8092da5c14ULL),
 876       make_floatx80_init(0xbffd, 0xeecfe2feda4b47d8ULL) },
 877     { make_floatx80_init(0xbffe, 0xe000000000008a45ULL),
 878       make_floatx80_init(0x3ffe, 0x8b95c1e3ea8ba2a5ULL),
 879       make_floatx80_init(0xbffd, 0xe8d47c382ae8bab6ULL) },
 880     { make_floatx80_init(0xbffe, 0xd7ffffffffff8a9eULL),
 881       make_floatx80_init(0x3ffe, 0x8ea4398b45cd8116ULL),
 882       make_floatx80_init(0xbffd, 0xe2b78ce97464fdd4ULL) },
 883     { make_floatx80_init(0xbffe, 0xd0000000000019a0ULL),
 884       make_floatx80_init(0x3ffe, 0x91c3d373ab11b919ULL),
 885       make_floatx80_init(0xbffd, 0xdc785918a9dc8dceULL) },
 886     { make_floatx80_init(0xbffe, 0xc7ffffffffff14dfULL),
 887       make_floatx80_init(0x3ffe, 0x94f4efa8fef76836ULL),
 888       make_floatx80_init(0xbffd, 0xd61620ae02112f94ULL) },
 889     { make_floatx80_init(0xbffe, 0xc000000000006530ULL),
 890       make_floatx80_init(0x3ffe, 0x9837f0518db87fbbULL),
 891       make_floatx80_init(0xbffd, 0xcf901f5ce48f008aULL) },
 892     { make_floatx80_init(0xbffe, 0xb7ffffffffff1723ULL),
 893       make_floatx80_init(0x3ffe, 0x9b8d39b9d54eb74cULL),
 894       make_floatx80_init(0xbffd, 0xc8e58c8c55629168ULL) },
 895     { make_floatx80_init(0xbffe, 0xb00000000000b5e1ULL),
 896       make_floatx80_init(0x3ffe, 0x9ef5326091a0c366ULL),
 897       make_floatx80_init(0xbffd, 0xc2159b3edcbe7934ULL) },
 898     { make_floatx80_init(0xbffe, 0xa800000000006f8aULL),
 899       make_floatx80_init(0x3ffe, 0xa27043030c49370aULL),
 900       make_floatx80_init(0xbffd, 0xbb1f79f9e76d91ecULL) },
 901     { make_floatx80_init(0xbffe, 0x9fffffffffff816aULL),
 902       make_floatx80_init(0x3ffe, 0xa5fed6a9b15171cfULL),
 903       make_floatx80_init(0xbffd, 0xb40252ac9d5d1c62ULL) },
 904     { make_floatx80_init(0xbffe, 0x97ffffffffffb621ULL),
 905       make_floatx80_init(0x3ffe, 0xa9a15ab4ea7c30e6ULL),
 906       make_floatx80_init(0xbffd, 0xacbd4a962b079e34ULL) },
 907     { make_floatx80_init(0xbffe, 0x8fffffffffff162bULL),
 908       make_floatx80_init(0x3ffe, 0xad583eea42a1b886ULL),
 909       make_floatx80_init(0xbffd, 0xa54f822b7abc8ef4ULL) },
 910     { make_floatx80_init(0xbffe, 0x87ffffffffff4d34ULL),
 911       make_floatx80_init(0x3ffe, 0xb123f581d2ac7b51ULL),
 912       make_floatx80_init(0xbffd, 0x9db814fc5aa7095eULL) },
 913     { make_floatx80_init(0xbffe, 0x800000000000227dULL),
 914       make_floatx80_init(0x3ffe, 0xb504f333f9de539dULL),
 915       make_floatx80_init(0xbffd, 0x95f619980c4358c6ULL) },
 916     { make_floatx80_init(0xbffd, 0xefffffffffff3978ULL),
 917       make_floatx80_init(0x3ffe, 0xb8fbaf4762fbd0a1ULL),
 918       make_floatx80_init(0xbffd, 0x8e08a1713a085ebeULL) },
 919     { make_floatx80_init(0xbffd, 0xe00000000000df81ULL),
 920       make_floatx80_init(0x3ffe, 0xbd08a39f580bfd8cULL),
 921       make_floatx80_init(0xbffd, 0x85eeb8c14fe804e8ULL) },
 922     { make_floatx80_init(0xbffd, 0xd00000000000bccfULL),
 923       make_floatx80_init(0x3ffe, 0xc12c4cca667062f6ULL),
 924       make_floatx80_init(0xbffc, 0xfb4eccd6663e7428ULL) },
 925     { make_floatx80_init(0xbffd, 0xc00000000000eff0ULL),
 926       make_floatx80_init(0x3ffe, 0xc5672a1155069abeULL),
 927       make_floatx80_init(0xbffc, 0xea6357baabe59508ULL) },
 928     { make_floatx80_init(0xbffd, 0xb000000000000fe6ULL),
 929       make_floatx80_init(0x3ffe, 0xc9b9bd866e2f234bULL),
 930       make_floatx80_init(0xbffc, 0xd91909e6474372d4ULL) },
 931     { make_floatx80_init(0xbffd, 0x9fffffffffff2172ULL),
 932       make_floatx80_init(0x3ffe, 0xce248c151f84bf00ULL),
 933       make_floatx80_init(0xbffc, 0xc76dcfab81ed0400ULL) },
 934     { make_floatx80_init(0xbffd, 0x8fffffffffffafffULL),
 935       make_floatx80_init(0x3ffe, 0xd2a81d91f12afb2bULL),
 936       make_floatx80_init(0xbffc, 0xb55f89b83b541354ULL) },
 937     { make_floatx80_init(0xbffc, 0xffffffffffff81a3ULL),
 938       make_floatx80_init(0x3ffe, 0xd744fccad69d7d5eULL),
 939       make_floatx80_init(0xbffc, 0xa2ec0cd4a58a0a88ULL) },
 940     { make_floatx80_init(0xbffc, 0xdfffffffffff1568ULL),
 941       make_floatx80_init(0x3ffe, 0xdbfbb797daf25a44ULL),
 942       make_floatx80_init(0xbffc, 0x901121a0943696f0ULL) },
 943     { make_floatx80_init(0xbffc, 0xbfffffffffff68daULL),
 944       make_floatx80_init(0x3ffe, 0xe0ccdeec2a94f811ULL),
 945       make_floatx80_init(0xbffb, 0xf999089eab583f78ULL) },
 946     { make_floatx80_init(0xbffc, 0x9fffffffffff4690ULL),
 947       make_floatx80_init(0x3ffe, 0xe5b906e77c83657eULL),
 948       make_floatx80_init(0xbffb, 0xd237c8c41be4d410ULL) },
 949     { make_floatx80_init(0xbffb, 0xffffffffffff8aeeULL),
 950       make_floatx80_init(0x3ffe, 0xeac0c6e7dd24427cULL),
 951       make_floatx80_init(0xbffb, 0xa9f9c8c116ddec20ULL) },
 952     { make_floatx80_init(0xbffb, 0xbfffffffffff2d18ULL),
 953       make_floatx80_init(0x3ffe, 0xefe4b99bdcdb06ebULL),
 954       make_floatx80_init(0xbffb, 0x80da33211927c8a8ULL) },
 955     { make_floatx80_init(0xbffa, 0xffffffffffff8ccbULL),
 956       make_floatx80_init(0x3ffe, 0xf5257d152486d0f4ULL),
 957       make_floatx80_init(0xbffa, 0xada82eadb792f0c0ULL) },
 958     { make_floatx80_init(0xbff9, 0xffffffffffff11feULL),
 959       make_floatx80_init(0x3ffe, 0xfa83b2db722a0846ULL),
 960       make_floatx80_init(0xbff9, 0xaf89a491babef740ULL) },
 961     { floatx80_zero_init,
 962       make_floatx80_init(0x3fff, 0x8000000000000000ULL),
 963       floatx80_zero_init },
 964     { make_floatx80_init(0x3ff9, 0xffffffffffff2680ULL),
 965       make_floatx80_init(0x3fff, 0x82cd8698ac2b9f6fULL),
 966       make_floatx80_init(0x3ff9, 0xb361a62b0ae7dbc0ULL) },
 967     { make_floatx80_init(0x3ffb, 0x800000000000b500ULL),
 968       make_floatx80_init(0x3fff, 0x85aac367cc488345ULL),
 969       make_floatx80_init(0x3ffa, 0xb5586cf9891068a0ULL) },
 970     { make_floatx80_init(0x3ffb, 0xbfffffffffff4b67ULL),
 971       make_floatx80_init(0x3fff, 0x88980e8092da7cceULL),
 972       make_floatx80_init(0x3ffb, 0x8980e8092da7cce0ULL) },
 973     { make_floatx80_init(0x3ffb, 0xffffffffffffff57ULL),
 974       make_floatx80_init(0x3fff, 0x8b95c1e3ea8bd6dfULL),
 975       make_floatx80_init(0x3ffb, 0xb95c1e3ea8bd6df0ULL) },
 976     { make_floatx80_init(0x3ffc, 0x9fffffffffff811fULL),
 977       make_floatx80_init(0x3fff, 0x8ea4398b45cd4780ULL),
 978       make_floatx80_init(0x3ffb, 0xea4398b45cd47800ULL) },
 979     { make_floatx80_init(0x3ffc, 0xbfffffffffff9980ULL),
 980       make_floatx80_init(0x3fff, 0x91c3d373ab11b919ULL),
 981       make_floatx80_init(0x3ffc, 0x8e1e9b9d588dc8c8ULL) },
 982     { make_floatx80_init(0x3ffc, 0xdffffffffffff631ULL),
 983       make_floatx80_init(0x3fff, 0x94f4efa8fef70864ULL),
 984       make_floatx80_init(0x3ffc, 0xa7a77d47f7b84320ULL) },
 985     { make_floatx80_init(0x3ffc, 0xffffffffffff2499ULL),
 986       make_floatx80_init(0x3fff, 0x9837f0518db892d4ULL),
 987       make_floatx80_init(0x3ffc, 0xc1bf828c6dc496a0ULL) },
 988     { make_floatx80_init(0x3ffd, 0x8fffffffffff80fbULL),
 989       make_floatx80_init(0x3fff, 0x9b8d39b9d54e3a79ULL),
 990       make_floatx80_init(0x3ffc, 0xdc69cdceaa71d3c8ULL) },
 991     { make_floatx80_init(0x3ffd, 0x9fffffffffffbc23ULL),
 992       make_floatx80_init(0x3fff, 0x9ef5326091a10313ULL),
 993       make_floatx80_init(0x3ffc, 0xf7a993048d081898ULL) },
 994     { make_floatx80_init(0x3ffd, 0xafffffffffff20ecULL),
 995       make_floatx80_init(0x3fff, 0xa27043030c49370aULL),
 996       make_floatx80_init(0x3ffd, 0x89c10c0c3124dc28ULL) },
 997     { make_floatx80_init(0x3ffd, 0xc00000000000fd2cULL),
 998       make_floatx80_init(0x3fff, 0xa5fed6a9b15171cfULL),
 999       make_floatx80_init(0x3ffd, 0x97fb5aa6c545c73cULL) },
1000     { make_floatx80_init(0x3ffd, 0xd0000000000093beULL),
1001       make_floatx80_init(0x3fff, 0xa9a15ab4ea7c30e6ULL),
1002       make_floatx80_init(0x3ffd, 0xa6856ad3a9f0c398ULL) },
1003     { make_floatx80_init(0x3ffd, 0xe00000000000c2aeULL),
1004       make_floatx80_init(0x3fff, 0xad583eea42a17876ULL),
1005       make_floatx80_init(0x3ffd, 0xb560fba90a85e1d8ULL) },
1006     { make_floatx80_init(0x3ffd, 0xefffffffffff1e3fULL),
1007       make_floatx80_init(0x3fff, 0xb123f581d2abef6cULL),
1008       make_floatx80_init(0x3ffd, 0xc48fd6074aafbdb0ULL) },
1009     { make_floatx80_init(0x3ffd, 0xffffffffffff1c23ULL),
1010       make_floatx80_init(0x3fff, 0xb504f333f9de2cadULL),
1011       make_floatx80_init(0x3ffd, 0xd413cccfe778b2b4ULL) },
1012     { make_floatx80_init(0x3ffe, 0x8800000000006344ULL),
1013       make_floatx80_init(0x3fff, 0xb8fbaf4762fbd0a1ULL),
1014       make_floatx80_init(0x3ffd, 0xe3eebd1d8bef4284ULL) },
1015     { make_floatx80_init(0x3ffe, 0x9000000000005d67ULL),
1016       make_floatx80_init(0x3fff, 0xbd08a39f580c668dULL),
1017       make_floatx80_init(0x3ffd, 0xf4228e7d60319a34ULL) },
1018     { make_floatx80_init(0x3ffe, 0x9800000000009127ULL),
1019       make_floatx80_init(0x3fff, 0xc12c4cca6670e042ULL),
1020       make_floatx80_init(0x3ffe, 0x82589994cce1c084ULL) },
1021     { make_floatx80_init(0x3ffe, 0x9fffffffffff06f9ULL),
1022       make_floatx80_init(0x3fff, 0xc5672a11550655c3ULL),
1023       make_floatx80_init(0x3ffe, 0x8ace5422aa0cab86ULL) },
1024     { make_floatx80_init(0x3ffe, 0xa7fffffffffff80dULL),
1025       make_floatx80_init(0x3fff, 0xc9b9bd866e2f234bULL),
1026       make_floatx80_init(0x3ffe, 0x93737b0cdc5e4696ULL) },
1027     { make_floatx80_init(0x3ffe, 0xafffffffffff1470ULL),
1028       make_floatx80_init(0x3fff, 0xce248c151f83fd69ULL),
1029       make_floatx80_init(0x3ffe, 0x9c49182a3f07fad2ULL) },
1030     { make_floatx80_init(0x3ffe, 0xb800000000000e0aULL),
1031       make_floatx80_init(0x3fff, 0xd2a81d91f12aec5cULL),
1032       make_floatx80_init(0x3ffe, 0xa5503b23e255d8b8ULL) },
1033     { make_floatx80_init(0x3ffe, 0xc00000000000b7faULL),
1034       make_floatx80_init(0x3fff, 0xd744fccad69dd630ULL),
1035       make_floatx80_init(0x3ffe, 0xae89f995ad3bac60ULL) },
1036     { make_floatx80_init(0x3ffe, 0xc800000000003aa6ULL),
1037       make_floatx80_init(0x3fff, 0xdbfbb797daf25a44ULL),
1038       make_floatx80_init(0x3ffe, 0xb7f76f2fb5e4b488ULL) },
1039     { make_floatx80_init(0x3ffe, 0xd00000000000a6aeULL),
1040       make_floatx80_init(0x3fff, 0xe0ccdeec2a954685ULL),
1041       make_floatx80_init(0x3ffe, 0xc199bdd8552a8d0aULL) },
1042     { make_floatx80_init(0x3ffe, 0xd800000000004165ULL),
1043       make_floatx80_init(0x3fff, 0xe5b906e77c837155ULL),
1044       make_floatx80_init(0x3ffe, 0xcb720dcef906e2aaULL) },
1045     { make_floatx80_init(0x3ffe, 0xe00000000000582cULL),
1046       make_floatx80_init(0x3fff, 0xeac0c6e7dd24713aULL),
1047       make_floatx80_init(0x3ffe, 0xd5818dcfba48e274ULL) },
1048     { make_floatx80_init(0x3ffe, 0xe800000000001a5dULL),
1049       make_floatx80_init(0x3fff, 0xefe4b99bdcdb06ebULL),
1050       make_floatx80_init(0x3ffe, 0xdfc97337b9b60dd6ULL) },
1051     { make_floatx80_init(0x3ffe, 0xefffffffffffc1efULL),
1052       make_floatx80_init(0x3fff, 0xf5257d152486a2faULL),
1053       make_floatx80_init(0x3ffe, 0xea4afa2a490d45f4ULL) },
1054     { make_floatx80_init(0x3ffe, 0xf800000000001069ULL),
1055       make_floatx80_init(0x3fff, 0xfa83b2db722a0e5cULL),
1056       make_floatx80_init(0x3ffe, 0xf50765b6e4541cb8ULL) },
1057     { make_floatx80_init(0x3fff, 0x8000000000000000ULL),
1058       make_floatx80_init(0x4000, 0x8000000000000000ULL),
1059       make_floatx80_init(0x3fff, 0x8000000000000000ULL) },
1060 };
1061
1062 void helper_f2xm1(CPUX86State *env)
1063 {
1064     uint8_t old_flags = save_exception_flags(env);
1065     uint64_t sig = extractFloatx80Frac(ST0);
1066     int32_t exp = extractFloatx80Exp(ST0);
1067     bool sign = extractFloatx80Sign(ST0);
1068
1069     if (floatx80_invalid_encoding(ST0)) {
1070         float_raise(float_flag_invalid, &env->fp_status);
1071         ST0 = floatx80_default_nan(&env->fp_status);
1072     } else if (floatx80_is_any_nan(ST0)) {
1073         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1074             float_raise(float_flag_invalid, &env->fp_status);
1075             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1076         }
1077     } else if (exp > 0x3fff ||
1078                (exp == 0x3fff && sig != (0x8000000000000000ULL))) {
1079         /* Out of range for the instruction, treat as invalid.  */
1080         float_raise(float_flag_invalid, &env->fp_status);
1081         ST0 = floatx80_default_nan(&env->fp_status);
1082     } else if (exp == 0x3fff) {
1083         /* Argument 1 or -1, exact result 1 or -0.5.  */
1084         if (sign) {
1085             ST0 = make_floatx80(0xbffe, 0x8000000000000000ULL);
1086         }
1087     } else if (exp < 0x3fb0) {
1088         if (!floatx80_is_zero(ST0)) {
1089             /*
1090              * Multiplying the argument by an extra-precision version
1091              * of log(2) is sufficiently precise.  Zero arguments are
1092              * returned unchanged.
1093              */
1094             uint64_t sig0, sig1, sig2;
1095             if (exp == 0) {
1096                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1097             }
1098             mul128By64To192(ln2_sig_high, ln2_sig_low, sig, &sig0, &sig1,
1099                             &sig2);
1100             /* This result is inexact.  */
1101             sig1 |= 1;
1102             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1103                                                 sign, exp, sig0, sig1,
1104                                                 &env->fp_status);
1105         }
1106     } else {
1107         floatx80 tmp, y, accum;
1108         bool asign, bsign;
1109         int32_t n, aexp, bexp;
1110         uint64_t asig0, asig1, asig2, bsig0, bsig1;
1111         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1112         FloatX80RoundPrec save_prec =
1113             env->fp_status.floatx80_rounding_precision;
1114         env->fp_status.float_rounding_mode = float_round_nearest_even;
1115         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1116
1117         /* Find the nearest multiple of 1/32 to the argument.  */
1118         tmp = floatx80_scalbn(ST0, 5, &env->fp_status);
1119         n = 32 + floatx80_to_int32(tmp, &env->fp_status);
1120         y = floatx80_sub(ST0, f2xm1_table[n].t, &env->fp_status);
1121
1122         if (floatx80_is_zero(y)) {
1123             /*
1124              * Use the value of 2^t - 1 from the table, to avoid
1125              * needing to special-case zero as a result of
1126              * multiplication below.
1127              */
1128             ST0 = f2xm1_table[n].t;
1129             set_float_exception_flags(float_flag_inexact, &env->fp_status);
1130             env->fp_status.float_rounding_mode = save_mode;
1131         } else {
1132             /*
1133              * Compute the lower parts of a polynomial expansion for
1134              * (2^y - 1) / y.
1135              */
1136             accum = floatx80_mul(f2xm1_coeff_7, y, &env->fp_status);
1137             accum = floatx80_add(f2xm1_coeff_6, accum, &env->fp_status);
1138             accum = floatx80_mul(accum, y, &env->fp_status);
1139             accum = floatx80_add(f2xm1_coeff_5, accum, &env->fp_status);
1140             accum = floatx80_mul(accum, y, &env->fp_status);
1141             accum = floatx80_add(f2xm1_coeff_4, accum, &env->fp_status);
1142             accum = floatx80_mul(accum, y, &env->fp_status);
1143             accum = floatx80_add(f2xm1_coeff_3, accum, &env->fp_status);
1144             accum = floatx80_mul(accum, y, &env->fp_status);
1145             accum = floatx80_add(f2xm1_coeff_2, accum, &env->fp_status);
1146             accum = floatx80_mul(accum, y, &env->fp_status);
1147             accum = floatx80_add(f2xm1_coeff_1, accum, &env->fp_status);
1148             accum = floatx80_mul(accum, y, &env->fp_status);
1149             accum = floatx80_add(f2xm1_coeff_0_low, accum, &env->fp_status);
1150
1151             /*
1152              * The full polynomial expansion is f2xm1_coeff_0 + accum
1153              * (where accum has much lower magnitude, and so, in
1154              * particular, carry out of the addition is not possible).
1155              * (This expansion is only accurate to about 70 bits, not
1156              * 128 bits.)
1157              */
1158             aexp = extractFloatx80Exp(f2xm1_coeff_0);
1159             asign = extractFloatx80Sign(f2xm1_coeff_0);
1160             shift128RightJamming(extractFloatx80Frac(accum), 0,
1161                                  aexp - extractFloatx80Exp(accum),
1162                                  &asig0, &asig1);
1163             bsig0 = extractFloatx80Frac(f2xm1_coeff_0);
1164             bsig1 = 0;
1165             if (asign == extractFloatx80Sign(accum)) {
1166                 add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1167             } else {
1168                 sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1169             }
1170             /* And thus compute an approximation to 2^y - 1.  */
1171             mul128By64To192(asig0, asig1, extractFloatx80Frac(y),
1172                             &asig0, &asig1, &asig2);
1173             aexp += extractFloatx80Exp(y) - 0x3ffe;
1174             asign ^= extractFloatx80Sign(y);
1175             if (n != 32) {
1176                 /*
1177                  * Multiply this by the precomputed value of 2^t and
1178                  * add that of 2^t - 1.
1179                  */
1180                 mul128By64To192(asig0, asig1,
1181                                 extractFloatx80Frac(f2xm1_table[n].exp2),
1182                                 &asig0, &asig1, &asig2);
1183                 aexp += extractFloatx80Exp(f2xm1_table[n].exp2) - 0x3ffe;
1184                 bexp = extractFloatx80Exp(f2xm1_table[n].exp2m1);
1185                 bsig0 = extractFloatx80Frac(f2xm1_table[n].exp2m1);
1186                 bsig1 = 0;
1187                 if (bexp < aexp) {
1188                     shift128RightJamming(bsig0, bsig1, aexp - bexp,
1189                                          &bsig0, &bsig1);
1190                 } else if (aexp < bexp) {
1191                     shift128RightJamming(asig0, asig1, bexp - aexp,
1192                                          &asig0, &asig1);
1193                     aexp = bexp;
1194                 }
1195                 /* The sign of 2^t - 1 is always that of the result.  */
1196                 bsign = extractFloatx80Sign(f2xm1_table[n].exp2m1);
1197                 if (asign == bsign) {
1198                     /* Avoid possible carry out of the addition.  */
1199                     shift128RightJamming(asig0, asig1, 1,
1200                                          &asig0, &asig1);
1201                     shift128RightJamming(bsig0, bsig1, 1,
1202                                          &bsig0, &bsig1);
1203                     ++aexp;
1204                     add128(asig0, asig1, bsig0, bsig1, &asig0, &asig1);
1205                 } else {
1206                     sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
1207                     asign = bsign;
1208                 }
1209             }
1210             env->fp_status.float_rounding_mode = save_mode;
1211             /* This result is inexact.  */
1212             asig1 |= 1;
1213             ST0 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1214                                                 asign, aexp, asig0, asig1,
1215                                                 &env->fp_status);
1216         }
1217
1218         env->fp_status.floatx80_rounding_precision = save_prec;
1219     }
1220     merge_exception_flags(env, old_flags);
1221 }
1222
1223 void helper_fptan(CPUX86State *env)
1224 {
1225     double fptemp = floatx80_to_double(env, ST0);
1226
1227     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
1228         env->fpus |= 0x400;
1229     } else {
1230         fptemp = tan(fptemp);
1231         ST0 = double_to_floatx80(env, fptemp);
1232         fpush(env);
1233         ST0 = floatx80_one;
1234         env->fpus &= ~0x400; /* C2 <-- 0 */
1235         /* the above code is for |arg| < 2**52 only */
1236     }
1237 }
1238
/*
 * Values of pi/4, pi/2, 3pi/4 and pi, with 128-bit precision.  Each is
 * given as a biased exponent plus the high and low 64-bit halves of the
 * significand.  pi/4, pi/2 and pi differ only by powers of two, so they
 * share one significand and are distinguished by the exponent alone.
 */
#define pi_4_exp        0x3ffe
#define pi_4_sig_high   0xc90fdaa22168c234ULL
#define pi_4_sig_low    0xc4c6628b80dc1cd1ULL
#define pi_2_exp        0x3fff
#define pi_2_sig_high   pi_4_sig_high
#define pi_2_sig_low    pi_4_sig_low
#define pi_34_exp       0x4000
#define pi_34_sig_high  0x96cbe3f9990e91a7ULL
#define pi_34_sig_low   0x9394c9e8a0a5159dULL
#define pi_exp          0x4000
#define pi_sig_high     pi_4_sig_high
#define pi_sig_low      pi_4_sig_low
1252
/*
 * Polynomial coefficients for an approximation to atan(x), with only
 * odd powers of x used, for x in the interval [-1/16, 1/16].  (Unlike
 * for some other approximations, no low part is needed for the first
 * coefficient here to achieve a sufficiently accurate result, because
 * the coefficient in this minimax approximation is very close to
 * exactly 1.)
 *
 * fpatan_coeff_k multiplies x^(2k+1): helper_fpatan evaluates
 * coefficients 6 down to 1 by Horner's rule in x^2 and then forms
 * x * (fpatan_coeff_0 + accum).  The values are close to, but not
 * identical with, the Taylor-series coefficients 1, -1/3, 1/5, -1/7,
 * 1/9, -1/11 and 1/13.
 */
#define fpatan_coeff_0 make_floatx80(0x3fff, 0x8000000000000000ULL)
#define fpatan_coeff_1 make_floatx80(0xbffd, 0xaaaaaaaaaaaaaa43ULL)
#define fpatan_coeff_2 make_floatx80(0x3ffc, 0xccccccccccbfe4f8ULL)
#define fpatan_coeff_3 make_floatx80(0xbffc, 0x92492491fbab2e66ULL)
#define fpatan_coeff_4 make_floatx80(0x3ffb, 0xe38e372881ea1e0bULL)
#define fpatan_coeff_5 make_floatx80(0xbffb, 0xba2c0104bbdd0615ULL)
#define fpatan_coeff_6 make_floatx80(0x3ffb, 0x9baf7ebf898b42efULL)
1268
1269 struct fpatan_data {
1270     /* High and low parts of atan(x).  */
1271     floatx80 atan_high, atan_low;
1272 };
1273
1274 static const struct fpatan_data fpatan_table[9] = {
1275     { floatx80_zero_init,
1276       floatx80_zero_init },
1277     { make_floatx80_init(0x3ffb, 0xfeadd4d5617b6e33ULL),
1278       make_floatx80_init(0xbfb9, 0xdda19d8305ddc420ULL) },
1279     { make_floatx80_init(0x3ffc, 0xfadbafc96406eb15ULL),
1280       make_floatx80_init(0x3fbb, 0xdb8f3debef442fccULL) },
1281     { make_floatx80_init(0x3ffd, 0xb7b0ca0f26f78474ULL),
1282       make_floatx80_init(0xbfbc, 0xeab9bdba460376faULL) },
1283     { make_floatx80_init(0x3ffd, 0xed63382b0dda7b45ULL),
1284       make_floatx80_init(0x3fbc, 0xdfc88bd978751a06ULL) },
1285     { make_floatx80_init(0x3ffe, 0x8f005d5ef7f59f9bULL),
1286       make_floatx80_init(0x3fbd, 0xb906bc2ccb886e90ULL) },
1287     { make_floatx80_init(0x3ffe, 0xa4bc7d1934f70924ULL),
1288       make_floatx80_init(0x3fbb, 0xcd43f9522bed64f8ULL) },
1289     { make_floatx80_init(0x3ffe, 0xb8053e2bc2319e74ULL),
1290       make_floatx80_init(0xbfbc, 0xd3496ab7bd6eef0cULL) },
1291     { make_floatx80_init(0x3ffe, 0xc90fdaa22168c235ULL),
1292       make_floatx80_init(0xbfbc, 0xece675d1fc8f8cbcULL) },
1293 };
1294
1295 void helper_fpatan(CPUX86State *env)
1296 {
1297     uint8_t old_flags = save_exception_flags(env);
1298     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1299     int32_t arg0_exp = extractFloatx80Exp(ST0);
1300     bool arg0_sign = extractFloatx80Sign(ST0);
1301     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1302     int32_t arg1_exp = extractFloatx80Exp(ST1);
1303     bool arg1_sign = extractFloatx80Sign(ST1);
1304
1305     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1306         float_raise(float_flag_invalid, &env->fp_status);
1307         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1308     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1309         float_raise(float_flag_invalid, &env->fp_status);
1310         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1311     } else if (floatx80_invalid_encoding(ST0) ||
1312                floatx80_invalid_encoding(ST1)) {
1313         float_raise(float_flag_invalid, &env->fp_status);
1314         ST1 = floatx80_default_nan(&env->fp_status);
1315     } else if (floatx80_is_any_nan(ST0)) {
1316         ST1 = ST0;
1317     } else if (floatx80_is_any_nan(ST1)) {
1318         /* Pass this NaN through.  */
1319     } else if (floatx80_is_zero(ST1) && !arg0_sign) {
1320         /* Pass this zero through.  */
1321     } else if (((floatx80_is_infinity(ST0) && !floatx80_is_infinity(ST1)) ||
1322                  arg0_exp - arg1_exp >= 80) &&
1323                !arg0_sign) {
1324         /*
1325          * Dividing ST1 by ST0 gives the correct result up to
1326          * rounding, and avoids spurious underflow exceptions that
1327          * might result from passing some small values through the
1328          * polynomial approximation, but if a finite nonzero result of
1329          * division is exact, the result of fpatan is still inexact
1330          * (and underflowing where appropriate).
1331          */
1332         FloatX80RoundPrec save_prec =
1333             env->fp_status.floatx80_rounding_precision;
1334         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1335         ST1 = floatx80_div(ST1, ST0, &env->fp_status);
1336         env->fp_status.floatx80_rounding_precision = save_prec;
1337         if (!floatx80_is_zero(ST1) &&
1338             !(get_float_exception_flags(&env->fp_status) &
1339               float_flag_inexact)) {
1340             /*
1341              * The mathematical result is very slightly closer to zero
1342              * than this exact result.  Round a value with the
1343              * significand adjusted accordingly to get the correct
1344              * exceptions, and possibly an adjusted result depending
1345              * on the rounding mode.
1346              */
1347             uint64_t sig = extractFloatx80Frac(ST1);
1348             int32_t exp = extractFloatx80Exp(ST1);
1349             bool sign = extractFloatx80Sign(ST1);
1350             if (exp == 0) {
1351                 normalizeFloatx80Subnormal(sig, &exp, &sig);
1352             }
1353             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1354                                                 sign, exp, sig - 1,
1355                                                 -1, &env->fp_status);
1356         }
1357     } else {
1358         /* The result is inexact.  */
1359         bool rsign = arg1_sign;
1360         int32_t rexp;
1361         uint64_t rsig0, rsig1;
1362         if (floatx80_is_zero(ST1)) {
1363             /*
1364              * ST0 is negative.  The result is pi with the sign of
1365              * ST1.
1366              */
1367             rexp = pi_exp;
1368             rsig0 = pi_sig_high;
1369             rsig1 = pi_sig_low;
1370         } else if (floatx80_is_infinity(ST1)) {
1371             if (floatx80_is_infinity(ST0)) {
1372                 if (arg0_sign) {
1373                     rexp = pi_34_exp;
1374                     rsig0 = pi_34_sig_high;
1375                     rsig1 = pi_34_sig_low;
1376                 } else {
1377                     rexp = pi_4_exp;
1378                     rsig0 = pi_4_sig_high;
1379                     rsig1 = pi_4_sig_low;
1380                 }
1381             } else {
1382                 rexp = pi_2_exp;
1383                 rsig0 = pi_2_sig_high;
1384                 rsig1 = pi_2_sig_low;
1385             }
1386         } else if (floatx80_is_zero(ST0) || arg1_exp - arg0_exp >= 80) {
1387             rexp = pi_2_exp;
1388             rsig0 = pi_2_sig_high;
1389             rsig1 = pi_2_sig_low;
1390         } else if (floatx80_is_infinity(ST0) || arg0_exp - arg1_exp >= 80) {
1391             /* ST0 is negative.  */
1392             rexp = pi_exp;
1393             rsig0 = pi_sig_high;
1394             rsig1 = pi_sig_low;
1395         } else {
1396             /*
1397              * ST0 and ST1 are finite, nonzero and with exponents not
1398              * too far apart.
1399              */
1400             int32_t adj_exp, num_exp, den_exp, xexp, yexp, n, texp, zexp, aexp;
1401             int32_t azexp, axexp;
1402             bool adj_sub, ysign, zsign;
1403             uint64_t adj_sig0, adj_sig1, num_sig, den_sig, xsig0, xsig1;
1404             uint64_t msig0, msig1, msig2, remsig0, remsig1, remsig2;
1405             uint64_t ysig0, ysig1, tsig, zsig0, zsig1, asig0, asig1;
1406             uint64_t azsig0, azsig1;
1407             uint64_t azsig2, azsig3, axsig0, axsig1;
1408             floatx80 x8;
1409             FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
1410             FloatX80RoundPrec save_prec =
1411                 env->fp_status.floatx80_rounding_precision;
1412             env->fp_status.float_rounding_mode = float_round_nearest_even;
1413             env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
1414
1415             if (arg0_exp == 0) {
1416                 normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
1417             }
1418             if (arg1_exp == 0) {
1419                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
1420             }
1421             if (arg0_exp > arg1_exp ||
1422                 (arg0_exp == arg1_exp && arg0_sig >= arg1_sig)) {
1423                 /* Work with abs(ST1) / abs(ST0).  */
1424                 num_exp = arg1_exp;
1425                 num_sig = arg1_sig;
1426                 den_exp = arg0_exp;
1427                 den_sig = arg0_sig;
1428                 if (arg0_sign) {
1429                     /* The result is subtracted from pi.  */
1430                     adj_exp = pi_exp;
1431                     adj_sig0 = pi_sig_high;
1432                     adj_sig1 = pi_sig_low;
1433                     adj_sub = true;
1434                 } else {
1435                     /* The result is used as-is.  */
1436                     adj_exp = 0;
1437                     adj_sig0 = 0;
1438                     adj_sig1 = 0;
1439                     adj_sub = false;
1440                 }
1441             } else {
1442                 /* Work with abs(ST0) / abs(ST1).  */
1443                 num_exp = arg0_exp;
1444                 num_sig = arg0_sig;
1445                 den_exp = arg1_exp;
1446                 den_sig = arg1_sig;
1447                 /* The result is added to or subtracted from pi/2.  */
1448                 adj_exp = pi_2_exp;
1449                 adj_sig0 = pi_2_sig_high;
1450                 adj_sig1 = pi_2_sig_low;
1451                 adj_sub = !arg0_sign;
1452             }
1453
1454             /*
1455              * Compute x = num/den, where 0 < x <= 1 and x is not too
1456              * small.
1457              */
1458             xexp = num_exp - den_exp + 0x3ffe;
1459             remsig0 = num_sig;
1460             remsig1 = 0;
1461             if (den_sig <= remsig0) {
1462                 shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1463                 ++xexp;
1464             }
1465             xsig0 = estimateDiv128To64(remsig0, remsig1, den_sig);
1466             mul64To128(den_sig, xsig0, &msig0, &msig1);
1467             sub128(remsig0, remsig1, msig0, msig1, &remsig0, &remsig1);
1468             while ((int64_t) remsig0 < 0) {
1469                 --xsig0;
1470                 add128(remsig0, remsig1, 0, den_sig, &remsig0, &remsig1);
1471             }
1472             xsig1 = estimateDiv128To64(remsig1, 0, den_sig);
1473             /*
1474              * No need to correct any estimation error in xsig1; even
1475              * with such error, it is accurate enough.
1476              */
1477
1478             /*
1479              * Split x as x = t + y, where t = n/8 is the nearest
1480              * multiple of 1/8 to x.
1481              */
1482             x8 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
1483                                                false, xexp + 3, xsig0,
1484                                                xsig1, &env->fp_status);
1485             n = floatx80_to_int32(x8, &env->fp_status);
1486             if (n == 0) {
1487                 ysign = false;
1488                 yexp = xexp;
1489                 ysig0 = xsig0;
1490                 ysig1 = xsig1;
1491                 texp = 0;
1492                 tsig = 0;
1493             } else {
1494                 int shift = clz32(n) + 32;
1495                 texp = 0x403b - shift;
1496                 tsig = n;
1497                 tsig <<= shift;
1498                 if (texp == xexp) {
1499                     sub128(xsig0, xsig1, tsig, 0, &ysig0, &ysig1);
1500                     if ((int64_t) ysig0 >= 0) {
1501                         ysign = false;
1502                         if (ysig0 == 0) {
1503                             if (ysig1 == 0) {
1504                                 yexp = 0;
1505                             } else {
1506                                 shift = clz64(ysig1) + 64;
1507                                 yexp = xexp - shift;
1508                                 shift128Left(ysig0, ysig1, shift,
1509                                              &ysig0, &ysig1);
1510                             }
1511                         } else {
1512                             shift = clz64(ysig0);
1513                             yexp = xexp - shift;
1514                             shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1515                         }
1516                     } else {
1517                         ysign = true;
1518                         sub128(0, 0, ysig0, ysig1, &ysig0, &ysig1);
1519                         if (ysig0 == 0) {
1520                             shift = clz64(ysig1) + 64;
1521                         } else {
1522                             shift = clz64(ysig0);
1523                         }
1524                         yexp = xexp - shift;
1525                         shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1526                     }
1527                 } else {
1528                     /*
1529                      * t's exponent must be greater than x's because t
1530                      * is positive and the nearest multiple of 1/8 to
1531                      * x, and if x has a greater exponent, the power
1532                      * of 2 with that exponent is also a multiple of
1533                      * 1/8.
1534                      */
1535                     uint64_t usig0, usig1;
1536                     shift128RightJamming(xsig0, xsig1, texp - xexp,
1537                                          &usig0, &usig1);
1538                     ysign = true;
1539                     sub128(tsig, 0, usig0, usig1, &ysig0, &ysig1);
1540                     if (ysig0 == 0) {
1541                         shift = clz64(ysig1) + 64;
1542                     } else {
1543                         shift = clz64(ysig0);
1544                     }
1545                     yexp = texp - shift;
1546                     shift128Left(ysig0, ysig1, shift, &ysig0, &ysig1);
1547                 }
1548             }
1549
1550             /*
1551              * Compute z = y/(1+tx), so arctan(x) = arctan(t) +
1552              * arctan(z).
1553              */
1554             zsign = ysign;
1555             if (texp == 0 || yexp == 0) {
1556                 zexp = yexp;
1557                 zsig0 = ysig0;
1558                 zsig1 = ysig1;
1559             } else {
1560                 /*
1561                  * t <= 1, x <= 1 and if both are 1 then y is 0, so tx < 1.
1562                  */
1563                 int32_t dexp = texp + xexp - 0x3ffe;
1564                 uint64_t dsig0, dsig1, dsig2;
1565                 mul128By64To192(xsig0, xsig1, tsig, &dsig0, &dsig1, &dsig2);
1566                 /*
1567                  * dexp <= 0x3fff (and if equal, dsig0 has a leading 0
1568                  * bit).  Add 1 to produce the denominator 1+tx.
1569                  */
1570                 shift128RightJamming(dsig0, dsig1, 0x3fff - dexp,
1571                                      &dsig0, &dsig1);
1572                 dsig0 |= 0x8000000000000000ULL;
1573                 zexp = yexp - 1;
1574                 remsig0 = ysig0;
1575                 remsig1 = ysig1;
1576                 remsig2 = 0;
1577                 if (dsig0 <= remsig0) {
1578                     shift128Right(remsig0, remsig1, 1, &remsig0, &remsig1);
1579                     ++zexp;
1580                 }
1581                 zsig0 = estimateDiv128To64(remsig0, remsig1, dsig0);
1582                 mul128By64To192(dsig0, dsig1, zsig0, &msig0, &msig1, &msig2);
1583                 sub192(remsig0, remsig1, remsig2, msig0, msig1, msig2,
1584                        &remsig0, &remsig1, &remsig2);
1585                 while ((int64_t) remsig0 < 0) {
1586                     --zsig0;
1587                     add192(remsig0, remsig1, remsig2, 0, dsig0, dsig1,
1588                            &remsig0, &remsig1, &remsig2);
1589                 }
1590                 zsig1 = estimateDiv128To64(remsig1, remsig2, dsig0);
1591                 /* No need to correct any estimation error in zsig1.  */
1592             }
1593
1594             if (zexp == 0) {
1595                 azexp = 0;
1596                 azsig0 = 0;
1597                 azsig1 = 0;
1598             } else {
1599                 floatx80 z2, accum;
1600                 uint64_t z2sig0, z2sig1, z2sig2, z2sig3;
1601                 /* Compute z^2.  */
1602                 mul128To256(zsig0, zsig1, zsig0, zsig1,
1603                             &z2sig0, &z2sig1, &z2sig2, &z2sig3);
1604                 z2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
1605                                                    zexp + zexp - 0x3ffe,
1606                                                    z2sig0, z2sig1,
1607                                                    &env->fp_status);
1608
1609                 /* Compute the lower parts of the polynomial expansion.  */
1610                 accum = floatx80_mul(fpatan_coeff_6, z2, &env->fp_status);
1611                 accum = floatx80_add(fpatan_coeff_5, accum, &env->fp_status);
1612                 accum = floatx80_mul(accum, z2, &env->fp_status);
1613                 accum = floatx80_add(fpatan_coeff_4, accum, &env->fp_status);
1614                 accum = floatx80_mul(accum, z2, &env->fp_status);
1615                 accum = floatx80_add(fpatan_coeff_3, accum, &env->fp_status);
1616                 accum = floatx80_mul(accum, z2, &env->fp_status);
1617                 accum = floatx80_add(fpatan_coeff_2, accum, &env->fp_status);
1618                 accum = floatx80_mul(accum, z2, &env->fp_status);
1619                 accum = floatx80_add(fpatan_coeff_1, accum, &env->fp_status);
1620                 accum = floatx80_mul(accum, z2, &env->fp_status);
1621
1622                 /*
1623                  * The full polynomial expansion is z*(fpatan_coeff_0 + accum).
1624                  * fpatan_coeff_0 is 1, and accum is negative and much smaller.
1625                  */
1626                 aexp = extractFloatx80Exp(fpatan_coeff_0);
1627                 shift128RightJamming(extractFloatx80Frac(accum), 0,
1628                                      aexp - extractFloatx80Exp(accum),
1629                                      &asig0, &asig1);
1630                 sub128(extractFloatx80Frac(fpatan_coeff_0), 0, asig0, asig1,
1631                        &asig0, &asig1);
1632                 /* Multiply by z to compute arctan(z).  */
1633                 azexp = aexp + zexp - 0x3ffe;
1634                 mul128To256(asig0, asig1, zsig0, zsig1, &azsig0, &azsig1,
1635                             &azsig2, &azsig3);
1636             }
1637
1638             /* Add arctan(t) (positive or zero) and arctan(z) (sign zsign).  */
1639             if (texp == 0) {
1640                 /* z is positive.  */
1641                 axexp = azexp;
1642                 axsig0 = azsig0;
1643                 axsig1 = azsig1;
1644             } else {
1645                 bool low_sign = extractFloatx80Sign(fpatan_table[n].atan_low);
1646                 int32_t low_exp = extractFloatx80Exp(fpatan_table[n].atan_low);
1647                 uint64_t low_sig0 =
1648                     extractFloatx80Frac(fpatan_table[n].atan_low);
1649                 uint64_t low_sig1 = 0;
1650                 axexp = extractFloatx80Exp(fpatan_table[n].atan_high);
1651                 axsig0 = extractFloatx80Frac(fpatan_table[n].atan_high);
1652                 axsig1 = 0;
1653                 shift128RightJamming(low_sig0, low_sig1, axexp - low_exp,
1654                                      &low_sig0, &low_sig1);
1655                 if (low_sign) {
1656                     sub128(axsig0, axsig1, low_sig0, low_sig1,
1657                            &axsig0, &axsig1);
1658                 } else {
1659                     add128(axsig0, axsig1, low_sig0, low_sig1,
1660                            &axsig0, &axsig1);
1661                 }
1662                 if (azexp >= axexp) {
1663                     shift128RightJamming(axsig0, axsig1, azexp - axexp + 1,
1664                                          &axsig0, &axsig1);
1665                     axexp = azexp + 1;
1666                     shift128RightJamming(azsig0, azsig1, 1,
1667                                          &azsig0, &azsig1);
1668                 } else {
1669                     shift128RightJamming(axsig0, axsig1, 1,
1670                                          &axsig0, &axsig1);
1671                     shift128RightJamming(azsig0, azsig1, axexp - azexp + 1,
1672                                          &azsig0, &azsig1);
1673                     ++axexp;
1674                 }
1675                 if (zsign) {
1676                     sub128(axsig0, axsig1, azsig0, azsig1,
1677                            &axsig0, &axsig1);
1678                 } else {
1679                     add128(axsig0, axsig1, azsig0, azsig1,
1680                            &axsig0, &axsig1);
1681                 }
1682             }
1683
1684             if (adj_exp == 0) {
1685                 rexp = axexp;
1686                 rsig0 = axsig0;
1687                 rsig1 = axsig1;
1688             } else {
1689                 /*
1690                  * Add or subtract arctan(x) (exponent axexp,
1691                  * significand axsig0 and axsig1, positive, not
1692                  * necessarily normalized) to the number given by
1693                  * adj_exp, adj_sig0 and adj_sig1, according to
1694                  * adj_sub.
1695                  */
1696                 if (adj_exp >= axexp) {
1697                     shift128RightJamming(axsig0, axsig1, adj_exp - axexp + 1,
1698                                          &axsig0, &axsig1);
1699                     rexp = adj_exp + 1;
1700                     shift128RightJamming(adj_sig0, adj_sig1, 1,
1701                                          &adj_sig0, &adj_sig1);
1702                 } else {
1703                     shift128RightJamming(axsig0, axsig1, 1,
1704                                          &axsig0, &axsig1);
1705                     shift128RightJamming(adj_sig0, adj_sig1,
1706                                          axexp - adj_exp + 1,
1707                                          &adj_sig0, &adj_sig1);
1708                     rexp = axexp + 1;
1709                 }
1710                 if (adj_sub) {
1711                     sub128(adj_sig0, adj_sig1, axsig0, axsig1,
1712                            &rsig0, &rsig1);
1713                 } else {
1714                     add128(adj_sig0, adj_sig1, axsig0, axsig1,
1715                            &rsig0, &rsig1);
1716                 }
1717             }
1718
1719             env->fp_status.float_rounding_mode = save_mode;
1720             env->fp_status.floatx80_rounding_precision = save_prec;
1721         }
1722         /* This result is inexact.  */
1723         rsig1 |= 1;
1724         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x, rsign, rexp,
1725                                             rsig0, rsig1, &env->fp_status);
1726     }
1727
1728     fpop(env);
1729     merge_exception_flags(env, old_flags);
1730 }
1731
1732 void helper_fxtract(CPUX86State *env)
1733 {
1734     uint8_t old_flags = save_exception_flags(env);
1735     CPU_LDoubleU temp;
1736
1737     temp.d = ST0;
1738
1739     if (floatx80_is_zero(ST0)) {
1740         /* Easy way to generate -inf and raising division by 0 exception */
1741         ST0 = floatx80_div(floatx80_chs(floatx80_one), floatx80_zero,
1742                            &env->fp_status);
1743         fpush(env);
1744         ST0 = temp.d;
1745     } else if (floatx80_invalid_encoding(ST0)) {
1746         float_raise(float_flag_invalid, &env->fp_status);
1747         ST0 = floatx80_default_nan(&env->fp_status);
1748         fpush(env);
1749         ST0 = ST1;
1750     } else if (floatx80_is_any_nan(ST0)) {
1751         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1752             float_raise(float_flag_invalid, &env->fp_status);
1753             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
1754         }
1755         fpush(env);
1756         ST0 = ST1;
1757     } else if (floatx80_is_infinity(ST0)) {
1758         fpush(env);
1759         ST0 = ST1;
1760         ST1 = floatx80_infinity;
1761     } else {
1762         int expdif;
1763
1764         if (EXPD(temp) == 0) {
1765             int shift = clz64(temp.l.lower);
1766             temp.l.lower <<= shift;
1767             expdif = 1 - EXPBIAS - shift;
1768             float_raise(float_flag_input_denormal, &env->fp_status);
1769         } else {
1770             expdif = EXPD(temp) - EXPBIAS;
1771         }
1772         /* DP exponent bias */
1773         ST0 = int32_to_floatx80(expdif, &env->fp_status);
1774         fpush(env);
1775         BIASEXPONENT(temp);
1776         ST0 = temp.d;
1777     }
1778     merge_exception_flags(env, old_flags);
1779 }
1780
1781 static void helper_fprem_common(CPUX86State *env, bool mod)
1782 {
1783     uint8_t old_flags = save_exception_flags(env);
1784     uint64_t quotient;
1785     CPU_LDoubleU temp0, temp1;
1786     int exp0, exp1, expdiff;
1787
1788     temp0.d = ST0;
1789     temp1.d = ST1;
1790     exp0 = EXPD(temp0);
1791     exp1 = EXPD(temp1);
1792
1793     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
1794     if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
1795         exp0 == 0x7fff || exp1 == 0x7fff ||
1796         floatx80_invalid_encoding(ST0) || floatx80_invalid_encoding(ST1)) {
1797         ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1798     } else {
1799         if (exp0 == 0) {
1800             exp0 = 1 - clz64(temp0.l.lower);
1801         }
1802         if (exp1 == 0) {
1803             exp1 = 1 - clz64(temp1.l.lower);
1804         }
1805         expdiff = exp0 - exp1;
1806         if (expdiff < 64) {
1807             ST0 = floatx80_modrem(ST0, ST1, mod, &quotient, &env->fp_status);
1808             env->fpus |= (quotient & 0x4) << (8 - 2);  /* (C0) <-- q2 */
1809             env->fpus |= (quotient & 0x2) << (14 - 1); /* (C3) <-- q1 */
1810             env->fpus |= (quotient & 0x1) << (9 - 0);  /* (C1) <-- q0 */
1811         } else {
1812             /*
1813              * Partial remainder.  This choice of how many bits to
1814              * process at once is specified in AMD instruction set
1815              * manuals, and empirically is followed by Intel
1816              * processors as well; it ensures that the final remainder
1817              * operation in a loop does produce the correct low three
1818              * bits of the quotient.  AMD manuals specify that the
1819              * flags other than C2 are cleared, and empirically Intel
1820              * processors clear them as well.
1821              */
1822             int n = 32 + (expdiff % 32);
1823             temp1.d = floatx80_scalbn(temp1.d, expdiff - n, &env->fp_status);
1824             ST0 = floatx80_mod(ST0, temp1.d, &env->fp_status);
1825             env->fpus |= 0x400;  /* C2 <-- 1 */
1826         }
1827     }
1828     merge_exception_flags(env, old_flags);
1829 }
1830
/*
 * FPREM1 helper: runs the shared remainder code with mod == false.
 * The flag is forwarded to floatx80_modrem(); presumably false selects
 * the IEEE round-to-nearest remainder -- confirm against softfloat.
 */
void helper_fprem1(CPUX86State *env)
{
    helper_fprem_common(env, false);
}
1835
/*
 * FPREM helper: runs the shared remainder code with mod == true.
 * The flag is forwarded to floatx80_modrem(); presumably true selects
 * the truncating (C fmod-style) remainder -- confirm against softfloat.
 */
void helper_fprem(CPUX86State *env)
{
    helper_fprem_common(env, true);
}
1840
/*
 * 128-bit significand of log2(e).  helper_fyl2xp1 multiplies by this
 * directly when ST0 is small enough (exponent below 0x3fb0) that
 * log2(1+x) can be taken as x*log2(e).
 */
#define log2_e_sig_high 0xb8aa3b295c17f0bbULL
#define log2_e_sig_low 0xbe87fed0691d3e89ULL

/*
 * Polynomial coefficients for an approximation to log2((1+x)/(1-x)),
 * with only odd powers of x used, for x in the interval [2*sqrt(2)-3,
 * 3-2*sqrt(2)], which corresponds to logarithms of numbers in the
 * interval [sqrt(2)/2, sqrt(2)].
 *
 * helper_fyl2x_common evaluates coefficients 9 down to 1 by Horner's
 * scheme in floatx80 arithmetic, adds fyl2x_coeff_0_low to that sum,
 * and only then combines the result with fyl2x_coeff_0 using 128-bit
 * integer arithmetic, so the leading term carries extra precision.
 */
#define fyl2x_coeff_0 make_floatx80(0x4000, 0xb8aa3b295c17f0bcULL)
#define fyl2x_coeff_0_low make_floatx80(0xbfbf, 0x834972fe2d7bab1bULL)
#define fyl2x_coeff_1 make_floatx80(0x3ffe, 0xf6384ee1d01febb8ULL)
#define fyl2x_coeff_2 make_floatx80(0x3ffe, 0x93bb62877cdfa2e3ULL)
#define fyl2x_coeff_3 make_floatx80(0x3ffd, 0xd30bb153d808f269ULL)
#define fyl2x_coeff_4 make_floatx80(0x3ffd, 0xa42589eaf451499eULL)
#define fyl2x_coeff_5 make_floatx80(0x3ffd, 0x864d42c0f8f17517ULL)
#define fyl2x_coeff_6 make_floatx80(0x3ffc, 0xe3476578adf26272ULL)
#define fyl2x_coeff_7 make_floatx80(0x3ffc, 0xc506c5f874e6d80fULL)
#define fyl2x_coeff_8 make_floatx80(0x3ffc, 0xac5cf50cc57d6372ULL)
#define fyl2x_coeff_9 make_floatx80(0x3ffc, 0xb1ed0066d971a103ULL)
1862
/*
 * Compute an approximation of log2(1+arg), where 1+arg is in the
 * interval [sqrt(2)/2, sqrt(2)].  It is assumed that when this
 * function is called, rounding precision is set to 80 and the
 * round-to-nearest mode is in effect.  arg must not be exactly zero,
 * and must not be so close to zero that underflow might occur.
 *
 * The result is returned unrounded and unsigned: *exp receives the
 * exponent and *sig0 / *sig1 the two most significant 64-bit words of
 * the product significand (not necessarily normalized).  The sign is
 * not returned; both callers in this file take it from the sign of
 * arg.
 */
static void helper_fyl2x_common(CPUX86State *env, floatx80 arg, int32_t *exp,
                                uint64_t *sig0, uint64_t *sig1)
{
    uint64_t arg0_sig = extractFloatx80Frac(arg);
    int32_t arg0_exp = extractFloatx80Exp(arg);
    bool arg0_sign = extractFloatx80Sign(arg);
    bool asign;
    int32_t dexp, texp, aexp;
    uint64_t dsig0, dsig1, tsig0, tsig1, rsig0, rsig1, rsig2;
    uint64_t msig0, msig1, msig2, t2sig0, t2sig1, t2sig2, t2sig3;
    uint64_t asig0, asig1, asig2, asig3, bsig0, bsig1;
    floatx80 t2, accum;

    /*
     * Compute an approximation of arg/(2+arg), with extra precision,
     * as the argument to a polynomial approximation.  The extra
     * precision is only needed for the first term of the
     * approximation, with subsequent terms being significantly
     * smaller; the approximation only uses odd exponents, and the
     * square of arg/(2+arg) is at most 17-12*sqrt(2) = 0.029....
     */
    /*
     * Build the denominator 2+arg as a 128-bit significand (dsig0,
     * dsig1) with exponent dexp: |arg| is aligned to dexp and then
     * either subtracted from (arg negative) or merged below (arg
     * positive) the leading bit.
     */
    if (arg0_sign) {
        dexp = 0x3fff;
        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
        sub128(0, 0, dsig0, dsig1, &dsig0, &dsig1);
    } else {
        dexp = 0x4000;
        shift128RightJamming(arg0_sig, 0, dexp - arg0_exp, &dsig0, &dsig1);
        dsig0 |= 0x8000000000000000ULL;
    }
    /* Long division of |arg| by the denominator: t = tsig0:tsig1, texp.  */
    texp = arg0_exp - dexp + 0x3ffe;
    rsig0 = arg0_sig;
    rsig1 = 0;
    rsig2 = 0;
    /* Pre-shift the numerator so it is below the divisor's high word.  */
    if (dsig0 <= rsig0) {
        shift128Right(rsig0, rsig1, 1, &rsig0, &rsig1);
        ++texp;
    }
    tsig0 = estimateDiv128To64(rsig0, rsig1, dsig0);
    mul128By64To192(dsig0, dsig1, tsig0, &msig0, &msig1, &msig2);
    sub192(rsig0, rsig1, rsig2, msig0, msig1, msig2,
           &rsig0, &rsig1, &rsig2);
    /* Step the estimate down until the remainder is non-negative.  */
    while ((int64_t) rsig0 < 0) {
        --tsig0;
        add192(rsig0, rsig1, rsig2, 0, dsig0, dsig1,
               &rsig0, &rsig1, &rsig2);
    }
    tsig1 = estimateDiv128To64(rsig1, rsig2, dsig0);
    /*
     * No need to correct any estimation error in tsig1; even with
     * such error, it is accurate enough.  Now compute the square of
     * that approximation.
     */
    mul128To256(tsig0, tsig1, tsig0, tsig1,
                &t2sig0, &t2sig1, &t2sig2, &t2sig3);
    t2 = normalizeRoundAndPackFloatx80(floatx80_precision_x, false,
                                       texp + texp - 0x3ffe,
                                       t2sig0, t2sig1, &env->fp_status);

    /* Compute the lower parts of the polynomial expansion.  */
    accum = floatx80_mul(fyl2x_coeff_9, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_8, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_7, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_6, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_5, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_4, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_3, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_2, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_1, accum, &env->fp_status);
    accum = floatx80_mul(accum, t2, &env->fp_status);
    accum = floatx80_add(fyl2x_coeff_0_low, accum, &env->fp_status);

    /*
     * The full polynomial expansion is fyl2x_coeff_0 + accum (where
     * accum has much lower magnitude, and so, in particular, carry
     * out of the addition is not possible), multiplied by t.  (This
     * expansion is only accurate to about 70 bits, not 128 bits.)
     */
    aexp = extractFloatx80Exp(fyl2x_coeff_0);
    asign = extractFloatx80Sign(fyl2x_coeff_0);
    /* Align accum to fyl2x_coeff_0's exponent before combining.  */
    shift128RightJamming(extractFloatx80Frac(accum), 0,
                         aexp - extractFloatx80Exp(accum),
                         &asig0, &asig1);
    bsig0 = extractFloatx80Frac(fyl2x_coeff_0);
    bsig1 = 0;
    if (asign == extractFloatx80Sign(accum)) {
        add128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
    } else {
        sub128(bsig0, bsig1, asig0, asig1, &asig0, &asig1);
    }
    /* Multiply by t to compute the required result.  */
    mul128To256(asig0, asig1, tsig0, tsig1,
                &asig0, &asig1, &asig2, &asig3);
    aexp += texp - 0x3ffe;
    /* Only the top 128 bits of the 256-bit product are handed back.  */
    *exp = aexp;
    *sig0 = asig0;
    *sig1 = asig1;
}
1975
1976 void helper_fyl2xp1(CPUX86State *env)
1977 {
1978     uint8_t old_flags = save_exception_flags(env);
1979     uint64_t arg0_sig = extractFloatx80Frac(ST0);
1980     int32_t arg0_exp = extractFloatx80Exp(ST0);
1981     bool arg0_sign = extractFloatx80Sign(ST0);
1982     uint64_t arg1_sig = extractFloatx80Frac(ST1);
1983     int32_t arg1_exp = extractFloatx80Exp(ST1);
1984     bool arg1_sign = extractFloatx80Sign(ST1);
1985
1986     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
1987         float_raise(float_flag_invalid, &env->fp_status);
1988         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
1989     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
1990         float_raise(float_flag_invalid, &env->fp_status);
1991         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
1992     } else if (floatx80_invalid_encoding(ST0) ||
1993                floatx80_invalid_encoding(ST1)) {
1994         float_raise(float_flag_invalid, &env->fp_status);
1995         ST1 = floatx80_default_nan(&env->fp_status);
1996     } else if (floatx80_is_any_nan(ST0)) {
1997         ST1 = ST0;
1998     } else if (floatx80_is_any_nan(ST1)) {
1999         /* Pass this NaN through.  */
2000     } else if (arg0_exp > 0x3ffd ||
2001                (arg0_exp == 0x3ffd && arg0_sig > (arg0_sign ?
2002                                                   0x95f619980c4336f7ULL :
2003                                                   0xd413cccfe7799211ULL))) {
2004         /*
2005          * Out of range for the instruction (ST0 must have absolute
2006          * value less than 1 - sqrt(2)/2 = 0.292..., according to
2007          * Intel manuals; AMD manuals allow a range from sqrt(2)/2 - 1
2008          * to sqrt(2) - 1, which we allow here), treat as invalid.
2009          */
2010         float_raise(float_flag_invalid, &env->fp_status);
2011         ST1 = floatx80_default_nan(&env->fp_status);
2012     } else if (floatx80_is_zero(ST0) || floatx80_is_zero(ST1) ||
2013                arg1_exp == 0x7fff) {
2014         /*
2015          * One argument is zero, or multiplying by infinity; correct
2016          * result is exact and can be obtained by multiplying the
2017          * arguments.
2018          */
2019         ST1 = floatx80_mul(ST0, ST1, &env->fp_status);
2020     } else if (arg0_exp < 0x3fb0) {
2021         /*
2022          * Multiplying both arguments and an extra-precision version
2023          * of log2(e) is sufficiently precise.
2024          */
2025         uint64_t sig0, sig1, sig2;
2026         int32_t exp;
2027         if (arg0_exp == 0) {
2028             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2029         }
2030         if (arg1_exp == 0) {
2031             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2032         }
2033         mul128By64To192(log2_e_sig_high, log2_e_sig_low, arg0_sig,
2034                         &sig0, &sig1, &sig2);
2035         exp = arg0_exp + 1;
2036         mul128By64To192(sig0, sig1, arg1_sig, &sig0, &sig1, &sig2);
2037         exp += arg1_exp - 0x3ffe;
2038         /* This result is inexact.  */
2039         sig1 |= 1;
2040         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2041                                             arg0_sign ^ arg1_sign, exp,
2042                                             sig0, sig1, &env->fp_status);
2043     } else {
2044         int32_t aexp;
2045         uint64_t asig0, asig1, asig2;
2046         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2047         FloatX80RoundPrec save_prec =
2048             env->fp_status.floatx80_rounding_precision;
2049         env->fp_status.float_rounding_mode = float_round_nearest_even;
2050         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2051
2052         helper_fyl2x_common(env, ST0, &aexp, &asig0, &asig1);
2053         /*
2054          * Multiply by the second argument to compute the required
2055          * result.
2056          */
2057         if (arg1_exp == 0) {
2058             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2059         }
2060         mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2061         aexp += arg1_exp - 0x3ffe;
2062         /* This result is inexact.  */
2063         asig1 |= 1;
2064         env->fp_status.float_rounding_mode = save_mode;
2065         ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2066                                             arg0_sign ^ arg1_sign, aexp,
2067                                             asig0, asig1, &env->fp_status);
2068         env->fp_status.floatx80_rounding_precision = save_prec;
2069     }
2070     fpop(env);
2071     merge_exception_flags(env, old_flags);
2072 }
2073
2074 void helper_fyl2x(CPUX86State *env)
2075 {
2076     uint8_t old_flags = save_exception_flags(env);
2077     uint64_t arg0_sig = extractFloatx80Frac(ST0);
2078     int32_t arg0_exp = extractFloatx80Exp(ST0);
2079     bool arg0_sign = extractFloatx80Sign(ST0);
2080     uint64_t arg1_sig = extractFloatx80Frac(ST1);
2081     int32_t arg1_exp = extractFloatx80Exp(ST1);
2082     bool arg1_sign = extractFloatx80Sign(ST1);
2083
2084     if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2085         float_raise(float_flag_invalid, &env->fp_status);
2086         ST1 = floatx80_silence_nan(ST0, &env->fp_status);
2087     } else if (floatx80_is_signaling_nan(ST1, &env->fp_status)) {
2088         float_raise(float_flag_invalid, &env->fp_status);
2089         ST1 = floatx80_silence_nan(ST1, &env->fp_status);
2090     } else if (floatx80_invalid_encoding(ST0) ||
2091                floatx80_invalid_encoding(ST1)) {
2092         float_raise(float_flag_invalid, &env->fp_status);
2093         ST1 = floatx80_default_nan(&env->fp_status);
2094     } else if (floatx80_is_any_nan(ST0)) {
2095         ST1 = ST0;
2096     } else if (floatx80_is_any_nan(ST1)) {
2097         /* Pass this NaN through.  */
2098     } else if (arg0_sign && !floatx80_is_zero(ST0)) {
2099         float_raise(float_flag_invalid, &env->fp_status);
2100         ST1 = floatx80_default_nan(&env->fp_status);
2101     } else if (floatx80_is_infinity(ST1)) {
2102         FloatRelation cmp = floatx80_compare(ST0, floatx80_one,
2103                                              &env->fp_status);
2104         switch (cmp) {
2105         case float_relation_less:
2106             ST1 = floatx80_chs(ST1);
2107             break;
2108         case float_relation_greater:
2109             /* Result is infinity of the same sign as ST1.  */
2110             break;
2111         default:
2112             float_raise(float_flag_invalid, &env->fp_status);
2113             ST1 = floatx80_default_nan(&env->fp_status);
2114             break;
2115         }
2116     } else if (floatx80_is_infinity(ST0)) {
2117         if (floatx80_is_zero(ST1)) {
2118             float_raise(float_flag_invalid, &env->fp_status);
2119             ST1 = floatx80_default_nan(&env->fp_status);
2120         } else if (arg1_sign) {
2121             ST1 = floatx80_chs(ST0);
2122         } else {
2123             ST1 = ST0;
2124         }
2125     } else if (floatx80_is_zero(ST0)) {
2126         if (floatx80_is_zero(ST1)) {
2127             float_raise(float_flag_invalid, &env->fp_status);
2128             ST1 = floatx80_default_nan(&env->fp_status);
2129         } else {
2130             /* Result is infinity with opposite sign to ST1.  */
2131             float_raise(float_flag_divbyzero, &env->fp_status);
2132             ST1 = make_floatx80(arg1_sign ? 0x7fff : 0xffff,
2133                                 0x8000000000000000ULL);
2134         }
2135     } else if (floatx80_is_zero(ST1)) {
2136         if (floatx80_lt(ST0, floatx80_one, &env->fp_status)) {
2137             ST1 = floatx80_chs(ST1);
2138         }
2139         /* Otherwise, ST1 is already the correct result.  */
2140     } else if (floatx80_eq(ST0, floatx80_one, &env->fp_status)) {
2141         if (arg1_sign) {
2142             ST1 = floatx80_chs(floatx80_zero);
2143         } else {
2144             ST1 = floatx80_zero;
2145         }
2146     } else {
2147         int32_t int_exp;
2148         floatx80 arg0_m1;
2149         FloatRoundMode save_mode = env->fp_status.float_rounding_mode;
2150         FloatX80RoundPrec save_prec =
2151             env->fp_status.floatx80_rounding_precision;
2152         env->fp_status.float_rounding_mode = float_round_nearest_even;
2153         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2154
2155         if (arg0_exp == 0) {
2156             normalizeFloatx80Subnormal(arg0_sig, &arg0_exp, &arg0_sig);
2157         }
2158         if (arg1_exp == 0) {
2159             normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2160         }
2161         int_exp = arg0_exp - 0x3fff;
2162         if (arg0_sig > 0xb504f333f9de6484ULL) {
2163             ++int_exp;
2164         }
2165         arg0_m1 = floatx80_sub(floatx80_scalbn(ST0, -int_exp,
2166                                                &env->fp_status),
2167                                floatx80_one, &env->fp_status);
2168         if (floatx80_is_zero(arg0_m1)) {
2169             /* Exact power of 2; multiply by ST1.  */
2170             env->fp_status.float_rounding_mode = save_mode;
2171             ST1 = floatx80_mul(int32_to_floatx80(int_exp, &env->fp_status),
2172                                ST1, &env->fp_status);
2173         } else {
2174             bool asign = extractFloatx80Sign(arg0_m1);
2175             int32_t aexp;
2176             uint64_t asig0, asig1, asig2;
2177             helper_fyl2x_common(env, arg0_m1, &aexp, &asig0, &asig1);
2178             if (int_exp != 0) {
2179                 bool isign = (int_exp < 0);
2180                 int32_t iexp;
2181                 uint64_t isig;
2182                 int shift;
2183                 int_exp = isign ? -int_exp : int_exp;
2184                 shift = clz32(int_exp) + 32;
2185                 isig = int_exp;
2186                 isig <<= shift;
2187                 iexp = 0x403e - shift;
2188                 shift128RightJamming(asig0, asig1, iexp - aexp,
2189                                      &asig0, &asig1);
2190                 if (asign == isign) {
2191                     add128(isig, 0, asig0, asig1, &asig0, &asig1);
2192                 } else {
2193                     sub128(isig, 0, asig0, asig1, &asig0, &asig1);
2194                 }
2195                 aexp = iexp;
2196                 asign = isign;
2197             }
2198             /*
2199              * Multiply by the second argument to compute the required
2200              * result.
2201              */
2202             if (arg1_exp == 0) {
2203                 normalizeFloatx80Subnormal(arg1_sig, &arg1_exp, &arg1_sig);
2204             }
2205             mul128By64To192(asig0, asig1, arg1_sig, &asig0, &asig1, &asig2);
2206             aexp += arg1_exp - 0x3ffe;
2207             /* This result is inexact.  */
2208             asig1 |= 1;
2209             env->fp_status.float_rounding_mode = save_mode;
2210             ST1 = normalizeRoundAndPackFloatx80(floatx80_precision_x,
2211                                                 asign ^ arg1_sign, aexp,
2212                                                 asig0, asig1, &env->fp_status);
2213         }
2214
2215         env->fp_status.floatx80_rounding_precision = save_prec;
2216     }
2217     fpop(env);
2218     merge_exception_flags(env, old_flags);
2219 }
2220
2221 void helper_fsqrt(CPUX86State *env)
2222 {
2223     uint8_t old_flags = save_exception_flags(env);
2224     if (floatx80_is_neg(ST0)) {
2225         env->fpus &= ~0x4700;  /* (C3,C2,C1,C0) <-- 0000 */
2226         env->fpus |= 0x400;
2227     }
2228     ST0 = floatx80_sqrt(ST0, &env->fp_status);
2229     merge_exception_flags(env, old_flags);
2230 }
2231
2232 void helper_fsincos(CPUX86State *env)
2233 {
2234     double fptemp = floatx80_to_double(env, ST0);
2235
2236     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2237         env->fpus |= 0x400;
2238     } else {
2239         ST0 = double_to_floatx80(env, sin(fptemp));
2240         fpush(env);
2241         ST0 = double_to_floatx80(env, cos(fptemp));
2242         env->fpus &= ~0x400;  /* C2 <-- 0 */
2243         /* the above code is for |arg| < 2**63 only */
2244     }
2245 }
2246
2247 void helper_frndint(CPUX86State *env)
2248 {
2249     uint8_t old_flags = save_exception_flags(env);
2250     ST0 = floatx80_round_to_int(ST0, &env->fp_status);
2251     merge_exception_flags(env, old_flags);
2252 }
2253
2254 void helper_fscale(CPUX86State *env)
2255 {
2256     uint8_t old_flags = save_exception_flags(env);
2257     if (floatx80_invalid_encoding(ST1) || floatx80_invalid_encoding(ST0)) {
2258         float_raise(float_flag_invalid, &env->fp_status);
2259         ST0 = floatx80_default_nan(&env->fp_status);
2260     } else if (floatx80_is_any_nan(ST1)) {
2261         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2262             float_raise(float_flag_invalid, &env->fp_status);
2263         }
2264         ST0 = ST1;
2265         if (floatx80_is_signaling_nan(ST0, &env->fp_status)) {
2266             float_raise(float_flag_invalid, &env->fp_status);
2267             ST0 = floatx80_silence_nan(ST0, &env->fp_status);
2268         }
2269     } else if (floatx80_is_infinity(ST1) &&
2270                !floatx80_invalid_encoding(ST0) &&
2271                !floatx80_is_any_nan(ST0)) {
2272         if (floatx80_is_neg(ST1)) {
2273             if (floatx80_is_infinity(ST0)) {
2274                 float_raise(float_flag_invalid, &env->fp_status);
2275                 ST0 = floatx80_default_nan(&env->fp_status);
2276             } else {
2277                 ST0 = (floatx80_is_neg(ST0) ?
2278                        floatx80_chs(floatx80_zero) :
2279                        floatx80_zero);
2280             }
2281         } else {
2282             if (floatx80_is_zero(ST0)) {
2283                 float_raise(float_flag_invalid, &env->fp_status);
2284                 ST0 = floatx80_default_nan(&env->fp_status);
2285             } else {
2286                 ST0 = (floatx80_is_neg(ST0) ?
2287                        floatx80_chs(floatx80_infinity) :
2288                        floatx80_infinity);
2289             }
2290         }
2291     } else {
2292         int n;
2293         FloatX80RoundPrec save = env->fp_status.floatx80_rounding_precision;
2294         uint8_t save_flags = get_float_exception_flags(&env->fp_status);
2295         set_float_exception_flags(0, &env->fp_status);
2296         n = floatx80_to_int32_round_to_zero(ST1, &env->fp_status);
2297         set_float_exception_flags(save_flags, &env->fp_status);
2298         env->fp_status.floatx80_rounding_precision = floatx80_precision_x;
2299         ST0 = floatx80_scalbn(ST0, n, &env->fp_status);
2300         env->fp_status.floatx80_rounding_precision = save;
2301     }
2302     merge_exception_flags(env, old_flags);
2303 }
2304
2305 void helper_fsin(CPUX86State *env)
2306 {
2307     double fptemp = floatx80_to_double(env, ST0);
2308
2309     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2310         env->fpus |= 0x400;
2311     } else {
2312         ST0 = double_to_floatx80(env, sin(fptemp));
2313         env->fpus &= ~0x400;  /* C2 <-- 0 */
2314         /* the above code is for |arg| < 2**53 only */
2315     }
2316 }
2317
2318 void helper_fcos(CPUX86State *env)
2319 {
2320     double fptemp = floatx80_to_double(env, ST0);
2321
2322     if ((fptemp > MAXTAN) || (fptemp < -MAXTAN)) {
2323         env->fpus |= 0x400;
2324     } else {
2325         ST0 = double_to_floatx80(env, cos(fptemp));
2326         env->fpus &= ~0x400;  /* C2 <-- 0 */
2327         /* the above code is for |arg| < 2**63 only */
2328     }
2329 }
2330
2331 void helper_fxam_ST0(CPUX86State *env)
2332 {
2333     CPU_LDoubleU temp;
2334     int expdif;
2335
2336     temp.d = ST0;
2337
2338     env->fpus &= ~0x4700; /* (C3,C2,C1,C0) <-- 0000 */
2339     if (SIGND(temp)) {
2340         env->fpus |= 0x200; /* C1 <-- 1 */
2341     }
2342
2343     if (env->fptags[env->fpstt]) {
2344         env->fpus |= 0x4100; /* Empty */
2345         return;
2346     }
2347
2348     expdif = EXPD(temp);
2349     if (expdif == MAXEXPD) {
2350         if (MANTD(temp) == 0x8000000000000000ULL) {
2351             env->fpus |= 0x500; /* Infinity */
2352         } else if (MANTD(temp) & 0x8000000000000000ULL) {
2353             env->fpus |= 0x100; /* NaN */
2354         }
2355     } else if (expdif == 0) {
2356         if (MANTD(temp) == 0) {
2357             env->fpus |=  0x4000; /* Zero */
2358         } else {
2359             env->fpus |= 0x4400; /* Denormal */
2360         }
2361     } else if (MANTD(temp) & 0x8000000000000000ULL) {
2362         env->fpus |= 0x400;
2363     }
2364 }
2365
2366 static void do_fstenv(CPUX86State *env, target_ulong ptr, int data32,
2367                       uintptr_t retaddr)
2368 {
2369     int fpus, fptag, exp, i;
2370     uint64_t mant;
2371     CPU_LDoubleU tmp;
2372
2373     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2374     fptag = 0;
2375     for (i = 7; i >= 0; i--) {
2376         fptag <<= 2;
2377         if (env->fptags[i]) {
2378             fptag |= 3;
2379         } else {
2380             tmp.d = env->fpregs[i].d;
2381             exp = EXPD(tmp);
2382             mant = MANTD(tmp);
2383             if (exp == 0 && mant == 0) {
2384                 /* zero */
2385                 fptag |= 1;
2386             } else if (exp == 0 || exp == MAXEXPD
2387                        || (mant & (1LL << 63)) == 0) {
2388                 /* NaNs, infinity, denormal */
2389                 fptag |= 2;
2390             }
2391         }
2392     }
2393     if (data32) {
2394         /* 32 bit */
2395         cpu_stl_data_ra(env, ptr, env->fpuc, retaddr);
2396         cpu_stl_data_ra(env, ptr + 4, fpus, retaddr);
2397         cpu_stl_data_ra(env, ptr + 8, fptag, retaddr);
2398         cpu_stl_data_ra(env, ptr + 12, env->fpip, retaddr); /* fpip */
2399         cpu_stl_data_ra(env, ptr + 16, env->fpcs, retaddr); /* fpcs */
2400         cpu_stl_data_ra(env, ptr + 20, env->fpdp, retaddr); /* fpoo */
2401         cpu_stl_data_ra(env, ptr + 24, env->fpds, retaddr); /* fpos */
2402     } else {
2403         /* 16 bit */
2404         cpu_stw_data_ra(env, ptr, env->fpuc, retaddr);
2405         cpu_stw_data_ra(env, ptr + 2, fpus, retaddr);
2406         cpu_stw_data_ra(env, ptr + 4, fptag, retaddr);
2407         cpu_stw_data_ra(env, ptr + 6, env->fpip, retaddr);
2408         cpu_stw_data_ra(env, ptr + 8, env->fpcs, retaddr);
2409         cpu_stw_data_ra(env, ptr + 10, env->fpdp, retaddr);
2410         cpu_stw_data_ra(env, ptr + 12, env->fpds, retaddr);
2411     }
2412 }
2413
2414 void helper_fstenv(CPUX86State *env, target_ulong ptr, int data32)
2415 {
2416     do_fstenv(env, ptr, data32, GETPC());
2417 }
2418
2419 static void cpu_set_fpus(CPUX86State *env, uint16_t fpus)
2420 {
2421     env->fpstt = (fpus >> 11) & 7;
2422     env->fpus = fpus & ~0x3800 & ~FPUS_B;
2423     env->fpus |= env->fpus & FPUS_SE ? FPUS_B : 0;
2424 #if !defined(CONFIG_USER_ONLY)
2425     if (!(env->fpus & FPUS_SE)) {
2426         /*
2427          * Here the processor deasserts FERR#; in response, the chipset deasserts
2428          * IGNNE#.
2429          */
2430         cpu_clear_ignne();
2431     }
2432 #endif
2433 }
2434
2435 static void do_fldenv(CPUX86State *env, target_ulong ptr, int data32,
2436                       uintptr_t retaddr)
2437 {
2438     int i, fpus, fptag;
2439
2440     if (data32) {
2441         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2442         fpus = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2443         fptag = cpu_lduw_data_ra(env, ptr + 8, retaddr);
2444     } else {
2445         cpu_set_fpuc(env, cpu_lduw_data_ra(env, ptr, retaddr));
2446         fpus = cpu_lduw_data_ra(env, ptr + 2, retaddr);
2447         fptag = cpu_lduw_data_ra(env, ptr + 4, retaddr);
2448     }
2449     cpu_set_fpus(env, fpus);
2450     for (i = 0; i < 8; i++) {
2451         env->fptags[i] = ((fptag & 3) == 3);
2452         fptag >>= 2;
2453     }
2454 }
2455
2456 void helper_fldenv(CPUX86State *env, target_ulong ptr, int data32)
2457 {
2458     do_fldenv(env, ptr, data32, GETPC());
2459 }
2460
2461 static void do_fsave(CPUX86State *env, target_ulong ptr, int data32,
2462                      uintptr_t retaddr)
2463 {
2464     floatx80 tmp;
2465     int i;
2466
2467     do_fstenv(env, ptr, data32, retaddr);
2468
2469     ptr += (14 << data32);
2470     for (i = 0; i < 8; i++) {
2471         tmp = ST(i);
2472         do_fstt(env, tmp, ptr, retaddr);
2473         ptr += 10;
2474     }
2475
2476     do_fninit(env);
2477 }
2478
2479 void helper_fsave(CPUX86State *env, target_ulong ptr, int data32)
2480 {
2481     do_fsave(env, ptr, data32, GETPC());
2482 }
2483
2484 static void do_frstor(CPUX86State *env, target_ulong ptr, int data32,
2485                       uintptr_t retaddr)
2486 {
2487     floatx80 tmp;
2488     int i;
2489
2490     do_fldenv(env, ptr, data32, retaddr);
2491     ptr += (14 << data32);
2492
2493     for (i = 0; i < 8; i++) {
2494         tmp = do_fldt(env, ptr, retaddr);
2495         ST(i) = tmp;
2496         ptr += 10;
2497     }
2498 }
2499
2500 void helper_frstor(CPUX86State *env, target_ulong ptr, int data32)
2501 {
2502     do_frstor(env, ptr, data32, GETPC());
2503 }
2504
#ifdef CONFIG_USER_ONLY
/*
 * Non-helper entry points built only for CONFIG_USER_ONLY.  They call the
 * same do_fsave()/do_frstor() bodies with a return address of 0.
 */
void cpu_x86_fsave(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t no_ra = 0;

    do_fsave(env, ptr, data32, no_ra);
}

void cpu_x86_frstor(CPUX86State *env, target_ulong ptr, int data32)
{
    const uintptr_t no_ra = 0;

    do_frstor(env, ptr, data32, no_ra);
}
#endif
2516
2517 #define XO(X)  offsetof(X86XSaveArea, X)
2518
2519 static void do_xsave_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2520 {
2521     int fpus, fptag, i;
2522     target_ulong addr;
2523
2524     fpus = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11;
2525     fptag = 0;
2526     for (i = 0; i < 8; i++) {
2527         fptag |= (env->fptags[i] << i);
2528     }
2529
2530     cpu_stw_data_ra(env, ptr + XO(legacy.fcw), env->fpuc, ra);
2531     cpu_stw_data_ra(env, ptr + XO(legacy.fsw), fpus, ra);
2532     cpu_stw_data_ra(env, ptr + XO(legacy.ftw), fptag ^ 0xff, ra);
2533
2534     /* In 32-bit mode this is eip, sel, dp, sel.
2535        In 64-bit mode this is rip, rdp.
2536        But in either case we don't write actual data, just zeros.  */
2537     cpu_stq_data_ra(env, ptr + XO(legacy.fpip), 0, ra); /* eip+sel; rip */
2538     cpu_stq_data_ra(env, ptr + XO(legacy.fpdp), 0, ra); /* edp+sel; rdp */
2539
2540     addr = ptr + XO(legacy.fpregs);
2541     for (i = 0; i < 8; i++) {
2542         floatx80 tmp = ST(i);
2543         do_fstt(env, tmp, addr, ra);
2544         addr += 16;
2545     }
2546 }
2547
2548 static void do_xsave_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2549 {
2550     update_mxcsr_from_sse_status(env);
2551     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr), env->mxcsr, ra);
2552     cpu_stl_data_ra(env, ptr + XO(legacy.mxcsr_mask), 0x0000ffff, ra);
2553 }
2554
2555 static void do_xsave_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2556 {
2557     int i, nb_xmm_regs;
2558     target_ulong addr;
2559
2560     if (env->hflags & HF_CS64_MASK) {
2561         nb_xmm_regs = 16;
2562     } else {
2563         nb_xmm_regs = 8;
2564     }
2565
2566     addr = ptr + XO(legacy.xmm_regs);
2567     for (i = 0; i < nb_xmm_regs; i++) {
2568         cpu_stq_data_ra(env, addr, env->xmm_regs[i].ZMM_Q(0), ra);
2569         cpu_stq_data_ra(env, addr + 8, env->xmm_regs[i].ZMM_Q(1), ra);
2570         addr += 16;
2571     }
2572 }
2573
2574 static void do_xsave_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2575 {
2576     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2577     int i;
2578
2579     for (i = 0; i < 4; i++, addr += 16) {
2580         cpu_stq_data_ra(env, addr, env->bnd_regs[i].lb, ra);
2581         cpu_stq_data_ra(env, addr + 8, env->bnd_regs[i].ub, ra);
2582     }
2583 }
2584
2585 static void do_xsave_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2586 {
2587     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu),
2588                     env->bndcs_regs.cfgu, ra);
2589     cpu_stq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts),
2590                     env->bndcs_regs.sts, ra);
2591 }
2592
2593 static void do_xsave_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2594 {
2595     cpu_stq_data_ra(env, ptr, env->pkru, ra);
2596 }
2597
2598 static void do_fxsave(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2599 {
2600     /* The operand must be 16 byte aligned */
2601     if (ptr & 0xf) {
2602         raise_exception_ra(env, EXCP0D_GPF, ra);
2603     }
2604
2605     do_xsave_fpu(env, ptr, ra);
2606
2607     if (env->cr[4] & CR4_OSFXSR_MASK) {
2608         do_xsave_mxcsr(env, ptr, ra);
2609         /* Fast FXSAVE leaves out the XMM registers */
2610         if (!(env->efer & MSR_EFER_FFXSR)
2611             || (env->hflags & HF_CPL_MASK)
2612             || !(env->hflags & HF_LMA_MASK)) {
2613             do_xsave_sse(env, ptr, ra);
2614         }
2615     }
2616 }
2617
2618 void helper_fxsave(CPUX86State *env, target_ulong ptr)
2619 {
2620     do_fxsave(env, ptr, GETPC());
2621 }
2622
2623 static uint64_t get_xinuse(CPUX86State *env)
2624 {
2625     uint64_t inuse = -1;
2626
2627     /* For the most part, we don't track XINUSE.  We could calculate it
2628        here for all components, but it's probably less work to simply
2629        indicate in use.  That said, the state of BNDREGS is important
2630        enough to track in HFLAGS, so we might as well use that here.  */
2631     if ((env->hflags & HF_MPX_IU_MASK) == 0) {
2632        inuse &= ~XSTATE_BNDREGS_MASK;
2633     }
2634     return inuse;
2635 }
2636
2637 static void do_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm,
2638                      uint64_t inuse, uint64_t opt, uintptr_t ra)
2639 {
2640     uint64_t old_bv, new_bv;
2641
2642     /* The OS must have enabled XSAVE.  */
2643     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2644         raise_exception_ra(env, EXCP06_ILLOP, ra);
2645     }
2646
2647     /* The operand must be 64 byte aligned.  */
2648     if (ptr & 63) {
2649         raise_exception_ra(env, EXCP0D_GPF, ra);
2650     }
2651
2652     /* Never save anything not enabled by XCR0.  */
2653     rfbm &= env->xcr0;
2654     opt &= rfbm;
2655
2656     if (opt & XSTATE_FP_MASK) {
2657         do_xsave_fpu(env, ptr, ra);
2658     }
2659     if (rfbm & XSTATE_SSE_MASK) {
2660         /* Note that saving MXCSR is not suppressed by XSAVEOPT.  */
2661         do_xsave_mxcsr(env, ptr, ra);
2662     }
2663     if (opt & XSTATE_SSE_MASK) {
2664         do_xsave_sse(env, ptr, ra);
2665     }
2666     if (opt & XSTATE_BNDREGS_MASK) {
2667         do_xsave_bndregs(env, ptr + XO(bndreg_state), ra);
2668     }
2669     if (opt & XSTATE_BNDCSR_MASK) {
2670         do_xsave_bndcsr(env, ptr + XO(bndcsr_state), ra);
2671     }
2672     if (opt & XSTATE_PKRU_MASK) {
2673         do_xsave_pkru(env, ptr + XO(pkru_state), ra);
2674     }
2675
2676     /* Update the XSTATE_BV field.  */
2677     old_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2678     new_bv = (old_bv & ~rfbm) | (inuse & rfbm);
2679     cpu_stq_data_ra(env, ptr + XO(header.xstate_bv), new_bv, ra);
2680 }
2681
2682 void helper_xsave(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2683 {
2684     do_xsave(env, ptr, rfbm, get_xinuse(env), -1, GETPC());
2685 }
2686
2687 void helper_xsaveopt(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2688 {
2689     uint64_t inuse = get_xinuse(env);
2690     do_xsave(env, ptr, rfbm, inuse, inuse, GETPC());
2691 }
2692
2693 static void do_xrstor_fpu(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2694 {
2695     int i, fpuc, fpus, fptag;
2696     target_ulong addr;
2697
2698     fpuc = cpu_lduw_data_ra(env, ptr + XO(legacy.fcw), ra);
2699     fpus = cpu_lduw_data_ra(env, ptr + XO(legacy.fsw), ra);
2700     fptag = cpu_lduw_data_ra(env, ptr + XO(legacy.ftw), ra);
2701     cpu_set_fpuc(env, fpuc);
2702     cpu_set_fpus(env, fpus);
2703     fptag ^= 0xff;
2704     for (i = 0; i < 8; i++) {
2705         env->fptags[i] = ((fptag >> i) & 1);
2706     }
2707
2708     addr = ptr + XO(legacy.fpregs);
2709     for (i = 0; i < 8; i++) {
2710         floatx80 tmp = do_fldt(env, addr, ra);
2711         ST(i) = tmp;
2712         addr += 16;
2713     }
2714 }
2715
2716 static void do_xrstor_mxcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2717 {
2718     cpu_set_mxcsr(env, cpu_ldl_data_ra(env, ptr + XO(legacy.mxcsr), ra));
2719 }
2720
2721 static void do_xrstor_sse(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2722 {
2723     int i, nb_xmm_regs;
2724     target_ulong addr;
2725
2726     if (env->hflags & HF_CS64_MASK) {
2727         nb_xmm_regs = 16;
2728     } else {
2729         nb_xmm_regs = 8;
2730     }
2731
2732     addr = ptr + XO(legacy.xmm_regs);
2733     for (i = 0; i < nb_xmm_regs; i++) {
2734         env->xmm_regs[i].ZMM_Q(0) = cpu_ldq_data_ra(env, addr, ra);
2735         env->xmm_regs[i].ZMM_Q(1) = cpu_ldq_data_ra(env, addr + 8, ra);
2736         addr += 16;
2737     }
2738 }
2739
2740 static void do_xrstor_bndregs(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2741 {
2742     target_ulong addr = ptr + offsetof(XSaveBNDREG, bnd_regs);
2743     int i;
2744
2745     for (i = 0; i < 4; i++, addr += 16) {
2746         env->bnd_regs[i].lb = cpu_ldq_data_ra(env, addr, ra);
2747         env->bnd_regs[i].ub = cpu_ldq_data_ra(env, addr + 8, ra);
2748     }
2749 }
2750
2751 static void do_xrstor_bndcsr(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2752 {
2753     /* FIXME: Extend highest implemented bit of linear address.  */
2754     env->bndcs_regs.cfgu
2755         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.cfgu), ra);
2756     env->bndcs_regs.sts
2757         = cpu_ldq_data_ra(env, ptr + offsetof(XSaveBNDCSR, bndcsr.sts), ra);
2758 }
2759
2760 static void do_xrstor_pkru(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2761 {
2762     env->pkru = cpu_ldq_data_ra(env, ptr, ra);
2763 }
2764
2765 static void do_fxrstor(CPUX86State *env, target_ulong ptr, uintptr_t ra)
2766 {
2767     /* The operand must be 16 byte aligned */
2768     if (ptr & 0xf) {
2769         raise_exception_ra(env, EXCP0D_GPF, ra);
2770     }
2771
2772     do_xrstor_fpu(env, ptr, ra);
2773
2774     if (env->cr[4] & CR4_OSFXSR_MASK) {
2775         do_xrstor_mxcsr(env, ptr, ra);
2776         /* Fast FXRSTOR leaves out the XMM registers */
2777         if (!(env->efer & MSR_EFER_FFXSR)
2778             || (env->hflags & HF_CPL_MASK)
2779             || !(env->hflags & HF_LMA_MASK)) {
2780             do_xrstor_sse(env, ptr, ra);
2781         }
2782     }
2783 }
2784
2785 void helper_fxrstor(CPUX86State *env, target_ulong ptr)
2786 {
2787     do_fxrstor(env, ptr, GETPC());
2788 }
2789
#ifdef CONFIG_USER_ONLY
/*
 * Non-helper entry points built only for CONFIG_USER_ONLY.  They call the
 * same do_fxsave()/do_fxrstor() bodies with a return address of 0.
 */
void cpu_x86_fxsave(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_ra = 0;

    do_fxsave(env, ptr, no_ra);
}

void cpu_x86_fxrstor(CPUX86State *env, target_ulong ptr)
{
    const uintptr_t no_ra = 0;

    do_fxrstor(env, ptr, no_ra);
}
#endif
2801
2802 void helper_xrstor(CPUX86State *env, target_ulong ptr, uint64_t rfbm)
2803 {
2804     uintptr_t ra = GETPC();
2805     uint64_t xstate_bv, xcomp_bv, reserve0;
2806
2807     rfbm &= env->xcr0;
2808
2809     /* The OS must have enabled XSAVE.  */
2810     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2811         raise_exception_ra(env, EXCP06_ILLOP, ra);
2812     }
2813
2814     /* The operand must be 64 byte aligned.  */
2815     if (ptr & 63) {
2816         raise_exception_ra(env, EXCP0D_GPF, ra);
2817     }
2818
2819     xstate_bv = cpu_ldq_data_ra(env, ptr + XO(header.xstate_bv), ra);
2820
2821     if ((int64_t)xstate_bv < 0) {
2822         /* FIXME: Compact form.  */
2823         raise_exception_ra(env, EXCP0D_GPF, ra);
2824     }
2825
2826     /* Standard form.  */
2827
2828     /* The XSTATE_BV field must not set bits not present in XCR0.  */
2829     if (xstate_bv & ~env->xcr0) {
2830         raise_exception_ra(env, EXCP0D_GPF, ra);
2831     }
2832
2833     /* The XCOMP_BV field must be zero.  Note that, as of the April 2016
2834        revision, the description of the XSAVE Header (Vol 1, Sec 13.4.2)
2835        describes only XCOMP_BV, but the description of the standard form
2836        of XRSTOR (Vol 1, Sec 13.8.1) checks bytes 23:8 for zero, which
2837        includes the next 64-bit field.  */
2838     xcomp_bv = cpu_ldq_data_ra(env, ptr + XO(header.xcomp_bv), ra);
2839     reserve0 = cpu_ldq_data_ra(env, ptr + XO(header.reserve0), ra);
2840     if (xcomp_bv || reserve0) {
2841         raise_exception_ra(env, EXCP0D_GPF, ra);
2842     }
2843
2844     if (rfbm & XSTATE_FP_MASK) {
2845         if (xstate_bv & XSTATE_FP_MASK) {
2846             do_xrstor_fpu(env, ptr, ra);
2847         } else {
2848             do_fninit(env);
2849             memset(env->fpregs, 0, sizeof(env->fpregs));
2850         }
2851     }
2852     if (rfbm & XSTATE_SSE_MASK) {
2853         /* Note that the standard form of XRSTOR loads MXCSR from memory
2854            whether or not the XSTATE_BV bit is set.  */
2855         do_xrstor_mxcsr(env, ptr, ra);
2856         if (xstate_bv & XSTATE_SSE_MASK) {
2857             do_xrstor_sse(env, ptr, ra);
2858         } else {
2859             /* ??? When AVX is implemented, we may have to be more
2860                selective in the clearing.  */
2861             memset(env->xmm_regs, 0, sizeof(env->xmm_regs));
2862         }
2863     }
2864     if (rfbm & XSTATE_BNDREGS_MASK) {
2865         if (xstate_bv & XSTATE_BNDREGS_MASK) {
2866             do_xrstor_bndregs(env, ptr + XO(bndreg_state), ra);
2867             env->hflags |= HF_MPX_IU_MASK;
2868         } else {
2869             memset(env->bnd_regs, 0, sizeof(env->bnd_regs));
2870             env->hflags &= ~HF_MPX_IU_MASK;
2871         }
2872     }
2873     if (rfbm & XSTATE_BNDCSR_MASK) {
2874         if (xstate_bv & XSTATE_BNDCSR_MASK) {
2875             do_xrstor_bndcsr(env, ptr + XO(bndcsr_state), ra);
2876         } else {
2877             memset(&env->bndcs_regs, 0, sizeof(env->bndcs_regs));
2878         }
2879         cpu_sync_bndcs_hflags(env);
2880     }
2881     if (rfbm & XSTATE_PKRU_MASK) {
2882         uint64_t old_pkru = env->pkru;
2883         if (xstate_bv & XSTATE_PKRU_MASK) {
2884             do_xrstor_pkru(env, ptr + XO(pkru_state), ra);
2885         } else {
2886             env->pkru = 0;
2887         }
2888         if (env->pkru != old_pkru) {
2889             CPUState *cs = env_cpu(env);
2890             tlb_flush(cs);
2891         }
2892     }
2893 }
2894
2895 #undef XO
2896
2897 uint64_t helper_xgetbv(CPUX86State *env, uint32_t ecx)
2898 {
2899     /* The OS must have enabled XSAVE.  */
2900     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2901         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2902     }
2903
2904     switch (ecx) {
2905     case 0:
2906         return env->xcr0;
2907     case 1:
2908         if (env->features[FEAT_XSAVE] & CPUID_XSAVE_XGETBV1) {
2909             return env->xcr0 & get_xinuse(env);
2910         }
2911         break;
2912     }
2913     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2914 }
2915
2916 void helper_xsetbv(CPUX86State *env, uint32_t ecx, uint64_t mask)
2917 {
2918     uint32_t dummy, ena_lo, ena_hi;
2919     uint64_t ena;
2920
2921     /* The OS must have enabled XSAVE.  */
2922     if (!(env->cr[4] & CR4_OSXSAVE_MASK)) {
2923         raise_exception_ra(env, EXCP06_ILLOP, GETPC());
2924     }
2925
2926     /* Only XCR0 is defined at present; the FPU may not be disabled.  */
2927     if (ecx != 0 || (mask & XSTATE_FP_MASK) == 0) {
2928         goto do_gpf;
2929     }
2930
2931     /* Disallow enabling unimplemented features.  */
2932     cpu_x86_cpuid(env, 0x0d, 0, &ena_lo, &dummy, &dummy, &ena_hi);
2933     ena = ((uint64_t)ena_hi << 32) | ena_lo;
2934     if (mask & ~ena) {
2935         goto do_gpf;
2936     }
2937
2938     /* Disallow enabling only half of MPX.  */
2939     if ((mask ^ (mask * (XSTATE_BNDCSR_MASK / XSTATE_BNDREGS_MASK)))
2940         & XSTATE_BNDCSR_MASK) {
2941         goto do_gpf;
2942     }
2943
2944     env->xcr0 = mask;
2945     cpu_sync_bndcs_hflags(env);
2946     return;
2947
2948  do_gpf:
2949     raise_exception_ra(env, EXCP0D_GPF, GETPC());
2950 }
2951
2952 /* MMX/SSE */
2953 /* XXX: optimize by storing fptt and fptags in the static cpu state */
2954
2955 #define SSE_DAZ             0x0040
2956 #define SSE_RC_MASK         0x6000
2957 #define SSE_RC_NEAR         0x0000
2958 #define SSE_RC_DOWN         0x2000
2959 #define SSE_RC_UP           0x4000
2960 #define SSE_RC_CHOP         0x6000
2961 #define SSE_FZ              0x8000
2962
2963 void update_mxcsr_status(CPUX86State *env)
2964 {
2965     uint32_t mxcsr = env->mxcsr;
2966     int rnd_type;
2967
2968     /* set rounding mode */
2969     switch (mxcsr & SSE_RC_MASK) {
2970     default:
2971     case SSE_RC_NEAR:
2972         rnd_type = float_round_nearest_even;
2973         break;
2974     case SSE_RC_DOWN:
2975         rnd_type = float_round_down;
2976         break;
2977     case SSE_RC_UP:
2978         rnd_type = float_round_up;
2979         break;
2980     case SSE_RC_CHOP:
2981         rnd_type = float_round_to_zero;
2982         break;
2983     }
2984     set_float_rounding_mode(rnd_type, &env->sse_status);
2985
2986     /* Set exception flags.  */
2987     set_float_exception_flags((mxcsr & FPUS_IE ? float_flag_invalid : 0) |
2988                               (mxcsr & FPUS_ZE ? float_flag_divbyzero : 0) |
2989                               (mxcsr & FPUS_OE ? float_flag_overflow : 0) |
2990                               (mxcsr & FPUS_UE ? float_flag_underflow : 0) |
2991                               (mxcsr & FPUS_PE ? float_flag_inexact : 0),
2992                               &env->sse_status);
2993
2994     /* set denormals are zero */
2995     set_flush_inputs_to_zero((mxcsr & SSE_DAZ) ? 1 : 0, &env->sse_status);
2996
2997     /* set flush to zero */
2998     set_flush_to_zero((mxcsr & SSE_FZ) ? 1 : 0, &env->sse_status);
2999 }
3000
3001 void update_mxcsr_from_sse_status(CPUX86State *env)
3002 {
3003     uint8_t flags = get_float_exception_flags(&env->sse_status);
3004     /*
3005      * The MXCSR denormal flag has opposite semantics to
3006      * float_flag_input_denormal (the softfloat code sets that flag
3007      * only when flushing input denormals to zero, but SSE sets it
3008      * only when not flushing them to zero), so is not converted
3009      * here.
3010      */
3011     env->mxcsr |= ((flags & float_flag_invalid ? FPUS_IE : 0) |
3012                    (flags & float_flag_divbyzero ? FPUS_ZE : 0) |
3013                    (flags & float_flag_overflow ? FPUS_OE : 0) |
3014                    (flags & float_flag_underflow ? FPUS_UE : 0) |
3015                    (flags & float_flag_inexact ? FPUS_PE : 0) |
3016                    (flags & float_flag_output_denormal ? FPUS_UE | FPUS_PE :
3017                     0));
3018 }
3019
/* TCG helper wrapper around update_mxcsr_from_sse_status(). */
void helper_update_mxcsr(CPUX86State *env)
{
    update_mxcsr_from_sse_status(env);
}
3024
/* TCG helper wrapper: install @val as the new MXCSR via cpu_set_mxcsr(). */
void helper_ldmxcsr(CPUX86State *env, uint32_t val)
{
    cpu_set_mxcsr(env, val);
}
3029
3030 void helper_enter_mmx(CPUX86State *env)
3031 {
3032     env->fpstt = 0;
3033     *(uint32_t *)(env->fptags) = 0;
3034     *(uint32_t *)(env->fptags + 4) = 0;
3035 }
3036
3037 void helper_emms(CPUX86State *env)
3038 {
3039     /* set to empty state */
3040     *(uint32_t *)(env->fptags) = 0x01010101;
3041     *(uint32_t *)(env->fptags + 4) = 0x01010101;
3042 }
3043
3044 /* XXX: suppress */
3045 void helper_movq(CPUX86State *env, void *d, void *s)
3046 {
3047     *(uint64_t *)d = *(uint64_t *)s;
3048 }
3049
3050 #define SHIFT 0
3051 #include "ops_sse.h"
3052
3053 #define SHIFT 1
3054 #include "ops_sse.h"