fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 /* softfloat (and in particular the code in softfloat-specialize.h) is
  39  * target-dependent and needs the TARGET_* macros.
  40  */
  41 #include "config.h"
  42
  43 #include "fpu/softfloat.h"
  44
  45 /*----------------------------------------------------------------------------
  46 | Primitive arithmetic functions, including multi-word arithmetic, and
  47 | division and square root approximations.  (Can be specialized to target if
  48 | desired.)
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-macros.h"
  51
  52 /*----------------------------------------------------------------------------
  53 | Functions and definitions to determine:  (1) whether tininess for underflow
  54 | is detected before or after rounding by default, (2) what (if anything)
  55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  57 | are propagated from function inputs to output.  These details are target-
  58 | specific.
  59 *----------------------------------------------------------------------------*/
  60 #include "softfloat-specialize.h"
  61
  62 /*----------------------------------------------------------------------------
  63 | Returns the fraction bits of the half-precision floating-point value `a'.
  64 *----------------------------------------------------------------------------*/
  65
  66 INLINE uint32_t extractFloat16Frac(float16 a)
  67 {
  68     return float16_val(a) & 0x3ff;
  69 }
  70
  71 /*----------------------------------------------------------------------------
  72 | Returns the exponent bits of the half-precision floating-point value `a'.
  73 *----------------------------------------------------------------------------*/
  74
  75 INLINE int_fast16_t extractFloat16Exp(float16 a)
  76 {
  77     return (float16_val(a) >> 10) & 0x1f;
  78 }
  79
  80 /*----------------------------------------------------------------------------
  81 | Returns the sign bit of the single-precision floating-point value `a'.
  82 *----------------------------------------------------------------------------*/
  83
  84 INLINE flag extractFloat16Sign(float16 a)
  85 {
  86     return float16_val(a)>>15;
  87 }
  88
  89 /*----------------------------------------------------------------------------
  90 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  91 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  92 | input.  If `zSign' is 1, the input is negated before being converted to an
  93 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  94 | is simply rounded to an integer, with the inexact exception raised if the
  95 | input cannot be represented exactly as an integer.  However, if the fixed-
  96 | point input is too large, the invalid exception is raised and the largest
  97 | positive or negative integer is returned.
  98 *----------------------------------------------------------------------------*/
  99
 100 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 101 {
 102     int8 roundingMode;
 103     flag roundNearestEven;
 104     int8 roundIncrement, roundBits;
 105     int32_t z;
 106
 107     roundingMode = STATUS(float_rounding_mode);
 108     roundNearestEven = ( roundingMode == float_round_nearest_even );
 109     roundIncrement = 0x40;
 110     if ( ! roundNearestEven ) {
 111         if ( roundingMode == float_round_to_zero ) {
 112             roundIncrement = 0;
 113         }
 114         else {
 115             roundIncrement = 0x7F;
 116             if ( zSign ) {
 117                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 118             }
 119             else {
 120                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 121             }
 122         }
 123     }
 124     roundBits = absZ & 0x7F;
 125     absZ = ( absZ + roundIncrement )>>7;
 126     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 127     z = absZ;
 128     if ( zSign ) z = - z;
 129     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 130         float_raise( float_flag_invalid STATUS_VAR);
 131         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 132     }
 133     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 134     return z;
 135
 136 }
 137
 138 /*----------------------------------------------------------------------------
 139 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 140 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 141 | and returns the properly rounded 64-bit integer corresponding to the input.
 142 | If `zSign' is 1, the input is negated before being converted to an integer.
 143 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 144 | the inexact exception raised if the input cannot be represented exactly as
 145 | an integer.  However, if the fixed-point input is too large, the invalid
 146 | exception is raised and the largest positive or negative integer is
 147 | returned.
 148 *----------------------------------------------------------------------------*/
 149
 150 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 151 {
 152     int8 roundingMode;
 153     flag roundNearestEven, increment;
 154     int64_t z;
 155
 156     roundingMode = STATUS(float_rounding_mode);
 157     roundNearestEven = ( roundingMode == float_round_nearest_even );
 158     increment = ( (int64_t) absZ1 < 0 );
 159     if ( ! roundNearestEven ) {
 160         if ( roundingMode == float_round_to_zero ) {
 161             increment = 0;
 162         }
 163         else {
 164             if ( zSign ) {
 165                 increment = ( roundingMode == float_round_down ) && absZ1;
 166             }
 167             else {
 168                 increment = ( roundingMode == float_round_up ) && absZ1;
 169             }
 170         }
 171     }
 172     if ( increment ) {
 173         ++absZ0;
 174         if ( absZ0 == 0 ) goto overflow;
 175         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 176     }
 177     z = absZ0;
 178     if ( zSign ) z = - z;
 179     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 180  overflow:
 181         float_raise( float_flag_invalid STATUS_VAR);
 182         return
 183               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 184             : LIT64( 0x7FFFFFFFFFFFFFFF );
 185     }
 186     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 187     return z;
 188
 189 }
 190
 191 /*----------------------------------------------------------------------------
 192 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 193 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 194 | and returns the properly rounded 64-bit unsigned integer corresponding to the
 195 | input.  Ordinarily, the fixed-point input is simply rounded to an integer,
 196 | with the inexact exception raised if the input cannot be represented exactly
 197 | as an integer.  However, if the fixed-point input is too large, the invalid
 198 | exception is raised and the largest unsigned integer is returned.
 199 *----------------------------------------------------------------------------*/
 200
 201 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
 202                                 uint64_t absZ1 STATUS_PARAM)
 203 {
 204     int8 roundingMode;
 205     flag roundNearestEven, increment;
 206
 207     roundingMode = STATUS(float_rounding_mode);
 208     roundNearestEven = (roundingMode == float_round_nearest_even);
 209     increment = ((int64_t)absZ1 < 0);
 210     if (!roundNearestEven) {
 211         if (roundingMode == float_round_to_zero) {
 212             increment = 0;
 213         } else if (absZ1) {
 214             if (zSign) {
 215                 increment = (roundingMode == float_round_down) && absZ1;
 216             } else {
 217                 increment = (roundingMode == float_round_up) && absZ1;
 218             }
 219         }
 220     }
 221     if (increment) {
 222         ++absZ0;
 223         if (absZ0 == 0) {
 224             float_raise(float_flag_invalid STATUS_VAR);
 225             return LIT64(0xFFFFFFFFFFFFFFFF);
 226         }
 227         absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
 228     }
 229
 230     if (zSign && absZ0) {
 231         float_raise(float_flag_invalid STATUS_VAR);
 232         return 0;
 233     }
 234
 235     if (absZ1) {
 236         STATUS(float_exception_flags) |= float_flag_inexact;
 237     }
 238     return absZ0;
 239 }
 240
 241 /*----------------------------------------------------------------------------
 242 | Returns the fraction bits of the single-precision floating-point value `a'.
 243 *----------------------------------------------------------------------------*/
 244
 245 INLINE uint32_t extractFloat32Frac( float32 a )
 246 {
 247
 248     return float32_val(a) & 0x007FFFFF;
 249
 250 }
 251
 252 /*----------------------------------------------------------------------------
 253 | Returns the exponent bits of the single-precision floating-point value `a'.
 254 *----------------------------------------------------------------------------*/
 255
 256 INLINE int_fast16_t extractFloat32Exp(float32 a)
 257 {
 258
 259     return ( float32_val(a)>>23 ) & 0xFF;
 260
 261 }
 262
 263 /*----------------------------------------------------------------------------
 264 | Returns the sign bit of the single-precision floating-point value `a'.
 265 *----------------------------------------------------------------------------*/
 266
 267 INLINE flag extractFloat32Sign( float32 a )
 268 {
 269
 270     return float32_val(a)>>31;
 271
 272 }
 273
 274 /*----------------------------------------------------------------------------
 275 | If `a' is denormal and we are in flush-to-zero mode then set the
 276 | input-denormal exception and return zero. Otherwise just return the value.
 277 *----------------------------------------------------------------------------*/
 278 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 279 {
 280     if (STATUS(flush_inputs_to_zero)) {
 281         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 282             float_raise(float_flag_input_denormal STATUS_VAR);
 283             return make_float32(float32_val(a) & 0x80000000);
 284         }
 285     }
 286     return a;
 287 }
 288
 289 /*----------------------------------------------------------------------------
 290 | Normalizes the subnormal single-precision floating-point value represented
 291 | by the denormalized significand `aSig'.  The normalized exponent and
 292 | significand are stored at the locations pointed to by `zExpPtr' and
 293 | `zSigPtr', respectively.
 294 *----------------------------------------------------------------------------*/
 295
 296 static void
 297  normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
 298 {
 299     int8 shiftCount;
 300
 301     shiftCount = countLeadingZeros32( aSig ) - 8;
 302     *zSigPtr = aSig<<shiftCount;
 303     *zExpPtr = 1 - shiftCount;
 304
 305 }
 306
 307 /*----------------------------------------------------------------------------
 308 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 309 | single-precision floating-point value, returning the result.  After being
 310 | shifted into the proper positions, the three fields are simply added
 311 | together to form the result.  This means that any integer portion of `zSig'
 312 | will be added into the exponent.  Since a properly normalized significand
 313 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 314 | than the desired result exponent whenever `zSig' is a complete, normalized
 315 | significand.
 316 *----------------------------------------------------------------------------*/
 317
 318 INLINE float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
 319 {
 320
 321     return make_float32(
 322           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 323
 324 }
 325
 326 /*----------------------------------------------------------------------------
 327 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 328 | and significand `zSig', and returns the proper single-precision floating-
 329 | point value corresponding to the abstract input.  Ordinarily, the abstract
 330 | value is simply rounded and packed into the single-precision format, with
 331 | the inexact exception raised if the abstract input cannot be represented
 332 | exactly.  However, if the abstract value is too large, the overflow and
 333 | inexact exceptions are raised and an infinity or maximal finite value is
 334 | returned.  If the abstract value is too small, the input value is rounded to
 335 | a subnormal number, and the underflow and inexact exceptions are raised if
 336 | the abstract input cannot be represented exactly as a subnormal single-
 337 | precision floating-point number.
 338 |     The input significand `zSig' has its binary point between bits 30
 339 | and 29, which is 7 bits to the left of the usual location.  This shifted
 340 | significand must be normalized or smaller.  If `zSig' is not normalized,
 341 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 342 | and it must not require rounding.  In the usual case that `zSig' is
 343 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 344 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 345 | Binary Floating-Point Arithmetic.
 346 *----------------------------------------------------------------------------*/
 347
 348 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 349 {
 350     int8 roundingMode;
 351     flag roundNearestEven;
 352     int8 roundIncrement, roundBits;
 353     flag isTiny;
 354
 355     roundingMode = STATUS(float_rounding_mode);
 356     roundNearestEven = ( roundingMode == float_round_nearest_even );
 357     roundIncrement = 0x40;
 358     if ( ! roundNearestEven ) {
 359         if ( roundingMode == float_round_to_zero ) {
 360             roundIncrement = 0;
 361         }
 362         else {
 363             roundIncrement = 0x7F;
 364             if ( zSign ) {
 365                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 366             }
 367             else {
 368                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 369             }
 370         }
 371     }
 372     roundBits = zSig & 0x7F;
 373     if ( 0xFD <= (uint16_t) zExp ) {
 374         if (    ( 0xFD < zExp )
 375              || (    ( zExp == 0xFD )
 376                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 377            ) {
 378             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 379             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 380         }
 381         if ( zExp < 0 ) {
 382             if (STATUS(flush_to_zero)) {
 383                 float_raise(float_flag_output_denormal STATUS_VAR);
 384                 return packFloat32(zSign, 0, 0);
 385             }
 386             isTiny =
 387                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 388                 || ( zExp < -1 )
 389                 || ( zSig + roundIncrement < 0x80000000 );
 390             shift32RightJamming( zSig, - zExp, &zSig );
 391             zExp = 0;
 392             roundBits = zSig & 0x7F;
 393             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 394         }
 395     }
 396     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 397     zSig = ( zSig + roundIncrement )>>7;
 398     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 399     if ( zSig == 0 ) zExp = 0;
 400     return packFloat32( zSign, zExp, zSig );
 401
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 406 | and significand `zSig', and returns the proper single-precision floating-
 407 | point value corresponding to the abstract input.  This routine is just like
 408 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 409 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 410 | floating-point exponent.
 411 *----------------------------------------------------------------------------*/
 412
 413 static float32
 414  normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
 415 {
 416     int8 shiftCount;
 417
 418     shiftCount = countLeadingZeros32( zSig ) - 1;
 419     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 420
 421 }
 422
 423 /*----------------------------------------------------------------------------
 424 | Returns the fraction bits of the double-precision floating-point value `a'.
 425 *----------------------------------------------------------------------------*/
 426
 427 INLINE uint64_t extractFloat64Frac( float64 a )
 428 {
 429
 430     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 431
 432 }
 433
 434 /*----------------------------------------------------------------------------
 435 | Returns the exponent bits of the double-precision floating-point value `a'.
 436 *----------------------------------------------------------------------------*/
 437
 438 INLINE int_fast16_t extractFloat64Exp(float64 a)
 439 {
 440
 441     return ( float64_val(a)>>52 ) & 0x7FF;
 442
 443 }
 444
 445 /*----------------------------------------------------------------------------
 446 | Returns the sign bit of the double-precision floating-point value `a'.
 447 *----------------------------------------------------------------------------*/
 448
 449 INLINE flag extractFloat64Sign( float64 a )
 450 {
 451
 452     return float64_val(a)>>63;
 453
 454 }
 455
 456 /*----------------------------------------------------------------------------
 457 | If `a' is denormal and we are in flush-to-zero mode then set the
 458 | input-denormal exception and return zero. Otherwise just return the value.
 459 *----------------------------------------------------------------------------*/
 460 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 461 {
 462     if (STATUS(flush_inputs_to_zero)) {
 463         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 464             float_raise(float_flag_input_denormal STATUS_VAR);
 465             return make_float64(float64_val(a) & (1ULL << 63));
 466         }
 467     }
 468     return a;
 469 }
 470
 471 /*----------------------------------------------------------------------------
 472 | Normalizes the subnormal double-precision floating-point value represented
 473 | by the denormalized significand `aSig'.  The normalized exponent and
 474 | significand are stored at the locations pointed to by `zExpPtr' and
 475 | `zSigPtr', respectively.
 476 *----------------------------------------------------------------------------*/
 477
 478 static void
 479  normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
 480 {
 481     int8 shiftCount;
 482
 483     shiftCount = countLeadingZeros64( aSig ) - 11;
 484     *zSigPtr = aSig<<shiftCount;
 485     *zExpPtr = 1 - shiftCount;
 486
 487 }
 488
 489 /*----------------------------------------------------------------------------
 490 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 491 | double-precision floating-point value, returning the result.  After being
 492 | shifted into the proper positions, the three fields are simply added
 493 | together to form the result.  This means that any integer portion of `zSig'
 494 | will be added into the exponent.  Since a properly normalized significand
 495 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 496 | than the desired result exponent whenever `zSig' is a complete, normalized
 497 | significand.
 498 *----------------------------------------------------------------------------*/
 499
 500 INLINE float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 501 {
 502
 503     return make_float64(
 504         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 505
 506 }
 507
 508 /*----------------------------------------------------------------------------
 509 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 510 | and significand `zSig', and returns the proper double-precision floating-
 511 | point value corresponding to the abstract input.  Ordinarily, the abstract
 512 | value is simply rounded and packed into the double-precision format, with
 513 | the inexact exception raised if the abstract input cannot be represented
 514 | exactly.  However, if the abstract value is too large, the overflow and
 515 | inexact exceptions are raised and an infinity or maximal finite value is
 516 | returned.  If the abstract value is too small, the input value is rounded
 517 | to a subnormal number, and the underflow and inexact exceptions are raised
 518 | if the abstract input cannot be represented exactly as a subnormal double-
 519 | precision floating-point number.
 520 |     The input significand `zSig' has its binary point between bits 62
 521 | and 61, which is 10 bits to the left of the usual location.  This shifted
 522 | significand must be normalized or smaller.  If `zSig' is not normalized,
 523 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 524 | and it must not require rounding.  In the usual case that `zSig' is
 525 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 526 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 527 | Binary Floating-Point Arithmetic.
 528 *----------------------------------------------------------------------------*/
 529
 530 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 531 {
 532     int8 roundingMode;
 533     flag roundNearestEven;
 534     int_fast16_t roundIncrement, roundBits;
 535     flag isTiny;
 536
 537     roundingMode = STATUS(float_rounding_mode);
 538     roundNearestEven = ( roundingMode == float_round_nearest_even );
 539     roundIncrement = 0x200;
 540     if ( ! roundNearestEven ) {
 541         if ( roundingMode == float_round_to_zero ) {
 542             roundIncrement = 0;
 543         }
 544         else {
 545             roundIncrement = 0x3FF;
 546             if ( zSign ) {
 547                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 548             }
 549             else {
 550                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 551             }
 552         }
 553     }
 554     roundBits = zSig & 0x3FF;
 555     if ( 0x7FD <= (uint16_t) zExp ) {
 556         if (    ( 0x7FD < zExp )
 557              || (    ( zExp == 0x7FD )
 558                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 559            ) {
 560             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 561             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 562         }
 563         if ( zExp < 0 ) {
 564             if (STATUS(flush_to_zero)) {
 565                 float_raise(float_flag_output_denormal STATUS_VAR);
 566                 return packFloat64(zSign, 0, 0);
 567             }
 568             isTiny =
 569                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 570                 || ( zExp < -1 )
 571                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 572             shift64RightJamming( zSig, - zExp, &zSig );
 573             zExp = 0;
 574             roundBits = zSig & 0x3FF;
 575             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 576         }
 577     }
 578     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 579     zSig = ( zSig + roundIncrement )>>10;
 580     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 581     if ( zSig == 0 ) zExp = 0;
 582     return packFloat64( zSign, zExp, zSig );
 583
 584 }
 585
 586 /*----------------------------------------------------------------------------
 587 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 588 | and significand `zSig', and returns the proper double-precision floating-
 589 | point value corresponding to the abstract input.  This routine is just like
 590 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 591 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 592 | floating-point exponent.
 593 *----------------------------------------------------------------------------*/
 594
 595 static float64
 596  normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
 597 {
 598     int8 shiftCount;
 599
 600     shiftCount = countLeadingZeros64( zSig ) - 1;
 601     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 602
 603 }
 604
 605 /*----------------------------------------------------------------------------
 606 | Returns the fraction bits of the extended double-precision floating-point
 607 | value `a'.
 608 *----------------------------------------------------------------------------*/
 609
 610 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 611 {
 612
 613     return a.low;
 614
 615 }
 616
 617 /*----------------------------------------------------------------------------
 618 | Returns the exponent bits of the extended double-precision floating-point
 619 | value `a'.
 620 *----------------------------------------------------------------------------*/
 621
 622 INLINE int32 extractFloatx80Exp( floatx80 a )
 623 {
 624
 625     return a.high & 0x7FFF;
 626
 627 }
 628
 629 /*----------------------------------------------------------------------------
 630 | Returns the sign bit of the extended double-precision floating-point value
 631 | `a'.
 632 *----------------------------------------------------------------------------*/
 633
 634 INLINE flag extractFloatx80Sign( floatx80 a )
 635 {
 636
 637     return a.high>>15;
 638
 639 }
 640
 641 /*----------------------------------------------------------------------------
 642 | Normalizes the subnormal extended double-precision floating-point value
 643 | represented by the denormalized significand `aSig'.  The normalized exponent
 644 | and significand are stored at the locations pointed to by `zExpPtr' and
 645 | `zSigPtr', respectively.
 646 *----------------------------------------------------------------------------*/
 647
 648 static void
 649  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 650 {
 651     int8 shiftCount;
 652
 653     shiftCount = countLeadingZeros64( aSig );
 654     *zSigPtr = aSig<<shiftCount;
 655     *zExpPtr = 1 - shiftCount;
 656
 657 }
 658
 659 /*----------------------------------------------------------------------------
 660 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 661 | extended double-precision floating-point value, returning the result.
 662 *----------------------------------------------------------------------------*/
 663
 664 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 665 {
 666     floatx80 z;
 667
 668     z.low = zSig;
 669     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 670     return z;
 671
 672 }
 673
 674 /*----------------------------------------------------------------------------
 675 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 676 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 677 | and returns the proper extended double-precision floating-point value
 678 | corresponding to the abstract input.  Ordinarily, the abstract value is
 679 | rounded and packed into the extended double-precision format, with the
 680 | inexact exception raised if the abstract input cannot be represented
 681 | exactly.  However, if the abstract value is too large, the overflow and
 682 | inexact exceptions are raised and an infinity or maximal finite value is
 683 | returned.  If the abstract value is too small, the input value is rounded to
 684 | a subnormal number, and the underflow and inexact exceptions are raised if
 685 | the abstract input cannot be represented exactly as a subnormal extended
 686 | double-precision floating-point number.
 687 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 688 | number of bits as single or double precision, respectively.  Otherwise, the
 689 | result is rounded to the full precision of the extended double-precision
 690 | format.
 691 |     The input significand must be normalized or smaller.  If the input
 692 | significand is not normalized, `zExp' must be 0; in that case, the result
 693 | returned is a subnormal number, and it must not require rounding.  The
 694 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 695 | Floating-Point Arithmetic.
 696 *----------------------------------------------------------------------------*/
 697
 698 static floatx80
 699  roundAndPackFloatx80(
 700      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 701  STATUS_PARAM)
 702 {
 703     int8 roundingMode;
 704     flag roundNearestEven, increment, isTiny;
 705     int64 roundIncrement, roundMask, roundBits;
 706
 707     roundingMode = STATUS(float_rounding_mode);
 708     roundNearestEven = ( roundingMode == float_round_nearest_even );
 709     if ( roundingPrecision == 80 ) goto precision80;
 710     if ( roundingPrecision == 64 ) {
 711         roundIncrement = LIT64( 0x0000000000000400 );
 712         roundMask = LIT64( 0x00000000000007FF );
 713     }
 714     else if ( roundingPrecision == 32 ) {
 715         roundIncrement = LIT64( 0x0000008000000000 );
 716         roundMask = LIT64( 0x000000FFFFFFFFFF );
 717     }
 718     else {
 719         goto precision80;
 720     }
 721     zSig0 |= ( zSig1 != 0 );
 722     if ( ! roundNearestEven ) {
 723         if ( roundingMode == float_round_to_zero ) {
 724             roundIncrement = 0;
 725         }
 726         else {
 727             roundIncrement = roundMask;
 728             if ( zSign ) {
 729                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 730             }
 731             else {
 732                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 733             }
 734         }
 735     }
 736     roundBits = zSig0 & roundMask;
 737     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 738         if (    ( 0x7FFE < zExp )
 739              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 740            ) {
 741             goto overflow;
 742         }
 743         if ( zExp <= 0 ) {
 744             if (STATUS(flush_to_zero)) {
 745                 float_raise(float_flag_output_denormal STATUS_VAR);
 746                 return packFloatx80(zSign, 0, 0);
 747             }
 748             isTiny =
 749                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 750                 || ( zExp < 0 )
 751                 || ( zSig0 <= zSig0 + roundIncrement );
 752             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 753             zExp = 0;
 754             roundBits = zSig0 & roundMask;
 755             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 756             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 757             zSig0 += roundIncrement;
 758             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 759             roundIncrement = roundMask + 1;
 760             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 761                 roundMask |= roundIncrement;
 762             }
 763             zSig0 &= ~ roundMask;
 764             return packFloatx80( zSign, zExp, zSig0 );
 765         }
 766     }
 767     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 768     zSig0 += roundIncrement;
 769     if ( zSig0 < roundIncrement ) {
 770         ++zExp;
 771         zSig0 = LIT64( 0x8000000000000000 );
 772     }
 773     roundIncrement = roundMask + 1;
 774     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 775         roundMask |= roundIncrement;
 776     }
 777     zSig0 &= ~ roundMask;
 778     if ( zSig0 == 0 ) zExp = 0;
 779     return packFloatx80( zSign, zExp, zSig0 );
 780  precision80:
 781     increment = ( (int64_t) zSig1 < 0 );
 782     if ( ! roundNearestEven ) {
 783         if ( roundingMode == float_round_to_zero ) {
 784             increment = 0;
 785         }
 786         else {
 787             if ( zSign ) {
 788                 increment = ( roundingMode == float_round_down ) && zSig1;
 789             }
 790             else {
 791                 increment = ( roundingMode == float_round_up ) && zSig1;
 792             }
 793         }
 794     }
 795     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 796         if (    ( 0x7FFE < zExp )
 797              || (    ( zExp == 0x7FFE )
 798                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 799                   && increment
 800                 )
 801            ) {
 802             roundMask = 0;
 803  overflow:
 804             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 805             if (    ( roundingMode == float_round_to_zero )
 806                  || ( zSign && ( roundingMode == float_round_up ) )
 807                  || ( ! zSign && ( roundingMode == float_round_down ) )
 808                ) {
 809                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 810             }
 811             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 812         }
 813         if ( zExp <= 0 ) {
 814             isTiny =
 815                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 816                 || ( zExp < 0 )
 817                 || ! increment
 818                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 819             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 820             zExp = 0;
 821             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 822             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 823             if ( roundNearestEven ) {
 824                 increment = ( (int64_t) zSig1 < 0 );
 825             }
 826             else {
 827                 if ( zSign ) {
 828                     increment = ( roundingMode == float_round_down ) && zSig1;
 829                 }
 830                 else {
 831                     increment = ( roundingMode == float_round_up ) && zSig1;
 832                 }
 833             }
 834             if ( increment ) {
 835                 ++zSig0;
 836                 zSig0 &=
 837                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 838                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 839             }
 840             return packFloatx80( zSign, zExp, zSig0 );
 841         }
 842     }
 843     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 844     if ( increment ) {
 845         ++zSig0;
 846         if ( zSig0 == 0 ) {
 847             ++zExp;
 848             zSig0 = LIT64( 0x8000000000000000 );
 849         }
 850         else {
 851             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 852         }
 853     }
 854     else {
 855         if ( zSig0 == 0 ) zExp = 0;
 856     }
 857     return packFloatx80( zSign, zExp, zSig0 );
 858
 859 }
 860
 861 /*----------------------------------------------------------------------------
 862 | Takes an abstract floating-point value having sign `zSign', exponent
 863 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 864 | and returns the proper extended double-precision floating-point value
 865 | corresponding to the abstract input.  This routine is just like
 866 | `roundAndPackFloatx80' except that the input significand does not have to be
 867 | normalized.
 868 *----------------------------------------------------------------------------*/
 869
 870 static floatx80
 871  normalizeRoundAndPackFloatx80(
 872      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 873  STATUS_PARAM)
 874 {
 875     int8 shiftCount;
 876
 877     if ( zSig0 == 0 ) {
 878         zSig0 = zSig1;
 879         zSig1 = 0;
 880         zExp -= 64;
 881     }
 882     shiftCount = countLeadingZeros64( zSig0 );
 883     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 884     zExp -= shiftCount;
 885     return
 886         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 887
 888 }
 889
 890 /*----------------------------------------------------------------------------
 891 | Returns the least-significant 64 fraction bits of the quadruple-precision
 892 | floating-point value `a'.
 893 *----------------------------------------------------------------------------*/
 894
 895 INLINE uint64_t extractFloat128Frac1( float128 a )
 896 {
 897
 898     return a.low;
 899
 900 }
 901
 902 /*----------------------------------------------------------------------------
 903 | Returns the most-significant 48 fraction bits of the quadruple-precision
 904 | floating-point value `a'.
 905 *----------------------------------------------------------------------------*/
 906
 907 INLINE uint64_t extractFloat128Frac0( float128 a )
 908 {
 909
 910     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 911
 912 }
 913
 914 /*----------------------------------------------------------------------------
 915 | Returns the exponent bits of the quadruple-precision floating-point value
 916 | `a'.
 917 *----------------------------------------------------------------------------*/
 918
 919 INLINE int32 extractFloat128Exp( float128 a )
 920 {
 921
 922     return ( a.high>>48 ) & 0x7FFF;
 923
 924 }
 925
 926 /*----------------------------------------------------------------------------
 927 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 928 *----------------------------------------------------------------------------*/
 929
 930 INLINE flag extractFloat128Sign( float128 a )
 931 {
 932
 933     return a.high>>63;
 934
 935 }
 936
 937 /*----------------------------------------------------------------------------
 938 | Normalizes the subnormal quadruple-precision floating-point value
 939 | represented by the denormalized significand formed by the concatenation of
 940 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 941 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 942 | significand are stored at the location pointed to by `zSig0Ptr', and the
 943 | least significant 64 bits of the normalized significand are stored at the
 944 | location pointed to by `zSig1Ptr'.
 945 *----------------------------------------------------------------------------*/
 946
 947 static void
 948  normalizeFloat128Subnormal(
 949      uint64_t aSig0,
 950      uint64_t aSig1,
 951      int32 *zExpPtr,
 952      uint64_t *zSig0Ptr,
 953      uint64_t *zSig1Ptr
 954  )
 955 {
 956     int8 shiftCount;
 957
 958     if ( aSig0 == 0 ) {
 959         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 960         if ( shiftCount < 0 ) {
 961             *zSig0Ptr = aSig1>>( - shiftCount );
 962             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 963         }
 964         else {
 965             *zSig0Ptr = aSig1<<shiftCount;
 966             *zSig1Ptr = 0;
 967         }
 968         *zExpPtr = - shiftCount - 63;
 969     }
 970     else {
 971         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 972         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 973         *zExpPtr = 1 - shiftCount;
 974     }
 975
 976 }
 977
 978 /*----------------------------------------------------------------------------
 979 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 980 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 981 | floating-point value, returning the result.  After being shifted into the
 982 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 983 | added together to form the most significant 32 bits of the result.  This
 984 | means that any integer portion of `zSig0' will be added into the exponent.
 985 | Since a properly normalized significand will have an integer portion equal
 986 | to 1, the `zExp' input should be 1 less than the desired result exponent
 987 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 988 | significand.
 989 *----------------------------------------------------------------------------*/
 990
 991 INLINE float128
 992  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 993 {
 994     float128 z;
 995
 996     z.low = zSig1;
 997     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 998     return z;
 999
1000 }
1001
1002 /*----------------------------------------------------------------------------
1003 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1004 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1005 | and `zSig2', and returns the proper quadruple-precision floating-point value
1006 | corresponding to the abstract input.  Ordinarily, the abstract value is
1007 | simply rounded and packed into the quadruple-precision format, with the
1008 | inexact exception raised if the abstract input cannot be represented
1009 | exactly.  However, if the abstract value is too large, the overflow and
1010 | inexact exceptions are raised and an infinity or maximal finite value is
1011 | returned.  If the abstract value is too small, the input value is rounded to
1012 | a subnormal number, and the underflow and inexact exceptions are raised if
1013 | the abstract input cannot be represented exactly as a subnormal quadruple-
1014 | precision floating-point number.
1015 |     The input significand must be normalized or smaller.  If the input
1016 | significand is not normalized, `zExp' must be 0; in that case, the result
1017 | returned is a subnormal number, and it must not require rounding.  In the
1018 | usual case that the input significand is normalized, `zExp' must be 1 less
1019 | than the ``true'' floating-point exponent.  The handling of underflow and
1020 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1021 *----------------------------------------------------------------------------*/
1022
1023 static float128
1024  roundAndPackFloat128(
1025      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
1026 {
1027     int8 roundingMode;
1028     flag roundNearestEven, increment, isTiny;
1029
1030     roundingMode = STATUS(float_rounding_mode);
1031     roundNearestEven = ( roundingMode == float_round_nearest_even );
1032     increment = ( (int64_t) zSig2 < 0 );
1033     if ( ! roundNearestEven ) {
1034         if ( roundingMode == float_round_to_zero ) {
1035             increment = 0;
1036         }
1037         else {
1038             if ( zSign ) {
1039                 increment = ( roundingMode == float_round_down ) && zSig2;
1040             }
1041             else {
1042                 increment = ( roundingMode == float_round_up ) && zSig2;
1043             }
1044         }
1045     }
1046     if ( 0x7FFD <= (uint32_t) zExp ) {
1047         if (    ( 0x7FFD < zExp )
1048              || (    ( zExp == 0x7FFD )
1049                   && eq128(
1050                          LIT64( 0x0001FFFFFFFFFFFF ),
1051                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1052                          zSig0,
1053                          zSig1
1054                      )
1055                   && increment
1056                 )
1057            ) {
1058             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1059             if (    ( roundingMode == float_round_to_zero )
1060                  || ( zSign && ( roundingMode == float_round_up ) )
1061                  || ( ! zSign && ( roundingMode == float_round_down ) )
1062                ) {
1063                 return
1064                     packFloat128(
1065                         zSign,
1066                         0x7FFE,
1067                         LIT64( 0x0000FFFFFFFFFFFF ),
1068                         LIT64( 0xFFFFFFFFFFFFFFFF )
1069                     );
1070             }
1071             return packFloat128( zSign, 0x7FFF, 0, 0 );
1072         }
1073         if ( zExp < 0 ) {
1074             if (STATUS(flush_to_zero)) {
1075                 float_raise(float_flag_output_denormal STATUS_VAR);
1076                 return packFloat128(zSign, 0, 0, 0);
1077             }
1078             isTiny =
1079                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1080                 || ( zExp < -1 )
1081                 || ! increment
1082                 || lt128(
1083                        zSig0,
1084                        zSig1,
1085                        LIT64( 0x0001FFFFFFFFFFFF ),
1086                        LIT64( 0xFFFFFFFFFFFFFFFF )
1087                    );
1088             shift128ExtraRightJamming(
1089                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1090             zExp = 0;
1091             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1092             if ( roundNearestEven ) {
1093                 increment = ( (int64_t) zSig2 < 0 );
1094             }
1095             else {
1096                 if ( zSign ) {
1097                     increment = ( roundingMode == float_round_down ) && zSig2;
1098                 }
1099                 else {
1100                     increment = ( roundingMode == float_round_up ) && zSig2;
1101                 }
1102             }
1103         }
1104     }
1105     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1106     if ( increment ) {
1107         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1108         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1109     }
1110     else {
1111         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1112     }
1113     return packFloat128( zSign, zExp, zSig0, zSig1 );
1114
1115 }
1116
1117 /*----------------------------------------------------------------------------
1118 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1119 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1120 | returns the proper quadruple-precision floating-point value corresponding
1121 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1122 | except that the input significand has fewer bits and does not have to be
1123 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1124 | point exponent.
1125 *----------------------------------------------------------------------------*/
1126
1127 static float128
1128  normalizeRoundAndPackFloat128(
1129      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1130 {
1131     int8 shiftCount;
1132     uint64_t zSig2;
1133
1134     if ( zSig0 == 0 ) {
1135         zSig0 = zSig1;
1136         zSig1 = 0;
1137         zExp -= 64;
1138     }
1139     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1140     if ( 0 <= shiftCount ) {
1141         zSig2 = 0;
1142         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1143     }
1144     else {
1145         shift128ExtraRightJamming(
1146             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1147     }
1148     zExp -= shiftCount;
1149     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1150
1151 }
1152
1153 /*----------------------------------------------------------------------------
1154 | Returns the result of converting the 32-bit two's complement integer `a'
1155 | to the single-precision floating-point format.  The conversion is performed
1156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1157 *----------------------------------------------------------------------------*/
1158
1159 float32 int32_to_float32(int32_t a STATUS_PARAM)
1160 {
1161     flag zSign;
1162
1163     if ( a == 0 ) return float32_zero;
1164     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1165     zSign = ( a < 0 );
1166     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1167
1168 }
1169
1170 /*----------------------------------------------------------------------------
1171 | Returns the result of converting the 32-bit two's complement integer `a'
1172 | to the double-precision floating-point format.  The conversion is performed
1173 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1174 *----------------------------------------------------------------------------*/
1175
1176 float64 int32_to_float64(int32_t a STATUS_PARAM)
1177 {
1178     flag zSign;
1179     uint32 absA;
1180     int8 shiftCount;
1181     uint64_t zSig;
1182
1183     if ( a == 0 ) return float64_zero;
1184     zSign = ( a < 0 );
1185     absA = zSign ? - a : a;
1186     shiftCount = countLeadingZeros32( absA ) + 21;
1187     zSig = absA;
1188     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1189
1190 }
1191
1192 /*----------------------------------------------------------------------------
1193 | Returns the result of converting the 32-bit two's complement integer `a'
1194 | to the extended double-precision floating-point format.  The conversion
1195 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1196 | Arithmetic.
1197 *----------------------------------------------------------------------------*/
1198
1199 floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
1200 {
1201     flag zSign;
1202     uint32 absA;
1203     int8 shiftCount;
1204     uint64_t zSig;
1205
1206     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1207     zSign = ( a < 0 );
1208     absA = zSign ? - a : a;
1209     shiftCount = countLeadingZeros32( absA ) + 32;
1210     zSig = absA;
1211     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1212
1213 }
1214
1215 /*----------------------------------------------------------------------------
1216 | Returns the result of converting the 32-bit two's complement integer `a' to
1217 | the quadruple-precision floating-point format.  The conversion is performed
1218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 *----------------------------------------------------------------------------*/
1220
1221 float128 int32_to_float128(int32_t a STATUS_PARAM)
1222 {
1223     flag zSign;
1224     uint32 absA;
1225     int8 shiftCount;
1226     uint64_t zSig0;
1227
1228     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1229     zSign = ( a < 0 );
1230     absA = zSign ? - a : a;
1231     shiftCount = countLeadingZeros32( absA ) + 17;
1232     zSig0 = absA;
1233     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1234
1235 }
1236
1237 /*----------------------------------------------------------------------------
1238 | Returns the result of converting the 64-bit two's complement integer `a'
1239 | to the single-precision floating-point format.  The conversion is performed
1240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1241 *----------------------------------------------------------------------------*/
1242
1243 float32 int64_to_float32(int64_t a STATUS_PARAM)
1244 {
1245     flag zSign;
1246     uint64 absA;
1247     int8 shiftCount;
1248
1249     if ( a == 0 ) return float32_zero;
1250     zSign = ( a < 0 );
1251     absA = zSign ? - a : a;
1252     shiftCount = countLeadingZeros64( absA ) - 40;
1253     if ( 0 <= shiftCount ) {
1254         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1255     }
1256     else {
1257         shiftCount += 7;
1258         if ( shiftCount < 0 ) {
1259             shift64RightJamming( absA, - shiftCount, &absA );
1260         }
1261         else {
1262             absA <<= shiftCount;
1263         }
1264         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1265     }
1266
1267 }
1268
1269 float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1270 {
1271     int8 shiftCount;
1272
1273     if ( a == 0 ) return float32_zero;
1274     shiftCount = countLeadingZeros64( a ) - 40;
1275     if ( 0 <= shiftCount ) {
1276         return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
1277     }
1278     else {
1279         shiftCount += 7;
1280         if ( shiftCount < 0 ) {
1281             shift64RightJamming( a, - shiftCount, &a );
1282         }
1283         else {
1284             a <<= shiftCount;
1285         }
1286         return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
1287     }
1288 }
1289
1290 /*----------------------------------------------------------------------------
1291 | Returns the result of converting the 64-bit two's complement integer `a'
1292 | to the double-precision floating-point format.  The conversion is performed
1293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294 *----------------------------------------------------------------------------*/
1295
1296 float64 int64_to_float64(int64_t a STATUS_PARAM)
1297 {
1298     flag zSign;
1299
1300     if ( a == 0 ) return float64_zero;
1301     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1302         return packFloat64( 1, 0x43E, 0 );
1303     }
1304     zSign = ( a < 0 );
1305     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1306
1307 }
1308
1309 float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1310 {
1311     int exp =  0x43C;
1312
1313     if (a == 0) {
1314         return float64_zero;
1315     }
1316     if ((int64_t)a < 0) {
1317         shift64RightJamming(a, 1, &a);
1318         exp += 1;
1319     }
1320     return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
1321 }
1322
1323 /*----------------------------------------------------------------------------
1324 | Returns the result of converting the 64-bit two's complement integer `a'
1325 | to the extended double-precision floating-point format.  The conversion
1326 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1327 | Arithmetic.
1328 *----------------------------------------------------------------------------*/
1329
1330 floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
1331 {
1332     flag zSign;
1333     uint64 absA;
1334     int8 shiftCount;
1335
1336     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1337     zSign = ( a < 0 );
1338     absA = zSign ? - a : a;
1339     shiftCount = countLeadingZeros64( absA );
1340     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1341
1342 }
1343
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the 64-bit two's complement integer `a' to
1346 | the quadruple-precision floating-point format.  The conversion is performed
1347 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1348 *----------------------------------------------------------------------------*/
1349
1350 float128 int64_to_float128(int64_t a STATUS_PARAM)
1351 {
1352     flag zSign;
1353     uint64 absA;
1354     int8 shiftCount;
1355     int32 zExp;
1356     uint64_t zSig0, zSig1;
1357
1358     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1359     zSign = ( a < 0 );
1360     absA = zSign ? - a : a;
1361     shiftCount = countLeadingZeros64( absA ) + 49;
1362     zExp = 0x406E - shiftCount;
1363     if ( 64 <= shiftCount ) {
1364         zSig1 = 0;
1365         zSig0 = absA;
1366         shiftCount -= 64;
1367     }
1368     else {
1369         zSig1 = absA;
1370         zSig0 = 0;
1371     }
1372     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1373     return packFloat128( zSign, zExp, zSig0, zSig1 );
1374
1375 }
1376
1377 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1378 {
1379     if (a == 0) {
1380         return float128_zero;
1381     }
1382     return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1383 }
1384
1385 /*----------------------------------------------------------------------------
1386 | Returns the result of converting the single-precision floating-point value
1387 | `a' to the 32-bit two's complement integer format.  The conversion is
1388 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1389 | Arithmetic---which means in particular that the conversion is rounded
1390 | according to the current rounding mode.  If `a' is a NaN, the largest
1391 | positive integer is returned.  Otherwise, if the conversion overflows, the
1392 | largest integer with the same sign as `a' is returned.
1393 *----------------------------------------------------------------------------*/
1394
1395 int32 float32_to_int32( float32 a STATUS_PARAM )
1396 {
1397     flag aSign;
1398     int_fast16_t aExp, shiftCount;
1399     uint32_t aSig;
1400     uint64_t aSig64;
1401
1402     a = float32_squash_input_denormal(a STATUS_VAR);
1403     aSig = extractFloat32Frac( a );
1404     aExp = extractFloat32Exp( a );
1405     aSign = extractFloat32Sign( a );
1406     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1407     if ( aExp ) aSig |= 0x00800000;
1408     shiftCount = 0xAF - aExp;
1409     aSig64 = aSig;
1410     aSig64 <<= 32;
1411     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1412     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1413
1414 }
1415
1416 /*----------------------------------------------------------------------------
1417 | Returns the result of converting the single-precision floating-point value
1418 | `a' to the 32-bit two's complement integer format.  The conversion is
1419 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1420 | Arithmetic, except that the conversion is always rounded toward zero.
1421 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1422 | the conversion overflows, the largest integer with the same sign as `a' is
1423 | returned.
1424 *----------------------------------------------------------------------------*/
1425
1426 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1427 {
1428     flag aSign;
1429     int_fast16_t aExp, shiftCount;
1430     uint32_t aSig;
1431     int32_t z;
1432     a = float32_squash_input_denormal(a STATUS_VAR);
1433
1434     aSig = extractFloat32Frac( a );
1435     aExp = extractFloat32Exp( a );
1436     aSign = extractFloat32Sign( a );
1437     shiftCount = aExp - 0x9E;
1438     if ( 0 <= shiftCount ) {
1439         if ( float32_val(a) != 0xCF000000 ) {
1440             float_raise( float_flag_invalid STATUS_VAR);
1441             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1442         }
1443         return (int32_t) 0x80000000;
1444     }
1445     else if ( aExp <= 0x7E ) {
1446         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1447         return 0;
1448     }
1449     aSig = ( aSig | 0x00800000 )<<8;
1450     z = aSig>>( - shiftCount );
1451     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1452         STATUS(float_exception_flags) |= float_flag_inexact;
1453     }
1454     if ( aSign ) z = - z;
1455     return z;
1456
1457 }
1458
1459 /*----------------------------------------------------------------------------
1460 | Returns the result of converting the single-precision floating-point value
1461 | `a' to the 16-bit two's complement integer format.  The conversion is
1462 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1463 | Arithmetic, except that the conversion is always rounded toward zero.
1464 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1465 | the conversion overflows, the largest integer with the same sign as `a' is
1466 | returned.
1467 *----------------------------------------------------------------------------*/
1468
1469 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
1470 {
1471     flag aSign;
1472     int_fast16_t aExp, shiftCount;
1473     uint32_t aSig;
1474     int32 z;
1475
1476     aSig = extractFloat32Frac( a );
1477     aExp = extractFloat32Exp( a );
1478     aSign = extractFloat32Sign( a );
1479     shiftCount = aExp - 0x8E;
1480     if ( 0 <= shiftCount ) {
1481         if ( float32_val(a) != 0xC7000000 ) {
1482             float_raise( float_flag_invalid STATUS_VAR);
1483             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1484                 return 0x7FFF;
1485             }
1486         }
1487         return (int32_t) 0xffff8000;
1488     }
1489     else if ( aExp <= 0x7E ) {
1490         if ( aExp | aSig ) {
1491             STATUS(float_exception_flags) |= float_flag_inexact;
1492         }
1493         return 0;
1494     }
1495     shiftCount -= 0x10;
1496     aSig = ( aSig | 0x00800000 )<<8;
1497     z = aSig>>( - shiftCount );
1498     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1499         STATUS(float_exception_flags) |= float_flag_inexact;
1500     }
1501     if ( aSign ) {
1502         z = - z;
1503     }
1504     return z;
1505
1506 }
1507
1508 /*----------------------------------------------------------------------------
1509 | Returns the result of converting the single-precision floating-point value
1510 | `a' to the 64-bit two's complement integer format.  The conversion is
1511 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1512 | Arithmetic---which means in particular that the conversion is rounded
1513 | according to the current rounding mode.  If `a' is a NaN, the largest
1514 | positive integer is returned.  Otherwise, if the conversion overflows, the
1515 | largest integer with the same sign as `a' is returned.
1516 *----------------------------------------------------------------------------*/
1517
1518 int64 float32_to_int64( float32 a STATUS_PARAM )
1519 {
1520     flag aSign;
1521     int_fast16_t aExp, shiftCount;
1522     uint32_t aSig;
1523     uint64_t aSig64, aSigExtra;
1524     a = float32_squash_input_denormal(a STATUS_VAR);
1525
1526     aSig = extractFloat32Frac( a );
1527     aExp = extractFloat32Exp( a );
1528     aSign = extractFloat32Sign( a );
1529     shiftCount = 0xBE - aExp;
1530     if ( shiftCount < 0 ) {
1531         float_raise( float_flag_invalid STATUS_VAR);
1532         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1533             return LIT64( 0x7FFFFFFFFFFFFFFF );
1534         }
1535         return (int64_t) LIT64( 0x8000000000000000 );
1536     }
1537     if ( aExp ) aSig |= 0x00800000;
1538     aSig64 = aSig;
1539     aSig64 <<= 40;
1540     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1541     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1542
1543 }
1544
1545 /*----------------------------------------------------------------------------
1546 | Returns the result of converting the single-precision floating-point value
1547 | `a' to the 64-bit unsigned integer format.  The conversion is
1548 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1549 | Arithmetic---which means in particular that the conversion is rounded
1550 | according to the current rounding mode.  If `a' is a NaN, the largest
1551 | unsigned integer is returned.  Otherwise, if the conversion overflows, the
1552 | largest unsigned integer is returned.  If the 'a' is negative, the result
1553 | is rounded and zero is returned; values that do not round to zero will
1554 | raise the inexact exception flag.
1555 *----------------------------------------------------------------------------*/
1556
1557 uint64 float32_to_uint64(float32 a STATUS_PARAM)
1558 {
1559     flag aSign;
1560     int_fast16_t aExp, shiftCount;
1561     uint32_t aSig;
1562     uint64_t aSig64, aSigExtra;
1563     a = float32_squash_input_denormal(a STATUS_VAR);
1564
1565     aSig = extractFloat32Frac(a);
1566     aExp = extractFloat32Exp(a);
1567     aSign = extractFloat32Sign(a);
1568     if ((aSign) && (aExp > 126)) {
1569         float_raise(float_flag_invalid STATUS_VAR);
1570         if (float32_is_any_nan(a)) {
1571             return LIT64(0xFFFFFFFFFFFFFFFF);
1572         } else {
1573             return 0;
1574         }
1575     }
1576     shiftCount = 0xBE - aExp;
1577     if (aExp) {
1578         aSig |= 0x00800000;
1579     }
1580     if (shiftCount < 0) {
1581         float_raise(float_flag_invalid STATUS_VAR);
1582         return LIT64(0xFFFFFFFFFFFFFFFF);
1583     }
1584
1585     aSig64 = aSig;
1586     aSig64 <<= 40;
1587     shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1588     return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1589 }
1590
1591 /*----------------------------------------------------------------------------
1592 | Returns the result of converting the single-precision floating-point value
1593 | `a' to the 64-bit two's complement integer format.  The conversion is
1594 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1595 | Arithmetic, except that the conversion is always rounded toward zero.  If
1596 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1597 | conversion overflows, the largest integer with the same sign as `a' is
1598 | returned.
1599 *----------------------------------------------------------------------------*/
1600
1601 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1602 {
1603     flag aSign;
1604     int_fast16_t aExp, shiftCount;
1605     uint32_t aSig;
1606     uint64_t aSig64;
1607     int64 z;
1608     a = float32_squash_input_denormal(a STATUS_VAR);
1609
1610     aSig = extractFloat32Frac( a );
1611     aExp = extractFloat32Exp( a );
1612     aSign = extractFloat32Sign( a );
1613     shiftCount = aExp - 0xBE;
1614     if ( 0 <= shiftCount ) {
1615         if ( float32_val(a) != 0xDF000000 ) {
1616             float_raise( float_flag_invalid STATUS_VAR);
1617             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1618                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1619             }
1620         }
1621         return (int64_t) LIT64( 0x8000000000000000 );
1622     }
1623     else if ( aExp <= 0x7E ) {
1624         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1625         return 0;
1626     }
1627     aSig64 = aSig | 0x00800000;
1628     aSig64 <<= 40;
1629     z = aSig64>>( - shiftCount );
1630     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1631         STATUS(float_exception_flags) |= float_flag_inexact;
1632     }
1633     if ( aSign ) z = - z;
1634     return z;
1635
1636 }
1637
1638 /*----------------------------------------------------------------------------
1639 | Returns the result of converting the single-precision floating-point value
1640 | `a' to the double-precision floating-point format.  The conversion is
1641 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1642 | Arithmetic.
1643 *----------------------------------------------------------------------------*/
1644
1645 float64 float32_to_float64( float32 a STATUS_PARAM )
1646 {
1647     flag aSign;
1648     int_fast16_t aExp;
1649     uint32_t aSig;
1650     a = float32_squash_input_denormal(a STATUS_VAR);
1651
1652     aSig = extractFloat32Frac( a );
1653     aExp = extractFloat32Exp( a );
1654     aSign = extractFloat32Sign( a );
1655     if ( aExp == 0xFF ) {
1656         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1657         return packFloat64( aSign, 0x7FF, 0 );
1658     }
1659     if ( aExp == 0 ) {
1660         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1661         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1662         --aExp;
1663     }
1664     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1665
1666 }
1667
1668 /*----------------------------------------------------------------------------
1669 | Returns the result of converting the single-precision floating-point value
1670 | `a' to the extended double-precision floating-point format.  The conversion
1671 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1672 | Arithmetic.
1673 *----------------------------------------------------------------------------*/
1674
1675 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1676 {
1677     flag aSign;
1678     int_fast16_t aExp;
1679     uint32_t aSig;
1680
1681     a = float32_squash_input_denormal(a STATUS_VAR);
1682     aSig = extractFloat32Frac( a );
1683     aExp = extractFloat32Exp( a );
1684     aSign = extractFloat32Sign( a );
1685     if ( aExp == 0xFF ) {
1686         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1687         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1688     }
1689     if ( aExp == 0 ) {
1690         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1691         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1692     }
1693     aSig |= 0x00800000;
1694     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1695
1696 }
1697
1698 /*----------------------------------------------------------------------------
1699 | Returns the result of converting the single-precision floating-point value
1700 | `a' to the double-precision floating-point format.  The conversion is
1701 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1702 | Arithmetic.
1703 *----------------------------------------------------------------------------*/
1704
1705 float128 float32_to_float128( float32 a STATUS_PARAM )
1706 {
1707     flag aSign;
1708     int_fast16_t aExp;
1709     uint32_t aSig;
1710
1711     a = float32_squash_input_denormal(a STATUS_VAR);
1712     aSig = extractFloat32Frac( a );
1713     aExp = extractFloat32Exp( a );
1714     aSign = extractFloat32Sign( a );
1715     if ( aExp == 0xFF ) {
1716         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1717         return packFloat128( aSign, 0x7FFF, 0, 0 );
1718     }
1719     if ( aExp == 0 ) {
1720         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1721         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1722         --aExp;
1723     }
1724     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1725
1726 }
1727
1728 /*----------------------------------------------------------------------------
1729 | Rounds the single-precision floating-point value `a' to an integer, and
1730 | returns the result as a single-precision floating-point value.  The
1731 | operation is performed according to the IEC/IEEE Standard for Binary
1732 | Floating-Point Arithmetic.
1733 *----------------------------------------------------------------------------*/
1734
1735 float32 float32_round_to_int( float32 a STATUS_PARAM)
1736 {
1737     flag aSign;
1738     int_fast16_t aExp;
1739     uint32_t lastBitMask, roundBitsMask;
1740     int8 roundingMode;
1741     uint32_t z;
1742     a = float32_squash_input_denormal(a STATUS_VAR);
1743
1744     aExp = extractFloat32Exp( a );
1745     if ( 0x96 <= aExp ) {
1746         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1747             return propagateFloat32NaN( a, a STATUS_VAR );
1748         }
1749         return a;
1750     }
1751     if ( aExp <= 0x7E ) {
1752         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1753         STATUS(float_exception_flags) |= float_flag_inexact;
1754         aSign = extractFloat32Sign( a );
1755         switch ( STATUS(float_rounding_mode) ) {
1756          case float_round_nearest_even:
1757             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1758                 return packFloat32( aSign, 0x7F, 0 );
1759             }
1760             break;
1761          case float_round_down:
1762             return make_float32(aSign ? 0xBF800000 : 0);
1763          case float_round_up:
1764             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1765         }
1766         return packFloat32( aSign, 0, 0 );
1767     }
1768     lastBitMask = 1;
1769     lastBitMask <<= 0x96 - aExp;
1770     roundBitsMask = lastBitMask - 1;
1771     z = float32_val(a);
1772     roundingMode = STATUS(float_rounding_mode);
1773     if ( roundingMode == float_round_nearest_even ) {
1774         z += lastBitMask>>1;
1775         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1776     }
1777     else if ( roundingMode != float_round_to_zero ) {
1778         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1779             z += roundBitsMask;
1780         }
1781     }
1782     z &= ~ roundBitsMask;
1783     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1784     return make_float32(z);
1785
1786 }
1787
1788 /*----------------------------------------------------------------------------
1789 | Returns the result of adding the absolute values of the single-precision
1790 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1791 | before being returned.  `zSign' is ignored if the result is a NaN.
1792 | The addition is performed according to the IEC/IEEE Standard for Binary
1793 | Floating-Point Arithmetic.
1794 *----------------------------------------------------------------------------*/
1795
1796 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1797 {
1798     int_fast16_t aExp, bExp, zExp;
1799     uint32_t aSig, bSig, zSig;
1800     int_fast16_t expDiff;
1801
1802     aSig = extractFloat32Frac( a );
1803     aExp = extractFloat32Exp( a );
1804     bSig = extractFloat32Frac( b );
1805     bExp = extractFloat32Exp( b );
1806     expDiff = aExp - bExp;
1807     aSig <<= 6;
1808     bSig <<= 6;
1809     if ( 0 < expDiff ) {
1810         if ( aExp == 0xFF ) {
1811             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1812             return a;
1813         }
1814         if ( bExp == 0 ) {
1815             --expDiff;
1816         }
1817         else {
1818             bSig |= 0x20000000;
1819         }
1820         shift32RightJamming( bSig, expDiff, &bSig );
1821         zExp = aExp;
1822     }
1823     else if ( expDiff < 0 ) {
1824         if ( bExp == 0xFF ) {
1825             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1826             return packFloat32( zSign, 0xFF, 0 );
1827         }
1828         if ( aExp == 0 ) {
1829             ++expDiff;
1830         }
1831         else {
1832             aSig |= 0x20000000;
1833         }
1834         shift32RightJamming( aSig, - expDiff, &aSig );
1835         zExp = bExp;
1836     }
1837     else {
1838         if ( aExp == 0xFF ) {
1839             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1840             return a;
1841         }
1842         if ( aExp == 0 ) {
1843             if (STATUS(flush_to_zero)) {
1844                 if (aSig | bSig) {
1845                     float_raise(float_flag_output_denormal STATUS_VAR);
1846                 }
1847                 return packFloat32(zSign, 0, 0);
1848             }
1849             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1850         }
1851         zSig = 0x40000000 + aSig + bSig;
1852         zExp = aExp;
1853         goto roundAndPack;
1854     }
1855     aSig |= 0x20000000;
1856     zSig = ( aSig + bSig )<<1;
1857     --zExp;
1858     if ( (int32_t) zSig < 0 ) {
1859         zSig = aSig + bSig;
1860         ++zExp;
1861     }
1862  roundAndPack:
1863     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1864
1865 }
1866
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of subtracting the absolute values of the single-
1869 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1870 | difference is negated before being returned.  `zSign' is ignored if the
1871 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1872 | Standard for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1874
1875 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1876 {
1877     int_fast16_t aExp, bExp, zExp;
1878     uint32_t aSig, bSig, zSig;
1879     int_fast16_t expDiff;
1880
1881     aSig = extractFloat32Frac( a );
1882     aExp = extractFloat32Exp( a );
1883     bSig = extractFloat32Frac( b );
1884     bExp = extractFloat32Exp( b );
1885     expDiff = aExp - bExp;
1886     aSig <<= 7;
1887     bSig <<= 7;
1888     if ( 0 < expDiff ) goto aExpBigger;
1889     if ( expDiff < 0 ) goto bExpBigger;
1890     if ( aExp == 0xFF ) {
1891         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1892         float_raise( float_flag_invalid STATUS_VAR);
1893         return float32_default_nan;
1894     }
1895     if ( aExp == 0 ) {
1896         aExp = 1;
1897         bExp = 1;
1898     }
1899     if ( bSig < aSig ) goto aBigger;
1900     if ( aSig < bSig ) goto bBigger;
1901     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1902  bExpBigger:
1903     if ( bExp == 0xFF ) {
1904         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1905         return packFloat32( zSign ^ 1, 0xFF, 0 );
1906     }
1907     if ( aExp == 0 ) {
1908         ++expDiff;
1909     }
1910     else {
1911         aSig |= 0x40000000;
1912     }
1913     shift32RightJamming( aSig, - expDiff, &aSig );
1914     bSig |= 0x40000000;
1915  bBigger:
1916     zSig = bSig - aSig;
1917     zExp = bExp;
1918     zSign ^= 1;
1919     goto normalizeRoundAndPack;
1920  aExpBigger:
1921     if ( aExp == 0xFF ) {
1922         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1923         return a;
1924     }
1925     if ( bExp == 0 ) {
1926         --expDiff;
1927     }
1928     else {
1929         bSig |= 0x40000000;
1930     }
1931     shift32RightJamming( bSig, expDiff, &bSig );
1932     aSig |= 0x40000000;
1933  aBigger:
1934     zSig = aSig - bSig;
1935     zExp = aExp;
1936  normalizeRoundAndPack:
1937     --zExp;
1938     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1939
1940 }
1941
1942 /*----------------------------------------------------------------------------
1943 | Returns the result of adding the single-precision floating-point values `a'
1944 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1945 | Binary Floating-Point Arithmetic.
1946 *----------------------------------------------------------------------------*/
1947
1948 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1949 {
1950     flag aSign, bSign;
1951     a = float32_squash_input_denormal(a STATUS_VAR);
1952     b = float32_squash_input_denormal(b STATUS_VAR);
1953
1954     aSign = extractFloat32Sign( a );
1955     bSign = extractFloat32Sign( b );
1956     if ( aSign == bSign ) {
1957         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1958     }
1959     else {
1960         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1961     }
1962
1963 }
1964
1965 /*----------------------------------------------------------------------------
1966 | Returns the result of subtracting the single-precision floating-point values
1967 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1968 | for Binary Floating-Point Arithmetic.
1969 *----------------------------------------------------------------------------*/
1970
1971 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1972 {
1973     flag aSign, bSign;
1974     a = float32_squash_input_denormal(a STATUS_VAR);
1975     b = float32_squash_input_denormal(b STATUS_VAR);
1976
1977     aSign = extractFloat32Sign( a );
1978     bSign = extractFloat32Sign( b );
1979     if ( aSign == bSign ) {
1980         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1981     }
1982     else {
1983         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1984     }
1985
1986 }
1987
1988 /*----------------------------------------------------------------------------
1989 | Returns the result of multiplying the single-precision floating-point values
1990 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1991 | for Binary Floating-Point Arithmetic.
1992 *----------------------------------------------------------------------------*/
1993
1994 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1995 {
1996     flag aSign, bSign, zSign;
1997     int_fast16_t aExp, bExp, zExp;
1998     uint32_t aSig, bSig;
1999     uint64_t zSig64;
2000     uint32_t zSig;
2001
2002     a = float32_squash_input_denormal(a STATUS_VAR);
2003     b = float32_squash_input_denormal(b STATUS_VAR);
2004
2005     aSig = extractFloat32Frac( a );
2006     aExp = extractFloat32Exp( a );
2007     aSign = extractFloat32Sign( a );
2008     bSig = extractFloat32Frac( b );
2009     bExp = extractFloat32Exp( b );
2010     bSign = extractFloat32Sign( b );
2011     zSign = aSign ^ bSign;
2012     if ( aExp == 0xFF ) {
2013         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2014             return propagateFloat32NaN( a, b STATUS_VAR );
2015         }
2016         if ( ( bExp | bSig ) == 0 ) {
2017             float_raise( float_flag_invalid STATUS_VAR);
2018             return float32_default_nan;
2019         }
2020         return packFloat32( zSign, 0xFF, 0 );
2021     }
2022     if ( bExp == 0xFF ) {
2023         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2024         if ( ( aExp | aSig ) == 0 ) {
2025             float_raise( float_flag_invalid STATUS_VAR);
2026             return float32_default_nan;
2027         }
2028         return packFloat32( zSign, 0xFF, 0 );
2029     }
2030     if ( aExp == 0 ) {
2031         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2032         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2033     }
2034     if ( bExp == 0 ) {
2035         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2036         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2037     }
2038     zExp = aExp + bExp - 0x7F;
2039     aSig = ( aSig | 0x00800000 )<<7;
2040     bSig = ( bSig | 0x00800000 )<<8;
2041     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2042     zSig = zSig64;
2043     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2044         zSig <<= 1;
2045         --zExp;
2046     }
2047     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2048
2049 }
2050
2051 /*----------------------------------------------------------------------------
2052 | Returns the result of dividing the single-precision floating-point value `a'
2053 | by the corresponding value `b'.  The operation is performed according to the
2054 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2055 *----------------------------------------------------------------------------*/
2056
2057 float32 float32_div( float32 a, float32 b STATUS_PARAM )
2058 {
2059     flag aSign, bSign, zSign;
2060     int_fast16_t aExp, bExp, zExp;
2061     uint32_t aSig, bSig, zSig;
2062     a = float32_squash_input_denormal(a STATUS_VAR);
2063     b = float32_squash_input_denormal(b STATUS_VAR);
2064
2065     aSig = extractFloat32Frac( a );
2066     aExp = extractFloat32Exp( a );
2067     aSign = extractFloat32Sign( a );
2068     bSig = extractFloat32Frac( b );
2069     bExp = extractFloat32Exp( b );
2070     bSign = extractFloat32Sign( b );
2071     zSign = aSign ^ bSign;
2072     if ( aExp == 0xFF ) {
2073         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2074         if ( bExp == 0xFF ) {
2075             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2076             float_raise( float_flag_invalid STATUS_VAR);
2077             return float32_default_nan;
2078         }
2079         return packFloat32( zSign, 0xFF, 0 );
2080     }
2081     if ( bExp == 0xFF ) {
2082         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2083         return packFloat32( zSign, 0, 0 );
2084     }
2085     if ( bExp == 0 ) {
2086         if ( bSig == 0 ) {
2087             if ( ( aExp | aSig ) == 0 ) {
2088                 float_raise( float_flag_invalid STATUS_VAR);
2089                 return float32_default_nan;
2090             }
2091             float_raise( float_flag_divbyzero STATUS_VAR);
2092             return packFloat32( zSign, 0xFF, 0 );
2093         }
2094         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2095     }
2096     if ( aExp == 0 ) {
2097         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2098         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2099     }
2100     zExp = aExp - bExp + 0x7D;
2101     aSig = ( aSig | 0x00800000 )<<7;
2102     bSig = ( bSig | 0x00800000 )<<8;
2103     if ( bSig <= ( aSig + aSig ) ) {
2104         aSig >>= 1;
2105         ++zExp;
2106     }
2107     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2108     if ( ( zSig & 0x3F ) == 0 ) {
2109         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2110     }
2111     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2112
2113 }
2114
2115 /*----------------------------------------------------------------------------
2116 | Returns the remainder of the single-precision floating-point value `a'
2117 | with respect to the corresponding value `b'.  The operation is performed
2118 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2119 *----------------------------------------------------------------------------*/
2120
2121 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2122 {
2123     flag aSign, zSign;
2124     int_fast16_t aExp, bExp, expDiff;
2125     uint32_t aSig, bSig;
2126     uint32_t q;
2127     uint64_t aSig64, bSig64, q64;
2128     uint32_t alternateASig;
2129     int32_t sigMean;
2130     a = float32_squash_input_denormal(a STATUS_VAR);
2131     b = float32_squash_input_denormal(b STATUS_VAR);
2132
2133     aSig = extractFloat32Frac( a );
2134     aExp = extractFloat32Exp( a );
2135     aSign = extractFloat32Sign( a );
2136     bSig = extractFloat32Frac( b );
2137     bExp = extractFloat32Exp( b );
2138     if ( aExp == 0xFF ) {
2139         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2140             return propagateFloat32NaN( a, b STATUS_VAR );
2141         }
2142         float_raise( float_flag_invalid STATUS_VAR);
2143         return float32_default_nan;
2144     }
2145     if ( bExp == 0xFF ) {
2146         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2147         return a;
2148     }
2149     if ( bExp == 0 ) {
2150         if ( bSig == 0 ) {
2151             float_raise( float_flag_invalid STATUS_VAR);
2152             return float32_default_nan;
2153         }
2154         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2155     }
2156     if ( aExp == 0 ) {
2157         if ( aSig == 0 ) return a;
2158         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2159     }
2160     expDiff = aExp - bExp;
2161     aSig |= 0x00800000;
2162     bSig |= 0x00800000;
2163     if ( expDiff < 32 ) {
2164         aSig <<= 8;
2165         bSig <<= 8;
2166         if ( expDiff < 0 ) {
2167             if ( expDiff < -1 ) return a;
2168             aSig >>= 1;
2169         }
2170         q = ( bSig <= aSig );
2171         if ( q ) aSig -= bSig;
2172         if ( 0 < expDiff ) {
2173             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2174             q >>= 32 - expDiff;
2175             bSig >>= 2;
2176             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2177         }
2178         else {
2179             aSig >>= 2;
2180             bSig >>= 2;
2181         }
2182     }
2183     else {
2184         if ( bSig <= aSig ) aSig -= bSig;
2185         aSig64 = ( (uint64_t) aSig )<<40;
2186         bSig64 = ( (uint64_t) bSig )<<40;
2187         expDiff -= 64;
2188         while ( 0 < expDiff ) {
2189             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2190             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2191             aSig64 = - ( ( bSig * q64 )<<38 );
2192             expDiff -= 62;
2193         }
2194         expDiff += 64;
2195         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2196         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2197         q = q64>>( 64 - expDiff );
2198         bSig <<= 6;
2199         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2200     }
2201     do {
2202         alternateASig = aSig;
2203         ++q;
2204         aSig -= bSig;
2205     } while ( 0 <= (int32_t) aSig );
2206     sigMean = aSig + alternateASig;
2207     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2208         aSig = alternateASig;
2209     }
2210     zSign = ( (int32_t) aSig < 0 );
2211     if ( zSign ) aSig = - aSig;
2212     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2213
2214 }
2215
2216 /*----------------------------------------------------------------------------
2217 | Returns the result of multiplying the single-precision floating-point values
2218 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2219 | multiplication.  The operation is performed according to the IEC/IEEE
2220 | Standard for Binary Floating-Point Arithmetic 754-2008.
2221 | The flags argument allows the caller to select negation of the
2222 | addend, the intermediate product, or the final result. (The difference
2223 | between this and having the caller do a separate negation is that negating
2224 | externally will flip the sign bit on NaNs.)
2225 *----------------------------------------------------------------------------*/
2226
2227 float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2228 {
2229     flag aSign, bSign, cSign, zSign;
2230     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2231     uint32_t aSig, bSig, cSig;
2232     flag pInf, pZero, pSign;
2233     uint64_t pSig64, cSig64, zSig64;
2234     uint32_t pSig;
2235     int shiftcount;
2236     flag signflip, infzero;
2237
2238     a = float32_squash_input_denormal(a STATUS_VAR);
2239     b = float32_squash_input_denormal(b STATUS_VAR);
2240     c = float32_squash_input_denormal(c STATUS_VAR);
2241     aSig = extractFloat32Frac(a);
2242     aExp = extractFloat32Exp(a);
2243     aSign = extractFloat32Sign(a);
2244     bSig = extractFloat32Frac(b);
2245     bExp = extractFloat32Exp(b);
2246     bSign = extractFloat32Sign(b);
2247     cSig = extractFloat32Frac(c);
2248     cExp = extractFloat32Exp(c);
2249     cSign = extractFloat32Sign(c);
2250
2251     infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2252                (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2253
2254     /* It is implementation-defined whether the cases of (0,inf,qnan)
2255      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2256      * they return if they do), so we have to hand this information
2257      * off to the target-specific pick-a-NaN routine.
2258      */
2259     if (((aExp == 0xff) && aSig) ||
2260         ((bExp == 0xff) && bSig) ||
2261         ((cExp == 0xff) && cSig)) {
2262         return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2263     }
2264
2265     if (infzero) {
2266         float_raise(float_flag_invalid STATUS_VAR);
2267         return float32_default_nan;
2268     }
2269
2270     if (flags & float_muladd_negate_c) {
2271         cSign ^= 1;
2272     }
2273
2274     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2275
2276     /* Work out the sign and type of the product */
2277     pSign = aSign ^ bSign;
2278     if (flags & float_muladd_negate_product) {
2279         pSign ^= 1;
2280     }
2281     pInf = (aExp == 0xff) || (bExp == 0xff);
2282     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2283
2284     if (cExp == 0xff) {
2285         if (pInf && (pSign ^ cSign)) {
2286             /* addition of opposite-signed infinities => InvalidOperation */
2287             float_raise(float_flag_invalid STATUS_VAR);
2288             return float32_default_nan;
2289         }
2290         /* Otherwise generate an infinity of the same sign */
2291         return packFloat32(cSign ^ signflip, 0xff, 0);
2292     }
2293
2294     if (pInf) {
2295         return packFloat32(pSign ^ signflip, 0xff, 0);
2296     }
2297
2298     if (pZero) {
2299         if (cExp == 0) {
2300             if (cSig == 0) {
2301                 /* Adding two exact zeroes */
2302                 if (pSign == cSign) {
2303                     zSign = pSign;
2304                 } else if (STATUS(float_rounding_mode) == float_round_down) {
2305                     zSign = 1;
2306                 } else {
2307                     zSign = 0;
2308                 }
2309                 return packFloat32(zSign ^ signflip, 0, 0);
2310             }
2311             /* Exact zero plus a denorm */
2312             if (STATUS(flush_to_zero)) {
2313                 float_raise(float_flag_output_denormal STATUS_VAR);
2314                 return packFloat32(cSign ^ signflip, 0, 0);
2315             }
2316         }
2317         /* Zero plus something non-zero : just return the something */
2318         return packFloat32(cSign ^ signflip, cExp, cSig);
2319     }
2320
2321     if (aExp == 0) {
2322         normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2323     }
2324     if (bExp == 0) {
2325         normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2326     }
2327
2328     /* Calculate the actual result a * b + c */
2329
2330     /* Multiply first; this is easy. */
2331     /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2332      * because we want the true exponent, not the "one-less-than"
2333      * flavour that roundAndPackFloat32() takes.
2334      */
2335     pExp = aExp + bExp - 0x7e;
2336     aSig = (aSig | 0x00800000) << 7;
2337     bSig = (bSig | 0x00800000) << 8;
2338     pSig64 = (uint64_t)aSig * bSig;
2339     if ((int64_t)(pSig64 << 1) >= 0) {
2340         pSig64 <<= 1;
2341         pExp--;
2342     }
2343
2344     zSign = pSign ^ signflip;
2345
2346     /* Now pSig64 is the significand of the multiply, with the explicit bit in
2347      * position 62.
2348      */
2349     if (cExp == 0) {
2350         if (!cSig) {
2351             /* Throw out the special case of c being an exact zero now */
2352             shift64RightJamming(pSig64, 32, &pSig64);
2353             pSig = pSig64;
2354             return roundAndPackFloat32(zSign, pExp - 1,
2355                                        pSig STATUS_VAR);
2356         }
2357         normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2358     }
2359
2360     cSig64 = (uint64_t)cSig << (62 - 23);
2361     cSig64 |= LIT64(0x4000000000000000);
2362     expDiff = pExp - cExp;
2363
2364     if (pSign == cSign) {
2365         /* Addition */
2366         if (expDiff > 0) {
2367             /* scale c to match p */
2368             shift64RightJamming(cSig64, expDiff, &cSig64);
2369             zExp = pExp;
2370         } else if (expDiff < 0) {
2371             /* scale p to match c */
2372             shift64RightJamming(pSig64, -expDiff, &pSig64);
2373             zExp = cExp;
2374         } else {
2375             /* no scaling needed */
2376             zExp = cExp;
2377         }
2378         /* Add significands and make sure explicit bit ends up in posn 62 */
2379         zSig64 = pSig64 + cSig64;
2380         if ((int64_t)zSig64 < 0) {
2381             shift64RightJamming(zSig64, 1, &zSig64);
2382         } else {
2383             zExp--;
2384         }
2385     } else {
2386         /* Subtraction */
2387         if (expDiff > 0) {
2388             shift64RightJamming(cSig64, expDiff, &cSig64);
2389             zSig64 = pSig64 - cSig64;
2390             zExp = pExp;
2391         } else if (expDiff < 0) {
2392             shift64RightJamming(pSig64, -expDiff, &pSig64);
2393             zSig64 = cSig64 - pSig64;
2394             zExp = cExp;
2395             zSign ^= 1;
2396         } else {
2397             zExp = pExp;
2398             if (cSig64 < pSig64) {
2399                 zSig64 = pSig64 - cSig64;
2400             } else if (pSig64 < cSig64) {
2401                 zSig64 = cSig64 - pSig64;
2402                 zSign ^= 1;
2403             } else {
2404                 /* Exact zero */
2405                 zSign = signflip;
2406                 if (STATUS(float_rounding_mode) == float_round_down) {
2407                     zSign ^= 1;
2408                 }
2409                 return packFloat32(zSign, 0, 0);
2410             }
2411         }
2412         --zExp;
2413         /* Normalize to put the explicit bit back into bit 62. */
2414         shiftcount = countLeadingZeros64(zSig64) - 1;
2415         zSig64 <<= shiftcount;
2416         zExp -= shiftcount;
2417     }
2418     shift64RightJamming(zSig64, 32, &zSig64);
2419     return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2420 }
2421
2422
2423 /*----------------------------------------------------------------------------
2424 | Returns the square root of the single-precision floating-point value `a'.
2425 | The operation is performed according to the IEC/IEEE Standard for Binary
2426 | Floating-Point Arithmetic.
2427 *----------------------------------------------------------------------------*/
2428
2429 float32 float32_sqrt( float32 a STATUS_PARAM )
2430 {
2431     flag aSign;
2432     int_fast16_t aExp, zExp;
2433     uint32_t aSig, zSig;
2434     uint64_t rem, term;
2435     a = float32_squash_input_denormal(a STATUS_VAR);
2436
2437     aSig = extractFloat32Frac( a );
2438     aExp = extractFloat32Exp( a );
2439     aSign = extractFloat32Sign( a );
2440     if ( aExp == 0xFF ) {
2441         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2442         if ( ! aSign ) return a;
2443         float_raise( float_flag_invalid STATUS_VAR);
2444         return float32_default_nan;
2445     }
2446     if ( aSign ) {
2447         if ( ( aExp | aSig ) == 0 ) return a;
2448         float_raise( float_flag_invalid STATUS_VAR);
2449         return float32_default_nan;
2450     }
2451     if ( aExp == 0 ) {
2452         if ( aSig == 0 ) return float32_zero;
2453         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2454     }
2455     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2456     aSig = ( aSig | 0x00800000 )<<8;
2457     zSig = estimateSqrt32( aExp, aSig ) + 2;
2458     if ( ( zSig & 0x7F ) <= 5 ) {
2459         if ( zSig < 2 ) {
2460             zSig = 0x7FFFFFFF;
2461             goto roundAndPack;
2462         }
2463         aSig >>= aExp & 1;
2464         term = ( (uint64_t) zSig ) * zSig;
2465         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2466         while ( (int64_t) rem < 0 ) {
2467             --zSig;
2468             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2469         }
2470         zSig |= ( rem != 0 );
2471     }
2472     shift32RightJamming( zSig, 1, &zSig );
2473  roundAndPack:
2474     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2475
2476 }
2477
2478 /*----------------------------------------------------------------------------
2479 | Returns the binary exponential of the single-precision floating-point value
2480 | `a'. The operation is performed according to the IEC/IEEE Standard for
2481 | Binary Floating-Point Arithmetic.
2482 |
2483 | Uses the following identities:
2484 |
2485 | 1. -------------------------------------------------------------------------
2486 |      x    x*ln(2)
2487 |     2  = e
2488 |
2489 | 2. -------------------------------------------------------------------------
2490 |                      2     3     4     5           n
2491 |      x        x     x     x     x     x           x
2492 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2493 |               1!    2!    3!    4!    5!          n!
2494 *----------------------------------------------------------------------------*/
2495
2496 static const float64 float32_exp2_coefficients[15] =
2497 {
2498     const_float64( 0x3ff0000000000000ll ), /*  1 */
2499     const_float64( 0x3fe0000000000000ll ), /*  2 */
2500     const_float64( 0x3fc5555555555555ll ), /*  3 */
2501     const_float64( 0x3fa5555555555555ll ), /*  4 */
2502     const_float64( 0x3f81111111111111ll ), /*  5 */
2503     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2504     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2505     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2506     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2507     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2508     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2509     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2510     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2511     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2512     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2513 };
2514
2515 float32 float32_exp2( float32 a STATUS_PARAM )
2516 {
2517     flag aSign;
2518     int_fast16_t aExp;
2519     uint32_t aSig;
2520     float64 r, x, xn;
2521     int i;
2522     a = float32_squash_input_denormal(a STATUS_VAR);
2523
2524     aSig = extractFloat32Frac( a );
2525     aExp = extractFloat32Exp( a );
2526     aSign = extractFloat32Sign( a );
2527
2528     if ( aExp == 0xFF) {
2529         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2530         return (aSign) ? float32_zero : a;
2531     }
2532     if (aExp == 0) {
2533         if (aSig == 0) return float32_one;
2534     }
2535
2536     float_raise( float_flag_inexact STATUS_VAR);
2537
2538     /* ******************************* */
2539     /* using float64 for approximation */
2540     /* ******************************* */
2541     x = float32_to_float64(a STATUS_VAR);
2542     x = float64_mul(x, float64_ln2 STATUS_VAR);
2543
2544     xn = x;
2545     r = float64_one;
2546     for (i = 0 ; i < 15 ; i++) {
2547         float64 f;
2548
2549         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2550         r = float64_add(r, f STATUS_VAR);
2551
2552         xn = float64_mul(xn, x STATUS_VAR);
2553     }
2554
2555     return float64_to_float32(r, status);
2556 }
2557
2558 /*----------------------------------------------------------------------------
2559 | Returns the binary log of the single-precision floating-point value `a'.
2560 | The operation is performed according to the IEC/IEEE Standard for Binary
2561 | Floating-Point Arithmetic.
2562 *----------------------------------------------------------------------------*/
2563 float32 float32_log2( float32 a STATUS_PARAM )
2564 {
2565     flag aSign, zSign;
2566     int_fast16_t aExp;
2567     uint32_t aSig, zSig, i;
2568
2569     a = float32_squash_input_denormal(a STATUS_VAR);
2570     aSig = extractFloat32Frac( a );
2571     aExp = extractFloat32Exp( a );
2572     aSign = extractFloat32Sign( a );
2573
2574     if ( aExp == 0 ) {
2575         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2576         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2577     }
2578     if ( aSign ) {
2579         float_raise( float_flag_invalid STATUS_VAR);
2580         return float32_default_nan;
2581     }
2582     if ( aExp == 0xFF ) {
2583         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2584         return a;
2585     }
2586
2587     aExp -= 0x7F;
2588     aSig |= 0x00800000;
2589     zSign = aExp < 0;
2590     zSig = aExp << 23;
2591
2592     for (i = 1 << 22; i > 0; i >>= 1) {
2593         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2594         if ( aSig & 0x01000000 ) {
2595             aSig >>= 1;
2596             zSig |= i;
2597         }
2598     }
2599
2600     if ( zSign )
2601         zSig = -zSig;
2602
2603     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2604 }
2605
2606 /*----------------------------------------------------------------------------
2607 | Returns 1 if the single-precision floating-point value `a' is equal to
2608 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2609 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611 *----------------------------------------------------------------------------*/
2612
2613 int float32_eq( float32 a, float32 b STATUS_PARAM )
2614 {
2615     uint32_t av, bv;
2616     a = float32_squash_input_denormal(a STATUS_VAR);
2617     b = float32_squash_input_denormal(b STATUS_VAR);
2618
2619     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2620          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2621        ) {
2622         float_raise( float_flag_invalid STATUS_VAR);
2623         return 0;
2624     }
2625     av = float32_val(a);
2626     bv = float32_val(b);
2627     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2628 }
2629
2630 /*----------------------------------------------------------------------------
2631 | Returns 1 if the single-precision floating-point value `a' is less than
2632 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2633 | exception is raised if either operand is a NaN.  The comparison is performed
2634 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2635 *----------------------------------------------------------------------------*/
2636
2637 int float32_le( float32 a, float32 b STATUS_PARAM )
2638 {
2639     flag aSign, bSign;
2640     uint32_t av, bv;
2641     a = float32_squash_input_denormal(a STATUS_VAR);
2642     b = float32_squash_input_denormal(b STATUS_VAR);
2643
2644     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2645          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2646        ) {
2647         float_raise( float_flag_invalid STATUS_VAR);
2648         return 0;
2649     }
2650     aSign = extractFloat32Sign( a );
2651     bSign = extractFloat32Sign( b );
2652     av = float32_val(a);
2653     bv = float32_val(b);
2654     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2655     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2656
2657 }
2658
2659 /*----------------------------------------------------------------------------
2660 | Returns 1 if the single-precision floating-point value `a' is less than
2661 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2662 | raised if either operand is a NaN.  The comparison is performed according
2663 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2664 *----------------------------------------------------------------------------*/
2665
2666 int float32_lt( float32 a, float32 b STATUS_PARAM )
2667 {
2668     flag aSign, bSign;
2669     uint32_t av, bv;
2670     a = float32_squash_input_denormal(a STATUS_VAR);
2671     b = float32_squash_input_denormal(b STATUS_VAR);
2672
2673     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2674          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2675        ) {
2676         float_raise( float_flag_invalid STATUS_VAR);
2677         return 0;
2678     }
2679     aSign = extractFloat32Sign( a );
2680     bSign = extractFloat32Sign( b );
2681     av = float32_val(a);
2682     bv = float32_val(b);
2683     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2684     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2685
2686 }
2687
2688 /*----------------------------------------------------------------------------
2689 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2690 | be compared, and 0 otherwise.  The invalid exception is raised if either
2691 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2692 | Standard for Binary Floating-Point Arithmetic.
2693 *----------------------------------------------------------------------------*/
2694
2695 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2696 {
2697     a = float32_squash_input_denormal(a STATUS_VAR);
2698     b = float32_squash_input_denormal(b STATUS_VAR);
2699
2700     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2701          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2702        ) {
2703         float_raise( float_flag_invalid STATUS_VAR);
2704         return 1;
2705     }
2706     return 0;
2707 }
2708
2709 /*----------------------------------------------------------------------------
2710 | Returns 1 if the single-precision floating-point value `a' is equal to
2711 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2712 | exception.  The comparison is performed according to the IEC/IEEE Standard
2713 | for Binary Floating-Point Arithmetic.
2714 *----------------------------------------------------------------------------*/
2715
2716 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2717 {
2718     a = float32_squash_input_denormal(a STATUS_VAR);
2719     b = float32_squash_input_denormal(b STATUS_VAR);
2720
2721     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2722          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2723        ) {
2724         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2725             float_raise( float_flag_invalid STATUS_VAR);
2726         }
2727         return 0;
2728     }
2729     return ( float32_val(a) == float32_val(b) ) ||
2730             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2731 }
2732
2733 /*----------------------------------------------------------------------------
2734 | Returns 1 if the single-precision floating-point value `a' is less than or
2735 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2736 | cause an exception.  Otherwise, the comparison is performed according to the
2737 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2738 *----------------------------------------------------------------------------*/
2739
2740 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2741 {
2742     flag aSign, bSign;
2743     uint32_t av, bv;
2744     a = float32_squash_input_denormal(a STATUS_VAR);
2745     b = float32_squash_input_denormal(b STATUS_VAR);
2746
2747     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2748          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2749        ) {
2750         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2751             float_raise( float_flag_invalid STATUS_VAR);
2752         }
2753         return 0;
2754     }
2755     aSign = extractFloat32Sign( a );
2756     bSign = extractFloat32Sign( b );
2757     av = float32_val(a);
2758     bv = float32_val(b);
2759     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2760     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2761
2762 }
2763
2764 /*----------------------------------------------------------------------------
2765 | Returns 1 if the single-precision floating-point value `a' is less than
2766 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2767 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2768 | Standard for Binary Floating-Point Arithmetic.
2769 *----------------------------------------------------------------------------*/
2770
2771 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2772 {
2773     flag aSign, bSign;
2774     uint32_t av, bv;
2775     a = float32_squash_input_denormal(a STATUS_VAR);
2776     b = float32_squash_input_denormal(b STATUS_VAR);
2777
2778     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2779          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2780        ) {
2781         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2782             float_raise( float_flag_invalid STATUS_VAR);
2783         }
2784         return 0;
2785     }
2786     aSign = extractFloat32Sign( a );
2787     bSign = extractFloat32Sign( b );
2788     av = float32_val(a);
2789     bv = float32_val(b);
2790     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2791     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2792
2793 }
2794
2795 /*----------------------------------------------------------------------------
2796 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2797 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2798 | comparison is performed according to the IEC/IEEE Standard for Binary
2799 | Floating-Point Arithmetic.
2800 *----------------------------------------------------------------------------*/
2801
2802 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2803 {
2804     a = float32_squash_input_denormal(a STATUS_VAR);
2805     b = float32_squash_input_denormal(b STATUS_VAR);
2806
2807     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2808          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2809        ) {
2810         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2811             float_raise( float_flag_invalid STATUS_VAR);
2812         }
2813         return 1;
2814     }
2815     return 0;
2816 }
2817
2818 /*----------------------------------------------------------------------------
2819 | Returns the result of converting the double-precision floating-point value
2820 | `a' to the 32-bit two's complement integer format.  The conversion is
2821 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2822 | Arithmetic---which means in particular that the conversion is rounded
2823 | according to the current rounding mode.  If `a' is a NaN, the largest
2824 | positive integer is returned.  Otherwise, if the conversion overflows, the
2825 | largest integer with the same sign as `a' is returned.
2826 *----------------------------------------------------------------------------*/
2827
2828 int32 float64_to_int32( float64 a STATUS_PARAM )
2829 {
2830     flag aSign;
2831     int_fast16_t aExp, shiftCount;
2832     uint64_t aSig;
2833     a = float64_squash_input_denormal(a STATUS_VAR);
2834
2835     aSig = extractFloat64Frac( a );
2836     aExp = extractFloat64Exp( a );
2837     aSign = extractFloat64Sign( a );
2838     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2839     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2840     shiftCount = 0x42C - aExp;
2841     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2842     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2843
2844 }
2845
2846 /*----------------------------------------------------------------------------
2847 | Returns the result of converting the double-precision floating-point value
2848 | `a' to the 32-bit two's complement integer format.  The conversion is
2849 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2850 | Arithmetic, except that the conversion is always rounded toward zero.
2851 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2852 | the conversion overflows, the largest integer with the same sign as `a' is
2853 | returned.
2854 *----------------------------------------------------------------------------*/
2855
2856 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2857 {
2858     flag aSign;
2859     int_fast16_t aExp, shiftCount;
2860     uint64_t aSig, savedASig;
2861     int32_t z;
2862     a = float64_squash_input_denormal(a STATUS_VAR);
2863
2864     aSig = extractFloat64Frac( a );
2865     aExp = extractFloat64Exp( a );
2866     aSign = extractFloat64Sign( a );
2867     if ( 0x41E < aExp ) {
2868         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2869         goto invalid;
2870     }
2871     else if ( aExp < 0x3FF ) {
2872         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2873         return 0;
2874     }
2875     aSig |= LIT64( 0x0010000000000000 );
2876     shiftCount = 0x433 - aExp;
2877     savedASig = aSig;
2878     aSig >>= shiftCount;
2879     z = aSig;
2880     if ( aSign ) z = - z;
2881     if ( ( z < 0 ) ^ aSign ) {
2882  invalid:
2883         float_raise( float_flag_invalid STATUS_VAR);
2884         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2885     }
2886     if ( ( aSig<<shiftCount ) != savedASig ) {
2887         STATUS(float_exception_flags) |= float_flag_inexact;
2888     }
2889     return z;
2890
2891 }
2892
2893 /*----------------------------------------------------------------------------
2894 | Returns the result of converting the double-precision floating-point value
2895 | `a' to the 16-bit two's complement integer format.  The conversion is
2896 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2897 | Arithmetic, except that the conversion is always rounded toward zero.
2898 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2899 | the conversion overflows, the largest integer with the same sign as `a' is
2900 | returned.
2901 *----------------------------------------------------------------------------*/
2902
2903 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
2904 {
2905     flag aSign;
2906     int_fast16_t aExp, shiftCount;
2907     uint64_t aSig, savedASig;
2908     int32 z;
2909
2910     aSig = extractFloat64Frac( a );
2911     aExp = extractFloat64Exp( a );
2912     aSign = extractFloat64Sign( a );
2913     if ( 0x40E < aExp ) {
2914         if ( ( aExp == 0x7FF ) && aSig ) {
2915             aSign = 0;
2916         }
2917         goto invalid;
2918     }
2919     else if ( aExp < 0x3FF ) {
2920         if ( aExp || aSig ) {
2921             STATUS(float_exception_flags) |= float_flag_inexact;
2922         }
2923         return 0;
2924     }
2925     aSig |= LIT64( 0x0010000000000000 );
2926     shiftCount = 0x433 - aExp;
2927     savedASig = aSig;
2928     aSig >>= shiftCount;
2929     z = aSig;
2930     if ( aSign ) {
2931         z = - z;
2932     }
2933     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2934  invalid:
2935         float_raise( float_flag_invalid STATUS_VAR);
2936         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2937     }
2938     if ( ( aSig<<shiftCount ) != savedASig ) {
2939         STATUS(float_exception_flags) |= float_flag_inexact;
2940     }
2941     return z;
2942 }
2943
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the 64-bit two's complement integer format.  The conversion is
2947 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2948 | Arithmetic---which means in particular that the conversion is rounded
2949 | according to the current rounding mode.  If `a' is a NaN, the largest
2950 | positive integer is returned.  Otherwise, if the conversion overflows, the
2951 | largest integer with the same sign as `a' is returned.
2952 *----------------------------------------------------------------------------*/
2953
2954 int64 float64_to_int64( float64 a STATUS_PARAM )
2955 {
2956     flag aSign;
2957     int_fast16_t aExp, shiftCount;
2958     uint64_t aSig, aSigExtra;
2959     a = float64_squash_input_denormal(a STATUS_VAR);
2960
2961     aSig = extractFloat64Frac( a );
2962     aExp = extractFloat64Exp( a );
2963     aSign = extractFloat64Sign( a );
2964     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2965     shiftCount = 0x433 - aExp;
2966     if ( shiftCount <= 0 ) {
2967         if ( 0x43E < aExp ) {
2968             float_raise( float_flag_invalid STATUS_VAR);
2969             if (    ! aSign
2970                  || (    ( aExp == 0x7FF )
2971                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2972                ) {
2973                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2974             }
2975             return (int64_t) LIT64( 0x8000000000000000 );
2976         }
2977         aSigExtra = 0;
2978         aSig <<= - shiftCount;
2979     }
2980     else {
2981         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2982     }
2983     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2984
2985 }
2986
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of converting the double-precision floating-point value
2989 | `a' to the 64-bit two's complement integer format.  The conversion is
2990 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2991 | Arithmetic, except that the conversion is always rounded toward zero.
2992 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2993 | the conversion overflows, the largest integer with the same sign as `a' is
2994 | returned.
2995 *----------------------------------------------------------------------------*/
2996
2997 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2998 {
2999     flag aSign;
3000     int_fast16_t aExp, shiftCount;
3001     uint64_t aSig;
3002     int64 z;
3003     a = float64_squash_input_denormal(a STATUS_VAR);
3004
3005     aSig = extractFloat64Frac( a );
3006     aExp = extractFloat64Exp( a );
3007     aSign = extractFloat64Sign( a );
3008     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3009     shiftCount = aExp - 0x433;
3010     if ( 0 <= shiftCount ) {
3011         if ( 0x43E <= aExp ) {
3012             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3013                 float_raise( float_flag_invalid STATUS_VAR);
3014                 if (    ! aSign
3015                      || (    ( aExp == 0x7FF )
3016                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
3017                    ) {
3018                     return LIT64( 0x7FFFFFFFFFFFFFFF );
3019                 }
3020             }
3021             return (int64_t) LIT64( 0x8000000000000000 );
3022         }
3023         z = aSig<<shiftCount;
3024     }
3025     else {
3026         if ( aExp < 0x3FE ) {
3027             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3028             return 0;
3029         }
3030         z = aSig>>( - shiftCount );
3031         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3032             STATUS(float_exception_flags) |= float_flag_inexact;
3033         }
3034     }
3035     if ( aSign ) z = - z;
3036     return z;
3037
3038 }
3039
3040 /*----------------------------------------------------------------------------
3041 | Returns the result of converting the double-precision floating-point value
3042 | `a' to the single-precision floating-point format.  The conversion is
3043 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3044 | Arithmetic.
3045 *----------------------------------------------------------------------------*/
3046
3047 float32 float64_to_float32( float64 a STATUS_PARAM )
3048 {
3049     flag aSign;
3050     int_fast16_t aExp;
3051     uint64_t aSig;
3052     uint32_t zSig;
3053     a = float64_squash_input_denormal(a STATUS_VAR);
3054
3055     aSig = extractFloat64Frac( a );
3056     aExp = extractFloat64Exp( a );
3057     aSign = extractFloat64Sign( a );
3058     if ( aExp == 0x7FF ) {
3059         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3060         return packFloat32( aSign, 0xFF, 0 );
3061     }
3062     shift64RightJamming( aSig, 22, &aSig );
3063     zSig = aSig;
3064     if ( aExp || zSig ) {
3065         zSig |= 0x40000000;
3066         aExp -= 0x381;
3067     }
3068     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3069
3070 }
3071
3072
3073 /*----------------------------------------------------------------------------
3074 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3075 | half-precision floating-point value, returning the result.  After being
3076 | shifted into the proper positions, the three fields are simply added
3077 | together to form the result.  This means that any integer portion of `zSig'
3078 | will be added into the exponent.  Since a properly normalized significand
3079 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3080 | than the desired result exponent whenever `zSig' is a complete, normalized
3081 | significand.
3082 *----------------------------------------------------------------------------*/
3083 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3084 {
3085     return make_float16(
3086         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3087 }
3088
3089 /*----------------------------------------------------------------------------
3090 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3091 | and significand `zSig', and returns the proper half-precision floating-
3092 | point value corresponding to the abstract input.  Ordinarily, the abstract
3093 | value is simply rounded and packed into the half-precision format, with
3094 | the inexact exception raised if the abstract input cannot be represented
3095 | exactly.  However, if the abstract value is too large, the overflow and
3096 | inexact exceptions are raised and an infinity or maximal finite value is
3097 | returned.  If the abstract value is too small, the input value is rounded to
3098 | a subnormal number, and the underflow and inexact exceptions are raised if
3099 | the abstract input cannot be represented exactly as a subnormal half-
3100 | precision floating-point number.
3101 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3102 | ARM-style "alternative representation", which omits the NaN and Inf
3103 | encodings in order to raise the maximum representable exponent by one.
3104 |     The input significand `zSig' has its binary point between bits 22
3105 | and 23, which is 13 bits to the left of the usual location.  This shifted
3106 | significand must be normalized or smaller.  If `zSig' is not normalized,
3107 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3108 | and it must not require rounding.  In the usual case that `zSig' is
3109 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3110 | Note the slightly odd position of the binary point in zSig compared with the
3111 | other roundAndPackFloat functions. This should probably be fixed if we
3112 | need to implement more float16 routines than just conversion.
3113 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3114 | Binary Floating-Point Arithmetic.
3115 *----------------------------------------------------------------------------*/
3116
3117 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3118                                    uint32_t zSig, flag ieee STATUS_PARAM)
3119 {
3120     int maxexp = ieee ? 29 : 30;
3121     uint32_t mask;
3122     uint32_t increment;
3123     int8 roundingMode;
3124     bool rounding_bumps_exp;
3125     bool is_tiny = false;
3126
3127     /* Calculate the mask of bits of the mantissa which are not
3128      * representable in half-precision and will be lost.
3129      */
3130     if (zExp < 1) {
3131         /* Will be denormal in halfprec */
3132         mask = 0x00ffffff;
3133         if (zExp >= -11) {
3134             mask >>= 11 + zExp;
3135         }
3136     } else {
3137         /* Normal number in halfprec */
3138         mask = 0x00001fff;
3139     }
3140
3141     roundingMode = STATUS(float_rounding_mode);
3142     switch (roundingMode) {
3143     case float_round_nearest_even:
3144         increment = (mask + 1) >> 1;
3145         if ((zSig & mask) == increment) {
3146             increment = zSig & (increment << 1);
3147         }
3148         break;
3149     case float_round_up:
3150         increment = zSign ? 0 : mask;
3151         break;
3152     case float_round_down:
3153         increment = zSign ? mask : 0;
3154         break;
3155     default: /* round_to_zero */
3156         increment = 0;
3157         break;
3158     }
3159
3160     rounding_bumps_exp = (zSig + increment >= 0x01000000);
3161
3162     if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3163         if (ieee) {
3164             float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3165             return packFloat16(zSign, 0x1f, 0);
3166         } else {
3167             float_raise(float_flag_invalid STATUS_VAR);
3168             return packFloat16(zSign, 0x1f, 0x3ff);
3169         }
3170     }
3171
3172     if (zExp < 0) {
3173         /* Note that flush-to-zero does not affect half-precision results */
3174         is_tiny =
3175             (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3176             || (zExp < -1)
3177             || (!rounding_bumps_exp);
3178     }
3179     if (zSig & mask) {
3180         float_raise(float_flag_inexact STATUS_VAR);
3181         if (is_tiny) {
3182             float_raise(float_flag_underflow STATUS_VAR);
3183         }
3184     }
3185
3186     zSig += increment;
3187     if (rounding_bumps_exp) {
3188         zSig >>= 1;
3189         zExp++;
3190     }
3191
3192     if (zExp < -10) {
3193         return packFloat16(zSign, 0, 0);
3194     }
3195     if (zExp < 0) {
3196         zSig >>= -zExp;
3197         zExp = 0;
3198     }
3199     return packFloat16(zSign, zExp, zSig >> 13);
3200 }
3201
3202 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3203                                       uint32_t *zSigPtr)
3204 {
3205     int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3206     *zSigPtr = aSig << shiftCount;
3207     *zExpPtr = 1 - shiftCount;
3208 }
3209
3210 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3211    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
3212
3213 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
3214 {
3215     flag aSign;
3216     int_fast16_t aExp;
3217     uint32_t aSig;
3218
3219     aSign = extractFloat16Sign(a);
3220     aExp = extractFloat16Exp(a);
3221     aSig = extractFloat16Frac(a);
3222
3223     if (aExp == 0x1f && ieee) {
3224         if (aSig) {
3225             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3226         }
3227         return packFloat32(aSign, 0xff, 0);
3228     }
3229     if (aExp == 0) {
3230         if (aSig == 0) {
3231             return packFloat32(aSign, 0, 0);
3232         }
3233
3234         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3235         aExp--;
3236     }
3237     return packFloat32( aSign, aExp + 0x70, aSig << 13);
3238 }
3239
3240 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
3241 {
3242     flag aSign;
3243     int_fast16_t aExp;
3244     uint32_t aSig;
3245
3246     a = float32_squash_input_denormal(a STATUS_VAR);
3247
3248     aSig = extractFloat32Frac( a );
3249     aExp = extractFloat32Exp( a );
3250     aSign = extractFloat32Sign( a );
3251     if ( aExp == 0xFF ) {
3252         if (aSig) {
3253             /* Input is a NaN */
3254             if (!ieee) {
3255                 float_raise(float_flag_invalid STATUS_VAR);
3256                 return packFloat16(aSign, 0, 0);
3257             }
3258             return commonNaNToFloat16(
3259                 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3260         }
3261         /* Infinity */
3262         if (!ieee) {
3263             float_raise(float_flag_invalid STATUS_VAR);
3264             return packFloat16(aSign, 0x1f, 0x3ff);
3265         }
3266         return packFloat16(aSign, 0x1f, 0);
3267     }
3268     if (aExp == 0 && aSig == 0) {
3269         return packFloat16(aSign, 0, 0);
3270     }
3271     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3272      * even if the input is denormal; however this is harmless because
3273      * the largest possible single-precision denormal is still smaller
3274      * than the smallest representable half-precision denormal, and so we
3275      * will end up ignoring aSig and returning via the "always return zero"
3276      * codepath.
3277      */
3278     aSig |= 0x00800000;
3279     aExp -= 0x71;
3280
3281     return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
3282 }
3283
3284 float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3285 {
3286     flag aSign;
3287     int_fast16_t aExp;
3288     uint32_t aSig;
3289
3290     aSign = extractFloat16Sign(a);
3291     aExp = extractFloat16Exp(a);
3292     aSig = extractFloat16Frac(a);
3293
3294     if (aExp == 0x1f && ieee) {
3295         if (aSig) {
3296             return commonNaNToFloat64(
3297                 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3298         }
3299         return packFloat64(aSign, 0x7ff, 0);
3300     }
3301     if (aExp == 0) {
3302         if (aSig == 0) {
3303             return packFloat64(aSign, 0, 0);
3304         }
3305
3306         normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3307         aExp--;
3308     }
3309     return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3310 }
3311
3312 float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3313 {
3314     flag aSign;
3315     int_fast16_t aExp;
3316     uint64_t aSig;
3317     uint32_t zSig;
3318
3319     a = float64_squash_input_denormal(a STATUS_VAR);
3320
3321     aSig = extractFloat64Frac(a);
3322     aExp = extractFloat64Exp(a);
3323     aSign = extractFloat64Sign(a);
3324     if (aExp == 0x7FF) {
3325         if (aSig) {
3326             /* Input is a NaN */
3327             if (!ieee) {
3328                 float_raise(float_flag_invalid STATUS_VAR);
3329                 return packFloat16(aSign, 0, 0);
3330             }
3331             return commonNaNToFloat16(
3332                 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3333         }
3334         /* Infinity */
3335         if (!ieee) {
3336             float_raise(float_flag_invalid STATUS_VAR);
3337             return packFloat16(aSign, 0x1f, 0x3ff);
3338         }
3339         return packFloat16(aSign, 0x1f, 0);
3340     }
3341     shift64RightJamming(aSig, 29, &aSig);
3342     zSig = aSig;
3343     if (aExp == 0 && zSig == 0) {
3344         return packFloat16(aSign, 0, 0);
3345     }
3346     /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3347      * even if the input is denormal; however this is harmless because
3348      * the largest possible single-precision denormal is still smaller
3349      * than the smallest representable half-precision denormal, and so we
3350      * will end up ignoring aSig and returning via the "always return zero"
3351      * codepath.
3352      */
3353     zSig |= 0x00800000;
3354     aExp -= 0x3F1;
3355
3356     return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3357 }
3358
3359 /*----------------------------------------------------------------------------
3360 | Returns the result of converting the double-precision floating-point value
3361 | `a' to the extended double-precision floating-point format.  The conversion
3362 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3363 | Arithmetic.
3364 *----------------------------------------------------------------------------*/
3365
3366 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3367 {
3368     flag aSign;
3369     int_fast16_t aExp;
3370     uint64_t aSig;
3371
3372     a = float64_squash_input_denormal(a STATUS_VAR);
3373     aSig = extractFloat64Frac( a );
3374     aExp = extractFloat64Exp( a );
3375     aSign = extractFloat64Sign( a );
3376     if ( aExp == 0x7FF ) {
3377         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3378         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3379     }
3380     if ( aExp == 0 ) {
3381         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3382         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3383     }
3384     return
3385         packFloatx80(
3386             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3387
3388 }
3389
3390 /*----------------------------------------------------------------------------
3391 | Returns the result of converting the double-precision floating-point value
3392 | `a' to the quadruple-precision floating-point format.  The conversion is
3393 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3394 | Arithmetic.
3395 *----------------------------------------------------------------------------*/
3396
3397 float128 float64_to_float128( float64 a STATUS_PARAM )
3398 {
3399     flag aSign;
3400     int_fast16_t aExp;
3401     uint64_t aSig, zSig0, zSig1;
3402
3403     a = float64_squash_input_denormal(a STATUS_VAR);
3404     aSig = extractFloat64Frac( a );
3405     aExp = extractFloat64Exp( a );
3406     aSign = extractFloat64Sign( a );
3407     if ( aExp == 0x7FF ) {
3408         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3409         return packFloat128( aSign, 0x7FFF, 0, 0 );
3410     }
3411     if ( aExp == 0 ) {
3412         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3413         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3414         --aExp;
3415     }
3416     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3417     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3418
3419 }
3420
3421 /*----------------------------------------------------------------------------
3422 | Rounds the double-precision floating-point value `a' to an integer, and
3423 | returns the result as a double-precision floating-point value.  The
3424 | operation is performed according to the IEC/IEEE Standard for Binary
3425 | Floating-Point Arithmetic.
3426 *----------------------------------------------------------------------------*/
3427
3428 float64 float64_round_to_int( float64 a STATUS_PARAM )
3429 {
3430     flag aSign;
3431     int_fast16_t aExp;
3432     uint64_t lastBitMask, roundBitsMask;
3433     int8 roundingMode;
3434     uint64_t z;
3435     a = float64_squash_input_denormal(a STATUS_VAR);
3436
3437     aExp = extractFloat64Exp( a );
3438     if ( 0x433 <= aExp ) {
3439         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3440             return propagateFloat64NaN( a, a STATUS_VAR );
3441         }
3442         return a;
3443     }
3444     if ( aExp < 0x3FF ) {
3445         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3446         STATUS(float_exception_flags) |= float_flag_inexact;
3447         aSign = extractFloat64Sign( a );
3448         switch ( STATUS(float_rounding_mode) ) {
3449          case float_round_nearest_even:
3450             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3451                 return packFloat64( aSign, 0x3FF, 0 );
3452             }
3453             break;
3454          case float_round_down:
3455             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3456          case float_round_up:
3457             return make_float64(
3458             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3459         }
3460         return packFloat64( aSign, 0, 0 );
3461     }
3462     lastBitMask = 1;
3463     lastBitMask <<= 0x433 - aExp;
3464     roundBitsMask = lastBitMask - 1;
3465     z = float64_val(a);
3466     roundingMode = STATUS(float_rounding_mode);
3467     if ( roundingMode == float_round_nearest_even ) {
3468         z += lastBitMask>>1;
3469         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3470     }
3471     else if ( roundingMode != float_round_to_zero ) {
3472         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3473             z += roundBitsMask;
3474         }
3475     }
3476     z &= ~ roundBitsMask;
3477     if ( z != float64_val(a) )
3478         STATUS(float_exception_flags) |= float_flag_inexact;
3479     return make_float64(z);
3480
3481 }
3482
3483 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3484 {
3485     int oldmode;
3486     float64 res;
3487     oldmode = STATUS(float_rounding_mode);
3488     STATUS(float_rounding_mode) = float_round_to_zero;
3489     res = float64_round_to_int(a STATUS_VAR);
3490     STATUS(float_rounding_mode) = oldmode;
3491     return res;
3492 }
3493
3494 /*----------------------------------------------------------------------------
3495 | Returns the result of adding the absolute values of the double-precision
3496 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3497 | before being returned.  `zSign' is ignored if the result is a NaN.
3498 | The addition is performed according to the IEC/IEEE Standard for Binary
3499 | Floating-Point Arithmetic.
3500 *----------------------------------------------------------------------------*/
3501
3502 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3503 {
3504     int_fast16_t aExp, bExp, zExp;
3505     uint64_t aSig, bSig, zSig;
3506     int_fast16_t expDiff;
3507
3508     aSig = extractFloat64Frac( a );
3509     aExp = extractFloat64Exp( a );
3510     bSig = extractFloat64Frac( b );
3511     bExp = extractFloat64Exp( b );
3512     expDiff = aExp - bExp;
3513     aSig <<= 9;
3514     bSig <<= 9;
3515     if ( 0 < expDiff ) {
3516         if ( aExp == 0x7FF ) {
3517             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3518             return a;
3519         }
3520         if ( bExp == 0 ) {
3521             --expDiff;
3522         }
3523         else {
3524             bSig |= LIT64( 0x2000000000000000 );
3525         }
3526         shift64RightJamming( bSig, expDiff, &bSig );
3527         zExp = aExp;
3528     }
3529     else if ( expDiff < 0 ) {
3530         if ( bExp == 0x7FF ) {
3531             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3532             return packFloat64( zSign, 0x7FF, 0 );
3533         }
3534         if ( aExp == 0 ) {
3535             ++expDiff;
3536         }
3537         else {
3538             aSig |= LIT64( 0x2000000000000000 );
3539         }
3540         shift64RightJamming( aSig, - expDiff, &aSig );
3541         zExp = bExp;
3542     }
3543     else {
3544         if ( aExp == 0x7FF ) {
3545             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3546             return a;
3547         }
3548         if ( aExp == 0 ) {
3549             if (STATUS(flush_to_zero)) {
3550                 if (aSig | bSig) {
3551                     float_raise(float_flag_output_denormal STATUS_VAR);
3552                 }
3553                 return packFloat64(zSign, 0, 0);
3554             }
3555             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3556         }
3557         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3558         zExp = aExp;
3559         goto roundAndPack;
3560     }
3561     aSig |= LIT64( 0x2000000000000000 );
3562     zSig = ( aSig + bSig )<<1;
3563     --zExp;
3564     if ( (int64_t) zSig < 0 ) {
3565         zSig = aSig + bSig;
3566         ++zExp;
3567     }
3568  roundAndPack:
3569     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3570
3571 }
3572
3573 /*----------------------------------------------------------------------------
3574 | Returns the result of subtracting the absolute values of the double-
3575 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3576 | difference is negated before being returned.  `zSign' is ignored if the
3577 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3578 | Standard for Binary Floating-Point Arithmetic.
3579 *----------------------------------------------------------------------------*/
3580
3581 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3582 {
3583     int_fast16_t aExp, bExp, zExp;
3584     uint64_t aSig, bSig, zSig;
3585     int_fast16_t expDiff;
3586
3587     aSig = extractFloat64Frac( a );
3588     aExp = extractFloat64Exp( a );
3589     bSig = extractFloat64Frac( b );
3590     bExp = extractFloat64Exp( b );
3591     expDiff = aExp - bExp;
3592     aSig <<= 10;
3593     bSig <<= 10;
3594     if ( 0 < expDiff ) goto aExpBigger;
3595     if ( expDiff < 0 ) goto bExpBigger;
3596     if ( aExp == 0x7FF ) {
3597         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3598         float_raise( float_flag_invalid STATUS_VAR);
3599         return float64_default_nan;
3600     }
3601     if ( aExp == 0 ) {
3602         aExp = 1;
3603         bExp = 1;
3604     }
3605     if ( bSig < aSig ) goto aBigger;
3606     if ( aSig < bSig ) goto bBigger;
3607     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3608  bExpBigger:
3609     if ( bExp == 0x7FF ) {
3610         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3611         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3612     }
3613     if ( aExp == 0 ) {
3614         ++expDiff;
3615     }
3616     else {
3617         aSig |= LIT64( 0x4000000000000000 );
3618     }
3619     shift64RightJamming( aSig, - expDiff, &aSig );
3620     bSig |= LIT64( 0x4000000000000000 );
3621  bBigger:
3622     zSig = bSig - aSig;
3623     zExp = bExp;
3624     zSign ^= 1;
3625     goto normalizeRoundAndPack;
3626  aExpBigger:
3627     if ( aExp == 0x7FF ) {
3628         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3629         return a;
3630     }
3631     if ( bExp == 0 ) {
3632         --expDiff;
3633     }
3634     else {
3635         bSig |= LIT64( 0x4000000000000000 );
3636     }
3637     shift64RightJamming( bSig, expDiff, &bSig );
3638     aSig |= LIT64( 0x4000000000000000 );
3639  aBigger:
3640     zSig = aSig - bSig;
3641     zExp = aExp;
3642  normalizeRoundAndPack:
3643     --zExp;
3644     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3645
3646 }
3647
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of adding the double-precision floating-point values `a'
3650 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3651 | Binary Floating-Point Arithmetic.
3652 *----------------------------------------------------------------------------*/
3653
3654 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3655 {
3656     flag aSign, bSign;
3657     a = float64_squash_input_denormal(a STATUS_VAR);
3658     b = float64_squash_input_denormal(b STATUS_VAR);
3659
3660     aSign = extractFloat64Sign( a );
3661     bSign = extractFloat64Sign( b );
3662     if ( aSign == bSign ) {
3663         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3664     }
3665     else {
3666         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3667     }
3668
3669 }
3670
3671 /*----------------------------------------------------------------------------
3672 | Returns the result of subtracting the double-precision floating-point values
3673 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3674 | for Binary Floating-Point Arithmetic.
3675 *----------------------------------------------------------------------------*/
3676
3677 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3678 {
3679     flag aSign, bSign;
3680     a = float64_squash_input_denormal(a STATUS_VAR);
3681     b = float64_squash_input_denormal(b STATUS_VAR);
3682
3683     aSign = extractFloat64Sign( a );
3684     bSign = extractFloat64Sign( b );
3685     if ( aSign == bSign ) {
3686         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3687     }
3688     else {
3689         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3690     }
3691
3692 }
3693
3694 /*----------------------------------------------------------------------------
3695 | Returns the result of multiplying the double-precision floating-point values
3696 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3697 | for Binary Floating-Point Arithmetic.
3698 *----------------------------------------------------------------------------*/
3699
3700 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3701 {
3702     flag aSign, bSign, zSign;
3703     int_fast16_t aExp, bExp, zExp;
3704     uint64_t aSig, bSig, zSig0, zSig1;
3705
3706     a = float64_squash_input_denormal(a STATUS_VAR);
3707     b = float64_squash_input_denormal(b STATUS_VAR);
3708
3709     aSig = extractFloat64Frac( a );
3710     aExp = extractFloat64Exp( a );
3711     aSign = extractFloat64Sign( a );
3712     bSig = extractFloat64Frac( b );
3713     bExp = extractFloat64Exp( b );
3714     bSign = extractFloat64Sign( b );
3715     zSign = aSign ^ bSign;
3716     if ( aExp == 0x7FF ) {
3717         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3718             return propagateFloat64NaN( a, b STATUS_VAR );
3719         }
3720         if ( ( bExp | bSig ) == 0 ) {
3721             float_raise( float_flag_invalid STATUS_VAR);
3722             return float64_default_nan;
3723         }
3724         return packFloat64( zSign, 0x7FF, 0 );
3725     }
3726     if ( bExp == 0x7FF ) {
3727         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3728         if ( ( aExp | aSig ) == 0 ) {
3729             float_raise( float_flag_invalid STATUS_VAR);
3730             return float64_default_nan;
3731         }
3732         return packFloat64( zSign, 0x7FF, 0 );
3733     }
3734     if ( aExp == 0 ) {
3735         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3736         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3737     }
3738     if ( bExp == 0 ) {
3739         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3740         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3741     }
3742     zExp = aExp + bExp - 0x3FF;
3743     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3744     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3745     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3746     zSig0 |= ( zSig1 != 0 );
3747     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3748         zSig0 <<= 1;
3749         --zExp;
3750     }
3751     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3752
3753 }
3754
3755 /*----------------------------------------------------------------------------
3756 | Returns the result of dividing the double-precision floating-point value `a'
3757 | by the corresponding value `b'.  The operation is performed according to
3758 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3759 *----------------------------------------------------------------------------*/
3760
3761 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3762 {
3763     flag aSign, bSign, zSign;
3764     int_fast16_t aExp, bExp, zExp;
3765     uint64_t aSig, bSig, zSig;
3766     uint64_t rem0, rem1;
3767     uint64_t term0, term1;
3768     a = float64_squash_input_denormal(a STATUS_VAR);
3769     b = float64_squash_input_denormal(b STATUS_VAR);
3770
3771     aSig = extractFloat64Frac( a );
3772     aExp = extractFloat64Exp( a );
3773     aSign = extractFloat64Sign( a );
3774     bSig = extractFloat64Frac( b );
3775     bExp = extractFloat64Exp( b );
3776     bSign = extractFloat64Sign( b );
3777     zSign = aSign ^ bSign;
3778     if ( aExp == 0x7FF ) {
3779         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3780         if ( bExp == 0x7FF ) {
3781             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3782             float_raise( float_flag_invalid STATUS_VAR);
3783             return float64_default_nan;
3784         }
3785         return packFloat64( zSign, 0x7FF, 0 );
3786     }
3787     if ( bExp == 0x7FF ) {
3788         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3789         return packFloat64( zSign, 0, 0 );
3790     }
3791     if ( bExp == 0 ) {
3792         if ( bSig == 0 ) {
3793             if ( ( aExp | aSig ) == 0 ) {
3794                 float_raise( float_flag_invalid STATUS_VAR);
3795                 return float64_default_nan;
3796             }
3797             float_raise( float_flag_divbyzero STATUS_VAR);
3798             return packFloat64( zSign, 0x7FF, 0 );
3799         }
3800         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3801     }
3802     if ( aExp == 0 ) {
3803         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3804         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3805     }
3806     zExp = aExp - bExp + 0x3FD;
3807     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3808     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3809     if ( bSig <= ( aSig + aSig ) ) {
3810         aSig >>= 1;
3811         ++zExp;
3812     }
3813     zSig = estimateDiv128To64( aSig, 0, bSig );
3814     if ( ( zSig & 0x1FF ) <= 2 ) {
3815         mul64To128( bSig, zSig, &term0, &term1 );
3816         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3817         while ( (int64_t) rem0 < 0 ) {
3818             --zSig;
3819             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3820         }
3821         zSig |= ( rem1 != 0 );
3822     }
3823     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3824
3825 }
3826
3827 /*----------------------------------------------------------------------------
3828 | Returns the remainder of the double-precision floating-point value `a'
3829 | with respect to the corresponding value `b'.  The operation is performed
3830 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3831 *----------------------------------------------------------------------------*/
3832
3833 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3834 {
3835     flag aSign, zSign;
3836     int_fast16_t aExp, bExp, expDiff;
3837     uint64_t aSig, bSig;
3838     uint64_t q, alternateASig;
3839     int64_t sigMean;
3840
3841     a = float64_squash_input_denormal(a STATUS_VAR);
3842     b = float64_squash_input_denormal(b STATUS_VAR);
3843     aSig = extractFloat64Frac( a );
3844     aExp = extractFloat64Exp( a );
3845     aSign = extractFloat64Sign( a );
3846     bSig = extractFloat64Frac( b );
3847     bExp = extractFloat64Exp( b );
3848     if ( aExp == 0x7FF ) {
3849         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3850             return propagateFloat64NaN( a, b STATUS_VAR );
3851         }
3852         float_raise( float_flag_invalid STATUS_VAR);
3853         return float64_default_nan;
3854     }
3855     if ( bExp == 0x7FF ) {
3856         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3857         return a;
3858     }
3859     if ( bExp == 0 ) {
3860         if ( bSig == 0 ) {
3861             float_raise( float_flag_invalid STATUS_VAR);
3862             return float64_default_nan;
3863         }
3864         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3865     }
3866     if ( aExp == 0 ) {
3867         if ( aSig == 0 ) return a;
3868         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3869     }
3870     expDiff = aExp - bExp;
3871     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3872     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3873     if ( expDiff < 0 ) {
3874         if ( expDiff < -1 ) return a;
3875         aSig >>= 1;
3876     }
3877     q = ( bSig <= aSig );
3878     if ( q ) aSig -= bSig;
3879     expDiff -= 64;
3880     while ( 0 < expDiff ) {
3881         q = estimateDiv128To64( aSig, 0, bSig );
3882         q = ( 2 < q ) ? q - 2 : 0;
3883         aSig = - ( ( bSig>>2 ) * q );
3884         expDiff -= 62;
3885     }
3886     expDiff += 64;
3887     if ( 0 < expDiff ) {
3888         q = estimateDiv128To64( aSig, 0, bSig );
3889         q = ( 2 < q ) ? q - 2 : 0;
3890         q >>= 64 - expDiff;
3891         bSig >>= 2;
3892         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3893     }
3894     else {
3895         aSig >>= 2;
3896         bSig >>= 2;
3897     }
3898     do {
3899         alternateASig = aSig;
3900         ++q;
3901         aSig -= bSig;
3902     } while ( 0 <= (int64_t) aSig );
3903     sigMean = aSig + alternateASig;
3904     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3905         aSig = alternateASig;
3906     }
3907     zSign = ( (int64_t) aSig < 0 );
3908     if ( zSign ) aSig = - aSig;
3909     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3910
3911 }
3912
3913 /*----------------------------------------------------------------------------
3914 | Returns the result of multiplying the double-precision floating-point values
3915 | `a' and `b' then adding 'c', with no intermediate rounding step after the
3916 | multiplication.  The operation is performed according to the IEC/IEEE
3917 | Standard for Binary Floating-Point Arithmetic 754-2008.
3918 | The flags argument allows the caller to select negation of the
3919 | addend, the intermediate product, or the final result. (The difference
3920 | between this and having the caller do a separate negation is that negating
3921 | externally will flip the sign bit on NaNs.)
3922 *----------------------------------------------------------------------------*/
3923
3924 float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
3925 {
3926     flag aSign, bSign, cSign, zSign;
3927     int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
3928     uint64_t aSig, bSig, cSig;
3929     flag pInf, pZero, pSign;
3930     uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
3931     int shiftcount;
3932     flag signflip, infzero;
3933
3934     a = float64_squash_input_denormal(a STATUS_VAR);
3935     b = float64_squash_input_denormal(b STATUS_VAR);
3936     c = float64_squash_input_denormal(c STATUS_VAR);
3937     aSig = extractFloat64Frac(a);
3938     aExp = extractFloat64Exp(a);
3939     aSign = extractFloat64Sign(a);
3940     bSig = extractFloat64Frac(b);
3941     bExp = extractFloat64Exp(b);
3942     bSign = extractFloat64Sign(b);
3943     cSig = extractFloat64Frac(c);
3944     cExp = extractFloat64Exp(c);
3945     cSign = extractFloat64Sign(c);
3946
3947     infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
3948                (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
3949
3950     /* It is implementation-defined whether the cases of (0,inf,qnan)
3951      * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3952      * they return if they do), so we have to hand this information
3953      * off to the target-specific pick-a-NaN routine.
3954      */
3955     if (((aExp == 0x7ff) && aSig) ||
3956         ((bExp == 0x7ff) && bSig) ||
3957         ((cExp == 0x7ff) && cSig)) {
3958         return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
3959     }
3960
3961     if (infzero) {
3962         float_raise(float_flag_invalid STATUS_VAR);
3963         return float64_default_nan;
3964     }
3965
3966     if (flags & float_muladd_negate_c) {
3967         cSign ^= 1;
3968     }
3969
3970     signflip = (flags & float_muladd_negate_result) ? 1 : 0;
3971
3972     /* Work out the sign and type of the product */
3973     pSign = aSign ^ bSign;
3974     if (flags & float_muladd_negate_product) {
3975         pSign ^= 1;
3976     }
3977     pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
3978     pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
3979
3980     if (cExp == 0x7ff) {
3981         if (pInf && (pSign ^ cSign)) {
3982             /* addition of opposite-signed infinities => InvalidOperation */
3983             float_raise(float_flag_invalid STATUS_VAR);
3984             return float64_default_nan;
3985         }
3986         /* Otherwise generate an infinity of the same sign */
3987         return packFloat64(cSign ^ signflip, 0x7ff, 0);
3988     }
3989
3990     if (pInf) {
3991         return packFloat64(pSign ^ signflip, 0x7ff, 0);
3992     }
3993
3994     if (pZero) {
3995         if (cExp == 0) {
3996             if (cSig == 0) {
3997                 /* Adding two exact zeroes */
3998                 if (pSign == cSign) {
3999                     zSign = pSign;
4000                 } else if (STATUS(float_rounding_mode) == float_round_down) {
4001                     zSign = 1;
4002                 } else {
4003                     zSign = 0;
4004                 }
4005                 return packFloat64(zSign ^ signflip, 0, 0);
4006             }
4007             /* Exact zero plus a denorm */
4008             if (STATUS(flush_to_zero)) {
4009                 float_raise(float_flag_output_denormal STATUS_VAR);
4010                 return packFloat64(cSign ^ signflip, 0, 0);
4011             }
4012         }
4013         /* Zero plus something non-zero : just return the something */
4014         return packFloat64(cSign ^ signflip, cExp, cSig);
4015     }
4016
4017     if (aExp == 0) {
4018         normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4019     }
4020     if (bExp == 0) {
4021         normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4022     }
4023
4024     /* Calculate the actual result a * b + c */
4025
4026     /* Multiply first; this is easy. */
4027     /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4028      * because we want the true exponent, not the "one-less-than"
4029      * flavour that roundAndPackFloat64() takes.
4030      */
4031     pExp = aExp + bExp - 0x3fe;
4032     aSig = (aSig | LIT64(0x0010000000000000))<<10;
4033     bSig = (bSig | LIT64(0x0010000000000000))<<11;
4034     mul64To128(aSig, bSig, &pSig0, &pSig1);
4035     if ((int64_t)(pSig0 << 1) >= 0) {
4036         shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4037         pExp--;
4038     }
4039
4040     zSign = pSign ^ signflip;
4041
4042     /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4043      * bit in position 126.
4044      */
4045     if (cExp == 0) {
4046         if (!cSig) {
4047             /* Throw out the special case of c being an exact zero now */
4048             shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4049             return roundAndPackFloat64(zSign, pExp - 1,
4050                                        pSig1 STATUS_VAR);
4051         }
4052         normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4053     }
4054
4055     /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4056      * significand of the addend, with the explicit bit in position 126.
4057      */
4058     cSig0 = cSig << (126 - 64 - 52);
4059     cSig1 = 0;
4060     cSig0 |= LIT64(0x4000000000000000);
4061     expDiff = pExp - cExp;
4062
4063     if (pSign == cSign) {
4064         /* Addition */
4065         if (expDiff > 0) {
4066             /* scale c to match p */
4067             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4068             zExp = pExp;
4069         } else if (expDiff < 0) {
4070             /* scale p to match c */
4071             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4072             zExp = cExp;
4073         } else {
4074             /* no scaling needed */
4075             zExp = cExp;
4076         }
4077         /* Add significands and make sure explicit bit ends up in posn 126 */
4078         add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4079         if ((int64_t)zSig0 < 0) {
4080             shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4081         } else {
4082             zExp--;
4083         }
4084         shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4085         return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4086     } else {
4087         /* Subtraction */
4088         if (expDiff > 0) {
4089             shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4090             sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4091             zExp = pExp;
4092         } else if (expDiff < 0) {
4093             shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4094             sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4095             zExp = cExp;
4096             zSign ^= 1;
4097         } else {
4098             zExp = pExp;
4099             if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4100                 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4101             } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4102                 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4103                 zSign ^= 1;
4104             } else {
4105                 /* Exact zero */
4106                 zSign = signflip;
4107                 if (STATUS(float_rounding_mode) == float_round_down) {
4108                     zSign ^= 1;
4109                 }
4110                 return packFloat64(zSign, 0, 0);
4111             }
4112         }
4113         --zExp;
4114         /* Do the equivalent of normalizeRoundAndPackFloat64() but
4115          * starting with the significand in a pair of uint64_t.
4116          */
4117         if (zSig0) {
4118             shiftcount = countLeadingZeros64(zSig0) - 1;
4119             shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4120             if (zSig1) {
4121                 zSig0 |= 1;
4122             }
4123             zExp -= shiftcount;
4124         } else {
4125             shiftcount = countLeadingZeros64(zSig1);
4126             if (shiftcount == 0) {
4127                 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4128                 zExp -= 63;
4129             } else {
4130                 shiftcount--;
4131                 zSig0 = zSig1 << shiftcount;
4132                 zExp -= (shiftcount + 64);
4133             }
4134         }
4135         return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4136     }
4137 }
4138
4139 /*----------------------------------------------------------------------------
4140 | Returns the square root of the double-precision floating-point value `a'.
4141 | The operation is performed according to the IEC/IEEE Standard for Binary
4142 | Floating-Point Arithmetic.
4143 *----------------------------------------------------------------------------*/
4144
4145 float64 float64_sqrt( float64 a STATUS_PARAM )
4146 {
4147     flag aSign;
4148     int_fast16_t aExp, zExp;
4149     uint64_t aSig, zSig, doubleZSig;
4150     uint64_t rem0, rem1, term0, term1;
4151     a = float64_squash_input_denormal(a STATUS_VAR);
4152
4153     aSig = extractFloat64Frac( a );
4154     aExp = extractFloat64Exp( a );
4155     aSign = extractFloat64Sign( a );
4156     if ( aExp == 0x7FF ) {
4157         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4158         if ( ! aSign ) return a;
4159         float_raise( float_flag_invalid STATUS_VAR);
4160         return float64_default_nan;
4161     }
4162     if ( aSign ) {
4163         if ( ( aExp | aSig ) == 0 ) return a;
4164         float_raise( float_flag_invalid STATUS_VAR);
4165         return float64_default_nan;
4166     }
4167     if ( aExp == 0 ) {
4168         if ( aSig == 0 ) return float64_zero;
4169         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4170     }
4171     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4172     aSig |= LIT64( 0x0010000000000000 );
4173     zSig = estimateSqrt32( aExp, aSig>>21 );
4174     aSig <<= 9 - ( aExp & 1 );
4175     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4176     if ( ( zSig & 0x1FF ) <= 5 ) {
4177         doubleZSig = zSig<<1;
4178         mul64To128( zSig, zSig, &term0, &term1 );
4179         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4180         while ( (int64_t) rem0 < 0 ) {
4181             --zSig;
4182             doubleZSig -= 2;
4183             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4184         }
4185         zSig |= ( ( rem0 | rem1 ) != 0 );
4186     }
4187     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4188
4189 }
4190
4191 /*----------------------------------------------------------------------------
4192 | Returns the binary log of the double-precision floating-point value `a'.
4193 | The operation is performed according to the IEC/IEEE Standard for Binary
4194 | Floating-Point Arithmetic.
4195 *----------------------------------------------------------------------------*/
4196 float64 float64_log2( float64 a STATUS_PARAM )
4197 {
4198     flag aSign, zSign;
4199     int_fast16_t aExp;
4200     uint64_t aSig, aSig0, aSig1, zSig, i;
4201     a = float64_squash_input_denormal(a STATUS_VAR);
4202
4203     aSig = extractFloat64Frac( a );
4204     aExp = extractFloat64Exp( a );
4205     aSign = extractFloat64Sign( a );
4206
4207     if ( aExp == 0 ) {
4208         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4209         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4210     }
4211     if ( aSign ) {
4212         float_raise( float_flag_invalid STATUS_VAR);
4213         return float64_default_nan;
4214     }
4215     if ( aExp == 0x7FF ) {
4216         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4217         return a;
4218     }
4219
4220     aExp -= 0x3FF;
4221     aSig |= LIT64( 0x0010000000000000 );
4222     zSign = aExp < 0;
4223     zSig = (uint64_t)aExp << 52;
4224     for (i = 1LL << 51; i > 0; i >>= 1) {
4225         mul64To128( aSig, aSig, &aSig0, &aSig1 );
4226         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4227         if ( aSig & LIT64( 0x0020000000000000 ) ) {
4228             aSig >>= 1;
4229             zSig |= i;
4230         }
4231     }
4232
4233     if ( zSign )
4234         zSig = -zSig;
4235     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4236 }
4237
4238 /*----------------------------------------------------------------------------
4239 | Returns 1 if the double-precision floating-point value `a' is equal to the
4240 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
4241 | if either operand is a NaN.  Otherwise, the comparison is performed
4242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4243 *----------------------------------------------------------------------------*/
4244
4245 int float64_eq( float64 a, float64 b STATUS_PARAM )
4246 {
4247     uint64_t av, bv;
4248     a = float64_squash_input_denormal(a STATUS_VAR);
4249     b = float64_squash_input_denormal(b STATUS_VAR);
4250
4251     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4252          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4253        ) {
4254         float_raise( float_flag_invalid STATUS_VAR);
4255         return 0;
4256     }
4257     av = float64_val(a);
4258     bv = float64_val(b);
4259     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4260
4261 }
4262
4263 /*----------------------------------------------------------------------------
4264 | Returns 1 if the double-precision floating-point value `a' is less than or
4265 | equal to the corresponding value `b', and 0 otherwise.  The invalid
4266 | exception is raised if either operand is a NaN.  The comparison is performed
4267 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4268 *----------------------------------------------------------------------------*/
4269
4270 int float64_le( float64 a, float64 b STATUS_PARAM )
4271 {
4272     flag aSign, bSign;
4273     uint64_t av, bv;
4274     a = float64_squash_input_denormal(a STATUS_VAR);
4275     b = float64_squash_input_denormal(b STATUS_VAR);
4276
4277     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4278          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4279        ) {
4280         float_raise( float_flag_invalid STATUS_VAR);
4281         return 0;
4282     }
4283     aSign = extractFloat64Sign( a );
4284     bSign = extractFloat64Sign( b );
4285     av = float64_val(a);
4286     bv = float64_val(b);
4287     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4288     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4289
4290 }
4291
4292 /*----------------------------------------------------------------------------
4293 | Returns 1 if the double-precision floating-point value `a' is less than
4294 | the corresponding value `b', and 0 otherwise.  The invalid exception is
4295 | raised if either operand is a NaN.  The comparison is performed according
4296 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4297 *----------------------------------------------------------------------------*/
4298
4299 int float64_lt( float64 a, float64 b STATUS_PARAM )
4300 {
4301     flag aSign, bSign;
4302     uint64_t av, bv;
4303
4304     a = float64_squash_input_denormal(a STATUS_VAR);
4305     b = float64_squash_input_denormal(b STATUS_VAR);
4306     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4307          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4308        ) {
4309         float_raise( float_flag_invalid STATUS_VAR);
4310         return 0;
4311     }
4312     aSign = extractFloat64Sign( a );
4313     bSign = extractFloat64Sign( b );
4314     av = float64_val(a);
4315     bv = float64_val(b);
4316     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4317     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4318
4319 }
4320
4321 /*----------------------------------------------------------------------------
4322 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4323 | be compared, and 0 otherwise.  The invalid exception is raised if either
4324 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
4325 | Standard for Binary Floating-Point Arithmetic.
4326 *----------------------------------------------------------------------------*/
4327
4328 int float64_unordered( float64 a, float64 b STATUS_PARAM )
4329 {
4330     a = float64_squash_input_denormal(a STATUS_VAR);
4331     b = float64_squash_input_denormal(b STATUS_VAR);
4332
4333     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4334          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4335        ) {
4336         float_raise( float_flag_invalid STATUS_VAR);
4337         return 1;
4338     }
4339     return 0;
4340 }
4341
4342 /*----------------------------------------------------------------------------
4343 | Returns 1 if the double-precision floating-point value `a' is equal to the
4344 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4345 | exception.The comparison is performed according to the IEC/IEEE Standard
4346 | for Binary Floating-Point Arithmetic.
4347 *----------------------------------------------------------------------------*/
4348
4349 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
4350 {
4351     uint64_t av, bv;
4352     a = float64_squash_input_denormal(a STATUS_VAR);
4353     b = float64_squash_input_denormal(b STATUS_VAR);
4354
4355     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4356          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4357        ) {
4358         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4359             float_raise( float_flag_invalid STATUS_VAR);
4360         }
4361         return 0;
4362     }
4363     av = float64_val(a);
4364     bv = float64_val(b);
4365     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4366
4367 }
4368
4369 /*----------------------------------------------------------------------------
4370 | Returns 1 if the double-precision floating-point value `a' is less than or
4371 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4372 | cause an exception.  Otherwise, the comparison is performed according to the
4373 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4374 *----------------------------------------------------------------------------*/
4375
4376 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
4377 {
4378     flag aSign, bSign;
4379     uint64_t av, bv;
4380     a = float64_squash_input_denormal(a STATUS_VAR);
4381     b = float64_squash_input_denormal(b STATUS_VAR);
4382
4383     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4384          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4385        ) {
4386         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4387             float_raise( float_flag_invalid STATUS_VAR);
4388         }
4389         return 0;
4390     }
4391     aSign = extractFloat64Sign( a );
4392     bSign = extractFloat64Sign( b );
4393     av = float64_val(a);
4394     bv = float64_val(b);
4395     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4396     return ( av == bv ) || ( aSign ^ ( av < bv ) );
4397
4398 }
4399
4400 /*----------------------------------------------------------------------------
4401 | Returns 1 if the double-precision floating-point value `a' is less than
4402 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
4403 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
4404 | Standard for Binary Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4406
4407 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
4408 {
4409     flag aSign, bSign;
4410     uint64_t av, bv;
4411     a = float64_squash_input_denormal(a STATUS_VAR);
4412     b = float64_squash_input_denormal(b STATUS_VAR);
4413
4414     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4415          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4416        ) {
4417         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4418             float_raise( float_flag_invalid STATUS_VAR);
4419         }
4420         return 0;
4421     }
4422     aSign = extractFloat64Sign( a );
4423     bSign = extractFloat64Sign( b );
4424     av = float64_val(a);
4425     bv = float64_val(b);
4426     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4427     return ( av != bv ) && ( aSign ^ ( av < bv ) );
4428
4429 }
4430
4431 /*----------------------------------------------------------------------------
4432 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4433 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
4434 | comparison is performed according to the IEC/IEEE Standard for Binary
4435 | Floating-Point Arithmetic.
4436 *----------------------------------------------------------------------------*/
4437
4438 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4439 {
4440     a = float64_squash_input_denormal(a STATUS_VAR);
4441     b = float64_squash_input_denormal(b STATUS_VAR);
4442
4443     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4444          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4445        ) {
4446         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4447             float_raise( float_flag_invalid STATUS_VAR);
4448         }
4449         return 1;
4450     }
4451     return 0;
4452 }
4453
4454 /*----------------------------------------------------------------------------
4455 | Returns the result of converting the extended double-precision floating-
4456 | point value `a' to the 32-bit two's complement integer format.  The
4457 | conversion is performed according to the IEC/IEEE Standard for Binary
4458 | Floating-Point Arithmetic---which means in particular that the conversion
4459 | is rounded according to the current rounding mode.  If `a' is a NaN, the
4460 | largest positive integer is returned.  Otherwise, if the conversion
4461 | overflows, the largest integer with the same sign as `a' is returned.
4462 *----------------------------------------------------------------------------*/
4463
4464 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4465 {
4466     flag aSign;
4467     int32 aExp, shiftCount;
4468     uint64_t aSig;
4469
4470     aSig = extractFloatx80Frac( a );
4471     aExp = extractFloatx80Exp( a );
4472     aSign = extractFloatx80Sign( a );
4473     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4474     shiftCount = 0x4037 - aExp;
4475     if ( shiftCount <= 0 ) shiftCount = 1;
4476     shift64RightJamming( aSig, shiftCount, &aSig );
4477     return roundAndPackInt32( aSign, aSig STATUS_VAR );
4478
4479 }
4480
4481 /*----------------------------------------------------------------------------
4482 | Returns the result of converting the extended double-precision floating-
4483 | point value `a' to the 32-bit two's complement integer format.  The
4484 | conversion is performed according to the IEC/IEEE Standard for Binary
4485 | Floating-Point Arithmetic, except that the conversion is always rounded
4486 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4487 | Otherwise, if the conversion overflows, the largest integer with the same
4488 | sign as `a' is returned.
4489 *----------------------------------------------------------------------------*/
4490
4491 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4492 {
4493     flag aSign;
4494     int32 aExp, shiftCount;
4495     uint64_t aSig, savedASig;
4496     int32_t z;
4497
4498     aSig = extractFloatx80Frac( a );
4499     aExp = extractFloatx80Exp( a );
4500     aSign = extractFloatx80Sign( a );
4501     if ( 0x401E < aExp ) {
4502         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4503         goto invalid;
4504     }
4505     else if ( aExp < 0x3FFF ) {
4506         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4507         return 0;
4508     }
4509     shiftCount = 0x403E - aExp;
4510     savedASig = aSig;
4511     aSig >>= shiftCount;
4512     z = aSig;
4513     if ( aSign ) z = - z;
4514     if ( ( z < 0 ) ^ aSign ) {
4515  invalid:
4516         float_raise( float_flag_invalid STATUS_VAR);
4517         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4518     }
4519     if ( ( aSig<<shiftCount ) != savedASig ) {
4520         STATUS(float_exception_flags) |= float_flag_inexact;
4521     }
4522     return z;
4523
4524 }
4525
4526 /*----------------------------------------------------------------------------
4527 | Returns the result of converting the extended double-precision floating-
4528 | point value `a' to the 64-bit two's complement integer format.  The
4529 | conversion is performed according to the IEC/IEEE Standard for Binary
4530 | Floating-Point Arithmetic---which means in particular that the conversion
4531 | is rounded according to the current rounding mode.  If `a' is a NaN,
4532 | the largest positive integer is returned.  Otherwise, if the conversion
4533 | overflows, the largest integer with the same sign as `a' is returned.
4534 *----------------------------------------------------------------------------*/
4535
4536 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4537 {
4538     flag aSign;
4539     int32 aExp, shiftCount;
4540     uint64_t aSig, aSigExtra;
4541
4542     aSig = extractFloatx80Frac( a );
4543     aExp = extractFloatx80Exp( a );
4544     aSign = extractFloatx80Sign( a );
4545     shiftCount = 0x403E - aExp;
4546     if ( shiftCount <= 0 ) {
4547         if ( shiftCount ) {
4548             float_raise( float_flag_invalid STATUS_VAR);
4549             if (    ! aSign
4550                  || (    ( aExp == 0x7FFF )
4551                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
4552                ) {
4553                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4554             }
4555             return (int64_t) LIT64( 0x8000000000000000 );
4556         }
4557         aSigExtra = 0;
4558     }
4559     else {
4560         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4561     }
4562     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4563
4564 }
4565
4566 /*----------------------------------------------------------------------------
4567 | Returns the result of converting the extended double-precision floating-
4568 | point value `a' to the 64-bit two's complement integer format.  The
4569 | conversion is performed according to the IEC/IEEE Standard for Binary
4570 | Floating-Point Arithmetic, except that the conversion is always rounded
4571 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
4572 | Otherwise, if the conversion overflows, the largest integer with the same
4573 | sign as `a' is returned.
4574 *----------------------------------------------------------------------------*/
4575
4576 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4577 {
4578     flag aSign;
4579     int32 aExp, shiftCount;
4580     uint64_t aSig;
4581     int64 z;
4582
4583     aSig = extractFloatx80Frac( a );
4584     aExp = extractFloatx80Exp( a );
4585     aSign = extractFloatx80Sign( a );
4586     shiftCount = aExp - 0x403E;
4587     if ( 0 <= shiftCount ) {
4588         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4589         if ( ( a.high != 0xC03E ) || aSig ) {
4590             float_raise( float_flag_invalid STATUS_VAR);
4591             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4592                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4593             }
4594         }
4595         return (int64_t) LIT64( 0x8000000000000000 );
4596     }
4597     else if ( aExp < 0x3FFF ) {
4598         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4599         return 0;
4600     }
4601     z = aSig>>( - shiftCount );
4602     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4603         STATUS(float_exception_flags) |= float_flag_inexact;
4604     }
4605     if ( aSign ) z = - z;
4606     return z;
4607
4608 }
4609
4610 /*----------------------------------------------------------------------------
4611 | Returns the result of converting the extended double-precision floating-
4612 | point value `a' to the single-precision floating-point format.  The
4613 | conversion is performed according to the IEC/IEEE Standard for Binary
4614 | Floating-Point Arithmetic.
4615 *----------------------------------------------------------------------------*/
4616
4617 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4618 {
4619     flag aSign;
4620     int32 aExp;
4621     uint64_t aSig;
4622
4623     aSig = extractFloatx80Frac( a );
4624     aExp = extractFloatx80Exp( a );
4625     aSign = extractFloatx80Sign( a );
4626     if ( aExp == 0x7FFF ) {
4627         if ( (uint64_t) ( aSig<<1 ) ) {
4628             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4629         }
4630         return packFloat32( aSign, 0xFF, 0 );
4631     }
4632     shift64RightJamming( aSig, 33, &aSig );
4633     if ( aExp || aSig ) aExp -= 0x3F81;
4634     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4635
4636 }
4637
4638 /*----------------------------------------------------------------------------
4639 | Returns the result of converting the extended double-precision floating-
4640 | point value `a' to the double-precision floating-point format.  The
4641 | conversion is performed according to the IEC/IEEE Standard for Binary
4642 | Floating-Point Arithmetic.
4643 *----------------------------------------------------------------------------*/
4644
4645 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4646 {
4647     flag aSign;
4648     int32 aExp;
4649     uint64_t aSig, zSig;
4650
4651     aSig = extractFloatx80Frac( a );
4652     aExp = extractFloatx80Exp( a );
4653     aSign = extractFloatx80Sign( a );
4654     if ( aExp == 0x7FFF ) {
4655         if ( (uint64_t) ( aSig<<1 ) ) {
4656             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4657         }
4658         return packFloat64( aSign, 0x7FF, 0 );
4659     }
4660     shift64RightJamming( aSig, 1, &zSig );
4661     if ( aExp || aSig ) aExp -= 0x3C01;
4662     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4663
4664 }
4665
4666 /*----------------------------------------------------------------------------
4667 | Returns the result of converting the extended double-precision floating-
4668 | point value `a' to the quadruple-precision floating-point format.  The
4669 | conversion is performed according to the IEC/IEEE Standard for Binary
4670 | Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4672
4673 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4674 {
4675     flag aSign;
4676     int_fast16_t aExp;
4677     uint64_t aSig, zSig0, zSig1;
4678
4679     aSig = extractFloatx80Frac( a );
4680     aExp = extractFloatx80Exp( a );
4681     aSign = extractFloatx80Sign( a );
4682     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4683         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4684     }
4685     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4686     return packFloat128( aSign, aExp, zSig0, zSig1 );
4687
4688 }
4689
4690 /*----------------------------------------------------------------------------
4691 | Rounds the extended double-precision floating-point value `a' to an integer,
4692 | and returns the result as an extended quadruple-precision floating-point
4693 | value.  The operation is performed according to the IEC/IEEE Standard for
4694 | Binary Floating-Point Arithmetic.
4695 *----------------------------------------------------------------------------*/
4696
4697 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4698 {
4699     flag aSign;
4700     int32 aExp;
4701     uint64_t lastBitMask, roundBitsMask;
4702     int8 roundingMode;
4703     floatx80 z;
4704
4705     aExp = extractFloatx80Exp( a );
4706     if ( 0x403E <= aExp ) {
4707         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4708             return propagateFloatx80NaN( a, a STATUS_VAR );
4709         }
4710         return a;
4711     }
4712     if ( aExp < 0x3FFF ) {
4713         if (    ( aExp == 0 )
4714              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4715             return a;
4716         }
4717         STATUS(float_exception_flags) |= float_flag_inexact;
4718         aSign = extractFloatx80Sign( a );
4719         switch ( STATUS(float_rounding_mode) ) {
4720          case float_round_nearest_even:
4721             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4722                ) {
4723                 return
4724                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4725             }
4726             break;
4727          case float_round_down:
4728             return
4729                   aSign ?
4730                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4731                 : packFloatx80( 0, 0, 0 );
4732          case float_round_up:
4733             return
4734                   aSign ? packFloatx80( 1, 0, 0 )
4735                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4736         }
4737         return packFloatx80( aSign, 0, 0 );
4738     }
4739     lastBitMask = 1;
4740     lastBitMask <<= 0x403E - aExp;
4741     roundBitsMask = lastBitMask - 1;
4742     z = a;
4743     roundingMode = STATUS(float_rounding_mode);
4744     if ( roundingMode == float_round_nearest_even ) {
4745         z.low += lastBitMask>>1;
4746         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4747     }
4748     else if ( roundingMode != float_round_to_zero ) {
4749         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4750             z.low += roundBitsMask;
4751         }
4752     }
4753     z.low &= ~ roundBitsMask;
4754     if ( z.low == 0 ) {
4755         ++z.high;
4756         z.low = LIT64( 0x8000000000000000 );
4757     }
4758     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4759     return z;
4760
4761 }
4762
4763 /*----------------------------------------------------------------------------
4764 | Returns the result of adding the absolute values of the extended double-
4765 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4766 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4767 | The addition is performed according to the IEC/IEEE Standard for Binary
4768 | Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4770
4771 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4772 {
4773     int32 aExp, bExp, zExp;
4774     uint64_t aSig, bSig, zSig0, zSig1;
4775     int32 expDiff;
4776
4777     aSig = extractFloatx80Frac( a );
4778     aExp = extractFloatx80Exp( a );
4779     bSig = extractFloatx80Frac( b );
4780     bExp = extractFloatx80Exp( b );
4781     expDiff = aExp - bExp;
4782     if ( 0 < expDiff ) {
4783         if ( aExp == 0x7FFF ) {
4784             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4785             return a;
4786         }
4787         if ( bExp == 0 ) --expDiff;
4788         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4789         zExp = aExp;
4790     }
4791     else if ( expDiff < 0 ) {
4792         if ( bExp == 0x7FFF ) {
4793             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4794             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4795         }
4796         if ( aExp == 0 ) ++expDiff;
4797         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4798         zExp = bExp;
4799     }
4800     else {
4801         if ( aExp == 0x7FFF ) {
4802             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4803                 return propagateFloatx80NaN( a, b STATUS_VAR );
4804             }
4805             return a;
4806         }
4807         zSig1 = 0;
4808         zSig0 = aSig + bSig;
4809         if ( aExp == 0 ) {
4810             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4811             goto roundAndPack;
4812         }
4813         zExp = aExp;
4814         goto shiftRight1;
4815     }
4816     zSig0 = aSig + bSig;
4817     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4818  shiftRight1:
4819     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4820     zSig0 |= LIT64( 0x8000000000000000 );
4821     ++zExp;
4822  roundAndPack:
4823     return
4824         roundAndPackFloatx80(
4825             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4826
4827 }
4828
4829 /*----------------------------------------------------------------------------
4830 | Returns the result of subtracting the absolute values of the extended
4831 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4832 | difference is negated before being returned.  `zSign' is ignored if the
4833 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4834 | Standard for Binary Floating-Point Arithmetic.
4835 *----------------------------------------------------------------------------*/
4836
4837 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4838 {
4839     int32 aExp, bExp, zExp;
4840     uint64_t aSig, bSig, zSig0, zSig1;
4841     int32 expDiff;
4842     floatx80 z;
4843
4844     aSig = extractFloatx80Frac( a );
4845     aExp = extractFloatx80Exp( a );
4846     bSig = extractFloatx80Frac( b );
4847     bExp = extractFloatx80Exp( b );
4848     expDiff = aExp - bExp;
4849     if ( 0 < expDiff ) goto aExpBigger;
4850     if ( expDiff < 0 ) goto bExpBigger;
4851     if ( aExp == 0x7FFF ) {
4852         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4853             return propagateFloatx80NaN( a, b STATUS_VAR );
4854         }
4855         float_raise( float_flag_invalid STATUS_VAR);
4856         z.low = floatx80_default_nan_low;
4857         z.high = floatx80_default_nan_high;
4858         return z;
4859     }
4860     if ( aExp == 0 ) {
4861         aExp = 1;
4862         bExp = 1;
4863     }
4864     zSig1 = 0;
4865     if ( bSig < aSig ) goto aBigger;
4866     if ( aSig < bSig ) goto bBigger;
4867     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4868  bExpBigger:
4869     if ( bExp == 0x7FFF ) {
4870         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4871         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4872     }
4873     if ( aExp == 0 ) ++expDiff;
4874     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4875  bBigger:
4876     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4877     zExp = bExp;
4878     zSign ^= 1;
4879     goto normalizeRoundAndPack;
4880  aExpBigger:
4881     if ( aExp == 0x7FFF ) {
4882         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4883         return a;
4884     }
4885     if ( bExp == 0 ) --expDiff;
4886     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4887  aBigger:
4888     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4889     zExp = aExp;
4890  normalizeRoundAndPack:
4891     return
4892         normalizeRoundAndPackFloatx80(
4893             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4894
4895 }
4896
4897 /*----------------------------------------------------------------------------
4898 | Returns the result of adding the extended double-precision floating-point
4899 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4900 | Standard for Binary Floating-Point Arithmetic.
4901 *----------------------------------------------------------------------------*/
4902
4903 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4904 {
4905     flag aSign, bSign;
4906
4907     aSign = extractFloatx80Sign( a );
4908     bSign = extractFloatx80Sign( b );
4909     if ( aSign == bSign ) {
4910         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4911     }
4912     else {
4913         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4914     }
4915
4916 }
4917
4918 /*----------------------------------------------------------------------------
4919 | Returns the result of subtracting the extended double-precision floating-
4920 | point values `a' and `b'.  The operation is performed according to the
4921 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4922 *----------------------------------------------------------------------------*/
4923
4924 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4925 {
4926     flag aSign, bSign;
4927
4928     aSign = extractFloatx80Sign( a );
4929     bSign = extractFloatx80Sign( b );
4930     if ( aSign == bSign ) {
4931         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4932     }
4933     else {
4934         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4935     }
4936
4937 }
4938
4939 /*----------------------------------------------------------------------------
4940 | Returns the result of multiplying the extended double-precision floating-
4941 | point values `a' and `b'.  The operation is performed according to the
4942 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4943 *----------------------------------------------------------------------------*/
4944
4945 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4946 {
4947     flag aSign, bSign, zSign;
4948     int32 aExp, bExp, zExp;
4949     uint64_t aSig, bSig, zSig0, zSig1;
4950     floatx80 z;
4951
4952     aSig = extractFloatx80Frac( a );
4953     aExp = extractFloatx80Exp( a );
4954     aSign = extractFloatx80Sign( a );
4955     bSig = extractFloatx80Frac( b );
4956     bExp = extractFloatx80Exp( b );
4957     bSign = extractFloatx80Sign( b );
4958     zSign = aSign ^ bSign;
4959     if ( aExp == 0x7FFF ) {
4960         if (    (uint64_t) ( aSig<<1 )
4961              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4962             return propagateFloatx80NaN( a, b STATUS_VAR );
4963         }
4964         if ( ( bExp | bSig ) == 0 ) goto invalid;
4965         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4966     }
4967     if ( bExp == 0x7FFF ) {
4968         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4969         if ( ( aExp | aSig ) == 0 ) {
4970  invalid:
4971             float_raise( float_flag_invalid STATUS_VAR);
4972             z.low = floatx80_default_nan_low;
4973             z.high = floatx80_default_nan_high;
4974             return z;
4975         }
4976         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4977     }
4978     if ( aExp == 0 ) {
4979         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4980         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4981     }
4982     if ( bExp == 0 ) {
4983         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4984         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4985     }
4986     zExp = aExp + bExp - 0x3FFE;
4987     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4988     if ( 0 < (int64_t) zSig0 ) {
4989         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4990         --zExp;
4991     }
4992     return
4993         roundAndPackFloatx80(
4994             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4995
4996 }
4997
4998 /*----------------------------------------------------------------------------
4999 | Returns the result of dividing the extended double-precision floating-point
5000 | value `a' by the corresponding value `b'.  The operation is performed
5001 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5002 *----------------------------------------------------------------------------*/
5003
5004 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5005 {
5006     flag aSign, bSign, zSign;
5007     int32 aExp, bExp, zExp;
5008     uint64_t aSig, bSig, zSig0, zSig1;
5009     uint64_t rem0, rem1, rem2, term0, term1, term2;
5010     floatx80 z;
5011
5012     aSig = extractFloatx80Frac( a );
5013     aExp = extractFloatx80Exp( a );
5014     aSign = extractFloatx80Sign( a );
5015     bSig = extractFloatx80Frac( b );
5016     bExp = extractFloatx80Exp( b );
5017     bSign = extractFloatx80Sign( b );
5018     zSign = aSign ^ bSign;
5019     if ( aExp == 0x7FFF ) {
5020         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5021         if ( bExp == 0x7FFF ) {
5022             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5023             goto invalid;
5024         }
5025         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5026     }
5027     if ( bExp == 0x7FFF ) {
5028         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5029         return packFloatx80( zSign, 0, 0 );
5030     }
5031     if ( bExp == 0 ) {
5032         if ( bSig == 0 ) {
5033             if ( ( aExp | aSig ) == 0 ) {
5034  invalid:
5035                 float_raise( float_flag_invalid STATUS_VAR);
5036                 z.low = floatx80_default_nan_low;
5037                 z.high = floatx80_default_nan_high;
5038                 return z;
5039             }
5040             float_raise( float_flag_divbyzero STATUS_VAR);
5041             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5042         }
5043         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5044     }
5045     if ( aExp == 0 ) {
5046         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5047         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5048     }
5049     zExp = aExp - bExp + 0x3FFE;
5050     rem1 = 0;
5051     if ( bSig <= aSig ) {
5052         shift128Right( aSig, 0, 1, &aSig, &rem1 );
5053         ++zExp;
5054     }
5055     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5056     mul64To128( bSig, zSig0, &term0, &term1 );
5057     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5058     while ( (int64_t) rem0 < 0 ) {
5059         --zSig0;
5060         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5061     }
5062     zSig1 = estimateDiv128To64( rem1, 0, bSig );
5063     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5064         mul64To128( bSig, zSig1, &term1, &term2 );
5065         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5066         while ( (int64_t) rem1 < 0 ) {
5067             --zSig1;
5068             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5069         }
5070         zSig1 |= ( ( rem1 | rem2 ) != 0 );
5071     }
5072     return
5073         roundAndPackFloatx80(
5074             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5075
5076 }
5077
5078 /*----------------------------------------------------------------------------
5079 | Returns the remainder of the extended double-precision floating-point value
5080 | `a' with respect to the corresponding value `b'.  The operation is performed
5081 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5082 *----------------------------------------------------------------------------*/
5083
5084 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5085 {
5086     flag aSign, zSign;
5087     int32 aExp, bExp, expDiff;
5088     uint64_t aSig0, aSig1, bSig;
5089     uint64_t q, term0, term1, alternateASig0, alternateASig1;
5090     floatx80 z;
5091
5092     aSig0 = extractFloatx80Frac( a );
5093     aExp = extractFloatx80Exp( a );
5094     aSign = extractFloatx80Sign( a );
5095     bSig = extractFloatx80Frac( b );
5096     bExp = extractFloatx80Exp( b );
5097     if ( aExp == 0x7FFF ) {
5098         if (    (uint64_t) ( aSig0<<1 )
5099              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5100             return propagateFloatx80NaN( a, b STATUS_VAR );
5101         }
5102         goto invalid;
5103     }
5104     if ( bExp == 0x7FFF ) {
5105         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
5106         return a;
5107     }
5108     if ( bExp == 0 ) {
5109         if ( bSig == 0 ) {
5110  invalid:
5111             float_raise( float_flag_invalid STATUS_VAR);
5112             z.low = floatx80_default_nan_low;
5113             z.high = floatx80_default_nan_high;
5114             return z;
5115         }
5116         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5117     }
5118     if ( aExp == 0 ) {
5119         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5120         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5121     }
5122     bSig |= LIT64( 0x8000000000000000 );
5123     zSign = aSign;
5124     expDiff = aExp - bExp;
5125     aSig1 = 0;
5126     if ( expDiff < 0 ) {
5127         if ( expDiff < -1 ) return a;
5128         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5129         expDiff = 0;
5130     }
5131     q = ( bSig <= aSig0 );
5132     if ( q ) aSig0 -= bSig;
5133     expDiff -= 64;
5134     while ( 0 < expDiff ) {
5135         q = estimateDiv128To64( aSig0, aSig1, bSig );
5136         q = ( 2 < q ) ? q - 2 : 0;
5137         mul64To128( bSig, q, &term0, &term1 );
5138         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5139         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5140         expDiff -= 62;
5141     }
5142     expDiff += 64;
5143     if ( 0 < expDiff ) {
5144         q = estimateDiv128To64( aSig0, aSig1, bSig );
5145         q = ( 2 < q ) ? q - 2 : 0;
5146         q >>= 64 - expDiff;
5147         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5148         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5149         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5150         while ( le128( term0, term1, aSig0, aSig1 ) ) {
5151             ++q;
5152             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5153         }
5154     }
5155     else {
5156         term1 = 0;
5157         term0 = bSig;
5158     }
5159     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5160     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5161          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5162               && ( q & 1 ) )
5163        ) {
5164         aSig0 = alternateASig0;
5165         aSig1 = alternateASig1;
5166         zSign = ! zSign;
5167     }
5168     return
5169         normalizeRoundAndPackFloatx80(
5170             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5171
5172 }
5173
5174 /*----------------------------------------------------------------------------
5175 | Returns the square root of the extended double-precision floating-point
5176 | value `a'.  The operation is performed according to the IEC/IEEE Standard
5177 | for Binary Floating-Point Arithmetic.
5178 *----------------------------------------------------------------------------*/
5179
5180 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5181 {
5182     flag aSign;
5183     int32 aExp, zExp;
5184     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5185     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5186     floatx80 z;
5187
5188     aSig0 = extractFloatx80Frac( a );
5189     aExp = extractFloatx80Exp( a );
5190     aSign = extractFloatx80Sign( a );
5191     if ( aExp == 0x7FFF ) {
5192         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
5193         if ( ! aSign ) return a;
5194         goto invalid;
5195     }
5196     if ( aSign ) {
5197         if ( ( aExp | aSig0 ) == 0 ) return a;
5198  invalid:
5199         float_raise( float_flag_invalid STATUS_VAR);
5200         z.low = floatx80_default_nan_low;
5201         z.high = floatx80_default_nan_high;
5202         return z;
5203     }
5204     if ( aExp == 0 ) {
5205         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5206         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5207     }
5208     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5209     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5210     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5211     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5212     doubleZSig0 = zSig0<<1;
5213     mul64To128( zSig0, zSig0, &term0, &term1 );
5214     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5215     while ( (int64_t) rem0 < 0 ) {
5216         --zSig0;
5217         doubleZSig0 -= 2;
5218         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5219     }
5220     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5221     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5222         if ( zSig1 == 0 ) zSig1 = 1;
5223         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5224         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5225         mul64To128( zSig1, zSig1, &term2, &term3 );
5226         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5227         while ( (int64_t) rem1 < 0 ) {
5228             --zSig1;
5229             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5230             term3 |= 1;
5231             term2 |= doubleZSig0;
5232             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5233         }
5234         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5235     }
5236     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5237     zSig0 |= doubleZSig0;
5238     return
5239         roundAndPackFloatx80(
5240             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5241
5242 }
5243
5244 /*----------------------------------------------------------------------------
5245 | Returns 1 if the extended double-precision floating-point value `a' is equal
5246 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
5247 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5248 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5249 *----------------------------------------------------------------------------*/
5250
5251 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
5252 {
5253
5254     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5255               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5256          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5257               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5258        ) {
5259         float_raise( float_flag_invalid STATUS_VAR);
5260         return 0;
5261     }
5262     return
5263            ( a.low == b.low )
5264         && (    ( a.high == b.high )
5265              || (    ( a.low == 0 )
5266                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5267            );
5268
5269 }
5270
5271 /*----------------------------------------------------------------------------
5272 | Returns 1 if the extended double-precision floating-point value `a' is
5273 | less than or equal to the corresponding value `b', and 0 otherwise.  The
5274 | invalid exception is raised if either operand is a NaN.  The comparison is
5275 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5276 | Arithmetic.
5277 *----------------------------------------------------------------------------*/
5278
5279 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
5280 {
5281     flag aSign, bSign;
5282
5283     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5284               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5285          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5286               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5287        ) {
5288         float_raise( float_flag_invalid STATUS_VAR);
5289         return 0;
5290     }
5291     aSign = extractFloatx80Sign( a );
5292     bSign = extractFloatx80Sign( b );
5293     if ( aSign != bSign ) {
5294         return
5295                aSign
5296             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5297                  == 0 );
5298     }
5299     return
5300           aSign ? le128( b.high, b.low, a.high, a.low )
5301         : le128( a.high, a.low, b.high, b.low );
5302
5303 }
5304
5305 /*----------------------------------------------------------------------------
5306 | Returns 1 if the extended double-precision floating-point value `a' is
5307 | less than the corresponding value `b', and 0 otherwise.  The invalid
5308 | exception is raised if either operand is a NaN.  The comparison is performed
5309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5310 *----------------------------------------------------------------------------*/
5311
5312 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
5313 {
5314     flag aSign, bSign;
5315
5316     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5317               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5318          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5319               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5320        ) {
5321         float_raise( float_flag_invalid STATUS_VAR);
5322         return 0;
5323     }
5324     aSign = extractFloatx80Sign( a );
5325     bSign = extractFloatx80Sign( b );
5326     if ( aSign != bSign ) {
5327         return
5328                aSign
5329             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5330                  != 0 );
5331     }
5332     return
5333           aSign ? lt128( b.high, b.low, a.high, a.low )
5334         : lt128( a.high, a.low, b.high, b.low );
5335
5336 }
5337
5338 /*----------------------------------------------------------------------------
5339 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5340 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
5341 | either operand is a NaN.   The comparison is performed according to the
5342 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5343 *----------------------------------------------------------------------------*/
5344 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5345 {
5346     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5347               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5348          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5349               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5350        ) {
5351         float_raise( float_flag_invalid STATUS_VAR);
5352         return 1;
5353     }
5354     return 0;
5355 }
5356
5357 /*----------------------------------------------------------------------------
5358 | Returns 1 if the extended double-precision floating-point value `a' is
5359 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5360 | cause an exception.  The comparison is performed according to the IEC/IEEE
5361 | Standard for Binary Floating-Point Arithmetic.
5362 *----------------------------------------------------------------------------*/
5363
5364 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5365 {
5366
5367     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5368               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5369          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5370               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5371        ) {
5372         if (    floatx80_is_signaling_nan( a )
5373              || floatx80_is_signaling_nan( b ) ) {
5374             float_raise( float_flag_invalid STATUS_VAR);
5375         }
5376         return 0;
5377     }
5378     return
5379            ( a.low == b.low )
5380         && (    ( a.high == b.high )
5381              || (    ( a.low == 0 )
5382                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5383            );
5384
5385 }
5386
5387 /*----------------------------------------------------------------------------
5388 | Returns 1 if the extended double-precision floating-point value `a' is less
5389 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
5390 | do not cause an exception.  Otherwise, the comparison is performed according
5391 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5392 *----------------------------------------------------------------------------*/
5393
5394 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5395 {
5396     flag aSign, bSign;
5397
5398     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5399               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5400          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5401               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5402        ) {
5403         if (    floatx80_is_signaling_nan( a )
5404              || floatx80_is_signaling_nan( b ) ) {
5405             float_raise( float_flag_invalid STATUS_VAR);
5406         }
5407         return 0;
5408     }
5409     aSign = extractFloatx80Sign( a );
5410     bSign = extractFloatx80Sign( b );
5411     if ( aSign != bSign ) {
5412         return
5413                aSign
5414             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5415                  == 0 );
5416     }
5417     return
5418           aSign ? le128( b.high, b.low, a.high, a.low )
5419         : le128( a.high, a.low, b.high, b.low );
5420
5421 }
5422
5423 /*----------------------------------------------------------------------------
5424 | Returns 1 if the extended double-precision floating-point value `a' is less
5425 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
5426 | an exception.  Otherwise, the comparison is performed according to the
5427 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5428 *----------------------------------------------------------------------------*/
5429
5430 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5431 {
5432     flag aSign, bSign;
5433
5434     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5435               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5436          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5437               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5438        ) {
5439         if (    floatx80_is_signaling_nan( a )
5440              || floatx80_is_signaling_nan( b ) ) {
5441             float_raise( float_flag_invalid STATUS_VAR);
5442         }
5443         return 0;
5444     }
5445     aSign = extractFloatx80Sign( a );
5446     bSign = extractFloatx80Sign( b );
5447     if ( aSign != bSign ) {
5448         return
5449                aSign
5450             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5451                  != 0 );
5452     }
5453     return
5454           aSign ? lt128( b.high, b.low, a.high, a.low )
5455         : lt128( a.high, a.low, b.high, b.low );
5456
5457 }
5458
5459 /*----------------------------------------------------------------------------
5460 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5461 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
5462 | The comparison is performed according to the IEC/IEEE Standard for Binary
5463 | Floating-Point Arithmetic.
5464 *----------------------------------------------------------------------------*/
5465 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5466 {
5467     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
5468               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5469          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
5470               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5471        ) {
5472         if (    floatx80_is_signaling_nan( a )
5473              || floatx80_is_signaling_nan( b ) ) {
5474             float_raise( float_flag_invalid STATUS_VAR);
5475         }
5476         return 1;
5477     }
5478     return 0;
5479 }
5480
5481 /*----------------------------------------------------------------------------
5482 | Returns the result of converting the quadruple-precision floating-point
5483 | value `a' to the 32-bit two's complement integer format.  The conversion
5484 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5485 | Arithmetic---which means in particular that the conversion is rounded
5486 | according to the current rounding mode.  If `a' is a NaN, the largest
5487 | positive integer is returned.  Otherwise, if the conversion overflows, the
5488 | largest integer with the same sign as `a' is returned.
5489 *----------------------------------------------------------------------------*/
5490
5491 int32 float128_to_int32( float128 a STATUS_PARAM )
5492 {
5493     flag aSign;
5494     int32 aExp, shiftCount;
5495     uint64_t aSig0, aSig1;
5496
5497     aSig1 = extractFloat128Frac1( a );
5498     aSig0 = extractFloat128Frac0( a );
5499     aExp = extractFloat128Exp( a );
5500     aSign = extractFloat128Sign( a );
5501     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5502     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5503     aSig0 |= ( aSig1 != 0 );
5504     shiftCount = 0x4028 - aExp;
5505     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5506     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5507
5508 }
5509
5510 /*----------------------------------------------------------------------------
5511 | Returns the result of converting the quadruple-precision floating-point
5512 | value `a' to the 32-bit two's complement integer format.  The conversion
5513 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5514 | Arithmetic, except that the conversion is always rounded toward zero.  If
5515 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
5516 | conversion overflows, the largest integer with the same sign as `a' is
5517 | returned.
5518 *----------------------------------------------------------------------------*/
5519
5520 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5521 {
5522     flag aSign;
5523     int32 aExp, shiftCount;
5524     uint64_t aSig0, aSig1, savedASig;
5525     int32_t z;
5526
5527     aSig1 = extractFloat128Frac1( a );
5528     aSig0 = extractFloat128Frac0( a );
5529     aExp = extractFloat128Exp( a );
5530     aSign = extractFloat128Sign( a );
5531     aSig0 |= ( aSig1 != 0 );
5532     if ( 0x401E < aExp ) {
5533         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5534         goto invalid;
5535     }
5536     else if ( aExp < 0x3FFF ) {
5537         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5538         return 0;
5539     }
5540     aSig0 |= LIT64( 0x0001000000000000 );
5541     shiftCount = 0x402F - aExp;
5542     savedASig = aSig0;
5543     aSig0 >>= shiftCount;
5544     z = aSig0;
5545     if ( aSign ) z = - z;
5546     if ( ( z < 0 ) ^ aSign ) {
5547  invalid:
5548         float_raise( float_flag_invalid STATUS_VAR);
5549         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5550     }
5551     if ( ( aSig0<<shiftCount ) != savedASig ) {
5552         STATUS(float_exception_flags) |= float_flag_inexact;
5553     }
5554     return z;
5555
5556 }
5557
5558 /*----------------------------------------------------------------------------
5559 | Returns the result of converting the quadruple-precision floating-point
5560 | value `a' to the 64-bit two's complement integer format.  The conversion
5561 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5562 | Arithmetic---which means in particular that the conversion is rounded
5563 | according to the current rounding mode.  If `a' is a NaN, the largest
5564 | positive integer is returned.  Otherwise, if the conversion overflows, the
5565 | largest integer with the same sign as `a' is returned.
5566 *----------------------------------------------------------------------------*/
5567
5568 int64 float128_to_int64( float128 a STATUS_PARAM )
5569 {
5570     flag aSign;
5571     int32 aExp, shiftCount;
5572     uint64_t aSig0, aSig1;
5573
5574     aSig1 = extractFloat128Frac1( a );
5575     aSig0 = extractFloat128Frac0( a );
5576     aExp = extractFloat128Exp( a );
5577     aSign = extractFloat128Sign( a );
5578     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5579     shiftCount = 0x402F - aExp;
5580     if ( shiftCount <= 0 ) {
5581         if ( 0x403E < aExp ) {
5582             float_raise( float_flag_invalid STATUS_VAR);
5583             if (    ! aSign
5584                  || (    ( aExp == 0x7FFF )
5585                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5586                     )
5587                ) {
5588                 return LIT64( 0x7FFFFFFFFFFFFFFF );
5589             }
5590             return (int64_t) LIT64( 0x8000000000000000 );
5591         }
5592         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5593     }
5594     else {
5595         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5596     }
5597     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5598
5599 }
5600
5601 /*----------------------------------------------------------------------------
5602 | Returns the result of converting the quadruple-precision floating-point
5603 | value `a' to the 64-bit two's complement integer format.  The conversion
5604 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5605 | Arithmetic, except that the conversion is always rounded toward zero.
5606 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
5607 | the conversion overflows, the largest integer with the same sign as `a' is
5608 | returned.
5609 *----------------------------------------------------------------------------*/
5610
5611 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5612 {
5613     flag aSign;
5614     int32 aExp, shiftCount;
5615     uint64_t aSig0, aSig1;
5616     int64 z;
5617
5618     aSig1 = extractFloat128Frac1( a );
5619     aSig0 = extractFloat128Frac0( a );
5620     aExp = extractFloat128Exp( a );
5621     aSign = extractFloat128Sign( a );
5622     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5623     shiftCount = aExp - 0x402F;
5624     if ( 0 < shiftCount ) {
5625         if ( 0x403E <= aExp ) {
5626             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5627             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
5628                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5629                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5630             }
5631             else {
5632                 float_raise( float_flag_invalid STATUS_VAR);
5633                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5634                     return LIT64( 0x7FFFFFFFFFFFFFFF );
5635                 }
5636             }
5637             return (int64_t) LIT64( 0x8000000000000000 );
5638         }
5639         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
5640         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
5641             STATUS(float_exception_flags) |= float_flag_inexact;
5642         }
5643     }
5644     else {
5645         if ( aExp < 0x3FFF ) {
5646             if ( aExp | aSig0 | aSig1 ) {
5647                 STATUS(float_exception_flags) |= float_flag_inexact;
5648             }
5649             return 0;
5650         }
5651         z = aSig0>>( - shiftCount );
5652         if (    aSig1
5653              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5654             STATUS(float_exception_flags) |= float_flag_inexact;
5655         }
5656     }
5657     if ( aSign ) z = - z;
5658     return z;
5659
5660 }
5661
5662 /*----------------------------------------------------------------------------
5663 | Returns the result of converting the quadruple-precision floating-point
5664 | value `a' to the single-precision floating-point format.  The conversion
5665 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5666 | Arithmetic.
5667 *----------------------------------------------------------------------------*/
5668
5669 float32 float128_to_float32( float128 a STATUS_PARAM )
5670 {
5671     flag aSign;
5672     int32 aExp;
5673     uint64_t aSig0, aSig1;
5674     uint32_t zSig;
5675
5676     aSig1 = extractFloat128Frac1( a );
5677     aSig0 = extractFloat128Frac0( a );
5678     aExp = extractFloat128Exp( a );
5679     aSign = extractFloat128Sign( a );
5680     if ( aExp == 0x7FFF ) {
5681         if ( aSig0 | aSig1 ) {
5682             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5683         }
5684         return packFloat32( aSign, 0xFF, 0 );
5685     }
5686     aSig0 |= ( aSig1 != 0 );
5687     shift64RightJamming( aSig0, 18, &aSig0 );
5688     zSig = aSig0;
5689     if ( aExp || zSig ) {
5690         zSig |= 0x40000000;
5691         aExp -= 0x3F81;
5692     }
5693     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5694
5695 }
5696
5697 /*----------------------------------------------------------------------------
5698 | Returns the result of converting the quadruple-precision floating-point
5699 | value `a' to the double-precision floating-point format.  The conversion
5700 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5701 | Arithmetic.
5702 *----------------------------------------------------------------------------*/
5703
5704 float64 float128_to_float64( float128 a STATUS_PARAM )
5705 {
5706     flag aSign;
5707     int32 aExp;
5708     uint64_t aSig0, aSig1;
5709
5710     aSig1 = extractFloat128Frac1( a );
5711     aSig0 = extractFloat128Frac0( a );
5712     aExp = extractFloat128Exp( a );
5713     aSign = extractFloat128Sign( a );
5714     if ( aExp == 0x7FFF ) {
5715         if ( aSig0 | aSig1 ) {
5716             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5717         }
5718         return packFloat64( aSign, 0x7FF, 0 );
5719     }
5720     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5721     aSig0 |= ( aSig1 != 0 );
5722     if ( aExp || aSig0 ) {
5723         aSig0 |= LIT64( 0x4000000000000000 );
5724         aExp -= 0x3C01;
5725     }
5726     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5727
5728 }
5729
5730 /*----------------------------------------------------------------------------
5731 | Returns the result of converting the quadruple-precision floating-point
5732 | value `a' to the extended double-precision floating-point format.  The
5733 | conversion is performed according to the IEC/IEEE Standard for Binary
5734 | Floating-Point Arithmetic.
5735 *----------------------------------------------------------------------------*/
5736
5737 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5738 {
5739     flag aSign;
5740     int32 aExp;
5741     uint64_t aSig0, aSig1;
5742
5743     aSig1 = extractFloat128Frac1( a );
5744     aSig0 = extractFloat128Frac0( a );
5745     aExp = extractFloat128Exp( a );
5746     aSign = extractFloat128Sign( a );
5747     if ( aExp == 0x7FFF ) {
5748         if ( aSig0 | aSig1 ) {
5749             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5750         }
5751         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5752     }
5753     if ( aExp == 0 ) {
5754         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5755         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5756     }
5757     else {
5758         aSig0 |= LIT64( 0x0001000000000000 );
5759     }
5760     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5761     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5762
5763 }
5764
5765 /*----------------------------------------------------------------------------
5766 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5767 | returns the result as a quadruple-precision floating-point value.  The
5768 | operation is performed according to the IEC/IEEE Standard for Binary
5769 | Floating-Point Arithmetic.
5770 *----------------------------------------------------------------------------*/
5771
5772 float128 float128_round_to_int( float128 a STATUS_PARAM )
5773 {
5774     flag aSign;
5775     int32 aExp;
5776     uint64_t lastBitMask, roundBitsMask;
5777     int8 roundingMode;
5778     float128 z;
5779
5780     aExp = extractFloat128Exp( a );
5781     if ( 0x402F <= aExp ) {
5782         if ( 0x406F <= aExp ) {
5783             if (    ( aExp == 0x7FFF )
5784                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5785                ) {
5786                 return propagateFloat128NaN( a, a STATUS_VAR );
5787             }
5788             return a;
5789         }
5790         lastBitMask = 1;
5791         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5792         roundBitsMask = lastBitMask - 1;
5793         z = a;
5794         roundingMode = STATUS(float_rounding_mode);
5795         if ( roundingMode == float_round_nearest_even ) {
5796             if ( lastBitMask ) {
5797                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5798                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5799             }
5800             else {
5801                 if ( (int64_t) z.low < 0 ) {
5802                     ++z.high;
5803                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5804                 }
5805             }
5806         }
5807         else if ( roundingMode != float_round_to_zero ) {
5808             if (   extractFloat128Sign( z )
5809                  ^ ( roundingMode == float_round_up ) ) {
5810                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5811             }
5812         }
5813         z.low &= ~ roundBitsMask;
5814     }
5815     else {
5816         if ( aExp < 0x3FFF ) {
5817             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5818             STATUS(float_exception_flags) |= float_flag_inexact;
5819             aSign = extractFloat128Sign( a );
5820             switch ( STATUS(float_rounding_mode) ) {
5821              case float_round_nearest_even:
5822                 if (    ( aExp == 0x3FFE )
5823                      && (   extractFloat128Frac0( a )
5824                           | extractFloat128Frac1( a ) )
5825                    ) {
5826                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5827                 }
5828                 break;
5829              case float_round_down:
5830                 return
5831                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5832                     : packFloat128( 0, 0, 0, 0 );
5833              case float_round_up:
5834                 return
5835                       aSign ? packFloat128( 1, 0, 0, 0 )
5836                     : packFloat128( 0, 0x3FFF, 0, 0 );
5837             }
5838             return packFloat128( aSign, 0, 0, 0 );
5839         }
5840         lastBitMask = 1;
5841         lastBitMask <<= 0x402F - aExp;
5842         roundBitsMask = lastBitMask - 1;
5843         z.low = 0;
5844         z.high = a.high;
5845         roundingMode = STATUS(float_rounding_mode);
5846         if ( roundingMode == float_round_nearest_even ) {
5847             z.high += lastBitMask>>1;
5848             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5849                 z.high &= ~ lastBitMask;
5850             }
5851         }
5852         else if ( roundingMode != float_round_to_zero ) {
5853             if (   extractFloat128Sign( z )
5854                  ^ ( roundingMode == float_round_up ) ) {
5855                 z.high |= ( a.low != 0 );
5856                 z.high += roundBitsMask;
5857             }
5858         }
5859         z.high &= ~ roundBitsMask;
5860     }
5861     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5862         STATUS(float_exception_flags) |= float_flag_inexact;
5863     }
5864     return z;
5865
5866 }
5867
5868 /*----------------------------------------------------------------------------
5869 | Returns the result of adding the absolute values of the quadruple-precision
5870 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5871 | before being returned.  `zSign' is ignored if the result is a NaN.
5872 | The addition is performed according to the IEC/IEEE Standard for Binary
5873 | Floating-Point Arithmetic.
5874 *----------------------------------------------------------------------------*/
5875
5876 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5877 {
5878     int32 aExp, bExp, zExp;
5879     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5880     int32 expDiff;
5881
5882     aSig1 = extractFloat128Frac1( a );
5883     aSig0 = extractFloat128Frac0( a );
5884     aExp = extractFloat128Exp( a );
5885     bSig1 = extractFloat128Frac1( b );
5886     bSig0 = extractFloat128Frac0( b );
5887     bExp = extractFloat128Exp( b );
5888     expDiff = aExp - bExp;
5889     if ( 0 < expDiff ) {
5890         if ( aExp == 0x7FFF ) {
5891             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5892             return a;
5893         }
5894         if ( bExp == 0 ) {
5895             --expDiff;
5896         }
5897         else {
5898             bSig0 |= LIT64( 0x0001000000000000 );
5899         }
5900         shift128ExtraRightJamming(
5901             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5902         zExp = aExp;
5903     }
5904     else if ( expDiff < 0 ) {
5905         if ( bExp == 0x7FFF ) {
5906             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5907             return packFloat128( zSign, 0x7FFF, 0, 0 );
5908         }
5909         if ( aExp == 0 ) {
5910             ++expDiff;
5911         }
5912         else {
5913             aSig0 |= LIT64( 0x0001000000000000 );
5914         }
5915         shift128ExtraRightJamming(
5916             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5917         zExp = bExp;
5918     }
5919     else {
5920         if ( aExp == 0x7FFF ) {
5921             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5922                 return propagateFloat128NaN( a, b STATUS_VAR );
5923             }
5924             return a;
5925         }
5926         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5927         if ( aExp == 0 ) {
5928             if (STATUS(flush_to_zero)) {
5929                 if (zSig0 | zSig1) {
5930                     float_raise(float_flag_output_denormal STATUS_VAR);
5931                 }
5932                 return packFloat128(zSign, 0, 0, 0);
5933             }
5934             return packFloat128( zSign, 0, zSig0, zSig1 );
5935         }
5936         zSig2 = 0;
5937         zSig0 |= LIT64( 0x0002000000000000 );
5938         zExp = aExp;
5939         goto shiftRight1;
5940     }
5941     aSig0 |= LIT64( 0x0001000000000000 );
5942     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5943     --zExp;
5944     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5945     ++zExp;
5946  shiftRight1:
5947     shift128ExtraRightJamming(
5948         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5949  roundAndPack:
5950     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5951
5952 }
5953
5954 /*----------------------------------------------------------------------------
5955 | Returns the result of subtracting the absolute values of the quadruple-
5956 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5957 | difference is negated before being returned.  `zSign' is ignored if the
5958 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5959 | Standard for Binary Floating-Point Arithmetic.
5960 *----------------------------------------------------------------------------*/
5961
5962 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5963 {
5964     int32 aExp, bExp, zExp;
5965     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5966     int32 expDiff;
5967     float128 z;
5968
5969     aSig1 = extractFloat128Frac1( a );
5970     aSig0 = extractFloat128Frac0( a );
5971     aExp = extractFloat128Exp( a );
5972     bSig1 = extractFloat128Frac1( b );
5973     bSig0 = extractFloat128Frac0( b );
5974     bExp = extractFloat128Exp( b );
5975     expDiff = aExp - bExp;
5976     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5977     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5978     if ( 0 < expDiff ) goto aExpBigger;
5979     if ( expDiff < 0 ) goto bExpBigger;
5980     if ( aExp == 0x7FFF ) {
5981         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5982             return propagateFloat128NaN( a, b STATUS_VAR );
5983         }
5984         float_raise( float_flag_invalid STATUS_VAR);
5985         z.low = float128_default_nan_low;
5986         z.high = float128_default_nan_high;
5987         return z;
5988     }
5989     if ( aExp == 0 ) {
5990         aExp = 1;
5991         bExp = 1;
5992     }
5993     if ( bSig0 < aSig0 ) goto aBigger;
5994     if ( aSig0 < bSig0 ) goto bBigger;
5995     if ( bSig1 < aSig1 ) goto aBigger;
5996     if ( aSig1 < bSig1 ) goto bBigger;
5997     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5998  bExpBigger:
5999     if ( bExp == 0x7FFF ) {
6000         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6001         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6002     }
6003     if ( aExp == 0 ) {
6004         ++expDiff;
6005     }
6006     else {
6007         aSig0 |= LIT64( 0x4000000000000000 );
6008     }
6009     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6010     bSig0 |= LIT64( 0x4000000000000000 );
6011  bBigger:
6012     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6013     zExp = bExp;
6014     zSign ^= 1;
6015     goto normalizeRoundAndPack;
6016  aExpBigger:
6017     if ( aExp == 0x7FFF ) {
6018         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6019         return a;
6020     }
6021     if ( bExp == 0 ) {
6022         --expDiff;
6023     }
6024     else {
6025         bSig0 |= LIT64( 0x4000000000000000 );
6026     }
6027     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6028     aSig0 |= LIT64( 0x4000000000000000 );
6029  aBigger:
6030     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6031     zExp = aExp;
6032  normalizeRoundAndPack:
6033     --zExp;
6034     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6035
6036 }
6037
6038 /*----------------------------------------------------------------------------
6039 | Returns the result of adding the quadruple-precision floating-point values
6040 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
6041 | for Binary Floating-Point Arithmetic.
6042 *----------------------------------------------------------------------------*/
6043
6044 float128 float128_add( float128 a, float128 b STATUS_PARAM )
6045 {
6046     flag aSign, bSign;
6047
6048     aSign = extractFloat128Sign( a );
6049     bSign = extractFloat128Sign( b );
6050     if ( aSign == bSign ) {
6051         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6052     }
6053     else {
6054         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6055     }
6056
6057 }
6058
6059 /*----------------------------------------------------------------------------
6060 | Returns the result of subtracting the quadruple-precision floating-point
6061 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6062 | Standard for Binary Floating-Point Arithmetic.
6063 *----------------------------------------------------------------------------*/
6064
6065 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6066 {
6067     flag aSign, bSign;
6068
6069     aSign = extractFloat128Sign( a );
6070     bSign = extractFloat128Sign( b );
6071     if ( aSign == bSign ) {
6072         return subFloat128Sigs( a, b, aSign STATUS_VAR );
6073     }
6074     else {
6075         return addFloat128Sigs( a, b, aSign STATUS_VAR );
6076     }
6077
6078 }
6079
6080 /*----------------------------------------------------------------------------
6081 | Returns the result of multiplying the quadruple-precision floating-point
6082 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
6083 | Standard for Binary Floating-Point Arithmetic.
6084 *----------------------------------------------------------------------------*/
6085
6086 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6087 {
6088     flag aSign, bSign, zSign;
6089     int32 aExp, bExp, zExp;
6090     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6091     float128 z;
6092
6093     aSig1 = extractFloat128Frac1( a );
6094     aSig0 = extractFloat128Frac0( a );
6095     aExp = extractFloat128Exp( a );
6096     aSign = extractFloat128Sign( a );
6097     bSig1 = extractFloat128Frac1( b );
6098     bSig0 = extractFloat128Frac0( b );
6099     bExp = extractFloat128Exp( b );
6100     bSign = extractFloat128Sign( b );
6101     zSign = aSign ^ bSign;
6102     if ( aExp == 0x7FFF ) {
6103         if (    ( aSig0 | aSig1 )
6104              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6105             return propagateFloat128NaN( a, b STATUS_VAR );
6106         }
6107         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6108         return packFloat128( zSign, 0x7FFF, 0, 0 );
6109     }
6110     if ( bExp == 0x7FFF ) {
6111         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6112         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6113  invalid:
6114             float_raise( float_flag_invalid STATUS_VAR);
6115             z.low = float128_default_nan_low;
6116             z.high = float128_default_nan_high;
6117             return z;
6118         }
6119         return packFloat128( zSign, 0x7FFF, 0, 0 );
6120     }
6121     if ( aExp == 0 ) {
6122         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6123         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6124     }
6125     if ( bExp == 0 ) {
6126         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6127         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6128     }
6129     zExp = aExp + bExp - 0x4000;
6130     aSig0 |= LIT64( 0x0001000000000000 );
6131     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6132     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6133     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6134     zSig2 |= ( zSig3 != 0 );
6135     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6136         shift128ExtraRightJamming(
6137             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6138         ++zExp;
6139     }
6140     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6141
6142 }
6143
6144 /*----------------------------------------------------------------------------
6145 | Returns the result of dividing the quadruple-precision floating-point value
6146 | `a' by the corresponding value `b'.  The operation is performed according to
6147 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6148 *----------------------------------------------------------------------------*/
6149
6150 float128 float128_div( float128 a, float128 b STATUS_PARAM )
6151 {
6152     flag aSign, bSign, zSign;
6153     int32 aExp, bExp, zExp;
6154     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6155     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6156     float128 z;
6157
6158     aSig1 = extractFloat128Frac1( a );
6159     aSig0 = extractFloat128Frac0( a );
6160     aExp = extractFloat128Exp( a );
6161     aSign = extractFloat128Sign( a );
6162     bSig1 = extractFloat128Frac1( b );
6163     bSig0 = extractFloat128Frac0( b );
6164     bExp = extractFloat128Exp( b );
6165     bSign = extractFloat128Sign( b );
6166     zSign = aSign ^ bSign;
6167     if ( aExp == 0x7FFF ) {
6168         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6169         if ( bExp == 0x7FFF ) {
6170             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6171             goto invalid;
6172         }
6173         return packFloat128( zSign, 0x7FFF, 0, 0 );
6174     }
6175     if ( bExp == 0x7FFF ) {
6176         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6177         return packFloat128( zSign, 0, 0, 0 );
6178     }
6179     if ( bExp == 0 ) {
6180         if ( ( bSig0 | bSig1 ) == 0 ) {
6181             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6182  invalid:
6183                 float_raise( float_flag_invalid STATUS_VAR);
6184                 z.low = float128_default_nan_low;
6185                 z.high = float128_default_nan_high;
6186                 return z;
6187             }
6188             float_raise( float_flag_divbyzero STATUS_VAR);
6189             return packFloat128( zSign, 0x7FFF, 0, 0 );
6190         }
6191         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6192     }
6193     if ( aExp == 0 ) {
6194         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6195         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6196     }
6197     zExp = aExp - bExp + 0x3FFD;
6198     shortShift128Left(
6199         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6200     shortShift128Left(
6201         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6202     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6203         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6204         ++zExp;
6205     }
6206     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6207     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6208     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6209     while ( (int64_t) rem0 < 0 ) {
6210         --zSig0;
6211         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6212     }
6213     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6214     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6215         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6216         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6217         while ( (int64_t) rem1 < 0 ) {
6218             --zSig1;
6219             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6220         }
6221         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6222     }
6223     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6224     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6225
6226 }
6227
6228 /*----------------------------------------------------------------------------
6229 | Returns the remainder of the quadruple-precision floating-point value `a'
6230 | with respect to the corresponding value `b'.  The operation is performed
6231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6232 *----------------------------------------------------------------------------*/
6233
6234 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6235 {
6236     flag aSign, zSign;
6237     int32 aExp, bExp, expDiff;
6238     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6239     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6240     int64_t sigMean0;
6241     float128 z;
6242
6243     aSig1 = extractFloat128Frac1( a );
6244     aSig0 = extractFloat128Frac0( a );
6245     aExp = extractFloat128Exp( a );
6246     aSign = extractFloat128Sign( a );
6247     bSig1 = extractFloat128Frac1( b );
6248     bSig0 = extractFloat128Frac0( b );
6249     bExp = extractFloat128Exp( b );
6250     if ( aExp == 0x7FFF ) {
6251         if (    ( aSig0 | aSig1 )
6252              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6253             return propagateFloat128NaN( a, b STATUS_VAR );
6254         }
6255         goto invalid;
6256     }
6257     if ( bExp == 0x7FFF ) {
6258         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6259         return a;
6260     }
6261     if ( bExp == 0 ) {
6262         if ( ( bSig0 | bSig1 ) == 0 ) {
6263  invalid:
6264             float_raise( float_flag_invalid STATUS_VAR);
6265             z.low = float128_default_nan_low;
6266             z.high = float128_default_nan_high;
6267             return z;
6268         }
6269         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6270     }
6271     if ( aExp == 0 ) {
6272         if ( ( aSig0 | aSig1 ) == 0 ) return a;
6273         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6274     }
6275     expDiff = aExp - bExp;
6276     if ( expDiff < -1 ) return a;
6277     shortShift128Left(
6278         aSig0 | LIT64( 0x0001000000000000 ),
6279         aSig1,
6280         15 - ( expDiff < 0 ),
6281         &aSig0,
6282         &aSig1
6283     );
6284     shortShift128Left(
6285         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6286     q = le128( bSig0, bSig1, aSig0, aSig1 );
6287     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6288     expDiff -= 64;
6289     while ( 0 < expDiff ) {
6290         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6291         q = ( 4 < q ) ? q - 4 : 0;
6292         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6293         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6294         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6295         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6296         expDiff -= 61;
6297     }
6298     if ( -64 < expDiff ) {
6299         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6300         q = ( 4 < q ) ? q - 4 : 0;
6301         q >>= - expDiff;
6302         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6303         expDiff += 52;
6304         if ( expDiff < 0 ) {
6305             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6306         }
6307         else {
6308             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6309         }
6310         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6311         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6312     }
6313     else {
6314         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6315         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6316     }
6317     do {
6318         alternateASig0 = aSig0;
6319         alternateASig1 = aSig1;
6320         ++q;
6321         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6322     } while ( 0 <= (int64_t) aSig0 );
6323     add128(
6324         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6325     if (    ( sigMean0 < 0 )
6326          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6327         aSig0 = alternateASig0;
6328         aSig1 = alternateASig1;
6329     }
6330     zSign = ( (int64_t) aSig0 < 0 );
6331     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6332     return
6333         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6334
6335 }
6336
6337 /*----------------------------------------------------------------------------
6338 | Returns the square root of the quadruple-precision floating-point value `a'.
6339 | The operation is performed according to the IEC/IEEE Standard for Binary
6340 | Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6342
6343 float128 float128_sqrt( float128 a STATUS_PARAM )
6344 {
6345     flag aSign;
6346     int32 aExp, zExp;
6347     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6348     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6349     float128 z;
6350
6351     aSig1 = extractFloat128Frac1( a );
6352     aSig0 = extractFloat128Frac0( a );
6353     aExp = extractFloat128Exp( a );
6354     aSign = extractFloat128Sign( a );
6355     if ( aExp == 0x7FFF ) {
6356         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6357         if ( ! aSign ) return a;
6358         goto invalid;
6359     }
6360     if ( aSign ) {
6361         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6362  invalid:
6363         float_raise( float_flag_invalid STATUS_VAR);
6364         z.low = float128_default_nan_low;
6365         z.high = float128_default_nan_high;
6366         return z;
6367     }
6368     if ( aExp == 0 ) {
6369         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6370         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6371     }
6372     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6373     aSig0 |= LIT64( 0x0001000000000000 );
6374     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6375     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6376     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6377     doubleZSig0 = zSig0<<1;
6378     mul64To128( zSig0, zSig0, &term0, &term1 );
6379     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6380     while ( (int64_t) rem0 < 0 ) {
6381         --zSig0;
6382         doubleZSig0 -= 2;
6383         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6384     }
6385     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6386     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6387         if ( zSig1 == 0 ) zSig1 = 1;
6388         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6389         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6390         mul64To128( zSig1, zSig1, &term2, &term3 );
6391         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6392         while ( (int64_t) rem1 < 0 ) {
6393             --zSig1;
6394             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6395             term3 |= 1;
6396             term2 |= doubleZSig0;
6397             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6398         }
6399         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6400     }
6401     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6402     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6403
6404 }
6405
6406 /*----------------------------------------------------------------------------
6407 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6408 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6409 | raised if either operand is a NaN.  Otherwise, the comparison is performed
6410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6411 *----------------------------------------------------------------------------*/
6412
6413 int float128_eq( float128 a, float128 b STATUS_PARAM )
6414 {
6415
6416     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6417               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6418          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6419               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6420        ) {
6421         float_raise( float_flag_invalid STATUS_VAR);
6422         return 0;
6423     }
6424     return
6425            ( a.low == b.low )
6426         && (    ( a.high == b.high )
6427              || (    ( a.low == 0 )
6428                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6429            );
6430
6431 }
6432
6433 /*----------------------------------------------------------------------------
6434 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6435 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
6436 | exception is raised if either operand is a NaN.  The comparison is performed
6437 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6438 *----------------------------------------------------------------------------*/
6439
6440 int float128_le( float128 a, float128 b STATUS_PARAM )
6441 {
6442     flag aSign, bSign;
6443
6444     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6445               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6446          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6447               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6448        ) {
6449         float_raise( float_flag_invalid STATUS_VAR);
6450         return 0;
6451     }
6452     aSign = extractFloat128Sign( a );
6453     bSign = extractFloat128Sign( b );
6454     if ( aSign != bSign ) {
6455         return
6456                aSign
6457             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6458                  == 0 );
6459     }
6460     return
6461           aSign ? le128( b.high, b.low, a.high, a.low )
6462         : le128( a.high, a.low, b.high, b.low );
6463
6464 }
6465
6466 /*----------------------------------------------------------------------------
6467 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6468 | the corresponding value `b', and 0 otherwise.  The invalid exception is
6469 | raised if either operand is a NaN.  The comparison is performed according
6470 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6471 *----------------------------------------------------------------------------*/
6472
6473 int float128_lt( float128 a, float128 b STATUS_PARAM )
6474 {
6475     flag aSign, bSign;
6476
6477     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6478               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6479          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6480               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6481        ) {
6482         float_raise( float_flag_invalid STATUS_VAR);
6483         return 0;
6484     }
6485     aSign = extractFloat128Sign( a );
6486     bSign = extractFloat128Sign( b );
6487     if ( aSign != bSign ) {
6488         return
6489                aSign
6490             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6491                  != 0 );
6492     }
6493     return
6494           aSign ? lt128( b.high, b.low, a.high, a.low )
6495         : lt128( a.high, a.low, b.high, b.low );
6496
6497 }
6498
6499 /*----------------------------------------------------------------------------
6500 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6501 | be compared, and 0 otherwise.  The invalid exception is raised if either
6502 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6503 | Standard for Binary Floating-Point Arithmetic.
6504 *----------------------------------------------------------------------------*/
6505
6506 int float128_unordered( float128 a, float128 b STATUS_PARAM )
6507 {
6508     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6509               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6510          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6511               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6512        ) {
6513         float_raise( float_flag_invalid STATUS_VAR);
6514         return 1;
6515     }
6516     return 0;
6517 }
6518
6519 /*----------------------------------------------------------------------------
6520 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6521 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6522 | exception.  The comparison is performed according to the IEC/IEEE Standard
6523 | for Binary Floating-Point Arithmetic.
6524 *----------------------------------------------------------------------------*/
6525
6526 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
6527 {
6528
6529     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6530               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6531          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6532               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6533        ) {
6534         if (    float128_is_signaling_nan( a )
6535              || float128_is_signaling_nan( b ) ) {
6536             float_raise( float_flag_invalid STATUS_VAR);
6537         }
6538         return 0;
6539     }
6540     return
6541            ( a.low == b.low )
6542         && (    ( a.high == b.high )
6543              || (    ( a.low == 0 )
6544                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6545            );
6546
6547 }
6548
6549 /*----------------------------------------------------------------------------
6550 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6551 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
6552 | cause an exception.  Otherwise, the comparison is performed according to the
6553 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6554 *----------------------------------------------------------------------------*/
6555
6556 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
6557 {
6558     flag aSign, bSign;
6559
6560     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6561               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6562          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6563               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6564        ) {
6565         if (    float128_is_signaling_nan( a )
6566              || float128_is_signaling_nan( b ) ) {
6567             float_raise( float_flag_invalid STATUS_VAR);
6568         }
6569         return 0;
6570     }
6571     aSign = extractFloat128Sign( a );
6572     bSign = extractFloat128Sign( b );
6573     if ( aSign != bSign ) {
6574         return
6575                aSign
6576             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6577                  == 0 );
6578     }
6579     return
6580           aSign ? le128( b.high, b.low, a.high, a.low )
6581         : le128( a.high, a.low, b.high, b.low );
6582
6583 }
6584
6585 /*----------------------------------------------------------------------------
6586 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6587 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
6588 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
6589 | Standard for Binary Floating-Point Arithmetic.
6590 *----------------------------------------------------------------------------*/
6591
6592 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
6593 {
6594     flag aSign, bSign;
6595
6596     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6597               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6598          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6599               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6600        ) {
6601         if (    float128_is_signaling_nan( a )
6602              || float128_is_signaling_nan( b ) ) {
6603             float_raise( float_flag_invalid STATUS_VAR);
6604         }
6605         return 0;
6606     }
6607     aSign = extractFloat128Sign( a );
6608     bSign = extractFloat128Sign( b );
6609     if ( aSign != bSign ) {
6610         return
6611                aSign
6612             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6613                  != 0 );
6614     }
6615     return
6616           aSign ? lt128( b.high, b.low, a.high, a.low )
6617         : lt128( a.high, a.low, b.high, b.low );
6618
6619 }
6620
6621 /*----------------------------------------------------------------------------
6622 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6623 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6624 | comparison is performed according to the IEC/IEEE Standard for Binary
6625 | Floating-Point Arithmetic.
6626 *----------------------------------------------------------------------------*/
6627
6628 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6629 {
6630     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6631               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6632          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6633               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6634        ) {
6635         if (    float128_is_signaling_nan( a )
6636              || float128_is_signaling_nan( b ) ) {
6637             float_raise( float_flag_invalid STATUS_VAR);
6638         }
6639         return 1;
6640     }
6641     return 0;
6642 }
6643
6644 /* misc functions */
6645 float32 uint32_to_float32(uint32_t a STATUS_PARAM)
6646 {
6647     return int64_to_float32(a STATUS_VAR);
6648 }
6649
6650 float64 uint32_to_float64(uint32_t a STATUS_PARAM)
6651 {
6652     return int64_to_float64(a STATUS_VAR);
6653 }
6654
6655 uint32 float32_to_uint32( float32 a STATUS_PARAM )
6656 {
6657     int64_t v;
6658     uint32 res;
6659     int old_exc_flags = get_float_exception_flags(status);
6660
6661     v = float32_to_int64(a STATUS_VAR);
6662     if (v < 0) {
6663         res = 0;
6664     } else if (v > 0xffffffff) {
6665         res = 0xffffffff;
6666     } else {
6667         return v;
6668     }
6669     set_float_exception_flags(old_exc_flags, status);
6670     float_raise(float_flag_invalid STATUS_VAR);
6671     return res;
6672 }
6673
6674 uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6675 {
6676     int64_t v;
6677     uint32 res;
6678     int old_exc_flags = get_float_exception_flags(status);
6679
6680     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6681     if (v < 0) {
6682         res = 0;
6683     } else if (v > 0xffffffff) {
6684         res = 0xffffffff;
6685     } else {
6686         return v;
6687     }
6688     set_float_exception_flags(old_exc_flags, status);
6689     float_raise(float_flag_invalid STATUS_VAR);
6690     return res;
6691 }
6692
6693 int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6694 {
6695     int32_t v;
6696     int_fast16_t res;
6697     int old_exc_flags = get_float_exception_flags(status);
6698
6699     v = float32_to_int32(a STATUS_VAR);
6700     if (v < -0x8000) {
6701         res = -0x8000;
6702     } else if (v > 0x7fff) {
6703         res = 0x7fff;
6704     } else {
6705         return v;
6706     }
6707
6708     set_float_exception_flags(old_exc_flags, status);
6709     float_raise(float_flag_invalid STATUS_VAR);
6710     return res;
6711 }
6712
6713 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6714 {
6715     int32_t v;
6716     uint_fast16_t res;
6717     int old_exc_flags = get_float_exception_flags(status);
6718
6719     v = float32_to_int32(a STATUS_VAR);
6720     if (v < 0) {
6721         res = 0;
6722     } else if (v > 0xffff) {
6723         res = 0xffff;
6724     } else {
6725         return v;
6726     }
6727
6728     set_float_exception_flags(old_exc_flags, status);
6729     float_raise(float_flag_invalid STATUS_VAR);
6730     return res;
6731 }
6732
6733 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
6734 {
6735     int64_t v;
6736     uint_fast16_t res;
6737     int old_exc_flags = get_float_exception_flags(status);
6738
6739     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6740     if (v < 0) {
6741         res = 0;
6742     } else if (v > 0xffff) {
6743         res = 0xffff;
6744     } else {
6745         return v;
6746     }
6747     set_float_exception_flags(old_exc_flags, status);
6748     float_raise(float_flag_invalid STATUS_VAR);
6749     return res;
6750 }
6751
6752 uint32 float64_to_uint32( float64 a STATUS_PARAM )
6753 {
6754     uint64_t v;
6755     uint32 res;
6756     int old_exc_flags = get_float_exception_flags(status);
6757
6758     v = float64_to_uint64(a STATUS_VAR);
6759     if (v > 0xffffffff) {
6760         res = 0xffffffff;
6761     } else {
6762         return v;
6763     }
6764     set_float_exception_flags(old_exc_flags, status);
6765     float_raise(float_flag_invalid STATUS_VAR);
6766     return res;
6767 }
6768
6769 uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6770 {
6771     uint64_t v;
6772     uint32 res;
6773     int old_exc_flags = get_float_exception_flags(status);
6774
6775     v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6776     if (v > 0xffffffff) {
6777         res = 0xffffffff;
6778     } else {
6779         return v;
6780     }
6781     set_float_exception_flags(old_exc_flags, status);
6782     float_raise(float_flag_invalid STATUS_VAR);
6783     return res;
6784 }
6785
6786 int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
6787 {
6788     int64_t v;
6789     int_fast16_t res;
6790     int old_exc_flags = get_float_exception_flags(status);
6791
6792     v = float64_to_int32(a STATUS_VAR);
6793     if (v < -0x8000) {
6794         res = -0x8000;
6795     } else if (v > 0x7fff) {
6796         res = 0x7fff;
6797     } else {
6798         return v;
6799     }
6800
6801     set_float_exception_flags(old_exc_flags, status);
6802     float_raise(float_flag_invalid STATUS_VAR);
6803     return res;
6804 }
6805
6806 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
6807 {
6808     int64_t v;
6809     uint_fast16_t res;
6810     int old_exc_flags = get_float_exception_flags(status);
6811
6812     v = float64_to_int32(a STATUS_VAR);
6813     if (v < 0) {
6814         res = 0;
6815     } else if (v > 0xffff) {
6816         res = 0xffff;
6817     } else {
6818         return v;
6819     }
6820
6821     set_float_exception_flags(old_exc_flags, status);
6822     float_raise(float_flag_invalid STATUS_VAR);
6823     return res;
6824 }
6825
6826 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
6827 {
6828     int64_t v;
6829     uint_fast16_t res;
6830     int old_exc_flags = get_float_exception_flags(status);
6831
6832     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6833     if (v < 0) {
6834         res = 0;
6835     } else if (v > 0xffff) {
6836         res = 0xffff;
6837     } else {
6838         return v;
6839     }
6840     set_float_exception_flags(old_exc_flags, status);
6841     float_raise(float_flag_invalid STATUS_VAR);
6842     return res;
6843 }
6844
6845 /*----------------------------------------------------------------------------
6846 | Returns the result of converting the double-precision floating-point value
6847 | `a' to the 64-bit unsigned integer format.  The conversion is
6848 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6849 | Arithmetic---which means in particular that the conversion is rounded
6850 | according to the current rounding mode.  If `a' is a NaN, the largest
6851 | positive integer is returned.  If the conversion overflows, the
6852 | largest unsigned integer is returned.  If 'a' is negative, the value is
6853 | rounded and zero is returned; negative values that do not round to zero
6854 | will raise the inexact exception.
6855 *----------------------------------------------------------------------------*/
6856
6857 uint64_t float64_to_uint64(float64 a STATUS_PARAM)
6858 {
6859     flag aSign;
6860     int_fast16_t aExp, shiftCount;
6861     uint64_t aSig, aSigExtra;
6862     a = float64_squash_input_denormal(a STATUS_VAR);
6863
6864     aSig = extractFloat64Frac(a);
6865     aExp = extractFloat64Exp(a);
6866     aSign = extractFloat64Sign(a);
6867     if (aSign && (aExp > 1022)) {
6868         float_raise(float_flag_invalid STATUS_VAR);
6869         if (float64_is_any_nan(a)) {
6870             return LIT64(0xFFFFFFFFFFFFFFFF);
6871         } else {
6872             return 0;
6873         }
6874     }
6875     if (aExp) {
6876         aSig |= LIT64(0x0010000000000000);
6877     }
6878     shiftCount = 0x433 - aExp;
6879     if (shiftCount <= 0) {
6880         if (0x43E < aExp) {
6881             float_raise(float_flag_invalid STATUS_VAR);
6882             return LIT64(0xFFFFFFFFFFFFFFFF);
6883         }
6884         aSigExtra = 0;
6885         aSig <<= -shiftCount;
6886     } else {
6887         shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
6888     }
6889     return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
6890 }
6891
6892 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6893 {
6894     signed char current_rounding_mode = STATUS(float_rounding_mode);
6895     set_float_rounding_mode(float_round_to_zero STATUS_VAR);
6896     int64_t v = float64_to_uint64(a STATUS_VAR);
6897     set_float_rounding_mode(current_rounding_mode STATUS_VAR);
6898     return v;
6899 }
6900
6901 #define COMPARE(s, nan_exp)                                                  \
6902 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6903                                       int is_quiet STATUS_PARAM )            \
6904 {                                                                            \
6905     flag aSign, bSign;                                                       \
6906     uint ## s ## _t av, bv;                                                  \
6907     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6908     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6909                                                                              \
6910     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6911          extractFloat ## s ## Frac( a ) ) ||                                 \
6912         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6913           extractFloat ## s ## Frac( b ) )) {                                \
6914         if (!is_quiet ||                                                     \
6915             float ## s ## _is_signaling_nan( a ) ||                          \
6916             float ## s ## _is_signaling_nan( b ) ) {                         \
6917             float_raise( float_flag_invalid STATUS_VAR);                     \
6918         }                                                                    \
6919         return float_relation_unordered;                                     \
6920     }                                                                        \
6921     aSign = extractFloat ## s ## Sign( a );                                  \
6922     bSign = extractFloat ## s ## Sign( b );                                  \
6923     av = float ## s ## _val(a);                                              \
6924     bv = float ## s ## _val(b);                                              \
6925     if ( aSign != bSign ) {                                                  \
6926         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6927             /* zero case */                                                  \
6928             return float_relation_equal;                                     \
6929         } else {                                                             \
6930             return 1 - (2 * aSign);                                          \
6931         }                                                                    \
6932     } else {                                                                 \
6933         if (av == bv) {                                                      \
6934             return float_relation_equal;                                     \
6935         } else {                                                             \
6936             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6937         }                                                                    \
6938     }                                                                        \
6939 }                                                                            \
6940                                                                              \
6941 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6942 {                                                                            \
6943     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6944 }                                                                            \
6945                                                                              \
6946 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6947 {                                                                            \
6948     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6949 }
6950
6951 COMPARE(32, 0xff)
6952 COMPARE(64, 0x7ff)
6953
6954 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6955                                       int is_quiet STATUS_PARAM )
6956 {
6957     flag aSign, bSign;
6958
6959     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6960           ( extractFloatx80Frac( a )<<1 ) ) ||
6961         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6962           ( extractFloatx80Frac( b )<<1 ) )) {
6963         if (!is_quiet ||
6964             floatx80_is_signaling_nan( a ) ||
6965             floatx80_is_signaling_nan( b ) ) {
6966             float_raise( float_flag_invalid STATUS_VAR);
6967         }
6968         return float_relation_unordered;
6969     }
6970     aSign = extractFloatx80Sign( a );
6971     bSign = extractFloatx80Sign( b );
6972     if ( aSign != bSign ) {
6973
6974         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6975              ( ( a.low | b.low ) == 0 ) ) {
6976             /* zero case */
6977             return float_relation_equal;
6978         } else {
6979             return 1 - (2 * aSign);
6980         }
6981     } else {
6982         if (a.low == b.low && a.high == b.high) {
6983             return float_relation_equal;
6984         } else {
6985             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6986         }
6987     }
6988 }
6989
6990 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6991 {
6992     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6993 }
6994
6995 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6996 {
6997     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6998 }
6999
7000 INLINE int float128_compare_internal( float128 a, float128 b,
7001                                       int is_quiet STATUS_PARAM )
7002 {
7003     flag aSign, bSign;
7004
7005     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7006           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7007         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7008           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7009         if (!is_quiet ||
7010             float128_is_signaling_nan( a ) ||
7011             float128_is_signaling_nan( b ) ) {
7012             float_raise( float_flag_invalid STATUS_VAR);
7013         }
7014         return float_relation_unordered;
7015     }
7016     aSign = extractFloat128Sign( a );
7017     bSign = extractFloat128Sign( b );
7018     if ( aSign != bSign ) {
7019         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7020             /* zero case */
7021             return float_relation_equal;
7022         } else {
7023             return 1 - (2 * aSign);
7024         }
7025     } else {
7026         if (a.low == b.low && a.high == b.high) {
7027             return float_relation_equal;
7028         } else {
7029             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7030         }
7031     }
7032 }
7033
7034 int float128_compare( float128 a, float128 b STATUS_PARAM )
7035 {
7036     return float128_compare_internal(a, b, 0 STATUS_VAR);
7037 }
7038
7039 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7040 {
7041     return float128_compare_internal(a, b, 1 STATUS_VAR);
7042 }
7043
7044 /* min() and max() functions. These can't be implemented as
7045  * 'compare and pick one input' because that would mishandle
7046  * NaNs and +0 vs -0.
7047  *
7048  * minnum() and maxnum() functions. These are similar to the min()
7049  * and max() functions but if one of the arguments is a QNaN and
7050  * the other is numerical then the numerical argument is returned.
7051  * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7052  * and maxNum() operations. min() and max() are the typical min/max
7053  * semantics provided by many CPUs which predate that specification.
7054  */
7055 #define MINMAX(s)                                                       \
7056 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
7057                                         int ismin, int isieee STATUS_PARAM) \
7058 {                                                                       \
7059     flag aSign, bSign;                                                  \
7060     uint ## s ## _t av, bv;                                             \
7061     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
7062     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
7063     if (float ## s ## _is_any_nan(a) ||                                 \
7064         float ## s ## _is_any_nan(b)) {                                 \
7065         if (isieee) {                                                   \
7066             if (float ## s ## _is_quiet_nan(a) &&                       \
7067                 !float ## s ##_is_any_nan(b)) {                         \
7068                 return b;                                               \
7069             } else if (float ## s ## _is_quiet_nan(b) &&                \
7070                        !float ## s ## _is_any_nan(a)) {                 \
7071                 return a;                                               \
7072             }                                                           \
7073         }                                                               \
7074         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
7075     }                                                                   \
7076     aSign = extractFloat ## s ## Sign(a);                               \
7077     bSign = extractFloat ## s ## Sign(b);                               \
7078     av = float ## s ## _val(a);                                         \
7079     bv = float ## s ## _val(b);                                         \
7080     if (aSign != bSign) {                                               \
7081         if (ismin) {                                                    \
7082             return aSign ? a : b;                                       \
7083         } else {                                                        \
7084             return aSign ? b : a;                                       \
7085         }                                                               \
7086     } else {                                                            \
7087         if (ismin) {                                                    \
7088             return (aSign ^ (av < bv)) ? a : b;                         \
7089         } else {                                                        \
7090             return (aSign ^ (av < bv)) ? b : a;                         \
7091         }                                                               \
7092     }                                                                   \
7093 }                                                                       \
7094                                                                         \
7095 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
7096 {                                                                       \
7097     return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR);                \
7098 }                                                                       \
7099                                                                         \
7100 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
7101 {                                                                       \
7102     return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR);                \
7103 }                                                                       \
7104                                                                         \
7105 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7106 {                                                                       \
7107     return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR);                \
7108 }                                                                       \
7109                                                                         \
7110 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7111 {                                                                       \
7112     return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR);                \
7113 }
7114
7115 MINMAX(32)
7116 MINMAX(64)
7117
7118
7119 /* Multiply A by 2 raised to the power N.  */
7120 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7121 {
7122     flag aSign;
7123     int16_t aExp;
7124     uint32_t aSig;
7125
7126     a = float32_squash_input_denormal(a STATUS_VAR);
7127     aSig = extractFloat32Frac( a );
7128     aExp = extractFloat32Exp( a );
7129     aSign = extractFloat32Sign( a );
7130
7131     if ( aExp == 0xFF ) {
7132         if ( aSig ) {
7133             return propagateFloat32NaN( a, a STATUS_VAR );
7134         }
7135         return a;
7136     }
7137     if (aExp != 0) {
7138         aSig |= 0x00800000;
7139     } else if (aSig == 0) {
7140         return a;
7141     } else {
7142         aExp++;
7143     }
7144
7145     if (n > 0x200) {
7146         n = 0x200;
7147     } else if (n < -0x200) {
7148         n = -0x200;
7149     }
7150
7151     aExp += n - 1;
7152     aSig <<= 7;
7153     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
7154 }
7155
7156 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7157 {
7158     flag aSign;
7159     int16_t aExp;
7160     uint64_t aSig;
7161
7162     a = float64_squash_input_denormal(a STATUS_VAR);
7163     aSig = extractFloat64Frac( a );
7164     aExp = extractFloat64Exp( a );
7165     aSign = extractFloat64Sign( a );
7166
7167     if ( aExp == 0x7FF ) {
7168         if ( aSig ) {
7169             return propagateFloat64NaN( a, a STATUS_VAR );
7170         }
7171         return a;
7172     }
7173     if (aExp != 0) {
7174         aSig |= LIT64( 0x0010000000000000 );
7175     } else if (aSig == 0) {
7176         return a;
7177     } else {
7178         aExp++;
7179     }
7180
7181     if (n > 0x1000) {
7182         n = 0x1000;
7183     } else if (n < -0x1000) {
7184         n = -0x1000;
7185     }
7186
7187     aExp += n - 1;
7188     aSig <<= 10;
7189     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
7190 }
7191
7192 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7193 {
7194     flag aSign;
7195     int32_t aExp;
7196     uint64_t aSig;
7197
7198     aSig = extractFloatx80Frac( a );
7199     aExp = extractFloatx80Exp( a );
7200     aSign = extractFloatx80Sign( a );
7201
7202     if ( aExp == 0x7FFF ) {
7203         if ( aSig<<1 ) {
7204             return propagateFloatx80NaN( a, a STATUS_VAR );
7205         }
7206         return a;
7207     }
7208
7209     if (aExp == 0) {
7210         if (aSig == 0) {
7211             return a;
7212         }
7213         aExp++;
7214     }
7215
7216     if (n > 0x10000) {
7217         n = 0x10000;
7218     } else if (n < -0x10000) {
7219         n = -0x10000;
7220     }
7221
7222     aExp += n;
7223     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7224                                           aSign, aExp, aSig, 0 STATUS_VAR );
7225 }
7226
7227 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7228 {
7229     flag aSign;
7230     int32_t aExp;
7231     uint64_t aSig0, aSig1;
7232
7233     aSig1 = extractFloat128Frac1( a );
7234     aSig0 = extractFloat128Frac0( a );
7235     aExp = extractFloat128Exp( a );
7236     aSign = extractFloat128Sign( a );
7237     if ( aExp == 0x7FFF ) {
7238         if ( aSig0 | aSig1 ) {
7239             return propagateFloat128NaN( a, a STATUS_VAR );
7240         }
7241         return a;
7242     }
7243     if (aExp != 0) {
7244         aSig0 |= LIT64( 0x0001000000000000 );
7245     } else if (aSig0 == 0 && aSig1 == 0) {
7246         return a;
7247     } else {
7248         aExp++;
7249     }
7250
7251     if (n > 0x10000) {
7252         n = 0x10000;
7253     } else if (n < -0x10000) {
7254         n = -0x10000;
7255     }
7256
7257     aExp += n - 1;
7258     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7259                                           STATUS_VAR );
7260
7261 }