fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 #include "softfloat.h"
  39
  40 /*----------------------------------------------------------------------------
  41 | Primitive arithmetic functions, including multi-word arithmetic, and
  42 | division and square root approximations.  (Can be specialized to target if
  43 | desired.)
  44 *----------------------------------------------------------------------------*/
  45 #include "softfloat-macros.h"
  46
  47 /*----------------------------------------------------------------------------
  48 | Functions and definitions to determine:  (1) whether tininess for underflow
  49 | is detected before or after rounding by default, (2) what (if anything)
  50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  52 | are propagated from function inputs to output.  These details are target-
  53 | specific.
  54 *----------------------------------------------------------------------------*/
  55 #include "softfloat-specialize.h"
  56
  57 void set_float_rounding_mode(int val STATUS_PARAM)
  58 {
  59     STATUS(float_rounding_mode) = val;
  60 }
  61
  62 void set_float_exception_flags(int val STATUS_PARAM)
  63 {
  64     STATUS(float_exception_flags) = val;
  65 }
  66
  67 #ifdef FLOATX80
  68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  69 {
  70     STATUS(floatx80_rounding_precision) = val;
  71 }
  72 #endif
  73
  74 /*----------------------------------------------------------------------------
  75 | Returns the fraction bits of the half-precision floating-point value `a'.
  76 *----------------------------------------------------------------------------*/
  77
  78 INLINE uint32_t extractFloat16Frac(float16 a)
  79 {
  80     return float16_val(a) & 0x3ff;
  81 }
  82
  83 /*----------------------------------------------------------------------------
  84 | Returns the exponent bits of the half-precision floating-point value `a'.
  85 *----------------------------------------------------------------------------*/
  86
  87 INLINE int16 extractFloat16Exp(float16 a)
  88 {
  89     return (float16_val(a) >> 10) & 0x1f;
  90 }
  91
  92 /*----------------------------------------------------------------------------
  93 | Returns the sign bit of the single-precision floating-point value `a'.
  94 *----------------------------------------------------------------------------*/
  95
  96 INLINE flag extractFloat16Sign(float16 a)
  97 {
  98     return float16_val(a)>>15;
  99 }
 100
 101 /*----------------------------------------------------------------------------
 102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 104 | input.  If `zSign' is 1, the input is negated before being converted to an
 105 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 106 | is simply rounded to an integer, with the inexact exception raised if the
 107 | input cannot be represented exactly as an integer.  However, if the fixed-
 108 | point input is too large, the invalid exception is raised and the largest
 109 | positive or negative integer is returned.
 110 *----------------------------------------------------------------------------*/
 111
 112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 113 {
 114     int8 roundingMode;
 115     flag roundNearestEven;
 116     int8 roundIncrement, roundBits;
 117     int32 z;
 118
 119     roundingMode = STATUS(float_rounding_mode);
 120     roundNearestEven = ( roundingMode == float_round_nearest_even );
 121     roundIncrement = 0x40;
 122     if ( ! roundNearestEven ) {
 123         if ( roundingMode == float_round_to_zero ) {
 124             roundIncrement = 0;
 125         }
 126         else {
 127             roundIncrement = 0x7F;
 128             if ( zSign ) {
 129                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 130             }
 131             else {
 132                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 133             }
 134         }
 135     }
 136     roundBits = absZ & 0x7F;
 137     absZ = ( absZ + roundIncrement )>>7;
 138     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 139     z = absZ;
 140     if ( zSign ) z = - z;
 141     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 142         float_raise( float_flag_invalid STATUS_VAR);
 143         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 144     }
 145     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 146     return z;
 147
 148 }
 149
 150 /*----------------------------------------------------------------------------
 151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 153 | and returns the properly rounded 64-bit integer corresponding to the input.
 154 | If `zSign' is 1, the input is negated before being converted to an integer.
 155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 156 | the inexact exception raised if the input cannot be represented exactly as
 157 | an integer.  However, if the fixed-point input is too large, the invalid
 158 | exception is raised and the largest positive or negative integer is
 159 | returned.
 160 *----------------------------------------------------------------------------*/
 161
 162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 163 {
 164     int8 roundingMode;
 165     flag roundNearestEven, increment;
 166     int64 z;
 167
 168     roundingMode = STATUS(float_rounding_mode);
 169     roundNearestEven = ( roundingMode == float_round_nearest_even );
 170     increment = ( (int64_t) absZ1 < 0 );
 171     if ( ! roundNearestEven ) {
 172         if ( roundingMode == float_round_to_zero ) {
 173             increment = 0;
 174         }
 175         else {
 176             if ( zSign ) {
 177                 increment = ( roundingMode == float_round_down ) && absZ1;
 178             }
 179             else {
 180                 increment = ( roundingMode == float_round_up ) && absZ1;
 181             }
 182         }
 183     }
 184     if ( increment ) {
 185         ++absZ0;
 186         if ( absZ0 == 0 ) goto overflow;
 187         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 188     }
 189     z = absZ0;
 190     if ( zSign ) z = - z;
 191     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 192  overflow:
 193         float_raise( float_flag_invalid STATUS_VAR);
 194         return
 195               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 196             : LIT64( 0x7FFFFFFFFFFFFFFF );
 197     }
 198     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 199     return z;
 200
 201 }
 202
 203 /*----------------------------------------------------------------------------
 204 | Returns the fraction bits of the single-precision floating-point value `a'.
 205 *----------------------------------------------------------------------------*/
 206
 207 INLINE uint32_t extractFloat32Frac( float32 a )
 208 {
 209
 210     return float32_val(a) & 0x007FFFFF;
 211
 212 }
 213
 214 /*----------------------------------------------------------------------------
 215 | Returns the exponent bits of the single-precision floating-point value `a'.
 216 *----------------------------------------------------------------------------*/
 217
 218 INLINE int16 extractFloat32Exp( float32 a )
 219 {
 220
 221     return ( float32_val(a)>>23 ) & 0xFF;
 222
 223 }
 224
 225 /*----------------------------------------------------------------------------
 226 | Returns the sign bit of the single-precision floating-point value `a'.
 227 *----------------------------------------------------------------------------*/
 228
 229 INLINE flag extractFloat32Sign( float32 a )
 230 {
 231
 232     return float32_val(a)>>31;
 233
 234 }
 235
 236 /*----------------------------------------------------------------------------
 237 | If `a' is denormal and we are in flush-to-zero mode then set the
 238 | input-denormal exception and return zero. Otherwise just return the value.
 239 *----------------------------------------------------------------------------*/
 240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 241 {
 242     if (STATUS(flush_inputs_to_zero)) {
 243         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 244             float_raise(float_flag_input_denormal STATUS_VAR);
 245             return make_float32(float32_val(a) & 0x80000000);
 246         }
 247     }
 248     return a;
 249 }
 250
 251 /*----------------------------------------------------------------------------
 252 | Normalizes the subnormal single-precision floating-point value represented
 253 | by the denormalized significand `aSig'.  The normalized exponent and
 254 | significand are stored at the locations pointed to by `zExpPtr' and
 255 | `zSigPtr', respectively.
 256 *----------------------------------------------------------------------------*/
 257
 258 static void
 259  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
 260 {
 261     int8 shiftCount;
 262
 263     shiftCount = countLeadingZeros32( aSig ) - 8;
 264     *zSigPtr = aSig<<shiftCount;
 265     *zExpPtr = 1 - shiftCount;
 266
 267 }
 268
 269 /*----------------------------------------------------------------------------
 270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 271 | single-precision floating-point value, returning the result.  After being
 272 | shifted into the proper positions, the three fields are simply added
 273 | together to form the result.  This means that any integer portion of `zSig'
 274 | will be added into the exponent.  Since a properly normalized significand
 275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 276 | than the desired result exponent whenever `zSig' is a complete, normalized
 277 | significand.
 278 *----------------------------------------------------------------------------*/
 279
 280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
 281 {
 282
 283     return make_float32(
 284           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 285
 286 }
 287
 288 /*----------------------------------------------------------------------------
 289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 290 | and significand `zSig', and returns the proper single-precision floating-
 291 | point value corresponding to the abstract input.  Ordinarily, the abstract
 292 | value is simply rounded and packed into the single-precision format, with
 293 | the inexact exception raised if the abstract input cannot be represented
 294 | exactly.  However, if the abstract value is too large, the overflow and
 295 | inexact exceptions are raised and an infinity or maximal finite value is
 296 | returned.  If the abstract value is too small, the input value is rounded to
 297 | a subnormal number, and the underflow and inexact exceptions are raised if
 298 | the abstract input cannot be represented exactly as a subnormal single-
 299 | precision floating-point number.
 300 |     The input significand `zSig' has its binary point between bits 30
 301 | and 29, which is 7 bits to the left of the usual location.  This shifted
 302 | significand must be normalized or smaller.  If `zSig' is not normalized,
 303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 304 | and it must not require rounding.  In the usual case that `zSig' is
 305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 307 | Binary Floating-Point Arithmetic.
 308 *----------------------------------------------------------------------------*/
 309
 310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 311 {
 312     int8 roundingMode;
 313     flag roundNearestEven;
 314     int8 roundIncrement, roundBits;
 315     flag isTiny;
 316
 317     roundingMode = STATUS(float_rounding_mode);
 318     roundNearestEven = ( roundingMode == float_round_nearest_even );
 319     roundIncrement = 0x40;
 320     if ( ! roundNearestEven ) {
 321         if ( roundingMode == float_round_to_zero ) {
 322             roundIncrement = 0;
 323         }
 324         else {
 325             roundIncrement = 0x7F;
 326             if ( zSign ) {
 327                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 328             }
 329             else {
 330                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 331             }
 332         }
 333     }
 334     roundBits = zSig & 0x7F;
 335     if ( 0xFD <= (uint16_t) zExp ) {
 336         if (    ( 0xFD < zExp )
 337              || (    ( zExp == 0xFD )
 338                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 339            ) {
 340             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 341             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 342         }
 343         if ( zExp < 0 ) {
 344             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 345             isTiny =
 346                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 347                 || ( zExp < -1 )
 348                 || ( zSig + roundIncrement < 0x80000000 );
 349             shift32RightJamming( zSig, - zExp, &zSig );
 350             zExp = 0;
 351             roundBits = zSig & 0x7F;
 352             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 353         }
 354     }
 355     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 356     zSig = ( zSig + roundIncrement )>>7;
 357     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 358     if ( zSig == 0 ) zExp = 0;
 359     return packFloat32( zSign, zExp, zSig );
 360
 361 }
 362
 363 /*----------------------------------------------------------------------------
 364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 365 | and significand `zSig', and returns the proper single-precision floating-
 366 | point value corresponding to the abstract input.  This routine is just like
 367 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 368 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 369 | floating-point exponent.
 370 *----------------------------------------------------------------------------*/
 371
 372 static float32
 373  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 374 {
 375     int8 shiftCount;
 376
 377     shiftCount = countLeadingZeros32( zSig ) - 1;
 378     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 379
 380 }
 381
 382 /*----------------------------------------------------------------------------
 383 | Returns the fraction bits of the double-precision floating-point value `a'.
 384 *----------------------------------------------------------------------------*/
 385
 386 INLINE uint64_t extractFloat64Frac( float64 a )
 387 {
 388
 389     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 390
 391 }
 392
 393 /*----------------------------------------------------------------------------
 394 | Returns the exponent bits of the double-precision floating-point value `a'.
 395 *----------------------------------------------------------------------------*/
 396
 397 INLINE int16 extractFloat64Exp( float64 a )
 398 {
 399
 400     return ( float64_val(a)>>52 ) & 0x7FF;
 401
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Returns the sign bit of the double-precision floating-point value `a'.
 406 *----------------------------------------------------------------------------*/
 407
 408 INLINE flag extractFloat64Sign( float64 a )
 409 {
 410
 411     return float64_val(a)>>63;
 412
 413 }
 414
 415 /*----------------------------------------------------------------------------
 416 | If `a' is denormal and we are in flush-to-zero mode then set the
 417 | input-denormal exception and return zero. Otherwise just return the value.
 418 *----------------------------------------------------------------------------*/
 419 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 420 {
 421     if (STATUS(flush_inputs_to_zero)) {
 422         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 423             float_raise(float_flag_input_denormal STATUS_VAR);
 424             return make_float64(float64_val(a) & (1ULL << 63));
 425         }
 426     }
 427     return a;
 428 }
 429
 430 /*----------------------------------------------------------------------------
 431 | Normalizes the subnormal double-precision floating-point value represented
 432 | by the denormalized significand `aSig'.  The normalized exponent and
 433 | significand are stored at the locations pointed to by `zExpPtr' and
 434 | `zSigPtr', respectively.
 435 *----------------------------------------------------------------------------*/
 436
 437 static void
 438  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
 439 {
 440     int8 shiftCount;
 441
 442     shiftCount = countLeadingZeros64( aSig ) - 11;
 443     *zSigPtr = aSig<<shiftCount;
 444     *zExpPtr = 1 - shiftCount;
 445
 446 }
 447
 448 /*----------------------------------------------------------------------------
 449 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 450 | double-precision floating-point value, returning the result.  After being
 451 | shifted into the proper positions, the three fields are simply added
 452 | together to form the result.  This means that any integer portion of `zSig'
 453 | will be added into the exponent.  Since a properly normalized significand
 454 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 455 | than the desired result exponent whenever `zSig' is a complete, normalized
 456 | significand.
 457 *----------------------------------------------------------------------------*/
 458
 459 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
 460 {
 461
 462     return make_float64(
 463         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 464
 465 }
 466
 467 /*----------------------------------------------------------------------------
 468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 469 | and significand `zSig', and returns the proper double-precision floating-
 470 | point value corresponding to the abstract input.  Ordinarily, the abstract
 471 | value is simply rounded and packed into the double-precision format, with
 472 | the inexact exception raised if the abstract input cannot be represented
 473 | exactly.  However, if the abstract value is too large, the overflow and
 474 | inexact exceptions are raised and an infinity or maximal finite value is
 475 | returned.  If the abstract value is too small, the input value is rounded
 476 | to a subnormal number, and the underflow and inexact exceptions are raised
 477 | if the abstract input cannot be represented exactly as a subnormal double-
 478 | precision floating-point number.
 479 |     The input significand `zSig' has its binary point between bits 62
 480 | and 61, which is 10 bits to the left of the usual location.  This shifted
 481 | significand must be normalized or smaller.  If `zSig' is not normalized,
 482 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 483 | and it must not require rounding.  In the usual case that `zSig' is
 484 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 485 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 486 | Binary Floating-Point Arithmetic.
 487 *----------------------------------------------------------------------------*/
 488
 489 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 490 {
 491     int8 roundingMode;
 492     flag roundNearestEven;
 493     int16 roundIncrement, roundBits;
 494     flag isTiny;
 495
 496     roundingMode = STATUS(float_rounding_mode);
 497     roundNearestEven = ( roundingMode == float_round_nearest_even );
 498     roundIncrement = 0x200;
 499     if ( ! roundNearestEven ) {
 500         if ( roundingMode == float_round_to_zero ) {
 501             roundIncrement = 0;
 502         }
 503         else {
 504             roundIncrement = 0x3FF;
 505             if ( zSign ) {
 506                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 507             }
 508             else {
 509                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 510             }
 511         }
 512     }
 513     roundBits = zSig & 0x3FF;
 514     if ( 0x7FD <= (uint16_t) zExp ) {
 515         if (    ( 0x7FD < zExp )
 516              || (    ( zExp == 0x7FD )
 517                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 518            ) {
 519             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 520             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 521         }
 522         if ( zExp < 0 ) {
 523             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 524             isTiny =
 525                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 526                 || ( zExp < -1 )
 527                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 528             shift64RightJamming( zSig, - zExp, &zSig );
 529             zExp = 0;
 530             roundBits = zSig & 0x3FF;
 531             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 532         }
 533     }
 534     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 535     zSig = ( zSig + roundIncrement )>>10;
 536     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 537     if ( zSig == 0 ) zExp = 0;
 538     return packFloat64( zSign, zExp, zSig );
 539
 540 }
 541
 542 /*----------------------------------------------------------------------------
 543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 544 | and significand `zSig', and returns the proper double-precision floating-
 545 | point value corresponding to the abstract input.  This routine is just like
 546 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 547 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 548 | floating-point exponent.
 549 *----------------------------------------------------------------------------*/
 550
 551 static float64
 552  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 553 {
 554     int8 shiftCount;
 555
 556     shiftCount = countLeadingZeros64( zSig ) - 1;
 557     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 558
 559 }
 560
 561 #ifdef FLOATX80
 562
 563 /*----------------------------------------------------------------------------
 564 | Returns the fraction bits of the extended double-precision floating-point
 565 | value `a'.
 566 *----------------------------------------------------------------------------*/
 567
 568 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 569 {
 570
 571     return a.low;
 572
 573 }
 574
 575 /*----------------------------------------------------------------------------
 576 | Returns the exponent bits of the extended double-precision floating-point
 577 | value `a'.
 578 *----------------------------------------------------------------------------*/
 579
 580 INLINE int32 extractFloatx80Exp( floatx80 a )
 581 {
 582
 583     return a.high & 0x7FFF;
 584
 585 }
 586
 587 /*----------------------------------------------------------------------------
 588 | Returns the sign bit of the extended double-precision floating-point value
 589 | `a'.
 590 *----------------------------------------------------------------------------*/
 591
 592 INLINE flag extractFloatx80Sign( floatx80 a )
 593 {
 594
 595     return a.high>>15;
 596
 597 }
 598
 599 /*----------------------------------------------------------------------------
 600 | Normalizes the subnormal extended double-precision floating-point value
 601 | represented by the denormalized significand `aSig'.  The normalized exponent
 602 | and significand are stored at the locations pointed to by `zExpPtr' and
 603 | `zSigPtr', respectively.
 604 *----------------------------------------------------------------------------*/
 605
 606 static void
 607  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 608 {
 609     int8 shiftCount;
 610
 611     shiftCount = countLeadingZeros64( aSig );
 612     *zSigPtr = aSig<<shiftCount;
 613     *zExpPtr = 1 - shiftCount;
 614
 615 }
 616
 617 /*----------------------------------------------------------------------------
 618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 619 | extended double-precision floating-point value, returning the result.
 620 *----------------------------------------------------------------------------*/
 621
 622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 623 {
 624     floatx80 z;
 625
 626     z.low = zSig;
 627     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 628     return z;
 629
 630 }
 631
 632 /*----------------------------------------------------------------------------
 633 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 634 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 635 | and returns the proper extended double-precision floating-point value
 636 | corresponding to the abstract input.  Ordinarily, the abstract value is
 637 | rounded and packed into the extended double-precision format, with the
 638 | inexact exception raised if the abstract input cannot be represented
 639 | exactly.  However, if the abstract value is too large, the overflow and
 640 | inexact exceptions are raised and an infinity or maximal finite value is
 641 | returned.  If the abstract value is too small, the input value is rounded to
 642 | a subnormal number, and the underflow and inexact exceptions are raised if
 643 | the abstract input cannot be represented exactly as a subnormal extended
 644 | double-precision floating-point number.
 645 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 646 | number of bits as single or double precision, respectively.  Otherwise, the
 647 | result is rounded to the full precision of the extended double-precision
 648 | format.
 649 |     The input significand must be normalized or smaller.  If the input
 650 | significand is not normalized, `zExp' must be 0; in that case, the result
 651 | returned is a subnormal number, and it must not require rounding.  The
 652 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 653 | Floating-Point Arithmetic.
 654 *----------------------------------------------------------------------------*/
 655
 656 static floatx80
 657  roundAndPackFloatx80(
 658      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 659  STATUS_PARAM)
 660 {
 661     int8 roundingMode;
 662     flag roundNearestEven, increment, isTiny;
 663     int64 roundIncrement, roundMask, roundBits;
 664
 665     roundingMode = STATUS(float_rounding_mode);
 666     roundNearestEven = ( roundingMode == float_round_nearest_even );
 667     if ( roundingPrecision == 80 ) goto precision80;
 668     if ( roundingPrecision == 64 ) {
 669         roundIncrement = LIT64( 0x0000000000000400 );
 670         roundMask = LIT64( 0x00000000000007FF );
 671     }
 672     else if ( roundingPrecision == 32 ) {
 673         roundIncrement = LIT64( 0x0000008000000000 );
 674         roundMask = LIT64( 0x000000FFFFFFFFFF );
 675     }
 676     else {
 677         goto precision80;
 678     }
 679     zSig0 |= ( zSig1 != 0 );
 680     if ( ! roundNearestEven ) {
 681         if ( roundingMode == float_round_to_zero ) {
 682             roundIncrement = 0;
 683         }
 684         else {
 685             roundIncrement = roundMask;
 686             if ( zSign ) {
 687                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 688             }
 689             else {
 690                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 691             }
 692         }
 693     }
 694     roundBits = zSig0 & roundMask;
 695     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 696         if (    ( 0x7FFE < zExp )
 697              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 698            ) {
 699             goto overflow;
 700         }
 701         if ( zExp <= 0 ) {
 702             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 703             isTiny =
 704                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 705                 || ( zExp < 0 )
 706                 || ( zSig0 <= zSig0 + roundIncrement );
 707             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 708             zExp = 0;
 709             roundBits = zSig0 & roundMask;
 710             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 711             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 712             zSig0 += roundIncrement;
 713             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 714             roundIncrement = roundMask + 1;
 715             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 716                 roundMask |= roundIncrement;
 717             }
 718             zSig0 &= ~ roundMask;
 719             return packFloatx80( zSign, zExp, zSig0 );
 720         }
 721     }
 722     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 723     zSig0 += roundIncrement;
 724     if ( zSig0 < roundIncrement ) {
 725         ++zExp;
 726         zSig0 = LIT64( 0x8000000000000000 );
 727     }
 728     roundIncrement = roundMask + 1;
 729     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 730         roundMask |= roundIncrement;
 731     }
 732     zSig0 &= ~ roundMask;
 733     if ( zSig0 == 0 ) zExp = 0;
 734     return packFloatx80( zSign, zExp, zSig0 );
 735  precision80:
 736     increment = ( (int64_t) zSig1 < 0 );
 737     if ( ! roundNearestEven ) {
 738         if ( roundingMode == float_round_to_zero ) {
 739             increment = 0;
 740         }
 741         else {
 742             if ( zSign ) {
 743                 increment = ( roundingMode == float_round_down ) && zSig1;
 744             }
 745             else {
 746                 increment = ( roundingMode == float_round_up ) && zSig1;
 747             }
 748         }
 749     }
 750     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 751         if (    ( 0x7FFE < zExp )
 752              || (    ( zExp == 0x7FFE )
 753                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 754                   && increment
 755                 )
 756            ) {
 757             roundMask = 0;
 758  overflow:
 759             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 760             if (    ( roundingMode == float_round_to_zero )
 761                  || ( zSign && ( roundingMode == float_round_up ) )
 762                  || ( ! zSign && ( roundingMode == float_round_down ) )
 763                ) {
 764                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 765             }
 766             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 767         }
 768         if ( zExp <= 0 ) {
 769             isTiny =
 770                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 771                 || ( zExp < 0 )
 772                 || ! increment
 773                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 774             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 775             zExp = 0;
 776             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 777             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 778             if ( roundNearestEven ) {
 779                 increment = ( (int64_t) zSig1 < 0 );
 780             }
 781             else {
 782                 if ( zSign ) {
 783                     increment = ( roundingMode == float_round_down ) && zSig1;
 784                 }
 785                 else {
 786                     increment = ( roundingMode == float_round_up ) && zSig1;
 787                 }
 788             }
 789             if ( increment ) {
 790                 ++zSig0;
 791                 zSig0 &=
 792                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 793                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 794             }
 795             return packFloatx80( zSign, zExp, zSig0 );
 796         }
 797     }
 798     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 799     if ( increment ) {
 800         ++zSig0;
 801         if ( zSig0 == 0 ) {
 802             ++zExp;
 803             zSig0 = LIT64( 0x8000000000000000 );
 804         }
 805         else {
 806             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 807         }
 808     }
 809     else {
 810         if ( zSig0 == 0 ) zExp = 0;
 811     }
 812     return packFloatx80( zSign, zExp, zSig0 );
 813
 814 }
 815
 816 /*----------------------------------------------------------------------------
 817 | Takes an abstract floating-point value having sign `zSign', exponent
 818 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 819 | and returns the proper extended double-precision floating-point value
 820 | corresponding to the abstract input.  This routine is just like
 821 | `roundAndPackFloatx80' except that the input significand does not have to be
 822 | normalized.
 823 *----------------------------------------------------------------------------*/
 824
 825 static floatx80
 826  normalizeRoundAndPackFloatx80(
 827      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 828  STATUS_PARAM)
 829 {
 830     int8 shiftCount;
 831
 832     if ( zSig0 == 0 ) {
 833         zSig0 = zSig1;
 834         zSig1 = 0;
 835         zExp -= 64;
 836     }
 837     shiftCount = countLeadingZeros64( zSig0 );
 838     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 839     zExp -= shiftCount;
 840     return
 841         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 842
 843 }
 844
 845 #endif
 846
 847 #ifdef FLOAT128
 848
 849 /*----------------------------------------------------------------------------
 850 | Returns the least-significant 64 fraction bits of the quadruple-precision
 851 | floating-point value `a'.
 852 *----------------------------------------------------------------------------*/
 853
 854 INLINE uint64_t extractFloat128Frac1( float128 a )
 855 {
 856
 857     return a.low;
 858
 859 }
 860
 861 /*----------------------------------------------------------------------------
 862 | Returns the most-significant 48 fraction bits of the quadruple-precision
 863 | floating-point value `a'.
 864 *----------------------------------------------------------------------------*/
 865
 866 INLINE uint64_t extractFloat128Frac0( float128 a )
 867 {
 868
 869     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 870
 871 }
 872
 873 /*----------------------------------------------------------------------------
 874 | Returns the exponent bits of the quadruple-precision floating-point value
 875 | `a'.
 876 *----------------------------------------------------------------------------*/
 877
 878 INLINE int32 extractFloat128Exp( float128 a )
 879 {
 880
 881     return ( a.high>>48 ) & 0x7FFF;
 882
 883 }
 884
 885 /*----------------------------------------------------------------------------
 886 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 887 *----------------------------------------------------------------------------*/
 888
 889 INLINE flag extractFloat128Sign( float128 a )
 890 {
 891
 892     return a.high>>63;
 893
 894 }
 895
 896 /*----------------------------------------------------------------------------
 897 | Normalizes the subnormal quadruple-precision floating-point value
 898 | represented by the denormalized significand formed by the concatenation of
 899 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 900 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 901 | significand are stored at the location pointed to by `zSig0Ptr', and the
 902 | least significant 64 bits of the normalized significand are stored at the
 903 | location pointed to by `zSig1Ptr'.
 904 *----------------------------------------------------------------------------*/
 905
 906 static void
 907  normalizeFloat128Subnormal(
 908      uint64_t aSig0,
 909      uint64_t aSig1,
 910      int32 *zExpPtr,
 911      uint64_t *zSig0Ptr,
 912      uint64_t *zSig1Ptr
 913  )
 914 {
 915     int8 shiftCount;
 916
 917     if ( aSig0 == 0 ) {
 918         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 919         if ( shiftCount < 0 ) {
 920             *zSig0Ptr = aSig1>>( - shiftCount );
 921             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 922         }
 923         else {
 924             *zSig0Ptr = aSig1<<shiftCount;
 925             *zSig1Ptr = 0;
 926         }
 927         *zExpPtr = - shiftCount - 63;
 928     }
 929     else {
 930         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 931         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 932         *zExpPtr = 1 - shiftCount;
 933     }
 934
 935 }
 936
 937 /*----------------------------------------------------------------------------
 938 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 939 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 940 | floating-point value, returning the result.  After being shifted into the
 941 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 942 | added together to form the most significant 32 bits of the result.  This
 943 | means that any integer portion of `zSig0' will be added into the exponent.
 944 | Since a properly normalized significand will have an integer portion equal
 945 | to 1, the `zExp' input should be 1 less than the desired result exponent
 946 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 947 | significand.
 948 *----------------------------------------------------------------------------*/
 949
 950 INLINE float128
 951  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 952 {
 953     float128 z;
 954
 955     z.low = zSig1;
 956     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 957     return z;
 958
 959 }
 960
 961 /*----------------------------------------------------------------------------
 962 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 963 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 964 | and `zSig2', and returns the proper quadruple-precision floating-point value
 965 | corresponding to the abstract input.  Ordinarily, the abstract value is
 966 | simply rounded and packed into the quadruple-precision format, with the
 967 | inexact exception raised if the abstract input cannot be represented
 968 | exactly.  However, if the abstract value is too large, the overflow and
 969 | inexact exceptions are raised and an infinity or maximal finite value is
 970 | returned.  If the abstract value is too small, the input value is rounded to
 971 | a subnormal number, and the underflow and inexact exceptions are raised if
 972 | the abstract input cannot be represented exactly as a subnormal quadruple-
 973 | precision floating-point number.
 974 |     The input significand must be normalized or smaller.  If the input
 975 | significand is not normalized, `zExp' must be 0; in that case, the result
 976 | returned is a subnormal number, and it must not require rounding.  In the
 977 | usual case that the input significand is normalized, `zExp' must be 1 less
 978 | than the ``true'' floating-point exponent.  The handling of underflow and
 979 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 980 *----------------------------------------------------------------------------*/
 981
 982 static float128
 983  roundAndPackFloat128(
 984      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 985 {
 986     int8 roundingMode;
 987     flag roundNearestEven, increment, isTiny;
 988
 989     roundingMode = STATUS(float_rounding_mode);
 990     roundNearestEven = ( roundingMode == float_round_nearest_even );
 991     increment = ( (int64_t) zSig2 < 0 );
 992     if ( ! roundNearestEven ) {
 993         if ( roundingMode == float_round_to_zero ) {
 994             increment = 0;
 995         }
 996         else {
 997             if ( zSign ) {
 998                 increment = ( roundingMode == float_round_down ) && zSig2;
 999             }
1000             else {
1001                 increment = ( roundingMode == float_round_up ) && zSig2;
1002             }
1003         }
1004     }
1005     if ( 0x7FFD <= (uint32_t) zExp ) {
1006         if (    ( 0x7FFD < zExp )
1007              || (    ( zExp == 0x7FFD )
1008                   && eq128(
1009                          LIT64( 0x0001FFFFFFFFFFFF ),
1010                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                          zSig0,
1012                          zSig1
1013                      )
1014                   && increment
1015                 )
1016            ) {
1017             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018             if (    ( roundingMode == float_round_to_zero )
1019                  || ( zSign && ( roundingMode == float_round_up ) )
1020                  || ( ! zSign && ( roundingMode == float_round_down ) )
1021                ) {
1022                 return
1023                     packFloat128(
1024                         zSign,
1025                         0x7FFE,
1026                         LIT64( 0x0000FFFFFFFFFFFF ),
1027                         LIT64( 0xFFFFFFFFFFFFFFFF )
1028                     );
1029             }
1030             return packFloat128( zSign, 0x7FFF, 0, 0 );
1031         }
1032         if ( zExp < 0 ) {
1033             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034             isTiny =
1035                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036                 || ( zExp < -1 )
1037                 || ! increment
1038                 || lt128(
1039                        zSig0,
1040                        zSig1,
1041                        LIT64( 0x0001FFFFFFFFFFFF ),
1042                        LIT64( 0xFFFFFFFFFFFFFFFF )
1043                    );
1044             shift128ExtraRightJamming(
1045                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046             zExp = 0;
1047             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048             if ( roundNearestEven ) {
1049                 increment = ( (int64_t) zSig2 < 0 );
1050             }
1051             else {
1052                 if ( zSign ) {
1053                     increment = ( roundingMode == float_round_down ) && zSig2;
1054                 }
1055                 else {
1056                     increment = ( roundingMode == float_round_up ) && zSig2;
1057                 }
1058             }
1059         }
1060     }
1061     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062     if ( increment ) {
1063         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1065     }
1066     else {
1067         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068     }
1069     return packFloat128( zSign, zExp, zSig0, zSig1 );
1070
1071 }
1072
1073 /*----------------------------------------------------------------------------
1074 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 | returns the proper quadruple-precision floating-point value corresponding
1077 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1078 | except that the input significand has fewer bits and does not have to be
1079 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 | point exponent.
1081 *----------------------------------------------------------------------------*/
1082
1083 static float128
1084  normalizeRoundAndPackFloat128(
1085      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1086 {
1087     int8 shiftCount;
1088     uint64_t zSig2;
1089
1090     if ( zSig0 == 0 ) {
1091         zSig0 = zSig1;
1092         zSig1 = 0;
1093         zExp -= 64;
1094     }
1095     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096     if ( 0 <= shiftCount ) {
1097         zSig2 = 0;
1098         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099     }
1100     else {
1101         shift128ExtraRightJamming(
1102             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103     }
1104     zExp -= shiftCount;
1105     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1106
1107 }
1108
1109 #endif
1110
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a'
1113 | to the single-precision floating-point format.  The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1116
1117 float32 int32_to_float32( int32 a STATUS_PARAM )
1118 {
1119     flag zSign;
1120
1121     if ( a == 0 ) return float32_zero;
1122     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123     zSign = ( a < 0 );
1124     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1125
1126 }
1127
1128 /*----------------------------------------------------------------------------
1129 | Returns the result of converting the 32-bit two's complement integer `a'
1130 | to the double-precision floating-point format.  The conversion is performed
1131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132 *----------------------------------------------------------------------------*/
1133
1134 float64 int32_to_float64( int32 a STATUS_PARAM )
1135 {
1136     flag zSign;
1137     uint32 absA;
1138     int8 shiftCount;
1139     uint64_t zSig;
1140
1141     if ( a == 0 ) return float64_zero;
1142     zSign = ( a < 0 );
1143     absA = zSign ? - a : a;
1144     shiftCount = countLeadingZeros32( absA ) + 21;
1145     zSig = absA;
1146     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1147
1148 }
1149
1150 #ifdef FLOATX80
1151
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format.  The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1158
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160 {
1161     flag zSign;
1162     uint32 absA;
1163     int8 shiftCount;
1164     uint64_t zSig;
1165
1166     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167     zSign = ( a < 0 );
1168     absA = zSign ? - a : a;
1169     shiftCount = countLeadingZeros32( absA ) + 32;
1170     zSig = absA;
1171     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172
1173 }
1174
1175 #endif
1176
1177 #ifdef FLOAT128
1178
1179 /*----------------------------------------------------------------------------
1180 | Returns the result of converting the 32-bit two's complement integer `a' to
1181 | the quadruple-precision floating-point format.  The conversion is performed
1182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183 *----------------------------------------------------------------------------*/
1184
1185 float128 int32_to_float128( int32 a STATUS_PARAM )
1186 {
1187     flag zSign;
1188     uint32 absA;
1189     int8 shiftCount;
1190     uint64_t zSig0;
1191
1192     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193     zSign = ( a < 0 );
1194     absA = zSign ? - a : a;
1195     shiftCount = countLeadingZeros32( absA ) + 17;
1196     zSig0 = absA;
1197     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1198
1199 }
1200
1201 #endif
1202
1203 /*----------------------------------------------------------------------------
1204 | Returns the result of converting the 64-bit two's complement integer `a'
1205 | to the single-precision floating-point format.  The conversion is performed
1206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207 *----------------------------------------------------------------------------*/
1208
1209 float32 int64_to_float32( int64 a STATUS_PARAM )
1210 {
1211     flag zSign;
1212     uint64 absA;
1213     int8 shiftCount;
1214
1215     if ( a == 0 ) return float32_zero;
1216     zSign = ( a < 0 );
1217     absA = zSign ? - a : a;
1218     shiftCount = countLeadingZeros64( absA ) - 40;
1219     if ( 0 <= shiftCount ) {
1220         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221     }
1222     else {
1223         shiftCount += 7;
1224         if ( shiftCount < 0 ) {
1225             shift64RightJamming( absA, - shiftCount, &absA );
1226         }
1227         else {
1228             absA <<= shiftCount;
1229         }
1230         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1231     }
1232
1233 }
1234
1235 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1236 {
1237     int8 shiftCount;
1238
1239     if ( a == 0 ) return float32_zero;
1240     shiftCount = countLeadingZeros64( a ) - 40;
1241     if ( 0 <= shiftCount ) {
1242         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243     }
1244     else {
1245         shiftCount += 7;
1246         if ( shiftCount < 0 ) {
1247             shift64RightJamming( a, - shiftCount, &a );
1248         }
1249         else {
1250             a <<= shiftCount;
1251         }
1252         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1253     }
1254 }
1255
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 64-bit two's complement integer `a'
1258 | to the double-precision floating-point format.  The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1261
1262 float64 int64_to_float64( int64 a STATUS_PARAM )
1263 {
1264     flag zSign;
1265
1266     if ( a == 0 ) return float64_zero;
1267     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268         return packFloat64( 1, 0x43E, 0 );
1269     }
1270     zSign = ( a < 0 );
1271     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1272
1273 }
1274
1275 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276 {
1277     if ( a == 0 ) return float64_zero;
1278     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1279
1280 }
1281
1282 #ifdef FLOATX80
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the 64-bit two's complement integer `a'
1286 | to the extended double-precision floating-point format.  The conversion
1287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic.
1289 *----------------------------------------------------------------------------*/
1290
1291 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1292 {
1293     flag zSign;
1294     uint64 absA;
1295     int8 shiftCount;
1296
1297     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298     zSign = ( a < 0 );
1299     absA = zSign ? - a : a;
1300     shiftCount = countLeadingZeros64( absA );
1301     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302
1303 }
1304
1305 #endif
1306
1307 #ifdef FLOAT128
1308
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format.  The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1314
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320     int32 zExp;
1321     uint64_t zSig0, zSig1;
1322
1323     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324     zSign = ( a < 0 );
1325     absA = zSign ? - a : a;
1326     shiftCount = countLeadingZeros64( absA ) + 49;
1327     zExp = 0x406E - shiftCount;
1328     if ( 64 <= shiftCount ) {
1329         zSig1 = 0;
1330         zSig0 = absA;
1331         shiftCount -= 64;
1332     }
1333     else {
1334         zSig1 = absA;
1335         zSig0 = 0;
1336     }
1337     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338     return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340 }
1341
1342 #endif
1343
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the single-precision floating-point value
1346 | `a' to the 32-bit two's complement integer format.  The conversion is
1347 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1348 | Arithmetic---which means in particular that the conversion is rounded
1349 | according to the current rounding mode.  If `a' is a NaN, the largest
1350 | positive integer is returned.  Otherwise, if the conversion overflows, the
1351 | largest integer with the same sign as `a' is returned.
1352 *----------------------------------------------------------------------------*/
1353
1354 int32 float32_to_int32( float32 a STATUS_PARAM )
1355 {
1356     flag aSign;
1357     int16 aExp, shiftCount;
1358     uint32_t aSig;
1359     uint64_t aSig64;
1360
1361     a = float32_squash_input_denormal(a STATUS_VAR);
1362     aSig = extractFloat32Frac( a );
1363     aExp = extractFloat32Exp( a );
1364     aSign = extractFloat32Sign( a );
1365     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366     if ( aExp ) aSig |= 0x00800000;
1367     shiftCount = 0xAF - aExp;
1368     aSig64 = aSig;
1369     aSig64 <<= 32;
1370     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1372
1373 }
1374
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the single-precision floating-point value
1377 | `a' to the 32-bit two's complement integer format.  The conversion is
1378 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 | Arithmetic, except that the conversion is always rounded toward zero.
1380 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1381 | the conversion overflows, the largest integer with the same sign as `a' is
1382 | returned.
1383 *----------------------------------------------------------------------------*/
1384
1385 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1386 {
1387     flag aSign;
1388     int16 aExp, shiftCount;
1389     uint32_t aSig;
1390     int32 z;
1391     a = float32_squash_input_denormal(a STATUS_VAR);
1392
1393     aSig = extractFloat32Frac( a );
1394     aExp = extractFloat32Exp( a );
1395     aSign = extractFloat32Sign( a );
1396     shiftCount = aExp - 0x9E;
1397     if ( 0 <= shiftCount ) {
1398         if ( float32_val(a) != 0xCF000000 ) {
1399             float_raise( float_flag_invalid STATUS_VAR);
1400             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1401         }
1402         return (int32_t) 0x80000000;
1403     }
1404     else if ( aExp <= 0x7E ) {
1405         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406         return 0;
1407     }
1408     aSig = ( aSig | 0x00800000 )<<8;
1409     z = aSig>>( - shiftCount );
1410     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411         STATUS(float_exception_flags) |= float_flag_inexact;
1412     }
1413     if ( aSign ) z = - z;
1414     return z;
1415
1416 }
1417
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 16-bit two's complement integer format.  The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero.
1423 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1424 | the conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1427
1428 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1429 {
1430     flag aSign;
1431     int16 aExp, shiftCount;
1432     uint32_t aSig;
1433     int32 z;
1434
1435     aSig = extractFloat32Frac( a );
1436     aExp = extractFloat32Exp( a );
1437     aSign = extractFloat32Sign( a );
1438     shiftCount = aExp - 0x8E;
1439     if ( 0 <= shiftCount ) {
1440         if ( float32_val(a) != 0xC7000000 ) {
1441             float_raise( float_flag_invalid STATUS_VAR);
1442             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443                 return 0x7FFF;
1444             }
1445         }
1446         return (int32_t) 0xffff8000;
1447     }
1448     else if ( aExp <= 0x7E ) {
1449         if ( aExp | aSig ) {
1450             STATUS(float_exception_flags) |= float_flag_inexact;
1451         }
1452         return 0;
1453     }
1454     shiftCount -= 0x10;
1455     aSig = ( aSig | 0x00800000 )<<8;
1456     z = aSig>>( - shiftCount );
1457     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458         STATUS(float_exception_flags) |= float_flag_inexact;
1459     }
1460     if ( aSign ) {
1461         z = - z;
1462     }
1463     return z;
1464
1465 }
1466
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the single-precision floating-point value
1469 | `a' to the 64-bit two's complement integer format.  The conversion is
1470 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1471 | Arithmetic---which means in particular that the conversion is rounded
1472 | according to the current rounding mode.  If `a' is a NaN, the largest
1473 | positive integer is returned.  Otherwise, if the conversion overflows, the
1474 | largest integer with the same sign as `a' is returned.
1475 *----------------------------------------------------------------------------*/
1476
1477 int64 float32_to_int64( float32 a STATUS_PARAM )
1478 {
1479     flag aSign;
1480     int16 aExp, shiftCount;
1481     uint32_t aSig;
1482     uint64_t aSig64, aSigExtra;
1483     a = float32_squash_input_denormal(a STATUS_VAR);
1484
1485     aSig = extractFloat32Frac( a );
1486     aExp = extractFloat32Exp( a );
1487     aSign = extractFloat32Sign( a );
1488     shiftCount = 0xBE - aExp;
1489     if ( shiftCount < 0 ) {
1490         float_raise( float_flag_invalid STATUS_VAR);
1491         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492             return LIT64( 0x7FFFFFFFFFFFFFFF );
1493         }
1494         return (int64_t) LIT64( 0x8000000000000000 );
1495     }
1496     if ( aExp ) aSig |= 0x00800000;
1497     aSig64 = aSig;
1498     aSig64 <<= 40;
1499     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1501
1502 }
1503
1504 /*----------------------------------------------------------------------------
1505 | Returns the result of converting the single-precision floating-point value
1506 | `a' to the 64-bit two's complement integer format.  The conversion is
1507 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1508 | Arithmetic, except that the conversion is always rounded toward zero.  If
1509 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1510 | conversion overflows, the largest integer with the same sign as `a' is
1511 | returned.
1512 *----------------------------------------------------------------------------*/
1513
1514 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1515 {
1516     flag aSign;
1517     int16 aExp, shiftCount;
1518     uint32_t aSig;
1519     uint64_t aSig64;
1520     int64 z;
1521     a = float32_squash_input_denormal(a STATUS_VAR);
1522
1523     aSig = extractFloat32Frac( a );
1524     aExp = extractFloat32Exp( a );
1525     aSign = extractFloat32Sign( a );
1526     shiftCount = aExp - 0xBE;
1527     if ( 0 <= shiftCount ) {
1528         if ( float32_val(a) != 0xDF000000 ) {
1529             float_raise( float_flag_invalid STATUS_VAR);
1530             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1532             }
1533         }
1534         return (int64_t) LIT64( 0x8000000000000000 );
1535     }
1536     else if ( aExp <= 0x7E ) {
1537         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538         return 0;
1539     }
1540     aSig64 = aSig | 0x00800000;
1541     aSig64 <<= 40;
1542     z = aSig64>>( - shiftCount );
1543     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544         STATUS(float_exception_flags) |= float_flag_inexact;
1545     }
1546     if ( aSign ) z = - z;
1547     return z;
1548
1549 }
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format.  The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1557
1558 float64 float32_to_float64( float32 a STATUS_PARAM )
1559 {
1560     flag aSign;
1561     int16 aExp;
1562     uint32_t aSig;
1563     a = float32_squash_input_denormal(a STATUS_VAR);
1564
1565     aSig = extractFloat32Frac( a );
1566     aExp = extractFloat32Exp( a );
1567     aSign = extractFloat32Sign( a );
1568     if ( aExp == 0xFF ) {
1569         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570         return packFloat64( aSign, 0x7FF, 0 );
1571     }
1572     if ( aExp == 0 ) {
1573         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575         --aExp;
1576     }
1577     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1578
1579 }
1580
1581 #ifdef FLOATX80
1582
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the extended double-precision floating-point format.  The conversion
1586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic.
1588 *----------------------------------------------------------------------------*/
1589
1590 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1591 {
1592     flag aSign;
1593     int16 aExp;
1594     uint32_t aSig;
1595
1596     a = float32_squash_input_denormal(a STATUS_VAR);
1597     aSig = extractFloat32Frac( a );
1598     aExp = extractFloat32Exp( a );
1599     aSign = extractFloat32Sign( a );
1600     if ( aExp == 0xFF ) {
1601         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1603     }
1604     if ( aExp == 0 ) {
1605         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607     }
1608     aSig |= 0x00800000;
1609     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1610
1611 }
1612
1613 #endif
1614
1615 #ifdef FLOAT128
1616
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format.  The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1623
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1625 {
1626     flag aSign;
1627     int16 aExp;
1628     uint32_t aSig;
1629
1630     a = float32_squash_input_denormal(a STATUS_VAR);
1631     aSig = extractFloat32Frac( a );
1632     aExp = extractFloat32Exp( a );
1633     aSign = extractFloat32Sign( a );
1634     if ( aExp == 0xFF ) {
1635         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636         return packFloat128( aSign, 0x7FFF, 0, 0 );
1637     }
1638     if ( aExp == 0 ) {
1639         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641         --aExp;
1642     }
1643     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644
1645 }
1646
1647 #endif
1648
1649 /*----------------------------------------------------------------------------
1650 | Rounds the single-precision floating-point value `a' to an integer, and
1651 | returns the result as a single-precision floating-point value.  The
1652 | operation is performed according to the IEC/IEEE Standard for Binary
1653 | Floating-Point Arithmetic.
1654 *----------------------------------------------------------------------------*/
1655
1656 float32 float32_round_to_int( float32 a STATUS_PARAM)
1657 {
1658     flag aSign;
1659     int16 aExp;
1660     uint32_t lastBitMask, roundBitsMask;
1661     int8 roundingMode;
1662     uint32_t z;
1663     a = float32_squash_input_denormal(a STATUS_VAR);
1664
1665     aExp = extractFloat32Exp( a );
1666     if ( 0x96 <= aExp ) {
1667         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668             return propagateFloat32NaN( a, a STATUS_VAR );
1669         }
1670         return a;
1671     }
1672     if ( aExp <= 0x7E ) {
1673         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674         STATUS(float_exception_flags) |= float_flag_inexact;
1675         aSign = extractFloat32Sign( a );
1676         switch ( STATUS(float_rounding_mode) ) {
1677          case float_round_nearest_even:
1678             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679                 return packFloat32( aSign, 0x7F, 0 );
1680             }
1681             break;
1682          case float_round_down:
1683             return make_float32(aSign ? 0xBF800000 : 0);
1684          case float_round_up:
1685             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1686         }
1687         return packFloat32( aSign, 0, 0 );
1688     }
1689     lastBitMask = 1;
1690     lastBitMask <<= 0x96 - aExp;
1691     roundBitsMask = lastBitMask - 1;
1692     z = float32_val(a);
1693     roundingMode = STATUS(float_rounding_mode);
1694     if ( roundingMode == float_round_nearest_even ) {
1695         z += lastBitMask>>1;
1696         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1697     }
1698     else if ( roundingMode != float_round_to_zero ) {
1699         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700             z += roundBitsMask;
1701         }
1702     }
1703     z &= ~ roundBitsMask;
1704     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705     return make_float32(z);
1706
1707 }
1708
1709 /*----------------------------------------------------------------------------
1710 | Returns the result of adding the absolute values of the single-precision
1711 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1712 | before being returned.  `zSign' is ignored if the result is a NaN.
1713 | The addition is performed according to the IEC/IEEE Standard for Binary
1714 | Floating-Point Arithmetic.
1715 *----------------------------------------------------------------------------*/
1716
1717 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1718 {
1719     int16 aExp, bExp, zExp;
1720     uint32_t aSig, bSig, zSig;
1721     int16 expDiff;
1722
1723     aSig = extractFloat32Frac( a );
1724     aExp = extractFloat32Exp( a );
1725     bSig = extractFloat32Frac( b );
1726     bExp = extractFloat32Exp( b );
1727     expDiff = aExp - bExp;
1728     aSig <<= 6;
1729     bSig <<= 6;
1730     if ( 0 < expDiff ) {
1731         if ( aExp == 0xFF ) {
1732             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733             return a;
1734         }
1735         if ( bExp == 0 ) {
1736             --expDiff;
1737         }
1738         else {
1739             bSig |= 0x20000000;
1740         }
1741         shift32RightJamming( bSig, expDiff, &bSig );
1742         zExp = aExp;
1743     }
1744     else if ( expDiff < 0 ) {
1745         if ( bExp == 0xFF ) {
1746             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747             return packFloat32( zSign, 0xFF, 0 );
1748         }
1749         if ( aExp == 0 ) {
1750             ++expDiff;
1751         }
1752         else {
1753             aSig |= 0x20000000;
1754         }
1755         shift32RightJamming( aSig, - expDiff, &aSig );
1756         zExp = bExp;
1757     }
1758     else {
1759         if ( aExp == 0xFF ) {
1760             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761             return a;
1762         }
1763         if ( aExp == 0 ) {
1764             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1766         }
1767         zSig = 0x40000000 + aSig + bSig;
1768         zExp = aExp;
1769         goto roundAndPack;
1770     }
1771     aSig |= 0x20000000;
1772     zSig = ( aSig + bSig )<<1;
1773     --zExp;
1774     if ( (int32_t) zSig < 0 ) {
1775         zSig = aSig + bSig;
1776         ++zExp;
1777     }
1778  roundAndPack:
1779     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1780
1781 }
1782
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of subtracting the absolute values of the single-
1785 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1786 | difference is negated before being returned.  `zSign' is ignored if the
1787 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1788 | Standard for Binary Floating-Point Arithmetic.
1789 *----------------------------------------------------------------------------*/
1790
1791 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1792 {
1793     int16 aExp, bExp, zExp;
1794     uint32_t aSig, bSig, zSig;
1795     int16 expDiff;
1796
1797     aSig = extractFloat32Frac( a );
1798     aExp = extractFloat32Exp( a );
1799     bSig = extractFloat32Frac( b );
1800     bExp = extractFloat32Exp( b );
1801     expDiff = aExp - bExp;
1802     aSig <<= 7;
1803     bSig <<= 7;
1804     if ( 0 < expDiff ) goto aExpBigger;
1805     if ( expDiff < 0 ) goto bExpBigger;
1806     if ( aExp == 0xFF ) {
1807         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808         float_raise( float_flag_invalid STATUS_VAR);
1809         return float32_default_nan;
1810     }
1811     if ( aExp == 0 ) {
1812         aExp = 1;
1813         bExp = 1;
1814     }
1815     if ( bSig < aSig ) goto aBigger;
1816     if ( aSig < bSig ) goto bBigger;
1817     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818  bExpBigger:
1819     if ( bExp == 0xFF ) {
1820         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821         return packFloat32( zSign ^ 1, 0xFF, 0 );
1822     }
1823     if ( aExp == 0 ) {
1824         ++expDiff;
1825     }
1826     else {
1827         aSig |= 0x40000000;
1828     }
1829     shift32RightJamming( aSig, - expDiff, &aSig );
1830     bSig |= 0x40000000;
1831  bBigger:
1832     zSig = bSig - aSig;
1833     zExp = bExp;
1834     zSign ^= 1;
1835     goto normalizeRoundAndPack;
1836  aExpBigger:
1837     if ( aExp == 0xFF ) {
1838         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839         return a;
1840     }
1841     if ( bExp == 0 ) {
1842         --expDiff;
1843     }
1844     else {
1845         bSig |= 0x40000000;
1846     }
1847     shift32RightJamming( bSig, expDiff, &bSig );
1848     aSig |= 0x40000000;
1849  aBigger:
1850     zSig = aSig - bSig;
1851     zExp = aExp;
1852  normalizeRoundAndPack:
1853     --zExp;
1854     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1855
1856 }
1857
1858 /*----------------------------------------------------------------------------
1859 | Returns the result of adding the single-precision floating-point values `a'
1860 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1861 | Binary Floating-Point Arithmetic.
1862 *----------------------------------------------------------------------------*/
1863
1864 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1865 {
1866     flag aSign, bSign;
1867     a = float32_squash_input_denormal(a STATUS_VAR);
1868     b = float32_squash_input_denormal(b STATUS_VAR);
1869
1870     aSign = extractFloat32Sign( a );
1871     bSign = extractFloat32Sign( b );
1872     if ( aSign == bSign ) {
1873         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1874     }
1875     else {
1876         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1877     }
1878
1879 }
1880
1881 /*----------------------------------------------------------------------------
1882 | Returns the result of subtracting the single-precision floating-point values
1883 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1884 | for Binary Floating-Point Arithmetic.
1885 *----------------------------------------------------------------------------*/
1886
1887 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1888 {
1889     flag aSign, bSign;
1890     a = float32_squash_input_denormal(a STATUS_VAR);
1891     b = float32_squash_input_denormal(b STATUS_VAR);
1892
1893     aSign = extractFloat32Sign( a );
1894     bSign = extractFloat32Sign( b );
1895     if ( aSign == bSign ) {
1896         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1897     }
1898     else {
1899         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1900     }
1901
1902 }
1903
1904 /*----------------------------------------------------------------------------
1905 | Returns the result of multiplying the single-precision floating-point values
1906 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1907 | for Binary Floating-Point Arithmetic.
1908 *----------------------------------------------------------------------------*/
1909
1910 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1911 {
1912     flag aSign, bSign, zSign;
1913     int16 aExp, bExp, zExp;
1914     uint32_t aSig, bSig;
1915     uint64_t zSig64;
1916     uint32_t zSig;
1917
1918     a = float32_squash_input_denormal(a STATUS_VAR);
1919     b = float32_squash_input_denormal(b STATUS_VAR);
1920
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     bSig = extractFloat32Frac( b );
1925     bExp = extractFloat32Exp( b );
1926     bSign = extractFloat32Sign( b );
1927     zSign = aSign ^ bSign;
1928     if ( aExp == 0xFF ) {
1929         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930             return propagateFloat32NaN( a, b STATUS_VAR );
1931         }
1932         if ( ( bExp | bSig ) == 0 ) {
1933             float_raise( float_flag_invalid STATUS_VAR);
1934             return float32_default_nan;
1935         }
1936         return packFloat32( zSign, 0xFF, 0 );
1937     }
1938     if ( bExp == 0xFF ) {
1939         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940         if ( ( aExp | aSig ) == 0 ) {
1941             float_raise( float_flag_invalid STATUS_VAR);
1942             return float32_default_nan;
1943         }
1944         return packFloat32( zSign, 0xFF, 0 );
1945     }
1946     if ( aExp == 0 ) {
1947         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949     }
1950     if ( bExp == 0 ) {
1951         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953     }
1954     zExp = aExp + bExp - 0x7F;
1955     aSig = ( aSig | 0x00800000 )<<7;
1956     bSig = ( bSig | 0x00800000 )<<8;
1957     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958     zSig = zSig64;
1959     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960         zSig <<= 1;
1961         --zExp;
1962     }
1963     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1964
1965 }
1966
1967 /*----------------------------------------------------------------------------
1968 | Returns the result of dividing the single-precision floating-point value `a'
1969 | by the corresponding value `b'.  The operation is performed according to the
1970 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971 *----------------------------------------------------------------------------*/
1972
1973 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1974 {
1975     flag aSign, bSign, zSign;
1976     int16 aExp, bExp, zExp;
1977     uint32_t aSig, bSig, zSig;
1978     a = float32_squash_input_denormal(a STATUS_VAR);
1979     b = float32_squash_input_denormal(b STATUS_VAR);
1980
1981     aSig = extractFloat32Frac( a );
1982     aExp = extractFloat32Exp( a );
1983     aSign = extractFloat32Sign( a );
1984     bSig = extractFloat32Frac( b );
1985     bExp = extractFloat32Exp( b );
1986     bSign = extractFloat32Sign( b );
1987     zSign = aSign ^ bSign;
1988     if ( aExp == 0xFF ) {
1989         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990         if ( bExp == 0xFF ) {
1991             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992             float_raise( float_flag_invalid STATUS_VAR);
1993             return float32_default_nan;
1994         }
1995         return packFloat32( zSign, 0xFF, 0 );
1996     }
1997     if ( bExp == 0xFF ) {
1998         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999         return packFloat32( zSign, 0, 0 );
2000     }
2001     if ( bExp == 0 ) {
2002         if ( bSig == 0 ) {
2003             if ( ( aExp | aSig ) == 0 ) {
2004                 float_raise( float_flag_invalid STATUS_VAR);
2005                 return float32_default_nan;
2006             }
2007             float_raise( float_flag_divbyzero STATUS_VAR);
2008             return packFloat32( zSign, 0xFF, 0 );
2009         }
2010         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011     }
2012     if ( aExp == 0 ) {
2013         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015     }
2016     zExp = aExp - bExp + 0x7D;
2017     aSig = ( aSig | 0x00800000 )<<7;
2018     bSig = ( bSig | 0x00800000 )<<8;
2019     if ( bSig <= ( aSig + aSig ) ) {
2020         aSig >>= 1;
2021         ++zExp;
2022     }
2023     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024     if ( ( zSig & 0x3F ) == 0 ) {
2025         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2026     }
2027     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2028
2029 }
2030
2031 /*----------------------------------------------------------------------------
2032 | Returns the remainder of the single-precision floating-point value `a'
2033 | with respect to the corresponding value `b'.  The operation is performed
2034 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035 *----------------------------------------------------------------------------*/
2036
2037 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2038 {
2039     flag aSign, zSign;
2040     int16 aExp, bExp, expDiff;
2041     uint32_t aSig, bSig;
2042     uint32_t q;
2043     uint64_t aSig64, bSig64, q64;
2044     uint32_t alternateASig;
2045     int32_t sigMean;
2046     a = float32_squash_input_denormal(a STATUS_VAR);
2047     b = float32_squash_input_denormal(b STATUS_VAR);
2048
2049     aSig = extractFloat32Frac( a );
2050     aExp = extractFloat32Exp( a );
2051     aSign = extractFloat32Sign( a );
2052     bSig = extractFloat32Frac( b );
2053     bExp = extractFloat32Exp( b );
2054     if ( aExp == 0xFF ) {
2055         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056             return propagateFloat32NaN( a, b STATUS_VAR );
2057         }
2058         float_raise( float_flag_invalid STATUS_VAR);
2059         return float32_default_nan;
2060     }
2061     if ( bExp == 0xFF ) {
2062         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063         return a;
2064     }
2065     if ( bExp == 0 ) {
2066         if ( bSig == 0 ) {
2067             float_raise( float_flag_invalid STATUS_VAR);
2068             return float32_default_nan;
2069         }
2070         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2071     }
2072     if ( aExp == 0 ) {
2073         if ( aSig == 0 ) return a;
2074         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2075     }
2076     expDiff = aExp - bExp;
2077     aSig |= 0x00800000;
2078     bSig |= 0x00800000;
2079     if ( expDiff < 32 ) {
2080         aSig <<= 8;
2081         bSig <<= 8;
2082         if ( expDiff < 0 ) {
2083             if ( expDiff < -1 ) return a;
2084             aSig >>= 1;
2085         }
2086         q = ( bSig <= aSig );
2087         if ( q ) aSig -= bSig;
2088         if ( 0 < expDiff ) {
2089             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090             q >>= 32 - expDiff;
2091             bSig >>= 2;
2092             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2093         }
2094         else {
2095             aSig >>= 2;
2096             bSig >>= 2;
2097         }
2098     }
2099     else {
2100         if ( bSig <= aSig ) aSig -= bSig;
2101         aSig64 = ( (uint64_t) aSig )<<40;
2102         bSig64 = ( (uint64_t) bSig )<<40;
2103         expDiff -= 64;
2104         while ( 0 < expDiff ) {
2105             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107             aSig64 = - ( ( bSig * q64 )<<38 );
2108             expDiff -= 62;
2109         }
2110         expDiff += 64;
2111         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113         q = q64>>( 64 - expDiff );
2114         bSig <<= 6;
2115         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2116     }
2117     do {
2118         alternateASig = aSig;
2119         ++q;
2120         aSig -= bSig;
2121     } while ( 0 <= (int32_t) aSig );
2122     sigMean = aSig + alternateASig;
2123     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124         aSig = alternateASig;
2125     }
2126     zSign = ( (int32_t) aSig < 0 );
2127     if ( zSign ) aSig = - aSig;
2128     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2129
2130 }
2131
2132 /*----------------------------------------------------------------------------
2133 | Returns the square root of the single-precision floating-point value `a'.
2134 | The operation is performed according to the IEC/IEEE Standard for Binary
2135 | Floating-Point Arithmetic.
2136 *----------------------------------------------------------------------------*/
2137
2138 float32 float32_sqrt( float32 a STATUS_PARAM )
2139 {
2140     flag aSign;
2141     int16 aExp, zExp;
2142     uint32_t aSig, zSig;
2143     uint64_t rem, term;
2144     a = float32_squash_input_denormal(a STATUS_VAR);
2145
2146     aSig = extractFloat32Frac( a );
2147     aExp = extractFloat32Exp( a );
2148     aSign = extractFloat32Sign( a );
2149     if ( aExp == 0xFF ) {
2150         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151         if ( ! aSign ) return a;
2152         float_raise( float_flag_invalid STATUS_VAR);
2153         return float32_default_nan;
2154     }
2155     if ( aSign ) {
2156         if ( ( aExp | aSig ) == 0 ) return a;
2157         float_raise( float_flag_invalid STATUS_VAR);
2158         return float32_default_nan;
2159     }
2160     if ( aExp == 0 ) {
2161         if ( aSig == 0 ) return float32_zero;
2162         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2163     }
2164     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165     aSig = ( aSig | 0x00800000 )<<8;
2166     zSig = estimateSqrt32( aExp, aSig ) + 2;
2167     if ( ( zSig & 0x7F ) <= 5 ) {
2168         if ( zSig < 2 ) {
2169             zSig = 0x7FFFFFFF;
2170             goto roundAndPack;
2171         }
2172         aSig >>= aExp & 1;
2173         term = ( (uint64_t) zSig ) * zSig;
2174         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175         while ( (int64_t) rem < 0 ) {
2176             --zSig;
2177             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2178         }
2179         zSig |= ( rem != 0 );
2180     }
2181     shift32RightJamming( zSig, 1, &zSig );
2182  roundAndPack:
2183     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2184
2185 }
2186
2187 /*----------------------------------------------------------------------------
2188 | Returns the binary exponential of the single-precision floating-point value
2189 | `a'. The operation is performed according to the IEC/IEEE Standard for
2190 | Binary Floating-Point Arithmetic.
2191 |
2192 | Uses the following identities:
2193 |
2194 | 1. -------------------------------------------------------------------------
2195 |      x    x*ln(2)
2196 |     2  = e
2197 |
2198 | 2. -------------------------------------------------------------------------
2199 |                      2     3     4     5           n
2200 |      x        x     x     x     x     x           x
2201 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202 |               1!    2!    3!    4!    5!          n!
2203 *----------------------------------------------------------------------------*/
2204
2205 static const float64 float32_exp2_coefficients[15] =
2206 {
2207     const_float64( 0x3ff0000000000000ll ), /*  1 */
2208     const_float64( 0x3fe0000000000000ll ), /*  2 */
2209     const_float64( 0x3fc5555555555555ll ), /*  3 */
2210     const_float64( 0x3fa5555555555555ll ), /*  4 */
2211     const_float64( 0x3f81111111111111ll ), /*  5 */
2212     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2213     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2214     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2215     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2216     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2222 };
2223
2224 float32 float32_exp2( float32 a STATUS_PARAM )
2225 {
2226     flag aSign;
2227     int16 aExp;
2228     uint32_t aSig;
2229     float64 r, x, xn;
2230     int i;
2231     a = float32_squash_input_denormal(a STATUS_VAR);
2232
2233     aSig = extractFloat32Frac( a );
2234     aExp = extractFloat32Exp( a );
2235     aSign = extractFloat32Sign( a );
2236
2237     if ( aExp == 0xFF) {
2238         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239         return (aSign) ? float32_zero : a;
2240     }
2241     if (aExp == 0) {
2242         if (aSig == 0) return float32_one;
2243     }
2244
2245     float_raise( float_flag_inexact STATUS_VAR);
2246
2247     /* ******************************* */
2248     /* using float64 for approximation */
2249     /* ******************************* */
2250     x = float32_to_float64(a STATUS_VAR);
2251     x = float64_mul(x, float64_ln2 STATUS_VAR);
2252
2253     xn = x;
2254     r = float64_one;
2255     for (i = 0 ; i < 15 ; i++) {
2256         float64 f;
2257
2258         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259         r = float64_add(r, f STATUS_VAR);
2260
2261         xn = float64_mul(xn, x STATUS_VAR);
2262     }
2263
2264     return float64_to_float32(r, status);
2265 }
2266
2267 /*----------------------------------------------------------------------------
2268 | Returns the binary log of the single-precision floating-point value `a'.
2269 | The operation is performed according to the IEC/IEEE Standard for Binary
2270 | Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2272 float32 float32_log2( float32 a STATUS_PARAM )
2273 {
2274     flag aSign, zSign;
2275     int16 aExp;
2276     uint32_t aSig, zSig, i;
2277
2278     a = float32_squash_input_denormal(a STATUS_VAR);
2279     aSig = extractFloat32Frac( a );
2280     aExp = extractFloat32Exp( a );
2281     aSign = extractFloat32Sign( a );
2282
2283     if ( aExp == 0 ) {
2284         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2286     }
2287     if ( aSign ) {
2288         float_raise( float_flag_invalid STATUS_VAR);
2289         return float32_default_nan;
2290     }
2291     if ( aExp == 0xFF ) {
2292         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293         return a;
2294     }
2295
2296     aExp -= 0x7F;
2297     aSig |= 0x00800000;
2298     zSign = aExp < 0;
2299     zSig = aExp << 23;
2300
2301     for (i = 1 << 22; i > 0; i >>= 1) {
2302         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303         if ( aSig & 0x01000000 ) {
2304             aSig >>= 1;
2305             zSig |= i;
2306         }
2307     }
2308
2309     if ( zSign )
2310         zSig = -zSig;
2311
2312     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2313 }
2314
2315 /*----------------------------------------------------------------------------
2316 | Returns 1 if the single-precision floating-point value `a' is equal to
2317 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2318 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2319 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2320 *----------------------------------------------------------------------------*/
2321
2322 int float32_eq( float32 a, float32 b STATUS_PARAM )
2323 {
2324     uint32_t av, bv;
2325     a = float32_squash_input_denormal(a STATUS_VAR);
2326     b = float32_squash_input_denormal(b STATUS_VAR);
2327
2328     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2329          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2330        ) {
2331         float_raise( float_flag_invalid STATUS_VAR);
2332         return 0;
2333     }
2334     av = float32_val(a);
2335     bv = float32_val(b);
2336     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2337 }
2338
2339 /*----------------------------------------------------------------------------
2340 | Returns 1 if the single-precision floating-point value `a' is less than
2341 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2343 | Arithmetic.
2344 *----------------------------------------------------------------------------*/
2345
2346 int float32_le( float32 a, float32 b STATUS_PARAM )
2347 {
2348     flag aSign, bSign;
2349     uint32_t av, bv;
2350     a = float32_squash_input_denormal(a STATUS_VAR);
2351     b = float32_squash_input_denormal(b STATUS_VAR);
2352
2353     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2355        ) {
2356         float_raise( float_flag_invalid STATUS_VAR);
2357         return 0;
2358     }
2359     aSign = extractFloat32Sign( a );
2360     bSign = extractFloat32Sign( b );
2361     av = float32_val(a);
2362     bv = float32_val(b);
2363     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2365
2366 }
2367
2368 /*----------------------------------------------------------------------------
2369 | Returns 1 if the single-precision floating-point value `a' is less than
2370 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2371 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2372 *----------------------------------------------------------------------------*/
2373
2374 int float32_lt( float32 a, float32 b STATUS_PARAM )
2375 {
2376     flag aSign, bSign;
2377     uint32_t av, bv;
2378     a = float32_squash_input_denormal(a STATUS_VAR);
2379     b = float32_squash_input_denormal(b STATUS_VAR);
2380
2381     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2382          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2383        ) {
2384         float_raise( float_flag_invalid STATUS_VAR);
2385         return 0;
2386     }
2387     aSign = extractFloat32Sign( a );
2388     bSign = extractFloat32Sign( b );
2389     av = float32_val(a);
2390     bv = float32_val(b);
2391     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2392     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2393
2394 }
2395
2396 /*----------------------------------------------------------------------------
2397 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2398 | be compared, and 0 otherwise.  The comparison is performed according to the
2399 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2400 *----------------------------------------------------------------------------*/
2401
2402 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2403 {
2404     a = float32_squash_input_denormal(a STATUS_VAR);
2405     b = float32_squash_input_denormal(b STATUS_VAR);
2406
2407     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2408          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2409        ) {
2410         float_raise( float_flag_invalid STATUS_VAR);
2411         return 1;
2412     }
2413     return 0;
2414 }
2415
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is equal to
2418 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2420 *----------------------------------------------------------------------------*/
2421
2422 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2423 {
2424     a = float32_squash_input_denormal(a STATUS_VAR);
2425     b = float32_squash_input_denormal(b STATUS_VAR);
2426
2427     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2428          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2429        ) {
2430         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2431             float_raise( float_flag_invalid STATUS_VAR);
2432         }
2433         return 0;
2434     }
2435     return ( float32_val(a) == float32_val(b) ) ||
2436             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2437 }
2438
2439 /*----------------------------------------------------------------------------
2440 | Returns 1 if the single-precision floating-point value `a' is less than or
2441 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2442 | cause an exception.  Otherwise, the comparison is performed according to the
2443 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2444 *----------------------------------------------------------------------------*/
2445
2446 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2447 {
2448     flag aSign, bSign;
2449     uint32_t av, bv;
2450     a = float32_squash_input_denormal(a STATUS_VAR);
2451     b = float32_squash_input_denormal(b STATUS_VAR);
2452
2453     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2454          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2455        ) {
2456         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2457             float_raise( float_flag_invalid STATUS_VAR);
2458         }
2459         return 0;
2460     }
2461     aSign = extractFloat32Sign( a );
2462     bSign = extractFloat32Sign( b );
2463     av = float32_val(a);
2464     bv = float32_val(b);
2465     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2466     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2467
2468 }
2469
2470 /*----------------------------------------------------------------------------
2471 | Returns 1 if the single-precision floating-point value `a' is less than
2472 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2473 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2474 | Standard for Binary Floating-Point Arithmetic.
2475 *----------------------------------------------------------------------------*/
2476
2477 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2478 {
2479     flag aSign, bSign;
2480     uint32_t av, bv;
2481     a = float32_squash_input_denormal(a STATUS_VAR);
2482     b = float32_squash_input_denormal(b STATUS_VAR);
2483
2484     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2485          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2486        ) {
2487         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2488             float_raise( float_flag_invalid STATUS_VAR);
2489         }
2490         return 0;
2491     }
2492     aSign = extractFloat32Sign( a );
2493     bSign = extractFloat32Sign( b );
2494     av = float32_val(a);
2495     bv = float32_val(b);
2496     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2497     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2498
2499 }
2500
2501 /*----------------------------------------------------------------------------
2502 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2503 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2504 | comparison is performed according to the IEC/IEEE Standard for Binary
2505 | Floating-Point Arithmetic.
2506 *----------------------------------------------------------------------------*/
2507
2508 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2509 {
2510     a = float32_squash_input_denormal(a STATUS_VAR);
2511     b = float32_squash_input_denormal(b STATUS_VAR);
2512
2513     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2514          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2515        ) {
2516         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2517             float_raise( float_flag_invalid STATUS_VAR);
2518         }
2519         return 1;
2520     }
2521     return 0;
2522 }
2523
2524 /*----------------------------------------------------------------------------
2525 | Returns the result of converting the double-precision floating-point value
2526 | `a' to the 32-bit two's complement integer format.  The conversion is
2527 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2528 | Arithmetic---which means in particular that the conversion is rounded
2529 | according to the current rounding mode.  If `a' is a NaN, the largest
2530 | positive integer is returned.  Otherwise, if the conversion overflows, the
2531 | largest integer with the same sign as `a' is returned.
2532 *----------------------------------------------------------------------------*/
2533
2534 int32 float64_to_int32( float64 a STATUS_PARAM )
2535 {
2536     flag aSign;
2537     int16 aExp, shiftCount;
2538     uint64_t aSig;
2539     a = float64_squash_input_denormal(a STATUS_VAR);
2540
2541     aSig = extractFloat64Frac( a );
2542     aExp = extractFloat64Exp( a );
2543     aSign = extractFloat64Sign( a );
2544     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2545     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2546     shiftCount = 0x42C - aExp;
2547     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2548     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2549
2550 }
2551
2552 /*----------------------------------------------------------------------------
2553 | Returns the result of converting the double-precision floating-point value
2554 | `a' to the 32-bit two's complement integer format.  The conversion is
2555 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2556 | Arithmetic, except that the conversion is always rounded toward zero.
2557 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2558 | the conversion overflows, the largest integer with the same sign as `a' is
2559 | returned.
2560 *----------------------------------------------------------------------------*/
2561
2562 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2563 {
2564     flag aSign;
2565     int16 aExp, shiftCount;
2566     uint64_t aSig, savedASig;
2567     int32 z;
2568     a = float64_squash_input_denormal(a STATUS_VAR);
2569
2570     aSig = extractFloat64Frac( a );
2571     aExp = extractFloat64Exp( a );
2572     aSign = extractFloat64Sign( a );
2573     if ( 0x41E < aExp ) {
2574         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2575         goto invalid;
2576     }
2577     else if ( aExp < 0x3FF ) {
2578         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2579         return 0;
2580     }
2581     aSig |= LIT64( 0x0010000000000000 );
2582     shiftCount = 0x433 - aExp;
2583     savedASig = aSig;
2584     aSig >>= shiftCount;
2585     z = aSig;
2586     if ( aSign ) z = - z;
2587     if ( ( z < 0 ) ^ aSign ) {
2588  invalid:
2589         float_raise( float_flag_invalid STATUS_VAR);
2590         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2591     }
2592     if ( ( aSig<<shiftCount ) != savedASig ) {
2593         STATUS(float_exception_flags) |= float_flag_inexact;
2594     }
2595     return z;
2596
2597 }
2598
2599 /*----------------------------------------------------------------------------
2600 | Returns the result of converting the double-precision floating-point value
2601 | `a' to the 16-bit two's complement integer format.  The conversion is
2602 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2603 | Arithmetic, except that the conversion is always rounded toward zero.
2604 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2605 | the conversion overflows, the largest integer with the same sign as `a' is
2606 | returned.
2607 *----------------------------------------------------------------------------*/
2608
2609 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2610 {
2611     flag aSign;
2612     int16 aExp, shiftCount;
2613     uint64_t aSig, savedASig;
2614     int32 z;
2615
2616     aSig = extractFloat64Frac( a );
2617     aExp = extractFloat64Exp( a );
2618     aSign = extractFloat64Sign( a );
2619     if ( 0x40E < aExp ) {
2620         if ( ( aExp == 0x7FF ) && aSig ) {
2621             aSign = 0;
2622         }
2623         goto invalid;
2624     }
2625     else if ( aExp < 0x3FF ) {
2626         if ( aExp || aSig ) {
2627             STATUS(float_exception_flags) |= float_flag_inexact;
2628         }
2629         return 0;
2630     }
2631     aSig |= LIT64( 0x0010000000000000 );
2632     shiftCount = 0x433 - aExp;
2633     savedASig = aSig;
2634     aSig >>= shiftCount;
2635     z = aSig;
2636     if ( aSign ) {
2637         z = - z;
2638     }
2639     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2640  invalid:
2641         float_raise( float_flag_invalid STATUS_VAR);
2642         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2643     }
2644     if ( ( aSig<<shiftCount ) != savedASig ) {
2645         STATUS(float_exception_flags) |= float_flag_inexact;
2646     }
2647     return z;
2648 }
2649
2650 /*----------------------------------------------------------------------------
2651 | Returns the result of converting the double-precision floating-point value
2652 | `a' to the 64-bit two's complement integer format.  The conversion is
2653 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2654 | Arithmetic---which means in particular that the conversion is rounded
2655 | according to the current rounding mode.  If `a' is a NaN, the largest
2656 | positive integer is returned.  Otherwise, if the conversion overflows, the
2657 | largest integer with the same sign as `a' is returned.
2658 *----------------------------------------------------------------------------*/
2659
2660 int64 float64_to_int64( float64 a STATUS_PARAM )
2661 {
2662     flag aSign;
2663     int16 aExp, shiftCount;
2664     uint64_t aSig, aSigExtra;
2665     a = float64_squash_input_denormal(a STATUS_VAR);
2666
2667     aSig = extractFloat64Frac( a );
2668     aExp = extractFloat64Exp( a );
2669     aSign = extractFloat64Sign( a );
2670     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2671     shiftCount = 0x433 - aExp;
2672     if ( shiftCount <= 0 ) {
2673         if ( 0x43E < aExp ) {
2674             float_raise( float_flag_invalid STATUS_VAR);
2675             if (    ! aSign
2676                  || (    ( aExp == 0x7FF )
2677                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2678                ) {
2679                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2680             }
2681             return (int64_t) LIT64( 0x8000000000000000 );
2682         }
2683         aSigExtra = 0;
2684         aSig <<= - shiftCount;
2685     }
2686     else {
2687         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2688     }
2689     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2690
2691 }
2692
2693 /*----------------------------------------------------------------------------
2694 | Returns the result of converting the double-precision floating-point value
2695 | `a' to the 64-bit two's complement integer format.  The conversion is
2696 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2697 | Arithmetic, except that the conversion is always rounded toward zero.
2698 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2699 | the conversion overflows, the largest integer with the same sign as `a' is
2700 | returned.
2701 *----------------------------------------------------------------------------*/
2702
2703 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2704 {
2705     flag aSign;
2706     int16 aExp, shiftCount;
2707     uint64_t aSig;
2708     int64 z;
2709     a = float64_squash_input_denormal(a STATUS_VAR);
2710
2711     aSig = extractFloat64Frac( a );
2712     aExp = extractFloat64Exp( a );
2713     aSign = extractFloat64Sign( a );
2714     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2715     shiftCount = aExp - 0x433;
2716     if ( 0 <= shiftCount ) {
2717         if ( 0x43E <= aExp ) {
2718             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2719                 float_raise( float_flag_invalid STATUS_VAR);
2720                 if (    ! aSign
2721                      || (    ( aExp == 0x7FF )
2722                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2723                    ) {
2724                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2725                 }
2726             }
2727             return (int64_t) LIT64( 0x8000000000000000 );
2728         }
2729         z = aSig<<shiftCount;
2730     }
2731     else {
2732         if ( aExp < 0x3FE ) {
2733             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2734             return 0;
2735         }
2736         z = aSig>>( - shiftCount );
2737         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2738             STATUS(float_exception_flags) |= float_flag_inexact;
2739         }
2740     }
2741     if ( aSign ) z = - z;
2742     return z;
2743
2744 }
2745
2746 /*----------------------------------------------------------------------------
2747 | Returns the result of converting the double-precision floating-point value
2748 | `a' to the single-precision floating-point format.  The conversion is
2749 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2750 | Arithmetic.
2751 *----------------------------------------------------------------------------*/
2752
2753 float32 float64_to_float32( float64 a STATUS_PARAM )
2754 {
2755     flag aSign;
2756     int16 aExp;
2757     uint64_t aSig;
2758     uint32_t zSig;
2759     a = float64_squash_input_denormal(a STATUS_VAR);
2760
2761     aSig = extractFloat64Frac( a );
2762     aExp = extractFloat64Exp( a );
2763     aSign = extractFloat64Sign( a );
2764     if ( aExp == 0x7FF ) {
2765         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2766         return packFloat32( aSign, 0xFF, 0 );
2767     }
2768     shift64RightJamming( aSig, 22, &aSig );
2769     zSig = aSig;
2770     if ( aExp || zSig ) {
2771         zSig |= 0x40000000;
2772         aExp -= 0x381;
2773     }
2774     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2775
2776 }
2777
2778
2779 /*----------------------------------------------------------------------------
2780 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2781 | half-precision floating-point value, returning the result.  After being
2782 | shifted into the proper positions, the three fields are simply added
2783 | together to form the result.  This means that any integer portion of `zSig'
2784 | will be added into the exponent.  Since a properly normalized significand
2785 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2786 | than the desired result exponent whenever `zSig' is a complete, normalized
2787 | significand.
2788 *----------------------------------------------------------------------------*/
2789 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2790 {
2791     return make_float16(
2792         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2793 }
2794
2795 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2796    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2797
2798 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2799 {
2800     flag aSign;
2801     int16 aExp;
2802     uint32_t aSig;
2803
2804     aSign = extractFloat16Sign(a);
2805     aExp = extractFloat16Exp(a);
2806     aSig = extractFloat16Frac(a);
2807
2808     if (aExp == 0x1f && ieee) {
2809         if (aSig) {
2810             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2811         }
2812         return packFloat32(aSign, 0xff, aSig << 13);
2813     }
2814     if (aExp == 0) {
2815         int8 shiftCount;
2816
2817         if (aSig == 0) {
2818             return packFloat32(aSign, 0, 0);
2819         }
2820
2821         shiftCount = countLeadingZeros32( aSig ) - 21;
2822         aSig = aSig << shiftCount;
2823         aExp = -shiftCount;
2824     }
2825     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2826 }
2827
2828 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2829 {
2830     flag aSign;
2831     int16 aExp;
2832     uint32_t aSig;
2833     uint32_t mask;
2834     uint32_t increment;
2835     int8 roundingMode;
2836     a = float32_squash_input_denormal(a STATUS_VAR);
2837
2838     aSig = extractFloat32Frac( a );
2839     aExp = extractFloat32Exp( a );
2840     aSign = extractFloat32Sign( a );
2841     if ( aExp == 0xFF ) {
2842         if (aSig) {
2843             /* Input is a NaN */
2844             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2845             if (!ieee) {
2846                 return packFloat16(aSign, 0, 0);
2847             }
2848             return r;
2849         }
2850         /* Infinity */
2851         if (!ieee) {
2852             float_raise(float_flag_invalid STATUS_VAR);
2853             return packFloat16(aSign, 0x1f, 0x3ff);
2854         }
2855         return packFloat16(aSign, 0x1f, 0);
2856     }
2857     if (aExp == 0 && aSig == 0) {
2858         return packFloat16(aSign, 0, 0);
2859     }
2860     /* Decimal point between bits 22 and 23.  */
2861     aSig |= 0x00800000;
2862     aExp -= 0x7f;
2863     if (aExp < -14) {
2864         mask = 0x00ffffff;
2865         if (aExp >= -24) {
2866             mask >>= 25 + aExp;
2867         }
2868     } else {
2869         mask = 0x00001fff;
2870     }
2871     if (aSig & mask) {
2872         float_raise( float_flag_underflow STATUS_VAR );
2873         roundingMode = STATUS(float_rounding_mode);
2874         switch (roundingMode) {
2875         case float_round_nearest_even:
2876             increment = (mask + 1) >> 1;
2877             if ((aSig & mask) == increment) {
2878                 increment = aSig & (increment << 1);
2879             }
2880             break;
2881         case float_round_up:
2882             increment = aSign ? 0 : mask;
2883             break;
2884         case float_round_down:
2885             increment = aSign ? mask : 0;
2886             break;
2887         default: /* round_to_zero */
2888             increment = 0;
2889             break;
2890         }
2891         aSig += increment;
2892         if (aSig >= 0x01000000) {
2893             aSig >>= 1;
2894             aExp++;
2895         }
2896     } else if (aExp < -14
2897           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2898         float_raise( float_flag_underflow STATUS_VAR);
2899     }
2900
2901     if (ieee) {
2902         if (aExp > 15) {
2903             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2904             return packFloat16(aSign, 0x1f, 0);
2905         }
2906     } else {
2907         if (aExp > 16) {
2908             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2909             return packFloat16(aSign, 0x1f, 0x3ff);
2910         }
2911     }
2912     if (aExp < -24) {
2913         return packFloat16(aSign, 0, 0);
2914     }
2915     if (aExp < -14) {
2916         aSig >>= -14 - aExp;
2917         aExp = -14;
2918     }
2919     return packFloat16(aSign, aExp + 14, aSig >> 13);
2920 }
2921
2922 #ifdef FLOATX80
2923
2924 /*----------------------------------------------------------------------------
2925 | Returns the result of converting the double-precision floating-point value
2926 | `a' to the extended double-precision floating-point format.  The conversion
2927 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2928 | Arithmetic.
2929 *----------------------------------------------------------------------------*/
2930
2931 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2932 {
2933     flag aSign;
2934     int16 aExp;
2935     uint64_t aSig;
2936
2937     a = float64_squash_input_denormal(a STATUS_VAR);
2938     aSig = extractFloat64Frac( a );
2939     aExp = extractFloat64Exp( a );
2940     aSign = extractFloat64Sign( a );
2941     if ( aExp == 0x7FF ) {
2942         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2943         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2944     }
2945     if ( aExp == 0 ) {
2946         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2947         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2948     }
2949     return
2950         packFloatx80(
2951             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2952
2953 }
2954
2955 #endif
2956
2957 #ifdef FLOAT128
2958
2959 /*----------------------------------------------------------------------------
2960 | Returns the result of converting the double-precision floating-point value
2961 | `a' to the quadruple-precision floating-point format.  The conversion is
2962 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2963 | Arithmetic.
2964 *----------------------------------------------------------------------------*/
2965
2966 float128 float64_to_float128( float64 a STATUS_PARAM )
2967 {
2968     flag aSign;
2969     int16 aExp;
2970     uint64_t aSig, zSig0, zSig1;
2971
2972     a = float64_squash_input_denormal(a STATUS_VAR);
2973     aSig = extractFloat64Frac( a );
2974     aExp = extractFloat64Exp( a );
2975     aSign = extractFloat64Sign( a );
2976     if ( aExp == 0x7FF ) {
2977         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2978         return packFloat128( aSign, 0x7FFF, 0, 0 );
2979     }
2980     if ( aExp == 0 ) {
2981         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2982         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2983         --aExp;
2984     }
2985     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2986     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2987
2988 }
2989
2990 #endif
2991
2992 /*----------------------------------------------------------------------------
2993 | Rounds the double-precision floating-point value `a' to an integer, and
2994 | returns the result as a double-precision floating-point value.  The
2995 | operation is performed according to the IEC/IEEE Standard for Binary
2996 | Floating-Point Arithmetic.
2997 *----------------------------------------------------------------------------*/
2998
2999 float64 float64_round_to_int( float64 a STATUS_PARAM )
3000 {
3001     flag aSign;
3002     int16 aExp;
3003     uint64_t lastBitMask, roundBitsMask;
3004     int8 roundingMode;
3005     uint64_t z;
3006     a = float64_squash_input_denormal(a STATUS_VAR);
3007
3008     aExp = extractFloat64Exp( a );
3009     if ( 0x433 <= aExp ) {
3010         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3011             return propagateFloat64NaN( a, a STATUS_VAR );
3012         }
3013         return a;
3014     }
3015     if ( aExp < 0x3FF ) {
3016         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3017         STATUS(float_exception_flags) |= float_flag_inexact;
3018         aSign = extractFloat64Sign( a );
3019         switch ( STATUS(float_rounding_mode) ) {
3020          case float_round_nearest_even:
3021             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3022                 return packFloat64( aSign, 0x3FF, 0 );
3023             }
3024             break;
3025          case float_round_down:
3026             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3027          case float_round_up:
3028             return make_float64(
3029             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3030         }
3031         return packFloat64( aSign, 0, 0 );
3032     }
3033     lastBitMask = 1;
3034     lastBitMask <<= 0x433 - aExp;
3035     roundBitsMask = lastBitMask - 1;
3036     z = float64_val(a);
3037     roundingMode = STATUS(float_rounding_mode);
3038     if ( roundingMode == float_round_nearest_even ) {
3039         z += lastBitMask>>1;
3040         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3041     }
3042     else if ( roundingMode != float_round_to_zero ) {
3043         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3044             z += roundBitsMask;
3045         }
3046     }
3047     z &= ~ roundBitsMask;
3048     if ( z != float64_val(a) )
3049         STATUS(float_exception_flags) |= float_flag_inexact;
3050     return make_float64(z);
3051
3052 }
3053
3054 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3055 {
3056     int oldmode;
3057     float64 res;
3058     oldmode = STATUS(float_rounding_mode);
3059     STATUS(float_rounding_mode) = float_round_to_zero;
3060     res = float64_round_to_int(a STATUS_VAR);
3061     STATUS(float_rounding_mode) = oldmode;
3062     return res;
3063 }
3064
3065 /*----------------------------------------------------------------------------
3066 | Returns the result of adding the absolute values of the double-precision
3067 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3068 | before being returned.  `zSign' is ignored if the result is a NaN.
3069 | The addition is performed according to the IEC/IEEE Standard for Binary
3070 | Floating-Point Arithmetic.
3071 *----------------------------------------------------------------------------*/
3072
3073 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3074 {
3075     int16 aExp, bExp, zExp;
3076     uint64_t aSig, bSig, zSig;
3077     int16 expDiff;
3078
3079     aSig = extractFloat64Frac( a );
3080     aExp = extractFloat64Exp( a );
3081     bSig = extractFloat64Frac( b );
3082     bExp = extractFloat64Exp( b );
3083     expDiff = aExp - bExp;
3084     aSig <<= 9;
3085     bSig <<= 9;
3086     if ( 0 < expDiff ) {
3087         if ( aExp == 0x7FF ) {
3088             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3089             return a;
3090         }
3091         if ( bExp == 0 ) {
3092             --expDiff;
3093         }
3094         else {
3095             bSig |= LIT64( 0x2000000000000000 );
3096         }
3097         shift64RightJamming( bSig, expDiff, &bSig );
3098         zExp = aExp;
3099     }
3100     else if ( expDiff < 0 ) {
3101         if ( bExp == 0x7FF ) {
3102             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3103             return packFloat64( zSign, 0x7FF, 0 );
3104         }
3105         if ( aExp == 0 ) {
3106             ++expDiff;
3107         }
3108         else {
3109             aSig |= LIT64( 0x2000000000000000 );
3110         }
3111         shift64RightJamming( aSig, - expDiff, &aSig );
3112         zExp = bExp;
3113     }
3114     else {
3115         if ( aExp == 0x7FF ) {
3116             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3117             return a;
3118         }
3119         if ( aExp == 0 ) {
3120             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3121             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3122         }
3123         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3124         zExp = aExp;
3125         goto roundAndPack;
3126     }
3127     aSig |= LIT64( 0x2000000000000000 );
3128     zSig = ( aSig + bSig )<<1;
3129     --zExp;
3130     if ( (int64_t) zSig < 0 ) {
3131         zSig = aSig + bSig;
3132         ++zExp;
3133     }
3134  roundAndPack:
3135     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3136
3137 }
3138
3139 /*----------------------------------------------------------------------------
3140 | Returns the result of subtracting the absolute values of the double-
3141 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3142 | difference is negated before being returned.  `zSign' is ignored if the
3143 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3144 | Standard for Binary Floating-Point Arithmetic.
3145 *----------------------------------------------------------------------------*/
3146
3147 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3148 {
3149     int16 aExp, bExp, zExp;
3150     uint64_t aSig, bSig, zSig;
3151     int16 expDiff;
3152
3153     aSig = extractFloat64Frac( a );
3154     aExp = extractFloat64Exp( a );
3155     bSig = extractFloat64Frac( b );
3156     bExp = extractFloat64Exp( b );
3157     expDiff = aExp - bExp;
3158     aSig <<= 10;
3159     bSig <<= 10;
3160     if ( 0 < expDiff ) goto aExpBigger;
3161     if ( expDiff < 0 ) goto bExpBigger;
3162     if ( aExp == 0x7FF ) {
3163         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3164         float_raise( float_flag_invalid STATUS_VAR);
3165         return float64_default_nan;
3166     }
3167     if ( aExp == 0 ) {
3168         aExp = 1;
3169         bExp = 1;
3170     }
3171     if ( bSig < aSig ) goto aBigger;
3172     if ( aSig < bSig ) goto bBigger;
3173     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3174  bExpBigger:
3175     if ( bExp == 0x7FF ) {
3176         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3177         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3178     }
3179     if ( aExp == 0 ) {
3180         ++expDiff;
3181     }
3182     else {
3183         aSig |= LIT64( 0x4000000000000000 );
3184     }
3185     shift64RightJamming( aSig, - expDiff, &aSig );
3186     bSig |= LIT64( 0x4000000000000000 );
3187  bBigger:
3188     zSig = bSig - aSig;
3189     zExp = bExp;
3190     zSign ^= 1;
3191     goto normalizeRoundAndPack;
3192  aExpBigger:
3193     if ( aExp == 0x7FF ) {
3194         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3195         return a;
3196     }
3197     if ( bExp == 0 ) {
3198         --expDiff;
3199     }
3200     else {
3201         bSig |= LIT64( 0x4000000000000000 );
3202     }
3203     shift64RightJamming( bSig, expDiff, &bSig );
3204     aSig |= LIT64( 0x4000000000000000 );
3205  aBigger:
3206     zSig = aSig - bSig;
3207     zExp = aExp;
3208  normalizeRoundAndPack:
3209     --zExp;
3210     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3211
3212 }
3213
3214 /*----------------------------------------------------------------------------
3215 | Returns the result of adding the double-precision floating-point values `a'
3216 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3217 | Binary Floating-Point Arithmetic.
3218 *----------------------------------------------------------------------------*/
3219
3220 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3221 {
3222     flag aSign, bSign;
3223     a = float64_squash_input_denormal(a STATUS_VAR);
3224     b = float64_squash_input_denormal(b STATUS_VAR);
3225
3226     aSign = extractFloat64Sign( a );
3227     bSign = extractFloat64Sign( b );
3228     if ( aSign == bSign ) {
3229         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3230     }
3231     else {
3232         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3233     }
3234
3235 }
3236
3237 /*----------------------------------------------------------------------------
3238 | Returns the result of subtracting the double-precision floating-point values
3239 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3240 | for Binary Floating-Point Arithmetic.
3241 *----------------------------------------------------------------------------*/
3242
3243 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3244 {
3245     flag aSign, bSign;
3246     a = float64_squash_input_denormal(a STATUS_VAR);
3247     b = float64_squash_input_denormal(b STATUS_VAR);
3248
3249     aSign = extractFloat64Sign( a );
3250     bSign = extractFloat64Sign( b );
3251     if ( aSign == bSign ) {
3252         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3253     }
3254     else {
3255         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3256     }
3257
3258 }
3259
3260 /*----------------------------------------------------------------------------
3261 | Returns the result of multiplying the double-precision floating-point values
3262 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3263 | for Binary Floating-Point Arithmetic.
3264 *----------------------------------------------------------------------------*/
3265
3266 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3267 {
3268     flag aSign, bSign, zSign;
3269     int16 aExp, bExp, zExp;
3270     uint64_t aSig, bSig, zSig0, zSig1;
3271
3272     a = float64_squash_input_denormal(a STATUS_VAR);
3273     b = float64_squash_input_denormal(b STATUS_VAR);
3274
3275     aSig = extractFloat64Frac( a );
3276     aExp = extractFloat64Exp( a );
3277     aSign = extractFloat64Sign( a );
3278     bSig = extractFloat64Frac( b );
3279     bExp = extractFloat64Exp( b );
3280     bSign = extractFloat64Sign( b );
3281     zSign = aSign ^ bSign;
3282     if ( aExp == 0x7FF ) {
3283         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3284             return propagateFloat64NaN( a, b STATUS_VAR );
3285         }
3286         if ( ( bExp | bSig ) == 0 ) {
3287             float_raise( float_flag_invalid STATUS_VAR);
3288             return float64_default_nan;
3289         }
3290         return packFloat64( zSign, 0x7FF, 0 );
3291     }
3292     if ( bExp == 0x7FF ) {
3293         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3294         if ( ( aExp | aSig ) == 0 ) {
3295             float_raise( float_flag_invalid STATUS_VAR);
3296             return float64_default_nan;
3297         }
3298         return packFloat64( zSign, 0x7FF, 0 );
3299     }
3300     if ( aExp == 0 ) {
3301         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3302         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3303     }
3304     if ( bExp == 0 ) {
3305         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3306         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3307     }
3308     zExp = aExp + bExp - 0x3FF;
3309     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3310     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3311     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3312     zSig0 |= ( zSig1 != 0 );
3313     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3314         zSig0 <<= 1;
3315         --zExp;
3316     }
3317     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3318
3319 }
3320
3321 /*----------------------------------------------------------------------------
3322 | Returns the result of dividing the double-precision floating-point value `a'
3323 | by the corresponding value `b'.  The operation is performed according to
3324 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3325 *----------------------------------------------------------------------------*/
3326
3327 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3328 {
3329     flag aSign, bSign, zSign;
3330     int16 aExp, bExp, zExp;
3331     uint64_t aSig, bSig, zSig;
3332     uint64_t rem0, rem1;
3333     uint64_t term0, term1;
3334     a = float64_squash_input_denormal(a STATUS_VAR);
3335     b = float64_squash_input_denormal(b STATUS_VAR);
3336
3337     aSig = extractFloat64Frac( a );
3338     aExp = extractFloat64Exp( a );
3339     aSign = extractFloat64Sign( a );
3340     bSig = extractFloat64Frac( b );
3341     bExp = extractFloat64Exp( b );
3342     bSign = extractFloat64Sign( b );
3343     zSign = aSign ^ bSign;
3344     if ( aExp == 0x7FF ) {
3345         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3346         if ( bExp == 0x7FF ) {
3347             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3348             float_raise( float_flag_invalid STATUS_VAR);
3349             return float64_default_nan;
3350         }
3351         return packFloat64( zSign, 0x7FF, 0 );
3352     }
3353     if ( bExp == 0x7FF ) {
3354         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3355         return packFloat64( zSign, 0, 0 );
3356     }
3357     if ( bExp == 0 ) {
3358         if ( bSig == 0 ) {
3359             if ( ( aExp | aSig ) == 0 ) {
3360                 float_raise( float_flag_invalid STATUS_VAR);
3361                 return float64_default_nan;
3362             }
3363             float_raise( float_flag_divbyzero STATUS_VAR);
3364             return packFloat64( zSign, 0x7FF, 0 );
3365         }
3366         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3367     }
3368     if ( aExp == 0 ) {
3369         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3370         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3371     }
3372     zExp = aExp - bExp + 0x3FD;
3373     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3374     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3375     if ( bSig <= ( aSig + aSig ) ) {
3376         aSig >>= 1;
3377         ++zExp;
3378     }
3379     zSig = estimateDiv128To64( aSig, 0, bSig );
3380     if ( ( zSig & 0x1FF ) <= 2 ) {
3381         mul64To128( bSig, zSig, &term0, &term1 );
3382         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3383         while ( (int64_t) rem0 < 0 ) {
3384             --zSig;
3385             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3386         }
3387         zSig |= ( rem1 != 0 );
3388     }
3389     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3390
3391 }
3392
3393 /*----------------------------------------------------------------------------
3394 | Returns the remainder of the double-precision floating-point value `a'
3395 | with respect to the corresponding value `b'.  The operation is performed
3396 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3397 *----------------------------------------------------------------------------*/
3398
3399 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3400 {
3401     flag aSign, zSign;
3402     int16 aExp, bExp, expDiff;
3403     uint64_t aSig, bSig;
3404     uint64_t q, alternateASig;
3405     int64_t sigMean;
3406
3407     a = float64_squash_input_denormal(a STATUS_VAR);
3408     b = float64_squash_input_denormal(b STATUS_VAR);
3409     aSig = extractFloat64Frac( a );
3410     aExp = extractFloat64Exp( a );
3411     aSign = extractFloat64Sign( a );
3412     bSig = extractFloat64Frac( b );
3413     bExp = extractFloat64Exp( b );
3414     if ( aExp == 0x7FF ) {
3415         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3416             return propagateFloat64NaN( a, b STATUS_VAR );
3417         }
3418         float_raise( float_flag_invalid STATUS_VAR);
3419         return float64_default_nan;
3420     }
3421     if ( bExp == 0x7FF ) {
3422         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3423         return a;
3424     }
3425     if ( bExp == 0 ) {
3426         if ( bSig == 0 ) {
3427             float_raise( float_flag_invalid STATUS_VAR);
3428             return float64_default_nan;
3429         }
3430         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3431     }
3432     if ( aExp == 0 ) {
3433         if ( aSig == 0 ) return a;
3434         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3435     }
3436     expDiff = aExp - bExp;
3437     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3438     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3439     if ( expDiff < 0 ) {
3440         if ( expDiff < -1 ) return a;
3441         aSig >>= 1;
3442     }
3443     q = ( bSig <= aSig );
3444     if ( q ) aSig -= bSig;
3445     expDiff -= 64;
3446     while ( 0 < expDiff ) {
3447         q = estimateDiv128To64( aSig, 0, bSig );
3448         q = ( 2 < q ) ? q - 2 : 0;
3449         aSig = - ( ( bSig>>2 ) * q );
3450         expDiff -= 62;
3451     }
3452     expDiff += 64;
3453     if ( 0 < expDiff ) {
3454         q = estimateDiv128To64( aSig, 0, bSig );
3455         q = ( 2 < q ) ? q - 2 : 0;
3456         q >>= 64 - expDiff;
3457         bSig >>= 2;
3458         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3459     }
3460     else {
3461         aSig >>= 2;
3462         bSig >>= 2;
3463     }
3464     do {
3465         alternateASig = aSig;
3466         ++q;
3467         aSig -= bSig;
3468     } while ( 0 <= (int64_t) aSig );
3469     sigMean = aSig + alternateASig;
3470     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3471         aSig = alternateASig;
3472     }
3473     zSign = ( (int64_t) aSig < 0 );
3474     if ( zSign ) aSig = - aSig;
3475     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3476
3477 }
3478
3479 /*----------------------------------------------------------------------------
3480 | Returns the square root of the double-precision floating-point value `a'.
3481 | The operation is performed according to the IEC/IEEE Standard for Binary
3482 | Floating-Point Arithmetic.
3483 *----------------------------------------------------------------------------*/
3484
3485 float64 float64_sqrt( float64 a STATUS_PARAM )
3486 {
3487     flag aSign;
3488     int16 aExp, zExp;
3489     uint64_t aSig, zSig, doubleZSig;
3490     uint64_t rem0, rem1, term0, term1;
3491     a = float64_squash_input_denormal(a STATUS_VAR);
3492
3493     aSig = extractFloat64Frac( a );
3494     aExp = extractFloat64Exp( a );
3495     aSign = extractFloat64Sign( a );
3496     if ( aExp == 0x7FF ) {
3497         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3498         if ( ! aSign ) return a;
3499         float_raise( float_flag_invalid STATUS_VAR);
3500         return float64_default_nan;
3501     }
3502     if ( aSign ) {
3503         if ( ( aExp | aSig ) == 0 ) return a;
3504         float_raise( float_flag_invalid STATUS_VAR);
3505         return float64_default_nan;
3506     }
3507     if ( aExp == 0 ) {
3508         if ( aSig == 0 ) return float64_zero;
3509         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3510     }
3511     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3512     aSig |= LIT64( 0x0010000000000000 );
3513     zSig = estimateSqrt32( aExp, aSig>>21 );
3514     aSig <<= 9 - ( aExp & 1 );
3515     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3516     if ( ( zSig & 0x1FF ) <= 5 ) {
3517         doubleZSig = zSig<<1;
3518         mul64To128( zSig, zSig, &term0, &term1 );
3519         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3520         while ( (int64_t) rem0 < 0 ) {
3521             --zSig;
3522             doubleZSig -= 2;
3523             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3524         }
3525         zSig |= ( ( rem0 | rem1 ) != 0 );
3526     }
3527     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3528
3529 }
3530
3531 /*----------------------------------------------------------------------------
3532 | Returns the binary log of the double-precision floating-point value `a'.
3533 | The operation is performed according to the IEC/IEEE Standard for Binary
3534 | Floating-Point Arithmetic.
3535 *----------------------------------------------------------------------------*/
3536 float64 float64_log2( float64 a STATUS_PARAM )
3537 {
3538     flag aSign, zSign;
3539     int16 aExp;
3540     uint64_t aSig, aSig0, aSig1, zSig, i;
3541     a = float64_squash_input_denormal(a STATUS_VAR);
3542
3543     aSig = extractFloat64Frac( a );
3544     aExp = extractFloat64Exp( a );
3545     aSign = extractFloat64Sign( a );
3546
3547     if ( aExp == 0 ) {
3548         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3549         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3550     }
3551     if ( aSign ) {
3552         float_raise( float_flag_invalid STATUS_VAR);
3553         return float64_default_nan;
3554     }
3555     if ( aExp == 0x7FF ) {
3556         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3557         return a;
3558     }
3559
3560     aExp -= 0x3FF;
3561     aSig |= LIT64( 0x0010000000000000 );
3562     zSign = aExp < 0;
3563     zSig = (uint64_t)aExp << 52;
3564     for (i = 1LL << 51; i > 0; i >>= 1) {
3565         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3566         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3567         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3568             aSig >>= 1;
3569             zSig |= i;
3570         }
3571     }
3572
3573     if ( zSign )
3574         zSig = -zSig;
3575     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3576 }
3577
3578 /*----------------------------------------------------------------------------
3579 | Returns 1 if the double-precision floating-point value `a' is equal to the
3580 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3581 | if either operand is a NaN.  Otherwise, the comparison is performed
3582 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3583 *----------------------------------------------------------------------------*/
3584
3585 int float64_eq( float64 a, float64 b STATUS_PARAM )
3586 {
3587     uint64_t av, bv;
3588     a = float64_squash_input_denormal(a STATUS_VAR);
3589     b = float64_squash_input_denormal(b STATUS_VAR);
3590
3591     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3592          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3593        ) {
3594         float_raise( float_flag_invalid STATUS_VAR);
3595         return 0;
3596     }
3597     av = float64_val(a);
3598     bv = float64_val(b);
3599     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3600
3601 }
3602
3603 /*----------------------------------------------------------------------------
3604 | Returns 1 if the double-precision floating-point value `a' is less than or
3605 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3606 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3607 | Arithmetic.
3608 *----------------------------------------------------------------------------*/
3609
3610 int float64_le( float64 a, float64 b STATUS_PARAM )
3611 {
3612     flag aSign, bSign;
3613     uint64_t av, bv;
3614     a = float64_squash_input_denormal(a STATUS_VAR);
3615     b = float64_squash_input_denormal(b STATUS_VAR);
3616
3617     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3618          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3619        ) {
3620         float_raise( float_flag_invalid STATUS_VAR);
3621         return 0;
3622     }
3623     aSign = extractFloat64Sign( a );
3624     bSign = extractFloat64Sign( b );
3625     av = float64_val(a);
3626     bv = float64_val(b);
3627     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3628     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3629
3630 }
3631
3632 /*----------------------------------------------------------------------------
3633 | Returns 1 if the double-precision floating-point value `a' is less than
3634 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3635 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3636 *----------------------------------------------------------------------------*/
3637
3638 int float64_lt( float64 a, float64 b STATUS_PARAM )
3639 {
3640     flag aSign, bSign;
3641     uint64_t av, bv;
3642
3643     a = float64_squash_input_denormal(a STATUS_VAR);
3644     b = float64_squash_input_denormal(b STATUS_VAR);
3645     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3646          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3647        ) {
3648         float_raise( float_flag_invalid STATUS_VAR);
3649         return 0;
3650     }
3651     aSign = extractFloat64Sign( a );
3652     bSign = extractFloat64Sign( b );
3653     av = float64_val(a);
3654     bv = float64_val(b);
3655     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3656     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3657
3658 }
3659
3660 /*----------------------------------------------------------------------------
3661 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3662 | be compared, and 0 otherwise.  The comparison is performed according to the
3663 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3664 *----------------------------------------------------------------------------*/
3665
3666 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3667 {
3668     a = float64_squash_input_denormal(a STATUS_VAR);
3669     b = float64_squash_input_denormal(b STATUS_VAR);
3670
3671     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3672          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3673        ) {
3674         float_raise( float_flag_invalid STATUS_VAR);
3675         return 1;
3676     }
3677     return 0;
3678 }
3679
3680 /*----------------------------------------------------------------------------
3681 | Returns 1 if the double-precision floating-point value `a' is equal to the
3682 | corresponding value `b', and 0 otherwise.  The comparison is performed
3683 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3684 *----------------------------------------------------------------------------*/
3685
3686 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3687 {
3688     uint64_t av, bv;
3689     a = float64_squash_input_denormal(a STATUS_VAR);
3690     b = float64_squash_input_denormal(b STATUS_VAR);
3691
3692     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3693          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3694        ) {
3695         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3696             float_raise( float_flag_invalid STATUS_VAR);
3697         }
3698         return 0;
3699     }
3700     av = float64_val(a);
3701     bv = float64_val(b);
3702     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3703
3704 }
3705
3706 /*----------------------------------------------------------------------------
3707 | Returns 1 if the double-precision floating-point value `a' is less than or
3708 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3709 | cause an exception.  Otherwise, the comparison is performed according to the
3710 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3711 *----------------------------------------------------------------------------*/
3712
3713 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3714 {
3715     flag aSign, bSign;
3716     uint64_t av, bv;
3717     a = float64_squash_input_denormal(a STATUS_VAR);
3718     b = float64_squash_input_denormal(b STATUS_VAR);
3719
3720     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3721          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3722        ) {
3723         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3724             float_raise( float_flag_invalid STATUS_VAR);
3725         }
3726         return 0;
3727     }
3728     aSign = extractFloat64Sign( a );
3729     bSign = extractFloat64Sign( b );
3730     av = float64_val(a);
3731     bv = float64_val(b);
3732     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3733     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3734
3735 }
3736
3737 /*----------------------------------------------------------------------------
3738 | Returns 1 if the double-precision floating-point value `a' is less than
3739 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3740 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3741 | Standard for Binary Floating-Point Arithmetic.
3742 *----------------------------------------------------------------------------*/
3743
3744 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3745 {
3746     flag aSign, bSign;
3747     uint64_t av, bv;
3748     a = float64_squash_input_denormal(a STATUS_VAR);
3749     b = float64_squash_input_denormal(b STATUS_VAR);
3750
3751     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3752          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3753        ) {
3754         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3755             float_raise( float_flag_invalid STATUS_VAR);
3756         }
3757         return 0;
3758     }
3759     aSign = extractFloat64Sign( a );
3760     bSign = extractFloat64Sign( b );
3761     av = float64_val(a);
3762     bv = float64_val(b);
3763     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3764     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3765
3766 }
3767
3768 /*----------------------------------------------------------------------------
3769 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3770 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3771 | comparison is performed according to the IEC/IEEE Standard for Binary
3772 | Floating-Point Arithmetic.
3773 *----------------------------------------------------------------------------*/
3774
3775 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3776 {
3777     a = float64_squash_input_denormal(a STATUS_VAR);
3778     b = float64_squash_input_denormal(b STATUS_VAR);
3779
3780     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3781          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3782        ) {
3783         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3784             float_raise( float_flag_invalid STATUS_VAR);
3785         }
3786         return 1;
3787     }
3788     return 0;
3789 }
3790
3791 #ifdef FLOATX80
3792
3793 /*----------------------------------------------------------------------------
3794 | Returns the result of converting the extended double-precision floating-
3795 | point value `a' to the 32-bit two's complement integer format.  The
3796 | conversion is performed according to the IEC/IEEE Standard for Binary
3797 | Floating-Point Arithmetic---which means in particular that the conversion
3798 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3799 | largest positive integer is returned.  Otherwise, if the conversion
3800 | overflows, the largest integer with the same sign as `a' is returned.
3801 *----------------------------------------------------------------------------*/
3802
3803 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3804 {
3805     flag aSign;
3806     int32 aExp, shiftCount;
3807     uint64_t aSig;
3808
3809     aSig = extractFloatx80Frac( a );
3810     aExp = extractFloatx80Exp( a );
3811     aSign = extractFloatx80Sign( a );
3812     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3813     shiftCount = 0x4037 - aExp;
3814     if ( shiftCount <= 0 ) shiftCount = 1;
3815     shift64RightJamming( aSig, shiftCount, &aSig );
3816     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3817
3818 }
3819
3820 /*----------------------------------------------------------------------------
3821 | Returns the result of converting the extended double-precision floating-
3822 | point value `a' to the 32-bit two's complement integer format.  The
3823 | conversion is performed according to the IEC/IEEE Standard for Binary
3824 | Floating-Point Arithmetic, except that the conversion is always rounded
3825 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3826 | Otherwise, if the conversion overflows, the largest integer with the same
3827 | sign as `a' is returned.
3828 *----------------------------------------------------------------------------*/
3829
3830 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3831 {
3832     flag aSign;
3833     int32 aExp, shiftCount;
3834     uint64_t aSig, savedASig;
3835     int32 z;
3836
3837     aSig = extractFloatx80Frac( a );
3838     aExp = extractFloatx80Exp( a );
3839     aSign = extractFloatx80Sign( a );
3840     if ( 0x401E < aExp ) {
3841         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3842         goto invalid;
3843     }
3844     else if ( aExp < 0x3FFF ) {
3845         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3846         return 0;
3847     }
3848     shiftCount = 0x403E - aExp;
3849     savedASig = aSig;
3850     aSig >>= shiftCount;
3851     z = aSig;
3852     if ( aSign ) z = - z;
3853     if ( ( z < 0 ) ^ aSign ) {
3854  invalid:
3855         float_raise( float_flag_invalid STATUS_VAR);
3856         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3857     }
3858     if ( ( aSig<<shiftCount ) != savedASig ) {
3859         STATUS(float_exception_flags) |= float_flag_inexact;
3860     }
3861     return z;
3862
3863 }
3864
3865 /*----------------------------------------------------------------------------
3866 | Returns the result of converting the extended double-precision floating-
3867 | point value `a' to the 64-bit two's complement integer format.  The
3868 | conversion is performed according to the IEC/IEEE Standard for Binary
3869 | Floating-Point Arithmetic---which means in particular that the conversion
3870 | is rounded according to the current rounding mode.  If `a' is a NaN,
3871 | the largest positive integer is returned.  Otherwise, if the conversion
3872 | overflows, the largest integer with the same sign as `a' is returned.
3873 *----------------------------------------------------------------------------*/
3874
3875 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3876 {
3877     flag aSign;
3878     int32 aExp, shiftCount;
3879     uint64_t aSig, aSigExtra;
3880
3881     aSig = extractFloatx80Frac( a );
3882     aExp = extractFloatx80Exp( a );
3883     aSign = extractFloatx80Sign( a );
3884     shiftCount = 0x403E - aExp;
3885     if ( shiftCount <= 0 ) {
3886         if ( shiftCount ) {
3887             float_raise( float_flag_invalid STATUS_VAR);
3888             if (    ! aSign
3889                  || (    ( aExp == 0x7FFF )
3890                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3891                ) {
3892                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3893             }
3894             return (int64_t) LIT64( 0x8000000000000000 );
3895         }
3896         aSigExtra = 0;
3897     }
3898     else {
3899         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3900     }
3901     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3902
3903 }
3904
3905 /*----------------------------------------------------------------------------
3906 | Returns the result of converting the extended double-precision floating-
3907 | point value `a' to the 64-bit two's complement integer format.  The
3908 | conversion is performed according to the IEC/IEEE Standard for Binary
3909 | Floating-Point Arithmetic, except that the conversion is always rounded
3910 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3911 | Otherwise, if the conversion overflows, the largest integer with the same
3912 | sign as `a' is returned.
3913 *----------------------------------------------------------------------------*/
3914
3915 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3916 {
3917     flag aSign;
3918     int32 aExp, shiftCount;
3919     uint64_t aSig;
3920     int64 z;
3921
3922     aSig = extractFloatx80Frac( a );
3923     aExp = extractFloatx80Exp( a );
3924     aSign = extractFloatx80Sign( a );
3925     shiftCount = aExp - 0x403E;
3926     if ( 0 <= shiftCount ) {
3927         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3928         if ( ( a.high != 0xC03E ) || aSig ) {
3929             float_raise( float_flag_invalid STATUS_VAR);
3930             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3931                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3932             }
3933         }
3934         return (int64_t) LIT64( 0x8000000000000000 );
3935     }
3936     else if ( aExp < 0x3FFF ) {
3937         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3938         return 0;
3939     }
3940     z = aSig>>( - shiftCount );
3941     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3942         STATUS(float_exception_flags) |= float_flag_inexact;
3943     }
3944     if ( aSign ) z = - z;
3945     return z;
3946
3947 }
3948
3949 /*----------------------------------------------------------------------------
3950 | Returns the result of converting the extended double-precision floating-
3951 | point value `a' to the single-precision floating-point format.  The
3952 | conversion is performed according to the IEC/IEEE Standard for Binary
3953 | Floating-Point Arithmetic.
3954 *----------------------------------------------------------------------------*/
3955
3956 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3957 {
3958     flag aSign;
3959     int32 aExp;
3960     uint64_t aSig;
3961
3962     aSig = extractFloatx80Frac( a );
3963     aExp = extractFloatx80Exp( a );
3964     aSign = extractFloatx80Sign( a );
3965     if ( aExp == 0x7FFF ) {
3966         if ( (uint64_t) ( aSig<<1 ) ) {
3967             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3968         }
3969         return packFloat32( aSign, 0xFF, 0 );
3970     }
3971     shift64RightJamming( aSig, 33, &aSig );
3972     if ( aExp || aSig ) aExp -= 0x3F81;
3973     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3974
3975 }
3976
3977 /*----------------------------------------------------------------------------
3978 | Returns the result of converting the extended double-precision floating-
3979 | point value `a' to the double-precision floating-point format.  The
3980 | conversion is performed according to the IEC/IEEE Standard for Binary
3981 | Floating-Point Arithmetic.
3982 *----------------------------------------------------------------------------*/
3983
3984 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3985 {
3986     flag aSign;
3987     int32 aExp;
3988     uint64_t aSig, zSig;
3989
3990     aSig = extractFloatx80Frac( a );
3991     aExp = extractFloatx80Exp( a );
3992     aSign = extractFloatx80Sign( a );
3993     if ( aExp == 0x7FFF ) {
3994         if ( (uint64_t) ( aSig<<1 ) ) {
3995             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3996         }
3997         return packFloat64( aSign, 0x7FF, 0 );
3998     }
3999     shift64RightJamming( aSig, 1, &zSig );
4000     if ( aExp || aSig ) aExp -= 0x3C01;
4001     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4002
4003 }
4004
4005 #ifdef FLOAT128
4006
4007 /*----------------------------------------------------------------------------
4008 | Returns the result of converting the extended double-precision floating-
4009 | point value `a' to the quadruple-precision floating-point format.  The
4010 | conversion is performed according to the IEC/IEEE Standard for Binary
4011 | Floating-Point Arithmetic.
4012 *----------------------------------------------------------------------------*/
4013
4014 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4015 {
4016     flag aSign;
4017     int16 aExp;
4018     uint64_t aSig, zSig0, zSig1;
4019
4020     aSig = extractFloatx80Frac( a );
4021     aExp = extractFloatx80Exp( a );
4022     aSign = extractFloatx80Sign( a );
4023     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4024         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4025     }
4026     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4027     return packFloat128( aSign, aExp, zSig0, zSig1 );
4028
4029 }
4030
4031 #endif
4032
4033 /*----------------------------------------------------------------------------
4034 | Rounds the extended double-precision floating-point value `a' to an integer,
4035 | and returns the result as an extended quadruple-precision floating-point
4036 | value.  The operation is performed according to the IEC/IEEE Standard for
4037 | Binary Floating-Point Arithmetic.
4038 *----------------------------------------------------------------------------*/
4039
4040 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4041 {
4042     flag aSign;
4043     int32 aExp;
4044     uint64_t lastBitMask, roundBitsMask;
4045     int8 roundingMode;
4046     floatx80 z;
4047
4048     aExp = extractFloatx80Exp( a );
4049     if ( 0x403E <= aExp ) {
4050         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4051             return propagateFloatx80NaN( a, a STATUS_VAR );
4052         }
4053         return a;
4054     }
4055     if ( aExp < 0x3FFF ) {
4056         if (    ( aExp == 0 )
4057              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4058             return a;
4059         }
4060         STATUS(float_exception_flags) |= float_flag_inexact;
4061         aSign = extractFloatx80Sign( a );
4062         switch ( STATUS(float_rounding_mode) ) {
4063          case float_round_nearest_even:
4064             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4065                ) {
4066                 return
4067                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4068             }
4069             break;
4070          case float_round_down:
4071             return
4072                   aSign ?
4073                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4074                 : packFloatx80( 0, 0, 0 );
4075          case float_round_up:
4076             return
4077                   aSign ? packFloatx80( 1, 0, 0 )
4078                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4079         }
4080         return packFloatx80( aSign, 0, 0 );
4081     }
4082     lastBitMask = 1;
4083     lastBitMask <<= 0x403E - aExp;
4084     roundBitsMask = lastBitMask - 1;
4085     z = a;
4086     roundingMode = STATUS(float_rounding_mode);
4087     if ( roundingMode == float_round_nearest_even ) {
4088         z.low += lastBitMask>>1;
4089         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4090     }
4091     else if ( roundingMode != float_round_to_zero ) {
4092         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4093             z.low += roundBitsMask;
4094         }
4095     }
4096     z.low &= ~ roundBitsMask;
4097     if ( z.low == 0 ) {
4098         ++z.high;
4099         z.low = LIT64( 0x8000000000000000 );
4100     }
4101     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4102     return z;
4103
4104 }
4105
4106 /*----------------------------------------------------------------------------
4107 | Returns the result of adding the absolute values of the extended double-
4108 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4109 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4110 | The addition is performed according to the IEC/IEEE Standard for Binary
4111 | Floating-Point Arithmetic.
4112 *----------------------------------------------------------------------------*/
4113
4114 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4115 {
4116     int32 aExp, bExp, zExp;
4117     uint64_t aSig, bSig, zSig0, zSig1;
4118     int32 expDiff;
4119
4120     aSig = extractFloatx80Frac( a );
4121     aExp = extractFloatx80Exp( a );
4122     bSig = extractFloatx80Frac( b );
4123     bExp = extractFloatx80Exp( b );
4124     expDiff = aExp - bExp;
4125     if ( 0 < expDiff ) {
4126         if ( aExp == 0x7FFF ) {
4127             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4128             return a;
4129         }
4130         if ( bExp == 0 ) --expDiff;
4131         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4132         zExp = aExp;
4133     }
4134     else if ( expDiff < 0 ) {
4135         if ( bExp == 0x7FFF ) {
4136             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4137             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4138         }
4139         if ( aExp == 0 ) ++expDiff;
4140         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4141         zExp = bExp;
4142     }
4143     else {
4144         if ( aExp == 0x7FFF ) {
4145             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4146                 return propagateFloatx80NaN( a, b STATUS_VAR );
4147             }
4148             return a;
4149         }
4150         zSig1 = 0;
4151         zSig0 = aSig + bSig;
4152         if ( aExp == 0 ) {
4153             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4154             goto roundAndPack;
4155         }
4156         zExp = aExp;
4157         goto shiftRight1;
4158     }
4159     zSig0 = aSig + bSig;
4160     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4161  shiftRight1:
4162     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4163     zSig0 |= LIT64( 0x8000000000000000 );
4164     ++zExp;
4165  roundAndPack:
4166     return
4167         roundAndPackFloatx80(
4168             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4169
4170 }
4171
4172 /*----------------------------------------------------------------------------
4173 | Returns the result of subtracting the absolute values of the extended
4174 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4175 | difference is negated before being returned.  `zSign' is ignored if the
4176 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4177 | Standard for Binary Floating-Point Arithmetic.
4178 *----------------------------------------------------------------------------*/
4179
4180 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4181 {
4182     int32 aExp, bExp, zExp;
4183     uint64_t aSig, bSig, zSig0, zSig1;
4184     int32 expDiff;
4185     floatx80 z;
4186
4187     aSig = extractFloatx80Frac( a );
4188     aExp = extractFloatx80Exp( a );
4189     bSig = extractFloatx80Frac( b );
4190     bExp = extractFloatx80Exp( b );
4191     expDiff = aExp - bExp;
4192     if ( 0 < expDiff ) goto aExpBigger;
4193     if ( expDiff < 0 ) goto bExpBigger;
4194     if ( aExp == 0x7FFF ) {
4195         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4196             return propagateFloatx80NaN( a, b STATUS_VAR );
4197         }
4198         float_raise( float_flag_invalid STATUS_VAR);
4199         z.low = floatx80_default_nan_low;
4200         z.high = floatx80_default_nan_high;
4201         return z;
4202     }
4203     if ( aExp == 0 ) {
4204         aExp = 1;
4205         bExp = 1;
4206     }
4207     zSig1 = 0;
4208     if ( bSig < aSig ) goto aBigger;
4209     if ( aSig < bSig ) goto bBigger;
4210     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4211  bExpBigger:
4212     if ( bExp == 0x7FFF ) {
4213         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4214         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4215     }
4216     if ( aExp == 0 ) ++expDiff;
4217     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4218  bBigger:
4219     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4220     zExp = bExp;
4221     zSign ^= 1;
4222     goto normalizeRoundAndPack;
4223  aExpBigger:
4224     if ( aExp == 0x7FFF ) {
4225         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4226         return a;
4227     }
4228     if ( bExp == 0 ) --expDiff;
4229     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4230  aBigger:
4231     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4232     zExp = aExp;
4233  normalizeRoundAndPack:
4234     return
4235         normalizeRoundAndPackFloatx80(
4236             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4237
4238 }
4239
4240 /*----------------------------------------------------------------------------
4241 | Returns the result of adding the extended double-precision floating-point
4242 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4243 | Standard for Binary Floating-Point Arithmetic.
4244 *----------------------------------------------------------------------------*/
4245
4246 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4247 {
4248     flag aSign, bSign;
4249
4250     aSign = extractFloatx80Sign( a );
4251     bSign = extractFloatx80Sign( b );
4252     if ( aSign == bSign ) {
4253         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4254     }
4255     else {
4256         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4257     }
4258
4259 }
4260
4261 /*----------------------------------------------------------------------------
4262 | Returns the result of subtracting the extended double-precision floating-
4263 | point values `a' and `b'.  The operation is performed according to the
4264 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4265 *----------------------------------------------------------------------------*/
4266
4267 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4268 {
4269     flag aSign, bSign;
4270
4271     aSign = extractFloatx80Sign( a );
4272     bSign = extractFloatx80Sign( b );
4273     if ( aSign == bSign ) {
4274         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4275     }
4276     else {
4277         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4278     }
4279
4280 }
4281
4282 /*----------------------------------------------------------------------------
4283 | Returns the result of multiplying the extended double-precision floating-
4284 | point values `a' and `b'.  The operation is performed according to the
4285 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4286 *----------------------------------------------------------------------------*/
4287
4288 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4289 {
4290     flag aSign, bSign, zSign;
4291     int32 aExp, bExp, zExp;
4292     uint64_t aSig, bSig, zSig0, zSig1;
4293     floatx80 z;
4294
4295     aSig = extractFloatx80Frac( a );
4296     aExp = extractFloatx80Exp( a );
4297     aSign = extractFloatx80Sign( a );
4298     bSig = extractFloatx80Frac( b );
4299     bExp = extractFloatx80Exp( b );
4300     bSign = extractFloatx80Sign( b );
4301     zSign = aSign ^ bSign;
4302     if ( aExp == 0x7FFF ) {
4303         if (    (uint64_t) ( aSig<<1 )
4304              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4305             return propagateFloatx80NaN( a, b STATUS_VAR );
4306         }
4307         if ( ( bExp | bSig ) == 0 ) goto invalid;
4308         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4309     }
4310     if ( bExp == 0x7FFF ) {
4311         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4312         if ( ( aExp | aSig ) == 0 ) {
4313  invalid:
4314             float_raise( float_flag_invalid STATUS_VAR);
4315             z.low = floatx80_default_nan_low;
4316             z.high = floatx80_default_nan_high;
4317             return z;
4318         }
4319         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4320     }
4321     if ( aExp == 0 ) {
4322         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4323         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4324     }
4325     if ( bExp == 0 ) {
4326         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4327         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4328     }
4329     zExp = aExp + bExp - 0x3FFE;
4330     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4331     if ( 0 < (int64_t) zSig0 ) {
4332         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4333         --zExp;
4334     }
4335     return
4336         roundAndPackFloatx80(
4337             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4338
4339 }
4340
4341 /*----------------------------------------------------------------------------
4342 | Returns the result of dividing the extended double-precision floating-point
4343 | value `a' by the corresponding value `b'.  The operation is performed
4344 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4345 *----------------------------------------------------------------------------*/
4346
4347 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4348 {
4349     flag aSign, bSign, zSign;
4350     int32 aExp, bExp, zExp;
4351     uint64_t aSig, bSig, zSig0, zSig1;
4352     uint64_t rem0, rem1, rem2, term0, term1, term2;
4353     floatx80 z;
4354
4355     aSig = extractFloatx80Frac( a );
4356     aExp = extractFloatx80Exp( a );
4357     aSign = extractFloatx80Sign( a );
4358     bSig = extractFloatx80Frac( b );
4359     bExp = extractFloatx80Exp( b );
4360     bSign = extractFloatx80Sign( b );
4361     zSign = aSign ^ bSign;
4362     if ( aExp == 0x7FFF ) {
4363         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4364         if ( bExp == 0x7FFF ) {
4365             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4366             goto invalid;
4367         }
4368         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4369     }
4370     if ( bExp == 0x7FFF ) {
4371         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4372         return packFloatx80( zSign, 0, 0 );
4373     }
4374     if ( bExp == 0 ) {
4375         if ( bSig == 0 ) {
4376             if ( ( aExp | aSig ) == 0 ) {
4377  invalid:
4378                 float_raise( float_flag_invalid STATUS_VAR);
4379                 z.low = floatx80_default_nan_low;
4380                 z.high = floatx80_default_nan_high;
4381                 return z;
4382             }
4383             float_raise( float_flag_divbyzero STATUS_VAR);
4384             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4385         }
4386         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4387     }
4388     if ( aExp == 0 ) {
4389         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4390         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4391     }
4392     zExp = aExp - bExp + 0x3FFE;
4393     rem1 = 0;
4394     if ( bSig <= aSig ) {
4395         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4396         ++zExp;
4397     }
4398     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4399     mul64To128( bSig, zSig0, &term0, &term1 );
4400     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4401     while ( (int64_t) rem0 < 0 ) {
4402         --zSig0;
4403         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4404     }
4405     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4406     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4407         mul64To128( bSig, zSig1, &term1, &term2 );
4408         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4409         while ( (int64_t) rem1 < 0 ) {
4410             --zSig1;
4411             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4412         }
4413         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4414     }
4415     return
4416         roundAndPackFloatx80(
4417             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4418
4419 }
4420
4421 /*----------------------------------------------------------------------------
4422 | Returns the remainder of the extended double-precision floating-point value
4423 | `a' with respect to the corresponding value `b'.  The operation is performed
4424 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4425 *----------------------------------------------------------------------------*/
4426
4427 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4428 {
4429     flag aSign, zSign;
4430     int32 aExp, bExp, expDiff;
4431     uint64_t aSig0, aSig1, bSig;
4432     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4433     floatx80 z;
4434
4435     aSig0 = extractFloatx80Frac( a );
4436     aExp = extractFloatx80Exp( a );
4437     aSign = extractFloatx80Sign( a );
4438     bSig = extractFloatx80Frac( b );
4439     bExp = extractFloatx80Exp( b );
4440     if ( aExp == 0x7FFF ) {
4441         if (    (uint64_t) ( aSig0<<1 )
4442              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4443             return propagateFloatx80NaN( a, b STATUS_VAR );
4444         }
4445         goto invalid;
4446     }
4447     if ( bExp == 0x7FFF ) {
4448         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4449         return a;
4450     }
4451     if ( bExp == 0 ) {
4452         if ( bSig == 0 ) {
4453  invalid:
4454             float_raise( float_flag_invalid STATUS_VAR);
4455             z.low = floatx80_default_nan_low;
4456             z.high = floatx80_default_nan_high;
4457             return z;
4458         }
4459         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4460     }
4461     if ( aExp == 0 ) {
4462         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4463         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4464     }
4465     bSig |= LIT64( 0x8000000000000000 );
4466     zSign = aSign;
4467     expDiff = aExp - bExp;
4468     aSig1 = 0;
4469     if ( expDiff < 0 ) {
4470         if ( expDiff < -1 ) return a;
4471         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4472         expDiff = 0;
4473     }
4474     q = ( bSig <= aSig0 );
4475     if ( q ) aSig0 -= bSig;
4476     expDiff -= 64;
4477     while ( 0 < expDiff ) {
4478         q = estimateDiv128To64( aSig0, aSig1, bSig );
4479         q = ( 2 < q ) ? q - 2 : 0;
4480         mul64To128( bSig, q, &term0, &term1 );
4481         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4482         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4483         expDiff -= 62;
4484     }
4485     expDiff += 64;
4486     if ( 0 < expDiff ) {
4487         q = estimateDiv128To64( aSig0, aSig1, bSig );
4488         q = ( 2 < q ) ? q - 2 : 0;
4489         q >>= 64 - expDiff;
4490         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4491         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4492         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4493         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4494             ++q;
4495             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4496         }
4497     }
4498     else {
4499         term1 = 0;
4500         term0 = bSig;
4501     }
4502     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4503     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4504          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4505               && ( q & 1 ) )
4506        ) {
4507         aSig0 = alternateASig0;
4508         aSig1 = alternateASig1;
4509         zSign = ! zSign;
4510     }
4511     return
4512         normalizeRoundAndPackFloatx80(
4513             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4514
4515 }
4516
4517 /*----------------------------------------------------------------------------
4518 | Returns the square root of the extended double-precision floating-point
4519 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4520 | for Binary Floating-Point Arithmetic.
4521 *----------------------------------------------------------------------------*/
4522
4523 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4524 {
4525     flag aSign;
4526     int32 aExp, zExp;
4527     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4528     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4529     floatx80 z;
4530
4531     aSig0 = extractFloatx80Frac( a );
4532     aExp = extractFloatx80Exp( a );
4533     aSign = extractFloatx80Sign( a );
4534     if ( aExp == 0x7FFF ) {
4535         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4536         if ( ! aSign ) return a;
4537         goto invalid;
4538     }
4539     if ( aSign ) {
4540         if ( ( aExp | aSig0 ) == 0 ) return a;
4541  invalid:
4542         float_raise( float_flag_invalid STATUS_VAR);
4543         z.low = floatx80_default_nan_low;
4544         z.high = floatx80_default_nan_high;
4545         return z;
4546     }
4547     if ( aExp == 0 ) {
4548         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4549         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4550     }
4551     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4552     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4553     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4554     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4555     doubleZSig0 = zSig0<<1;
4556     mul64To128( zSig0, zSig0, &term0, &term1 );
4557     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4558     while ( (int64_t) rem0 < 0 ) {
4559         --zSig0;
4560         doubleZSig0 -= 2;
4561         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4562     }
4563     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4564     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4565         if ( zSig1 == 0 ) zSig1 = 1;
4566         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4567         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4568         mul64To128( zSig1, zSig1, &term2, &term3 );
4569         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4570         while ( (int64_t) rem1 < 0 ) {
4571             --zSig1;
4572             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4573             term3 |= 1;
4574             term2 |= doubleZSig0;
4575             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4576         }
4577         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4578     }
4579     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4580     zSig0 |= doubleZSig0;
4581     return
4582         roundAndPackFloatx80(
4583             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4584
4585 }
4586
4587 /*----------------------------------------------------------------------------
4588 | Returns 1 if the extended double-precision floating-point value `a' is equal
4589 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4590 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4591 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4592 *----------------------------------------------------------------------------*/
4593
4594 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4595 {
4596
4597     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4598               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4599          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4600               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4601        ) {
4602         float_raise( float_flag_invalid STATUS_VAR);
4603         return 0;
4604     }
4605     return
4606            ( a.low == b.low )
4607         && (    ( a.high == b.high )
4608              || (    ( a.low == 0 )
4609                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4610            );
4611
4612 }
4613
4614 /*----------------------------------------------------------------------------
4615 | Returns 1 if the extended double-precision floating-point value `a' is
4616 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4617 | comparison is performed according to the IEC/IEEE Standard for Binary
4618 | Floating-Point Arithmetic.
4619 *----------------------------------------------------------------------------*/
4620
4621 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4622 {
4623     flag aSign, bSign;
4624
4625     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4626               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4627          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4628               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4629        ) {
4630         float_raise( float_flag_invalid STATUS_VAR);
4631         return 0;
4632     }
4633     aSign = extractFloatx80Sign( a );
4634     bSign = extractFloatx80Sign( b );
4635     if ( aSign != bSign ) {
4636         return
4637                aSign
4638             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4639                  == 0 );
4640     }
4641     return
4642           aSign ? le128( b.high, b.low, a.high, a.low )
4643         : le128( a.high, a.low, b.high, b.low );
4644
4645 }
4646
4647 /*----------------------------------------------------------------------------
4648 | Returns 1 if the extended double-precision floating-point value `a' is
4649 | less than the corresponding value `b', and 0 otherwise.  The comparison
4650 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4651 | Arithmetic.
4652 *----------------------------------------------------------------------------*/
4653
4654 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4655 {
4656     flag aSign, bSign;
4657
4658     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4659               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4660          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4661               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4662        ) {
4663         float_raise( float_flag_invalid STATUS_VAR);
4664         return 0;
4665     }
4666     aSign = extractFloatx80Sign( a );
4667     bSign = extractFloatx80Sign( b );
4668     if ( aSign != bSign ) {
4669         return
4670                aSign
4671             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4672                  != 0 );
4673     }
4674     return
4675           aSign ? lt128( b.high, b.low, a.high, a.low )
4676         : lt128( a.high, a.low, b.high, b.low );
4677
4678 }
4679
4680 /*----------------------------------------------------------------------------
4681 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4682 | cannot be compared, and 0 otherwise.  The comparison is performed according
4683 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4684 *----------------------------------------------------------------------------*/
4685 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4686 {
4687     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4688               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4689          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4690               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4691        ) {
4692         float_raise( float_flag_invalid STATUS_VAR);
4693         return 1;
4694     }
4695     return 0;
4696 }
4697
4698 /*----------------------------------------------------------------------------
4699 | Returns 1 if the extended double-precision floating-point value `a' is
4700 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4701 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4702 | Arithmetic.
4703 *----------------------------------------------------------------------------*/
4704
4705 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4706 {
4707
4708     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4709               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4710          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4711               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4712        ) {
4713         if (    floatx80_is_signaling_nan( a )
4714              || floatx80_is_signaling_nan( b ) ) {
4715             float_raise( float_flag_invalid STATUS_VAR);
4716         }
4717         return 0;
4718     }
4719     return
4720            ( a.low == b.low )
4721         && (    ( a.high == b.high )
4722              || (    ( a.low == 0 )
4723                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4724            );
4725
4726 }
4727
4728 /*----------------------------------------------------------------------------
4729 | Returns 1 if the extended double-precision floating-point value `a' is less
4730 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4731 | do not cause an exception.  Otherwise, the comparison is performed according
4732 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4733 *----------------------------------------------------------------------------*/
4734
4735 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4736 {
4737     flag aSign, bSign;
4738
4739     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4740               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4741          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4742               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4743        ) {
4744         if (    floatx80_is_signaling_nan( a )
4745              || floatx80_is_signaling_nan( b ) ) {
4746             float_raise( float_flag_invalid STATUS_VAR);
4747         }
4748         return 0;
4749     }
4750     aSign = extractFloatx80Sign( a );
4751     bSign = extractFloatx80Sign( b );
4752     if ( aSign != bSign ) {
4753         return
4754                aSign
4755             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4756                  == 0 );
4757     }
4758     return
4759           aSign ? le128( b.high, b.low, a.high, a.low )
4760         : le128( a.high, a.low, b.high, b.low );
4761
4762 }
4763
4764 /*----------------------------------------------------------------------------
4765 | Returns 1 if the extended double-precision floating-point value `a' is less
4766 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4767 | an exception.  Otherwise, the comparison is performed according to the
4768 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4770
4771 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4772 {
4773     flag aSign, bSign;
4774
4775     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4776               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4777          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4778               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4779        ) {
4780         if (    floatx80_is_signaling_nan( a )
4781              || floatx80_is_signaling_nan( b ) ) {
4782             float_raise( float_flag_invalid STATUS_VAR);
4783         }
4784         return 0;
4785     }
4786     aSign = extractFloatx80Sign( a );
4787     bSign = extractFloatx80Sign( b );
4788     if ( aSign != bSign ) {
4789         return
4790                aSign
4791             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4792                  != 0 );
4793     }
4794     return
4795           aSign ? lt128( b.high, b.low, a.high, a.low )
4796         : lt128( a.high, a.low, b.high, b.low );
4797
4798 }
4799
4800 /*----------------------------------------------------------------------------
4801 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4802 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
4803 | The comparison is performed according to the IEC/IEEE Standard for Binary
4804 | Floating-Point Arithmetic.
4805 *----------------------------------------------------------------------------*/
4806 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4807 {
4808     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4809               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4810          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4811               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4812        ) {
4813         if (    floatx80_is_signaling_nan( a )
4814              || floatx80_is_signaling_nan( b ) ) {
4815             float_raise( float_flag_invalid STATUS_VAR);
4816         }
4817         return 1;
4818     }
4819     return 0;
4820 }
4821
4822 #endif
4823
4824 #ifdef FLOAT128
4825
4826 /*----------------------------------------------------------------------------
4827 | Returns the result of converting the quadruple-precision floating-point
4828 | value `a' to the 32-bit two's complement integer format.  The conversion
4829 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4830 | Arithmetic---which means in particular that the conversion is rounded
4831 | according to the current rounding mode.  If `a' is a NaN, the largest
4832 | positive integer is returned.  Otherwise, if the conversion overflows, the
4833 | largest integer with the same sign as `a' is returned.
4834 *----------------------------------------------------------------------------*/
4835
4836 int32 float128_to_int32( float128 a STATUS_PARAM )
4837 {
4838     flag aSign;
4839     int32 aExp, shiftCount;
4840     uint64_t aSig0, aSig1;
4841
4842     aSig1 = extractFloat128Frac1( a );
4843     aSig0 = extractFloat128Frac0( a );
4844     aExp = extractFloat128Exp( a );
4845     aSign = extractFloat128Sign( a );
4846     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4847     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4848     aSig0 |= ( aSig1 != 0 );
4849     shiftCount = 0x4028 - aExp;
4850     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4851     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4852
4853 }
4854
4855 /*----------------------------------------------------------------------------
4856 | Returns the result of converting the quadruple-precision floating-point
4857 | value `a' to the 32-bit two's complement integer format.  The conversion
4858 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4859 | Arithmetic, except that the conversion is always rounded toward zero.  If
4860 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4861 | conversion overflows, the largest integer with the same sign as `a' is
4862 | returned.
4863 *----------------------------------------------------------------------------*/
4864
4865 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4866 {
4867     flag aSign;
4868     int32 aExp, shiftCount;
4869     uint64_t aSig0, aSig1, savedASig;
4870     int32 z;
4871
4872     aSig1 = extractFloat128Frac1( a );
4873     aSig0 = extractFloat128Frac0( a );
4874     aExp = extractFloat128Exp( a );
4875     aSign = extractFloat128Sign( a );
4876     aSig0 |= ( aSig1 != 0 );
4877     if ( 0x401E < aExp ) {
4878         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4879         goto invalid;
4880     }
4881     else if ( aExp < 0x3FFF ) {
4882         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4883         return 0;
4884     }
4885     aSig0 |= LIT64( 0x0001000000000000 );
4886     shiftCount = 0x402F - aExp;
4887     savedASig = aSig0;
4888     aSig0 >>= shiftCount;
4889     z = aSig0;
4890     if ( aSign ) z = - z;
4891     if ( ( z < 0 ) ^ aSign ) {
4892  invalid:
4893         float_raise( float_flag_invalid STATUS_VAR);
4894         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4895     }
4896     if ( ( aSig0<<shiftCount ) != savedASig ) {
4897         STATUS(float_exception_flags) |= float_flag_inexact;
4898     }
4899     return z;
4900
4901 }
4902
4903 /*----------------------------------------------------------------------------
4904 | Returns the result of converting the quadruple-precision floating-point
4905 | value `a' to the 64-bit two's complement integer format.  The conversion
4906 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4907 | Arithmetic---which means in particular that the conversion is rounded
4908 | according to the current rounding mode.  If `a' is a NaN, the largest
4909 | positive integer is returned.  Otherwise, if the conversion overflows, the
4910 | largest integer with the same sign as `a' is returned.
4911 *----------------------------------------------------------------------------*/
4912
4913 int64 float128_to_int64( float128 a STATUS_PARAM )
4914 {
4915     flag aSign;
4916     int32 aExp, shiftCount;
4917     uint64_t aSig0, aSig1;
4918
4919     aSig1 = extractFloat128Frac1( a );
4920     aSig0 = extractFloat128Frac0( a );
4921     aExp = extractFloat128Exp( a );
4922     aSign = extractFloat128Sign( a );
4923     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4924     shiftCount = 0x402F - aExp;
4925     if ( shiftCount <= 0 ) {
4926         if ( 0x403E < aExp ) {
4927             float_raise( float_flag_invalid STATUS_VAR);
4928             if (    ! aSign
4929                  || (    ( aExp == 0x7FFF )
4930                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4931                     )
4932                ) {
4933                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4934             }
4935             return (int64_t) LIT64( 0x8000000000000000 );
4936         }
4937         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4938     }
4939     else {
4940         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4941     }
4942     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4943
4944 }
4945
4946 /*----------------------------------------------------------------------------
4947 | Returns the result of converting the quadruple-precision floating-point
4948 | value `a' to the 64-bit two's complement integer format.  The conversion
4949 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4950 | Arithmetic, except that the conversion is always rounded toward zero.
4951 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4952 | the conversion overflows, the largest integer with the same sign as `a' is
4953 | returned.
4954 *----------------------------------------------------------------------------*/
4955
4956 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4957 {
4958     flag aSign;
4959     int32 aExp, shiftCount;
4960     uint64_t aSig0, aSig1;
4961     int64 z;
4962
4963     aSig1 = extractFloat128Frac1( a );
4964     aSig0 = extractFloat128Frac0( a );
4965     aExp = extractFloat128Exp( a );
4966     aSign = extractFloat128Sign( a );
4967     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4968     shiftCount = aExp - 0x402F;
4969     if ( 0 < shiftCount ) {
4970         if ( 0x403E <= aExp ) {
4971             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4972             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4973                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4974                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4975             }
4976             else {
4977                 float_raise( float_flag_invalid STATUS_VAR);
4978                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4979                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4980                 }
4981             }
4982             return (int64_t) LIT64( 0x8000000000000000 );
4983         }
4984         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4985         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4986             STATUS(float_exception_flags) |= float_flag_inexact;
4987         }
4988     }
4989     else {
4990         if ( aExp < 0x3FFF ) {
4991             if ( aExp | aSig0 | aSig1 ) {
4992                 STATUS(float_exception_flags) |= float_flag_inexact;
4993             }
4994             return 0;
4995         }
4996         z = aSig0>>( - shiftCount );
4997         if (    aSig1
4998              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4999             STATUS(float_exception_flags) |= float_flag_inexact;
5000         }
5001     }
5002     if ( aSign ) z = - z;
5003     return z;
5004
5005 }
5006
5007 /*----------------------------------------------------------------------------
5008 | Returns the result of converting the quadruple-precision floating-point
5009 | value `a' to the single-precision floating-point format.  The conversion
5010 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5011 | Arithmetic.
5012 *----------------------------------------------------------------------------*/
5013
5014 float32 float128_to_float32( float128 a STATUS_PARAM )
5015 {
5016     flag aSign;
5017     int32 aExp;
5018     uint64_t aSig0, aSig1;
5019     uint32_t zSig;
5020
5021     aSig1 = extractFloat128Frac1( a );
5022     aSig0 = extractFloat128Frac0( a );
5023     aExp = extractFloat128Exp( a );
5024     aSign = extractFloat128Sign( a );
5025     if ( aExp == 0x7FFF ) {
5026         if ( aSig0 | aSig1 ) {
5027             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5028         }
5029         return packFloat32( aSign, 0xFF, 0 );
5030     }
5031     aSig0 |= ( aSig1 != 0 );
5032     shift64RightJamming( aSig0, 18, &aSig0 );
5033     zSig = aSig0;
5034     if ( aExp || zSig ) {
5035         zSig |= 0x40000000;
5036         aExp -= 0x3F81;
5037     }
5038     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5039
5040 }
5041
5042 /*----------------------------------------------------------------------------
5043 | Returns the result of converting the quadruple-precision floating-point
5044 | value `a' to the double-precision floating-point format.  The conversion
5045 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5046 | Arithmetic.
5047 *----------------------------------------------------------------------------*/
5048
5049 float64 float128_to_float64( float128 a STATUS_PARAM )
5050 {
5051     flag aSign;
5052     int32 aExp;
5053     uint64_t aSig0, aSig1;
5054
5055     aSig1 = extractFloat128Frac1( a );
5056     aSig0 = extractFloat128Frac0( a );
5057     aExp = extractFloat128Exp( a );
5058     aSign = extractFloat128Sign( a );
5059     if ( aExp == 0x7FFF ) {
5060         if ( aSig0 | aSig1 ) {
5061             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5062         }
5063         return packFloat64( aSign, 0x7FF, 0 );
5064     }
5065     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5066     aSig0 |= ( aSig1 != 0 );
5067     if ( aExp || aSig0 ) {
5068         aSig0 |= LIT64( 0x4000000000000000 );
5069         aExp -= 0x3C01;
5070     }
5071     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5072
5073 }
5074
5075 #ifdef FLOATX80
5076
5077 /*----------------------------------------------------------------------------
5078 | Returns the result of converting the quadruple-precision floating-point
5079 | value `a' to the extended double-precision floating-point format.  The
5080 | conversion is performed according to the IEC/IEEE Standard for Binary
5081 | Floating-Point Arithmetic.
5082 *----------------------------------------------------------------------------*/
5083
5084 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5085 {
5086     flag aSign;
5087     int32 aExp;
5088     uint64_t aSig0, aSig1;
5089
5090     aSig1 = extractFloat128Frac1( a );
5091     aSig0 = extractFloat128Frac0( a );
5092     aExp = extractFloat128Exp( a );
5093     aSign = extractFloat128Sign( a );
5094     if ( aExp == 0x7FFF ) {
5095         if ( aSig0 | aSig1 ) {
5096             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5097         }
5098         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5099     }
5100     if ( aExp == 0 ) {
5101         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5102         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5103     }
5104     else {
5105         aSig0 |= LIT64( 0x0001000000000000 );
5106     }
5107     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5108     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5109
5110 }
5111
5112 #endif
5113
5114 /*----------------------------------------------------------------------------
5115 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5116 | returns the result as a quadruple-precision floating-point value.  The
5117 | operation is performed according to the IEC/IEEE Standard for Binary
5118 | Floating-Point Arithmetic.
5119 *----------------------------------------------------------------------------*/
5120
5121 float128 float128_round_to_int( float128 a STATUS_PARAM )
5122 {
5123     flag aSign;
5124     int32 aExp;
5125     uint64_t lastBitMask, roundBitsMask;
5126     int8 roundingMode;
5127     float128 z;
5128
5129     aExp = extractFloat128Exp( a );
5130     if ( 0x402F <= aExp ) {
5131         if ( 0x406F <= aExp ) {
5132             if (    ( aExp == 0x7FFF )
5133                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5134                ) {
5135                 return propagateFloat128NaN( a, a STATUS_VAR );
5136             }
5137             return a;
5138         }
5139         lastBitMask = 1;
5140         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5141         roundBitsMask = lastBitMask - 1;
5142         z = a;
5143         roundingMode = STATUS(float_rounding_mode);
5144         if ( roundingMode == float_round_nearest_even ) {
5145             if ( lastBitMask ) {
5146                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5147                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5148             }
5149             else {
5150                 if ( (int64_t) z.low < 0 ) {
5151                     ++z.high;
5152                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5153                 }
5154             }
5155         }
5156         else if ( roundingMode != float_round_to_zero ) {
5157             if (   extractFloat128Sign( z )
5158                  ^ ( roundingMode == float_round_up ) ) {
5159                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5160             }
5161         }
5162         z.low &= ~ roundBitsMask;
5163     }
5164     else {
5165         if ( aExp < 0x3FFF ) {
5166             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5167             STATUS(float_exception_flags) |= float_flag_inexact;
5168             aSign = extractFloat128Sign( a );
5169             switch ( STATUS(float_rounding_mode) ) {
5170              case float_round_nearest_even:
5171                 if (    ( aExp == 0x3FFE )
5172                      && (   extractFloat128Frac0( a )
5173                           | extractFloat128Frac1( a ) )
5174                    ) {
5175                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5176                 }
5177                 break;
5178              case float_round_down:
5179                 return
5180                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5181                     : packFloat128( 0, 0, 0, 0 );
5182              case float_round_up:
5183                 return
5184                       aSign ? packFloat128( 1, 0, 0, 0 )
5185                     : packFloat128( 0, 0x3FFF, 0, 0 );
5186             }
5187             return packFloat128( aSign, 0, 0, 0 );
5188         }
5189         lastBitMask = 1;
5190         lastBitMask <<= 0x402F - aExp;
5191         roundBitsMask = lastBitMask - 1;
5192         z.low = 0;
5193         z.high = a.high;
5194         roundingMode = STATUS(float_rounding_mode);
5195         if ( roundingMode == float_round_nearest_even ) {
5196             z.high += lastBitMask>>1;
5197             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5198                 z.high &= ~ lastBitMask;
5199             }
5200         }
5201         else if ( roundingMode != float_round_to_zero ) {
5202             if (   extractFloat128Sign( z )
5203                  ^ ( roundingMode == float_round_up ) ) {
5204                 z.high |= ( a.low != 0 );
5205                 z.high += roundBitsMask;
5206             }
5207         }
5208         z.high &= ~ roundBitsMask;
5209     }
5210     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5211         STATUS(float_exception_flags) |= float_flag_inexact;
5212     }
5213     return z;
5214
5215 }
5216
5217 /*----------------------------------------------------------------------------
5218 | Returns the result of adding the absolute values of the quadruple-precision
5219 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5220 | before being returned.  `zSign' is ignored if the result is a NaN.
5221 | The addition is performed according to the IEC/IEEE Standard for Binary
5222 | Floating-Point Arithmetic.
5223 *----------------------------------------------------------------------------*/
5224
5225 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5226 {
5227     int32 aExp, bExp, zExp;
5228     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5229     int32 expDiff;
5230
5231     aSig1 = extractFloat128Frac1( a );
5232     aSig0 = extractFloat128Frac0( a );
5233     aExp = extractFloat128Exp( a );
5234     bSig1 = extractFloat128Frac1( b );
5235     bSig0 = extractFloat128Frac0( b );
5236     bExp = extractFloat128Exp( b );
5237     expDiff = aExp - bExp;
5238     if ( 0 < expDiff ) {
5239         if ( aExp == 0x7FFF ) {
5240             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5241             return a;
5242         }
5243         if ( bExp == 0 ) {
5244             --expDiff;
5245         }
5246         else {
5247             bSig0 |= LIT64( 0x0001000000000000 );
5248         }
5249         shift128ExtraRightJamming(
5250             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5251         zExp = aExp;
5252     }
5253     else if ( expDiff < 0 ) {
5254         if ( bExp == 0x7FFF ) {
5255             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5256             return packFloat128( zSign, 0x7FFF, 0, 0 );
5257         }
5258         if ( aExp == 0 ) {
5259             ++expDiff;
5260         }
5261         else {
5262             aSig0 |= LIT64( 0x0001000000000000 );
5263         }
5264         shift128ExtraRightJamming(
5265             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5266         zExp = bExp;
5267     }
5268     else {
5269         if ( aExp == 0x7FFF ) {
5270             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5271                 return propagateFloat128NaN( a, b STATUS_VAR );
5272             }
5273             return a;
5274         }
5275         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5276         if ( aExp == 0 ) {
5277             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5278             return packFloat128( zSign, 0, zSig0, zSig1 );
5279         }
5280         zSig2 = 0;
5281         zSig0 |= LIT64( 0x0002000000000000 );
5282         zExp = aExp;
5283         goto shiftRight1;
5284     }
5285     aSig0 |= LIT64( 0x0001000000000000 );
5286     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5287     --zExp;
5288     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5289     ++zExp;
5290  shiftRight1:
5291     shift128ExtraRightJamming(
5292         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5293  roundAndPack:
5294     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5295
5296 }
5297
5298 /*----------------------------------------------------------------------------
5299 | Returns the result of subtracting the absolute values of the quadruple-
5300 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5301 | difference is negated before being returned.  `zSign' is ignored if the
5302 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5303 | Standard for Binary Floating-Point Arithmetic.
5304 *----------------------------------------------------------------------------*/
5305
5306 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5307 {
5308     int32 aExp, bExp, zExp;
5309     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5310     int32 expDiff;
5311     float128 z;
5312
5313     aSig1 = extractFloat128Frac1( a );
5314     aSig0 = extractFloat128Frac0( a );
5315     aExp = extractFloat128Exp( a );
5316     bSig1 = extractFloat128Frac1( b );
5317     bSig0 = extractFloat128Frac0( b );
5318     bExp = extractFloat128Exp( b );
5319     expDiff = aExp - bExp;
5320     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5321     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5322     if ( 0 < expDiff ) goto aExpBigger;
5323     if ( expDiff < 0 ) goto bExpBigger;
5324     if ( aExp == 0x7FFF ) {
5325         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5326             return propagateFloat128NaN( a, b STATUS_VAR );
5327         }
5328         float_raise( float_flag_invalid STATUS_VAR);
5329         z.low = float128_default_nan_low;
5330         z.high = float128_default_nan_high;
5331         return z;
5332     }
5333     if ( aExp == 0 ) {
5334         aExp = 1;
5335         bExp = 1;
5336     }
5337     if ( bSig0 < aSig0 ) goto aBigger;
5338     if ( aSig0 < bSig0 ) goto bBigger;
5339     if ( bSig1 < aSig1 ) goto aBigger;
5340     if ( aSig1 < bSig1 ) goto bBigger;
5341     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5342  bExpBigger:
5343     if ( bExp == 0x7FFF ) {
5344         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5345         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5346     }
5347     if ( aExp == 0 ) {
5348         ++expDiff;
5349     }
5350     else {
5351         aSig0 |= LIT64( 0x4000000000000000 );
5352     }
5353     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5354     bSig0 |= LIT64( 0x4000000000000000 );
5355  bBigger:
5356     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5357     zExp = bExp;
5358     zSign ^= 1;
5359     goto normalizeRoundAndPack;
5360  aExpBigger:
5361     if ( aExp == 0x7FFF ) {
5362         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5363         return a;
5364     }
5365     if ( bExp == 0 ) {
5366         --expDiff;
5367     }
5368     else {
5369         bSig0 |= LIT64( 0x4000000000000000 );
5370     }
5371     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5372     aSig0 |= LIT64( 0x4000000000000000 );
5373  aBigger:
5374     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5375     zExp = aExp;
5376  normalizeRoundAndPack:
5377     --zExp;
5378     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5379
5380 }
5381
5382 /*----------------------------------------------------------------------------
5383 | Returns the result of adding the quadruple-precision floating-point values
5384 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5385 | for Binary Floating-Point Arithmetic.
5386 *----------------------------------------------------------------------------*/
5387
5388 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5389 {
5390     flag aSign, bSign;
5391
5392     aSign = extractFloat128Sign( a );
5393     bSign = extractFloat128Sign( b );
5394     if ( aSign == bSign ) {
5395         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5396     }
5397     else {
5398         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5399     }
5400
5401 }
5402
5403 /*----------------------------------------------------------------------------
5404 | Returns the result of subtracting the quadruple-precision floating-point
5405 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5406 | Standard for Binary Floating-Point Arithmetic.
5407 *----------------------------------------------------------------------------*/
5408
5409 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5410 {
5411     flag aSign, bSign;
5412
5413     aSign = extractFloat128Sign( a );
5414     bSign = extractFloat128Sign( b );
5415     if ( aSign == bSign ) {
5416         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5417     }
5418     else {
5419         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5420     }
5421
5422 }
5423
5424 /*----------------------------------------------------------------------------
5425 | Returns the result of multiplying the quadruple-precision floating-point
5426 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5427 | Standard for Binary Floating-Point Arithmetic.
5428 *----------------------------------------------------------------------------*/
5429
5430 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5431 {
5432     flag aSign, bSign, zSign;
5433     int32 aExp, bExp, zExp;
5434     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5435     float128 z;
5436
5437     aSig1 = extractFloat128Frac1( a );
5438     aSig0 = extractFloat128Frac0( a );
5439     aExp = extractFloat128Exp( a );
5440     aSign = extractFloat128Sign( a );
5441     bSig1 = extractFloat128Frac1( b );
5442     bSig0 = extractFloat128Frac0( b );
5443     bExp = extractFloat128Exp( b );
5444     bSign = extractFloat128Sign( b );
5445     zSign = aSign ^ bSign;
5446     if ( aExp == 0x7FFF ) {
5447         if (    ( aSig0 | aSig1 )
5448              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5449             return propagateFloat128NaN( a, b STATUS_VAR );
5450         }
5451         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5452         return packFloat128( zSign, 0x7FFF, 0, 0 );
5453     }
5454     if ( bExp == 0x7FFF ) {
5455         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5456         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5457  invalid:
5458             float_raise( float_flag_invalid STATUS_VAR);
5459             z.low = float128_default_nan_low;
5460             z.high = float128_default_nan_high;
5461             return z;
5462         }
5463         return packFloat128( zSign, 0x7FFF, 0, 0 );
5464     }
5465     if ( aExp == 0 ) {
5466         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5467         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5468     }
5469     if ( bExp == 0 ) {
5470         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5471         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5472     }
5473     zExp = aExp + bExp - 0x4000;
5474     aSig0 |= LIT64( 0x0001000000000000 );
5475     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5476     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5477     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5478     zSig2 |= ( zSig3 != 0 );
5479     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5480         shift128ExtraRightJamming(
5481             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5482         ++zExp;
5483     }
5484     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5485
5486 }
5487
5488 /*----------------------------------------------------------------------------
5489 | Returns the result of dividing the quadruple-precision floating-point value
5490 | `a' by the corresponding value `b'.  The operation is performed according to
5491 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5492 *----------------------------------------------------------------------------*/
5493
5494 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5495 {
5496     flag aSign, bSign, zSign;
5497     int32 aExp, bExp, zExp;
5498     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5499     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5500     float128 z;
5501
5502     aSig1 = extractFloat128Frac1( a );
5503     aSig0 = extractFloat128Frac0( a );
5504     aExp = extractFloat128Exp( a );
5505     aSign = extractFloat128Sign( a );
5506     bSig1 = extractFloat128Frac1( b );
5507     bSig0 = extractFloat128Frac0( b );
5508     bExp = extractFloat128Exp( b );
5509     bSign = extractFloat128Sign( b );
5510     zSign = aSign ^ bSign;
5511     if ( aExp == 0x7FFF ) {
5512         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5513         if ( bExp == 0x7FFF ) {
5514             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5515             goto invalid;
5516         }
5517         return packFloat128( zSign, 0x7FFF, 0, 0 );
5518     }
5519     if ( bExp == 0x7FFF ) {
5520         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5521         return packFloat128( zSign, 0, 0, 0 );
5522     }
5523     if ( bExp == 0 ) {
5524         if ( ( bSig0 | bSig1 ) == 0 ) {
5525             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5526  invalid:
5527                 float_raise( float_flag_invalid STATUS_VAR);
5528                 z.low = float128_default_nan_low;
5529                 z.high = float128_default_nan_high;
5530                 return z;
5531             }
5532             float_raise( float_flag_divbyzero STATUS_VAR);
5533             return packFloat128( zSign, 0x7FFF, 0, 0 );
5534         }
5535         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5536     }
5537     if ( aExp == 0 ) {
5538         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5539         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5540     }
5541     zExp = aExp - bExp + 0x3FFD;
5542     shortShift128Left(
5543         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5544     shortShift128Left(
5545         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5546     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5547         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5548         ++zExp;
5549     }
5550     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5551     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5552     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5553     while ( (int64_t) rem0 < 0 ) {
5554         --zSig0;
5555         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5556     }
5557     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5558     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5559         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5560         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5561         while ( (int64_t) rem1 < 0 ) {
5562             --zSig1;
5563             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5564         }
5565         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5566     }
5567     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5568     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5569
5570 }
5571
5572 /*----------------------------------------------------------------------------
5573 | Returns the remainder of the quadruple-precision floating-point value `a'
5574 | with respect to the corresponding value `b'.  The operation is performed
5575 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5576 *----------------------------------------------------------------------------*/
5577
5578 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5579 {
5580     flag aSign, zSign;
5581     int32 aExp, bExp, expDiff;
5582     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5583     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5584     int64_t sigMean0;
5585     float128 z;
5586
5587     aSig1 = extractFloat128Frac1( a );
5588     aSig0 = extractFloat128Frac0( a );
5589     aExp = extractFloat128Exp( a );
5590     aSign = extractFloat128Sign( a );
5591     bSig1 = extractFloat128Frac1( b );
5592     bSig0 = extractFloat128Frac0( b );
5593     bExp = extractFloat128Exp( b );
5594     if ( aExp == 0x7FFF ) {
5595         if (    ( aSig0 | aSig1 )
5596              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5597             return propagateFloat128NaN( a, b STATUS_VAR );
5598         }
5599         goto invalid;
5600     }
5601     if ( bExp == 0x7FFF ) {
5602         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5603         return a;
5604     }
5605     if ( bExp == 0 ) {
5606         if ( ( bSig0 | bSig1 ) == 0 ) {
5607  invalid:
5608             float_raise( float_flag_invalid STATUS_VAR);
5609             z.low = float128_default_nan_low;
5610             z.high = float128_default_nan_high;
5611             return z;
5612         }
5613         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5614     }
5615     if ( aExp == 0 ) {
5616         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5617         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5618     }
5619     expDiff = aExp - bExp;
5620     if ( expDiff < -1 ) return a;
5621     shortShift128Left(
5622         aSig0 | LIT64( 0x0001000000000000 ),
5623         aSig1,
5624         15 - ( expDiff < 0 ),
5625         &aSig0,
5626         &aSig1
5627     );
5628     shortShift128Left(
5629         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5630     q = le128( bSig0, bSig1, aSig0, aSig1 );
5631     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5632     expDiff -= 64;
5633     while ( 0 < expDiff ) {
5634         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5635         q = ( 4 < q ) ? q - 4 : 0;
5636         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5637         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5638         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5639         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5640         expDiff -= 61;
5641     }
5642     if ( -64 < expDiff ) {
5643         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5644         q = ( 4 < q ) ? q - 4 : 0;
5645         q >>= - expDiff;
5646         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5647         expDiff += 52;
5648         if ( expDiff < 0 ) {
5649             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5650         }
5651         else {
5652             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5653         }
5654         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5655         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5656     }
5657     else {
5658         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5659         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5660     }
5661     do {
5662         alternateASig0 = aSig0;
5663         alternateASig1 = aSig1;
5664         ++q;
5665         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5666     } while ( 0 <= (int64_t) aSig0 );
5667     add128(
5668         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5669     if (    ( sigMean0 < 0 )
5670          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5671         aSig0 = alternateASig0;
5672         aSig1 = alternateASig1;
5673     }
5674     zSign = ( (int64_t) aSig0 < 0 );
5675     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5676     return
5677         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5678
5679 }
5680
5681 /*----------------------------------------------------------------------------
5682 | Returns the square root of the quadruple-precision floating-point value `a'.
5683 | The operation is performed according to the IEC/IEEE Standard for Binary
5684 | Floating-Point Arithmetic.
5685 *----------------------------------------------------------------------------*/
5686
5687 float128 float128_sqrt( float128 a STATUS_PARAM )
5688 {
5689     flag aSign;
5690     int32 aExp, zExp;
5691     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5692     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5693     float128 z;
5694
5695     aSig1 = extractFloat128Frac1( a );
5696     aSig0 = extractFloat128Frac0( a );
5697     aExp = extractFloat128Exp( a );
5698     aSign = extractFloat128Sign( a );
5699     if ( aExp == 0x7FFF ) {
5700         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5701         if ( ! aSign ) return a;
5702         goto invalid;
5703     }
5704     if ( aSign ) {
5705         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5706  invalid:
5707         float_raise( float_flag_invalid STATUS_VAR);
5708         z.low = float128_default_nan_low;
5709         z.high = float128_default_nan_high;
5710         return z;
5711     }
5712     if ( aExp == 0 ) {
5713         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5714         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5715     }
5716     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5717     aSig0 |= LIT64( 0x0001000000000000 );
5718     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5719     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5720     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5721     doubleZSig0 = zSig0<<1;
5722     mul64To128( zSig0, zSig0, &term0, &term1 );
5723     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5724     while ( (int64_t) rem0 < 0 ) {
5725         --zSig0;
5726         doubleZSig0 -= 2;
5727         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5728     }
5729     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5730     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5731         if ( zSig1 == 0 ) zSig1 = 1;
5732         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5733         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5734         mul64To128( zSig1, zSig1, &term2, &term3 );
5735         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5736         while ( (int64_t) rem1 < 0 ) {
5737             --zSig1;
5738             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5739             term3 |= 1;
5740             term2 |= doubleZSig0;
5741             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5742         }
5743         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5744     }
5745     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5746     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5747
5748 }
5749
5750 /*----------------------------------------------------------------------------
5751 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5752 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5753 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5754 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5755 *----------------------------------------------------------------------------*/
5756
5757 int float128_eq( float128 a, float128 b STATUS_PARAM )
5758 {
5759
5760     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5761               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5762          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5763               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5764        ) {
5765         float_raise( float_flag_invalid STATUS_VAR);
5766         return 0;
5767     }
5768     return
5769            ( a.low == b.low )
5770         && (    ( a.high == b.high )
5771              || (    ( a.low == 0 )
5772                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5773            );
5774
5775 }
5776
5777 /*----------------------------------------------------------------------------
5778 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5779 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5780 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5781 | Arithmetic.
5782 *----------------------------------------------------------------------------*/
5783
5784 int float128_le( float128 a, float128 b STATUS_PARAM )
5785 {
5786     flag aSign, bSign;
5787
5788     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5789               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5790          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5791               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5792        ) {
5793         float_raise( float_flag_invalid STATUS_VAR);
5794         return 0;
5795     }
5796     aSign = extractFloat128Sign( a );
5797     bSign = extractFloat128Sign( b );
5798     if ( aSign != bSign ) {
5799         return
5800                aSign
5801             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5802                  == 0 );
5803     }
5804     return
5805           aSign ? le128( b.high, b.low, a.high, a.low )
5806         : le128( a.high, a.low, b.high, b.low );
5807
5808 }
5809
5810 /*----------------------------------------------------------------------------
5811 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5812 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5813 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5814 *----------------------------------------------------------------------------*/
5815
5816 int float128_lt( float128 a, float128 b STATUS_PARAM )
5817 {
5818     flag aSign, bSign;
5819
5820     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5821               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5822          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5823               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5824        ) {
5825         float_raise( float_flag_invalid STATUS_VAR);
5826         return 0;
5827     }
5828     aSign = extractFloat128Sign( a );
5829     bSign = extractFloat128Sign( b );
5830     if ( aSign != bSign ) {
5831         return
5832                aSign
5833             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5834                  != 0 );
5835     }
5836     return
5837           aSign ? lt128( b.high, b.low, a.high, a.low )
5838         : lt128( a.high, a.low, b.high, b.low );
5839
5840 }
5841
5842 /*----------------------------------------------------------------------------
5843 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5844 | be compared, and 0 otherwise.  The comparison is performed according to the
5845 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5846 *----------------------------------------------------------------------------*/
5847
5848 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5849 {
5850     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5851               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5852          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5853               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5854        ) {
5855         float_raise( float_flag_invalid STATUS_VAR);
5856         return 1;
5857     }
5858     return 0;
5859 }
5860
5861 /*----------------------------------------------------------------------------
5862 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5863 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5865 *----------------------------------------------------------------------------*/
5866
5867 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5868 {
5869
5870     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5871               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5872          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5873               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5874        ) {
5875         if (    float128_is_signaling_nan( a )
5876              || float128_is_signaling_nan( b ) ) {
5877             float_raise( float_flag_invalid STATUS_VAR);
5878         }
5879         return 0;
5880     }
5881     return
5882            ( a.low == b.low )
5883         && (    ( a.high == b.high )
5884              || (    ( a.low == 0 )
5885                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5886            );
5887
5888 }
5889
5890 /*----------------------------------------------------------------------------
5891 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5892 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5893 | cause an exception.  Otherwise, the comparison is performed according to the
5894 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5895 *----------------------------------------------------------------------------*/
5896
5897 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5898 {
5899     flag aSign, bSign;
5900
5901     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5902               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5903          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5904               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5905        ) {
5906         if (    float128_is_signaling_nan( a )
5907              || float128_is_signaling_nan( b ) ) {
5908             float_raise( float_flag_invalid STATUS_VAR);
5909         }
5910         return 0;
5911     }
5912     aSign = extractFloat128Sign( a );
5913     bSign = extractFloat128Sign( b );
5914     if ( aSign != bSign ) {
5915         return
5916                aSign
5917             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5918                  == 0 );
5919     }
5920     return
5921           aSign ? le128( b.high, b.low, a.high, a.low )
5922         : le128( a.high, a.low, b.high, b.low );
5923
5924 }
5925
5926 /*----------------------------------------------------------------------------
5927 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5928 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5929 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5930 | Standard for Binary Floating-Point Arithmetic.
5931 *----------------------------------------------------------------------------*/
5932
5933 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5934 {
5935     flag aSign, bSign;
5936
5937     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5938               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5939          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5940               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5941        ) {
5942         if (    float128_is_signaling_nan( a )
5943              || float128_is_signaling_nan( b ) ) {
5944             float_raise( float_flag_invalid STATUS_VAR);
5945         }
5946         return 0;
5947     }
5948     aSign = extractFloat128Sign( a );
5949     bSign = extractFloat128Sign( b );
5950     if ( aSign != bSign ) {
5951         return
5952                aSign
5953             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5954                  != 0 );
5955     }
5956     return
5957           aSign ? lt128( b.high, b.low, a.high, a.low )
5958         : lt128( a.high, a.low, b.high, b.low );
5959
5960 }
5961
5962 /*----------------------------------------------------------------------------
5963 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5964 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5965 | comparison is performed according to the IEC/IEEE Standard for Binary
5966 | Floating-Point Arithmetic.
5967 *----------------------------------------------------------------------------*/
5968
5969 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5970 {
5971     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5972               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5973          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5974               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5975        ) {
5976         if (    float128_is_signaling_nan( a )
5977              || float128_is_signaling_nan( b ) ) {
5978             float_raise( float_flag_invalid STATUS_VAR);
5979         }
5980         return 1;
5981     }
5982     return 0;
5983 }
5984
5985 #endif
5986
5987 /* misc functions */
5988 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5989 {
5990     return int64_to_float32(a STATUS_VAR);
5991 }
5992
5993 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5994 {
5995     return int64_to_float64(a STATUS_VAR);
5996 }
5997
5998 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5999 {
6000     int64_t v;
6001     unsigned int res;
6002
6003     v = float32_to_int64(a STATUS_VAR);
6004     if (v < 0) {
6005         res = 0;
6006         float_raise( float_flag_invalid STATUS_VAR);
6007     } else if (v > 0xffffffff) {
6008         res = 0xffffffff;
6009         float_raise( float_flag_invalid STATUS_VAR);
6010     } else {
6011         res = v;
6012     }
6013     return res;
6014 }
6015
6016 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6017 {
6018     int64_t v;
6019     unsigned int res;
6020
6021     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6022     if (v < 0) {
6023         res = 0;
6024         float_raise( float_flag_invalid STATUS_VAR);
6025     } else if (v > 0xffffffff) {
6026         res = 0xffffffff;
6027         float_raise( float_flag_invalid STATUS_VAR);
6028     } else {
6029         res = v;
6030     }
6031     return res;
6032 }
6033
6034 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6035 {
6036     int64_t v;
6037     unsigned int res;
6038
6039     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6040     if (v < 0) {
6041         res = 0;
6042         float_raise( float_flag_invalid STATUS_VAR);
6043     } else if (v > 0xffff) {
6044         res = 0xffff;
6045         float_raise( float_flag_invalid STATUS_VAR);
6046     } else {
6047         res = v;
6048     }
6049     return res;
6050 }
6051
6052 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6053 {
6054     int64_t v;
6055     unsigned int res;
6056
6057     v = float64_to_int64(a STATUS_VAR);
6058     if (v < 0) {
6059         res = 0;
6060         float_raise( float_flag_invalid STATUS_VAR);
6061     } else if (v > 0xffffffff) {
6062         res = 0xffffffff;
6063         float_raise( float_flag_invalid STATUS_VAR);
6064     } else {
6065         res = v;
6066     }
6067     return res;
6068 }
6069
6070 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6071 {
6072     int64_t v;
6073     unsigned int res;
6074
6075     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6076     if (v < 0) {
6077         res = 0;
6078         float_raise( float_flag_invalid STATUS_VAR);
6079     } else if (v > 0xffffffff) {
6080         res = 0xffffffff;
6081         float_raise( float_flag_invalid STATUS_VAR);
6082     } else {
6083         res = v;
6084     }
6085     return res;
6086 }
6087
6088 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6089 {
6090     int64_t v;
6091     unsigned int res;
6092
6093     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6094     if (v < 0) {
6095         res = 0;
6096         float_raise( float_flag_invalid STATUS_VAR);
6097     } else if (v > 0xffff) {
6098         res = 0xffff;
6099         float_raise( float_flag_invalid STATUS_VAR);
6100     } else {
6101         res = v;
6102     }
6103     return res;
6104 }
6105
6106 /* FIXME: This looks broken.  */
6107 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6108 {
6109     int64_t v;
6110
6111     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6112     v += float64_val(a);
6113     v = float64_to_int64(make_float64(v) STATUS_VAR);
6114
6115     return v - INT64_MIN;
6116 }
6117
6118 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6119 {
6120     int64_t v;
6121
6122     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6123     v += float64_val(a);
6124     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6125
6126     return v - INT64_MIN;
6127 }
6128
6129 #define COMPARE(s, nan_exp)                                                  \
6130 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6131                                       int is_quiet STATUS_PARAM )            \
6132 {                                                                            \
6133     flag aSign, bSign;                                                       \
6134     uint ## s ## _t av, bv;                                                  \
6135     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6136     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6137                                                                              \
6138     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6139          extractFloat ## s ## Frac( a ) ) ||                                 \
6140         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6141           extractFloat ## s ## Frac( b ) )) {                                \
6142         if (!is_quiet ||                                                     \
6143             float ## s ## _is_signaling_nan( a ) ||                          \
6144             float ## s ## _is_signaling_nan( b ) ) {                         \
6145             float_raise( float_flag_invalid STATUS_VAR);                     \
6146         }                                                                    \
6147         return float_relation_unordered;                                     \
6148     }                                                                        \
6149     aSign = extractFloat ## s ## Sign( a );                                  \
6150     bSign = extractFloat ## s ## Sign( b );                                  \
6151     av = float ## s ## _val(a);                                              \
6152     bv = float ## s ## _val(b);                                              \
6153     if ( aSign != bSign ) {                                                  \
6154         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6155             /* zero case */                                                  \
6156             return float_relation_equal;                                     \
6157         } else {                                                             \
6158             return 1 - (2 * aSign);                                          \
6159         }                                                                    \
6160     } else {                                                                 \
6161         if (av == bv) {                                                      \
6162             return float_relation_equal;                                     \
6163         } else {                                                             \
6164             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6165         }                                                                    \
6166     }                                                                        \
6167 }                                                                            \
6168                                                                              \
6169 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6170 {                                                                            \
6171     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6172 }                                                                            \
6173                                                                              \
6174 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6175 {                                                                            \
6176     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6177 }
6178
6179 COMPARE(32, 0xff)
6180 COMPARE(64, 0x7ff)
6181
6182 INLINE int float128_compare_internal( float128 a, float128 b,
6183                                       int is_quiet STATUS_PARAM )
6184 {
6185     flag aSign, bSign;
6186
6187     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6188           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6189         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6190           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6191         if (!is_quiet ||
6192             float128_is_signaling_nan( a ) ||
6193             float128_is_signaling_nan( b ) ) {
6194             float_raise( float_flag_invalid STATUS_VAR);
6195         }
6196         return float_relation_unordered;
6197     }
6198     aSign = extractFloat128Sign( a );
6199     bSign = extractFloat128Sign( b );
6200     if ( aSign != bSign ) {
6201         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6202             /* zero case */
6203             return float_relation_equal;
6204         } else {
6205             return 1 - (2 * aSign);
6206         }
6207     } else {
6208         if (a.low == b.low && a.high == b.high) {
6209             return float_relation_equal;
6210         } else {
6211             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6212         }
6213     }
6214 }
6215
6216 int float128_compare( float128 a, float128 b STATUS_PARAM )
6217 {
6218     return float128_compare_internal(a, b, 0 STATUS_VAR);
6219 }
6220
6221 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6222 {
6223     return float128_compare_internal(a, b, 1 STATUS_VAR);
6224 }
6225
6226 /* min() and max() functions. These can't be implemented as
6227  * 'compare and pick one input' because that would mishandle
6228  * NaNs and +0 vs -0.
6229  */
6230 #define MINMAX(s, nan_exp)                                              \
6231 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6232                                         int ismin STATUS_PARAM )        \
6233 {                                                                       \
6234     flag aSign, bSign;                                                  \
6235     uint ## s ## _t av, bv;                                             \
6236     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6237     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6238     if (float ## s ## _is_any_nan(a) ||                                 \
6239         float ## s ## _is_any_nan(b)) {                                 \
6240         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6241     }                                                                   \
6242     aSign = extractFloat ## s ## Sign(a);                               \
6243     bSign = extractFloat ## s ## Sign(b);                               \
6244     av = float ## s ## _val(a);                                         \
6245     bv = float ## s ## _val(b);                                         \
6246     if (aSign != bSign) {                                               \
6247         if (ismin) {                                                    \
6248             return aSign ? a : b;                                       \
6249         } else {                                                        \
6250             return aSign ? b : a;                                       \
6251         }                                                               \
6252     } else {                                                            \
6253         if (ismin) {                                                    \
6254             return (aSign ^ (av < bv)) ? a : b;                         \
6255         } else {                                                        \
6256             return (aSign ^ (av < bv)) ? b : a;                         \
6257         }                                                               \
6258     }                                                                   \
6259 }                                                                       \
6260                                                                         \
6261 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6262 {                                                                       \
6263     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6264 }                                                                       \
6265                                                                         \
6266 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6267 {                                                                       \
6268     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6269 }
6270
6271 MINMAX(32, 0xff)
6272 MINMAX(64, 0x7ff)
6273
6274
6275 /* Multiply A by 2 raised to the power N.  */
6276 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6277 {
6278     flag aSign;
6279     int16 aExp;
6280     uint32_t aSig;
6281
6282     a = float32_squash_input_denormal(a STATUS_VAR);
6283     aSig = extractFloat32Frac( a );
6284     aExp = extractFloat32Exp( a );
6285     aSign = extractFloat32Sign( a );
6286
6287     if ( aExp == 0xFF ) {
6288         return a;
6289     }
6290     if ( aExp != 0 )
6291         aSig |= 0x00800000;
6292     else if ( aSig == 0 )
6293         return a;
6294
6295     aExp += n - 1;
6296     aSig <<= 7;
6297     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6298 }
6299
6300 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6301 {
6302     flag aSign;
6303     int16 aExp;
6304     uint64_t aSig;
6305
6306     a = float64_squash_input_denormal(a STATUS_VAR);
6307     aSig = extractFloat64Frac( a );
6308     aExp = extractFloat64Exp( a );
6309     aSign = extractFloat64Sign( a );
6310
6311     if ( aExp == 0x7FF ) {
6312         return a;
6313     }
6314     if ( aExp != 0 )
6315         aSig |= LIT64( 0x0010000000000000 );
6316     else if ( aSig == 0 )
6317         return a;
6318
6319     aExp += n - 1;
6320     aSig <<= 10;
6321     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6322 }
6323
6324 #ifdef FLOATX80
6325 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6326 {
6327     flag aSign;
6328     int16 aExp;
6329     uint64_t aSig;
6330
6331     aSig = extractFloatx80Frac( a );
6332     aExp = extractFloatx80Exp( a );
6333     aSign = extractFloatx80Sign( a );
6334
6335     if ( aExp == 0x7FF ) {
6336         return a;
6337     }
6338     if (aExp == 0 && aSig == 0)
6339         return a;
6340
6341     aExp += n;
6342     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6343                                           aSign, aExp, aSig, 0 STATUS_VAR );
6344 }
6345 #endif
6346
6347 #ifdef FLOAT128
6348 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6349 {
6350     flag aSign;
6351     int32 aExp;
6352     uint64_t aSig0, aSig1;
6353
6354     aSig1 = extractFloat128Frac1( a );
6355     aSig0 = extractFloat128Frac0( a );
6356     aExp = extractFloat128Exp( a );
6357     aSign = extractFloat128Sign( a );
6358     if ( aExp == 0x7FFF ) {
6359         return a;
6360     }
6361     if ( aExp != 0 )
6362         aSig0 |= LIT64( 0x0001000000000000 );
6363     else if ( aSig0 == 0 && aSig1 == 0 )
6364         return a;
6365
6366     aExp += n - 1;
6367     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6368                                           STATUS_VAR );
6369
6370 }
6371 #endif