fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 #include "softfloat.h"
  39
  40 /*----------------------------------------------------------------------------
  41 | Primitive arithmetic functions, including multi-word arithmetic, and
  42 | division and square root approximations.  (Can be specialized to target if
  43 | desired.)
  44 *----------------------------------------------------------------------------*/
  45 #include "softfloat-macros.h"
  46
  47 /*----------------------------------------------------------------------------
  48 | Functions and definitions to determine:  (1) whether tininess for underflow
  49 | is detected before or after rounding by default, (2) what (if anything)
  50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  52 | are propagated from function inputs to output.  These details are target-
  53 | specific.
  54 *----------------------------------------------------------------------------*/
  55 #include "softfloat-specialize.h"
  56
  57 void set_float_rounding_mode(int val STATUS_PARAM)
  58 {
  59     STATUS(float_rounding_mode) = val;
  60 }
  61
  62 void set_float_exception_flags(int val STATUS_PARAM)
  63 {
  64     STATUS(float_exception_flags) = val;
  65 }
  66
  67 #ifdef FLOATX80
  68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  69 {
  70     STATUS(floatx80_rounding_precision) = val;
  71 }
  72 #endif
  73
  74 /*----------------------------------------------------------------------------
  75 | Returns the fraction bits of the half-precision floating-point value `a'.
  76 *----------------------------------------------------------------------------*/
  77
  78 INLINE uint32_t extractFloat16Frac(float16 a)
  79 {
  80     return float16_val(a) & 0x3ff;
  81 }
  82
  83 /*----------------------------------------------------------------------------
  84 | Returns the exponent bits of the half-precision floating-point value `a'.
  85 *----------------------------------------------------------------------------*/
  86
  87 INLINE int16 extractFloat16Exp(float16 a)
  88 {
  89     return (float16_val(a) >> 10) & 0x1f;
  90 }
  91
  92 /*----------------------------------------------------------------------------
  93 | Returns the sign bit of the single-precision floating-point value `a'.
  94 *----------------------------------------------------------------------------*/
  95
  96 INLINE flag extractFloat16Sign(float16 a)
  97 {
  98     return float16_val(a)>>15;
  99 }
 100
 101 /*----------------------------------------------------------------------------
 102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 104 | input.  If `zSign' is 1, the input is negated before being converted to an
 105 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 106 | is simply rounded to an integer, with the inexact exception raised if the
 107 | input cannot be represented exactly as an integer.  However, if the fixed-
 108 | point input is too large, the invalid exception is raised and the largest
 109 | positive or negative integer is returned.
 110 *----------------------------------------------------------------------------*/
 111
 112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 113 {
 114     int8 roundingMode;
 115     flag roundNearestEven;
 116     int8 roundIncrement, roundBits;
 117     int32 z;
 118
 119     roundingMode = STATUS(float_rounding_mode);
 120     roundNearestEven = ( roundingMode == float_round_nearest_even );
 121     roundIncrement = 0x40;
 122     if ( ! roundNearestEven ) {
 123         if ( roundingMode == float_round_to_zero ) {
 124             roundIncrement = 0;
 125         }
 126         else {
 127             roundIncrement = 0x7F;
 128             if ( zSign ) {
 129                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 130             }
 131             else {
 132                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 133             }
 134         }
 135     }
 136     roundBits = absZ & 0x7F;
 137     absZ = ( absZ + roundIncrement )>>7;
 138     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 139     z = absZ;
 140     if ( zSign ) z = - z;
 141     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 142         float_raise( float_flag_invalid STATUS_VAR);
 143         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 144     }
 145     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 146     return z;
 147
 148 }
 149
 150 /*----------------------------------------------------------------------------
 151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 153 | and returns the properly rounded 64-bit integer corresponding to the input.
 154 | If `zSign' is 1, the input is negated before being converted to an integer.
 155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 156 | the inexact exception raised if the input cannot be represented exactly as
 157 | an integer.  However, if the fixed-point input is too large, the invalid
 158 | exception is raised and the largest positive or negative integer is
 159 | returned.
 160 *----------------------------------------------------------------------------*/
 161
 162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 163 {
 164     int8 roundingMode;
 165     flag roundNearestEven, increment;
 166     int64 z;
 167
 168     roundingMode = STATUS(float_rounding_mode);
 169     roundNearestEven = ( roundingMode == float_round_nearest_even );
 170     increment = ( (int64_t) absZ1 < 0 );
 171     if ( ! roundNearestEven ) {
 172         if ( roundingMode == float_round_to_zero ) {
 173             increment = 0;
 174         }
 175         else {
 176             if ( zSign ) {
 177                 increment = ( roundingMode == float_round_down ) && absZ1;
 178             }
 179             else {
 180                 increment = ( roundingMode == float_round_up ) && absZ1;
 181             }
 182         }
 183     }
 184     if ( increment ) {
 185         ++absZ0;
 186         if ( absZ0 == 0 ) goto overflow;
 187         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 188     }
 189     z = absZ0;
 190     if ( zSign ) z = - z;
 191     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 192  overflow:
 193         float_raise( float_flag_invalid STATUS_VAR);
 194         return
 195               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 196             : LIT64( 0x7FFFFFFFFFFFFFFF );
 197     }
 198     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 199     return z;
 200
 201 }
 202
 203 /*----------------------------------------------------------------------------
 204 | Returns the fraction bits of the single-precision floating-point value `a'.
 205 *----------------------------------------------------------------------------*/
 206
 207 INLINE uint32_t extractFloat32Frac( float32 a )
 208 {
 209
 210     return float32_val(a) & 0x007FFFFF;
 211
 212 }
 213
 214 /*----------------------------------------------------------------------------
 215 | Returns the exponent bits of the single-precision floating-point value `a'.
 216 *----------------------------------------------------------------------------*/
 217
 218 INLINE int16 extractFloat32Exp( float32 a )
 219 {
 220
 221     return ( float32_val(a)>>23 ) & 0xFF;
 222
 223 }
 224
 225 /*----------------------------------------------------------------------------
 226 | Returns the sign bit of the single-precision floating-point value `a'.
 227 *----------------------------------------------------------------------------*/
 228
 229 INLINE flag extractFloat32Sign( float32 a )
 230 {
 231
 232     return float32_val(a)>>31;
 233
 234 }
 235
 236 /*----------------------------------------------------------------------------
 237 | If `a' is denormal and we are in flush-to-zero mode then set the
 238 | input-denormal exception and return zero. Otherwise just return the value.
 239 *----------------------------------------------------------------------------*/
 240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 241 {
 242     if (STATUS(flush_inputs_to_zero)) {
 243         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 244             float_raise(float_flag_input_denormal STATUS_VAR);
 245             return make_float32(float32_val(a) & 0x80000000);
 246         }
 247     }
 248     return a;
 249 }
 250
 251 /*----------------------------------------------------------------------------
 252 | Normalizes the subnormal single-precision floating-point value represented
 253 | by the denormalized significand `aSig'.  The normalized exponent and
 254 | significand are stored at the locations pointed to by `zExpPtr' and
 255 | `zSigPtr', respectively.
 256 *----------------------------------------------------------------------------*/
 257
 258 static void
 259  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
 260 {
 261     int8 shiftCount;
 262
 263     shiftCount = countLeadingZeros32( aSig ) - 8;
 264     *zSigPtr = aSig<<shiftCount;
 265     *zExpPtr = 1 - shiftCount;
 266
 267 }
 268
 269 /*----------------------------------------------------------------------------
 270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 271 | single-precision floating-point value, returning the result.  After being
 272 | shifted into the proper positions, the three fields are simply added
 273 | together to form the result.  This means that any integer portion of `zSig'
 274 | will be added into the exponent.  Since a properly normalized significand
 275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 276 | than the desired result exponent whenever `zSig' is a complete, normalized
 277 | significand.
 278 *----------------------------------------------------------------------------*/
 279
 280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
 281 {
 282
 283     return make_float32(
 284           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 285
 286 }
 287
 288 /*----------------------------------------------------------------------------
 289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 290 | and significand `zSig', and returns the proper single-precision floating-
 291 | point value corresponding to the abstract input.  Ordinarily, the abstract
 292 | value is simply rounded and packed into the single-precision format, with
 293 | the inexact exception raised if the abstract input cannot be represented
 294 | exactly.  However, if the abstract value is too large, the overflow and
 295 | inexact exceptions are raised and an infinity or maximal finite value is
 296 | returned.  If the abstract value is too small, the input value is rounded to
 297 | a subnormal number, and the underflow and inexact exceptions are raised if
 298 | the abstract input cannot be represented exactly as a subnormal single-
 299 | precision floating-point number.
 300 |     The input significand `zSig' has its binary point between bits 30
 301 | and 29, which is 7 bits to the left of the usual location.  This shifted
 302 | significand must be normalized or smaller.  If `zSig' is not normalized,
 303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 304 | and it must not require rounding.  In the usual case that `zSig' is
 305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 307 | Binary Floating-Point Arithmetic.
 308 *----------------------------------------------------------------------------*/
 309
 310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 311 {
 312     int8 roundingMode;
 313     flag roundNearestEven;
 314     int8 roundIncrement, roundBits;
 315     flag isTiny;
 316
 317     roundingMode = STATUS(float_rounding_mode);
 318     roundNearestEven = ( roundingMode == float_round_nearest_even );
 319     roundIncrement = 0x40;
 320     if ( ! roundNearestEven ) {
 321         if ( roundingMode == float_round_to_zero ) {
 322             roundIncrement = 0;
 323         }
 324         else {
 325             roundIncrement = 0x7F;
 326             if ( zSign ) {
 327                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 328             }
 329             else {
 330                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 331             }
 332         }
 333     }
 334     roundBits = zSig & 0x7F;
 335     if ( 0xFD <= (uint16_t) zExp ) {
 336         if (    ( 0xFD < zExp )
 337              || (    ( zExp == 0xFD )
 338                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 339            ) {
 340             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 341             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 342         }
 343         if ( zExp < 0 ) {
 344             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 345             isTiny =
 346                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 347                 || ( zExp < -1 )
 348                 || ( zSig + roundIncrement < 0x80000000 );
 349             shift32RightJamming( zSig, - zExp, &zSig );
 350             zExp = 0;
 351             roundBits = zSig & 0x7F;
 352             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 353         }
 354     }
 355     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 356     zSig = ( zSig + roundIncrement )>>7;
 357     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 358     if ( zSig == 0 ) zExp = 0;
 359     return packFloat32( zSign, zExp, zSig );
 360
 361 }
 362
 363 /*----------------------------------------------------------------------------
 364 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 365 | and significand `zSig', and returns the proper single-precision floating-
 366 | point value corresponding to the abstract input.  This routine is just like
 367 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 368 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 369 | floating-point exponent.
 370 *----------------------------------------------------------------------------*/
 371
 372 static float32
 373  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 374 {
 375     int8 shiftCount;
 376
 377     shiftCount = countLeadingZeros32( zSig ) - 1;
 378     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 379
 380 }
 381
 382 /*----------------------------------------------------------------------------
 383 | Returns the fraction bits of the double-precision floating-point value `a'.
 384 *----------------------------------------------------------------------------*/
 385
 386 INLINE uint64_t extractFloat64Frac( float64 a )
 387 {
 388
 389     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 390
 391 }
 392
 393 /*----------------------------------------------------------------------------
 394 | Returns the exponent bits of the double-precision floating-point value `a'.
 395 *----------------------------------------------------------------------------*/
 396
 397 INLINE int16 extractFloat64Exp( float64 a )
 398 {
 399
 400     return ( float64_val(a)>>52 ) & 0x7FF;
 401
 402 }
 403
 404 /*----------------------------------------------------------------------------
 405 | Returns the sign bit of the double-precision floating-point value `a'.
 406 *----------------------------------------------------------------------------*/
 407
 408 INLINE flag extractFloat64Sign( float64 a )
 409 {
 410
 411     return float64_val(a)>>63;
 412
 413 }
 414
 415 /*----------------------------------------------------------------------------
 416 | If `a' is denormal and we are in flush-to-zero mode then set the
 417 | input-denormal exception and return zero. Otherwise just return the value.
 418 *----------------------------------------------------------------------------*/
 419 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 420 {
 421     if (STATUS(flush_inputs_to_zero)) {
 422         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 423             float_raise(float_flag_input_denormal STATUS_VAR);
 424             return make_float64(float64_val(a) & (1ULL << 63));
 425         }
 426     }
 427     return a;
 428 }
 429
 430 /*----------------------------------------------------------------------------
 431 | Normalizes the subnormal double-precision floating-point value represented
 432 | by the denormalized significand `aSig'.  The normalized exponent and
 433 | significand are stored at the locations pointed to by `zExpPtr' and
 434 | `zSigPtr', respectively.
 435 *----------------------------------------------------------------------------*/
 436
 437 static void
 438  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
 439 {
 440     int8 shiftCount;
 441
 442     shiftCount = countLeadingZeros64( aSig ) - 11;
 443     *zSigPtr = aSig<<shiftCount;
 444     *zExpPtr = 1 - shiftCount;
 445
 446 }
 447
 448 /*----------------------------------------------------------------------------
 449 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 450 | double-precision floating-point value, returning the result.  After being
 451 | shifted into the proper positions, the three fields are simply added
 452 | together to form the result.  This means that any integer portion of `zSig'
 453 | will be added into the exponent.  Since a properly normalized significand
 454 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 455 | than the desired result exponent whenever `zSig' is a complete, normalized
 456 | significand.
 457 *----------------------------------------------------------------------------*/
 458
 459 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
 460 {
 461
 462     return make_float64(
 463         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 464
 465 }
 466
 467 /*----------------------------------------------------------------------------
 468 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 469 | and significand `zSig', and returns the proper double-precision floating-
 470 | point value corresponding to the abstract input.  Ordinarily, the abstract
 471 | value is simply rounded and packed into the double-precision format, with
 472 | the inexact exception raised if the abstract input cannot be represented
 473 | exactly.  However, if the abstract value is too large, the overflow and
 474 | inexact exceptions are raised and an infinity or maximal finite value is
 475 | returned.  If the abstract value is too small, the input value is rounded
 476 | to a subnormal number, and the underflow and inexact exceptions are raised
 477 | if the abstract input cannot be represented exactly as a subnormal double-
 478 | precision floating-point number.
 479 |     The input significand `zSig' has its binary point between bits 62
 480 | and 61, which is 10 bits to the left of the usual location.  This shifted
 481 | significand must be normalized or smaller.  If `zSig' is not normalized,
 482 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 483 | and it must not require rounding.  In the usual case that `zSig' is
 484 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 485 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 486 | Binary Floating-Point Arithmetic.
 487 *----------------------------------------------------------------------------*/
 488
 489 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 490 {
 491     int8 roundingMode;
 492     flag roundNearestEven;
 493     int16 roundIncrement, roundBits;
 494     flag isTiny;
 495
 496     roundingMode = STATUS(float_rounding_mode);
 497     roundNearestEven = ( roundingMode == float_round_nearest_even );
 498     roundIncrement = 0x200;
 499     if ( ! roundNearestEven ) {
 500         if ( roundingMode == float_round_to_zero ) {
 501             roundIncrement = 0;
 502         }
 503         else {
 504             roundIncrement = 0x3FF;
 505             if ( zSign ) {
 506                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 507             }
 508             else {
 509                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 510             }
 511         }
 512     }
 513     roundBits = zSig & 0x3FF;
 514     if ( 0x7FD <= (uint16_t) zExp ) {
 515         if (    ( 0x7FD < zExp )
 516              || (    ( zExp == 0x7FD )
 517                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 518            ) {
 519             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 520             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 521         }
 522         if ( zExp < 0 ) {
 523             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 524             isTiny =
 525                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 526                 || ( zExp < -1 )
 527                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 528             shift64RightJamming( zSig, - zExp, &zSig );
 529             zExp = 0;
 530             roundBits = zSig & 0x3FF;
 531             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 532         }
 533     }
 534     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 535     zSig = ( zSig + roundIncrement )>>10;
 536     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 537     if ( zSig == 0 ) zExp = 0;
 538     return packFloat64( zSign, zExp, zSig );
 539
 540 }
 541
 542 /*----------------------------------------------------------------------------
 543 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 544 | and significand `zSig', and returns the proper double-precision floating-
 545 | point value corresponding to the abstract input.  This routine is just like
 546 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 547 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 548 | floating-point exponent.
 549 *----------------------------------------------------------------------------*/
 550
 551 static float64
 552  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 553 {
 554     int8 shiftCount;
 555
 556     shiftCount = countLeadingZeros64( zSig ) - 1;
 557     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 558
 559 }
 560
 561 #ifdef FLOATX80
 562
 563 /*----------------------------------------------------------------------------
 564 | Returns the fraction bits of the extended double-precision floating-point
 565 | value `a'.
 566 *----------------------------------------------------------------------------*/
 567
 568 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 569 {
 570
 571     return a.low;
 572
 573 }
 574
 575 /*----------------------------------------------------------------------------
 576 | Returns the exponent bits of the extended double-precision floating-point
 577 | value `a'.
 578 *----------------------------------------------------------------------------*/
 579
 580 INLINE int32 extractFloatx80Exp( floatx80 a )
 581 {
 582
 583     return a.high & 0x7FFF;
 584
 585 }
 586
 587 /*----------------------------------------------------------------------------
 588 | Returns the sign bit of the extended double-precision floating-point value
 589 | `a'.
 590 *----------------------------------------------------------------------------*/
 591
 592 INLINE flag extractFloatx80Sign( floatx80 a )
 593 {
 594
 595     return a.high>>15;
 596
 597 }
 598
 599 /*----------------------------------------------------------------------------
 600 | Normalizes the subnormal extended double-precision floating-point value
 601 | represented by the denormalized significand `aSig'.  The normalized exponent
 602 | and significand are stored at the locations pointed to by `zExpPtr' and
 603 | `zSigPtr', respectively.
 604 *----------------------------------------------------------------------------*/
 605
 606 static void
 607  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 608 {
 609     int8 shiftCount;
 610
 611     shiftCount = countLeadingZeros64( aSig );
 612     *zSigPtr = aSig<<shiftCount;
 613     *zExpPtr = 1 - shiftCount;
 614
 615 }
 616
 617 /*----------------------------------------------------------------------------
 618 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 619 | extended double-precision floating-point value, returning the result.
 620 *----------------------------------------------------------------------------*/
 621
 622 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 623 {
 624     floatx80 z;
 625
 626     z.low = zSig;
 627     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 628     return z;
 629
 630 }
 631
 632 /*----------------------------------------------------------------------------
 633 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 634 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 635 | and returns the proper extended double-precision floating-point value
 636 | corresponding to the abstract input.  Ordinarily, the abstract value is
 637 | rounded and packed into the extended double-precision format, with the
 638 | inexact exception raised if the abstract input cannot be represented
 639 | exactly.  However, if the abstract value is too large, the overflow and
 640 | inexact exceptions are raised and an infinity or maximal finite value is
 641 | returned.  If the abstract value is too small, the input value is rounded to
 642 | a subnormal number, and the underflow and inexact exceptions are raised if
 643 | the abstract input cannot be represented exactly as a subnormal extended
 644 | double-precision floating-point number.
 645 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 646 | number of bits as single or double precision, respectively.  Otherwise, the
 647 | result is rounded to the full precision of the extended double-precision
 648 | format.
 649 |     The input significand must be normalized or smaller.  If the input
 650 | significand is not normalized, `zExp' must be 0; in that case, the result
 651 | returned is a subnormal number, and it must not require rounding.  The
 652 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 653 | Floating-Point Arithmetic.
 654 *----------------------------------------------------------------------------*/
 655
 656 static floatx80
 657  roundAndPackFloatx80(
 658      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 659  STATUS_PARAM)
 660 {
 661     int8 roundingMode;
 662     flag roundNearestEven, increment, isTiny;
 663     int64 roundIncrement, roundMask, roundBits;
 664
 665     roundingMode = STATUS(float_rounding_mode);
 666     roundNearestEven = ( roundingMode == float_round_nearest_even );
 667     if ( roundingPrecision == 80 ) goto precision80;
 668     if ( roundingPrecision == 64 ) {
 669         roundIncrement = LIT64( 0x0000000000000400 );
 670         roundMask = LIT64( 0x00000000000007FF );
 671     }
 672     else if ( roundingPrecision == 32 ) {
 673         roundIncrement = LIT64( 0x0000008000000000 );
 674         roundMask = LIT64( 0x000000FFFFFFFFFF );
 675     }
 676     else {
 677         goto precision80;
 678     }
 679     zSig0 |= ( zSig1 != 0 );
 680     if ( ! roundNearestEven ) {
 681         if ( roundingMode == float_round_to_zero ) {
 682             roundIncrement = 0;
 683         }
 684         else {
 685             roundIncrement = roundMask;
 686             if ( zSign ) {
 687                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 688             }
 689             else {
 690                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 691             }
 692         }
 693     }
 694     roundBits = zSig0 & roundMask;
 695     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 696         if (    ( 0x7FFE < zExp )
 697              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 698            ) {
 699             goto overflow;
 700         }
 701         if ( zExp <= 0 ) {
 702             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 703             isTiny =
 704                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 705                 || ( zExp < 0 )
 706                 || ( zSig0 <= zSig0 + roundIncrement );
 707             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 708             zExp = 0;
 709             roundBits = zSig0 & roundMask;
 710             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 711             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 712             zSig0 += roundIncrement;
 713             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 714             roundIncrement = roundMask + 1;
 715             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 716                 roundMask |= roundIncrement;
 717             }
 718             zSig0 &= ~ roundMask;
 719             return packFloatx80( zSign, zExp, zSig0 );
 720         }
 721     }
 722     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 723     zSig0 += roundIncrement;
 724     if ( zSig0 < roundIncrement ) {
 725         ++zExp;
 726         zSig0 = LIT64( 0x8000000000000000 );
 727     }
 728     roundIncrement = roundMask + 1;
 729     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 730         roundMask |= roundIncrement;
 731     }
 732     zSig0 &= ~ roundMask;
 733     if ( zSig0 == 0 ) zExp = 0;
 734     return packFloatx80( zSign, zExp, zSig0 );
 735  precision80:
 736     increment = ( (int64_t) zSig1 < 0 );
 737     if ( ! roundNearestEven ) {
 738         if ( roundingMode == float_round_to_zero ) {
 739             increment = 0;
 740         }
 741         else {
 742             if ( zSign ) {
 743                 increment = ( roundingMode == float_round_down ) && zSig1;
 744             }
 745             else {
 746                 increment = ( roundingMode == float_round_up ) && zSig1;
 747             }
 748         }
 749     }
 750     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 751         if (    ( 0x7FFE < zExp )
 752              || (    ( zExp == 0x7FFE )
 753                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 754                   && increment
 755                 )
 756            ) {
 757             roundMask = 0;
 758  overflow:
 759             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 760             if (    ( roundingMode == float_round_to_zero )
 761                  || ( zSign && ( roundingMode == float_round_up ) )
 762                  || ( ! zSign && ( roundingMode == float_round_down ) )
 763                ) {
 764                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 765             }
 766             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 767         }
 768         if ( zExp <= 0 ) {
 769             isTiny =
 770                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 771                 || ( zExp < 0 )
 772                 || ! increment
 773                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 774             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 775             zExp = 0;
 776             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 777             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 778             if ( roundNearestEven ) {
 779                 increment = ( (int64_t) zSig1 < 0 );
 780             }
 781             else {
 782                 if ( zSign ) {
 783                     increment = ( roundingMode == float_round_down ) && zSig1;
 784                 }
 785                 else {
 786                     increment = ( roundingMode == float_round_up ) && zSig1;
 787                 }
 788             }
 789             if ( increment ) {
 790                 ++zSig0;
 791                 zSig0 &=
 792                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 793                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 794             }
 795             return packFloatx80( zSign, zExp, zSig0 );
 796         }
 797     }
 798     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 799     if ( increment ) {
 800         ++zSig0;
 801         if ( zSig0 == 0 ) {
 802             ++zExp;
 803             zSig0 = LIT64( 0x8000000000000000 );
 804         }
 805         else {
 806             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 807         }
 808     }
 809     else {
 810         if ( zSig0 == 0 ) zExp = 0;
 811     }
 812     return packFloatx80( zSign, zExp, zSig0 );
 813
 814 }
 815
 816 /*----------------------------------------------------------------------------
 817 | Takes an abstract floating-point value having sign `zSign', exponent
 818 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 819 | and returns the proper extended double-precision floating-point value
 820 | corresponding to the abstract input.  This routine is just like
 821 | `roundAndPackFloatx80' except that the input significand does not have to be
 822 | normalized.
 823 *----------------------------------------------------------------------------*/
 824
 825 static floatx80
 826  normalizeRoundAndPackFloatx80(
 827      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 828  STATUS_PARAM)
 829 {
 830     int8 shiftCount;
 831
 832     if ( zSig0 == 0 ) {
 833         zSig0 = zSig1;
 834         zSig1 = 0;
 835         zExp -= 64;
 836     }
 837     shiftCount = countLeadingZeros64( zSig0 );
 838     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 839     zExp -= shiftCount;
 840     return
 841         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 842
 843 }
 844
 845 #endif
 846
 847 #ifdef FLOAT128
 848
 849 /*----------------------------------------------------------------------------
 850 | Returns the least-significant 64 fraction bits of the quadruple-precision
 851 | floating-point value `a'.
 852 *----------------------------------------------------------------------------*/
 853
 854 INLINE uint64_t extractFloat128Frac1( float128 a )
 855 {
 856
 857     return a.low;
 858
 859 }
 860
 861 /*----------------------------------------------------------------------------
 862 | Returns the most-significant 48 fraction bits of the quadruple-precision
 863 | floating-point value `a'.
 864 *----------------------------------------------------------------------------*/
 865
 866 INLINE uint64_t extractFloat128Frac0( float128 a )
 867 {
 868
 869     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 870
 871 }
 872
 873 /*----------------------------------------------------------------------------
 874 | Returns the exponent bits of the quadruple-precision floating-point value
 875 | `a'.
 876 *----------------------------------------------------------------------------*/
 877
 878 INLINE int32 extractFloat128Exp( float128 a )
 879 {
 880
 881     return ( a.high>>48 ) & 0x7FFF;
 882
 883 }
 884
 885 /*----------------------------------------------------------------------------
 886 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 887 *----------------------------------------------------------------------------*/
 888
 889 INLINE flag extractFloat128Sign( float128 a )
 890 {
 891
 892     return a.high>>63;
 893
 894 }
 895
 896 /*----------------------------------------------------------------------------
 897 | Normalizes the subnormal quadruple-precision floating-point value
 898 | represented by the denormalized significand formed by the concatenation of
 899 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 900 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 901 | significand are stored at the location pointed to by `zSig0Ptr', and the
 902 | least significant 64 bits of the normalized significand are stored at the
 903 | location pointed to by `zSig1Ptr'.
 904 *----------------------------------------------------------------------------*/
 905
 906 static void
 907  normalizeFloat128Subnormal(
 908      uint64_t aSig0,
 909      uint64_t aSig1,
 910      int32 *zExpPtr,
 911      uint64_t *zSig0Ptr,
 912      uint64_t *zSig1Ptr
 913  )
 914 {
 915     int8 shiftCount;
 916
 917     if ( aSig0 == 0 ) {
 918         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 919         if ( shiftCount < 0 ) {
 920             *zSig0Ptr = aSig1>>( - shiftCount );
 921             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 922         }
 923         else {
 924             *zSig0Ptr = aSig1<<shiftCount;
 925             *zSig1Ptr = 0;
 926         }
 927         *zExpPtr = - shiftCount - 63;
 928     }
 929     else {
 930         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 931         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 932         *zExpPtr = 1 - shiftCount;
 933     }
 934
 935 }
 936
 937 /*----------------------------------------------------------------------------
 938 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 939 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 940 | floating-point value, returning the result.  After being shifted into the
 941 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 942 | added together to form the most significant 32 bits of the result.  This
 943 | means that any integer portion of `zSig0' will be added into the exponent.
 944 | Since a properly normalized significand will have an integer portion equal
 945 | to 1, the `zExp' input should be 1 less than the desired result exponent
 946 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 947 | significand.
 948 *----------------------------------------------------------------------------*/
 949
 950 INLINE float128
 951  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 952 {
 953     float128 z;
 954
 955     z.low = zSig1;
 956     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 957     return z;
 958
 959 }
 960
 961 /*----------------------------------------------------------------------------
 962 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 963 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 964 | and `zSig2', and returns the proper quadruple-precision floating-point value
 965 | corresponding to the abstract input.  Ordinarily, the abstract value is
 966 | simply rounded and packed into the quadruple-precision format, with the
 967 | inexact exception raised if the abstract input cannot be represented
 968 | exactly.  However, if the abstract value is too large, the overflow and
 969 | inexact exceptions are raised and an infinity or maximal finite value is
 970 | returned.  If the abstract value is too small, the input value is rounded to
 971 | a subnormal number, and the underflow and inexact exceptions are raised if
 972 | the abstract input cannot be represented exactly as a subnormal quadruple-
 973 | precision floating-point number.
 974 |     The input significand must be normalized or smaller.  If the input
 975 | significand is not normalized, `zExp' must be 0; in that case, the result
 976 | returned is a subnormal number, and it must not require rounding.  In the
 977 | usual case that the input significand is normalized, `zExp' must be 1 less
 978 | than the ``true'' floating-point exponent.  The handling of underflow and
 979 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 980 *----------------------------------------------------------------------------*/
 981
 982 static float128
 983  roundAndPackFloat128(
 984      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 985 {
 986     int8 roundingMode;
 987     flag roundNearestEven, increment, isTiny;
 988
 989     roundingMode = STATUS(float_rounding_mode);
 990     roundNearestEven = ( roundingMode == float_round_nearest_even );
 991     increment = ( (int64_t) zSig2 < 0 );
 992     if ( ! roundNearestEven ) {
 993         if ( roundingMode == float_round_to_zero ) {
 994             increment = 0;
 995         }
 996         else {
 997             if ( zSign ) {
 998                 increment = ( roundingMode == float_round_down ) && zSig2;
 999             }
1000             else {
1001                 increment = ( roundingMode == float_round_up ) && zSig2;
1002             }
1003         }
1004     }
1005     if ( 0x7FFD <= (uint32_t) zExp ) {
1006         if (    ( 0x7FFD < zExp )
1007              || (    ( zExp == 0x7FFD )
1008                   && eq128(
1009                          LIT64( 0x0001FFFFFFFFFFFF ),
1010                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1011                          zSig0,
1012                          zSig1
1013                      )
1014                   && increment
1015                 )
1016            ) {
1017             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1018             if (    ( roundingMode == float_round_to_zero )
1019                  || ( zSign && ( roundingMode == float_round_up ) )
1020                  || ( ! zSign && ( roundingMode == float_round_down ) )
1021                ) {
1022                 return
1023                     packFloat128(
1024                         zSign,
1025                         0x7FFE,
1026                         LIT64( 0x0000FFFFFFFFFFFF ),
1027                         LIT64( 0xFFFFFFFFFFFFFFFF )
1028                     );
1029             }
1030             return packFloat128( zSign, 0x7FFF, 0, 0 );
1031         }
1032         if ( zExp < 0 ) {
1033             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1034             isTiny =
1035                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1036                 || ( zExp < -1 )
1037                 || ! increment
1038                 || lt128(
1039                        zSig0,
1040                        zSig1,
1041                        LIT64( 0x0001FFFFFFFFFFFF ),
1042                        LIT64( 0xFFFFFFFFFFFFFFFF )
1043                    );
1044             shift128ExtraRightJamming(
1045                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1046             zExp = 0;
1047             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1048             if ( roundNearestEven ) {
1049                 increment = ( (int64_t) zSig2 < 0 );
1050             }
1051             else {
1052                 if ( zSign ) {
1053                     increment = ( roundingMode == float_round_down ) && zSig2;
1054                 }
1055                 else {
1056                     increment = ( roundingMode == float_round_up ) && zSig2;
1057                 }
1058             }
1059         }
1060     }
1061     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1062     if ( increment ) {
1063         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1064         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1065     }
1066     else {
1067         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1068     }
1069     return packFloat128( zSign, zExp, zSig0, zSig1 );
1070
1071 }
1072
1073 /*----------------------------------------------------------------------------
1074 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1075 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1076 | returns the proper quadruple-precision floating-point value corresponding
1077 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1078 | except that the input significand has fewer bits and does not have to be
1079 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1080 | point exponent.
1081 *----------------------------------------------------------------------------*/
1082
1083 static float128
1084  normalizeRoundAndPackFloat128(
1085      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1086 {
1087     int8 shiftCount;
1088     uint64_t zSig2;
1089
1090     if ( zSig0 == 0 ) {
1091         zSig0 = zSig1;
1092         zSig1 = 0;
1093         zExp -= 64;
1094     }
1095     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1096     if ( 0 <= shiftCount ) {
1097         zSig2 = 0;
1098         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1099     }
1100     else {
1101         shift128ExtraRightJamming(
1102             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1103     }
1104     zExp -= shiftCount;
1105     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1106
1107 }
1108
1109 #endif
1110
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a'
1113 | to the single-precision floating-point format.  The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1116
1117 float32 int32_to_float32( int32 a STATUS_PARAM )
1118 {
1119     flag zSign;
1120
1121     if ( a == 0 ) return float32_zero;
1122     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1123     zSign = ( a < 0 );
1124     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1125
1126 }
1127
1128 /*----------------------------------------------------------------------------
1129 | Returns the result of converting the 32-bit two's complement integer `a'
1130 | to the double-precision floating-point format.  The conversion is performed
1131 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1132 *----------------------------------------------------------------------------*/
1133
1134 float64 int32_to_float64( int32 a STATUS_PARAM )
1135 {
1136     flag zSign;
1137     uint32 absA;
1138     int8 shiftCount;
1139     uint64_t zSig;
1140
1141     if ( a == 0 ) return float64_zero;
1142     zSign = ( a < 0 );
1143     absA = zSign ? - a : a;
1144     shiftCount = countLeadingZeros32( absA ) + 21;
1145     zSig = absA;
1146     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1147
1148 }
1149
1150 #ifdef FLOATX80
1151
1152 /*----------------------------------------------------------------------------
1153 | Returns the result of converting the 32-bit two's complement integer `a'
1154 | to the extended double-precision floating-point format.  The conversion
1155 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1156 | Arithmetic.
1157 *----------------------------------------------------------------------------*/
1158
1159 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1160 {
1161     flag zSign;
1162     uint32 absA;
1163     int8 shiftCount;
1164     uint64_t zSig;
1165
1166     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1167     zSign = ( a < 0 );
1168     absA = zSign ? - a : a;
1169     shiftCount = countLeadingZeros32( absA ) + 32;
1170     zSig = absA;
1171     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1172
1173 }
1174
1175 #endif
1176
1177 #ifdef FLOAT128
1178
1179 /*----------------------------------------------------------------------------
1180 | Returns the result of converting the 32-bit two's complement integer `a' to
1181 | the quadruple-precision floating-point format.  The conversion is performed
1182 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1183 *----------------------------------------------------------------------------*/
1184
1185 float128 int32_to_float128( int32 a STATUS_PARAM )
1186 {
1187     flag zSign;
1188     uint32 absA;
1189     int8 shiftCount;
1190     uint64_t zSig0;
1191
1192     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1193     zSign = ( a < 0 );
1194     absA = zSign ? - a : a;
1195     shiftCount = countLeadingZeros32( absA ) + 17;
1196     zSig0 = absA;
1197     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1198
1199 }
1200
1201 #endif
1202
1203 /*----------------------------------------------------------------------------
1204 | Returns the result of converting the 64-bit two's complement integer `a'
1205 | to the single-precision floating-point format.  The conversion is performed
1206 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1207 *----------------------------------------------------------------------------*/
1208
1209 float32 int64_to_float32( int64 a STATUS_PARAM )
1210 {
1211     flag zSign;
1212     uint64 absA;
1213     int8 shiftCount;
1214
1215     if ( a == 0 ) return float32_zero;
1216     zSign = ( a < 0 );
1217     absA = zSign ? - a : a;
1218     shiftCount = countLeadingZeros64( absA ) - 40;
1219     if ( 0 <= shiftCount ) {
1220         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1221     }
1222     else {
1223         shiftCount += 7;
1224         if ( shiftCount < 0 ) {
1225             shift64RightJamming( absA, - shiftCount, &absA );
1226         }
1227         else {
1228             absA <<= shiftCount;
1229         }
1230         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1231     }
1232
1233 }
1234
1235 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1236 {
1237     int8 shiftCount;
1238
1239     if ( a == 0 ) return float32_zero;
1240     shiftCount = countLeadingZeros64( a ) - 40;
1241     if ( 0 <= shiftCount ) {
1242         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1243     }
1244     else {
1245         shiftCount += 7;
1246         if ( shiftCount < 0 ) {
1247             shift64RightJamming( a, - shiftCount, &a );
1248         }
1249         else {
1250             a <<= shiftCount;
1251         }
1252         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1253     }
1254 }
1255
1256 /*----------------------------------------------------------------------------
1257 | Returns the result of converting the 64-bit two's complement integer `a'
1258 | to the double-precision floating-point format.  The conversion is performed
1259 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1260 *----------------------------------------------------------------------------*/
1261
1262 float64 int64_to_float64( int64 a STATUS_PARAM )
1263 {
1264     flag zSign;
1265
1266     if ( a == 0 ) return float64_zero;
1267     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1268         return packFloat64( 1, 0x43E, 0 );
1269     }
1270     zSign = ( a < 0 );
1271     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1272
1273 }
1274
1275 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1276 {
1277     if ( a == 0 ) return float64_zero;
1278     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1279
1280 }
1281
1282 #ifdef FLOATX80
1283
1284 /*----------------------------------------------------------------------------
1285 | Returns the result of converting the 64-bit two's complement integer `a'
1286 | to the extended double-precision floating-point format.  The conversion
1287 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1288 | Arithmetic.
1289 *----------------------------------------------------------------------------*/
1290
1291 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1292 {
1293     flag zSign;
1294     uint64 absA;
1295     int8 shiftCount;
1296
1297     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1298     zSign = ( a < 0 );
1299     absA = zSign ? - a : a;
1300     shiftCount = countLeadingZeros64( absA );
1301     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1302
1303 }
1304
1305 #endif
1306
1307 #ifdef FLOAT128
1308
1309 /*----------------------------------------------------------------------------
1310 | Returns the result of converting the 64-bit two's complement integer `a' to
1311 | the quadruple-precision floating-point format.  The conversion is performed
1312 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1313 *----------------------------------------------------------------------------*/
1314
1315 float128 int64_to_float128( int64 a STATUS_PARAM )
1316 {
1317     flag zSign;
1318     uint64 absA;
1319     int8 shiftCount;
1320     int32 zExp;
1321     uint64_t zSig0, zSig1;
1322
1323     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1324     zSign = ( a < 0 );
1325     absA = zSign ? - a : a;
1326     shiftCount = countLeadingZeros64( absA ) + 49;
1327     zExp = 0x406E - shiftCount;
1328     if ( 64 <= shiftCount ) {
1329         zSig1 = 0;
1330         zSig0 = absA;
1331         shiftCount -= 64;
1332     }
1333     else {
1334         zSig1 = absA;
1335         zSig0 = 0;
1336     }
1337     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1338     return packFloat128( zSign, zExp, zSig0, zSig1 );
1339
1340 }
1341
1342 #endif
1343
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the single-precision floating-point value
1346 | `a' to the 32-bit two's complement integer format.  The conversion is
1347 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1348 | Arithmetic---which means in particular that the conversion is rounded
1349 | according to the current rounding mode.  If `a' is a NaN, the largest
1350 | positive integer is returned.  Otherwise, if the conversion overflows, the
1351 | largest integer with the same sign as `a' is returned.
1352 *----------------------------------------------------------------------------*/
1353
1354 int32 float32_to_int32( float32 a STATUS_PARAM )
1355 {
1356     flag aSign;
1357     int16 aExp, shiftCount;
1358     uint32_t aSig;
1359     uint64_t aSig64;
1360
1361     a = float32_squash_input_denormal(a STATUS_VAR);
1362     aSig = extractFloat32Frac( a );
1363     aExp = extractFloat32Exp( a );
1364     aSign = extractFloat32Sign( a );
1365     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1366     if ( aExp ) aSig |= 0x00800000;
1367     shiftCount = 0xAF - aExp;
1368     aSig64 = aSig;
1369     aSig64 <<= 32;
1370     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1371     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1372
1373 }
1374
1375 /*----------------------------------------------------------------------------
1376 | Returns the result of converting the single-precision floating-point value
1377 | `a' to the 32-bit two's complement integer format.  The conversion is
1378 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1379 | Arithmetic, except that the conversion is always rounded toward zero.
1380 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1381 | the conversion overflows, the largest integer with the same sign as `a' is
1382 | returned.
1383 *----------------------------------------------------------------------------*/
1384
1385 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1386 {
1387     flag aSign;
1388     int16 aExp, shiftCount;
1389     uint32_t aSig;
1390     int32 z;
1391     a = float32_squash_input_denormal(a STATUS_VAR);
1392
1393     aSig = extractFloat32Frac( a );
1394     aExp = extractFloat32Exp( a );
1395     aSign = extractFloat32Sign( a );
1396     shiftCount = aExp - 0x9E;
1397     if ( 0 <= shiftCount ) {
1398         if ( float32_val(a) != 0xCF000000 ) {
1399             float_raise( float_flag_invalid STATUS_VAR);
1400             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1401         }
1402         return (int32_t) 0x80000000;
1403     }
1404     else if ( aExp <= 0x7E ) {
1405         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1406         return 0;
1407     }
1408     aSig = ( aSig | 0x00800000 )<<8;
1409     z = aSig>>( - shiftCount );
1410     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1411         STATUS(float_exception_flags) |= float_flag_inexact;
1412     }
1413     if ( aSign ) z = - z;
1414     return z;
1415
1416 }
1417
1418 /*----------------------------------------------------------------------------
1419 | Returns the result of converting the single-precision floating-point value
1420 | `a' to the 16-bit two's complement integer format.  The conversion is
1421 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1422 | Arithmetic, except that the conversion is always rounded toward zero.
1423 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1424 | the conversion overflows, the largest integer with the same sign as `a' is
1425 | returned.
1426 *----------------------------------------------------------------------------*/
1427
1428 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1429 {
1430     flag aSign;
1431     int16 aExp, shiftCount;
1432     uint32_t aSig;
1433     int32 z;
1434
1435     aSig = extractFloat32Frac( a );
1436     aExp = extractFloat32Exp( a );
1437     aSign = extractFloat32Sign( a );
1438     shiftCount = aExp - 0x8E;
1439     if ( 0 <= shiftCount ) {
1440         if ( float32_val(a) != 0xC7000000 ) {
1441             float_raise( float_flag_invalid STATUS_VAR);
1442             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1443                 return 0x7FFF;
1444             }
1445         }
1446         return (int32_t) 0xffff8000;
1447     }
1448     else if ( aExp <= 0x7E ) {
1449         if ( aExp | aSig ) {
1450             STATUS(float_exception_flags) |= float_flag_inexact;
1451         }
1452         return 0;
1453     }
1454     shiftCount -= 0x10;
1455     aSig = ( aSig | 0x00800000 )<<8;
1456     z = aSig>>( - shiftCount );
1457     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1458         STATUS(float_exception_flags) |= float_flag_inexact;
1459     }
1460     if ( aSign ) {
1461         z = - z;
1462     }
1463     return z;
1464
1465 }
1466
1467 /*----------------------------------------------------------------------------
1468 | Returns the result of converting the single-precision floating-point value
1469 | `a' to the 64-bit two's complement integer format.  The conversion is
1470 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1471 | Arithmetic---which means in particular that the conversion is rounded
1472 | according to the current rounding mode.  If `a' is a NaN, the largest
1473 | positive integer is returned.  Otherwise, if the conversion overflows, the
1474 | largest integer with the same sign as `a' is returned.
1475 *----------------------------------------------------------------------------*/
1476
1477 int64 float32_to_int64( float32 a STATUS_PARAM )
1478 {
1479     flag aSign;
1480     int16 aExp, shiftCount;
1481     uint32_t aSig;
1482     uint64_t aSig64, aSigExtra;
1483     a = float32_squash_input_denormal(a STATUS_VAR);
1484
1485     aSig = extractFloat32Frac( a );
1486     aExp = extractFloat32Exp( a );
1487     aSign = extractFloat32Sign( a );
1488     shiftCount = 0xBE - aExp;
1489     if ( shiftCount < 0 ) {
1490         float_raise( float_flag_invalid STATUS_VAR);
1491         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1492             return LIT64( 0x7FFFFFFFFFFFFFFF );
1493         }
1494         return (int64_t) LIT64( 0x8000000000000000 );
1495     }
1496     if ( aExp ) aSig |= 0x00800000;
1497     aSig64 = aSig;
1498     aSig64 <<= 40;
1499     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1500     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1501
1502 }
1503
1504 /*----------------------------------------------------------------------------
1505 | Returns the result of converting the single-precision floating-point value
1506 | `a' to the 64-bit two's complement integer format.  The conversion is
1507 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1508 | Arithmetic, except that the conversion is always rounded toward zero.  If
1509 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1510 | conversion overflows, the largest integer with the same sign as `a' is
1511 | returned.
1512 *----------------------------------------------------------------------------*/
1513
1514 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1515 {
1516     flag aSign;
1517     int16 aExp, shiftCount;
1518     uint32_t aSig;
1519     uint64_t aSig64;
1520     int64 z;
1521     a = float32_squash_input_denormal(a STATUS_VAR);
1522
1523     aSig = extractFloat32Frac( a );
1524     aExp = extractFloat32Exp( a );
1525     aSign = extractFloat32Sign( a );
1526     shiftCount = aExp - 0xBE;
1527     if ( 0 <= shiftCount ) {
1528         if ( float32_val(a) != 0xDF000000 ) {
1529             float_raise( float_flag_invalid STATUS_VAR);
1530             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1531                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1532             }
1533         }
1534         return (int64_t) LIT64( 0x8000000000000000 );
1535     }
1536     else if ( aExp <= 0x7E ) {
1537         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1538         return 0;
1539     }
1540     aSig64 = aSig | 0x00800000;
1541     aSig64 <<= 40;
1542     z = aSig64>>( - shiftCount );
1543     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1544         STATUS(float_exception_flags) |= float_flag_inexact;
1545     }
1546     if ( aSign ) z = - z;
1547     return z;
1548
1549 }
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the double-precision floating-point format.  The conversion is
1554 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1557
1558 float64 float32_to_float64( float32 a STATUS_PARAM )
1559 {
1560     flag aSign;
1561     int16 aExp;
1562     uint32_t aSig;
1563     a = float32_squash_input_denormal(a STATUS_VAR);
1564
1565     aSig = extractFloat32Frac( a );
1566     aExp = extractFloat32Exp( a );
1567     aSign = extractFloat32Sign( a );
1568     if ( aExp == 0xFF ) {
1569         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1570         return packFloat64( aSign, 0x7FF, 0 );
1571     }
1572     if ( aExp == 0 ) {
1573         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1574         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575         --aExp;
1576     }
1577     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1578
1579 }
1580
1581 #ifdef FLOATX80
1582
1583 /*----------------------------------------------------------------------------
1584 | Returns the result of converting the single-precision floating-point value
1585 | `a' to the extended double-precision floating-point format.  The conversion
1586 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1587 | Arithmetic.
1588 *----------------------------------------------------------------------------*/
1589
1590 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1591 {
1592     flag aSign;
1593     int16 aExp;
1594     uint32_t aSig;
1595
1596     a = float32_squash_input_denormal(a STATUS_VAR);
1597     aSig = extractFloat32Frac( a );
1598     aExp = extractFloat32Exp( a );
1599     aSign = extractFloat32Sign( a );
1600     if ( aExp == 0xFF ) {
1601         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1602         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1603     }
1604     if ( aExp == 0 ) {
1605         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1606         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1607     }
1608     aSig |= 0x00800000;
1609     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1610
1611 }
1612
1613 #endif
1614
1615 #ifdef FLOAT128
1616
1617 /*----------------------------------------------------------------------------
1618 | Returns the result of converting the single-precision floating-point value
1619 | `a' to the double-precision floating-point format.  The conversion is
1620 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1621 | Arithmetic.
1622 *----------------------------------------------------------------------------*/
1623
1624 float128 float32_to_float128( float32 a STATUS_PARAM )
1625 {
1626     flag aSign;
1627     int16 aExp;
1628     uint32_t aSig;
1629
1630     a = float32_squash_input_denormal(a STATUS_VAR);
1631     aSig = extractFloat32Frac( a );
1632     aExp = extractFloat32Exp( a );
1633     aSign = extractFloat32Sign( a );
1634     if ( aExp == 0xFF ) {
1635         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1636         return packFloat128( aSign, 0x7FFF, 0, 0 );
1637     }
1638     if ( aExp == 0 ) {
1639         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1640         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1641         --aExp;
1642     }
1643     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1644
1645 }
1646
1647 #endif
1648
1649 /*----------------------------------------------------------------------------
1650 | Rounds the single-precision floating-point value `a' to an integer, and
1651 | returns the result as a single-precision floating-point value.  The
1652 | operation is performed according to the IEC/IEEE Standard for Binary
1653 | Floating-Point Arithmetic.
1654 *----------------------------------------------------------------------------*/
1655
1656 float32 float32_round_to_int( float32 a STATUS_PARAM)
1657 {
1658     flag aSign;
1659     int16 aExp;
1660     uint32_t lastBitMask, roundBitsMask;
1661     int8 roundingMode;
1662     uint32_t z;
1663     a = float32_squash_input_denormal(a STATUS_VAR);
1664
1665     aExp = extractFloat32Exp( a );
1666     if ( 0x96 <= aExp ) {
1667         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1668             return propagateFloat32NaN( a, a STATUS_VAR );
1669         }
1670         return a;
1671     }
1672     if ( aExp <= 0x7E ) {
1673         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1674         STATUS(float_exception_flags) |= float_flag_inexact;
1675         aSign = extractFloat32Sign( a );
1676         switch ( STATUS(float_rounding_mode) ) {
1677          case float_round_nearest_even:
1678             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1679                 return packFloat32( aSign, 0x7F, 0 );
1680             }
1681             break;
1682          case float_round_down:
1683             return make_float32(aSign ? 0xBF800000 : 0);
1684          case float_round_up:
1685             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1686         }
1687         return packFloat32( aSign, 0, 0 );
1688     }
1689     lastBitMask = 1;
1690     lastBitMask <<= 0x96 - aExp;
1691     roundBitsMask = lastBitMask - 1;
1692     z = float32_val(a);
1693     roundingMode = STATUS(float_rounding_mode);
1694     if ( roundingMode == float_round_nearest_even ) {
1695         z += lastBitMask>>1;
1696         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1697     }
1698     else if ( roundingMode != float_round_to_zero ) {
1699         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1700             z += roundBitsMask;
1701         }
1702     }
1703     z &= ~ roundBitsMask;
1704     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1705     return make_float32(z);
1706
1707 }
1708
1709 /*----------------------------------------------------------------------------
1710 | Returns the result of adding the absolute values of the single-precision
1711 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1712 | before being returned.  `zSign' is ignored if the result is a NaN.
1713 | The addition is performed according to the IEC/IEEE Standard for Binary
1714 | Floating-Point Arithmetic.
1715 *----------------------------------------------------------------------------*/
1716
1717 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1718 {
1719     int16 aExp, bExp, zExp;
1720     uint32_t aSig, bSig, zSig;
1721     int16 expDiff;
1722
1723     aSig = extractFloat32Frac( a );
1724     aExp = extractFloat32Exp( a );
1725     bSig = extractFloat32Frac( b );
1726     bExp = extractFloat32Exp( b );
1727     expDiff = aExp - bExp;
1728     aSig <<= 6;
1729     bSig <<= 6;
1730     if ( 0 < expDiff ) {
1731         if ( aExp == 0xFF ) {
1732             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1733             return a;
1734         }
1735         if ( bExp == 0 ) {
1736             --expDiff;
1737         }
1738         else {
1739             bSig |= 0x20000000;
1740         }
1741         shift32RightJamming( bSig, expDiff, &bSig );
1742         zExp = aExp;
1743     }
1744     else if ( expDiff < 0 ) {
1745         if ( bExp == 0xFF ) {
1746             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1747             return packFloat32( zSign, 0xFF, 0 );
1748         }
1749         if ( aExp == 0 ) {
1750             ++expDiff;
1751         }
1752         else {
1753             aSig |= 0x20000000;
1754         }
1755         shift32RightJamming( aSig, - expDiff, &aSig );
1756         zExp = bExp;
1757     }
1758     else {
1759         if ( aExp == 0xFF ) {
1760             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1761             return a;
1762         }
1763         if ( aExp == 0 ) {
1764             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1765             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1766         }
1767         zSig = 0x40000000 + aSig + bSig;
1768         zExp = aExp;
1769         goto roundAndPack;
1770     }
1771     aSig |= 0x20000000;
1772     zSig = ( aSig + bSig )<<1;
1773     --zExp;
1774     if ( (int32_t) zSig < 0 ) {
1775         zSig = aSig + bSig;
1776         ++zExp;
1777     }
1778  roundAndPack:
1779     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1780
1781 }
1782
1783 /*----------------------------------------------------------------------------
1784 | Returns the result of subtracting the absolute values of the single-
1785 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1786 | difference is negated before being returned.  `zSign' is ignored if the
1787 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1788 | Standard for Binary Floating-Point Arithmetic.
1789 *----------------------------------------------------------------------------*/
1790
1791 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1792 {
1793     int16 aExp, bExp, zExp;
1794     uint32_t aSig, bSig, zSig;
1795     int16 expDiff;
1796
1797     aSig = extractFloat32Frac( a );
1798     aExp = extractFloat32Exp( a );
1799     bSig = extractFloat32Frac( b );
1800     bExp = extractFloat32Exp( b );
1801     expDiff = aExp - bExp;
1802     aSig <<= 7;
1803     bSig <<= 7;
1804     if ( 0 < expDiff ) goto aExpBigger;
1805     if ( expDiff < 0 ) goto bExpBigger;
1806     if ( aExp == 0xFF ) {
1807         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1808         float_raise( float_flag_invalid STATUS_VAR);
1809         return float32_default_nan;
1810     }
1811     if ( aExp == 0 ) {
1812         aExp = 1;
1813         bExp = 1;
1814     }
1815     if ( bSig < aSig ) goto aBigger;
1816     if ( aSig < bSig ) goto bBigger;
1817     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1818  bExpBigger:
1819     if ( bExp == 0xFF ) {
1820         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1821         return packFloat32( zSign ^ 1, 0xFF, 0 );
1822     }
1823     if ( aExp == 0 ) {
1824         ++expDiff;
1825     }
1826     else {
1827         aSig |= 0x40000000;
1828     }
1829     shift32RightJamming( aSig, - expDiff, &aSig );
1830     bSig |= 0x40000000;
1831  bBigger:
1832     zSig = bSig - aSig;
1833     zExp = bExp;
1834     zSign ^= 1;
1835     goto normalizeRoundAndPack;
1836  aExpBigger:
1837     if ( aExp == 0xFF ) {
1838         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1839         return a;
1840     }
1841     if ( bExp == 0 ) {
1842         --expDiff;
1843     }
1844     else {
1845         bSig |= 0x40000000;
1846     }
1847     shift32RightJamming( bSig, expDiff, &bSig );
1848     aSig |= 0x40000000;
1849  aBigger:
1850     zSig = aSig - bSig;
1851     zExp = aExp;
1852  normalizeRoundAndPack:
1853     --zExp;
1854     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1855
1856 }
1857
1858 /*----------------------------------------------------------------------------
1859 | Returns the result of adding the single-precision floating-point values `a'
1860 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1861 | Binary Floating-Point Arithmetic.
1862 *----------------------------------------------------------------------------*/
1863
1864 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1865 {
1866     flag aSign, bSign;
1867     a = float32_squash_input_denormal(a STATUS_VAR);
1868     b = float32_squash_input_denormal(b STATUS_VAR);
1869
1870     aSign = extractFloat32Sign( a );
1871     bSign = extractFloat32Sign( b );
1872     if ( aSign == bSign ) {
1873         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1874     }
1875     else {
1876         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1877     }
1878
1879 }
1880
1881 /*----------------------------------------------------------------------------
1882 | Returns the result of subtracting the single-precision floating-point values
1883 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1884 | for Binary Floating-Point Arithmetic.
1885 *----------------------------------------------------------------------------*/
1886
1887 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1888 {
1889     flag aSign, bSign;
1890     a = float32_squash_input_denormal(a STATUS_VAR);
1891     b = float32_squash_input_denormal(b STATUS_VAR);
1892
1893     aSign = extractFloat32Sign( a );
1894     bSign = extractFloat32Sign( b );
1895     if ( aSign == bSign ) {
1896         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1897     }
1898     else {
1899         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1900     }
1901
1902 }
1903
1904 /*----------------------------------------------------------------------------
1905 | Returns the result of multiplying the single-precision floating-point values
1906 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1907 | for Binary Floating-Point Arithmetic.
1908 *----------------------------------------------------------------------------*/
1909
1910 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1911 {
1912     flag aSign, bSign, zSign;
1913     int16 aExp, bExp, zExp;
1914     uint32_t aSig, bSig;
1915     uint64_t zSig64;
1916     uint32_t zSig;
1917
1918     a = float32_squash_input_denormal(a STATUS_VAR);
1919     b = float32_squash_input_denormal(b STATUS_VAR);
1920
1921     aSig = extractFloat32Frac( a );
1922     aExp = extractFloat32Exp( a );
1923     aSign = extractFloat32Sign( a );
1924     bSig = extractFloat32Frac( b );
1925     bExp = extractFloat32Exp( b );
1926     bSign = extractFloat32Sign( b );
1927     zSign = aSign ^ bSign;
1928     if ( aExp == 0xFF ) {
1929         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1930             return propagateFloat32NaN( a, b STATUS_VAR );
1931         }
1932         if ( ( bExp | bSig ) == 0 ) {
1933             float_raise( float_flag_invalid STATUS_VAR);
1934             return float32_default_nan;
1935         }
1936         return packFloat32( zSign, 0xFF, 0 );
1937     }
1938     if ( bExp == 0xFF ) {
1939         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1940         if ( ( aExp | aSig ) == 0 ) {
1941             float_raise( float_flag_invalid STATUS_VAR);
1942             return float32_default_nan;
1943         }
1944         return packFloat32( zSign, 0xFF, 0 );
1945     }
1946     if ( aExp == 0 ) {
1947         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1948         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1949     }
1950     if ( bExp == 0 ) {
1951         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1952         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1953     }
1954     zExp = aExp + bExp - 0x7F;
1955     aSig = ( aSig | 0x00800000 )<<7;
1956     bSig = ( bSig | 0x00800000 )<<8;
1957     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1958     zSig = zSig64;
1959     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1960         zSig <<= 1;
1961         --zExp;
1962     }
1963     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1964
1965 }
1966
1967 /*----------------------------------------------------------------------------
1968 | Returns the result of dividing the single-precision floating-point value `a'
1969 | by the corresponding value `b'.  The operation is performed according to the
1970 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1971 *----------------------------------------------------------------------------*/
1972
1973 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1974 {
1975     flag aSign, bSign, zSign;
1976     int16 aExp, bExp, zExp;
1977     uint32_t aSig, bSig, zSig;
1978     a = float32_squash_input_denormal(a STATUS_VAR);
1979     b = float32_squash_input_denormal(b STATUS_VAR);
1980
1981     aSig = extractFloat32Frac( a );
1982     aExp = extractFloat32Exp( a );
1983     aSign = extractFloat32Sign( a );
1984     bSig = extractFloat32Frac( b );
1985     bExp = extractFloat32Exp( b );
1986     bSign = extractFloat32Sign( b );
1987     zSign = aSign ^ bSign;
1988     if ( aExp == 0xFF ) {
1989         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1990         if ( bExp == 0xFF ) {
1991             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1992             float_raise( float_flag_invalid STATUS_VAR);
1993             return float32_default_nan;
1994         }
1995         return packFloat32( zSign, 0xFF, 0 );
1996     }
1997     if ( bExp == 0xFF ) {
1998         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1999         return packFloat32( zSign, 0, 0 );
2000     }
2001     if ( bExp == 0 ) {
2002         if ( bSig == 0 ) {
2003             if ( ( aExp | aSig ) == 0 ) {
2004                 float_raise( float_flag_invalid STATUS_VAR);
2005                 return float32_default_nan;
2006             }
2007             float_raise( float_flag_divbyzero STATUS_VAR);
2008             return packFloat32( zSign, 0xFF, 0 );
2009         }
2010         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2011     }
2012     if ( aExp == 0 ) {
2013         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2014         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2015     }
2016     zExp = aExp - bExp + 0x7D;
2017     aSig = ( aSig | 0x00800000 )<<7;
2018     bSig = ( bSig | 0x00800000 )<<8;
2019     if ( bSig <= ( aSig + aSig ) ) {
2020         aSig >>= 1;
2021         ++zExp;
2022     }
2023     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2024     if ( ( zSig & 0x3F ) == 0 ) {
2025         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2026     }
2027     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2028
2029 }
2030
2031 /*----------------------------------------------------------------------------
2032 | Returns the remainder of the single-precision floating-point value `a'
2033 | with respect to the corresponding value `b'.  The operation is performed
2034 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2035 *----------------------------------------------------------------------------*/
2036
2037 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2038 {
2039     flag aSign, zSign;
2040     int16 aExp, bExp, expDiff;
2041     uint32_t aSig, bSig;
2042     uint32_t q;
2043     uint64_t aSig64, bSig64, q64;
2044     uint32_t alternateASig;
2045     int32_t sigMean;
2046     a = float32_squash_input_denormal(a STATUS_VAR);
2047     b = float32_squash_input_denormal(b STATUS_VAR);
2048
2049     aSig = extractFloat32Frac( a );
2050     aExp = extractFloat32Exp( a );
2051     aSign = extractFloat32Sign( a );
2052     bSig = extractFloat32Frac( b );
2053     bExp = extractFloat32Exp( b );
2054     if ( aExp == 0xFF ) {
2055         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2056             return propagateFloat32NaN( a, b STATUS_VAR );
2057         }
2058         float_raise( float_flag_invalid STATUS_VAR);
2059         return float32_default_nan;
2060     }
2061     if ( bExp == 0xFF ) {
2062         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2063         return a;
2064     }
2065     if ( bExp == 0 ) {
2066         if ( bSig == 0 ) {
2067             float_raise( float_flag_invalid STATUS_VAR);
2068             return float32_default_nan;
2069         }
2070         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2071     }
2072     if ( aExp == 0 ) {
2073         if ( aSig == 0 ) return a;
2074         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2075     }
2076     expDiff = aExp - bExp;
2077     aSig |= 0x00800000;
2078     bSig |= 0x00800000;
2079     if ( expDiff < 32 ) {
2080         aSig <<= 8;
2081         bSig <<= 8;
2082         if ( expDiff < 0 ) {
2083             if ( expDiff < -1 ) return a;
2084             aSig >>= 1;
2085         }
2086         q = ( bSig <= aSig );
2087         if ( q ) aSig -= bSig;
2088         if ( 0 < expDiff ) {
2089             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2090             q >>= 32 - expDiff;
2091             bSig >>= 2;
2092             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2093         }
2094         else {
2095             aSig >>= 2;
2096             bSig >>= 2;
2097         }
2098     }
2099     else {
2100         if ( bSig <= aSig ) aSig -= bSig;
2101         aSig64 = ( (uint64_t) aSig )<<40;
2102         bSig64 = ( (uint64_t) bSig )<<40;
2103         expDiff -= 64;
2104         while ( 0 < expDiff ) {
2105             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2106             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2107             aSig64 = - ( ( bSig * q64 )<<38 );
2108             expDiff -= 62;
2109         }
2110         expDiff += 64;
2111         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2112         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2113         q = q64>>( 64 - expDiff );
2114         bSig <<= 6;
2115         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2116     }
2117     do {
2118         alternateASig = aSig;
2119         ++q;
2120         aSig -= bSig;
2121     } while ( 0 <= (int32_t) aSig );
2122     sigMean = aSig + alternateASig;
2123     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2124         aSig = alternateASig;
2125     }
2126     zSign = ( (int32_t) aSig < 0 );
2127     if ( zSign ) aSig = - aSig;
2128     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2129
2130 }
2131
2132 /*----------------------------------------------------------------------------
2133 | Returns the square root of the single-precision floating-point value `a'.
2134 | The operation is performed according to the IEC/IEEE Standard for Binary
2135 | Floating-Point Arithmetic.
2136 *----------------------------------------------------------------------------*/
2137
2138 float32 float32_sqrt( float32 a STATUS_PARAM )
2139 {
2140     flag aSign;
2141     int16 aExp, zExp;
2142     uint32_t aSig, zSig;
2143     uint64_t rem, term;
2144     a = float32_squash_input_denormal(a STATUS_VAR);
2145
2146     aSig = extractFloat32Frac( a );
2147     aExp = extractFloat32Exp( a );
2148     aSign = extractFloat32Sign( a );
2149     if ( aExp == 0xFF ) {
2150         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2151         if ( ! aSign ) return a;
2152         float_raise( float_flag_invalid STATUS_VAR);
2153         return float32_default_nan;
2154     }
2155     if ( aSign ) {
2156         if ( ( aExp | aSig ) == 0 ) return a;
2157         float_raise( float_flag_invalid STATUS_VAR);
2158         return float32_default_nan;
2159     }
2160     if ( aExp == 0 ) {
2161         if ( aSig == 0 ) return float32_zero;
2162         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2163     }
2164     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2165     aSig = ( aSig | 0x00800000 )<<8;
2166     zSig = estimateSqrt32( aExp, aSig ) + 2;
2167     if ( ( zSig & 0x7F ) <= 5 ) {
2168         if ( zSig < 2 ) {
2169             zSig = 0x7FFFFFFF;
2170             goto roundAndPack;
2171         }
2172         aSig >>= aExp & 1;
2173         term = ( (uint64_t) zSig ) * zSig;
2174         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2175         while ( (int64_t) rem < 0 ) {
2176             --zSig;
2177             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2178         }
2179         zSig |= ( rem != 0 );
2180     }
2181     shift32RightJamming( zSig, 1, &zSig );
2182  roundAndPack:
2183     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2184
2185 }
2186
2187 /*----------------------------------------------------------------------------
2188 | Returns the binary exponential of the single-precision floating-point value
2189 | `a'. The operation is performed according to the IEC/IEEE Standard for
2190 | Binary Floating-Point Arithmetic.
2191 |
2192 | Uses the following identities:
2193 |
2194 | 1. -------------------------------------------------------------------------
2195 |      x    x*ln(2)
2196 |     2  = e
2197 |
2198 | 2. -------------------------------------------------------------------------
2199 |                      2     3     4     5           n
2200 |      x        x     x     x     x     x           x
2201 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2202 |               1!    2!    3!    4!    5!          n!
2203 *----------------------------------------------------------------------------*/
2204
2205 static const float64 float32_exp2_coefficients[15] =
2206 {
2207     const_float64( 0x3ff0000000000000ll ), /*  1 */
2208     const_float64( 0x3fe0000000000000ll ), /*  2 */
2209     const_float64( 0x3fc5555555555555ll ), /*  3 */
2210     const_float64( 0x3fa5555555555555ll ), /*  4 */
2211     const_float64( 0x3f81111111111111ll ), /*  5 */
2212     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2213     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2214     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2215     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2216     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2217     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2218     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2219     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2220     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2221     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2222 };
2223
2224 float32 float32_exp2( float32 a STATUS_PARAM )
2225 {
2226     flag aSign;
2227     int16 aExp;
2228     uint32_t aSig;
2229     float64 r, x, xn;
2230     int i;
2231     a = float32_squash_input_denormal(a STATUS_VAR);
2232
2233     aSig = extractFloat32Frac( a );
2234     aExp = extractFloat32Exp( a );
2235     aSign = extractFloat32Sign( a );
2236
2237     if ( aExp == 0xFF) {
2238         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2239         return (aSign) ? float32_zero : a;
2240     }
2241     if (aExp == 0) {
2242         if (aSig == 0) return float32_one;
2243     }
2244
2245     float_raise( float_flag_inexact STATUS_VAR);
2246
2247     /* ******************************* */
2248     /* using float64 for approximation */
2249     /* ******************************* */
2250     x = float32_to_float64(a STATUS_VAR);
2251     x = float64_mul(x, float64_ln2 STATUS_VAR);
2252
2253     xn = x;
2254     r = float64_one;
2255     for (i = 0 ; i < 15 ; i++) {
2256         float64 f;
2257
2258         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2259         r = float64_add(r, f STATUS_VAR);
2260
2261         xn = float64_mul(xn, x STATUS_VAR);
2262     }
2263
2264     return float64_to_float32(r, status);
2265 }
2266
2267 /*----------------------------------------------------------------------------
2268 | Returns the binary log of the single-precision floating-point value `a'.
2269 | The operation is performed according to the IEC/IEEE Standard for Binary
2270 | Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2272 float32 float32_log2( float32 a STATUS_PARAM )
2273 {
2274     flag aSign, zSign;
2275     int16 aExp;
2276     uint32_t aSig, zSig, i;
2277
2278     a = float32_squash_input_denormal(a STATUS_VAR);
2279     aSig = extractFloat32Frac( a );
2280     aExp = extractFloat32Exp( a );
2281     aSign = extractFloat32Sign( a );
2282
2283     if ( aExp == 0 ) {
2284         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2285         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2286     }
2287     if ( aSign ) {
2288         float_raise( float_flag_invalid STATUS_VAR);
2289         return float32_default_nan;
2290     }
2291     if ( aExp == 0xFF ) {
2292         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2293         return a;
2294     }
2295
2296     aExp -= 0x7F;
2297     aSig |= 0x00800000;
2298     zSign = aExp < 0;
2299     zSig = aExp << 23;
2300
2301     for (i = 1 << 22; i > 0; i >>= 1) {
2302         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2303         if ( aSig & 0x01000000 ) {
2304             aSig >>= 1;
2305             zSig |= i;
2306         }
2307     }
2308
2309     if ( zSign )
2310         zSig = -zSig;
2311
2312     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2313 }
2314
2315 /*----------------------------------------------------------------------------
2316 | Returns 1 if the single-precision floating-point value `a' is equal to
2317 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2319 *----------------------------------------------------------------------------*/
2320
2321 int float32_eq( float32 a, float32 b STATUS_PARAM )
2322 {
2323     a = float32_squash_input_denormal(a STATUS_VAR);
2324     b = float32_squash_input_denormal(b STATUS_VAR);
2325
2326     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2327          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2328        ) {
2329         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2330             float_raise( float_flag_invalid STATUS_VAR);
2331         }
2332         return 0;
2333     }
2334     return ( float32_val(a) == float32_val(b) ) ||
2335             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2336
2337 }
2338
2339 /*----------------------------------------------------------------------------
2340 | Returns 1 if the single-precision floating-point value `a' is less than
2341 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2342 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2343 | Arithmetic.
2344 *----------------------------------------------------------------------------*/
2345
2346 int float32_le( float32 a, float32 b STATUS_PARAM )
2347 {
2348     flag aSign, bSign;
2349     uint32_t av, bv;
2350     a = float32_squash_input_denormal(a STATUS_VAR);
2351     b = float32_squash_input_denormal(b STATUS_VAR);
2352
2353     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2354          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2355        ) {
2356         float_raise( float_flag_invalid STATUS_VAR);
2357         return 0;
2358     }
2359     aSign = extractFloat32Sign( a );
2360     bSign = extractFloat32Sign( b );
2361     av = float32_val(a);
2362     bv = float32_val(b);
2363     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2364     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2365
2366 }
2367
2368 /*----------------------------------------------------------------------------
2369 | Returns 1 if the single-precision floating-point value `a' is less than
2370 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2371 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2372 *----------------------------------------------------------------------------*/
2373
2374 int float32_lt( float32 a, float32 b STATUS_PARAM )
2375 {
2376     flag aSign, bSign;
2377     uint32_t av, bv;
2378     a = float32_squash_input_denormal(a STATUS_VAR);
2379     b = float32_squash_input_denormal(b STATUS_VAR);
2380
2381     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2382          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2383        ) {
2384         float_raise( float_flag_invalid STATUS_VAR);
2385         return 0;
2386     }
2387     aSign = extractFloat32Sign( a );
2388     bSign = extractFloat32Sign( b );
2389     av = float32_val(a);
2390     bv = float32_val(b);
2391     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2392     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2393
2394 }
2395
2396 /*----------------------------------------------------------------------------
2397 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2398 | be compared, and 0 otherwise.  The comparison is performed according to the
2399 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2400 *----------------------------------------------------------------------------*/
2401
2402 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2403 {
2404     a = float32_squash_input_denormal(a STATUS_VAR);
2405     b = float32_squash_input_denormal(b STATUS_VAR);
2406
2407     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2408          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2409        ) {
2410         float_raise( float_flag_invalid STATUS_VAR);
2411         return 1;
2412     }
2413     return 0;
2414 }
2415 /*----------------------------------------------------------------------------
2416 | Returns 1 if the single-precision floating-point value `a' is equal to
2417 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2418 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2419 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2420 *----------------------------------------------------------------------------*/
2421
2422 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2423 {
2424     uint32_t av, bv;
2425     a = float32_squash_input_denormal(a STATUS_VAR);
2426     b = float32_squash_input_denormal(b STATUS_VAR);
2427
2428     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2429          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2430        ) {
2431         float_raise( float_flag_invalid STATUS_VAR);
2432         return 0;
2433     }
2434     av = float32_val(a);
2435     bv = float32_val(b);
2436     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2437
2438 }
2439
2440 /*----------------------------------------------------------------------------
2441 | Returns 1 if the single-precision floating-point value `a' is less than or
2442 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2443 | cause an exception.  Otherwise, the comparison is performed according to the
2444 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2445 *----------------------------------------------------------------------------*/
2446
2447 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2448 {
2449     flag aSign, bSign;
2450     uint32_t av, bv;
2451     a = float32_squash_input_denormal(a STATUS_VAR);
2452     b = float32_squash_input_denormal(b STATUS_VAR);
2453
2454     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2455          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2456        ) {
2457         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2458             float_raise( float_flag_invalid STATUS_VAR);
2459         }
2460         return 0;
2461     }
2462     aSign = extractFloat32Sign( a );
2463     bSign = extractFloat32Sign( b );
2464     av = float32_val(a);
2465     bv = float32_val(b);
2466     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2467     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2468
2469 }
2470
2471 /*----------------------------------------------------------------------------
2472 | Returns 1 if the single-precision floating-point value `a' is less than
2473 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2474 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2475 | Standard for Binary Floating-Point Arithmetic.
2476 *----------------------------------------------------------------------------*/
2477
2478 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2479 {
2480     flag aSign, bSign;
2481     uint32_t av, bv;
2482     a = float32_squash_input_denormal(a STATUS_VAR);
2483     b = float32_squash_input_denormal(b STATUS_VAR);
2484
2485     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2486          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2487        ) {
2488         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2489             float_raise( float_flag_invalid STATUS_VAR);
2490         }
2491         return 0;
2492     }
2493     aSign = extractFloat32Sign( a );
2494     bSign = extractFloat32Sign( b );
2495     av = float32_val(a);
2496     bv = float32_val(b);
2497     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2498     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2499
2500 }
2501
2502 /*----------------------------------------------------------------------------
2503 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2504 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2505 | comparison is performed according to the IEC/IEEE Standard for Binary
2506 | Floating-Point Arithmetic.
2507 *----------------------------------------------------------------------------*/
2508
2509 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2510 {
2511     a = float32_squash_input_denormal(a STATUS_VAR);
2512     b = float32_squash_input_denormal(b STATUS_VAR);
2513
2514     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2515          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2516        ) {
2517         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2518             float_raise( float_flag_invalid STATUS_VAR);
2519         }
2520         return 1;
2521     }
2522     return 0;
2523 }
2524
2525 /*----------------------------------------------------------------------------
2526 | Returns the result of converting the double-precision floating-point value
2527 | `a' to the 32-bit two's complement integer format.  The conversion is
2528 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2529 | Arithmetic---which means in particular that the conversion is rounded
2530 | according to the current rounding mode.  If `a' is a NaN, the largest
2531 | positive integer is returned.  Otherwise, if the conversion overflows, the
2532 | largest integer with the same sign as `a' is returned.
2533 *----------------------------------------------------------------------------*/
2534
2535 int32 float64_to_int32( float64 a STATUS_PARAM )
2536 {
2537     flag aSign;
2538     int16 aExp, shiftCount;
2539     uint64_t aSig;
2540     a = float64_squash_input_denormal(a STATUS_VAR);
2541
2542     aSig = extractFloat64Frac( a );
2543     aExp = extractFloat64Exp( a );
2544     aSign = extractFloat64Sign( a );
2545     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2546     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2547     shiftCount = 0x42C - aExp;
2548     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2549     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2550
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 32-bit two's complement integer format.  The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2560 | returned.
2561 *----------------------------------------------------------------------------*/
2562
2563 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2564 {
2565     flag aSign;
2566     int16 aExp, shiftCount;
2567     uint64_t aSig, savedASig;
2568     int32 z;
2569     a = float64_squash_input_denormal(a STATUS_VAR);
2570
2571     aSig = extractFloat64Frac( a );
2572     aExp = extractFloat64Exp( a );
2573     aSign = extractFloat64Sign( a );
2574     if ( 0x41E < aExp ) {
2575         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2576         goto invalid;
2577     }
2578     else if ( aExp < 0x3FF ) {
2579         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2580         return 0;
2581     }
2582     aSig |= LIT64( 0x0010000000000000 );
2583     shiftCount = 0x433 - aExp;
2584     savedASig = aSig;
2585     aSig >>= shiftCount;
2586     z = aSig;
2587     if ( aSign ) z = - z;
2588     if ( ( z < 0 ) ^ aSign ) {
2589  invalid:
2590         float_raise( float_flag_invalid STATUS_VAR);
2591         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2592     }
2593     if ( ( aSig<<shiftCount ) != savedASig ) {
2594         STATUS(float_exception_flags) |= float_flag_inexact;
2595     }
2596     return z;
2597
2598 }
2599
2600 /*----------------------------------------------------------------------------
2601 | Returns the result of converting the double-precision floating-point value
2602 | `a' to the 16-bit two's complement integer format.  The conversion is
2603 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2604 | Arithmetic, except that the conversion is always rounded toward zero.
2605 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2606 | the conversion overflows, the largest integer with the same sign as `a' is
2607 | returned.
2608 *----------------------------------------------------------------------------*/
2609
2610 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2611 {
2612     flag aSign;
2613     int16 aExp, shiftCount;
2614     uint64_t aSig, savedASig;
2615     int32 z;
2616
2617     aSig = extractFloat64Frac( a );
2618     aExp = extractFloat64Exp( a );
2619     aSign = extractFloat64Sign( a );
2620     if ( 0x40E < aExp ) {
2621         if ( ( aExp == 0x7FF ) && aSig ) {
2622             aSign = 0;
2623         }
2624         goto invalid;
2625     }
2626     else if ( aExp < 0x3FF ) {
2627         if ( aExp || aSig ) {
2628             STATUS(float_exception_flags) |= float_flag_inexact;
2629         }
2630         return 0;
2631     }
2632     aSig |= LIT64( 0x0010000000000000 );
2633     shiftCount = 0x433 - aExp;
2634     savedASig = aSig;
2635     aSig >>= shiftCount;
2636     z = aSig;
2637     if ( aSign ) {
2638         z = - z;
2639     }
2640     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2641  invalid:
2642         float_raise( float_flag_invalid STATUS_VAR);
2643         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2644     }
2645     if ( ( aSig<<shiftCount ) != savedASig ) {
2646         STATUS(float_exception_flags) |= float_flag_inexact;
2647     }
2648     return z;
2649 }
2650
2651 /*----------------------------------------------------------------------------
2652 | Returns the result of converting the double-precision floating-point value
2653 | `a' to the 64-bit two's complement integer format.  The conversion is
2654 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2655 | Arithmetic---which means in particular that the conversion is rounded
2656 | according to the current rounding mode.  If `a' is a NaN, the largest
2657 | positive integer is returned.  Otherwise, if the conversion overflows, the
2658 | largest integer with the same sign as `a' is returned.
2659 *----------------------------------------------------------------------------*/
2660
2661 int64 float64_to_int64( float64 a STATUS_PARAM )
2662 {
2663     flag aSign;
2664     int16 aExp, shiftCount;
2665     uint64_t aSig, aSigExtra;
2666     a = float64_squash_input_denormal(a STATUS_VAR);
2667
2668     aSig = extractFloat64Frac( a );
2669     aExp = extractFloat64Exp( a );
2670     aSign = extractFloat64Sign( a );
2671     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2672     shiftCount = 0x433 - aExp;
2673     if ( shiftCount <= 0 ) {
2674         if ( 0x43E < aExp ) {
2675             float_raise( float_flag_invalid STATUS_VAR);
2676             if (    ! aSign
2677                  || (    ( aExp == 0x7FF )
2678                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2679                ) {
2680                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2681             }
2682             return (int64_t) LIT64( 0x8000000000000000 );
2683         }
2684         aSigExtra = 0;
2685         aSig <<= - shiftCount;
2686     }
2687     else {
2688         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2689     }
2690     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2691
2692 }
2693
2694 /*----------------------------------------------------------------------------
2695 | Returns the result of converting the double-precision floating-point value
2696 | `a' to the 64-bit two's complement integer format.  The conversion is
2697 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2698 | Arithmetic, except that the conversion is always rounded toward zero.
2699 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2700 | the conversion overflows, the largest integer with the same sign as `a' is
2701 | returned.
2702 *----------------------------------------------------------------------------*/
2703
2704 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2705 {
2706     flag aSign;
2707     int16 aExp, shiftCount;
2708     uint64_t aSig;
2709     int64 z;
2710     a = float64_squash_input_denormal(a STATUS_VAR);
2711
2712     aSig = extractFloat64Frac( a );
2713     aExp = extractFloat64Exp( a );
2714     aSign = extractFloat64Sign( a );
2715     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2716     shiftCount = aExp - 0x433;
2717     if ( 0 <= shiftCount ) {
2718         if ( 0x43E <= aExp ) {
2719             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2720                 float_raise( float_flag_invalid STATUS_VAR);
2721                 if (    ! aSign
2722                      || (    ( aExp == 0x7FF )
2723                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2724                    ) {
2725                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2726                 }
2727             }
2728             return (int64_t) LIT64( 0x8000000000000000 );
2729         }
2730         z = aSig<<shiftCount;
2731     }
2732     else {
2733         if ( aExp < 0x3FE ) {
2734             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2735             return 0;
2736         }
2737         z = aSig>>( - shiftCount );
2738         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2739             STATUS(float_exception_flags) |= float_flag_inexact;
2740         }
2741     }
2742     if ( aSign ) z = - z;
2743     return z;
2744
2745 }
2746
2747 /*----------------------------------------------------------------------------
2748 | Returns the result of converting the double-precision floating-point value
2749 | `a' to the single-precision floating-point format.  The conversion is
2750 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2751 | Arithmetic.
2752 *----------------------------------------------------------------------------*/
2753
2754 float32 float64_to_float32( float64 a STATUS_PARAM )
2755 {
2756     flag aSign;
2757     int16 aExp;
2758     uint64_t aSig;
2759     uint32_t zSig;
2760     a = float64_squash_input_denormal(a STATUS_VAR);
2761
2762     aSig = extractFloat64Frac( a );
2763     aExp = extractFloat64Exp( a );
2764     aSign = extractFloat64Sign( a );
2765     if ( aExp == 0x7FF ) {
2766         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2767         return packFloat32( aSign, 0xFF, 0 );
2768     }
2769     shift64RightJamming( aSig, 22, &aSig );
2770     zSig = aSig;
2771     if ( aExp || zSig ) {
2772         zSig |= 0x40000000;
2773         aExp -= 0x381;
2774     }
2775     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2776
2777 }
2778
2779
2780 /*----------------------------------------------------------------------------
2781 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2782 | half-precision floating-point value, returning the result.  After being
2783 | shifted into the proper positions, the three fields are simply added
2784 | together to form the result.  This means that any integer portion of `zSig'
2785 | will be added into the exponent.  Since a properly normalized significand
2786 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2787 | than the desired result exponent whenever `zSig' is a complete, normalized
2788 | significand.
2789 *----------------------------------------------------------------------------*/
2790 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2791 {
2792     return make_float16(
2793         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2794 }
2795
2796 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2797    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2798
2799 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2800 {
2801     flag aSign;
2802     int16 aExp;
2803     uint32_t aSig;
2804
2805     aSign = extractFloat16Sign(a);
2806     aExp = extractFloat16Exp(a);
2807     aSig = extractFloat16Frac(a);
2808
2809     if (aExp == 0x1f && ieee) {
2810         if (aSig) {
2811             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2812         }
2813         return packFloat32(aSign, 0xff, aSig << 13);
2814     }
2815     if (aExp == 0) {
2816         int8 shiftCount;
2817
2818         if (aSig == 0) {
2819             return packFloat32(aSign, 0, 0);
2820         }
2821
2822         shiftCount = countLeadingZeros32( aSig ) - 21;
2823         aSig = aSig << shiftCount;
2824         aExp = -shiftCount;
2825     }
2826     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2827 }
2828
2829 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2830 {
2831     flag aSign;
2832     int16 aExp;
2833     uint32_t aSig;
2834     uint32_t mask;
2835     uint32_t increment;
2836     int8 roundingMode;
2837     a = float32_squash_input_denormal(a STATUS_VAR);
2838
2839     aSig = extractFloat32Frac( a );
2840     aExp = extractFloat32Exp( a );
2841     aSign = extractFloat32Sign( a );
2842     if ( aExp == 0xFF ) {
2843         if (aSig) {
2844             /* Input is a NaN */
2845             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2846             if (!ieee) {
2847                 return packFloat16(aSign, 0, 0);
2848             }
2849             return r;
2850         }
2851         /* Infinity */
2852         if (!ieee) {
2853             float_raise(float_flag_invalid STATUS_VAR);
2854             return packFloat16(aSign, 0x1f, 0x3ff);
2855         }
2856         return packFloat16(aSign, 0x1f, 0);
2857     }
2858     if (aExp == 0 && aSig == 0) {
2859         return packFloat16(aSign, 0, 0);
2860     }
2861     /* Decimal point between bits 22 and 23.  */
2862     aSig |= 0x00800000;
2863     aExp -= 0x7f;
2864     if (aExp < -14) {
2865         mask = 0x00ffffff;
2866         if (aExp >= -24) {
2867             mask >>= 25 + aExp;
2868         }
2869     } else {
2870         mask = 0x00001fff;
2871     }
2872     if (aSig & mask) {
2873         float_raise( float_flag_underflow STATUS_VAR );
2874         roundingMode = STATUS(float_rounding_mode);
2875         switch (roundingMode) {
2876         case float_round_nearest_even:
2877             increment = (mask + 1) >> 1;
2878             if ((aSig & mask) == increment) {
2879                 increment = aSig & (increment << 1);
2880             }
2881             break;
2882         case float_round_up:
2883             increment = aSign ? 0 : mask;
2884             break;
2885         case float_round_down:
2886             increment = aSign ? mask : 0;
2887             break;
2888         default: /* round_to_zero */
2889             increment = 0;
2890             break;
2891         }
2892         aSig += increment;
2893         if (aSig >= 0x01000000) {
2894             aSig >>= 1;
2895             aExp++;
2896         }
2897     } else if (aExp < -14
2898           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2899         float_raise( float_flag_underflow STATUS_VAR);
2900     }
2901
2902     if (ieee) {
2903         if (aExp > 15) {
2904             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2905             return packFloat16(aSign, 0x1f, 0);
2906         }
2907     } else {
2908         if (aExp > 16) {
2909             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2910             return packFloat16(aSign, 0x1f, 0x3ff);
2911         }
2912     }
2913     if (aExp < -24) {
2914         return packFloat16(aSign, 0, 0);
2915     }
2916     if (aExp < -14) {
2917         aSig >>= -14 - aExp;
2918         aExp = -14;
2919     }
2920     return packFloat16(aSign, aExp + 14, aSig >> 13);
2921 }
2922
2923 #ifdef FLOATX80
2924
2925 /*----------------------------------------------------------------------------
2926 | Returns the result of converting the double-precision floating-point value
2927 | `a' to the extended double-precision floating-point format.  The conversion
2928 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2929 | Arithmetic.
2930 *----------------------------------------------------------------------------*/
2931
2932 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2933 {
2934     flag aSign;
2935     int16 aExp;
2936     uint64_t aSig;
2937
2938     a = float64_squash_input_denormal(a STATUS_VAR);
2939     aSig = extractFloat64Frac( a );
2940     aExp = extractFloat64Exp( a );
2941     aSign = extractFloat64Sign( a );
2942     if ( aExp == 0x7FF ) {
2943         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2944         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2945     }
2946     if ( aExp == 0 ) {
2947         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2948         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2949     }
2950     return
2951         packFloatx80(
2952             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2953
2954 }
2955
2956 #endif
2957
2958 #ifdef FLOAT128
2959
2960 /*----------------------------------------------------------------------------
2961 | Returns the result of converting the double-precision floating-point value
2962 | `a' to the quadruple-precision floating-point format.  The conversion is
2963 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2964 | Arithmetic.
2965 *----------------------------------------------------------------------------*/
2966
2967 float128 float64_to_float128( float64 a STATUS_PARAM )
2968 {
2969     flag aSign;
2970     int16 aExp;
2971     uint64_t aSig, zSig0, zSig1;
2972
2973     a = float64_squash_input_denormal(a STATUS_VAR);
2974     aSig = extractFloat64Frac( a );
2975     aExp = extractFloat64Exp( a );
2976     aSign = extractFloat64Sign( a );
2977     if ( aExp == 0x7FF ) {
2978         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2979         return packFloat128( aSign, 0x7FFF, 0, 0 );
2980     }
2981     if ( aExp == 0 ) {
2982         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2983         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2984         --aExp;
2985     }
2986     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2987     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2988
2989 }
2990
2991 #endif
2992
2993 /*----------------------------------------------------------------------------
2994 | Rounds the double-precision floating-point value `a' to an integer, and
2995 | returns the result as a double-precision floating-point value.  The
2996 | operation is performed according to the IEC/IEEE Standard for Binary
2997 | Floating-Point Arithmetic.
2998 *----------------------------------------------------------------------------*/
2999
3000 float64 float64_round_to_int( float64 a STATUS_PARAM )
3001 {
3002     flag aSign;
3003     int16 aExp;
3004     uint64_t lastBitMask, roundBitsMask;
3005     int8 roundingMode;
3006     uint64_t z;
3007     a = float64_squash_input_denormal(a STATUS_VAR);
3008
3009     aExp = extractFloat64Exp( a );
3010     if ( 0x433 <= aExp ) {
3011         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3012             return propagateFloat64NaN( a, a STATUS_VAR );
3013         }
3014         return a;
3015     }
3016     if ( aExp < 0x3FF ) {
3017         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3018         STATUS(float_exception_flags) |= float_flag_inexact;
3019         aSign = extractFloat64Sign( a );
3020         switch ( STATUS(float_rounding_mode) ) {
3021          case float_round_nearest_even:
3022             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3023                 return packFloat64( aSign, 0x3FF, 0 );
3024             }
3025             break;
3026          case float_round_down:
3027             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3028          case float_round_up:
3029             return make_float64(
3030             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3031         }
3032         return packFloat64( aSign, 0, 0 );
3033     }
3034     lastBitMask = 1;
3035     lastBitMask <<= 0x433 - aExp;
3036     roundBitsMask = lastBitMask - 1;
3037     z = float64_val(a);
3038     roundingMode = STATUS(float_rounding_mode);
3039     if ( roundingMode == float_round_nearest_even ) {
3040         z += lastBitMask>>1;
3041         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3042     }
3043     else if ( roundingMode != float_round_to_zero ) {
3044         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3045             z += roundBitsMask;
3046         }
3047     }
3048     z &= ~ roundBitsMask;
3049     if ( z != float64_val(a) )
3050         STATUS(float_exception_flags) |= float_flag_inexact;
3051     return make_float64(z);
3052
3053 }
3054
3055 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3056 {
3057     int oldmode;
3058     float64 res;
3059     oldmode = STATUS(float_rounding_mode);
3060     STATUS(float_rounding_mode) = float_round_to_zero;
3061     res = float64_round_to_int(a STATUS_VAR);
3062     STATUS(float_rounding_mode) = oldmode;
3063     return res;
3064 }
3065
3066 /*----------------------------------------------------------------------------
3067 | Returns the result of adding the absolute values of the double-precision
3068 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3069 | before being returned.  `zSign' is ignored if the result is a NaN.
3070 | The addition is performed according to the IEC/IEEE Standard for Binary
3071 | Floating-Point Arithmetic.
3072 *----------------------------------------------------------------------------*/
3073
3074 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3075 {
3076     int16 aExp, bExp, zExp;
3077     uint64_t aSig, bSig, zSig;
3078     int16 expDiff;
3079
3080     aSig = extractFloat64Frac( a );
3081     aExp = extractFloat64Exp( a );
3082     bSig = extractFloat64Frac( b );
3083     bExp = extractFloat64Exp( b );
3084     expDiff = aExp - bExp;
3085     aSig <<= 9;
3086     bSig <<= 9;
3087     if ( 0 < expDiff ) {
3088         if ( aExp == 0x7FF ) {
3089             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3090             return a;
3091         }
3092         if ( bExp == 0 ) {
3093             --expDiff;
3094         }
3095         else {
3096             bSig |= LIT64( 0x2000000000000000 );
3097         }
3098         shift64RightJamming( bSig, expDiff, &bSig );
3099         zExp = aExp;
3100     }
3101     else if ( expDiff < 0 ) {
3102         if ( bExp == 0x7FF ) {
3103             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3104             return packFloat64( zSign, 0x7FF, 0 );
3105         }
3106         if ( aExp == 0 ) {
3107             ++expDiff;
3108         }
3109         else {
3110             aSig |= LIT64( 0x2000000000000000 );
3111         }
3112         shift64RightJamming( aSig, - expDiff, &aSig );
3113         zExp = bExp;
3114     }
3115     else {
3116         if ( aExp == 0x7FF ) {
3117             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3118             return a;
3119         }
3120         if ( aExp == 0 ) {
3121             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3122             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3123         }
3124         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3125         zExp = aExp;
3126         goto roundAndPack;
3127     }
3128     aSig |= LIT64( 0x2000000000000000 );
3129     zSig = ( aSig + bSig )<<1;
3130     --zExp;
3131     if ( (int64_t) zSig < 0 ) {
3132         zSig = aSig + bSig;
3133         ++zExp;
3134     }
3135  roundAndPack:
3136     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3137
3138 }
3139
3140 /*----------------------------------------------------------------------------
3141 | Returns the result of subtracting the absolute values of the double-
3142 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3143 | difference is negated before being returned.  `zSign' is ignored if the
3144 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3145 | Standard for Binary Floating-Point Arithmetic.
3146 *----------------------------------------------------------------------------*/
3147
3148 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3149 {
3150     int16 aExp, bExp, zExp;
3151     uint64_t aSig, bSig, zSig;
3152     int16 expDiff;
3153
3154     aSig = extractFloat64Frac( a );
3155     aExp = extractFloat64Exp( a );
3156     bSig = extractFloat64Frac( b );
3157     bExp = extractFloat64Exp( b );
3158     expDiff = aExp - bExp;
3159     aSig <<= 10;
3160     bSig <<= 10;
3161     if ( 0 < expDiff ) goto aExpBigger;
3162     if ( expDiff < 0 ) goto bExpBigger;
3163     if ( aExp == 0x7FF ) {
3164         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3165         float_raise( float_flag_invalid STATUS_VAR);
3166         return float64_default_nan;
3167     }
3168     if ( aExp == 0 ) {
3169         aExp = 1;
3170         bExp = 1;
3171     }
3172     if ( bSig < aSig ) goto aBigger;
3173     if ( aSig < bSig ) goto bBigger;
3174     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3175  bExpBigger:
3176     if ( bExp == 0x7FF ) {
3177         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3178         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3179     }
3180     if ( aExp == 0 ) {
3181         ++expDiff;
3182     }
3183     else {
3184         aSig |= LIT64( 0x4000000000000000 );
3185     }
3186     shift64RightJamming( aSig, - expDiff, &aSig );
3187     bSig |= LIT64( 0x4000000000000000 );
3188  bBigger:
3189     zSig = bSig - aSig;
3190     zExp = bExp;
3191     zSign ^= 1;
3192     goto normalizeRoundAndPack;
3193  aExpBigger:
3194     if ( aExp == 0x7FF ) {
3195         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3196         return a;
3197     }
3198     if ( bExp == 0 ) {
3199         --expDiff;
3200     }
3201     else {
3202         bSig |= LIT64( 0x4000000000000000 );
3203     }
3204     shift64RightJamming( bSig, expDiff, &bSig );
3205     aSig |= LIT64( 0x4000000000000000 );
3206  aBigger:
3207     zSig = aSig - bSig;
3208     zExp = aExp;
3209  normalizeRoundAndPack:
3210     --zExp;
3211     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3212
3213 }
3214
3215 /*----------------------------------------------------------------------------
3216 | Returns the result of adding the double-precision floating-point values `a'
3217 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3218 | Binary Floating-Point Arithmetic.
3219 *----------------------------------------------------------------------------*/
3220
3221 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3222 {
3223     flag aSign, bSign;
3224     a = float64_squash_input_denormal(a STATUS_VAR);
3225     b = float64_squash_input_denormal(b STATUS_VAR);
3226
3227     aSign = extractFloat64Sign( a );
3228     bSign = extractFloat64Sign( b );
3229     if ( aSign == bSign ) {
3230         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3231     }
3232     else {
3233         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3234     }
3235
3236 }
3237
3238 /*----------------------------------------------------------------------------
3239 | Returns the result of subtracting the double-precision floating-point values
3240 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3241 | for Binary Floating-Point Arithmetic.
3242 *----------------------------------------------------------------------------*/
3243
3244 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3245 {
3246     flag aSign, bSign;
3247     a = float64_squash_input_denormal(a STATUS_VAR);
3248     b = float64_squash_input_denormal(b STATUS_VAR);
3249
3250     aSign = extractFloat64Sign( a );
3251     bSign = extractFloat64Sign( b );
3252     if ( aSign == bSign ) {
3253         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3254     }
3255     else {
3256         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3257     }
3258
3259 }
3260
3261 /*----------------------------------------------------------------------------
3262 | Returns the result of multiplying the double-precision floating-point values
3263 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3264 | for Binary Floating-Point Arithmetic.
3265 *----------------------------------------------------------------------------*/
3266
3267 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3268 {
3269     flag aSign, bSign, zSign;
3270     int16 aExp, bExp, zExp;
3271     uint64_t aSig, bSig, zSig0, zSig1;
3272
3273     a = float64_squash_input_denormal(a STATUS_VAR);
3274     b = float64_squash_input_denormal(b STATUS_VAR);
3275
3276     aSig = extractFloat64Frac( a );
3277     aExp = extractFloat64Exp( a );
3278     aSign = extractFloat64Sign( a );
3279     bSig = extractFloat64Frac( b );
3280     bExp = extractFloat64Exp( b );
3281     bSign = extractFloat64Sign( b );
3282     zSign = aSign ^ bSign;
3283     if ( aExp == 0x7FF ) {
3284         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3285             return propagateFloat64NaN( a, b STATUS_VAR );
3286         }
3287         if ( ( bExp | bSig ) == 0 ) {
3288             float_raise( float_flag_invalid STATUS_VAR);
3289             return float64_default_nan;
3290         }
3291         return packFloat64( zSign, 0x7FF, 0 );
3292     }
3293     if ( bExp == 0x7FF ) {
3294         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3295         if ( ( aExp | aSig ) == 0 ) {
3296             float_raise( float_flag_invalid STATUS_VAR);
3297             return float64_default_nan;
3298         }
3299         return packFloat64( zSign, 0x7FF, 0 );
3300     }
3301     if ( aExp == 0 ) {
3302         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3303         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3304     }
3305     if ( bExp == 0 ) {
3306         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3307         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3308     }
3309     zExp = aExp + bExp - 0x3FF;
3310     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3311     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3312     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3313     zSig0 |= ( zSig1 != 0 );
3314     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3315         zSig0 <<= 1;
3316         --zExp;
3317     }
3318     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3319
3320 }
3321
3322 /*----------------------------------------------------------------------------
3323 | Returns the result of dividing the double-precision floating-point value `a'
3324 | by the corresponding value `b'.  The operation is performed according to
3325 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3326 *----------------------------------------------------------------------------*/
3327
3328 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3329 {
3330     flag aSign, bSign, zSign;
3331     int16 aExp, bExp, zExp;
3332     uint64_t aSig, bSig, zSig;
3333     uint64_t rem0, rem1;
3334     uint64_t term0, term1;
3335     a = float64_squash_input_denormal(a STATUS_VAR);
3336     b = float64_squash_input_denormal(b STATUS_VAR);
3337
3338     aSig = extractFloat64Frac( a );
3339     aExp = extractFloat64Exp( a );
3340     aSign = extractFloat64Sign( a );
3341     bSig = extractFloat64Frac( b );
3342     bExp = extractFloat64Exp( b );
3343     bSign = extractFloat64Sign( b );
3344     zSign = aSign ^ bSign;
3345     if ( aExp == 0x7FF ) {
3346         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3347         if ( bExp == 0x7FF ) {
3348             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3349             float_raise( float_flag_invalid STATUS_VAR);
3350             return float64_default_nan;
3351         }
3352         return packFloat64( zSign, 0x7FF, 0 );
3353     }
3354     if ( bExp == 0x7FF ) {
3355         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3356         return packFloat64( zSign, 0, 0 );
3357     }
3358     if ( bExp == 0 ) {
3359         if ( bSig == 0 ) {
3360             if ( ( aExp | aSig ) == 0 ) {
3361                 float_raise( float_flag_invalid STATUS_VAR);
3362                 return float64_default_nan;
3363             }
3364             float_raise( float_flag_divbyzero STATUS_VAR);
3365             return packFloat64( zSign, 0x7FF, 0 );
3366         }
3367         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3368     }
3369     if ( aExp == 0 ) {
3370         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3371         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3372     }
3373     zExp = aExp - bExp + 0x3FD;
3374     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3375     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3376     if ( bSig <= ( aSig + aSig ) ) {
3377         aSig >>= 1;
3378         ++zExp;
3379     }
3380     zSig = estimateDiv128To64( aSig, 0, bSig );
3381     if ( ( zSig & 0x1FF ) <= 2 ) {
3382         mul64To128( bSig, zSig, &term0, &term1 );
3383         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3384         while ( (int64_t) rem0 < 0 ) {
3385             --zSig;
3386             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3387         }
3388         zSig |= ( rem1 != 0 );
3389     }
3390     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3391
3392 }
3393
3394 /*----------------------------------------------------------------------------
3395 | Returns the remainder of the double-precision floating-point value `a'
3396 | with respect to the corresponding value `b'.  The operation is performed
3397 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3398 *----------------------------------------------------------------------------*/
3399
3400 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3401 {
3402     flag aSign, zSign;
3403     int16 aExp, bExp, expDiff;
3404     uint64_t aSig, bSig;
3405     uint64_t q, alternateASig;
3406     int64_t sigMean;
3407
3408     a = float64_squash_input_denormal(a STATUS_VAR);
3409     b = float64_squash_input_denormal(b STATUS_VAR);
3410     aSig = extractFloat64Frac( a );
3411     aExp = extractFloat64Exp( a );
3412     aSign = extractFloat64Sign( a );
3413     bSig = extractFloat64Frac( b );
3414     bExp = extractFloat64Exp( b );
3415     if ( aExp == 0x7FF ) {
3416         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3417             return propagateFloat64NaN( a, b STATUS_VAR );
3418         }
3419         float_raise( float_flag_invalid STATUS_VAR);
3420         return float64_default_nan;
3421     }
3422     if ( bExp == 0x7FF ) {
3423         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3424         return a;
3425     }
3426     if ( bExp == 0 ) {
3427         if ( bSig == 0 ) {
3428             float_raise( float_flag_invalid STATUS_VAR);
3429             return float64_default_nan;
3430         }
3431         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3432     }
3433     if ( aExp == 0 ) {
3434         if ( aSig == 0 ) return a;
3435         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3436     }
3437     expDiff = aExp - bExp;
3438     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3439     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3440     if ( expDiff < 0 ) {
3441         if ( expDiff < -1 ) return a;
3442         aSig >>= 1;
3443     }
3444     q = ( bSig <= aSig );
3445     if ( q ) aSig -= bSig;
3446     expDiff -= 64;
3447     while ( 0 < expDiff ) {
3448         q = estimateDiv128To64( aSig, 0, bSig );
3449         q = ( 2 < q ) ? q - 2 : 0;
3450         aSig = - ( ( bSig>>2 ) * q );
3451         expDiff -= 62;
3452     }
3453     expDiff += 64;
3454     if ( 0 < expDiff ) {
3455         q = estimateDiv128To64( aSig, 0, bSig );
3456         q = ( 2 < q ) ? q - 2 : 0;
3457         q >>= 64 - expDiff;
3458         bSig >>= 2;
3459         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3460     }
3461     else {
3462         aSig >>= 2;
3463         bSig >>= 2;
3464     }
3465     do {
3466         alternateASig = aSig;
3467         ++q;
3468         aSig -= bSig;
3469     } while ( 0 <= (int64_t) aSig );
3470     sigMean = aSig + alternateASig;
3471     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3472         aSig = alternateASig;
3473     }
3474     zSign = ( (int64_t) aSig < 0 );
3475     if ( zSign ) aSig = - aSig;
3476     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3477
3478 }
3479
3480 /*----------------------------------------------------------------------------
3481 | Returns the square root of the double-precision floating-point value `a'.
3482 | The operation is performed according to the IEC/IEEE Standard for Binary
3483 | Floating-Point Arithmetic.
3484 *----------------------------------------------------------------------------*/
3485
3486 float64 float64_sqrt( float64 a STATUS_PARAM )
3487 {
3488     flag aSign;
3489     int16 aExp, zExp;
3490     uint64_t aSig, zSig, doubleZSig;
3491     uint64_t rem0, rem1, term0, term1;
3492     a = float64_squash_input_denormal(a STATUS_VAR);
3493
3494     aSig = extractFloat64Frac( a );
3495     aExp = extractFloat64Exp( a );
3496     aSign = extractFloat64Sign( a );
3497     if ( aExp == 0x7FF ) {
3498         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3499         if ( ! aSign ) return a;
3500         float_raise( float_flag_invalid STATUS_VAR);
3501         return float64_default_nan;
3502     }
3503     if ( aSign ) {
3504         if ( ( aExp | aSig ) == 0 ) return a;
3505         float_raise( float_flag_invalid STATUS_VAR);
3506         return float64_default_nan;
3507     }
3508     if ( aExp == 0 ) {
3509         if ( aSig == 0 ) return float64_zero;
3510         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3511     }
3512     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3513     aSig |= LIT64( 0x0010000000000000 );
3514     zSig = estimateSqrt32( aExp, aSig>>21 );
3515     aSig <<= 9 - ( aExp & 1 );
3516     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3517     if ( ( zSig & 0x1FF ) <= 5 ) {
3518         doubleZSig = zSig<<1;
3519         mul64To128( zSig, zSig, &term0, &term1 );
3520         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3521         while ( (int64_t) rem0 < 0 ) {
3522             --zSig;
3523             doubleZSig -= 2;
3524             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3525         }
3526         zSig |= ( ( rem0 | rem1 ) != 0 );
3527     }
3528     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3529
3530 }
3531
3532 /*----------------------------------------------------------------------------
3533 | Returns the binary log of the double-precision floating-point value `a'.
3534 | The operation is performed according to the IEC/IEEE Standard for Binary
3535 | Floating-Point Arithmetic.
3536 *----------------------------------------------------------------------------*/
3537 float64 float64_log2( float64 a STATUS_PARAM )
3538 {
3539     flag aSign, zSign;
3540     int16 aExp;
3541     uint64_t aSig, aSig0, aSig1, zSig, i;
3542     a = float64_squash_input_denormal(a STATUS_VAR);
3543
3544     aSig = extractFloat64Frac( a );
3545     aExp = extractFloat64Exp( a );
3546     aSign = extractFloat64Sign( a );
3547
3548     if ( aExp == 0 ) {
3549         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3550         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3551     }
3552     if ( aSign ) {
3553         float_raise( float_flag_invalid STATUS_VAR);
3554         return float64_default_nan;
3555     }
3556     if ( aExp == 0x7FF ) {
3557         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3558         return a;
3559     }
3560
3561     aExp -= 0x3FF;
3562     aSig |= LIT64( 0x0010000000000000 );
3563     zSign = aExp < 0;
3564     zSig = (uint64_t)aExp << 52;
3565     for (i = 1LL << 51; i > 0; i >>= 1) {
3566         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3567         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3568         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3569             aSig >>= 1;
3570             zSig |= i;
3571         }
3572     }
3573
3574     if ( zSign )
3575         zSig = -zSig;
3576     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3577 }
3578
3579 /*----------------------------------------------------------------------------
3580 | Returns 1 if the double-precision floating-point value `a' is equal to the
3581 | corresponding value `b', and 0 otherwise.  The comparison is performed
3582 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3583 *----------------------------------------------------------------------------*/
3584
3585 int float64_eq( float64 a, float64 b STATUS_PARAM )
3586 {
3587     uint64_t av, bv;
3588     a = float64_squash_input_denormal(a STATUS_VAR);
3589     b = float64_squash_input_denormal(b STATUS_VAR);
3590
3591     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3592          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3593        ) {
3594         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3595             float_raise( float_flag_invalid STATUS_VAR);
3596         }
3597         return 0;
3598     }
3599     av = float64_val(a);
3600     bv = float64_val(b);
3601     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3602
3603 }
3604
3605 /*----------------------------------------------------------------------------
3606 | Returns 1 if the double-precision floating-point value `a' is less than or
3607 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3608 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3609 | Arithmetic.
3610 *----------------------------------------------------------------------------*/
3611
3612 int float64_le( float64 a, float64 b STATUS_PARAM )
3613 {
3614     flag aSign, bSign;
3615     uint64_t av, bv;
3616     a = float64_squash_input_denormal(a STATUS_VAR);
3617     b = float64_squash_input_denormal(b STATUS_VAR);
3618
3619     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3620          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3621        ) {
3622         float_raise( float_flag_invalid STATUS_VAR);
3623         return 0;
3624     }
3625     aSign = extractFloat64Sign( a );
3626     bSign = extractFloat64Sign( b );
3627     av = float64_val(a);
3628     bv = float64_val(b);
3629     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3630     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3631
3632 }
3633
3634 /*----------------------------------------------------------------------------
3635 | Returns 1 if the double-precision floating-point value `a' is less than
3636 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3637 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3638 *----------------------------------------------------------------------------*/
3639
3640 int float64_lt( float64 a, float64 b STATUS_PARAM )
3641 {
3642     flag aSign, bSign;
3643     uint64_t av, bv;
3644
3645     a = float64_squash_input_denormal(a STATUS_VAR);
3646     b = float64_squash_input_denormal(b STATUS_VAR);
3647     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3648          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3649        ) {
3650         float_raise( float_flag_invalid STATUS_VAR);
3651         return 0;
3652     }
3653     aSign = extractFloat64Sign( a );
3654     bSign = extractFloat64Sign( b );
3655     av = float64_val(a);
3656     bv = float64_val(b);
3657     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3658     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3659
3660 }
3661
3662 /*----------------------------------------------------------------------------
3663 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3664 | be compared, and 0 otherwise.  The comparison is performed according to the
3665 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3666 *----------------------------------------------------------------------------*/
3667
3668 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3669 {
3670     a = float64_squash_input_denormal(a STATUS_VAR);
3671     b = float64_squash_input_denormal(b STATUS_VAR);
3672
3673     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3674          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3675        ) {
3676         float_raise( float_flag_invalid STATUS_VAR);
3677         return 1;
3678     }
3679     return 0;
3680 }
3681
3682 /*----------------------------------------------------------------------------
3683 | Returns 1 if the double-precision floating-point value `a' is equal to the
3684 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3685 | if either operand is a NaN.  Otherwise, the comparison is performed
3686 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3687 *----------------------------------------------------------------------------*/
3688
3689 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3690 {
3691     uint64_t av, bv;
3692     a = float64_squash_input_denormal(a STATUS_VAR);
3693     b = float64_squash_input_denormal(b STATUS_VAR);
3694
3695     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3696          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3697        ) {
3698         float_raise( float_flag_invalid STATUS_VAR);
3699         return 0;
3700     }
3701     av = float64_val(a);
3702     bv = float64_val(b);
3703     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3704
3705 }
3706
3707 /*----------------------------------------------------------------------------
3708 | Returns 1 if the double-precision floating-point value `a' is less than or
3709 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3710 | cause an exception.  Otherwise, the comparison is performed according to the
3711 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3712 *----------------------------------------------------------------------------*/
3713
3714 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3715 {
3716     flag aSign, bSign;
3717     uint64_t av, bv;
3718     a = float64_squash_input_denormal(a STATUS_VAR);
3719     b = float64_squash_input_denormal(b STATUS_VAR);
3720
3721     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3722          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3723        ) {
3724         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3725             float_raise( float_flag_invalid STATUS_VAR);
3726         }
3727         return 0;
3728     }
3729     aSign = extractFloat64Sign( a );
3730     bSign = extractFloat64Sign( b );
3731     av = float64_val(a);
3732     bv = float64_val(b);
3733     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3734     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3735
3736 }
3737
3738 /*----------------------------------------------------------------------------
3739 | Returns 1 if the double-precision floating-point value `a' is less than
3740 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3741 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3742 | Standard for Binary Floating-Point Arithmetic.
3743 *----------------------------------------------------------------------------*/
3744
3745 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3746 {
3747     flag aSign, bSign;
3748     uint64_t av, bv;
3749     a = float64_squash_input_denormal(a STATUS_VAR);
3750     b = float64_squash_input_denormal(b STATUS_VAR);
3751
3752     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3753          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3754        ) {
3755         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3756             float_raise( float_flag_invalid STATUS_VAR);
3757         }
3758         return 0;
3759     }
3760     aSign = extractFloat64Sign( a );
3761     bSign = extractFloat64Sign( b );
3762     av = float64_val(a);
3763     bv = float64_val(b);
3764     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3765     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3766
3767 }
3768
3769 /*----------------------------------------------------------------------------
3770 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3771 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3772 | comparison is performed according to the IEC/IEEE Standard for Binary
3773 | Floating-Point Arithmetic.
3774 *----------------------------------------------------------------------------*/
3775
3776 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3777 {
3778     a = float64_squash_input_denormal(a STATUS_VAR);
3779     b = float64_squash_input_denormal(b STATUS_VAR);
3780
3781     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3782          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3783        ) {
3784         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3785             float_raise( float_flag_invalid STATUS_VAR);
3786         }
3787         return 1;
3788     }
3789     return 0;
3790 }
3791
3792 #ifdef FLOATX80
3793
3794 /*----------------------------------------------------------------------------
3795 | Returns the result of converting the extended double-precision floating-
3796 | point value `a' to the 32-bit two's complement integer format.  The
3797 | conversion is performed according to the IEC/IEEE Standard for Binary
3798 | Floating-Point Arithmetic---which means in particular that the conversion
3799 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3800 | largest positive integer is returned.  Otherwise, if the conversion
3801 | overflows, the largest integer with the same sign as `a' is returned.
3802 *----------------------------------------------------------------------------*/
3803
3804 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3805 {
3806     flag aSign;
3807     int32 aExp, shiftCount;
3808     uint64_t aSig;
3809
3810     aSig = extractFloatx80Frac( a );
3811     aExp = extractFloatx80Exp( a );
3812     aSign = extractFloatx80Sign( a );
3813     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3814     shiftCount = 0x4037 - aExp;
3815     if ( shiftCount <= 0 ) shiftCount = 1;
3816     shift64RightJamming( aSig, shiftCount, &aSig );
3817     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3818
3819 }
3820
3821 /*----------------------------------------------------------------------------
3822 | Returns the result of converting the extended double-precision floating-
3823 | point value `a' to the 32-bit two's complement integer format.  The
3824 | conversion is performed according to the IEC/IEEE Standard for Binary
3825 | Floating-Point Arithmetic, except that the conversion is always rounded
3826 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3827 | Otherwise, if the conversion overflows, the largest integer with the same
3828 | sign as `a' is returned.
3829 *----------------------------------------------------------------------------*/
3830
3831 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3832 {
3833     flag aSign;
3834     int32 aExp, shiftCount;
3835     uint64_t aSig, savedASig;
3836     int32 z;
3837
3838     aSig = extractFloatx80Frac( a );
3839     aExp = extractFloatx80Exp( a );
3840     aSign = extractFloatx80Sign( a );
3841     if ( 0x401E < aExp ) {
3842         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3843         goto invalid;
3844     }
3845     else if ( aExp < 0x3FFF ) {
3846         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3847         return 0;
3848     }
3849     shiftCount = 0x403E - aExp;
3850     savedASig = aSig;
3851     aSig >>= shiftCount;
3852     z = aSig;
3853     if ( aSign ) z = - z;
3854     if ( ( z < 0 ) ^ aSign ) {
3855  invalid:
3856         float_raise( float_flag_invalid STATUS_VAR);
3857         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3858     }
3859     if ( ( aSig<<shiftCount ) != savedASig ) {
3860         STATUS(float_exception_flags) |= float_flag_inexact;
3861     }
3862     return z;
3863
3864 }
3865
3866 /*----------------------------------------------------------------------------
3867 | Returns the result of converting the extended double-precision floating-
3868 | point value `a' to the 64-bit two's complement integer format.  The
3869 | conversion is performed according to the IEC/IEEE Standard for Binary
3870 | Floating-Point Arithmetic---which means in particular that the conversion
3871 | is rounded according to the current rounding mode.  If `a' is a NaN,
3872 | the largest positive integer is returned.  Otherwise, if the conversion
3873 | overflows, the largest integer with the same sign as `a' is returned.
3874 *----------------------------------------------------------------------------*/
3875
3876 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3877 {
3878     flag aSign;
3879     int32 aExp, shiftCount;
3880     uint64_t aSig, aSigExtra;
3881
3882     aSig = extractFloatx80Frac( a );
3883     aExp = extractFloatx80Exp( a );
3884     aSign = extractFloatx80Sign( a );
3885     shiftCount = 0x403E - aExp;
3886     if ( shiftCount <= 0 ) {
3887         if ( shiftCount ) {
3888             float_raise( float_flag_invalid STATUS_VAR);
3889             if (    ! aSign
3890                  || (    ( aExp == 0x7FFF )
3891                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3892                ) {
3893                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3894             }
3895             return (int64_t) LIT64( 0x8000000000000000 );
3896         }
3897         aSigExtra = 0;
3898     }
3899     else {
3900         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3901     }
3902     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3903
3904 }
3905
3906 /*----------------------------------------------------------------------------
3907 | Returns the result of converting the extended double-precision floating-
3908 | point value `a' to the 64-bit two's complement integer format.  The
3909 | conversion is performed according to the IEC/IEEE Standard for Binary
3910 | Floating-Point Arithmetic, except that the conversion is always rounded
3911 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3912 | Otherwise, if the conversion overflows, the largest integer with the same
3913 | sign as `a' is returned.
3914 *----------------------------------------------------------------------------*/
3915
3916 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3917 {
3918     flag aSign;
3919     int32 aExp, shiftCount;
3920     uint64_t aSig;
3921     int64 z;
3922
3923     aSig = extractFloatx80Frac( a );
3924     aExp = extractFloatx80Exp( a );
3925     aSign = extractFloatx80Sign( a );
3926     shiftCount = aExp - 0x403E;
3927     if ( 0 <= shiftCount ) {
3928         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3929         if ( ( a.high != 0xC03E ) || aSig ) {
3930             float_raise( float_flag_invalid STATUS_VAR);
3931             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3932                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3933             }
3934         }
3935         return (int64_t) LIT64( 0x8000000000000000 );
3936     }
3937     else if ( aExp < 0x3FFF ) {
3938         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3939         return 0;
3940     }
3941     z = aSig>>( - shiftCount );
3942     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3943         STATUS(float_exception_flags) |= float_flag_inexact;
3944     }
3945     if ( aSign ) z = - z;
3946     return z;
3947
3948 }
3949
3950 /*----------------------------------------------------------------------------
3951 | Returns the result of converting the extended double-precision floating-
3952 | point value `a' to the single-precision floating-point format.  The
3953 | conversion is performed according to the IEC/IEEE Standard for Binary
3954 | Floating-Point Arithmetic.
3955 *----------------------------------------------------------------------------*/
3956
3957 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3958 {
3959     flag aSign;
3960     int32 aExp;
3961     uint64_t aSig;
3962
3963     aSig = extractFloatx80Frac( a );
3964     aExp = extractFloatx80Exp( a );
3965     aSign = extractFloatx80Sign( a );
3966     if ( aExp == 0x7FFF ) {
3967         if ( (uint64_t) ( aSig<<1 ) ) {
3968             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3969         }
3970         return packFloat32( aSign, 0xFF, 0 );
3971     }
3972     shift64RightJamming( aSig, 33, &aSig );
3973     if ( aExp || aSig ) aExp -= 0x3F81;
3974     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3975
3976 }
3977
3978 /*----------------------------------------------------------------------------
3979 | Returns the result of converting the extended double-precision floating-
3980 | point value `a' to the double-precision floating-point format.  The
3981 | conversion is performed according to the IEC/IEEE Standard for Binary
3982 | Floating-Point Arithmetic.
3983 *----------------------------------------------------------------------------*/
3984
3985 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3986 {
3987     flag aSign;
3988     int32 aExp;
3989     uint64_t aSig, zSig;
3990
3991     aSig = extractFloatx80Frac( a );
3992     aExp = extractFloatx80Exp( a );
3993     aSign = extractFloatx80Sign( a );
3994     if ( aExp == 0x7FFF ) {
3995         if ( (uint64_t) ( aSig<<1 ) ) {
3996             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3997         }
3998         return packFloat64( aSign, 0x7FF, 0 );
3999     }
4000     shift64RightJamming( aSig, 1, &zSig );
4001     if ( aExp || aSig ) aExp -= 0x3C01;
4002     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4003
4004 }
4005
4006 #ifdef FLOAT128
4007
4008 /*----------------------------------------------------------------------------
4009 | Returns the result of converting the extended double-precision floating-
4010 | point value `a' to the quadruple-precision floating-point format.  The
4011 | conversion is performed according to the IEC/IEEE Standard for Binary
4012 | Floating-Point Arithmetic.
4013 *----------------------------------------------------------------------------*/
4014
4015 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4016 {
4017     flag aSign;
4018     int16 aExp;
4019     uint64_t aSig, zSig0, zSig1;
4020
4021     aSig = extractFloatx80Frac( a );
4022     aExp = extractFloatx80Exp( a );
4023     aSign = extractFloatx80Sign( a );
4024     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4025         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4026     }
4027     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4028     return packFloat128( aSign, aExp, zSig0, zSig1 );
4029
4030 }
4031
4032 #endif
4033
4034 /*----------------------------------------------------------------------------
4035 | Rounds the extended double-precision floating-point value `a' to an integer,
4036 | and returns the result as an extended quadruple-precision floating-point
4037 | value.  The operation is performed according to the IEC/IEEE Standard for
4038 | Binary Floating-Point Arithmetic.
4039 *----------------------------------------------------------------------------*/
4040
4041 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4042 {
4043     flag aSign;
4044     int32 aExp;
4045     uint64_t lastBitMask, roundBitsMask;
4046     int8 roundingMode;
4047     floatx80 z;
4048
4049     aExp = extractFloatx80Exp( a );
4050     if ( 0x403E <= aExp ) {
4051         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4052             return propagateFloatx80NaN( a, a STATUS_VAR );
4053         }
4054         return a;
4055     }
4056     if ( aExp < 0x3FFF ) {
4057         if (    ( aExp == 0 )
4058              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4059             return a;
4060         }
4061         STATUS(float_exception_flags) |= float_flag_inexact;
4062         aSign = extractFloatx80Sign( a );
4063         switch ( STATUS(float_rounding_mode) ) {
4064          case float_round_nearest_even:
4065             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4066                ) {
4067                 return
4068                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4069             }
4070             break;
4071          case float_round_down:
4072             return
4073                   aSign ?
4074                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4075                 : packFloatx80( 0, 0, 0 );
4076          case float_round_up:
4077             return
4078                   aSign ? packFloatx80( 1, 0, 0 )
4079                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4080         }
4081         return packFloatx80( aSign, 0, 0 );
4082     }
4083     lastBitMask = 1;
4084     lastBitMask <<= 0x403E - aExp;
4085     roundBitsMask = lastBitMask - 1;
4086     z = a;
4087     roundingMode = STATUS(float_rounding_mode);
4088     if ( roundingMode == float_round_nearest_even ) {
4089         z.low += lastBitMask>>1;
4090         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4091     }
4092     else if ( roundingMode != float_round_to_zero ) {
4093         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4094             z.low += roundBitsMask;
4095         }
4096     }
4097     z.low &= ~ roundBitsMask;
4098     if ( z.low == 0 ) {
4099         ++z.high;
4100         z.low = LIT64( 0x8000000000000000 );
4101     }
4102     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4103     return z;
4104
4105 }
4106
4107 /*----------------------------------------------------------------------------
4108 | Returns the result of adding the absolute values of the extended double-
4109 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4110 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4111 | The addition is performed according to the IEC/IEEE Standard for Binary
4112 | Floating-Point Arithmetic.
4113 *----------------------------------------------------------------------------*/
4114
4115 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4116 {
4117     int32 aExp, bExp, zExp;
4118     uint64_t aSig, bSig, zSig0, zSig1;
4119     int32 expDiff;
4120
4121     aSig = extractFloatx80Frac( a );
4122     aExp = extractFloatx80Exp( a );
4123     bSig = extractFloatx80Frac( b );
4124     bExp = extractFloatx80Exp( b );
4125     expDiff = aExp - bExp;
4126     if ( 0 < expDiff ) {
4127         if ( aExp == 0x7FFF ) {
4128             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4129             return a;
4130         }
4131         if ( bExp == 0 ) --expDiff;
4132         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4133         zExp = aExp;
4134     }
4135     else if ( expDiff < 0 ) {
4136         if ( bExp == 0x7FFF ) {
4137             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4138             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4139         }
4140         if ( aExp == 0 ) ++expDiff;
4141         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4142         zExp = bExp;
4143     }
4144     else {
4145         if ( aExp == 0x7FFF ) {
4146             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4147                 return propagateFloatx80NaN( a, b STATUS_VAR );
4148             }
4149             return a;
4150         }
4151         zSig1 = 0;
4152         zSig0 = aSig + bSig;
4153         if ( aExp == 0 ) {
4154             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4155             goto roundAndPack;
4156         }
4157         zExp = aExp;
4158         goto shiftRight1;
4159     }
4160     zSig0 = aSig + bSig;
4161     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4162  shiftRight1:
4163     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4164     zSig0 |= LIT64( 0x8000000000000000 );
4165     ++zExp;
4166  roundAndPack:
4167     return
4168         roundAndPackFloatx80(
4169             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4170
4171 }
4172
4173 /*----------------------------------------------------------------------------
4174 | Returns the result of subtracting the absolute values of the extended
4175 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4176 | difference is negated before being returned.  `zSign' is ignored if the
4177 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4178 | Standard for Binary Floating-Point Arithmetic.
4179 *----------------------------------------------------------------------------*/
4180
4181 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4182 {
4183     int32 aExp, bExp, zExp;
4184     uint64_t aSig, bSig, zSig0, zSig1;
4185     int32 expDiff;
4186     floatx80 z;
4187
4188     aSig = extractFloatx80Frac( a );
4189     aExp = extractFloatx80Exp( a );
4190     bSig = extractFloatx80Frac( b );
4191     bExp = extractFloatx80Exp( b );
4192     expDiff = aExp - bExp;
4193     if ( 0 < expDiff ) goto aExpBigger;
4194     if ( expDiff < 0 ) goto bExpBigger;
4195     if ( aExp == 0x7FFF ) {
4196         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4197             return propagateFloatx80NaN( a, b STATUS_VAR );
4198         }
4199         float_raise( float_flag_invalid STATUS_VAR);
4200         z.low = floatx80_default_nan_low;
4201         z.high = floatx80_default_nan_high;
4202         return z;
4203     }
4204     if ( aExp == 0 ) {
4205         aExp = 1;
4206         bExp = 1;
4207     }
4208     zSig1 = 0;
4209     if ( bSig < aSig ) goto aBigger;
4210     if ( aSig < bSig ) goto bBigger;
4211     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4212  bExpBigger:
4213     if ( bExp == 0x7FFF ) {
4214         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4215         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4216     }
4217     if ( aExp == 0 ) ++expDiff;
4218     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4219  bBigger:
4220     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4221     zExp = bExp;
4222     zSign ^= 1;
4223     goto normalizeRoundAndPack;
4224  aExpBigger:
4225     if ( aExp == 0x7FFF ) {
4226         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4227         return a;
4228     }
4229     if ( bExp == 0 ) --expDiff;
4230     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4231  aBigger:
4232     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4233     zExp = aExp;
4234  normalizeRoundAndPack:
4235     return
4236         normalizeRoundAndPackFloatx80(
4237             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4238
4239 }
4240
4241 /*----------------------------------------------------------------------------
4242 | Returns the result of adding the extended double-precision floating-point
4243 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4244 | Standard for Binary Floating-Point Arithmetic.
4245 *----------------------------------------------------------------------------*/
4246
4247 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4248 {
4249     flag aSign, bSign;
4250
4251     aSign = extractFloatx80Sign( a );
4252     bSign = extractFloatx80Sign( b );
4253     if ( aSign == bSign ) {
4254         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4255     }
4256     else {
4257         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4258     }
4259
4260 }
4261
4262 /*----------------------------------------------------------------------------
4263 | Returns the result of subtracting the extended double-precision floating-
4264 | point values `a' and `b'.  The operation is performed according to the
4265 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4266 *----------------------------------------------------------------------------*/
4267
4268 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4269 {
4270     flag aSign, bSign;
4271
4272     aSign = extractFloatx80Sign( a );
4273     bSign = extractFloatx80Sign( b );
4274     if ( aSign == bSign ) {
4275         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4276     }
4277     else {
4278         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4279     }
4280
4281 }
4282
4283 /*----------------------------------------------------------------------------
4284 | Returns the result of multiplying the extended double-precision floating-
4285 | point values `a' and `b'.  The operation is performed according to the
4286 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4287 *----------------------------------------------------------------------------*/
4288
4289 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4290 {
4291     flag aSign, bSign, zSign;
4292     int32 aExp, bExp, zExp;
4293     uint64_t aSig, bSig, zSig0, zSig1;
4294     floatx80 z;
4295
4296     aSig = extractFloatx80Frac( a );
4297     aExp = extractFloatx80Exp( a );
4298     aSign = extractFloatx80Sign( a );
4299     bSig = extractFloatx80Frac( b );
4300     bExp = extractFloatx80Exp( b );
4301     bSign = extractFloatx80Sign( b );
4302     zSign = aSign ^ bSign;
4303     if ( aExp == 0x7FFF ) {
4304         if (    (uint64_t) ( aSig<<1 )
4305              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4306             return propagateFloatx80NaN( a, b STATUS_VAR );
4307         }
4308         if ( ( bExp | bSig ) == 0 ) goto invalid;
4309         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4310     }
4311     if ( bExp == 0x7FFF ) {
4312         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4313         if ( ( aExp | aSig ) == 0 ) {
4314  invalid:
4315             float_raise( float_flag_invalid STATUS_VAR);
4316             z.low = floatx80_default_nan_low;
4317             z.high = floatx80_default_nan_high;
4318             return z;
4319         }
4320         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4321     }
4322     if ( aExp == 0 ) {
4323         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4324         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4325     }
4326     if ( bExp == 0 ) {
4327         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4328         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4329     }
4330     zExp = aExp + bExp - 0x3FFE;
4331     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4332     if ( 0 < (int64_t) zSig0 ) {
4333         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4334         --zExp;
4335     }
4336     return
4337         roundAndPackFloatx80(
4338             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4339
4340 }
4341
4342 /*----------------------------------------------------------------------------
4343 | Returns the result of dividing the extended double-precision floating-point
4344 | value `a' by the corresponding value `b'.  The operation is performed
4345 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4346 *----------------------------------------------------------------------------*/
4347
4348 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4349 {
4350     flag aSign, bSign, zSign;
4351     int32 aExp, bExp, zExp;
4352     uint64_t aSig, bSig, zSig0, zSig1;
4353     uint64_t rem0, rem1, rem2, term0, term1, term2;
4354     floatx80 z;
4355
4356     aSig = extractFloatx80Frac( a );
4357     aExp = extractFloatx80Exp( a );
4358     aSign = extractFloatx80Sign( a );
4359     bSig = extractFloatx80Frac( b );
4360     bExp = extractFloatx80Exp( b );
4361     bSign = extractFloatx80Sign( b );
4362     zSign = aSign ^ bSign;
4363     if ( aExp == 0x7FFF ) {
4364         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4365         if ( bExp == 0x7FFF ) {
4366             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4367             goto invalid;
4368         }
4369         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4370     }
4371     if ( bExp == 0x7FFF ) {
4372         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4373         return packFloatx80( zSign, 0, 0 );
4374     }
4375     if ( bExp == 0 ) {
4376         if ( bSig == 0 ) {
4377             if ( ( aExp | aSig ) == 0 ) {
4378  invalid:
4379                 float_raise( float_flag_invalid STATUS_VAR);
4380                 z.low = floatx80_default_nan_low;
4381                 z.high = floatx80_default_nan_high;
4382                 return z;
4383             }
4384             float_raise( float_flag_divbyzero STATUS_VAR);
4385             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4386         }
4387         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4388     }
4389     if ( aExp == 0 ) {
4390         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4391         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4392     }
4393     zExp = aExp - bExp + 0x3FFE;
4394     rem1 = 0;
4395     if ( bSig <= aSig ) {
4396         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4397         ++zExp;
4398     }
4399     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4400     mul64To128( bSig, zSig0, &term0, &term1 );
4401     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4402     while ( (int64_t) rem0 < 0 ) {
4403         --zSig0;
4404         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4405     }
4406     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4407     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4408         mul64To128( bSig, zSig1, &term1, &term2 );
4409         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4410         while ( (int64_t) rem1 < 0 ) {
4411             --zSig1;
4412             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4413         }
4414         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4415     }
4416     return
4417         roundAndPackFloatx80(
4418             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4419
4420 }
4421
4422 /*----------------------------------------------------------------------------
4423 | Returns the remainder of the extended double-precision floating-point value
4424 | `a' with respect to the corresponding value `b'.  The operation is performed
4425 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4426 *----------------------------------------------------------------------------*/
4427
4428 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4429 {
4430     flag aSign, zSign;
4431     int32 aExp, bExp, expDiff;
4432     uint64_t aSig0, aSig1, bSig;
4433     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4434     floatx80 z;
4435
4436     aSig0 = extractFloatx80Frac( a );
4437     aExp = extractFloatx80Exp( a );
4438     aSign = extractFloatx80Sign( a );
4439     bSig = extractFloatx80Frac( b );
4440     bExp = extractFloatx80Exp( b );
4441     if ( aExp == 0x7FFF ) {
4442         if (    (uint64_t) ( aSig0<<1 )
4443              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4444             return propagateFloatx80NaN( a, b STATUS_VAR );
4445         }
4446         goto invalid;
4447     }
4448     if ( bExp == 0x7FFF ) {
4449         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4450         return a;
4451     }
4452     if ( bExp == 0 ) {
4453         if ( bSig == 0 ) {
4454  invalid:
4455             float_raise( float_flag_invalid STATUS_VAR);
4456             z.low = floatx80_default_nan_low;
4457             z.high = floatx80_default_nan_high;
4458             return z;
4459         }
4460         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4461     }
4462     if ( aExp == 0 ) {
4463         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4464         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4465     }
4466     bSig |= LIT64( 0x8000000000000000 );
4467     zSign = aSign;
4468     expDiff = aExp - bExp;
4469     aSig1 = 0;
4470     if ( expDiff < 0 ) {
4471         if ( expDiff < -1 ) return a;
4472         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4473         expDiff = 0;
4474     }
4475     q = ( bSig <= aSig0 );
4476     if ( q ) aSig0 -= bSig;
4477     expDiff -= 64;
4478     while ( 0 < expDiff ) {
4479         q = estimateDiv128To64( aSig0, aSig1, bSig );
4480         q = ( 2 < q ) ? q - 2 : 0;
4481         mul64To128( bSig, q, &term0, &term1 );
4482         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4483         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4484         expDiff -= 62;
4485     }
4486     expDiff += 64;
4487     if ( 0 < expDiff ) {
4488         q = estimateDiv128To64( aSig0, aSig1, bSig );
4489         q = ( 2 < q ) ? q - 2 : 0;
4490         q >>= 64 - expDiff;
4491         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4492         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4493         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4494         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4495             ++q;
4496             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4497         }
4498     }
4499     else {
4500         term1 = 0;
4501         term0 = bSig;
4502     }
4503     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4504     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4505          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4506               && ( q & 1 ) )
4507        ) {
4508         aSig0 = alternateASig0;
4509         aSig1 = alternateASig1;
4510         zSign = ! zSign;
4511     }
4512     return
4513         normalizeRoundAndPackFloatx80(
4514             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4515
4516 }
4517
4518 /*----------------------------------------------------------------------------
4519 | Returns the square root of the extended double-precision floating-point
4520 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4521 | for Binary Floating-Point Arithmetic.
4522 *----------------------------------------------------------------------------*/
4523
4524 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4525 {
4526     flag aSign;
4527     int32 aExp, zExp;
4528     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4529     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4530     floatx80 z;
4531
4532     aSig0 = extractFloatx80Frac( a );
4533     aExp = extractFloatx80Exp( a );
4534     aSign = extractFloatx80Sign( a );
4535     if ( aExp == 0x7FFF ) {
4536         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4537         if ( ! aSign ) return a;
4538         goto invalid;
4539     }
4540     if ( aSign ) {
4541         if ( ( aExp | aSig0 ) == 0 ) return a;
4542  invalid:
4543         float_raise( float_flag_invalid STATUS_VAR);
4544         z.low = floatx80_default_nan_low;
4545         z.high = floatx80_default_nan_high;
4546         return z;
4547     }
4548     if ( aExp == 0 ) {
4549         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4550         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4551     }
4552     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4553     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4554     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4555     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4556     doubleZSig0 = zSig0<<1;
4557     mul64To128( zSig0, zSig0, &term0, &term1 );
4558     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4559     while ( (int64_t) rem0 < 0 ) {
4560         --zSig0;
4561         doubleZSig0 -= 2;
4562         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4563     }
4564     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4565     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4566         if ( zSig1 == 0 ) zSig1 = 1;
4567         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4568         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4569         mul64To128( zSig1, zSig1, &term2, &term3 );
4570         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4571         while ( (int64_t) rem1 < 0 ) {
4572             --zSig1;
4573             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4574             term3 |= 1;
4575             term2 |= doubleZSig0;
4576             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4577         }
4578         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4579     }
4580     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4581     zSig0 |= doubleZSig0;
4582     return
4583         roundAndPackFloatx80(
4584             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4585
4586 }
4587
4588 /*----------------------------------------------------------------------------
4589 | Returns 1 if the extended double-precision floating-point value `a' is
4590 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4591 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4592 | Arithmetic.
4593 *----------------------------------------------------------------------------*/
4594
4595 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4596 {
4597
4598     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4599               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4600          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4601               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4602        ) {
4603         if (    floatx80_is_signaling_nan( a )
4604              || floatx80_is_signaling_nan( b ) ) {
4605             float_raise( float_flag_invalid STATUS_VAR);
4606         }
4607         return 0;
4608     }
4609     return
4610            ( a.low == b.low )
4611         && (    ( a.high == b.high )
4612              || (    ( a.low == 0 )
4613                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4614            );
4615
4616 }
4617
4618 /*----------------------------------------------------------------------------
4619 | Returns 1 if the extended double-precision floating-point value `a' is
4620 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4621 | comparison is performed according to the IEC/IEEE Standard for Binary
4622 | Floating-Point Arithmetic.
4623 *----------------------------------------------------------------------------*/
4624
4625 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4626 {
4627     flag aSign, bSign;
4628
4629     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4630               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4631          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4632               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4633        ) {
4634         float_raise( float_flag_invalid STATUS_VAR);
4635         return 0;
4636     }
4637     aSign = extractFloatx80Sign( a );
4638     bSign = extractFloatx80Sign( b );
4639     if ( aSign != bSign ) {
4640         return
4641                aSign
4642             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4643                  == 0 );
4644     }
4645     return
4646           aSign ? le128( b.high, b.low, a.high, a.low )
4647         : le128( a.high, a.low, b.high, b.low );
4648
4649 }
4650
4651 /*----------------------------------------------------------------------------
4652 | Returns 1 if the extended double-precision floating-point value `a' is
4653 | less than the corresponding value `b', and 0 otherwise.  The comparison
4654 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4655 | Arithmetic.
4656 *----------------------------------------------------------------------------*/
4657
4658 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4659 {
4660     flag aSign, bSign;
4661
4662     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4663               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4664          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4665               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4666        ) {
4667         float_raise( float_flag_invalid STATUS_VAR);
4668         return 0;
4669     }
4670     aSign = extractFloatx80Sign( a );
4671     bSign = extractFloatx80Sign( b );
4672     if ( aSign != bSign ) {
4673         return
4674                aSign
4675             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4676                  != 0 );
4677     }
4678     return
4679           aSign ? lt128( b.high, b.low, a.high, a.low )
4680         : lt128( a.high, a.low, b.high, b.low );
4681
4682 }
4683
4684 /*----------------------------------------------------------------------------
4685 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4686 | cannot be compared, and 0 otherwise.  The comparison is performed according
4687 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4688 *----------------------------------------------------------------------------*/
4689 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4690 {
4691     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4692               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4693          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4694               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4695        ) {
4696         float_raise( float_flag_invalid STATUS_VAR);
4697         return 1;
4698     }
4699     return 0;
4700 }
4701
4702 /*----------------------------------------------------------------------------
4703 | Returns 1 if the extended double-precision floating-point value `a' is equal
4704 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4705 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4706 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4707 *----------------------------------------------------------------------------*/
4708
4709 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4710 {
4711
4712     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4713               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4714          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4715               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4716        ) {
4717         float_raise( float_flag_invalid STATUS_VAR);
4718         return 0;
4719     }
4720     return
4721            ( a.low == b.low )
4722         && (    ( a.high == b.high )
4723              || (    ( a.low == 0 )
4724                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4725            );
4726
4727 }
4728
4729 /*----------------------------------------------------------------------------
4730 | Returns 1 if the extended double-precision floating-point value `a' is less
4731 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4732 | do not cause an exception.  Otherwise, the comparison is performed according
4733 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4734 *----------------------------------------------------------------------------*/
4735
4736 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4737 {
4738     flag aSign, bSign;
4739
4740     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4741               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4742          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4743               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4744        ) {
4745         if (    floatx80_is_signaling_nan( a )
4746              || floatx80_is_signaling_nan( b ) ) {
4747             float_raise( float_flag_invalid STATUS_VAR);
4748         }
4749         return 0;
4750     }
4751     aSign = extractFloatx80Sign( a );
4752     bSign = extractFloatx80Sign( b );
4753     if ( aSign != bSign ) {
4754         return
4755                aSign
4756             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4757                  == 0 );
4758     }
4759     return
4760           aSign ? le128( b.high, b.low, a.high, a.low )
4761         : le128( a.high, a.low, b.high, b.low );
4762
4763 }
4764
4765 /*----------------------------------------------------------------------------
4766 | Returns 1 if the extended double-precision floating-point value `a' is less
4767 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4768 | an exception.  Otherwise, the comparison is performed according to the
4769 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4770 *----------------------------------------------------------------------------*/
4771
4772 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4773 {
4774     flag aSign, bSign;
4775
4776     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4777               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4778          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4779               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4780        ) {
4781         if (    floatx80_is_signaling_nan( a )
4782              || floatx80_is_signaling_nan( b ) ) {
4783             float_raise( float_flag_invalid STATUS_VAR);
4784         }
4785         return 0;
4786     }
4787     aSign = extractFloatx80Sign( a );
4788     bSign = extractFloatx80Sign( b );
4789     if ( aSign != bSign ) {
4790         return
4791                aSign
4792             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4793                  != 0 );
4794     }
4795     return
4796           aSign ? lt128( b.high, b.low, a.high, a.low )
4797         : lt128( a.high, a.low, b.high, b.low );
4798
4799 }
4800
4801 /*----------------------------------------------------------------------------
4802 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4803 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
4804 | The comparison is performed according to the IEC/IEEE Standard for Binary
4805 | Floating-Point Arithmetic.
4806 *----------------------------------------------------------------------------*/
4807 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4808 {
4809     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4810               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4811          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4812               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4813        ) {
4814         if (    floatx80_is_signaling_nan( a )
4815              || floatx80_is_signaling_nan( b ) ) {
4816             float_raise( float_flag_invalid STATUS_VAR);
4817         }
4818         return 1;
4819     }
4820     return 0;
4821 }
4822
4823 #endif
4824
4825 #ifdef FLOAT128
4826
4827 /*----------------------------------------------------------------------------
4828 | Returns the result of converting the quadruple-precision floating-point
4829 | value `a' to the 32-bit two's complement integer format.  The conversion
4830 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4831 | Arithmetic---which means in particular that the conversion is rounded
4832 | according to the current rounding mode.  If `a' is a NaN, the largest
4833 | positive integer is returned.  Otherwise, if the conversion overflows, the
4834 | largest integer with the same sign as `a' is returned.
4835 *----------------------------------------------------------------------------*/
4836
4837 int32 float128_to_int32( float128 a STATUS_PARAM )
4838 {
4839     flag aSign;
4840     int32 aExp, shiftCount;
4841     uint64_t aSig0, aSig1;
4842
4843     aSig1 = extractFloat128Frac1( a );
4844     aSig0 = extractFloat128Frac0( a );
4845     aExp = extractFloat128Exp( a );
4846     aSign = extractFloat128Sign( a );
4847     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4848     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4849     aSig0 |= ( aSig1 != 0 );
4850     shiftCount = 0x4028 - aExp;
4851     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4852     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4853
4854 }
4855
4856 /*----------------------------------------------------------------------------
4857 | Returns the result of converting the quadruple-precision floating-point
4858 | value `a' to the 32-bit two's complement integer format.  The conversion
4859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4860 | Arithmetic, except that the conversion is always rounded toward zero.  If
4861 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4862 | conversion overflows, the largest integer with the same sign as `a' is
4863 | returned.
4864 *----------------------------------------------------------------------------*/
4865
4866 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4867 {
4868     flag aSign;
4869     int32 aExp, shiftCount;
4870     uint64_t aSig0, aSig1, savedASig;
4871     int32 z;
4872
4873     aSig1 = extractFloat128Frac1( a );
4874     aSig0 = extractFloat128Frac0( a );
4875     aExp = extractFloat128Exp( a );
4876     aSign = extractFloat128Sign( a );
4877     aSig0 |= ( aSig1 != 0 );
4878     if ( 0x401E < aExp ) {
4879         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4880         goto invalid;
4881     }
4882     else if ( aExp < 0x3FFF ) {
4883         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4884         return 0;
4885     }
4886     aSig0 |= LIT64( 0x0001000000000000 );
4887     shiftCount = 0x402F - aExp;
4888     savedASig = aSig0;
4889     aSig0 >>= shiftCount;
4890     z = aSig0;
4891     if ( aSign ) z = - z;
4892     if ( ( z < 0 ) ^ aSign ) {
4893  invalid:
4894         float_raise( float_flag_invalid STATUS_VAR);
4895         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4896     }
4897     if ( ( aSig0<<shiftCount ) != savedASig ) {
4898         STATUS(float_exception_flags) |= float_flag_inexact;
4899     }
4900     return z;
4901
4902 }
4903
4904 /*----------------------------------------------------------------------------
4905 | Returns the result of converting the quadruple-precision floating-point
4906 | value `a' to the 64-bit two's complement integer format.  The conversion
4907 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4908 | Arithmetic---which means in particular that the conversion is rounded
4909 | according to the current rounding mode.  If `a' is a NaN, the largest
4910 | positive integer is returned.  Otherwise, if the conversion overflows, the
4911 | largest integer with the same sign as `a' is returned.
4912 *----------------------------------------------------------------------------*/
4913
4914 int64 float128_to_int64( float128 a STATUS_PARAM )
4915 {
4916     flag aSign;
4917     int32 aExp, shiftCount;
4918     uint64_t aSig0, aSig1;
4919
4920     aSig1 = extractFloat128Frac1( a );
4921     aSig0 = extractFloat128Frac0( a );
4922     aExp = extractFloat128Exp( a );
4923     aSign = extractFloat128Sign( a );
4924     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4925     shiftCount = 0x402F - aExp;
4926     if ( shiftCount <= 0 ) {
4927         if ( 0x403E < aExp ) {
4928             float_raise( float_flag_invalid STATUS_VAR);
4929             if (    ! aSign
4930                  || (    ( aExp == 0x7FFF )
4931                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4932                     )
4933                ) {
4934                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4935             }
4936             return (int64_t) LIT64( 0x8000000000000000 );
4937         }
4938         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4939     }
4940     else {
4941         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4942     }
4943     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4944
4945 }
4946
4947 /*----------------------------------------------------------------------------
4948 | Returns the result of converting the quadruple-precision floating-point
4949 | value `a' to the 64-bit two's complement integer format.  The conversion
4950 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4951 | Arithmetic, except that the conversion is always rounded toward zero.
4952 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4953 | the conversion overflows, the largest integer with the same sign as `a' is
4954 | returned.
4955 *----------------------------------------------------------------------------*/
4956
4957 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4958 {
4959     flag aSign;
4960     int32 aExp, shiftCount;
4961     uint64_t aSig0, aSig1;
4962     int64 z;
4963
4964     aSig1 = extractFloat128Frac1( a );
4965     aSig0 = extractFloat128Frac0( a );
4966     aExp = extractFloat128Exp( a );
4967     aSign = extractFloat128Sign( a );
4968     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4969     shiftCount = aExp - 0x402F;
4970     if ( 0 < shiftCount ) {
4971         if ( 0x403E <= aExp ) {
4972             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4973             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4974                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4975                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4976             }
4977             else {
4978                 float_raise( float_flag_invalid STATUS_VAR);
4979                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4980                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4981                 }
4982             }
4983             return (int64_t) LIT64( 0x8000000000000000 );
4984         }
4985         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4986         if ( (uint64_t) ( aSig1<<shiftCount ) ) {
4987             STATUS(float_exception_flags) |= float_flag_inexact;
4988         }
4989     }
4990     else {
4991         if ( aExp < 0x3FFF ) {
4992             if ( aExp | aSig0 | aSig1 ) {
4993                 STATUS(float_exception_flags) |= float_flag_inexact;
4994             }
4995             return 0;
4996         }
4997         z = aSig0>>( - shiftCount );
4998         if (    aSig1
4999              || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
5000             STATUS(float_exception_flags) |= float_flag_inexact;
5001         }
5002     }
5003     if ( aSign ) z = - z;
5004     return z;
5005
5006 }
5007
5008 /*----------------------------------------------------------------------------
5009 | Returns the result of converting the quadruple-precision floating-point
5010 | value `a' to the single-precision floating-point format.  The conversion
5011 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5012 | Arithmetic.
5013 *----------------------------------------------------------------------------*/
5014
5015 float32 float128_to_float32( float128 a STATUS_PARAM )
5016 {
5017     flag aSign;
5018     int32 aExp;
5019     uint64_t aSig0, aSig1;
5020     uint32_t zSig;
5021
5022     aSig1 = extractFloat128Frac1( a );
5023     aSig0 = extractFloat128Frac0( a );
5024     aExp = extractFloat128Exp( a );
5025     aSign = extractFloat128Sign( a );
5026     if ( aExp == 0x7FFF ) {
5027         if ( aSig0 | aSig1 ) {
5028             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5029         }
5030         return packFloat32( aSign, 0xFF, 0 );
5031     }
5032     aSig0 |= ( aSig1 != 0 );
5033     shift64RightJamming( aSig0, 18, &aSig0 );
5034     zSig = aSig0;
5035     if ( aExp || zSig ) {
5036         zSig |= 0x40000000;
5037         aExp -= 0x3F81;
5038     }
5039     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5040
5041 }
5042
5043 /*----------------------------------------------------------------------------
5044 | Returns the result of converting the quadruple-precision floating-point
5045 | value `a' to the double-precision floating-point format.  The conversion
5046 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5047 | Arithmetic.
5048 *----------------------------------------------------------------------------*/
5049
5050 float64 float128_to_float64( float128 a STATUS_PARAM )
5051 {
5052     flag aSign;
5053     int32 aExp;
5054     uint64_t aSig0, aSig1;
5055
5056     aSig1 = extractFloat128Frac1( a );
5057     aSig0 = extractFloat128Frac0( a );
5058     aExp = extractFloat128Exp( a );
5059     aSign = extractFloat128Sign( a );
5060     if ( aExp == 0x7FFF ) {
5061         if ( aSig0 | aSig1 ) {
5062             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5063         }
5064         return packFloat64( aSign, 0x7FF, 0 );
5065     }
5066     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5067     aSig0 |= ( aSig1 != 0 );
5068     if ( aExp || aSig0 ) {
5069         aSig0 |= LIT64( 0x4000000000000000 );
5070         aExp -= 0x3C01;
5071     }
5072     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5073
5074 }
5075
5076 #ifdef FLOATX80
5077
5078 /*----------------------------------------------------------------------------
5079 | Returns the result of converting the quadruple-precision floating-point
5080 | value `a' to the extended double-precision floating-point format.  The
5081 | conversion is performed according to the IEC/IEEE Standard for Binary
5082 | Floating-Point Arithmetic.
5083 *----------------------------------------------------------------------------*/
5084
5085 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5086 {
5087     flag aSign;
5088     int32 aExp;
5089     uint64_t aSig0, aSig1;
5090
5091     aSig1 = extractFloat128Frac1( a );
5092     aSig0 = extractFloat128Frac0( a );
5093     aExp = extractFloat128Exp( a );
5094     aSign = extractFloat128Sign( a );
5095     if ( aExp == 0x7FFF ) {
5096         if ( aSig0 | aSig1 ) {
5097             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5098         }
5099         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5100     }
5101     if ( aExp == 0 ) {
5102         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5103         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5104     }
5105     else {
5106         aSig0 |= LIT64( 0x0001000000000000 );
5107     }
5108     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5109     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5110
5111 }
5112
5113 #endif
5114
5115 /*----------------------------------------------------------------------------
5116 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5117 | returns the result as a quadruple-precision floating-point value.  The
5118 | operation is performed according to the IEC/IEEE Standard for Binary
5119 | Floating-Point Arithmetic.
5120 *----------------------------------------------------------------------------*/
5121
5122 float128 float128_round_to_int( float128 a STATUS_PARAM )
5123 {
5124     flag aSign;
5125     int32 aExp;
5126     uint64_t lastBitMask, roundBitsMask;
5127     int8 roundingMode;
5128     float128 z;
5129
5130     aExp = extractFloat128Exp( a );
5131     if ( 0x402F <= aExp ) {
5132         if ( 0x406F <= aExp ) {
5133             if (    ( aExp == 0x7FFF )
5134                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5135                ) {
5136                 return propagateFloat128NaN( a, a STATUS_VAR );
5137             }
5138             return a;
5139         }
5140         lastBitMask = 1;
5141         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5142         roundBitsMask = lastBitMask - 1;
5143         z = a;
5144         roundingMode = STATUS(float_rounding_mode);
5145         if ( roundingMode == float_round_nearest_even ) {
5146             if ( lastBitMask ) {
5147                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5148                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5149             }
5150             else {
5151                 if ( (int64_t) z.low < 0 ) {
5152                     ++z.high;
5153                     if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
5154                 }
5155             }
5156         }
5157         else if ( roundingMode != float_round_to_zero ) {
5158             if (   extractFloat128Sign( z )
5159                  ^ ( roundingMode == float_round_up ) ) {
5160                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5161             }
5162         }
5163         z.low &= ~ roundBitsMask;
5164     }
5165     else {
5166         if ( aExp < 0x3FFF ) {
5167             if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5168             STATUS(float_exception_flags) |= float_flag_inexact;
5169             aSign = extractFloat128Sign( a );
5170             switch ( STATUS(float_rounding_mode) ) {
5171              case float_round_nearest_even:
5172                 if (    ( aExp == 0x3FFE )
5173                      && (   extractFloat128Frac0( a )
5174                           | extractFloat128Frac1( a ) )
5175                    ) {
5176                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5177                 }
5178                 break;
5179              case float_round_down:
5180                 return
5181                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5182                     : packFloat128( 0, 0, 0, 0 );
5183              case float_round_up:
5184                 return
5185                       aSign ? packFloat128( 1, 0, 0, 0 )
5186                     : packFloat128( 0, 0x3FFF, 0, 0 );
5187             }
5188             return packFloat128( aSign, 0, 0, 0 );
5189         }
5190         lastBitMask = 1;
5191         lastBitMask <<= 0x402F - aExp;
5192         roundBitsMask = lastBitMask - 1;
5193         z.low = 0;
5194         z.high = a.high;
5195         roundingMode = STATUS(float_rounding_mode);
5196         if ( roundingMode == float_round_nearest_even ) {
5197             z.high += lastBitMask>>1;
5198             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5199                 z.high &= ~ lastBitMask;
5200             }
5201         }
5202         else if ( roundingMode != float_round_to_zero ) {
5203             if (   extractFloat128Sign( z )
5204                  ^ ( roundingMode == float_round_up ) ) {
5205                 z.high |= ( a.low != 0 );
5206                 z.high += roundBitsMask;
5207             }
5208         }
5209         z.high &= ~ roundBitsMask;
5210     }
5211     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5212         STATUS(float_exception_flags) |= float_flag_inexact;
5213     }
5214     return z;
5215
5216 }
5217
5218 /*----------------------------------------------------------------------------
5219 | Returns the result of adding the absolute values of the quadruple-precision
5220 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5221 | before being returned.  `zSign' is ignored if the result is a NaN.
5222 | The addition is performed according to the IEC/IEEE Standard for Binary
5223 | Floating-Point Arithmetic.
5224 *----------------------------------------------------------------------------*/
5225
5226 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5227 {
5228     int32 aExp, bExp, zExp;
5229     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5230     int32 expDiff;
5231
5232     aSig1 = extractFloat128Frac1( a );
5233     aSig0 = extractFloat128Frac0( a );
5234     aExp = extractFloat128Exp( a );
5235     bSig1 = extractFloat128Frac1( b );
5236     bSig0 = extractFloat128Frac0( b );
5237     bExp = extractFloat128Exp( b );
5238     expDiff = aExp - bExp;
5239     if ( 0 < expDiff ) {
5240         if ( aExp == 0x7FFF ) {
5241             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5242             return a;
5243         }
5244         if ( bExp == 0 ) {
5245             --expDiff;
5246         }
5247         else {
5248             bSig0 |= LIT64( 0x0001000000000000 );
5249         }
5250         shift128ExtraRightJamming(
5251             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5252         zExp = aExp;
5253     }
5254     else if ( expDiff < 0 ) {
5255         if ( bExp == 0x7FFF ) {
5256             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5257             return packFloat128( zSign, 0x7FFF, 0, 0 );
5258         }
5259         if ( aExp == 0 ) {
5260             ++expDiff;
5261         }
5262         else {
5263             aSig0 |= LIT64( 0x0001000000000000 );
5264         }
5265         shift128ExtraRightJamming(
5266             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5267         zExp = bExp;
5268     }
5269     else {
5270         if ( aExp == 0x7FFF ) {
5271             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5272                 return propagateFloat128NaN( a, b STATUS_VAR );
5273             }
5274             return a;
5275         }
5276         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5277         if ( aExp == 0 ) {
5278             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5279             return packFloat128( zSign, 0, zSig0, zSig1 );
5280         }
5281         zSig2 = 0;
5282         zSig0 |= LIT64( 0x0002000000000000 );
5283         zExp = aExp;
5284         goto shiftRight1;
5285     }
5286     aSig0 |= LIT64( 0x0001000000000000 );
5287     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5288     --zExp;
5289     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5290     ++zExp;
5291  shiftRight1:
5292     shift128ExtraRightJamming(
5293         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5294  roundAndPack:
5295     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5296
5297 }
5298
5299 /*----------------------------------------------------------------------------
5300 | Returns the result of subtracting the absolute values of the quadruple-
5301 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5302 | difference is negated before being returned.  `zSign' is ignored if the
5303 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5304 | Standard for Binary Floating-Point Arithmetic.
5305 *----------------------------------------------------------------------------*/
5306
5307 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5308 {
5309     int32 aExp, bExp, zExp;
5310     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5311     int32 expDiff;
5312     float128 z;
5313
5314     aSig1 = extractFloat128Frac1( a );
5315     aSig0 = extractFloat128Frac0( a );
5316     aExp = extractFloat128Exp( a );
5317     bSig1 = extractFloat128Frac1( b );
5318     bSig0 = extractFloat128Frac0( b );
5319     bExp = extractFloat128Exp( b );
5320     expDiff = aExp - bExp;
5321     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5322     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5323     if ( 0 < expDiff ) goto aExpBigger;
5324     if ( expDiff < 0 ) goto bExpBigger;
5325     if ( aExp == 0x7FFF ) {
5326         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5327             return propagateFloat128NaN( a, b STATUS_VAR );
5328         }
5329         float_raise( float_flag_invalid STATUS_VAR);
5330         z.low = float128_default_nan_low;
5331         z.high = float128_default_nan_high;
5332         return z;
5333     }
5334     if ( aExp == 0 ) {
5335         aExp = 1;
5336         bExp = 1;
5337     }
5338     if ( bSig0 < aSig0 ) goto aBigger;
5339     if ( aSig0 < bSig0 ) goto bBigger;
5340     if ( bSig1 < aSig1 ) goto aBigger;
5341     if ( aSig1 < bSig1 ) goto bBigger;
5342     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5343  bExpBigger:
5344     if ( bExp == 0x7FFF ) {
5345         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5346         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5347     }
5348     if ( aExp == 0 ) {
5349         ++expDiff;
5350     }
5351     else {
5352         aSig0 |= LIT64( 0x4000000000000000 );
5353     }
5354     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5355     bSig0 |= LIT64( 0x4000000000000000 );
5356  bBigger:
5357     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5358     zExp = bExp;
5359     zSign ^= 1;
5360     goto normalizeRoundAndPack;
5361  aExpBigger:
5362     if ( aExp == 0x7FFF ) {
5363         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5364         return a;
5365     }
5366     if ( bExp == 0 ) {
5367         --expDiff;
5368     }
5369     else {
5370         bSig0 |= LIT64( 0x4000000000000000 );
5371     }
5372     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5373     aSig0 |= LIT64( 0x4000000000000000 );
5374  aBigger:
5375     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5376     zExp = aExp;
5377  normalizeRoundAndPack:
5378     --zExp;
5379     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5380
5381 }
5382
5383 /*----------------------------------------------------------------------------
5384 | Returns the result of adding the quadruple-precision floating-point values
5385 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5386 | for Binary Floating-Point Arithmetic.
5387 *----------------------------------------------------------------------------*/
5388
5389 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5390 {
5391     flag aSign, bSign;
5392
5393     aSign = extractFloat128Sign( a );
5394     bSign = extractFloat128Sign( b );
5395     if ( aSign == bSign ) {
5396         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5397     }
5398     else {
5399         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5400     }
5401
5402 }
5403
5404 /*----------------------------------------------------------------------------
5405 | Returns the result of subtracting the quadruple-precision floating-point
5406 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5407 | Standard for Binary Floating-Point Arithmetic.
5408 *----------------------------------------------------------------------------*/
5409
5410 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5411 {
5412     flag aSign, bSign;
5413
5414     aSign = extractFloat128Sign( a );
5415     bSign = extractFloat128Sign( b );
5416     if ( aSign == bSign ) {
5417         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5418     }
5419     else {
5420         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5421     }
5422
5423 }
5424
5425 /*----------------------------------------------------------------------------
5426 | Returns the result of multiplying the quadruple-precision floating-point
5427 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5428 | Standard for Binary Floating-Point Arithmetic.
5429 *----------------------------------------------------------------------------*/
5430
5431 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5432 {
5433     flag aSign, bSign, zSign;
5434     int32 aExp, bExp, zExp;
5435     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5436     float128 z;
5437
5438     aSig1 = extractFloat128Frac1( a );
5439     aSig0 = extractFloat128Frac0( a );
5440     aExp = extractFloat128Exp( a );
5441     aSign = extractFloat128Sign( a );
5442     bSig1 = extractFloat128Frac1( b );
5443     bSig0 = extractFloat128Frac0( b );
5444     bExp = extractFloat128Exp( b );
5445     bSign = extractFloat128Sign( b );
5446     zSign = aSign ^ bSign;
5447     if ( aExp == 0x7FFF ) {
5448         if (    ( aSig0 | aSig1 )
5449              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5450             return propagateFloat128NaN( a, b STATUS_VAR );
5451         }
5452         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5453         return packFloat128( zSign, 0x7FFF, 0, 0 );
5454     }
5455     if ( bExp == 0x7FFF ) {
5456         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5457         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5458  invalid:
5459             float_raise( float_flag_invalid STATUS_VAR);
5460             z.low = float128_default_nan_low;
5461             z.high = float128_default_nan_high;
5462             return z;
5463         }
5464         return packFloat128( zSign, 0x7FFF, 0, 0 );
5465     }
5466     if ( aExp == 0 ) {
5467         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5468         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5469     }
5470     if ( bExp == 0 ) {
5471         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5472         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5473     }
5474     zExp = aExp + bExp - 0x4000;
5475     aSig0 |= LIT64( 0x0001000000000000 );
5476     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5477     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5478     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5479     zSig2 |= ( zSig3 != 0 );
5480     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5481         shift128ExtraRightJamming(
5482             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5483         ++zExp;
5484     }
5485     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5486
5487 }
5488
5489 /*----------------------------------------------------------------------------
5490 | Returns the result of dividing the quadruple-precision floating-point value
5491 | `a' by the corresponding value `b'.  The operation is performed according to
5492 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5493 *----------------------------------------------------------------------------*/
5494
5495 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5496 {
5497     flag aSign, bSign, zSign;
5498     int32 aExp, bExp, zExp;
5499     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5500     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5501     float128 z;
5502
5503     aSig1 = extractFloat128Frac1( a );
5504     aSig0 = extractFloat128Frac0( a );
5505     aExp = extractFloat128Exp( a );
5506     aSign = extractFloat128Sign( a );
5507     bSig1 = extractFloat128Frac1( b );
5508     bSig0 = extractFloat128Frac0( b );
5509     bExp = extractFloat128Exp( b );
5510     bSign = extractFloat128Sign( b );
5511     zSign = aSign ^ bSign;
5512     if ( aExp == 0x7FFF ) {
5513         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5514         if ( bExp == 0x7FFF ) {
5515             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5516             goto invalid;
5517         }
5518         return packFloat128( zSign, 0x7FFF, 0, 0 );
5519     }
5520     if ( bExp == 0x7FFF ) {
5521         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5522         return packFloat128( zSign, 0, 0, 0 );
5523     }
5524     if ( bExp == 0 ) {
5525         if ( ( bSig0 | bSig1 ) == 0 ) {
5526             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5527  invalid:
5528                 float_raise( float_flag_invalid STATUS_VAR);
5529                 z.low = float128_default_nan_low;
5530                 z.high = float128_default_nan_high;
5531                 return z;
5532             }
5533             float_raise( float_flag_divbyzero STATUS_VAR);
5534             return packFloat128( zSign, 0x7FFF, 0, 0 );
5535         }
5536         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5537     }
5538     if ( aExp == 0 ) {
5539         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5540         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5541     }
5542     zExp = aExp - bExp + 0x3FFD;
5543     shortShift128Left(
5544         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5545     shortShift128Left(
5546         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5547     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5548         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5549         ++zExp;
5550     }
5551     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5552     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5553     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5554     while ( (int64_t) rem0 < 0 ) {
5555         --zSig0;
5556         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5557     }
5558     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5559     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5560         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5561         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5562         while ( (int64_t) rem1 < 0 ) {
5563             --zSig1;
5564             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5565         }
5566         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5567     }
5568     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5569     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5570
5571 }
5572
5573 /*----------------------------------------------------------------------------
5574 | Returns the remainder of the quadruple-precision floating-point value `a'
5575 | with respect to the corresponding value `b'.  The operation is performed
5576 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5577 *----------------------------------------------------------------------------*/
5578
5579 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5580 {
5581     flag aSign, zSign;
5582     int32 aExp, bExp, expDiff;
5583     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5584     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5585     int64_t sigMean0;
5586     float128 z;
5587
5588     aSig1 = extractFloat128Frac1( a );
5589     aSig0 = extractFloat128Frac0( a );
5590     aExp = extractFloat128Exp( a );
5591     aSign = extractFloat128Sign( a );
5592     bSig1 = extractFloat128Frac1( b );
5593     bSig0 = extractFloat128Frac0( b );
5594     bExp = extractFloat128Exp( b );
5595     if ( aExp == 0x7FFF ) {
5596         if (    ( aSig0 | aSig1 )
5597              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5598             return propagateFloat128NaN( a, b STATUS_VAR );
5599         }
5600         goto invalid;
5601     }
5602     if ( bExp == 0x7FFF ) {
5603         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5604         return a;
5605     }
5606     if ( bExp == 0 ) {
5607         if ( ( bSig0 | bSig1 ) == 0 ) {
5608  invalid:
5609             float_raise( float_flag_invalid STATUS_VAR);
5610             z.low = float128_default_nan_low;
5611             z.high = float128_default_nan_high;
5612             return z;
5613         }
5614         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5615     }
5616     if ( aExp == 0 ) {
5617         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5618         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5619     }
5620     expDiff = aExp - bExp;
5621     if ( expDiff < -1 ) return a;
5622     shortShift128Left(
5623         aSig0 | LIT64( 0x0001000000000000 ),
5624         aSig1,
5625         15 - ( expDiff < 0 ),
5626         &aSig0,
5627         &aSig1
5628     );
5629     shortShift128Left(
5630         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5631     q = le128( bSig0, bSig1, aSig0, aSig1 );
5632     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5633     expDiff -= 64;
5634     while ( 0 < expDiff ) {
5635         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5636         q = ( 4 < q ) ? q - 4 : 0;
5637         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5638         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5639         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5640         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5641         expDiff -= 61;
5642     }
5643     if ( -64 < expDiff ) {
5644         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5645         q = ( 4 < q ) ? q - 4 : 0;
5646         q >>= - expDiff;
5647         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5648         expDiff += 52;
5649         if ( expDiff < 0 ) {
5650             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5651         }
5652         else {
5653             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5654         }
5655         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5656         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5657     }
5658     else {
5659         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5660         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5661     }
5662     do {
5663         alternateASig0 = aSig0;
5664         alternateASig1 = aSig1;
5665         ++q;
5666         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5667     } while ( 0 <= (int64_t) aSig0 );
5668     add128(
5669         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5670     if (    ( sigMean0 < 0 )
5671          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5672         aSig0 = alternateASig0;
5673         aSig1 = alternateASig1;
5674     }
5675     zSign = ( (int64_t) aSig0 < 0 );
5676     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5677     return
5678         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5679
5680 }
5681
5682 /*----------------------------------------------------------------------------
5683 | Returns the square root of the quadruple-precision floating-point value `a'.
5684 | The operation is performed according to the IEC/IEEE Standard for Binary
5685 | Floating-Point Arithmetic.
5686 *----------------------------------------------------------------------------*/
5687
5688 float128 float128_sqrt( float128 a STATUS_PARAM )
5689 {
5690     flag aSign;
5691     int32 aExp, zExp;
5692     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5693     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5694     float128 z;
5695
5696     aSig1 = extractFloat128Frac1( a );
5697     aSig0 = extractFloat128Frac0( a );
5698     aExp = extractFloat128Exp( a );
5699     aSign = extractFloat128Sign( a );
5700     if ( aExp == 0x7FFF ) {
5701         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5702         if ( ! aSign ) return a;
5703         goto invalid;
5704     }
5705     if ( aSign ) {
5706         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5707  invalid:
5708         float_raise( float_flag_invalid STATUS_VAR);
5709         z.low = float128_default_nan_low;
5710         z.high = float128_default_nan_high;
5711         return z;
5712     }
5713     if ( aExp == 0 ) {
5714         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5715         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5716     }
5717     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5718     aSig0 |= LIT64( 0x0001000000000000 );
5719     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5720     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5721     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5722     doubleZSig0 = zSig0<<1;
5723     mul64To128( zSig0, zSig0, &term0, &term1 );
5724     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5725     while ( (int64_t) rem0 < 0 ) {
5726         --zSig0;
5727         doubleZSig0 -= 2;
5728         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5729     }
5730     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5731     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5732         if ( zSig1 == 0 ) zSig1 = 1;
5733         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5734         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5735         mul64To128( zSig1, zSig1, &term2, &term3 );
5736         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5737         while ( (int64_t) rem1 < 0 ) {
5738             --zSig1;
5739             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5740             term3 |= 1;
5741             term2 |= doubleZSig0;
5742             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5743         }
5744         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5745     }
5746     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5747     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5748
5749 }
5750
5751 /*----------------------------------------------------------------------------
5752 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5753 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5754 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5755 *----------------------------------------------------------------------------*/
5756
5757 int float128_eq( float128 a, float128 b STATUS_PARAM )
5758 {
5759
5760     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5761               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5762          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5763               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5764        ) {
5765         if (    float128_is_signaling_nan( a )
5766              || float128_is_signaling_nan( b ) ) {
5767             float_raise( float_flag_invalid STATUS_VAR);
5768         }
5769         return 0;
5770     }
5771     return
5772            ( a.low == b.low )
5773         && (    ( a.high == b.high )
5774              || (    ( a.low == 0 )
5775                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5776            );
5777
5778 }
5779
5780 /*----------------------------------------------------------------------------
5781 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5782 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5783 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5784 | Arithmetic.
5785 *----------------------------------------------------------------------------*/
5786
5787 int float128_le( float128 a, float128 b STATUS_PARAM )
5788 {
5789     flag aSign, bSign;
5790
5791     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5792               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5793          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5794               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5795        ) {
5796         float_raise( float_flag_invalid STATUS_VAR);
5797         return 0;
5798     }
5799     aSign = extractFloat128Sign( a );
5800     bSign = extractFloat128Sign( b );
5801     if ( aSign != bSign ) {
5802         return
5803                aSign
5804             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5805                  == 0 );
5806     }
5807     return
5808           aSign ? le128( b.high, b.low, a.high, a.low )
5809         : le128( a.high, a.low, b.high, b.low );
5810
5811 }
5812
5813 /*----------------------------------------------------------------------------
5814 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5815 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5817 *----------------------------------------------------------------------------*/
5818
5819 int float128_lt( float128 a, float128 b STATUS_PARAM )
5820 {
5821     flag aSign, bSign;
5822
5823     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5824               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5825          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5826               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5827        ) {
5828         float_raise( float_flag_invalid STATUS_VAR);
5829         return 0;
5830     }
5831     aSign = extractFloat128Sign( a );
5832     bSign = extractFloat128Sign( b );
5833     if ( aSign != bSign ) {
5834         return
5835                aSign
5836             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5837                  != 0 );
5838     }
5839     return
5840           aSign ? lt128( b.high, b.low, a.high, a.low )
5841         : lt128( a.high, a.low, b.high, b.low );
5842
5843 }
5844
5845 /*----------------------------------------------------------------------------
5846 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5847 | be compared, and 0 otherwise.  The comparison is performed according to the
5848 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5849 *----------------------------------------------------------------------------*/
5850
5851 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5852 {
5853     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5854               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5855          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5856               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5857        ) {
5858         float_raise( float_flag_invalid STATUS_VAR);
5859         return 1;
5860     }
5861     return 0;
5862 }
5863
5864 /*----------------------------------------------------------------------------
5865 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5866 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5867 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5868 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5869 *----------------------------------------------------------------------------*/
5870
5871 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5872 {
5873
5874     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5875               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5876          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5877               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5878        ) {
5879         float_raise( float_flag_invalid STATUS_VAR);
5880         return 0;
5881     }
5882     return
5883            ( a.low == b.low )
5884         && (    ( a.high == b.high )
5885              || (    ( a.low == 0 )
5886                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5887            );
5888
5889 }
5890
5891 /*----------------------------------------------------------------------------
5892 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5893 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5894 | cause an exception.  Otherwise, the comparison is performed according to the
5895 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5896 *----------------------------------------------------------------------------*/
5897
5898 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5899 {
5900     flag aSign, bSign;
5901
5902     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5903               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5904          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5905               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5906        ) {
5907         if (    float128_is_signaling_nan( a )
5908              || float128_is_signaling_nan( b ) ) {
5909             float_raise( float_flag_invalid STATUS_VAR);
5910         }
5911         return 0;
5912     }
5913     aSign = extractFloat128Sign( a );
5914     bSign = extractFloat128Sign( b );
5915     if ( aSign != bSign ) {
5916         return
5917                aSign
5918             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5919                  == 0 );
5920     }
5921     return
5922           aSign ? le128( b.high, b.low, a.high, a.low )
5923         : le128( a.high, a.low, b.high, b.low );
5924
5925 }
5926
5927 /*----------------------------------------------------------------------------
5928 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5929 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5930 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5931 | Standard for Binary Floating-Point Arithmetic.
5932 *----------------------------------------------------------------------------*/
5933
5934 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5935 {
5936     flag aSign, bSign;
5937
5938     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5939               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5940          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5941               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5942        ) {
5943         if (    float128_is_signaling_nan( a )
5944              || float128_is_signaling_nan( b ) ) {
5945             float_raise( float_flag_invalid STATUS_VAR);
5946         }
5947         return 0;
5948     }
5949     aSign = extractFloat128Sign( a );
5950     bSign = extractFloat128Sign( b );
5951     if ( aSign != bSign ) {
5952         return
5953                aSign
5954             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5955                  != 0 );
5956     }
5957     return
5958           aSign ? lt128( b.high, b.low, a.high, a.low )
5959         : lt128( a.high, a.low, b.high, b.low );
5960
5961 }
5962
5963 /*----------------------------------------------------------------------------
5964 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5965 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
5966 | comparison is performed according to the IEC/IEEE Standard for Binary
5967 | Floating-Point Arithmetic.
5968 *----------------------------------------------------------------------------*/
5969
5970 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
5971 {
5972     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5973               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5974          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5975               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5976        ) {
5977         if (    float128_is_signaling_nan( a )
5978              || float128_is_signaling_nan( b ) ) {
5979             float_raise( float_flag_invalid STATUS_VAR);
5980         }
5981         return 1;
5982     }
5983     return 0;
5984 }
5985
5986 #endif
5987
5988 /* misc functions */
5989 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5990 {
5991     return int64_to_float32(a STATUS_VAR);
5992 }
5993
5994 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5995 {
5996     return int64_to_float64(a STATUS_VAR);
5997 }
5998
5999 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
6000 {
6001     int64_t v;
6002     unsigned int res;
6003
6004     v = float32_to_int64(a STATUS_VAR);
6005     if (v < 0) {
6006         res = 0;
6007         float_raise( float_flag_invalid STATUS_VAR);
6008     } else if (v > 0xffffffff) {
6009         res = 0xffffffff;
6010         float_raise( float_flag_invalid STATUS_VAR);
6011     } else {
6012         res = v;
6013     }
6014     return res;
6015 }
6016
6017 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6018 {
6019     int64_t v;
6020     unsigned int res;
6021
6022     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6023     if (v < 0) {
6024         res = 0;
6025         float_raise( float_flag_invalid STATUS_VAR);
6026     } else if (v > 0xffffffff) {
6027         res = 0xffffffff;
6028         float_raise( float_flag_invalid STATUS_VAR);
6029     } else {
6030         res = v;
6031     }
6032     return res;
6033 }
6034
6035 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6036 {
6037     int64_t v;
6038     unsigned int res;
6039
6040     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6041     if (v < 0) {
6042         res = 0;
6043         float_raise( float_flag_invalid STATUS_VAR);
6044     } else if (v > 0xffff) {
6045         res = 0xffff;
6046         float_raise( float_flag_invalid STATUS_VAR);
6047     } else {
6048         res = v;
6049     }
6050     return res;
6051 }
6052
6053 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6054 {
6055     int64_t v;
6056     unsigned int res;
6057
6058     v = float64_to_int64(a STATUS_VAR);
6059     if (v < 0) {
6060         res = 0;
6061         float_raise( float_flag_invalid STATUS_VAR);
6062     } else if (v > 0xffffffff) {
6063         res = 0xffffffff;
6064         float_raise( float_flag_invalid STATUS_VAR);
6065     } else {
6066         res = v;
6067     }
6068     return res;
6069 }
6070
6071 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6072 {
6073     int64_t v;
6074     unsigned int res;
6075
6076     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6077     if (v < 0) {
6078         res = 0;
6079         float_raise( float_flag_invalid STATUS_VAR);
6080     } else if (v > 0xffffffff) {
6081         res = 0xffffffff;
6082         float_raise( float_flag_invalid STATUS_VAR);
6083     } else {
6084         res = v;
6085     }
6086     return res;
6087 }
6088
6089 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6090 {
6091     int64_t v;
6092     unsigned int res;
6093
6094     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6095     if (v < 0) {
6096         res = 0;
6097         float_raise( float_flag_invalid STATUS_VAR);
6098     } else if (v > 0xffff) {
6099         res = 0xffff;
6100         float_raise( float_flag_invalid STATUS_VAR);
6101     } else {
6102         res = v;
6103     }
6104     return res;
6105 }
6106
6107 /* FIXME: This looks broken.  */
6108 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6109 {
6110     int64_t v;
6111
6112     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6113     v += float64_val(a);
6114     v = float64_to_int64(make_float64(v) STATUS_VAR);
6115
6116     return v - INT64_MIN;
6117 }
6118
6119 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6120 {
6121     int64_t v;
6122
6123     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6124     v += float64_val(a);
6125     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6126
6127     return v - INT64_MIN;
6128 }
6129
6130 #define COMPARE(s, nan_exp)                                                  \
6131 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6132                                       int is_quiet STATUS_PARAM )            \
6133 {                                                                            \
6134     flag aSign, bSign;                                                       \
6135     uint ## s ## _t av, bv;                                                  \
6136     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6137     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6138                                                                              \
6139     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6140          extractFloat ## s ## Frac( a ) ) ||                                 \
6141         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6142           extractFloat ## s ## Frac( b ) )) {                                \
6143         if (!is_quiet ||                                                     \
6144             float ## s ## _is_signaling_nan( a ) ||                          \
6145             float ## s ## _is_signaling_nan( b ) ) {                         \
6146             float_raise( float_flag_invalid STATUS_VAR);                     \
6147         }                                                                    \
6148         return float_relation_unordered;                                     \
6149     }                                                                        \
6150     aSign = extractFloat ## s ## Sign( a );                                  \
6151     bSign = extractFloat ## s ## Sign( b );                                  \
6152     av = float ## s ## _val(a);                                              \
6153     bv = float ## s ## _val(b);                                              \
6154     if ( aSign != bSign ) {                                                  \
6155         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6156             /* zero case */                                                  \
6157             return float_relation_equal;                                     \
6158         } else {                                                             \
6159             return 1 - (2 * aSign);                                          \
6160         }                                                                    \
6161     } else {                                                                 \
6162         if (av == bv) {                                                      \
6163             return float_relation_equal;                                     \
6164         } else {                                                             \
6165             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6166         }                                                                    \
6167     }                                                                        \
6168 }                                                                            \
6169                                                                              \
6170 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6171 {                                                                            \
6172     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6173 }                                                                            \
6174                                                                              \
6175 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6176 {                                                                            \
6177     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6178 }
6179
6180 COMPARE(32, 0xff)
6181 COMPARE(64, 0x7ff)
6182
6183 INLINE int float128_compare_internal( float128 a, float128 b,
6184                                       int is_quiet STATUS_PARAM )
6185 {
6186     flag aSign, bSign;
6187
6188     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6189           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6190         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6191           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6192         if (!is_quiet ||
6193             float128_is_signaling_nan( a ) ||
6194             float128_is_signaling_nan( b ) ) {
6195             float_raise( float_flag_invalid STATUS_VAR);
6196         }
6197         return float_relation_unordered;
6198     }
6199     aSign = extractFloat128Sign( a );
6200     bSign = extractFloat128Sign( b );
6201     if ( aSign != bSign ) {
6202         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6203             /* zero case */
6204             return float_relation_equal;
6205         } else {
6206             return 1 - (2 * aSign);
6207         }
6208     } else {
6209         if (a.low == b.low && a.high == b.high) {
6210             return float_relation_equal;
6211         } else {
6212             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6213         }
6214     }
6215 }
6216
6217 int float128_compare( float128 a, float128 b STATUS_PARAM )
6218 {
6219     return float128_compare_internal(a, b, 0 STATUS_VAR);
6220 }
6221
6222 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6223 {
6224     return float128_compare_internal(a, b, 1 STATUS_VAR);
6225 }
6226
6227 /* min() and max() functions. These can't be implemented as
6228  * 'compare and pick one input' because that would mishandle
6229  * NaNs and +0 vs -0.
6230  */
6231 #define MINMAX(s, nan_exp)                                              \
6232 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6233                                         int ismin STATUS_PARAM )        \
6234 {                                                                       \
6235     flag aSign, bSign;                                                  \
6236     uint ## s ## _t av, bv;                                             \
6237     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6238     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6239     if (float ## s ## _is_any_nan(a) ||                                 \
6240         float ## s ## _is_any_nan(b)) {                                 \
6241         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6242     }                                                                   \
6243     aSign = extractFloat ## s ## Sign(a);                               \
6244     bSign = extractFloat ## s ## Sign(b);                               \
6245     av = float ## s ## _val(a);                                         \
6246     bv = float ## s ## _val(b);                                         \
6247     if (aSign != bSign) {                                               \
6248         if (ismin) {                                                    \
6249             return aSign ? a : b;                                       \
6250         } else {                                                        \
6251             return aSign ? b : a;                                       \
6252         }                                                               \
6253     } else {                                                            \
6254         if (ismin) {                                                    \
6255             return (aSign ^ (av < bv)) ? a : b;                         \
6256         } else {                                                        \
6257             return (aSign ^ (av < bv)) ? b : a;                         \
6258         }                                                               \
6259     }                                                                   \
6260 }                                                                       \
6261                                                                         \
6262 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6263 {                                                                       \
6264     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6265 }                                                                       \
6266                                                                         \
6267 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6268 {                                                                       \
6269     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6270 }
6271
6272 MINMAX(32, 0xff)
6273 MINMAX(64, 0x7ff)
6274
6275
6276 /* Multiply A by 2 raised to the power N.  */
6277 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6278 {
6279     flag aSign;
6280     int16 aExp;
6281     uint32_t aSig;
6282
6283     a = float32_squash_input_denormal(a STATUS_VAR);
6284     aSig = extractFloat32Frac( a );
6285     aExp = extractFloat32Exp( a );
6286     aSign = extractFloat32Sign( a );
6287
6288     if ( aExp == 0xFF ) {
6289         return a;
6290     }
6291     if ( aExp != 0 )
6292         aSig |= 0x00800000;
6293     else if ( aSig == 0 )
6294         return a;
6295
6296     aExp += n - 1;
6297     aSig <<= 7;
6298     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6299 }
6300
6301 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6302 {
6303     flag aSign;
6304     int16 aExp;
6305     uint64_t aSig;
6306
6307     a = float64_squash_input_denormal(a STATUS_VAR);
6308     aSig = extractFloat64Frac( a );
6309     aExp = extractFloat64Exp( a );
6310     aSign = extractFloat64Sign( a );
6311
6312     if ( aExp == 0x7FF ) {
6313         return a;
6314     }
6315     if ( aExp != 0 )
6316         aSig |= LIT64( 0x0010000000000000 );
6317     else if ( aSig == 0 )
6318         return a;
6319
6320     aExp += n - 1;
6321     aSig <<= 10;
6322     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6323 }
6324
6325 #ifdef FLOATX80
6326 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6327 {
6328     flag aSign;
6329     int16 aExp;
6330     uint64_t aSig;
6331
6332     aSig = extractFloatx80Frac( a );
6333     aExp = extractFloatx80Exp( a );
6334     aSign = extractFloatx80Sign( a );
6335
6336     if ( aExp == 0x7FF ) {
6337         return a;
6338     }
6339     if (aExp == 0 && aSig == 0)
6340         return a;
6341
6342     aExp += n;
6343     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6344                                           aSign, aExp, aSig, 0 STATUS_VAR );
6345 }
6346 #endif
6347
6348 #ifdef FLOAT128
6349 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6350 {
6351     flag aSign;
6352     int32 aExp;
6353     uint64_t aSig0, aSig1;
6354
6355     aSig1 = extractFloat128Frac1( a );
6356     aSig0 = extractFloat128Frac0( a );
6357     aExp = extractFloat128Exp( a );
6358     aSign = extractFloat128Sign( a );
6359     if ( aExp == 0x7FFF ) {
6360         return a;
6361     }
6362     if ( aExp != 0 )
6363         aSig0 |= LIT64( 0x0001000000000000 );
6364     else if ( aSig0 == 0 && aSig1 == 0 )
6365         return a;
6366
6367     aExp += n - 1;
6368     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6369                                           STATUS_VAR );
6370
6371 }
6372 #endif