fpu/softfloat.c

   1 /*
   2  * QEMU float support
   3  *
   4  * Derived from SoftFloat.
   5  */
   6
   7 /*============================================================================
   8
   9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
  10 Package, Release 2b.
  11
  12 Written by John R. Hauser.  This work was made possible in part by the
  13 International Computer Science Institute, located at Suite 600, 1947 Center
  14 Street, Berkeley, California 94704.  Funding was partially provided by the
  15 National Science Foundation under grant MIP-9311980.  The original version
  16 of this code was written as part of a project to build a fixed-point vector
  17 processor in collaboration with the University of California at Berkeley,
  18 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  20 arithmetic/SoftFloat.html'.
  21
  22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  24 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  30
  31 Derivative works are acceptable, even for commercial purposes, so long as
  32 (1) the source code for the derivative work includes prominent notice that
  33 the work is derivative, and (2) the source code includes prominent notice with
  34 these four paragraphs for those parts of this code that are retained.
  35
  36 =============================================================================*/
  37
  38 #include "softfloat.h"
  39
  40 /*----------------------------------------------------------------------------
  41 | Primitive arithmetic functions, including multi-word arithmetic, and
  42 | division and square root approximations.  (Can be specialized to target if
  43 | desired.)
  44 *----------------------------------------------------------------------------*/
  45 #include "softfloat-macros.h"
  46
  47 /*----------------------------------------------------------------------------
  48 | Functions and definitions to determine:  (1) whether tininess for underflow
  49 | is detected before or after rounding by default, (2) what (if anything)
  50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  52 | are propagated from function inputs to output.  These details are target-
  53 | specific.
  54 *----------------------------------------------------------------------------*/
  55 #include "softfloat-specialize.h"
  56
  57 void set_float_rounding_mode(int val STATUS_PARAM)
  58 {
  59     STATUS(float_rounding_mode) = val;
  60 }
  61
  62 void set_float_exception_flags(int val STATUS_PARAM)
  63 {
  64     STATUS(float_exception_flags) = val;
  65 }
  66
  67 #ifdef FLOATX80
  68 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  69 {
  70     STATUS(floatx80_rounding_precision) = val;
  71 }
  72 #endif
  73
  74 /*----------------------------------------------------------------------------
  75 | Returns the fraction bits of the half-precision floating-point value `a'.
  76 *----------------------------------------------------------------------------*/
  77
  78 INLINE uint32_t extractFloat16Frac(float16 a)
  79 {
  80     return float16_val(a) & 0x3ff;
  81 }
  82
  83 /*----------------------------------------------------------------------------
  84 | Returns the exponent bits of the half-precision floating-point value `a'.
  85 *----------------------------------------------------------------------------*/
  86
  87 INLINE int16 extractFloat16Exp(float16 a)
  88 {
  89     return (float16_val(a) >> 10) & 0x1f;
  90 }
  91
  92 /*----------------------------------------------------------------------------
   93 | Returns the sign bit of the half-precision floating-point value `a'.
  94 *----------------------------------------------------------------------------*/
  95
  96 INLINE flag extractFloat16Sign(float16 a)
  97 {
  98     return float16_val(a)>>15;
  99 }
 100
 101 /*----------------------------------------------------------------------------
 102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
 103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
 104 | input.  If `zSign' is 1, the input is negated before being converted to an
 105 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 106 | is simply rounded to an integer, with the inexact exception raised if the
 107 | input cannot be represented exactly as an integer.  However, if the fixed-
 108 | point input is too large, the invalid exception is raised and the largest
 109 | positive or negative integer is returned.
 110 *----------------------------------------------------------------------------*/
 111
 112 static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
 113 {
 114     int8 roundingMode;
 115     flag roundNearestEven;
 116     int8 roundIncrement, roundBits;
 117     int32 z;
 118
 119     roundingMode = STATUS(float_rounding_mode);
 120     roundNearestEven = ( roundingMode == float_round_nearest_even );
 121     roundIncrement = 0x40;
 122     if ( ! roundNearestEven ) {
 123         if ( roundingMode == float_round_to_zero ) {
 124             roundIncrement = 0;
 125         }
 126         else {
 127             roundIncrement = 0x7F;
 128             if ( zSign ) {
 129                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 130             }
 131             else {
 132                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 133             }
 134         }
 135     }
 136     roundBits = absZ & 0x7F;
 137     absZ = ( absZ + roundIncrement )>>7;
 138     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 139     z = absZ;
 140     if ( zSign ) z = - z;
 141     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 142         float_raise( float_flag_invalid STATUS_VAR);
 143         return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
 144     }
 145     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 146     return z;
 147
 148 }
 149
 150 /*----------------------------------------------------------------------------
 151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 153 | and returns the properly rounded 64-bit integer corresponding to the input.
 154 | If `zSign' is 1, the input is negated before being converted to an integer.
 155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 156 | the inexact exception raised if the input cannot be represented exactly as
 157 | an integer.  However, if the fixed-point input is too large, the invalid
 158 | exception is raised and the largest positive or negative integer is
 159 | returned.
 160 *----------------------------------------------------------------------------*/
 161
 162 static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
 163 {
 164     int8 roundingMode;
 165     flag roundNearestEven, increment;
 166     int64 z;
 167
 168     roundingMode = STATUS(float_rounding_mode);
 169     roundNearestEven = ( roundingMode == float_round_nearest_even );
 170     increment = ( (int64_t) absZ1 < 0 );
 171     if ( ! roundNearestEven ) {
 172         if ( roundingMode == float_round_to_zero ) {
 173             increment = 0;
 174         }
 175         else {
 176             if ( zSign ) {
 177                 increment = ( roundingMode == float_round_down ) && absZ1;
 178             }
 179             else {
 180                 increment = ( roundingMode == float_round_up ) && absZ1;
 181             }
 182         }
 183     }
 184     if ( increment ) {
 185         ++absZ0;
 186         if ( absZ0 == 0 ) goto overflow;
 187         absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 188     }
 189     z = absZ0;
 190     if ( zSign ) z = - z;
 191     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 192  overflow:
 193         float_raise( float_flag_invalid STATUS_VAR);
 194         return
 195               zSign ? (int64_t) LIT64( 0x8000000000000000 )
 196             : LIT64( 0x7FFFFFFFFFFFFFFF );
 197     }
 198     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 199     return z;
 200
 201 }
 202
 203 /*----------------------------------------------------------------------------
 204 | Returns the fraction bits of the single-precision floating-point value `a'.
 205 *----------------------------------------------------------------------------*/
 206
 207 INLINE uint32_t extractFloat32Frac( float32 a )
 208 {
 209
 210     return float32_val(a) & 0x007FFFFF;
 211
 212 }
 213
 214 /*----------------------------------------------------------------------------
 215 | Returns the exponent bits of the single-precision floating-point value `a'.
 216 *----------------------------------------------------------------------------*/
 217
 218 INLINE int16 extractFloat32Exp( float32 a )
 219 {
 220
 221     return ( float32_val(a)>>23 ) & 0xFF;
 222
 223 }
 224
 225 /*----------------------------------------------------------------------------
 226 | Returns the sign bit of the single-precision floating-point value `a'.
 227 *----------------------------------------------------------------------------*/
 228
 229 INLINE flag extractFloat32Sign( float32 a )
 230 {
 231
 232     return float32_val(a)>>31;
 233
 234 }
 235
 236 /*----------------------------------------------------------------------------
 237 | If `a' is denormal and we are in flush-to-zero mode then set the
 238 | input-denormal exception and return zero. Otherwise just return the value.
 239 *----------------------------------------------------------------------------*/
 240 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 241 {
 242     if (STATUS(flush_inputs_to_zero)) {
 243         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 244             float_raise(float_flag_input_denormal STATUS_VAR);
 245             return make_float32(float32_val(a) & 0x80000000);
 246         }
 247     }
 248     return a;
 249 }
 250
 251 /*----------------------------------------------------------------------------
 252 | Normalizes the subnormal single-precision floating-point value represented
 253 | by the denormalized significand `aSig'.  The normalized exponent and
 254 | significand are stored at the locations pointed to by `zExpPtr' and
 255 | `zSigPtr', respectively.
 256 *----------------------------------------------------------------------------*/
 257
 258 static void
 259  normalizeFloat32Subnormal( uint32_t aSig, int16 *zExpPtr, uint32_t *zSigPtr )
 260 {
 261     int8 shiftCount;
 262
 263     shiftCount = countLeadingZeros32( aSig ) - 8;
 264     *zSigPtr = aSig<<shiftCount;
 265     *zExpPtr = 1 - shiftCount;
 266
 267 }
 268
 269 /*----------------------------------------------------------------------------
 270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 271 | single-precision floating-point value, returning the result.  After being
 272 | shifted into the proper positions, the three fields are simply added
 273 | together to form the result.  This means that any integer portion of `zSig'
 274 | will be added into the exponent.  Since a properly normalized significand
 275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 276 | than the desired result exponent whenever `zSig' is a complete, normalized
 277 | significand.
 278 *----------------------------------------------------------------------------*/
 279
 280 INLINE float32 packFloat32( flag zSign, int16 zExp, uint32_t zSig )
 281 {
 282
 283     return make_float32(
 284           ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
 285
 286 }
 287
 288 /*----------------------------------------------------------------------------
 289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 290 | and significand `zSig', and returns the proper single-precision floating-
 291 | point value corresponding to the abstract input.  Ordinarily, the abstract
 292 | value is simply rounded and packed into the single-precision format, with
 293 | the inexact exception raised if the abstract input cannot be represented
 294 | exactly.  However, if the abstract value is too large, the overflow and
 295 | inexact exceptions are raised and an infinity or maximal finite value is
 296 | returned.  If the abstract value is too small, the input value is rounded to
 297 | a subnormal number, and the underflow and inexact exceptions are raised if
 298 | the abstract input cannot be represented exactly as a subnormal single-
 299 | precision floating-point number.
 300 |     The input significand `zSig' has its binary point between bits 30
 301 | and 29, which is 7 bits to the left of the usual location.  This shifted
 302 | significand must be normalized or smaller.  If `zSig' is not normalized,
 303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 304 | and it must not require rounding.  In the usual case that `zSig' is
 305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 307 | Binary Floating-Point Arithmetic.
 308 *----------------------------------------------------------------------------*/
 309
 310 static float32 roundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 311 {
 312     int8 roundingMode;
 313     flag roundNearestEven;
 314     int8 roundIncrement, roundBits;
 315     flag isTiny;
 316
 317     roundingMode = STATUS(float_rounding_mode);
 318     roundNearestEven = ( roundingMode == float_round_nearest_even );
 319     roundIncrement = 0x40;
 320     if ( ! roundNearestEven ) {
 321         if ( roundingMode == float_round_to_zero ) {
 322             roundIncrement = 0;
 323         }
 324         else {
 325             roundIncrement = 0x7F;
 326             if ( zSign ) {
 327                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 328             }
 329             else {
 330                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 331             }
 332         }
 333     }
 334     roundBits = zSig & 0x7F;
 335     if ( 0xFD <= (uint16_t) zExp ) {
 336         if (    ( 0xFD < zExp )
 337              || (    ( zExp == 0xFD )
 338                   && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
 339            ) {
 340             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 341             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 342         }
 343         if ( zExp < 0 ) {
 344             if (STATUS(flush_to_zero)) {
 345                 float_raise(float_flag_output_denormal STATUS_VAR);
 346                 return packFloat32(zSign, 0, 0);
 347             }
 348             isTiny =
 349                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 350                 || ( zExp < -1 )
 351                 || ( zSig + roundIncrement < 0x80000000 );
 352             shift32RightJamming( zSig, - zExp, &zSig );
 353             zExp = 0;
 354             roundBits = zSig & 0x7F;
 355             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 356         }
 357     }
 358     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 359     zSig = ( zSig + roundIncrement )>>7;
 360     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 361     if ( zSig == 0 ) zExp = 0;
 362     return packFloat32( zSign, zExp, zSig );
 363
 364 }
 365
 366 /*----------------------------------------------------------------------------
 367 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 368 | and significand `zSig', and returns the proper single-precision floating-
 369 | point value corresponding to the abstract input.  This routine is just like
 370 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 371 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 372 | floating-point exponent.
 373 *----------------------------------------------------------------------------*/
 374
 375 static float32
 376  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, uint32_t zSig STATUS_PARAM)
 377 {
 378     int8 shiftCount;
 379
 380     shiftCount = countLeadingZeros32( zSig ) - 1;
 381     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 382
 383 }
 384
 385 /*----------------------------------------------------------------------------
 386 | Returns the fraction bits of the double-precision floating-point value `a'.
 387 *----------------------------------------------------------------------------*/
 388
 389 INLINE uint64_t extractFloat64Frac( float64 a )
 390 {
 391
 392     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 393
 394 }
 395
 396 /*----------------------------------------------------------------------------
 397 | Returns the exponent bits of the double-precision floating-point value `a'.
 398 *----------------------------------------------------------------------------*/
 399
 400 INLINE int16 extractFloat64Exp( float64 a )
 401 {
 402
 403     return ( float64_val(a)>>52 ) & 0x7FF;
 404
 405 }
 406
 407 /*----------------------------------------------------------------------------
 408 | Returns the sign bit of the double-precision floating-point value `a'.
 409 *----------------------------------------------------------------------------*/
 410
 411 INLINE flag extractFloat64Sign( float64 a )
 412 {
 413
 414     return float64_val(a)>>63;
 415
 416 }
 417
 418 /*----------------------------------------------------------------------------
 419 | If `a' is denormal and we are in flush-to-zero mode then set the
 420 | input-denormal exception and return zero. Otherwise just return the value.
 421 *----------------------------------------------------------------------------*/
 422 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 423 {
 424     if (STATUS(flush_inputs_to_zero)) {
 425         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 426             float_raise(float_flag_input_denormal STATUS_VAR);
 427             return make_float64(float64_val(a) & (1ULL << 63));
 428         }
 429     }
 430     return a;
 431 }
 432
 433 /*----------------------------------------------------------------------------
 434 | Normalizes the subnormal double-precision floating-point value represented
 435 | by the denormalized significand `aSig'.  The normalized exponent and
 436 | significand are stored at the locations pointed to by `zExpPtr' and
 437 | `zSigPtr', respectively.
 438 *----------------------------------------------------------------------------*/
 439
 440 static void
 441  normalizeFloat64Subnormal( uint64_t aSig, int16 *zExpPtr, uint64_t *zSigPtr )
 442 {
 443     int8 shiftCount;
 444
 445     shiftCount = countLeadingZeros64( aSig ) - 11;
 446     *zSigPtr = aSig<<shiftCount;
 447     *zExpPtr = 1 - shiftCount;
 448
 449 }
 450
 451 /*----------------------------------------------------------------------------
 452 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 453 | double-precision floating-point value, returning the result.  After being
 454 | shifted into the proper positions, the three fields are simply added
 455 | together to form the result.  This means that any integer portion of `zSig'
 456 | will be added into the exponent.  Since a properly normalized significand
 457 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 458 | than the desired result exponent whenever `zSig' is a complete, normalized
 459 | significand.
 460 *----------------------------------------------------------------------------*/
 461
 462 INLINE float64 packFloat64( flag zSign, int16 zExp, uint64_t zSig )
 463 {
 464
 465     return make_float64(
 466         ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
 467
 468 }
 469
 470 /*----------------------------------------------------------------------------
 471 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 472 | and significand `zSig', and returns the proper double-precision floating-
 473 | point value corresponding to the abstract input.  Ordinarily, the abstract
 474 | value is simply rounded and packed into the double-precision format, with
 475 | the inexact exception raised if the abstract input cannot be represented
 476 | exactly.  However, if the abstract value is too large, the overflow and
 477 | inexact exceptions are raised and an infinity or maximal finite value is
 478 | returned.  If the abstract value is too small, the input value is rounded
 479 | to a subnormal number, and the underflow and inexact exceptions are raised
 480 | if the abstract input cannot be represented exactly as a subnormal double-
 481 | precision floating-point number.
 482 |     The input significand `zSig' has its binary point between bits 62
 483 | and 61, which is 10 bits to the left of the usual location.  This shifted
 484 | significand must be normalized or smaller.  If `zSig' is not normalized,
 485 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 486 | and it must not require rounding.  In the usual case that `zSig' is
 487 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 488 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 489 | Binary Floating-Point Arithmetic.
 490 *----------------------------------------------------------------------------*/
 491
 492 static float64 roundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 493 {
 494     int8 roundingMode;
 495     flag roundNearestEven;
 496     int16 roundIncrement, roundBits;
 497     flag isTiny;
 498
 499     roundingMode = STATUS(float_rounding_mode);
 500     roundNearestEven = ( roundingMode == float_round_nearest_even );
 501     roundIncrement = 0x200;
 502     if ( ! roundNearestEven ) {
 503         if ( roundingMode == float_round_to_zero ) {
 504             roundIncrement = 0;
 505         }
 506         else {
 507             roundIncrement = 0x3FF;
 508             if ( zSign ) {
 509                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 510             }
 511             else {
 512                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 513             }
 514         }
 515     }
 516     roundBits = zSig & 0x3FF;
 517     if ( 0x7FD <= (uint16_t) zExp ) {
 518         if (    ( 0x7FD < zExp )
 519              || (    ( zExp == 0x7FD )
 520                   && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
 521            ) {
 522             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 523             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 524         }
 525         if ( zExp < 0 ) {
 526             if (STATUS(flush_to_zero)) {
 527                 float_raise(float_flag_output_denormal STATUS_VAR);
 528                 return packFloat64(zSign, 0, 0);
 529             }
 530             isTiny =
 531                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 532                 || ( zExp < -1 )
 533                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 534             shift64RightJamming( zSig, - zExp, &zSig );
 535             zExp = 0;
 536             roundBits = zSig & 0x3FF;
 537             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 538         }
 539     }
 540     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 541     zSig = ( zSig + roundIncrement )>>10;
 542     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 543     if ( zSig == 0 ) zExp = 0;
 544     return packFloat64( zSign, zExp, zSig );
 545
 546 }
 547
 548 /*----------------------------------------------------------------------------
 549 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 550 | and significand `zSig', and returns the proper double-precision floating-
 551 | point value corresponding to the abstract input.  This routine is just like
 552 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 553 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 554 | floating-point exponent.
 555 *----------------------------------------------------------------------------*/
 556
 557 static float64
 558  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, uint64_t zSig STATUS_PARAM)
 559 {
 560     int8 shiftCount;
 561
 562     shiftCount = countLeadingZeros64( zSig ) - 1;
 563     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 564
 565 }
 566
 567 #ifdef FLOATX80
 568
 569 /*----------------------------------------------------------------------------
 570 | Returns the fraction bits of the extended double-precision floating-point
 571 | value `a'.
 572 *----------------------------------------------------------------------------*/
 573
 574 INLINE uint64_t extractFloatx80Frac( floatx80 a )
 575 {
 576
 577     return a.low;
 578
 579 }
 580
 581 /*----------------------------------------------------------------------------
 582 | Returns the exponent bits of the extended double-precision floating-point
 583 | value `a'.
 584 *----------------------------------------------------------------------------*/
 585
 586 INLINE int32 extractFloatx80Exp( floatx80 a )
 587 {
 588
 589     return a.high & 0x7FFF;
 590
 591 }
 592
 593 /*----------------------------------------------------------------------------
 594 | Returns the sign bit of the extended double-precision floating-point value
 595 | `a'.
 596 *----------------------------------------------------------------------------*/
 597
 598 INLINE flag extractFloatx80Sign( floatx80 a )
 599 {
 600
 601     return a.high>>15;
 602
 603 }
 604
 605 /*----------------------------------------------------------------------------
 606 | Normalizes the subnormal extended double-precision floating-point value
 607 | represented by the denormalized significand `aSig'.  The normalized exponent
 608 | and significand are stored at the locations pointed to by `zExpPtr' and
 609 | `zSigPtr', respectively.
 610 *----------------------------------------------------------------------------*/
 611
 612 static void
 613  normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
 614 {
 615     int8 shiftCount;
 616
 617     shiftCount = countLeadingZeros64( aSig );
 618     *zSigPtr = aSig<<shiftCount;
 619     *zExpPtr = 1 - shiftCount;
 620
 621 }
 622
 623 /*----------------------------------------------------------------------------
 624 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 625 | extended double-precision floating-point value, returning the result.
 626 *----------------------------------------------------------------------------*/
 627
 628 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
 629 {
 630     floatx80 z;
 631
 632     z.low = zSig;
 633     z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
 634     return z;
 635
 636 }
 637
 638 /*----------------------------------------------------------------------------
 639 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 640 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 641 | and returns the proper extended double-precision floating-point value
 642 | corresponding to the abstract input.  Ordinarily, the abstract value is
 643 | rounded and packed into the extended double-precision format, with the
 644 | inexact exception raised if the abstract input cannot be represented
 645 | exactly.  However, if the abstract value is too large, the overflow and
 646 | inexact exceptions are raised and an infinity or maximal finite value is
 647 | returned.  If the abstract value is too small, the input value is rounded to
 648 | a subnormal number, and the underflow and inexact exceptions are raised if
 649 | the abstract input cannot be represented exactly as a subnormal extended
 650 | double-precision floating-point number.
 651 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 652 | number of bits as single or double precision, respectively.  Otherwise, the
 653 | result is rounded to the full precision of the extended double-precision
 654 | format.
 655 |     The input significand must be normalized or smaller.  If the input
 656 | significand is not normalized, `zExp' must be 0; in that case, the result
 657 | returned is a subnormal number, and it must not require rounding.  The
 658 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 659 | Floating-Point Arithmetic.
 660 *----------------------------------------------------------------------------*/
 661
 662 static floatx80
 663  roundAndPackFloatx80(
 664      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 665  STATUS_PARAM)
 666 {
 667     int8 roundingMode;
 668     flag roundNearestEven, increment, isTiny;
 669     int64 roundIncrement, roundMask, roundBits;
 670
 671     roundingMode = STATUS(float_rounding_mode);
 672     roundNearestEven = ( roundingMode == float_round_nearest_even );
 673     if ( roundingPrecision == 80 ) goto precision80;
 674     if ( roundingPrecision == 64 ) {
 675         roundIncrement = LIT64( 0x0000000000000400 );
 676         roundMask = LIT64( 0x00000000000007FF );
 677     }
 678     else if ( roundingPrecision == 32 ) {
 679         roundIncrement = LIT64( 0x0000008000000000 );
 680         roundMask = LIT64( 0x000000FFFFFFFFFF );
 681     }
 682     else {
 683         goto precision80;
 684     }
 685     zSig0 |= ( zSig1 != 0 );
 686     if ( ! roundNearestEven ) {
 687         if ( roundingMode == float_round_to_zero ) {
 688             roundIncrement = 0;
 689         }
 690         else {
 691             roundIncrement = roundMask;
 692             if ( zSign ) {
 693                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 694             }
 695             else {
 696                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 697             }
 698         }
 699     }
 700     roundBits = zSig0 & roundMask;
 701     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 702         if (    ( 0x7FFE < zExp )
 703              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 704            ) {
 705             goto overflow;
 706         }
 707         if ( zExp <= 0 ) {
 708             if (STATUS(flush_to_zero)) {
 709                 float_raise(float_flag_output_denormal STATUS_VAR);
 710                 return packFloatx80(zSign, 0, 0);
 711             }
 712             isTiny =
 713                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 714                 || ( zExp < 0 )
 715                 || ( zSig0 <= zSig0 + roundIncrement );
 716             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 717             zExp = 0;
 718             roundBits = zSig0 & roundMask;
 719             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 720             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 721             zSig0 += roundIncrement;
 722             if ( (int64_t) zSig0 < 0 ) zExp = 1;
 723             roundIncrement = roundMask + 1;
 724             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 725                 roundMask |= roundIncrement;
 726             }
 727             zSig0 &= ~ roundMask;
 728             return packFloatx80( zSign, zExp, zSig0 );
 729         }
 730     }
 731     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 732     zSig0 += roundIncrement;
 733     if ( zSig0 < roundIncrement ) {
 734         ++zExp;
 735         zSig0 = LIT64( 0x8000000000000000 );
 736     }
 737     roundIncrement = roundMask + 1;
 738     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 739         roundMask |= roundIncrement;
 740     }
 741     zSig0 &= ~ roundMask;
 742     if ( zSig0 == 0 ) zExp = 0;
 743     return packFloatx80( zSign, zExp, zSig0 );
 744  precision80:
 745     increment = ( (int64_t) zSig1 < 0 );
 746     if ( ! roundNearestEven ) {
 747         if ( roundingMode == float_round_to_zero ) {
 748             increment = 0;
 749         }
 750         else {
 751             if ( zSign ) {
 752                 increment = ( roundingMode == float_round_down ) && zSig1;
 753             }
 754             else {
 755                 increment = ( roundingMode == float_round_up ) && zSig1;
 756             }
 757         }
 758     }
 759     if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
 760         if (    ( 0x7FFE < zExp )
 761              || (    ( zExp == 0x7FFE )
 762                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 763                   && increment
 764                 )
 765            ) {
 766             roundMask = 0;
 767  overflow:
 768             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 769             if (    ( roundingMode == float_round_to_zero )
 770                  || ( zSign && ( roundingMode == float_round_up ) )
 771                  || ( ! zSign && ( roundingMode == float_round_down ) )
 772                ) {
 773                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 774             }
 775             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 776         }
 777         if ( zExp <= 0 ) {
 778             isTiny =
 779                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 780                 || ( zExp < 0 )
 781                 || ! increment
 782                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 783             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 784             zExp = 0;
 785             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 786             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 787             if ( roundNearestEven ) {
 788                 increment = ( (int64_t) zSig1 < 0 );
 789             }
 790             else {
 791                 if ( zSign ) {
 792                     increment = ( roundingMode == float_round_down ) && zSig1;
 793                 }
 794                 else {
 795                     increment = ( roundingMode == float_round_up ) && zSig1;
 796                 }
 797             }
 798             if ( increment ) {
 799                 ++zSig0;
 800                 zSig0 &=
 801                     ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 802                 if ( (int64_t) zSig0 < 0 ) zExp = 1;
 803             }
 804             return packFloatx80( zSign, zExp, zSig0 );
 805         }
 806     }
 807     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 808     if ( increment ) {
 809         ++zSig0;
 810         if ( zSig0 == 0 ) {
 811             ++zExp;
 812             zSig0 = LIT64( 0x8000000000000000 );
 813         }
 814         else {
 815             zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 816         }
 817     }
 818     else {
 819         if ( zSig0 == 0 ) zExp = 0;
 820     }
 821     return packFloatx80( zSign, zExp, zSig0 );
 822
 823 }
 824
 825 /*----------------------------------------------------------------------------
 826 | Takes an abstract floating-point value having sign `zSign', exponent
 827 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 828 | and returns the proper extended double-precision floating-point value
 829 | corresponding to the abstract input.  This routine is just like
 830 | `roundAndPackFloatx80' except that the input significand does not have to be
 831 | normalized.
 832 *----------------------------------------------------------------------------*/
 833
 834 static floatx80
 835  normalizeRoundAndPackFloatx80(
 836      int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
 837  STATUS_PARAM)
 838 {
 839     int8 shiftCount;
 840
 841     if ( zSig0 == 0 ) {
 842         zSig0 = zSig1;
 843         zSig1 = 0;
 844         zExp -= 64;
 845     }
 846     shiftCount = countLeadingZeros64( zSig0 );
 847     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 848     zExp -= shiftCount;
 849     return
 850         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 851
 852 }
 853
 854 #endif
 855
 856 #ifdef FLOAT128
 857
 858 /*----------------------------------------------------------------------------
 859 | Returns the least-significant 64 fraction bits of the quadruple-precision
 860 | floating-point value `a'.
 861 *----------------------------------------------------------------------------*/
 862
 863 INLINE uint64_t extractFloat128Frac1( float128 a )
 864 {
 865
 866     return a.low;
 867
 868 }
 869
 870 /*----------------------------------------------------------------------------
 871 | Returns the most-significant 48 fraction bits of the quadruple-precision
 872 | floating-point value `a'.
 873 *----------------------------------------------------------------------------*/
 874
 875 INLINE uint64_t extractFloat128Frac0( float128 a )
 876 {
 877
 878     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 879
 880 }
 881
 882 /*----------------------------------------------------------------------------
 883 | Returns the exponent bits of the quadruple-precision floating-point value
 884 | `a'.
 885 *----------------------------------------------------------------------------*/
 886
 887 INLINE int32 extractFloat128Exp( float128 a )
 888 {
 889
 890     return ( a.high>>48 ) & 0x7FFF;
 891
 892 }
 893
 894 /*----------------------------------------------------------------------------
 895 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 896 *----------------------------------------------------------------------------*/
 897
 898 INLINE flag extractFloat128Sign( float128 a )
 899 {
 900
 901     return a.high>>63;
 902
 903 }
 904
 905 /*----------------------------------------------------------------------------
 906 | Normalizes the subnormal quadruple-precision floating-point value
 907 | represented by the denormalized significand formed by the concatenation of
 908 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 909 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 910 | significand are stored at the location pointed to by `zSig0Ptr', and the
 911 | least significant 64 bits of the normalized significand are stored at the
 912 | location pointed to by `zSig1Ptr'.
 913 *----------------------------------------------------------------------------*/
 914
 915 static void
 916  normalizeFloat128Subnormal(
 917      uint64_t aSig0,
 918      uint64_t aSig1,
 919      int32 *zExpPtr,
 920      uint64_t *zSig0Ptr,
 921      uint64_t *zSig1Ptr
 922  )
 923 {
 924     int8 shiftCount;
 925
 926     if ( aSig0 == 0 ) {
 927         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 928         if ( shiftCount < 0 ) {
 929             *zSig0Ptr = aSig1>>( - shiftCount );
 930             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 931         }
 932         else {
 933             *zSig0Ptr = aSig1<<shiftCount;
 934             *zSig1Ptr = 0;
 935         }
 936         *zExpPtr = - shiftCount - 63;
 937     }
 938     else {
 939         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 940         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 941         *zExpPtr = 1 - shiftCount;
 942     }
 943
 944 }
 945
 946 /*----------------------------------------------------------------------------
 947 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 948 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 949 | floating-point value, returning the result.  After being shifted into the
 950 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 951 | added together to form the most significant 32 bits of the result.  This
 952 | means that any integer portion of `zSig0' will be added into the exponent.
 953 | Since a properly normalized significand will have an integer portion equal
 954 | to 1, the `zExp' input should be 1 less than the desired result exponent
 955 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 956 | significand.
 957 *----------------------------------------------------------------------------*/
 958
 959 INLINE float128
 960  packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
 961 {
 962     float128 z;
 963
 964     z.low = zSig1;
 965     z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
 966     return z;
 967
 968 }
 969
 970 /*----------------------------------------------------------------------------
 971 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 972 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 973 | and `zSig2', and returns the proper quadruple-precision floating-point value
 974 | corresponding to the abstract input.  Ordinarily, the abstract value is
 975 | simply rounded and packed into the quadruple-precision format, with the
 976 | inexact exception raised if the abstract input cannot be represented
 977 | exactly.  However, if the abstract value is too large, the overflow and
 978 | inexact exceptions are raised and an infinity or maximal finite value is
 979 | returned.  If the abstract value is too small, the input value is rounded to
 980 | a subnormal number, and the underflow and inexact exceptions are raised if
 981 | the abstract input cannot be represented exactly as a subnormal quadruple-
 982 | precision floating-point number.
 983 |     The input significand must be normalized or smaller.  If the input
 984 | significand is not normalized, `zExp' must be 0; in that case, the result
 985 | returned is a subnormal number, and it must not require rounding.  In the
 986 | usual case that the input significand is normalized, `zExp' must be 1 less
 987 | than the ``true'' floating-point exponent.  The handling of underflow and
 988 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 989 *----------------------------------------------------------------------------*/
 990
 991 static float128
 992  roundAndPackFloat128(
 993      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
 994 {
 995     int8 roundingMode;
 996     flag roundNearestEven, increment, isTiny;
 997
 998     roundingMode = STATUS(float_rounding_mode);
 999     roundNearestEven = ( roundingMode == float_round_nearest_even );
1000     increment = ( (int64_t) zSig2 < 0 );
1001     if ( ! roundNearestEven ) {
1002         if ( roundingMode == float_round_to_zero ) {
1003             increment = 0;
1004         }
1005         else {
1006             if ( zSign ) {
1007                 increment = ( roundingMode == float_round_down ) && zSig2;
1008             }
1009             else {
1010                 increment = ( roundingMode == float_round_up ) && zSig2;
1011             }
1012         }
1013     }
1014     if ( 0x7FFD <= (uint32_t) zExp ) {
1015         if (    ( 0x7FFD < zExp )
1016              || (    ( zExp == 0x7FFD )
1017                   && eq128(
1018                          LIT64( 0x0001FFFFFFFFFFFF ),
1019                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1020                          zSig0,
1021                          zSig1
1022                      )
1023                   && increment
1024                 )
1025            ) {
1026             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1027             if (    ( roundingMode == float_round_to_zero )
1028                  || ( zSign && ( roundingMode == float_round_up ) )
1029                  || ( ! zSign && ( roundingMode == float_round_down ) )
1030                ) {
1031                 return
1032                     packFloat128(
1033                         zSign,
1034                         0x7FFE,
1035                         LIT64( 0x0000FFFFFFFFFFFF ),
1036                         LIT64( 0xFFFFFFFFFFFFFFFF )
1037                     );
1038             }
1039             return packFloat128( zSign, 0x7FFF, 0, 0 );
1040         }
1041         if ( zExp < 0 ) {
1042             if (STATUS(flush_to_zero)) {
1043                 float_raise(float_flag_output_denormal STATUS_VAR);
1044                 return packFloat128(zSign, 0, 0, 0);
1045             }
1046             isTiny =
1047                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1048                 || ( zExp < -1 )
1049                 || ! increment
1050                 || lt128(
1051                        zSig0,
1052                        zSig1,
1053                        LIT64( 0x0001FFFFFFFFFFFF ),
1054                        LIT64( 0xFFFFFFFFFFFFFFFF )
1055                    );
1056             shift128ExtraRightJamming(
1057                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1058             zExp = 0;
1059             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1060             if ( roundNearestEven ) {
1061                 increment = ( (int64_t) zSig2 < 0 );
1062             }
1063             else {
1064                 if ( zSign ) {
1065                     increment = ( roundingMode == float_round_down ) && zSig2;
1066                 }
1067                 else {
1068                     increment = ( roundingMode == float_round_up ) && zSig2;
1069                 }
1070             }
1071         }
1072     }
1073     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1074     if ( increment ) {
1075         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1076         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1077     }
1078     else {
1079         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1080     }
1081     return packFloat128( zSign, zExp, zSig0, zSig1 );
1082
1083 }
1084
1085 /*----------------------------------------------------------------------------
1086 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1087 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1088 | returns the proper quadruple-precision floating-point value corresponding
1089 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1090 | except that the input significand has fewer bits and does not have to be
1091 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1092 | point exponent.
1093 *----------------------------------------------------------------------------*/
1094
1095 static float128
1096  normalizeRoundAndPackFloat128(
1097      flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
1098 {
1099     int8 shiftCount;
1100     uint64_t zSig2;
1101
1102     if ( zSig0 == 0 ) {
1103         zSig0 = zSig1;
1104         zSig1 = 0;
1105         zExp -= 64;
1106     }
1107     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1108     if ( 0 <= shiftCount ) {
1109         zSig2 = 0;
1110         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1111     }
1112     else {
1113         shift128ExtraRightJamming(
1114             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1115     }
1116     zExp -= shiftCount;
1117     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1118
1119 }
1120
1121 #endif
1122
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the single-precision floating-point format.  The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128
1129 float32 int32_to_float32( int32 a STATUS_PARAM )
1130 {
1131     flag zSign;
1132
1133     if ( a == 0 ) return float32_zero;
1134     if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1135     zSign = ( a < 0 );
1136     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1137
1138 }
1139
1140 /*----------------------------------------------------------------------------
1141 | Returns the result of converting the 32-bit two's complement integer `a'
1142 | to the double-precision floating-point format.  The conversion is performed
1143 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1144 *----------------------------------------------------------------------------*/
1145
1146 float64 int32_to_float64( int32 a STATUS_PARAM )
1147 {
1148     flag zSign;
1149     uint32 absA;
1150     int8 shiftCount;
1151     uint64_t zSig;
1152
1153     if ( a == 0 ) return float64_zero;
1154     zSign = ( a < 0 );
1155     absA = zSign ? - a : a;
1156     shiftCount = countLeadingZeros32( absA ) + 21;
1157     zSig = absA;
1158     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1159
1160 }
1161
1162 #ifdef FLOATX80
1163
1164 /*----------------------------------------------------------------------------
1165 | Returns the result of converting the 32-bit two's complement integer `a'
1166 | to the extended double-precision floating-point format.  The conversion
1167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1168 | Arithmetic.
1169 *----------------------------------------------------------------------------*/
1170
1171 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1172 {
1173     flag zSign;
1174     uint32 absA;
1175     int8 shiftCount;
1176     uint64_t zSig;
1177
1178     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1179     zSign = ( a < 0 );
1180     absA = zSign ? - a : a;
1181     shiftCount = countLeadingZeros32( absA ) + 32;
1182     zSig = absA;
1183     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1184
1185 }
1186
1187 #endif
1188
1189 #ifdef FLOAT128
1190
1191 /*----------------------------------------------------------------------------
1192 | Returns the result of converting the 32-bit two's complement integer `a' to
1193 | the quadruple-precision floating-point format.  The conversion is performed
1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195 *----------------------------------------------------------------------------*/
1196
1197 float128 int32_to_float128( int32 a STATUS_PARAM )
1198 {
1199     flag zSign;
1200     uint32 absA;
1201     int8 shiftCount;
1202     uint64_t zSig0;
1203
1204     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1205     zSign = ( a < 0 );
1206     absA = zSign ? - a : a;
1207     shiftCount = countLeadingZeros32( absA ) + 17;
1208     zSig0 = absA;
1209     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1210
1211 }
1212
1213 #endif
1214
1215 /*----------------------------------------------------------------------------
1216 | Returns the result of converting the 64-bit two's complement integer `a'
1217 | to the single-precision floating-point format.  The conversion is performed
1218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 *----------------------------------------------------------------------------*/
1220
1221 float32 int64_to_float32( int64 a STATUS_PARAM )
1222 {
1223     flag zSign;
1224     uint64 absA;
1225     int8 shiftCount;
1226
1227     if ( a == 0 ) return float32_zero;
1228     zSign = ( a < 0 );
1229     absA = zSign ? - a : a;
1230     shiftCount = countLeadingZeros64( absA ) - 40;
1231     if ( 0 <= shiftCount ) {
1232         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1233     }
1234     else {
1235         shiftCount += 7;
1236         if ( shiftCount < 0 ) {
1237             shift64RightJamming( absA, - shiftCount, &absA );
1238         }
1239         else {
1240             absA <<= shiftCount;
1241         }
1242         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1243     }
1244
1245 }
1246
1247 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1248 {
1249     int8 shiftCount;
1250
1251     if ( a == 0 ) return float32_zero;
1252     shiftCount = countLeadingZeros64( a ) - 40;
1253     if ( 0 <= shiftCount ) {
1254         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1255     }
1256     else {
1257         shiftCount += 7;
1258         if ( shiftCount < 0 ) {
1259             shift64RightJamming( a, - shiftCount, &a );
1260         }
1261         else {
1262             a <<= shiftCount;
1263         }
1264         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1265     }
1266 }
1267
1268 /*----------------------------------------------------------------------------
1269 | Returns the result of converting the 64-bit two's complement integer `a'
1270 | to the double-precision floating-point format.  The conversion is performed
1271 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1272 *----------------------------------------------------------------------------*/
1273
1274 float64 int64_to_float64( int64 a STATUS_PARAM )
1275 {
1276     flag zSign;
1277
1278     if ( a == 0 ) return float64_zero;
1279     if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1280         return packFloat64( 1, 0x43E, 0 );
1281     }
1282     zSign = ( a < 0 );
1283     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1284
1285 }
1286
1287 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1288 {
1289     if ( a == 0 ) return float64_zero;
1290     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1291
1292 }
1293
1294 #ifdef FLOATX80
1295
1296 /*----------------------------------------------------------------------------
1297 | Returns the result of converting the 64-bit two's complement integer `a'
1298 | to the extended double-precision floating-point format.  The conversion
1299 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1300 | Arithmetic.
1301 *----------------------------------------------------------------------------*/
1302
1303 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1304 {
1305     flag zSign;
1306     uint64 absA;
1307     int8 shiftCount;
1308
1309     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1310     zSign = ( a < 0 );
1311     absA = zSign ? - a : a;
1312     shiftCount = countLeadingZeros64( absA );
1313     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1314
1315 }
1316
1317 #endif
1318
1319 #ifdef FLOAT128
1320
1321 /*----------------------------------------------------------------------------
1322 | Returns the result of converting the 64-bit two's complement integer `a' to
1323 | the quadruple-precision floating-point format.  The conversion is performed
1324 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1325 *----------------------------------------------------------------------------*/
1326
1327 float128 int64_to_float128( int64 a STATUS_PARAM )
1328 {
1329     flag zSign;
1330     uint64 absA;
1331     int8 shiftCount;
1332     int32 zExp;
1333     uint64_t zSig0, zSig1;
1334
1335     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1336     zSign = ( a < 0 );
1337     absA = zSign ? - a : a;
1338     shiftCount = countLeadingZeros64( absA ) + 49;
1339     zExp = 0x406E - shiftCount;
1340     if ( 64 <= shiftCount ) {
1341         zSig1 = 0;
1342         zSig0 = absA;
1343         shiftCount -= 64;
1344     }
1345     else {
1346         zSig1 = absA;
1347         zSig0 = 0;
1348     }
1349     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1350     return packFloat128( zSign, zExp, zSig0, zSig1 );
1351
1352 }
1353
1354 #endif
1355
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 32-bit two's complement integer format.  The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode.  If `a' is a NaN, the largest
1362 | positive integer is returned.  Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1365
1366 int32 float32_to_int32( float32 a STATUS_PARAM )
1367 {
1368     flag aSign;
1369     int16 aExp, shiftCount;
1370     uint32_t aSig;
1371     uint64_t aSig64;
1372
1373     a = float32_squash_input_denormal(a STATUS_VAR);
1374     aSig = extractFloat32Frac( a );
1375     aExp = extractFloat32Exp( a );
1376     aSign = extractFloat32Sign( a );
1377     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1378     if ( aExp ) aSig |= 0x00800000;
1379     shiftCount = 0xAF - aExp;
1380     aSig64 = aSig;
1381     aSig64 <<= 32;
1382     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1383     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1384
1385 }
1386
1387 /*----------------------------------------------------------------------------
1388 | Returns the result of converting the single-precision floating-point value
1389 | `a' to the 32-bit two's complement integer format.  The conversion is
1390 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1391 | Arithmetic, except that the conversion is always rounded toward zero.
1392 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1393 | the conversion overflows, the largest integer with the same sign as `a' is
1394 | returned.
1395 *----------------------------------------------------------------------------*/
1396
1397 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1398 {
1399     flag aSign;
1400     int16 aExp, shiftCount;
1401     uint32_t aSig;
1402     int32 z;
1403     a = float32_squash_input_denormal(a STATUS_VAR);
1404
1405     aSig = extractFloat32Frac( a );
1406     aExp = extractFloat32Exp( a );
1407     aSign = extractFloat32Sign( a );
1408     shiftCount = aExp - 0x9E;
1409     if ( 0 <= shiftCount ) {
1410         if ( float32_val(a) != 0xCF000000 ) {
1411             float_raise( float_flag_invalid STATUS_VAR);
1412             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1413         }
1414         return (int32_t) 0x80000000;
1415     }
1416     else if ( aExp <= 0x7E ) {
1417         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1418         return 0;
1419     }
1420     aSig = ( aSig | 0x00800000 )<<8;
1421     z = aSig>>( - shiftCount );
1422     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1423         STATUS(float_exception_flags) |= float_flag_inexact;
1424     }
1425     if ( aSign ) z = - z;
1426     return z;
1427
1428 }
1429
1430 /*----------------------------------------------------------------------------
1431 | Returns the result of converting the single-precision floating-point value
1432 | `a' to the 16-bit two's complement integer format.  The conversion is
1433 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1434 | Arithmetic, except that the conversion is always rounded toward zero.
1435 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1436 | the conversion overflows, the largest integer with the same sign as `a' is
1437 | returned.
1438 *----------------------------------------------------------------------------*/
1439
1440 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1441 {
1442     flag aSign;
1443     int16 aExp, shiftCount;
1444     uint32_t aSig;
1445     int32 z;
1446
1447     aSig = extractFloat32Frac( a );
1448     aExp = extractFloat32Exp( a );
1449     aSign = extractFloat32Sign( a );
1450     shiftCount = aExp - 0x8E;
1451     if ( 0 <= shiftCount ) {
1452         if ( float32_val(a) != 0xC7000000 ) {
1453             float_raise( float_flag_invalid STATUS_VAR);
1454             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1455                 return 0x7FFF;
1456             }
1457         }
1458         return (int32_t) 0xffff8000;
1459     }
1460     else if ( aExp <= 0x7E ) {
1461         if ( aExp | aSig ) {
1462             STATUS(float_exception_flags) |= float_flag_inexact;
1463         }
1464         return 0;
1465     }
1466     shiftCount -= 0x10;
1467     aSig = ( aSig | 0x00800000 )<<8;
1468     z = aSig>>( - shiftCount );
1469     if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1470         STATUS(float_exception_flags) |= float_flag_inexact;
1471     }
1472     if ( aSign ) {
1473         z = - z;
1474     }
1475     return z;
1476
1477 }
1478
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the single-precision floating-point value
1481 | `a' to the 64-bit two's complement integer format.  The conversion is
1482 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1483 | Arithmetic---which means in particular that the conversion is rounded
1484 | according to the current rounding mode.  If `a' is a NaN, the largest
1485 | positive integer is returned.  Otherwise, if the conversion overflows, the
1486 | largest integer with the same sign as `a' is returned.
1487 *----------------------------------------------------------------------------*/
1488
1489 int64 float32_to_int64( float32 a STATUS_PARAM )
1490 {
1491     flag aSign;
1492     int16 aExp, shiftCount;
1493     uint32_t aSig;
1494     uint64_t aSig64, aSigExtra;
1495     a = float32_squash_input_denormal(a STATUS_VAR);
1496
1497     aSig = extractFloat32Frac( a );
1498     aExp = extractFloat32Exp( a );
1499     aSign = extractFloat32Sign( a );
1500     shiftCount = 0xBE - aExp;
1501     if ( shiftCount < 0 ) {
1502         float_raise( float_flag_invalid STATUS_VAR);
1503         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1504             return LIT64( 0x7FFFFFFFFFFFFFFF );
1505         }
1506         return (int64_t) LIT64( 0x8000000000000000 );
1507     }
1508     if ( aExp ) aSig |= 0x00800000;
1509     aSig64 = aSig;
1510     aSig64 <<= 40;
1511     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1512     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1513
1514 }
1515
1516 /*----------------------------------------------------------------------------
1517 | Returns the result of converting the single-precision floating-point value
1518 | `a' to the 64-bit two's complement integer format.  The conversion is
1519 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1520 | Arithmetic, except that the conversion is always rounded toward zero.  If
1521 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1522 | conversion overflows, the largest integer with the same sign as `a' is
1523 | returned.
1524 *----------------------------------------------------------------------------*/
1525
1526 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1527 {
1528     flag aSign;
1529     int16 aExp, shiftCount;
1530     uint32_t aSig;
1531     uint64_t aSig64;
1532     int64 z;
1533     a = float32_squash_input_denormal(a STATUS_VAR);
1534
1535     aSig = extractFloat32Frac( a );
1536     aExp = extractFloat32Exp( a );
1537     aSign = extractFloat32Sign( a );
1538     shiftCount = aExp - 0xBE;
1539     if ( 0 <= shiftCount ) {
1540         if ( float32_val(a) != 0xDF000000 ) {
1541             float_raise( float_flag_invalid STATUS_VAR);
1542             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1543                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1544             }
1545         }
1546         return (int64_t) LIT64( 0x8000000000000000 );
1547     }
1548     else if ( aExp <= 0x7E ) {
1549         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1550         return 0;
1551     }
1552     aSig64 = aSig | 0x00800000;
1553     aSig64 <<= 40;
1554     z = aSig64>>( - shiftCount );
1555     if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1556         STATUS(float_exception_flags) |= float_flag_inexact;
1557     }
1558     if ( aSign ) z = - z;
1559     return z;
1560
1561 }
1562
1563 /*----------------------------------------------------------------------------
1564 | Returns the result of converting the single-precision floating-point value
1565 | `a' to the double-precision floating-point format.  The conversion is
1566 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1567 | Arithmetic.
1568 *----------------------------------------------------------------------------*/
1569
1570 float64 float32_to_float64( float32 a STATUS_PARAM )
1571 {
1572     flag aSign;
1573     int16 aExp;
1574     uint32_t aSig;
1575     a = float32_squash_input_denormal(a STATUS_VAR);
1576
1577     aSig = extractFloat32Frac( a );
1578     aExp = extractFloat32Exp( a );
1579     aSign = extractFloat32Sign( a );
1580     if ( aExp == 0xFF ) {
1581         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1582         return packFloat64( aSign, 0x7FF, 0 );
1583     }
1584     if ( aExp == 0 ) {
1585         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1586         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1587         --aExp;
1588     }
1589     return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1590
1591 }
1592
1593 #ifdef FLOATX80
1594
1595 /*----------------------------------------------------------------------------
1596 | Returns the result of converting the single-precision floating-point value
1597 | `a' to the extended double-precision floating-point format.  The conversion
1598 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1599 | Arithmetic.
1600 *----------------------------------------------------------------------------*/
1601
1602 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1603 {
1604     flag aSign;
1605     int16 aExp;
1606     uint32_t aSig;
1607
1608     a = float32_squash_input_denormal(a STATUS_VAR);
1609     aSig = extractFloat32Frac( a );
1610     aExp = extractFloat32Exp( a );
1611     aSign = extractFloat32Sign( a );
1612     if ( aExp == 0xFF ) {
1613         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1614         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1615     }
1616     if ( aExp == 0 ) {
1617         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1618         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1619     }
1620     aSig |= 0x00800000;
1621     return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1622
1623 }
1624
1625 #endif
1626
1627 #ifdef FLOAT128
1628
/*----------------------------------------------------------------------------
| Returns the result of converting the single-precision floating-point value
| `a' to the quadruple-precision floating-point format.  The conversion is
| performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic.
*----------------------------------------------------------------------------*/
1635
1636 float128 float32_to_float128( float32 a STATUS_PARAM )
1637 {
1638     flag aSign;
1639     int16 aExp;
1640     uint32_t aSig;
1641
1642     a = float32_squash_input_denormal(a STATUS_VAR);
1643     aSig = extractFloat32Frac( a );
1644     aExp = extractFloat32Exp( a );
1645     aSign = extractFloat32Sign( a );
1646     if ( aExp == 0xFF ) {
1647         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
1648         return packFloat128( aSign, 0x7FFF, 0, 0 );
1649     }
1650     if ( aExp == 0 ) {
1651         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1652         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1653         --aExp;
1654     }
1655     return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1656
1657 }
1658
1659 #endif
1660
1661 /*----------------------------------------------------------------------------
1662 | Rounds the single-precision floating-point value `a' to an integer, and
1663 | returns the result as a single-precision floating-point value.  The
1664 | operation is performed according to the IEC/IEEE Standard for Binary
1665 | Floating-Point Arithmetic.
1666 *----------------------------------------------------------------------------*/
1667
1668 float32 float32_round_to_int( float32 a STATUS_PARAM)
1669 {
1670     flag aSign;
1671     int16 aExp;
1672     uint32_t lastBitMask, roundBitsMask;
1673     int8 roundingMode;
1674     uint32_t z;
1675     a = float32_squash_input_denormal(a STATUS_VAR);
1676
1677     aExp = extractFloat32Exp( a );
1678     if ( 0x96 <= aExp ) {
1679         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1680             return propagateFloat32NaN( a, a STATUS_VAR );
1681         }
1682         return a;
1683     }
1684     if ( aExp <= 0x7E ) {
1685         if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1686         STATUS(float_exception_flags) |= float_flag_inexact;
1687         aSign = extractFloat32Sign( a );
1688         switch ( STATUS(float_rounding_mode) ) {
1689          case float_round_nearest_even:
1690             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1691                 return packFloat32( aSign, 0x7F, 0 );
1692             }
1693             break;
1694          case float_round_down:
1695             return make_float32(aSign ? 0xBF800000 : 0);
1696          case float_round_up:
1697             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1698         }
1699         return packFloat32( aSign, 0, 0 );
1700     }
1701     lastBitMask = 1;
1702     lastBitMask <<= 0x96 - aExp;
1703     roundBitsMask = lastBitMask - 1;
1704     z = float32_val(a);
1705     roundingMode = STATUS(float_rounding_mode);
1706     if ( roundingMode == float_round_nearest_even ) {
1707         z += lastBitMask>>1;
1708         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1709     }
1710     else if ( roundingMode != float_round_to_zero ) {
1711         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1712             z += roundBitsMask;
1713         }
1714     }
1715     z &= ~ roundBitsMask;
1716     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1717     return make_float32(z);
1718
1719 }
1720
1721 /*----------------------------------------------------------------------------
1722 | Returns the result of adding the absolute values of the single-precision
1723 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1724 | before being returned.  `zSign' is ignored if the result is a NaN.
1725 | The addition is performed according to the IEC/IEEE Standard for Binary
1726 | Floating-Point Arithmetic.
1727 *----------------------------------------------------------------------------*/
1728
1729 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1730 {
1731     int16 aExp, bExp, zExp;
1732     uint32_t aSig, bSig, zSig;
1733     int16 expDiff;
1734
1735     aSig = extractFloat32Frac( a );
1736     aExp = extractFloat32Exp( a );
1737     bSig = extractFloat32Frac( b );
1738     bExp = extractFloat32Exp( b );
1739     expDiff = aExp - bExp;
1740     aSig <<= 6;
1741     bSig <<= 6;
1742     if ( 0 < expDiff ) {
1743         if ( aExp == 0xFF ) {
1744             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1745             return a;
1746         }
1747         if ( bExp == 0 ) {
1748             --expDiff;
1749         }
1750         else {
1751             bSig |= 0x20000000;
1752         }
1753         shift32RightJamming( bSig, expDiff, &bSig );
1754         zExp = aExp;
1755     }
1756     else if ( expDiff < 0 ) {
1757         if ( bExp == 0xFF ) {
1758             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1759             return packFloat32( zSign, 0xFF, 0 );
1760         }
1761         if ( aExp == 0 ) {
1762             ++expDiff;
1763         }
1764         else {
1765             aSig |= 0x20000000;
1766         }
1767         shift32RightJamming( aSig, - expDiff, &aSig );
1768         zExp = bExp;
1769     }
1770     else {
1771         if ( aExp == 0xFF ) {
1772             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1773             return a;
1774         }
1775         if ( aExp == 0 ) {
1776             if (STATUS(flush_to_zero)) {
1777                 if (aSig | bSig) {
1778                     float_raise(float_flag_output_denormal STATUS_VAR);
1779                 }
1780                 return packFloat32(zSign, 0, 0);
1781             }
1782             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1783         }
1784         zSig = 0x40000000 + aSig + bSig;
1785         zExp = aExp;
1786         goto roundAndPack;
1787     }
1788     aSig |= 0x20000000;
1789     zSig = ( aSig + bSig )<<1;
1790     --zExp;
1791     if ( (int32_t) zSig < 0 ) {
1792         zSig = aSig + bSig;
1793         ++zExp;
1794     }
1795  roundAndPack:
1796     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1797
1798 }
1799
1800 /*----------------------------------------------------------------------------
1801 | Returns the result of subtracting the absolute values of the single-
1802 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1803 | difference is negated before being returned.  `zSign' is ignored if the
1804 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1805 | Standard for Binary Floating-Point Arithmetic.
1806 *----------------------------------------------------------------------------*/
1807
1808 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1809 {
1810     int16 aExp, bExp, zExp;
1811     uint32_t aSig, bSig, zSig;
1812     int16 expDiff;
1813
1814     aSig = extractFloat32Frac( a );
1815     aExp = extractFloat32Exp( a );
1816     bSig = extractFloat32Frac( b );
1817     bExp = extractFloat32Exp( b );
1818     expDiff = aExp - bExp;
1819     aSig <<= 7;
1820     bSig <<= 7;
1821     if ( 0 < expDiff ) goto aExpBigger;
1822     if ( expDiff < 0 ) goto bExpBigger;
1823     if ( aExp == 0xFF ) {
1824         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1825         float_raise( float_flag_invalid STATUS_VAR);
1826         return float32_default_nan;
1827     }
1828     if ( aExp == 0 ) {
1829         aExp = 1;
1830         bExp = 1;
1831     }
1832     if ( bSig < aSig ) goto aBigger;
1833     if ( aSig < bSig ) goto bBigger;
1834     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1835  bExpBigger:
1836     if ( bExp == 0xFF ) {
1837         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1838         return packFloat32( zSign ^ 1, 0xFF, 0 );
1839     }
1840     if ( aExp == 0 ) {
1841         ++expDiff;
1842     }
1843     else {
1844         aSig |= 0x40000000;
1845     }
1846     shift32RightJamming( aSig, - expDiff, &aSig );
1847     bSig |= 0x40000000;
1848  bBigger:
1849     zSig = bSig - aSig;
1850     zExp = bExp;
1851     zSign ^= 1;
1852     goto normalizeRoundAndPack;
1853  aExpBigger:
1854     if ( aExp == 0xFF ) {
1855         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1856         return a;
1857     }
1858     if ( bExp == 0 ) {
1859         --expDiff;
1860     }
1861     else {
1862         bSig |= 0x40000000;
1863     }
1864     shift32RightJamming( bSig, expDiff, &bSig );
1865     aSig |= 0x40000000;
1866  aBigger:
1867     zSig = aSig - bSig;
1868     zExp = aExp;
1869  normalizeRoundAndPack:
1870     --zExp;
1871     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1872
1873 }
1874
1875 /*----------------------------------------------------------------------------
1876 | Returns the result of adding the single-precision floating-point values `a'
1877 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1878 | Binary Floating-Point Arithmetic.
1879 *----------------------------------------------------------------------------*/
1880
1881 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1882 {
1883     flag aSign, bSign;
1884     a = float32_squash_input_denormal(a STATUS_VAR);
1885     b = float32_squash_input_denormal(b STATUS_VAR);
1886
1887     aSign = extractFloat32Sign( a );
1888     bSign = extractFloat32Sign( b );
1889     if ( aSign == bSign ) {
1890         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1891     }
1892     else {
1893         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1894     }
1895
1896 }
1897
1898 /*----------------------------------------------------------------------------
1899 | Returns the result of subtracting the single-precision floating-point values
1900 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1901 | for Binary Floating-Point Arithmetic.
1902 *----------------------------------------------------------------------------*/
1903
1904 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1905 {
1906     flag aSign, bSign;
1907     a = float32_squash_input_denormal(a STATUS_VAR);
1908     b = float32_squash_input_denormal(b STATUS_VAR);
1909
1910     aSign = extractFloat32Sign( a );
1911     bSign = extractFloat32Sign( b );
1912     if ( aSign == bSign ) {
1913         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1914     }
1915     else {
1916         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1917     }
1918
1919 }
1920
1921 /*----------------------------------------------------------------------------
1922 | Returns the result of multiplying the single-precision floating-point values
1923 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1924 | for Binary Floating-Point Arithmetic.
1925 *----------------------------------------------------------------------------*/
1926
1927 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1928 {
1929     flag aSign, bSign, zSign;
1930     int16 aExp, bExp, zExp;
1931     uint32_t aSig, bSig;
1932     uint64_t zSig64;
1933     uint32_t zSig;
1934
1935     a = float32_squash_input_denormal(a STATUS_VAR);
1936     b = float32_squash_input_denormal(b STATUS_VAR);
1937
1938     aSig = extractFloat32Frac( a );
1939     aExp = extractFloat32Exp( a );
1940     aSign = extractFloat32Sign( a );
1941     bSig = extractFloat32Frac( b );
1942     bExp = extractFloat32Exp( b );
1943     bSign = extractFloat32Sign( b );
1944     zSign = aSign ^ bSign;
1945     if ( aExp == 0xFF ) {
1946         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1947             return propagateFloat32NaN( a, b STATUS_VAR );
1948         }
1949         if ( ( bExp | bSig ) == 0 ) {
1950             float_raise( float_flag_invalid STATUS_VAR);
1951             return float32_default_nan;
1952         }
1953         return packFloat32( zSign, 0xFF, 0 );
1954     }
1955     if ( bExp == 0xFF ) {
1956         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1957         if ( ( aExp | aSig ) == 0 ) {
1958             float_raise( float_flag_invalid STATUS_VAR);
1959             return float32_default_nan;
1960         }
1961         return packFloat32( zSign, 0xFF, 0 );
1962     }
1963     if ( aExp == 0 ) {
1964         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1965         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1966     }
1967     if ( bExp == 0 ) {
1968         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1969         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1970     }
1971     zExp = aExp + bExp - 0x7F;
1972     aSig = ( aSig | 0x00800000 )<<7;
1973     bSig = ( bSig | 0x00800000 )<<8;
1974     shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
1975     zSig = zSig64;
1976     if ( 0 <= (int32_t) ( zSig<<1 ) ) {
1977         zSig <<= 1;
1978         --zExp;
1979     }
1980     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1981
1982 }
1983
1984 /*----------------------------------------------------------------------------
1985 | Returns the result of dividing the single-precision floating-point value `a'
1986 | by the corresponding value `b'.  The operation is performed according to the
1987 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1988 *----------------------------------------------------------------------------*/
1989
1990 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1991 {
1992     flag aSign, bSign, zSign;
1993     int16 aExp, bExp, zExp;
1994     uint32_t aSig, bSig, zSig;
1995     a = float32_squash_input_denormal(a STATUS_VAR);
1996     b = float32_squash_input_denormal(b STATUS_VAR);
1997
1998     aSig = extractFloat32Frac( a );
1999     aExp = extractFloat32Exp( a );
2000     aSign = extractFloat32Sign( a );
2001     bSig = extractFloat32Frac( b );
2002     bExp = extractFloat32Exp( b );
2003     bSign = extractFloat32Sign( b );
2004     zSign = aSign ^ bSign;
2005     if ( aExp == 0xFF ) {
2006         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2007         if ( bExp == 0xFF ) {
2008             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2009             float_raise( float_flag_invalid STATUS_VAR);
2010             return float32_default_nan;
2011         }
2012         return packFloat32( zSign, 0xFF, 0 );
2013     }
2014     if ( bExp == 0xFF ) {
2015         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2016         return packFloat32( zSign, 0, 0 );
2017     }
2018     if ( bExp == 0 ) {
2019         if ( bSig == 0 ) {
2020             if ( ( aExp | aSig ) == 0 ) {
2021                 float_raise( float_flag_invalid STATUS_VAR);
2022                 return float32_default_nan;
2023             }
2024             float_raise( float_flag_divbyzero STATUS_VAR);
2025             return packFloat32( zSign, 0xFF, 0 );
2026         }
2027         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2028     }
2029     if ( aExp == 0 ) {
2030         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2031         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2032     }
2033     zExp = aExp - bExp + 0x7D;
2034     aSig = ( aSig | 0x00800000 )<<7;
2035     bSig = ( bSig | 0x00800000 )<<8;
2036     if ( bSig <= ( aSig + aSig ) ) {
2037         aSig >>= 1;
2038         ++zExp;
2039     }
2040     zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2041     if ( ( zSig & 0x3F ) == 0 ) {
2042         zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2043     }
2044     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2045
2046 }
2047
2048 /*----------------------------------------------------------------------------
2049 | Returns the remainder of the single-precision floating-point value `a'
2050 | with respect to the corresponding value `b'.  The operation is performed
2051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2052 *----------------------------------------------------------------------------*/
2053
2054 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2055 {
2056     flag aSign, zSign;
2057     int16 aExp, bExp, expDiff;
2058     uint32_t aSig, bSig;
2059     uint32_t q;
2060     uint64_t aSig64, bSig64, q64;
2061     uint32_t alternateASig;
2062     int32_t sigMean;
2063     a = float32_squash_input_denormal(a STATUS_VAR);
2064     b = float32_squash_input_denormal(b STATUS_VAR);
2065
2066     aSig = extractFloat32Frac( a );
2067     aExp = extractFloat32Exp( a );
2068     aSign = extractFloat32Sign( a );
2069     bSig = extractFloat32Frac( b );
2070     bExp = extractFloat32Exp( b );
2071     if ( aExp == 0xFF ) {
2072         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2073             return propagateFloat32NaN( a, b STATUS_VAR );
2074         }
2075         float_raise( float_flag_invalid STATUS_VAR);
2076         return float32_default_nan;
2077     }
2078     if ( bExp == 0xFF ) {
2079         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2080         return a;
2081     }
2082     if ( bExp == 0 ) {
2083         if ( bSig == 0 ) {
2084             float_raise( float_flag_invalid STATUS_VAR);
2085             return float32_default_nan;
2086         }
2087         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2088     }
2089     if ( aExp == 0 ) {
2090         if ( aSig == 0 ) return a;
2091         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2092     }
2093     expDiff = aExp - bExp;
2094     aSig |= 0x00800000;
2095     bSig |= 0x00800000;
2096     if ( expDiff < 32 ) {
2097         aSig <<= 8;
2098         bSig <<= 8;
2099         if ( expDiff < 0 ) {
2100             if ( expDiff < -1 ) return a;
2101             aSig >>= 1;
2102         }
2103         q = ( bSig <= aSig );
2104         if ( q ) aSig -= bSig;
2105         if ( 0 < expDiff ) {
2106             q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2107             q >>= 32 - expDiff;
2108             bSig >>= 2;
2109             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2110         }
2111         else {
2112             aSig >>= 2;
2113             bSig >>= 2;
2114         }
2115     }
2116     else {
2117         if ( bSig <= aSig ) aSig -= bSig;
2118         aSig64 = ( (uint64_t) aSig )<<40;
2119         bSig64 = ( (uint64_t) bSig )<<40;
2120         expDiff -= 64;
2121         while ( 0 < expDiff ) {
2122             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2123             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2124             aSig64 = - ( ( bSig * q64 )<<38 );
2125             expDiff -= 62;
2126         }
2127         expDiff += 64;
2128         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2129         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2130         q = q64>>( 64 - expDiff );
2131         bSig <<= 6;
2132         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2133     }
2134     do {
2135         alternateASig = aSig;
2136         ++q;
2137         aSig -= bSig;
2138     } while ( 0 <= (int32_t) aSig );
2139     sigMean = aSig + alternateASig;
2140     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2141         aSig = alternateASig;
2142     }
2143     zSign = ( (int32_t) aSig < 0 );
2144     if ( zSign ) aSig = - aSig;
2145     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2146
2147 }
2148
2149 /*----------------------------------------------------------------------------
2150 | Returns the square root of the single-precision floating-point value `a'.
2151 | The operation is performed according to the IEC/IEEE Standard for Binary
2152 | Floating-Point Arithmetic.
2153 *----------------------------------------------------------------------------*/
2154
2155 float32 float32_sqrt( float32 a STATUS_PARAM )
2156 {
2157     flag aSign;
2158     int16 aExp, zExp;
2159     uint32_t aSig, zSig;
2160     uint64_t rem, term;
2161     a = float32_squash_input_denormal(a STATUS_VAR);
2162
2163     aSig = extractFloat32Frac( a );
2164     aExp = extractFloat32Exp( a );
2165     aSign = extractFloat32Sign( a );
2166     if ( aExp == 0xFF ) {
2167         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2168         if ( ! aSign ) return a;
2169         float_raise( float_flag_invalid STATUS_VAR);
2170         return float32_default_nan;
2171     }
2172     if ( aSign ) {
2173         if ( ( aExp | aSig ) == 0 ) return a;
2174         float_raise( float_flag_invalid STATUS_VAR);
2175         return float32_default_nan;
2176     }
2177     if ( aExp == 0 ) {
2178         if ( aSig == 0 ) return float32_zero;
2179         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2180     }
2181     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2182     aSig = ( aSig | 0x00800000 )<<8;
2183     zSig = estimateSqrt32( aExp, aSig ) + 2;
2184     if ( ( zSig & 0x7F ) <= 5 ) {
2185         if ( zSig < 2 ) {
2186             zSig = 0x7FFFFFFF;
2187             goto roundAndPack;
2188         }
2189         aSig >>= aExp & 1;
2190         term = ( (uint64_t) zSig ) * zSig;
2191         rem = ( ( (uint64_t) aSig )<<32 ) - term;
2192         while ( (int64_t) rem < 0 ) {
2193             --zSig;
2194             rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2195         }
2196         zSig |= ( rem != 0 );
2197     }
2198     shift32RightJamming( zSig, 1, &zSig );
2199  roundAndPack:
2200     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2201
2202 }
2203
2204 /*----------------------------------------------------------------------------
2205 | Returns the binary exponential of the single-precision floating-point value
2206 | `a'. The operation is performed according to the IEC/IEEE Standard for
2207 | Binary Floating-Point Arithmetic.
2208 |
2209 | Uses the following identities:
2210 |
2211 | 1. -------------------------------------------------------------------------
2212 |      x    x*ln(2)
2213 |     2  = e
2214 |
2215 | 2. -------------------------------------------------------------------------
2216 |                      2     3     4     5           n
2217 |      x        x     x     x     x     x           x
2218 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2219 |               1!    2!    3!    4!    5!          n!
2220 *----------------------------------------------------------------------------*/
2221
2222 static const float64 float32_exp2_coefficients[15] =
2223 {
2224     const_float64( 0x3ff0000000000000ll ), /*  1 */
2225     const_float64( 0x3fe0000000000000ll ), /*  2 */
2226     const_float64( 0x3fc5555555555555ll ), /*  3 */
2227     const_float64( 0x3fa5555555555555ll ), /*  4 */
2228     const_float64( 0x3f81111111111111ll ), /*  5 */
2229     const_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2230     const_float64( 0x3f2a01a01a01a01all ), /*  7 */
2231     const_float64( 0x3efa01a01a01a01all ), /*  8 */
2232     const_float64( 0x3ec71de3a556c734ll ), /*  9 */
2233     const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2234     const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2235     const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2236     const_float64( 0x3de6124613a86d09ll ), /* 13 */
2237     const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2238     const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2239 };
2240
2241 float32 float32_exp2( float32 a STATUS_PARAM )
2242 {
2243     flag aSign;
2244     int16 aExp;
2245     uint32_t aSig;
2246     float64 r, x, xn;
2247     int i;
2248     a = float32_squash_input_denormal(a STATUS_VAR);
2249
2250     aSig = extractFloat32Frac( a );
2251     aExp = extractFloat32Exp( a );
2252     aSign = extractFloat32Sign( a );
2253
2254     if ( aExp == 0xFF) {
2255         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2256         return (aSign) ? float32_zero : a;
2257     }
2258     if (aExp == 0) {
2259         if (aSig == 0) return float32_one;
2260     }
2261
2262     float_raise( float_flag_inexact STATUS_VAR);
2263
2264     /* ******************************* */
2265     /* using float64 for approximation */
2266     /* ******************************* */
2267     x = float32_to_float64(a STATUS_VAR);
2268     x = float64_mul(x, float64_ln2 STATUS_VAR);
2269
2270     xn = x;
2271     r = float64_one;
2272     for (i = 0 ; i < 15 ; i++) {
2273         float64 f;
2274
2275         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2276         r = float64_add(r, f STATUS_VAR);
2277
2278         xn = float64_mul(xn, x STATUS_VAR);
2279     }
2280
2281     return float64_to_float32(r, status);
2282 }
2283
2284 /*----------------------------------------------------------------------------
2285 | Returns the binary log of the single-precision floating-point value `a'.
2286 | The operation is performed according to the IEC/IEEE Standard for Binary
2287 | Floating-Point Arithmetic.
2288 *----------------------------------------------------------------------------*/
2289 float32 float32_log2( float32 a STATUS_PARAM )
2290 {
2291     flag aSign, zSign;
2292     int16 aExp;
2293     uint32_t aSig, zSig, i;
2294
2295     a = float32_squash_input_denormal(a STATUS_VAR);
2296     aSig = extractFloat32Frac( a );
2297     aExp = extractFloat32Exp( a );
2298     aSign = extractFloat32Sign( a );
2299
2300     if ( aExp == 0 ) {
2301         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2302         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2303     }
2304     if ( aSign ) {
2305         float_raise( float_flag_invalid STATUS_VAR);
2306         return float32_default_nan;
2307     }
2308     if ( aExp == 0xFF ) {
2309         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2310         return a;
2311     }
2312
2313     aExp -= 0x7F;
2314     aSig |= 0x00800000;
2315     zSign = aExp < 0;
2316     zSig = aExp << 23;
2317
2318     for (i = 1 << 22; i > 0; i >>= 1) {
2319         aSig = ( (uint64_t)aSig * aSig ) >> 23;
2320         if ( aSig & 0x01000000 ) {
2321             aSig >>= 1;
2322             zSig |= i;
2323         }
2324     }
2325
2326     if ( zSign )
2327         zSig = -zSig;
2328
2329     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2330 }
2331
2332 /*----------------------------------------------------------------------------
2333 | Returns 1 if the single-precision floating-point value `a' is equal to
2334 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2335 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2336 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2337 *----------------------------------------------------------------------------*/
2338
2339 int float32_eq( float32 a, float32 b STATUS_PARAM )
2340 {
2341     uint32_t av, bv;
2342     a = float32_squash_input_denormal(a STATUS_VAR);
2343     b = float32_squash_input_denormal(b STATUS_VAR);
2344
2345     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2346          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2347        ) {
2348         float_raise( float_flag_invalid STATUS_VAR);
2349         return 0;
2350     }
2351     av = float32_val(a);
2352     bv = float32_val(b);
2353     return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2354 }
2355
2356 /*----------------------------------------------------------------------------
2357 | Returns 1 if the single-precision floating-point value `a' is less than
2358 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
2359 | exception is raised if either operand is a NaN.  The comparison is performed
2360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2361 *----------------------------------------------------------------------------*/
2362
2363 int float32_le( float32 a, float32 b STATUS_PARAM )
2364 {
2365     flag aSign, bSign;
2366     uint32_t av, bv;
2367     a = float32_squash_input_denormal(a STATUS_VAR);
2368     b = float32_squash_input_denormal(b STATUS_VAR);
2369
2370     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2371          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2372        ) {
2373         float_raise( float_flag_invalid STATUS_VAR);
2374         return 0;
2375     }
2376     aSign = extractFloat32Sign( a );
2377     bSign = extractFloat32Sign( b );
2378     av = float32_val(a);
2379     bv = float32_val(b);
2380     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2381     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2382
2383 }
2384
2385 /*----------------------------------------------------------------------------
2386 | Returns 1 if the single-precision floating-point value `a' is less than
2387 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2388 | raised if either operand is a NaN.  The comparison is performed according
2389 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2390 *----------------------------------------------------------------------------*/
2391
2392 int float32_lt( float32 a, float32 b STATUS_PARAM )
2393 {
2394     flag aSign, bSign;
2395     uint32_t av, bv;
2396     a = float32_squash_input_denormal(a STATUS_VAR);
2397     b = float32_squash_input_denormal(b STATUS_VAR);
2398
2399     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2400          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2401        ) {
2402         float_raise( float_flag_invalid STATUS_VAR);
2403         return 0;
2404     }
2405     aSign = extractFloat32Sign( a );
2406     bSign = extractFloat32Sign( b );
2407     av = float32_val(a);
2408     bv = float32_val(b);
2409     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2410     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2411
2412 }
2413
2414 /*----------------------------------------------------------------------------
2415 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2416 | be compared, and 0 otherwise.  The invalid exception is raised if either
2417 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
2418 | Standard for Binary Floating-Point Arithmetic.
2419 *----------------------------------------------------------------------------*/
2420
2421 int float32_unordered( float32 a, float32 b STATUS_PARAM )
2422 {
2423     a = float32_squash_input_denormal(a STATUS_VAR);
2424     b = float32_squash_input_denormal(b STATUS_VAR);
2425
2426     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2427          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2428        ) {
2429         float_raise( float_flag_invalid STATUS_VAR);
2430         return 1;
2431     }
2432     return 0;
2433 }
2434
2435 /*----------------------------------------------------------------------------
2436 | Returns 1 if the single-precision floating-point value `a' is equal to
2437 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2438 | exception.  The comparison is performed according to the IEC/IEEE Standard
2439 | for Binary Floating-Point Arithmetic.
2440 *----------------------------------------------------------------------------*/
2441
2442 int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
2443 {
2444     a = float32_squash_input_denormal(a STATUS_VAR);
2445     b = float32_squash_input_denormal(b STATUS_VAR);
2446
2447     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2448          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2449        ) {
2450         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2451             float_raise( float_flag_invalid STATUS_VAR);
2452         }
2453         return 0;
2454     }
2455     return ( float32_val(a) == float32_val(b) ) ||
2456             ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2457 }
2458
2459 /*----------------------------------------------------------------------------
2460 | Returns 1 if the single-precision floating-point value `a' is less than or
2461 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2462 | cause an exception.  Otherwise, the comparison is performed according to the
2463 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2464 *----------------------------------------------------------------------------*/
2465
2466 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2467 {
2468     flag aSign, bSign;
2469     uint32_t av, bv;
2470     a = float32_squash_input_denormal(a STATUS_VAR);
2471     b = float32_squash_input_denormal(b STATUS_VAR);
2472
2473     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2474          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2475        ) {
2476         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2477             float_raise( float_flag_invalid STATUS_VAR);
2478         }
2479         return 0;
2480     }
2481     aSign = extractFloat32Sign( a );
2482     bSign = extractFloat32Sign( b );
2483     av = float32_val(a);
2484     bv = float32_val(b);
2485     if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2486     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2487
2488 }
2489
2490 /*----------------------------------------------------------------------------
2491 | Returns 1 if the single-precision floating-point value `a' is less than
2492 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2493 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2494 | Standard for Binary Floating-Point Arithmetic.
2495 *----------------------------------------------------------------------------*/
2496
2497 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2498 {
2499     flag aSign, bSign;
2500     uint32_t av, bv;
2501     a = float32_squash_input_denormal(a STATUS_VAR);
2502     b = float32_squash_input_denormal(b STATUS_VAR);
2503
2504     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2505          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2506        ) {
2507         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2508             float_raise( float_flag_invalid STATUS_VAR);
2509         }
2510         return 0;
2511     }
2512     aSign = extractFloat32Sign( a );
2513     bSign = extractFloat32Sign( b );
2514     av = float32_val(a);
2515     bv = float32_val(b);
2516     if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2517     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2518
2519 }
2520
2521 /*----------------------------------------------------------------------------
2522 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2523 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
2524 | comparison is performed according to the IEC/IEEE Standard for Binary
2525 | Floating-Point Arithmetic.
2526 *----------------------------------------------------------------------------*/
2527
2528 int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2529 {
2530     a = float32_squash_input_denormal(a STATUS_VAR);
2531     b = float32_squash_input_denormal(b STATUS_VAR);
2532
2533     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2534          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2535        ) {
2536         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2537             float_raise( float_flag_invalid STATUS_VAR);
2538         }
2539         return 1;
2540     }
2541     return 0;
2542 }
2543
2544 /*----------------------------------------------------------------------------
2545 | Returns the result of converting the double-precision floating-point value
2546 | `a' to the 32-bit two's complement integer format.  The conversion is
2547 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2548 | Arithmetic---which means in particular that the conversion is rounded
2549 | according to the current rounding mode.  If `a' is a NaN, the largest
2550 | positive integer is returned.  Otherwise, if the conversion overflows, the
2551 | largest integer with the same sign as `a' is returned.
2552 *----------------------------------------------------------------------------*/
2553
2554 int32 float64_to_int32( float64 a STATUS_PARAM )
2555 {
2556     flag aSign;
2557     int16 aExp, shiftCount;
2558     uint64_t aSig;
2559     a = float64_squash_input_denormal(a STATUS_VAR);
2560
2561     aSig = extractFloat64Frac( a );
2562     aExp = extractFloat64Exp( a );
2563     aSign = extractFloat64Sign( a );
2564     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2565     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2566     shiftCount = 0x42C - aExp;
2567     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2568     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2569
2570 }
2571
2572 /*----------------------------------------------------------------------------
2573 | Returns the result of converting the double-precision floating-point value
2574 | `a' to the 32-bit two's complement integer format.  The conversion is
2575 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2576 | Arithmetic, except that the conversion is always rounded toward zero.
2577 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2578 | the conversion overflows, the largest integer with the same sign as `a' is
2579 | returned.
2580 *----------------------------------------------------------------------------*/
2581
2582 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2583 {
2584     flag aSign;
2585     int16 aExp, shiftCount;
2586     uint64_t aSig, savedASig;
2587     int32 z;
2588     a = float64_squash_input_denormal(a STATUS_VAR);
2589
2590     aSig = extractFloat64Frac( a );
2591     aExp = extractFloat64Exp( a );
2592     aSign = extractFloat64Sign( a );
2593     if ( 0x41E < aExp ) {
2594         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2595         goto invalid;
2596     }
2597     else if ( aExp < 0x3FF ) {
2598         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2599         return 0;
2600     }
2601     aSig |= LIT64( 0x0010000000000000 );
2602     shiftCount = 0x433 - aExp;
2603     savedASig = aSig;
2604     aSig >>= shiftCount;
2605     z = aSig;
2606     if ( aSign ) z = - z;
2607     if ( ( z < 0 ) ^ aSign ) {
2608  invalid:
2609         float_raise( float_flag_invalid STATUS_VAR);
2610         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
2611     }
2612     if ( ( aSig<<shiftCount ) != savedASig ) {
2613         STATUS(float_exception_flags) |= float_flag_inexact;
2614     }
2615     return z;
2616
2617 }
2618
2619 /*----------------------------------------------------------------------------
2620 | Returns the result of converting the double-precision floating-point value
2621 | `a' to the 16-bit two's complement integer format.  The conversion is
2622 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2623 | Arithmetic, except that the conversion is always rounded toward zero.
2624 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2625 | the conversion overflows, the largest integer with the same sign as `a' is
2626 | returned.
2627 *----------------------------------------------------------------------------*/
2628
2629 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2630 {
2631     flag aSign;
2632     int16 aExp, shiftCount;
2633     uint64_t aSig, savedASig;
2634     int32 z;
2635
2636     aSig = extractFloat64Frac( a );
2637     aExp = extractFloat64Exp( a );
2638     aSign = extractFloat64Sign( a );
2639     if ( 0x40E < aExp ) {
2640         if ( ( aExp == 0x7FF ) && aSig ) {
2641             aSign = 0;
2642         }
2643         goto invalid;
2644     }
2645     else if ( aExp < 0x3FF ) {
2646         if ( aExp || aSig ) {
2647             STATUS(float_exception_flags) |= float_flag_inexact;
2648         }
2649         return 0;
2650     }
2651     aSig |= LIT64( 0x0010000000000000 );
2652     shiftCount = 0x433 - aExp;
2653     savedASig = aSig;
2654     aSig >>= shiftCount;
2655     z = aSig;
2656     if ( aSign ) {
2657         z = - z;
2658     }
2659     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2660  invalid:
2661         float_raise( float_flag_invalid STATUS_VAR);
2662         return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
2663     }
2664     if ( ( aSig<<shiftCount ) != savedASig ) {
2665         STATUS(float_exception_flags) |= float_flag_inexact;
2666     }
2667     return z;
2668 }
2669
2670 /*----------------------------------------------------------------------------
2671 | Returns the result of converting the double-precision floating-point value
2672 | `a' to the 64-bit two's complement integer format.  The conversion is
2673 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2674 | Arithmetic---which means in particular that the conversion is rounded
2675 | according to the current rounding mode.  If `a' is a NaN, the largest
2676 | positive integer is returned.  Otherwise, if the conversion overflows, the
2677 | largest integer with the same sign as `a' is returned.
2678 *----------------------------------------------------------------------------*/
2679
2680 int64 float64_to_int64( float64 a STATUS_PARAM )
2681 {
2682     flag aSign;
2683     int16 aExp, shiftCount;
2684     uint64_t aSig, aSigExtra;
2685     a = float64_squash_input_denormal(a STATUS_VAR);
2686
2687     aSig = extractFloat64Frac( a );
2688     aExp = extractFloat64Exp( a );
2689     aSign = extractFloat64Sign( a );
2690     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2691     shiftCount = 0x433 - aExp;
2692     if ( shiftCount <= 0 ) {
2693         if ( 0x43E < aExp ) {
2694             float_raise( float_flag_invalid STATUS_VAR);
2695             if (    ! aSign
2696                  || (    ( aExp == 0x7FF )
2697                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2698                ) {
2699                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2700             }
2701             return (int64_t) LIT64( 0x8000000000000000 );
2702         }
2703         aSigExtra = 0;
2704         aSig <<= - shiftCount;
2705     }
2706     else {
2707         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2708     }
2709     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2710
2711 }
2712
2713 /*----------------------------------------------------------------------------
2714 | Returns the result of converting the double-precision floating-point value
2715 | `a' to the 64-bit two's complement integer format.  The conversion is
2716 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2717 | Arithmetic, except that the conversion is always rounded toward zero.
2718 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2719 | the conversion overflows, the largest integer with the same sign as `a' is
2720 | returned.
2721 *----------------------------------------------------------------------------*/
2722
2723 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2724 {
2725     flag aSign;
2726     int16 aExp, shiftCount;
2727     uint64_t aSig;
2728     int64 z;
2729     a = float64_squash_input_denormal(a STATUS_VAR);
2730
2731     aSig = extractFloat64Frac( a );
2732     aExp = extractFloat64Exp( a );
2733     aSign = extractFloat64Sign( a );
2734     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2735     shiftCount = aExp - 0x433;
2736     if ( 0 <= shiftCount ) {
2737         if ( 0x43E <= aExp ) {
2738             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2739                 float_raise( float_flag_invalid STATUS_VAR);
2740                 if (    ! aSign
2741                      || (    ( aExp == 0x7FF )
2742                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2743                    ) {
2744                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2745                 }
2746             }
2747             return (int64_t) LIT64( 0x8000000000000000 );
2748         }
2749         z = aSig<<shiftCount;
2750     }
2751     else {
2752         if ( aExp < 0x3FE ) {
2753             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2754             return 0;
2755         }
2756         z = aSig>>( - shiftCount );
2757         if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
2758             STATUS(float_exception_flags) |= float_flag_inexact;
2759         }
2760     }
2761     if ( aSign ) z = - z;
2762     return z;
2763
2764 }
2765
2766 /*----------------------------------------------------------------------------
2767 | Returns the result of converting the double-precision floating-point value
2768 | `a' to the single-precision floating-point format.  The conversion is
2769 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2770 | Arithmetic.
2771 *----------------------------------------------------------------------------*/
2772
2773 float32 float64_to_float32( float64 a STATUS_PARAM )
2774 {
2775     flag aSign;
2776     int16 aExp;
2777     uint64_t aSig;
2778     uint32_t zSig;
2779     a = float64_squash_input_denormal(a STATUS_VAR);
2780
2781     aSig = extractFloat64Frac( a );
2782     aExp = extractFloat64Exp( a );
2783     aSign = extractFloat64Sign( a );
2784     if ( aExp == 0x7FF ) {
2785         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2786         return packFloat32( aSign, 0xFF, 0 );
2787     }
2788     shift64RightJamming( aSig, 22, &aSig );
2789     zSig = aSig;
2790     if ( aExp || zSig ) {
2791         zSig |= 0x40000000;
2792         aExp -= 0x381;
2793     }
2794     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2795
2796 }
2797
2798
2799 /*----------------------------------------------------------------------------
2800 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2801 | half-precision floating-point value, returning the result.  After being
2802 | shifted into the proper positions, the three fields are simply added
2803 | together to form the result.  This means that any integer portion of `zSig'
2804 | will be added into the exponent.  Since a properly normalized significand
2805 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2806 | than the desired result exponent whenever `zSig' is a complete, normalized
2807 | significand.
2808 *----------------------------------------------------------------------------*/
2809 static float16 packFloat16(flag zSign, int16 zExp, uint16_t zSig)
2810 {
2811     return make_float16(
2812         (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
2813 }
2814
2815 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2816    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2817
2818 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2819 {
2820     flag aSign;
2821     int16 aExp;
2822     uint32_t aSig;
2823
2824     aSign = extractFloat16Sign(a);
2825     aExp = extractFloat16Exp(a);
2826     aSig = extractFloat16Frac(a);
2827
2828     if (aExp == 0x1f && ieee) {
2829         if (aSig) {
2830             return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
2831         }
2832         return packFloat32(aSign, 0xff, aSig << 13);
2833     }
2834     if (aExp == 0) {
2835         int8 shiftCount;
2836
2837         if (aSig == 0) {
2838             return packFloat32(aSign, 0, 0);
2839         }
2840
2841         shiftCount = countLeadingZeros32( aSig ) - 21;
2842         aSig = aSig << shiftCount;
2843         aExp = -shiftCount;
2844     }
2845     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2846 }
2847
2848 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2849 {
2850     flag aSign;
2851     int16 aExp;
2852     uint32_t aSig;
2853     uint32_t mask;
2854     uint32_t increment;
2855     int8 roundingMode;
2856     a = float32_squash_input_denormal(a STATUS_VAR);
2857
2858     aSig = extractFloat32Frac( a );
2859     aExp = extractFloat32Exp( a );
2860     aSign = extractFloat32Sign( a );
2861     if ( aExp == 0xFF ) {
2862         if (aSig) {
2863             /* Input is a NaN */
2864             float16 r = commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2865             if (!ieee) {
2866                 return packFloat16(aSign, 0, 0);
2867             }
2868             return r;
2869         }
2870         /* Infinity */
2871         if (!ieee) {
2872             float_raise(float_flag_invalid STATUS_VAR);
2873             return packFloat16(aSign, 0x1f, 0x3ff);
2874         }
2875         return packFloat16(aSign, 0x1f, 0);
2876     }
2877     if (aExp == 0 && aSig == 0) {
2878         return packFloat16(aSign, 0, 0);
2879     }
2880     /* Decimal point between bits 22 and 23.  */
2881     aSig |= 0x00800000;
2882     aExp -= 0x7f;
2883     if (aExp < -14) {
2884         mask = 0x00ffffff;
2885         if (aExp >= -24) {
2886             mask >>= 25 + aExp;
2887         }
2888     } else {
2889         mask = 0x00001fff;
2890     }
2891     if (aSig & mask) {
2892         float_raise( float_flag_underflow STATUS_VAR );
2893         roundingMode = STATUS(float_rounding_mode);
2894         switch (roundingMode) {
2895         case float_round_nearest_even:
2896             increment = (mask + 1) >> 1;
2897             if ((aSig & mask) == increment) {
2898                 increment = aSig & (increment << 1);
2899             }
2900             break;
2901         case float_round_up:
2902             increment = aSign ? 0 : mask;
2903             break;
2904         case float_round_down:
2905             increment = aSign ? mask : 0;
2906             break;
2907         default: /* round_to_zero */
2908             increment = 0;
2909             break;
2910         }
2911         aSig += increment;
2912         if (aSig >= 0x01000000) {
2913             aSig >>= 1;
2914             aExp++;
2915         }
2916     } else if (aExp < -14
2917           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2918         float_raise( float_flag_underflow STATUS_VAR);
2919     }
2920
2921     if (ieee) {
2922         if (aExp > 15) {
2923             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2924             return packFloat16(aSign, 0x1f, 0);
2925         }
2926     } else {
2927         if (aExp > 16) {
2928             float_raise(float_flag_invalid | float_flag_inexact STATUS_VAR);
2929             return packFloat16(aSign, 0x1f, 0x3ff);
2930         }
2931     }
2932     if (aExp < -24) {
2933         return packFloat16(aSign, 0, 0);
2934     }
2935     if (aExp < -14) {
2936         aSig >>= -14 - aExp;
2937         aExp = -14;
2938     }
2939     return packFloat16(aSign, aExp + 14, aSig >> 13);
2940 }
2941
2942 #ifdef FLOATX80
2943
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the extended double-precision floating-point format.  The conversion
2947 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2948 | Arithmetic.
2949 *----------------------------------------------------------------------------*/
2950
2951 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2952 {
2953     flag aSign;
2954     int16 aExp;
2955     uint64_t aSig;
2956
2957     a = float64_squash_input_denormal(a STATUS_VAR);
2958     aSig = extractFloat64Frac( a );
2959     aExp = extractFloat64Exp( a );
2960     aSign = extractFloat64Sign( a );
2961     if ( aExp == 0x7FF ) {
2962         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2963         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2964     }
2965     if ( aExp == 0 ) {
2966         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2967         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2968     }
2969     return
2970         packFloatx80(
2971             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2972
2973 }
2974
2975 #endif
2976
2977 #ifdef FLOAT128
2978
2979 /*----------------------------------------------------------------------------
2980 | Returns the result of converting the double-precision floating-point value
2981 | `a' to the quadruple-precision floating-point format.  The conversion is
2982 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2983 | Arithmetic.
2984 *----------------------------------------------------------------------------*/
2985
2986 float128 float64_to_float128( float64 a STATUS_PARAM )
2987 {
2988     flag aSign;
2989     int16 aExp;
2990     uint64_t aSig, zSig0, zSig1;
2991
2992     a = float64_squash_input_denormal(a STATUS_VAR);
2993     aSig = extractFloat64Frac( a );
2994     aExp = extractFloat64Exp( a );
2995     aSign = extractFloat64Sign( a );
2996     if ( aExp == 0x7FF ) {
2997         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
2998         return packFloat128( aSign, 0x7FFF, 0, 0 );
2999     }
3000     if ( aExp == 0 ) {
3001         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3002         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3003         --aExp;
3004     }
3005     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3006     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3007
3008 }
3009
3010 #endif
3011
3012 /*----------------------------------------------------------------------------
3013 | Rounds the double-precision floating-point value `a' to an integer, and
3014 | returns the result as a double-precision floating-point value.  The
3015 | operation is performed according to the IEC/IEEE Standard for Binary
3016 | Floating-Point Arithmetic.
3017 *----------------------------------------------------------------------------*/
3018
3019 float64 float64_round_to_int( float64 a STATUS_PARAM )
3020 {
3021     flag aSign;
3022     int16 aExp;
3023     uint64_t lastBitMask, roundBitsMask;
3024     int8 roundingMode;
3025     uint64_t z;
3026     a = float64_squash_input_denormal(a STATUS_VAR);
3027
3028     aExp = extractFloat64Exp( a );
3029     if ( 0x433 <= aExp ) {
3030         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3031             return propagateFloat64NaN( a, a STATUS_VAR );
3032         }
3033         return a;
3034     }
3035     if ( aExp < 0x3FF ) {
3036         if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3037         STATUS(float_exception_flags) |= float_flag_inexact;
3038         aSign = extractFloat64Sign( a );
3039         switch ( STATUS(float_rounding_mode) ) {
3040          case float_round_nearest_even:
3041             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3042                 return packFloat64( aSign, 0x3FF, 0 );
3043             }
3044             break;
3045          case float_round_down:
3046             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3047          case float_round_up:
3048             return make_float64(
3049             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3050         }
3051         return packFloat64( aSign, 0, 0 );
3052     }
3053     lastBitMask = 1;
3054     lastBitMask <<= 0x433 - aExp;
3055     roundBitsMask = lastBitMask - 1;
3056     z = float64_val(a);
3057     roundingMode = STATUS(float_rounding_mode);
3058     if ( roundingMode == float_round_nearest_even ) {
3059         z += lastBitMask>>1;
3060         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
3061     }
3062     else if ( roundingMode != float_round_to_zero ) {
3063         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
3064             z += roundBitsMask;
3065         }
3066     }
3067     z &= ~ roundBitsMask;
3068     if ( z != float64_val(a) )
3069         STATUS(float_exception_flags) |= float_flag_inexact;
3070     return make_float64(z);
3071
3072 }
3073
3074 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3075 {
3076     int oldmode;
3077     float64 res;
3078     oldmode = STATUS(float_rounding_mode);
3079     STATUS(float_rounding_mode) = float_round_to_zero;
3080     res = float64_round_to_int(a STATUS_VAR);
3081     STATUS(float_rounding_mode) = oldmode;
3082     return res;
3083 }
3084
3085 /*----------------------------------------------------------------------------
3086 | Returns the result of adding the absolute values of the double-precision
3087 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3088 | before being returned.  `zSign' is ignored if the result is a NaN.
3089 | The addition is performed according to the IEC/IEEE Standard for Binary
3090 | Floating-Point Arithmetic.
3091 *----------------------------------------------------------------------------*/
3092
3093 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3094 {
3095     int16 aExp, bExp, zExp;
3096     uint64_t aSig, bSig, zSig;
3097     int16 expDiff;
3098
3099     aSig = extractFloat64Frac( a );
3100     aExp = extractFloat64Exp( a );
3101     bSig = extractFloat64Frac( b );
3102     bExp = extractFloat64Exp( b );
3103     expDiff = aExp - bExp;
3104     aSig <<= 9;
3105     bSig <<= 9;
3106     if ( 0 < expDiff ) {
3107         if ( aExp == 0x7FF ) {
3108             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3109             return a;
3110         }
3111         if ( bExp == 0 ) {
3112             --expDiff;
3113         }
3114         else {
3115             bSig |= LIT64( 0x2000000000000000 );
3116         }
3117         shift64RightJamming( bSig, expDiff, &bSig );
3118         zExp = aExp;
3119     }
3120     else if ( expDiff < 0 ) {
3121         if ( bExp == 0x7FF ) {
3122             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3123             return packFloat64( zSign, 0x7FF, 0 );
3124         }
3125         if ( aExp == 0 ) {
3126             ++expDiff;
3127         }
3128         else {
3129             aSig |= LIT64( 0x2000000000000000 );
3130         }
3131         shift64RightJamming( aSig, - expDiff, &aSig );
3132         zExp = bExp;
3133     }
3134     else {
3135         if ( aExp == 0x7FF ) {
3136             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3137             return a;
3138         }
3139         if ( aExp == 0 ) {
3140             if (STATUS(flush_to_zero)) {
3141                 if (aSig | bSig) {
3142                     float_raise(float_flag_output_denormal STATUS_VAR);
3143                 }
3144                 return packFloat64(zSign, 0, 0);
3145             }
3146             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3147         }
3148         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3149         zExp = aExp;
3150         goto roundAndPack;
3151     }
3152     aSig |= LIT64( 0x2000000000000000 );
3153     zSig = ( aSig + bSig )<<1;
3154     --zExp;
3155     if ( (int64_t) zSig < 0 ) {
3156         zSig = aSig + bSig;
3157         ++zExp;
3158     }
3159  roundAndPack:
3160     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3161
3162 }
3163
3164 /*----------------------------------------------------------------------------
3165 | Returns the result of subtracting the absolute values of the double-
3166 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3167 | difference is negated before being returned.  `zSign' is ignored if the
3168 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3169 | Standard for Binary Floating-Point Arithmetic.
3170 *----------------------------------------------------------------------------*/
3171
3172 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3173 {
3174     int16 aExp, bExp, zExp;
3175     uint64_t aSig, bSig, zSig;
3176     int16 expDiff;
3177
3178     aSig = extractFloat64Frac( a );
3179     aExp = extractFloat64Exp( a );
3180     bSig = extractFloat64Frac( b );
3181     bExp = extractFloat64Exp( b );
3182     expDiff = aExp - bExp;
3183     aSig <<= 10;
3184     bSig <<= 10;
3185     if ( 0 < expDiff ) goto aExpBigger;
3186     if ( expDiff < 0 ) goto bExpBigger;
3187     if ( aExp == 0x7FF ) {
3188         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3189         float_raise( float_flag_invalid STATUS_VAR);
3190         return float64_default_nan;
3191     }
3192     if ( aExp == 0 ) {
3193         aExp = 1;
3194         bExp = 1;
3195     }
3196     if ( bSig < aSig ) goto aBigger;
3197     if ( aSig < bSig ) goto bBigger;
3198     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3199  bExpBigger:
3200     if ( bExp == 0x7FF ) {
3201         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3202         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3203     }
3204     if ( aExp == 0 ) {
3205         ++expDiff;
3206     }
3207     else {
3208         aSig |= LIT64( 0x4000000000000000 );
3209     }
3210     shift64RightJamming( aSig, - expDiff, &aSig );
3211     bSig |= LIT64( 0x4000000000000000 );
3212  bBigger:
3213     zSig = bSig - aSig;
3214     zExp = bExp;
3215     zSign ^= 1;
3216     goto normalizeRoundAndPack;
3217  aExpBigger:
3218     if ( aExp == 0x7FF ) {
3219         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3220         return a;
3221     }
3222     if ( bExp == 0 ) {
3223         --expDiff;
3224     }
3225     else {
3226         bSig |= LIT64( 0x4000000000000000 );
3227     }
3228     shift64RightJamming( bSig, expDiff, &bSig );
3229     aSig |= LIT64( 0x4000000000000000 );
3230  aBigger:
3231     zSig = aSig - bSig;
3232     zExp = aExp;
3233  normalizeRoundAndPack:
3234     --zExp;
3235     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3236
3237 }
3238
3239 /*----------------------------------------------------------------------------
3240 | Returns the result of adding the double-precision floating-point values `a'
3241 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3242 | Binary Floating-Point Arithmetic.
3243 *----------------------------------------------------------------------------*/
3244
3245 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3246 {
3247     flag aSign, bSign;
3248     a = float64_squash_input_denormal(a STATUS_VAR);
3249     b = float64_squash_input_denormal(b STATUS_VAR);
3250
3251     aSign = extractFloat64Sign( a );
3252     bSign = extractFloat64Sign( b );
3253     if ( aSign == bSign ) {
3254         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3255     }
3256     else {
3257         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3258     }
3259
3260 }
3261
3262 /*----------------------------------------------------------------------------
3263 | Returns the result of subtracting the double-precision floating-point values
3264 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3265 | for Binary Floating-Point Arithmetic.
3266 *----------------------------------------------------------------------------*/
3267
3268 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3269 {
3270     flag aSign, bSign;
3271     a = float64_squash_input_denormal(a STATUS_VAR);
3272     b = float64_squash_input_denormal(b STATUS_VAR);
3273
3274     aSign = extractFloat64Sign( a );
3275     bSign = extractFloat64Sign( b );
3276     if ( aSign == bSign ) {
3277         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3278     }
3279     else {
3280         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3281     }
3282
3283 }
3284
3285 /*----------------------------------------------------------------------------
3286 | Returns the result of multiplying the double-precision floating-point values
3287 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3288 | for Binary Floating-Point Arithmetic.
3289 *----------------------------------------------------------------------------*/
3290
3291 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3292 {
3293     flag aSign, bSign, zSign;
3294     int16 aExp, bExp, zExp;
3295     uint64_t aSig, bSig, zSig0, zSig1;
3296
3297     a = float64_squash_input_denormal(a STATUS_VAR);
3298     b = float64_squash_input_denormal(b STATUS_VAR);
3299
3300     aSig = extractFloat64Frac( a );
3301     aExp = extractFloat64Exp( a );
3302     aSign = extractFloat64Sign( a );
3303     bSig = extractFloat64Frac( b );
3304     bExp = extractFloat64Exp( b );
3305     bSign = extractFloat64Sign( b );
3306     zSign = aSign ^ bSign;
3307     if ( aExp == 0x7FF ) {
3308         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3309             return propagateFloat64NaN( a, b STATUS_VAR );
3310         }
3311         if ( ( bExp | bSig ) == 0 ) {
3312             float_raise( float_flag_invalid STATUS_VAR);
3313             return float64_default_nan;
3314         }
3315         return packFloat64( zSign, 0x7FF, 0 );
3316     }
3317     if ( bExp == 0x7FF ) {
3318         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3319         if ( ( aExp | aSig ) == 0 ) {
3320             float_raise( float_flag_invalid STATUS_VAR);
3321             return float64_default_nan;
3322         }
3323         return packFloat64( zSign, 0x7FF, 0 );
3324     }
3325     if ( aExp == 0 ) {
3326         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3327         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3328     }
3329     if ( bExp == 0 ) {
3330         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3331         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3332     }
3333     zExp = aExp + bExp - 0x3FF;
3334     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3335     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3336     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3337     zSig0 |= ( zSig1 != 0 );
3338     if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
3339         zSig0 <<= 1;
3340         --zExp;
3341     }
3342     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3343
3344 }
3345
3346 /*----------------------------------------------------------------------------
3347 | Returns the result of dividing the double-precision floating-point value `a'
3348 | by the corresponding value `b'.  The operation is performed according to
3349 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3350 *----------------------------------------------------------------------------*/
3351
3352 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3353 {
3354     flag aSign, bSign, zSign;
3355     int16 aExp, bExp, zExp;
3356     uint64_t aSig, bSig, zSig;
3357     uint64_t rem0, rem1;
3358     uint64_t term0, term1;
3359     a = float64_squash_input_denormal(a STATUS_VAR);
3360     b = float64_squash_input_denormal(b STATUS_VAR);
3361
3362     aSig = extractFloat64Frac( a );
3363     aExp = extractFloat64Exp( a );
3364     aSign = extractFloat64Sign( a );
3365     bSig = extractFloat64Frac( b );
3366     bExp = extractFloat64Exp( b );
3367     bSign = extractFloat64Sign( b );
3368     zSign = aSign ^ bSign;
3369     if ( aExp == 0x7FF ) {
3370         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3371         if ( bExp == 0x7FF ) {
3372             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3373             float_raise( float_flag_invalid STATUS_VAR);
3374             return float64_default_nan;
3375         }
3376         return packFloat64( zSign, 0x7FF, 0 );
3377     }
3378     if ( bExp == 0x7FF ) {
3379         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3380         return packFloat64( zSign, 0, 0 );
3381     }
3382     if ( bExp == 0 ) {
3383         if ( bSig == 0 ) {
3384             if ( ( aExp | aSig ) == 0 ) {
3385                 float_raise( float_flag_invalid STATUS_VAR);
3386                 return float64_default_nan;
3387             }
3388             float_raise( float_flag_divbyzero STATUS_VAR);
3389             return packFloat64( zSign, 0x7FF, 0 );
3390         }
3391         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3392     }
3393     if ( aExp == 0 ) {
3394         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3395         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3396     }
3397     zExp = aExp - bExp + 0x3FD;
3398     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3399     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3400     if ( bSig <= ( aSig + aSig ) ) {
3401         aSig >>= 1;
3402         ++zExp;
3403     }
3404     zSig = estimateDiv128To64( aSig, 0, bSig );
3405     if ( ( zSig & 0x1FF ) <= 2 ) {
3406         mul64To128( bSig, zSig, &term0, &term1 );
3407         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3408         while ( (int64_t) rem0 < 0 ) {
3409             --zSig;
3410             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3411         }
3412         zSig |= ( rem1 != 0 );
3413     }
3414     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3415
3416 }
3417
3418 /*----------------------------------------------------------------------------
3419 | Returns the remainder of the double-precision floating-point value `a'
3420 | with respect to the corresponding value `b'.  The operation is performed
3421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3422 *----------------------------------------------------------------------------*/
3423
3424 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3425 {
3426     flag aSign, zSign;
3427     int16 aExp, bExp, expDiff;
3428     uint64_t aSig, bSig;
3429     uint64_t q, alternateASig;
3430     int64_t sigMean;
3431
3432     a = float64_squash_input_denormal(a STATUS_VAR);
3433     b = float64_squash_input_denormal(b STATUS_VAR);
3434     aSig = extractFloat64Frac( a );
3435     aExp = extractFloat64Exp( a );
3436     aSign = extractFloat64Sign( a );
3437     bSig = extractFloat64Frac( b );
3438     bExp = extractFloat64Exp( b );
3439     if ( aExp == 0x7FF ) {
3440         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3441             return propagateFloat64NaN( a, b STATUS_VAR );
3442         }
3443         float_raise( float_flag_invalid STATUS_VAR);
3444         return float64_default_nan;
3445     }
3446     if ( bExp == 0x7FF ) {
3447         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3448         return a;
3449     }
3450     if ( bExp == 0 ) {
3451         if ( bSig == 0 ) {
3452             float_raise( float_flag_invalid STATUS_VAR);
3453             return float64_default_nan;
3454         }
3455         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3456     }
3457     if ( aExp == 0 ) {
3458         if ( aSig == 0 ) return a;
3459         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3460     }
3461     expDiff = aExp - bExp;
3462     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3463     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3464     if ( expDiff < 0 ) {
3465         if ( expDiff < -1 ) return a;
3466         aSig >>= 1;
3467     }
3468     q = ( bSig <= aSig );
3469     if ( q ) aSig -= bSig;
3470     expDiff -= 64;
3471     while ( 0 < expDiff ) {
3472         q = estimateDiv128To64( aSig, 0, bSig );
3473         q = ( 2 < q ) ? q - 2 : 0;
3474         aSig = - ( ( bSig>>2 ) * q );
3475         expDiff -= 62;
3476     }
3477     expDiff += 64;
3478     if ( 0 < expDiff ) {
3479         q = estimateDiv128To64( aSig, 0, bSig );
3480         q = ( 2 < q ) ? q - 2 : 0;
3481         q >>= 64 - expDiff;
3482         bSig >>= 2;
3483         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3484     }
3485     else {
3486         aSig >>= 2;
3487         bSig >>= 2;
3488     }
3489     do {
3490         alternateASig = aSig;
3491         ++q;
3492         aSig -= bSig;
3493     } while ( 0 <= (int64_t) aSig );
3494     sigMean = aSig + alternateASig;
3495     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3496         aSig = alternateASig;
3497     }
3498     zSign = ( (int64_t) aSig < 0 );
3499     if ( zSign ) aSig = - aSig;
3500     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3501
3502 }
3503
3504 /*----------------------------------------------------------------------------
3505 | Returns the square root of the double-precision floating-point value `a'.
3506 | The operation is performed according to the IEC/IEEE Standard for Binary
3507 | Floating-Point Arithmetic.
3508 *----------------------------------------------------------------------------*/
3509
3510 float64 float64_sqrt( float64 a STATUS_PARAM )
3511 {
3512     flag aSign;
3513     int16 aExp, zExp;
3514     uint64_t aSig, zSig, doubleZSig;
3515     uint64_t rem0, rem1, term0, term1;
3516     a = float64_squash_input_denormal(a STATUS_VAR);
3517
3518     aSig = extractFloat64Frac( a );
3519     aExp = extractFloat64Exp( a );
3520     aSign = extractFloat64Sign( a );
3521     if ( aExp == 0x7FF ) {
3522         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3523         if ( ! aSign ) return a;
3524         float_raise( float_flag_invalid STATUS_VAR);
3525         return float64_default_nan;
3526     }
3527     if ( aSign ) {
3528         if ( ( aExp | aSig ) == 0 ) return a;
3529         float_raise( float_flag_invalid STATUS_VAR);
3530         return float64_default_nan;
3531     }
3532     if ( aExp == 0 ) {
3533         if ( aSig == 0 ) return float64_zero;
3534         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3535     }
3536     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3537     aSig |= LIT64( 0x0010000000000000 );
3538     zSig = estimateSqrt32( aExp, aSig>>21 );
3539     aSig <<= 9 - ( aExp & 1 );
3540     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3541     if ( ( zSig & 0x1FF ) <= 5 ) {
3542         doubleZSig = zSig<<1;
3543         mul64To128( zSig, zSig, &term0, &term1 );
3544         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3545         while ( (int64_t) rem0 < 0 ) {
3546             --zSig;
3547             doubleZSig -= 2;
3548             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3549         }
3550         zSig |= ( ( rem0 | rem1 ) != 0 );
3551     }
3552     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3553
3554 }
3555
3556 /*----------------------------------------------------------------------------
3557 | Returns the binary log of the double-precision floating-point value `a'.
3558 | The operation is performed according to the IEC/IEEE Standard for Binary
3559 | Floating-Point Arithmetic.
3560 *----------------------------------------------------------------------------*/
3561 float64 float64_log2( float64 a STATUS_PARAM )
3562 {
3563     flag aSign, zSign;
3564     int16 aExp;
3565     uint64_t aSig, aSig0, aSig1, zSig, i;
3566     a = float64_squash_input_denormal(a STATUS_VAR);
3567
3568     aSig = extractFloat64Frac( a );
3569     aExp = extractFloat64Exp( a );
3570     aSign = extractFloat64Sign( a );
3571
3572     if ( aExp == 0 ) {
3573         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3574         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3575     }
3576     if ( aSign ) {
3577         float_raise( float_flag_invalid STATUS_VAR);
3578         return float64_default_nan;
3579     }
3580     if ( aExp == 0x7FF ) {
3581         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3582         return a;
3583     }
3584
3585     aExp -= 0x3FF;
3586     aSig |= LIT64( 0x0010000000000000 );
3587     zSign = aExp < 0;
3588     zSig = (uint64_t)aExp << 52;
3589     for (i = 1LL << 51; i > 0; i >>= 1) {
3590         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3591         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3592         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3593             aSig >>= 1;
3594             zSig |= i;
3595         }
3596     }
3597
3598     if ( zSign )
3599         zSig = -zSig;
3600     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3601 }
3602
3603 /*----------------------------------------------------------------------------
3604 | Returns 1 if the double-precision floating-point value `a' is equal to the
3605 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3606 | if either operand is a NaN.  Otherwise, the comparison is performed
3607 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3608 *----------------------------------------------------------------------------*/
3609
3610 int float64_eq( float64 a, float64 b STATUS_PARAM )
3611 {
3612     uint64_t av, bv;
3613     a = float64_squash_input_denormal(a STATUS_VAR);
3614     b = float64_squash_input_denormal(b STATUS_VAR);
3615
3616     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3617          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3618        ) {
3619         float_raise( float_flag_invalid STATUS_VAR);
3620         return 0;
3621     }
3622     av = float64_val(a);
3623     bv = float64_val(b);
3624     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3625
3626 }
3627
3628 /*----------------------------------------------------------------------------
3629 | Returns 1 if the double-precision floating-point value `a' is less than or
3630 | equal to the corresponding value `b', and 0 otherwise.  The invalid
3631 | exception is raised if either operand is a NaN.  The comparison is performed
3632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3633 *----------------------------------------------------------------------------*/
3634
3635 int float64_le( float64 a, float64 b STATUS_PARAM )
3636 {
3637     flag aSign, bSign;
3638     uint64_t av, bv;
3639     a = float64_squash_input_denormal(a STATUS_VAR);
3640     b = float64_squash_input_denormal(b STATUS_VAR);
3641
3642     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3643          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3644        ) {
3645         float_raise( float_flag_invalid STATUS_VAR);
3646         return 0;
3647     }
3648     aSign = extractFloat64Sign( a );
3649     bSign = extractFloat64Sign( b );
3650     av = float64_val(a);
3651     bv = float64_val(b);
3652     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3653     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3654
3655 }
3656
3657 /*----------------------------------------------------------------------------
3658 | Returns 1 if the double-precision floating-point value `a' is less than
3659 | the corresponding value `b', and 0 otherwise.  The invalid exception is
3660 | raised if either operand is a NaN.  The comparison is performed according
3661 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3662 *----------------------------------------------------------------------------*/
3663
3664 int float64_lt( float64 a, float64 b STATUS_PARAM )
3665 {
3666     flag aSign, bSign;
3667     uint64_t av, bv;
3668
3669     a = float64_squash_input_denormal(a STATUS_VAR);
3670     b = float64_squash_input_denormal(b STATUS_VAR);
3671     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3672          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3673        ) {
3674         float_raise( float_flag_invalid STATUS_VAR);
3675         return 0;
3676     }
3677     aSign = extractFloat64Sign( a );
3678     bSign = extractFloat64Sign( b );
3679     av = float64_val(a);
3680     bv = float64_val(b);
3681     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3682     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3683
3684 }
3685
3686 /*----------------------------------------------------------------------------
3687 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3688 | be compared, and 0 otherwise.  The invalid exception is raised if either
3689 | operand is a NaN.  The comparison is performed according to the IEC/IEEE
3690 | Standard for Binary Floating-Point Arithmetic.
3691 *----------------------------------------------------------------------------*/
3692
3693 int float64_unordered( float64 a, float64 b STATUS_PARAM )
3694 {
3695     a = float64_squash_input_denormal(a STATUS_VAR);
3696     b = float64_squash_input_denormal(b STATUS_VAR);
3697
3698     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3699          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3700        ) {
3701         float_raise( float_flag_invalid STATUS_VAR);
3702         return 1;
3703     }
3704     return 0;
3705 }
3706
3707 /*----------------------------------------------------------------------------
3708 | Returns 1 if the double-precision floating-point value `a' is equal to the
3709 | corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3710 | exception.The comparison is performed according to the IEC/IEEE Standard
3711 | for Binary Floating-Point Arithmetic.
3712 *----------------------------------------------------------------------------*/
3713
3714 int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
3715 {
3716     uint64_t av, bv;
3717     a = float64_squash_input_denormal(a STATUS_VAR);
3718     b = float64_squash_input_denormal(b STATUS_VAR);
3719
3720     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3721          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3722        ) {
3723         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3724             float_raise( float_flag_invalid STATUS_VAR);
3725         }
3726         return 0;
3727     }
3728     av = float64_val(a);
3729     bv = float64_val(b);
3730     return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3731
3732 }
3733
3734 /*----------------------------------------------------------------------------
3735 | Returns 1 if the double-precision floating-point value `a' is less than or
3736 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3737 | cause an exception.  Otherwise, the comparison is performed according to the
3738 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3739 *----------------------------------------------------------------------------*/
3740
3741 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3742 {
3743     flag aSign, bSign;
3744     uint64_t av, bv;
3745     a = float64_squash_input_denormal(a STATUS_VAR);
3746     b = float64_squash_input_denormal(b STATUS_VAR);
3747
3748     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3749          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3750        ) {
3751         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3752             float_raise( float_flag_invalid STATUS_VAR);
3753         }
3754         return 0;
3755     }
3756     aSign = extractFloat64Sign( a );
3757     bSign = extractFloat64Sign( b );
3758     av = float64_val(a);
3759     bv = float64_val(b);
3760     if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
3761     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3762
3763 }
3764
3765 /*----------------------------------------------------------------------------
3766 | Returns 1 if the double-precision floating-point value `a' is less than
3767 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3768 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3769 | Standard for Binary Floating-Point Arithmetic.
3770 *----------------------------------------------------------------------------*/
3771
3772 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3773 {
3774     flag aSign, bSign;
3775     uint64_t av, bv;
3776     a = float64_squash_input_denormal(a STATUS_VAR);
3777     b = float64_squash_input_denormal(b STATUS_VAR);
3778
3779     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3780          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3781        ) {
3782         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3783             float_raise( float_flag_invalid STATUS_VAR);
3784         }
3785         return 0;
3786     }
3787     aSign = extractFloat64Sign( a );
3788     bSign = extractFloat64Sign( b );
3789     av = float64_val(a);
3790     bv = float64_val(b);
3791     if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
3792     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3793
3794 }
3795
3796 /*----------------------------------------------------------------------------
3797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3798 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
3799 | comparison is performed according to the IEC/IEEE Standard for Binary
3800 | Floating-Point Arithmetic.
3801 *----------------------------------------------------------------------------*/
3802
3803 int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
3804 {
3805     a = float64_squash_input_denormal(a STATUS_VAR);
3806     b = float64_squash_input_denormal(b STATUS_VAR);
3807
3808     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3809          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3810        ) {
3811         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3812             float_raise( float_flag_invalid STATUS_VAR);
3813         }
3814         return 1;
3815     }
3816     return 0;
3817 }
3818
3819 #ifdef FLOATX80
3820
3821 /*----------------------------------------------------------------------------
3822 | Returns the result of converting the extended double-precision floating-
3823 | point value `a' to the 32-bit two's complement integer format.  The
3824 | conversion is performed according to the IEC/IEEE Standard for Binary
3825 | Floating-Point Arithmetic---which means in particular that the conversion
3826 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3827 | largest positive integer is returned.  Otherwise, if the conversion
3828 | overflows, the largest integer with the same sign as `a' is returned.
3829 *----------------------------------------------------------------------------*/
3830
3831 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3832 {
3833     flag aSign;
3834     int32 aExp, shiftCount;
3835     uint64_t aSig;
3836
3837     aSig = extractFloatx80Frac( a );
3838     aExp = extractFloatx80Exp( a );
3839     aSign = extractFloatx80Sign( a );
3840     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3841     shiftCount = 0x4037 - aExp;
3842     if ( shiftCount <= 0 ) shiftCount = 1;
3843     shift64RightJamming( aSig, shiftCount, &aSig );
3844     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3845
3846 }
3847
3848 /*----------------------------------------------------------------------------
3849 | Returns the result of converting the extended double-precision floating-
3850 | point value `a' to the 32-bit two's complement integer format.  The
3851 | conversion is performed according to the IEC/IEEE Standard for Binary
3852 | Floating-Point Arithmetic, except that the conversion is always rounded
3853 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3854 | Otherwise, if the conversion overflows, the largest integer with the same
3855 | sign as `a' is returned.
3856 *----------------------------------------------------------------------------*/
3857
3858 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3859 {
3860     flag aSign;
3861     int32 aExp, shiftCount;
3862     uint64_t aSig, savedASig;
3863     int32 z;
3864
3865     aSig = extractFloatx80Frac( a );
3866     aExp = extractFloatx80Exp( a );
3867     aSign = extractFloatx80Sign( a );
3868     if ( 0x401E < aExp ) {
3869         if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
3870         goto invalid;
3871     }
3872     else if ( aExp < 0x3FFF ) {
3873         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3874         return 0;
3875     }
3876     shiftCount = 0x403E - aExp;
3877     savedASig = aSig;
3878     aSig >>= shiftCount;
3879     z = aSig;
3880     if ( aSign ) z = - z;
3881     if ( ( z < 0 ) ^ aSign ) {
3882  invalid:
3883         float_raise( float_flag_invalid STATUS_VAR);
3884         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3885     }
3886     if ( ( aSig<<shiftCount ) != savedASig ) {
3887         STATUS(float_exception_flags) |= float_flag_inexact;
3888     }
3889     return z;
3890
3891 }
3892
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of converting the extended double-precision floating-
3895 | point value `a' to the 64-bit two's complement integer format.  The
3896 | conversion is performed according to the IEC/IEEE Standard for Binary
3897 | Floating-Point Arithmetic---which means in particular that the conversion
3898 | is rounded according to the current rounding mode.  If `a' is a NaN,
3899 | the largest positive integer is returned.  Otherwise, if the conversion
3900 | overflows, the largest integer with the same sign as `a' is returned.
3901 *----------------------------------------------------------------------------*/
3902
3903 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3904 {
3905     flag aSign;
3906     int32 aExp, shiftCount;
3907     uint64_t aSig, aSigExtra;
3908
3909     aSig = extractFloatx80Frac( a );
3910     aExp = extractFloatx80Exp( a );
3911     aSign = extractFloatx80Sign( a );
3912     shiftCount = 0x403E - aExp;
3913     if ( shiftCount <= 0 ) {
3914         if ( shiftCount ) {
3915             float_raise( float_flag_invalid STATUS_VAR);
3916             if (    ! aSign
3917                  || (    ( aExp == 0x7FFF )
3918                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3919                ) {
3920                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3921             }
3922             return (int64_t) LIT64( 0x8000000000000000 );
3923         }
3924         aSigExtra = 0;
3925     }
3926     else {
3927         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3928     }
3929     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3930
3931 }
3932
3933 /*----------------------------------------------------------------------------
3934 | Returns the result of converting the extended double-precision floating-
3935 | point value `a' to the 64-bit two's complement integer format.  The
3936 | conversion is performed according to the IEC/IEEE Standard for Binary
3937 | Floating-Point Arithmetic, except that the conversion is always rounded
3938 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3939 | Otherwise, if the conversion overflows, the largest integer with the same
3940 | sign as `a' is returned.
3941 *----------------------------------------------------------------------------*/
3942
3943 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3944 {
3945     flag aSign;
3946     int32 aExp, shiftCount;
3947     uint64_t aSig;
3948     int64 z;
3949
3950     aSig = extractFloatx80Frac( a );
3951     aExp = extractFloatx80Exp( a );
3952     aSign = extractFloatx80Sign( a );
3953     shiftCount = aExp - 0x403E;
3954     if ( 0 <= shiftCount ) {
3955         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3956         if ( ( a.high != 0xC03E ) || aSig ) {
3957             float_raise( float_flag_invalid STATUS_VAR);
3958             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3959                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3960             }
3961         }
3962         return (int64_t) LIT64( 0x8000000000000000 );
3963     }
3964     else if ( aExp < 0x3FFF ) {
3965         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3966         return 0;
3967     }
3968     z = aSig>>( - shiftCount );
3969     if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3970         STATUS(float_exception_flags) |= float_flag_inexact;
3971     }
3972     if ( aSign ) z = - z;
3973     return z;
3974
3975 }
3976
3977 /*----------------------------------------------------------------------------
3978 | Returns the result of converting the extended double-precision floating-
3979 | point value `a' to the single-precision floating-point format.  The
3980 | conversion is performed according to the IEC/IEEE Standard for Binary
3981 | Floating-Point Arithmetic.
3982 *----------------------------------------------------------------------------*/
3983
3984 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3985 {
3986     flag aSign;
3987     int32 aExp;
3988     uint64_t aSig;
3989
3990     aSig = extractFloatx80Frac( a );
3991     aExp = extractFloatx80Exp( a );
3992     aSign = extractFloatx80Sign( a );
3993     if ( aExp == 0x7FFF ) {
3994         if ( (uint64_t) ( aSig<<1 ) ) {
3995             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
3996         }
3997         return packFloat32( aSign, 0xFF, 0 );
3998     }
3999     shift64RightJamming( aSig, 33, &aSig );
4000     if ( aExp || aSig ) aExp -= 0x3F81;
4001     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4002
4003 }
4004
4005 /*----------------------------------------------------------------------------
4006 | Returns the result of converting the extended double-precision floating-
4007 | point value `a' to the double-precision floating-point format.  The
4008 | conversion is performed according to the IEC/IEEE Standard for Binary
4009 | Floating-Point Arithmetic.
4010 *----------------------------------------------------------------------------*/
4011
4012 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4013 {
4014     flag aSign;
4015     int32 aExp;
4016     uint64_t aSig, zSig;
4017
4018     aSig = extractFloatx80Frac( a );
4019     aExp = extractFloatx80Exp( a );
4020     aSign = extractFloatx80Sign( a );
4021     if ( aExp == 0x7FFF ) {
4022         if ( (uint64_t) ( aSig<<1 ) ) {
4023             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4024         }
4025         return packFloat64( aSign, 0x7FF, 0 );
4026     }
4027     shift64RightJamming( aSig, 1, &zSig );
4028     if ( aExp || aSig ) aExp -= 0x3C01;
4029     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4030
4031 }
4032
4033 #ifdef FLOAT128
4034
4035 /*----------------------------------------------------------------------------
4036 | Returns the result of converting the extended double-precision floating-
4037 | point value `a' to the quadruple-precision floating-point format.  The
4038 | conversion is performed according to the IEC/IEEE Standard for Binary
4039 | Floating-Point Arithmetic.
4040 *----------------------------------------------------------------------------*/
4041
4042 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4043 {
4044     flag aSign;
4045     int16 aExp;
4046     uint64_t aSig, zSig0, zSig1;
4047
4048     aSig = extractFloatx80Frac( a );
4049     aExp = extractFloatx80Exp( a );
4050     aSign = extractFloatx80Sign( a );
4051     if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
4052         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
4053     }
4054     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4055     return packFloat128( aSign, aExp, zSig0, zSig1 );
4056
4057 }
4058
4059 #endif
4060
4061 /*----------------------------------------------------------------------------
4062 | Rounds the extended double-precision floating-point value `a' to an integer,
4063 | and returns the result as an extended quadruple-precision floating-point
4064 | value.  The operation is performed according to the IEC/IEEE Standard for
4065 | Binary Floating-Point Arithmetic.
4066 *----------------------------------------------------------------------------*/
4067
4068 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4069 {
4070     flag aSign;
4071     int32 aExp;
4072     uint64_t lastBitMask, roundBitsMask;
4073     int8 roundingMode;
4074     floatx80 z;
4075
4076     aExp = extractFloatx80Exp( a );
4077     if ( 0x403E <= aExp ) {
4078         if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
4079             return propagateFloatx80NaN( a, a STATUS_VAR );
4080         }
4081         return a;
4082     }
4083     if ( aExp < 0x3FFF ) {
4084         if (    ( aExp == 0 )
4085              && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
4086             return a;
4087         }
4088         STATUS(float_exception_flags) |= float_flag_inexact;
4089         aSign = extractFloatx80Sign( a );
4090         switch ( STATUS(float_rounding_mode) ) {
4091          case float_round_nearest_even:
4092             if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
4093                ) {
4094                 return
4095                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4096             }
4097             break;
4098          case float_round_down:
4099             return
4100                   aSign ?
4101                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4102                 : packFloatx80( 0, 0, 0 );
4103          case float_round_up:
4104             return
4105                   aSign ? packFloatx80( 1, 0, 0 )
4106                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4107         }
4108         return packFloatx80( aSign, 0, 0 );
4109     }
4110     lastBitMask = 1;
4111     lastBitMask <<= 0x403E - aExp;
4112     roundBitsMask = lastBitMask - 1;
4113     z = a;
4114     roundingMode = STATUS(float_rounding_mode);
4115     if ( roundingMode == float_round_nearest_even ) {
4116         z.low += lastBitMask>>1;
4117         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4118     }
4119     else if ( roundingMode != float_round_to_zero ) {
4120         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4121             z.low += roundBitsMask;
4122         }
4123     }
4124     z.low &= ~ roundBitsMask;
4125     if ( z.low == 0 ) {
4126         ++z.high;
4127         z.low = LIT64( 0x8000000000000000 );
4128     }
4129     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4130     return z;
4131
4132 }
4133
4134 /*----------------------------------------------------------------------------
4135 | Returns the result of adding the absolute values of the extended double-
4136 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4137 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4138 | The addition is performed according to the IEC/IEEE Standard for Binary
4139 | Floating-Point Arithmetic.
4140 *----------------------------------------------------------------------------*/
4141
4142 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4143 {
4144     int32 aExp, bExp, zExp;
4145     uint64_t aSig, bSig, zSig0, zSig1;
4146     int32 expDiff;
4147
4148     aSig = extractFloatx80Frac( a );
4149     aExp = extractFloatx80Exp( a );
4150     bSig = extractFloatx80Frac( b );
4151     bExp = extractFloatx80Exp( b );
4152     expDiff = aExp - bExp;
4153     if ( 0 < expDiff ) {
4154         if ( aExp == 0x7FFF ) {
4155             if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4156             return a;
4157         }
4158         if ( bExp == 0 ) --expDiff;
4159         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4160         zExp = aExp;
4161     }
4162     else if ( expDiff < 0 ) {
4163         if ( bExp == 0x7FFF ) {
4164             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4165             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4166         }
4167         if ( aExp == 0 ) ++expDiff;
4168         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4169         zExp = bExp;
4170     }
4171     else {
4172         if ( aExp == 0x7FFF ) {
4173             if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4174                 return propagateFloatx80NaN( a, b STATUS_VAR );
4175             }
4176             return a;
4177         }
4178         zSig1 = 0;
4179         zSig0 = aSig + bSig;
4180         if ( aExp == 0 ) {
4181             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4182             goto roundAndPack;
4183         }
4184         zExp = aExp;
4185         goto shiftRight1;
4186     }
4187     zSig0 = aSig + bSig;
4188     if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
4189  shiftRight1:
4190     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4191     zSig0 |= LIT64( 0x8000000000000000 );
4192     ++zExp;
4193  roundAndPack:
4194     return
4195         roundAndPackFloatx80(
4196             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4197
4198 }
4199
4200 /*----------------------------------------------------------------------------
4201 | Returns the result of subtracting the absolute values of the extended
4202 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4203 | difference is negated before being returned.  `zSign' is ignored if the
4204 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4205 | Standard for Binary Floating-Point Arithmetic.
4206 *----------------------------------------------------------------------------*/
4207
4208 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4209 {
4210     int32 aExp, bExp, zExp;
4211     uint64_t aSig, bSig, zSig0, zSig1;
4212     int32 expDiff;
4213     floatx80 z;
4214
4215     aSig = extractFloatx80Frac( a );
4216     aExp = extractFloatx80Exp( a );
4217     bSig = extractFloatx80Frac( b );
4218     bExp = extractFloatx80Exp( b );
4219     expDiff = aExp - bExp;
4220     if ( 0 < expDiff ) goto aExpBigger;
4221     if ( expDiff < 0 ) goto bExpBigger;
4222     if ( aExp == 0x7FFF ) {
4223         if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
4224             return propagateFloatx80NaN( a, b STATUS_VAR );
4225         }
4226         float_raise( float_flag_invalid STATUS_VAR);
4227         z.low = floatx80_default_nan_low;
4228         z.high = floatx80_default_nan_high;
4229         return z;
4230     }
4231     if ( aExp == 0 ) {
4232         aExp = 1;
4233         bExp = 1;
4234     }
4235     zSig1 = 0;
4236     if ( bSig < aSig ) goto aBigger;
4237     if ( aSig < bSig ) goto bBigger;
4238     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4239  bExpBigger:
4240     if ( bExp == 0x7FFF ) {
4241         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4242         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4243     }
4244     if ( aExp == 0 ) ++expDiff;
4245     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4246  bBigger:
4247     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4248     zExp = bExp;
4249     zSign ^= 1;
4250     goto normalizeRoundAndPack;
4251  aExpBigger:
4252     if ( aExp == 0x7FFF ) {
4253         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4254         return a;
4255     }
4256     if ( bExp == 0 ) --expDiff;
4257     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4258  aBigger:
4259     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4260     zExp = aExp;
4261  normalizeRoundAndPack:
4262     return
4263         normalizeRoundAndPackFloatx80(
4264             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4265
4266 }
4267
4268 /*----------------------------------------------------------------------------
4269 | Returns the result of adding the extended double-precision floating-point
4270 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4271 | Standard for Binary Floating-Point Arithmetic.
4272 *----------------------------------------------------------------------------*/
4273
4274 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4275 {
4276     flag aSign, bSign;
4277
4278     aSign = extractFloatx80Sign( a );
4279     bSign = extractFloatx80Sign( b );
4280     if ( aSign == bSign ) {
4281         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4282     }
4283     else {
4284         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4285     }
4286
4287 }
4288
4289 /*----------------------------------------------------------------------------
4290 | Returns the result of subtracting the extended double-precision floating-
4291 | point values `a' and `b'.  The operation is performed according to the
4292 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4293 *----------------------------------------------------------------------------*/
4294
4295 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4296 {
4297     flag aSign, bSign;
4298
4299     aSign = extractFloatx80Sign( a );
4300     bSign = extractFloatx80Sign( b );
4301     if ( aSign == bSign ) {
4302         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4303     }
4304     else {
4305         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4306     }
4307
4308 }
4309
4310 /*----------------------------------------------------------------------------
4311 | Returns the result of multiplying the extended double-precision floating-
4312 | point values `a' and `b'.  The operation is performed according to the
4313 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4314 *----------------------------------------------------------------------------*/
4315
4316 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4317 {
4318     flag aSign, bSign, zSign;
4319     int32 aExp, bExp, zExp;
4320     uint64_t aSig, bSig, zSig0, zSig1;
4321     floatx80 z;
4322
4323     aSig = extractFloatx80Frac( a );
4324     aExp = extractFloatx80Exp( a );
4325     aSign = extractFloatx80Sign( a );
4326     bSig = extractFloatx80Frac( b );
4327     bExp = extractFloatx80Exp( b );
4328     bSign = extractFloatx80Sign( b );
4329     zSign = aSign ^ bSign;
4330     if ( aExp == 0x7FFF ) {
4331         if (    (uint64_t) ( aSig<<1 )
4332              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4333             return propagateFloatx80NaN( a, b STATUS_VAR );
4334         }
4335         if ( ( bExp | bSig ) == 0 ) goto invalid;
4336         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4337     }
4338     if ( bExp == 0x7FFF ) {
4339         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4340         if ( ( aExp | aSig ) == 0 ) {
4341  invalid:
4342             float_raise( float_flag_invalid STATUS_VAR);
4343             z.low = floatx80_default_nan_low;
4344             z.high = floatx80_default_nan_high;
4345             return z;
4346         }
4347         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4348     }
4349     if ( aExp == 0 ) {
4350         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4351         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4352     }
4353     if ( bExp == 0 ) {
4354         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4355         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4356     }
4357     zExp = aExp + bExp - 0x3FFE;
4358     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4359     if ( 0 < (int64_t) zSig0 ) {
4360         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4361         --zExp;
4362     }
4363     return
4364         roundAndPackFloatx80(
4365             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4366
4367 }
4368
4369 /*----------------------------------------------------------------------------
4370 | Returns the result of dividing the extended double-precision floating-point
4371 | value `a' by the corresponding value `b'.  The operation is performed
4372 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4373 *----------------------------------------------------------------------------*/
4374
4375 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4376 {
4377     flag aSign, bSign, zSign;
4378     int32 aExp, bExp, zExp;
4379     uint64_t aSig, bSig, zSig0, zSig1;
4380     uint64_t rem0, rem1, rem2, term0, term1, term2;
4381     floatx80 z;
4382
4383     aSig = extractFloatx80Frac( a );
4384     aExp = extractFloatx80Exp( a );
4385     aSign = extractFloatx80Sign( a );
4386     bSig = extractFloatx80Frac( b );
4387     bExp = extractFloatx80Exp( b );
4388     bSign = extractFloatx80Sign( b );
4389     zSign = aSign ^ bSign;
4390     if ( aExp == 0x7FFF ) {
4391         if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4392         if ( bExp == 0x7FFF ) {
4393             if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4394             goto invalid;
4395         }
4396         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4397     }
4398     if ( bExp == 0x7FFF ) {
4399         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4400         return packFloatx80( zSign, 0, 0 );
4401     }
4402     if ( bExp == 0 ) {
4403         if ( bSig == 0 ) {
4404             if ( ( aExp | aSig ) == 0 ) {
4405  invalid:
4406                 float_raise( float_flag_invalid STATUS_VAR);
4407                 z.low = floatx80_default_nan_low;
4408                 z.high = floatx80_default_nan_high;
4409                 return z;
4410             }
4411             float_raise( float_flag_divbyzero STATUS_VAR);
4412             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4413         }
4414         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4415     }
4416     if ( aExp == 0 ) {
4417         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4418         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4419     }
4420     zExp = aExp - bExp + 0x3FFE;
4421     rem1 = 0;
4422     if ( bSig <= aSig ) {
4423         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4424         ++zExp;
4425     }
4426     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4427     mul64To128( bSig, zSig0, &term0, &term1 );
4428     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4429     while ( (int64_t) rem0 < 0 ) {
4430         --zSig0;
4431         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4432     }
4433     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4434     if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
4435         mul64To128( bSig, zSig1, &term1, &term2 );
4436         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4437         while ( (int64_t) rem1 < 0 ) {
4438             --zSig1;
4439             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4440         }
4441         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4442     }
4443     return
4444         roundAndPackFloatx80(
4445             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4446
4447 }
4448
4449 /*----------------------------------------------------------------------------
4450 | Returns the remainder of the extended double-precision floating-point value
4451 | `a' with respect to the corresponding value `b'.  The operation is performed
4452 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4453 *----------------------------------------------------------------------------*/
4454
4455 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4456 {
4457     flag aSign, zSign;
4458     int32 aExp, bExp, expDiff;
4459     uint64_t aSig0, aSig1, bSig;
4460     uint64_t q, term0, term1, alternateASig0, alternateASig1;
4461     floatx80 z;
4462
4463     aSig0 = extractFloatx80Frac( a );
4464     aExp = extractFloatx80Exp( a );
4465     aSign = extractFloatx80Sign( a );
4466     bSig = extractFloatx80Frac( b );
4467     bExp = extractFloatx80Exp( b );
4468     if ( aExp == 0x7FFF ) {
4469         if (    (uint64_t) ( aSig0<<1 )
4470              || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
4471             return propagateFloatx80NaN( a, b STATUS_VAR );
4472         }
4473         goto invalid;
4474     }
4475     if ( bExp == 0x7FFF ) {
4476         if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4477         return a;
4478     }
4479     if ( bExp == 0 ) {
4480         if ( bSig == 0 ) {
4481  invalid:
4482             float_raise( float_flag_invalid STATUS_VAR);
4483             z.low = floatx80_default_nan_low;
4484             z.high = floatx80_default_nan_high;
4485             return z;
4486         }
4487         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4488     }
4489     if ( aExp == 0 ) {
4490         if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
4491         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4492     }
4493     bSig |= LIT64( 0x8000000000000000 );
4494     zSign = aSign;
4495     expDiff = aExp - bExp;
4496     aSig1 = 0;
4497     if ( expDiff < 0 ) {
4498         if ( expDiff < -1 ) return a;
4499         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4500         expDiff = 0;
4501     }
4502     q = ( bSig <= aSig0 );
4503     if ( q ) aSig0 -= bSig;
4504     expDiff -= 64;
4505     while ( 0 < expDiff ) {
4506         q = estimateDiv128To64( aSig0, aSig1, bSig );
4507         q = ( 2 < q ) ? q - 2 : 0;
4508         mul64To128( bSig, q, &term0, &term1 );
4509         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4510         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4511         expDiff -= 62;
4512     }
4513     expDiff += 64;
4514     if ( 0 < expDiff ) {
4515         q = estimateDiv128To64( aSig0, aSig1, bSig );
4516         q = ( 2 < q ) ? q - 2 : 0;
4517         q >>= 64 - expDiff;
4518         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4519         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4520         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4521         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4522             ++q;
4523             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4524         }
4525     }
4526     else {
4527         term1 = 0;
4528         term0 = bSig;
4529     }
4530     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4531     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4532          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4533               && ( q & 1 ) )
4534        ) {
4535         aSig0 = alternateASig0;
4536         aSig1 = alternateASig1;
4537         zSign = ! zSign;
4538     }
4539     return
4540         normalizeRoundAndPackFloatx80(
4541             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4542
4543 }
4544
4545 /*----------------------------------------------------------------------------
4546 | Returns the square root of the extended double-precision floating-point
4547 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4548 | for Binary Floating-Point Arithmetic.
4549 *----------------------------------------------------------------------------*/
4550
4551 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4552 {
4553     flag aSign;
4554     int32 aExp, zExp;
4555     uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4556     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4557     floatx80 z;
4558
4559     aSig0 = extractFloatx80Frac( a );
4560     aExp = extractFloatx80Exp( a );
4561     aSign = extractFloatx80Sign( a );
4562     if ( aExp == 0x7FFF ) {
4563         if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4564         if ( ! aSign ) return a;
4565         goto invalid;
4566     }
4567     if ( aSign ) {
4568         if ( ( aExp | aSig0 ) == 0 ) return a;
4569  invalid:
4570         float_raise( float_flag_invalid STATUS_VAR);
4571         z.low = floatx80_default_nan_low;
4572         z.high = floatx80_default_nan_high;
4573         return z;
4574     }
4575     if ( aExp == 0 ) {
4576         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4577         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4578     }
4579     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4580     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4581     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4582     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4583     doubleZSig0 = zSig0<<1;
4584     mul64To128( zSig0, zSig0, &term0, &term1 );
4585     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4586     while ( (int64_t) rem0 < 0 ) {
4587         --zSig0;
4588         doubleZSig0 -= 2;
4589         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4590     }
4591     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4592     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4593         if ( zSig1 == 0 ) zSig1 = 1;
4594         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4595         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4596         mul64To128( zSig1, zSig1, &term2, &term3 );
4597         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4598         while ( (int64_t) rem1 < 0 ) {
4599             --zSig1;
4600             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4601             term3 |= 1;
4602             term2 |= doubleZSig0;
4603             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4604         }
4605         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4606     }
4607     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4608     zSig0 |= doubleZSig0;
4609     return
4610         roundAndPackFloatx80(
4611             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4612
4613 }
4614
4615 /*----------------------------------------------------------------------------
4616 | Returns 1 if the extended double-precision floating-point value `a' is equal
4617 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4618 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4620 *----------------------------------------------------------------------------*/
4621
4622 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4623 {
4624
4625     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4626               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4627          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4628               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4629        ) {
4630         float_raise( float_flag_invalid STATUS_VAR);
4631         return 0;
4632     }
4633     return
4634            ( a.low == b.low )
4635         && (    ( a.high == b.high )
4636              || (    ( a.low == 0 )
4637                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4638            );
4639
4640 }
4641
4642 /*----------------------------------------------------------------------------
4643 | Returns 1 if the extended double-precision floating-point value `a' is
4644 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4645 | invalid exception is raised if either operand is a NaN.  The comparison is
4646 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4647 | Arithmetic.
4648 *----------------------------------------------------------------------------*/
4649
4650 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4651 {
4652     flag aSign, bSign;
4653
4654     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4655               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4656          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4657               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4658        ) {
4659         float_raise( float_flag_invalid STATUS_VAR);
4660         return 0;
4661     }
4662     aSign = extractFloatx80Sign( a );
4663     bSign = extractFloatx80Sign( b );
4664     if ( aSign != bSign ) {
4665         return
4666                aSign
4667             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4668                  == 0 );
4669     }
4670     return
4671           aSign ? le128( b.high, b.low, a.high, a.low )
4672         : le128( a.high, a.low, b.high, b.low );
4673
4674 }
4675
4676 /*----------------------------------------------------------------------------
4677 | Returns 1 if the extended double-precision floating-point value `a' is
4678 | less than the corresponding value `b', and 0 otherwise.  The invalid
4679 | exception is raised if either operand is a NaN.  The comparison is performed
4680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4681 *----------------------------------------------------------------------------*/
4682
4683 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4684 {
4685     flag aSign, bSign;
4686
4687     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4688               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4689          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4690               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4691        ) {
4692         float_raise( float_flag_invalid STATUS_VAR);
4693         return 0;
4694     }
4695     aSign = extractFloatx80Sign( a );
4696     bSign = extractFloatx80Sign( b );
4697     if ( aSign != bSign ) {
4698         return
4699                aSign
4700             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4701                  != 0 );
4702     }
4703     return
4704           aSign ? lt128( b.high, b.low, a.high, a.low )
4705         : lt128( a.high, a.low, b.high, b.low );
4706
4707 }
4708
4709 /*----------------------------------------------------------------------------
4710 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4711 | cannot be compared, and 0 otherwise.  The invalid exception is raised if
4712 | either operand is a NaN.   The comparison is performed according to the
4713 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4714 *----------------------------------------------------------------------------*/
4715 int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
4716 {
4717     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4718               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4719          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4720               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4721        ) {
4722         float_raise( float_flag_invalid STATUS_VAR);
4723         return 1;
4724     }
4725     return 0;
4726 }
4727
4728 /*----------------------------------------------------------------------------
4729 | Returns 1 if the extended double-precision floating-point value `a' is
4730 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
4731 | cause an exception.  The comparison is performed according to the IEC/IEEE
4732 | Standard for Binary Floating-Point Arithmetic.
4733 *----------------------------------------------------------------------------*/
4734
4735 int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4736 {
4737
4738     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4739               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4740          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4741               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4742        ) {
4743         if (    floatx80_is_signaling_nan( a )
4744              || floatx80_is_signaling_nan( b ) ) {
4745             float_raise( float_flag_invalid STATUS_VAR);
4746         }
4747         return 0;
4748     }
4749     return
4750            ( a.low == b.low )
4751         && (    ( a.high == b.high )
4752              || (    ( a.low == 0 )
4753                   && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
4754            );
4755
4756 }
4757
4758 /*----------------------------------------------------------------------------
4759 | Returns 1 if the extended double-precision floating-point value `a' is less
4760 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4761 | do not cause an exception.  Otherwise, the comparison is performed according
4762 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4763 *----------------------------------------------------------------------------*/
4764
4765 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4766 {
4767     flag aSign, bSign;
4768
4769     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4770               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4771          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4772               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4773        ) {
4774         if (    floatx80_is_signaling_nan( a )
4775              || floatx80_is_signaling_nan( b ) ) {
4776             float_raise( float_flag_invalid STATUS_VAR);
4777         }
4778         return 0;
4779     }
4780     aSign = extractFloatx80Sign( a );
4781     bSign = extractFloatx80Sign( b );
4782     if ( aSign != bSign ) {
4783         return
4784                aSign
4785             || (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4786                  == 0 );
4787     }
4788     return
4789           aSign ? le128( b.high, b.low, a.high, a.low )
4790         : le128( a.high, a.low, b.high, b.low );
4791
4792 }
4793
4794 /*----------------------------------------------------------------------------
4795 | Returns 1 if the extended double-precision floating-point value `a' is less
4796 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4797 | an exception.  Otherwise, the comparison is performed according to the
4798 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4799 *----------------------------------------------------------------------------*/
4800
4801 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4802 {
4803     flag aSign, bSign;
4804
4805     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4806               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4807          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4808               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4809        ) {
4810         if (    floatx80_is_signaling_nan( a )
4811              || floatx80_is_signaling_nan( b ) ) {
4812             float_raise( float_flag_invalid STATUS_VAR);
4813         }
4814         return 0;
4815     }
4816     aSign = extractFloatx80Sign( a );
4817     bSign = extractFloatx80Sign( b );
4818     if ( aSign != bSign ) {
4819         return
4820                aSign
4821             && (    ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4822                  != 0 );
4823     }
4824     return
4825           aSign ? lt128( b.high, b.low, a.high, a.low )
4826         : lt128( a.high, a.low, b.high, b.low );
4827
4828 }
4829
4830 /*----------------------------------------------------------------------------
4831 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4832 | cannot be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.
4833 | The comparison is performed according to the IEC/IEEE Standard for Binary
4834 | Floating-Point Arithmetic.
4835 *----------------------------------------------------------------------------*/
4836 int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4837 {
4838     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4839               && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
4840          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4841               && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
4842        ) {
4843         if (    floatx80_is_signaling_nan( a )
4844              || floatx80_is_signaling_nan( b ) ) {
4845             float_raise( float_flag_invalid STATUS_VAR);
4846         }
4847         return 1;
4848     }
4849     return 0;
4850 }
4851
4852 #endif
4853
4854 #ifdef FLOAT128
4855
4856 /*----------------------------------------------------------------------------
4857 | Returns the result of converting the quadruple-precision floating-point
4858 | value `a' to the 32-bit two's complement integer format.  The conversion
4859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4860 | Arithmetic---which means in particular that the conversion is rounded
4861 | according to the current rounding mode.  If `a' is a NaN, the largest
4862 | positive integer is returned.  Otherwise, if the conversion overflows, the
4863 | largest integer with the same sign as `a' is returned.
4864 *----------------------------------------------------------------------------*/
4865
4866 int32 float128_to_int32( float128 a STATUS_PARAM )
4867 {
4868     flag aSign;
4869     int32 aExp, shiftCount;
4870     uint64_t aSig0, aSig1;
4871
4872     aSig1 = extractFloat128Frac1( a );
4873     aSig0 = extractFloat128Frac0( a );
4874     aExp = extractFloat128Exp( a );
4875     aSign = extractFloat128Sign( a );
4876     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4877     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4878     aSig0 |= ( aSig1 != 0 );
4879     shiftCount = 0x4028 - aExp;
4880     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4881     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4882
4883 }
4884
4885 /*----------------------------------------------------------------------------
4886 | Returns the result of converting the quadruple-precision floating-point
4887 | value `a' to the 32-bit two's complement integer format.  The conversion
4888 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4889 | Arithmetic, except that the conversion is always rounded toward zero.  If
4890 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4891 | conversion overflows, the largest integer with the same sign as `a' is
4892 | returned.
4893 *----------------------------------------------------------------------------*/
4894
4895 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4896 {
4897     flag aSign;
4898     int32 aExp, shiftCount;
4899     uint64_t aSig0, aSig1, savedASig;
4900     int32 z;
4901
4902     aSig1 = extractFloat128Frac1( a );
4903     aSig0 = extractFloat128Frac0( a );
4904     aExp = extractFloat128Exp( a );
4905     aSign = extractFloat128Sign( a );
4906     aSig0 |= ( aSig1 != 0 );
4907     if ( 0x401E < aExp ) {
4908         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4909         goto invalid;
4910     }
4911     else if ( aExp < 0x3FFF ) {
4912         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4913         return 0;
4914     }
4915     aSig0 |= LIT64( 0x0001000000000000 );
4916     shiftCount = 0x402F - aExp;
4917     savedASig = aSig0;
4918     aSig0 >>= shiftCount;
4919     z = aSig0;
4920     if ( aSign ) z = - z;
4921     if ( ( z < 0 ) ^ aSign ) {
4922  invalid:
4923         float_raise( float_flag_invalid STATUS_VAR);
4924         return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4925     }
4926     if ( ( aSig0<<shiftCount ) != savedASig ) {
4927         STATUS(float_exception_flags) |= float_flag_inexact;
4928     }
4929     return z;
4930
4931 }
4932
4933 /*----------------------------------------------------------------------------
4934 | Returns the result of converting the quadruple-precision floating-point
4935 | value `a' to the 64-bit two's complement integer format.  The conversion
4936 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4937 | Arithmetic---which means in particular that the conversion is rounded
4938 | according to the current rounding mode.  If `a' is a NaN, the largest
4939 | positive integer is returned.  Otherwise, if the conversion overflows, the
4940 | largest integer with the same sign as `a' is returned.
4941 *----------------------------------------------------------------------------*/
4942
4943 int64 float128_to_int64( float128 a STATUS_PARAM )
4944 {
4945     flag aSign;
4946     int32 aExp, shiftCount;
4947     uint64_t aSig0, aSig1;
4948
4949     aSig1 = extractFloat128Frac1( a );
4950     aSig0 = extractFloat128Frac0( a );
4951     aExp = extractFloat128Exp( a );
4952     aSign = extractFloat128Sign( a );
4953     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4954     shiftCount = 0x402F - aExp;
4955     if ( shiftCount <= 0 ) {
4956         if ( 0x403E < aExp ) {
4957             float_raise( float_flag_invalid STATUS_VAR);
4958             if (    ! aSign
4959                  || (    ( aExp == 0x7FFF )
4960                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4961                     )
4962                ) {
4963                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4964             }
4965             return (int64_t) LIT64( 0x8000000000000000 );
4966         }
4967         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4968     }
4969     else {
4970         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4971     }
4972     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4973
4974 }
4975
/*----------------------------------------------------------------------------
| Returns the result of converting the quadruple-precision floating-point
| value `a' to the 64-bit two's complement integer format.  The conversion
| is performed according to the IEC/IEEE Standard for Binary Floating-Point
| Arithmetic, except that the conversion is always rounded toward zero.
| If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
| the conversion overflows, the largest integer with the same sign as `a' is
| returned.  The invalid exception is raised on NaN/overflow and the inexact
| flag is set whenever nonzero fraction bits are discarded.
*----------------------------------------------------------------------------*/

int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp, shiftCount;
    uint64_t aSig0, aSig1;
    int64 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    aSign = extractFloat128Sign( a );
    /* Supply the leading 1 for values with a nonzero exponent field. */
    if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
    /* Positive shiftCount: the significand must move left; zero or negative:
       it must move right (by -shiftCount). */
    shiftCount = aExp - 0x402F;
    if ( 0 < shiftCount ) {
        if ( 0x403E <= aExp ) {
            /* Magnitude is at least 2^63.  Drop the leading 1 again so that
               aSig0 | aSig1 tests the fraction alone (NaN check below). */
            aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
            /* The only in-range case: high word 0xC03E000000000000 (sign
               set, exponent 0x403E, upper fraction zero) with a low word
               below 2^49, i.e. a magnitude in [2^63, 2^63 + 1), which
               truncates to exactly -2^63. */
            if (    ( a.high == LIT64( 0xC03E000000000000 ) )
                 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
                if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
            }
            else {
                float_raise( float_flag_invalid STATUS_VAR);
                /* Positive overflow and NaNs give the largest positive
                   integer; negative overflow falls through below. */
                if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
                    return LIT64( 0x7FFFFFFFFFFFFFFF );
                }
            }
            return (int64_t) LIT64( 0x8000000000000000 );
        }
        /* Here 1 <= shiftCount <= 14: shift the 128-bit significand left and
           keep the upper 64 bits as the integer magnitude. */
        z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
        /* Bits of the low word left below the integer part are discarded. */
        if ( (uint64_t) ( aSig1<<shiftCount ) ) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Magnitude below 1 truncates to 0; inexact unless `a' is 0. */
            if ( aExp | aSig0 | aSig1 ) {
                STATUS(float_exception_flags) |= float_flag_inexact;
            }
            return 0;
        }
        z = aSig0>>( - shiftCount );
        /* Inexact if the low word is nonzero or the right shift dropped bits
           of the high word.  The `shiftCount &&' guard skips the check when
           no shift took place ( shiftCount & 63 would then be 0 ). */
        if (    aSig1
             || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
            STATUS(float_exception_flags) |= float_flag_inexact;
        }
    }
    if ( aSign ) z = - z;
    return z;

}
5036
5037 /*----------------------------------------------------------------------------
5038 | Returns the result of converting the quadruple-precision floating-point
5039 | value `a' to the single-precision floating-point format.  The conversion
5040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5041 | Arithmetic.
5042 *----------------------------------------------------------------------------*/
5043
5044 float32 float128_to_float32( float128 a STATUS_PARAM )
5045 {
5046     flag aSign;
5047     int32 aExp;
5048     uint64_t aSig0, aSig1;
5049     uint32_t zSig;
5050
5051     aSig1 = extractFloat128Frac1( a );
5052     aSig0 = extractFloat128Frac0( a );
5053     aExp = extractFloat128Exp( a );
5054     aSign = extractFloat128Sign( a );
5055     if ( aExp == 0x7FFF ) {
5056         if ( aSig0 | aSig1 ) {
5057             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5058         }
5059         return packFloat32( aSign, 0xFF, 0 );
5060     }
5061     aSig0 |= ( aSig1 != 0 );
5062     shift64RightJamming( aSig0, 18, &aSig0 );
5063     zSig = aSig0;
5064     if ( aExp || zSig ) {
5065         zSig |= 0x40000000;
5066         aExp -= 0x3F81;
5067     }
5068     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5069
5070 }
5071
5072 /*----------------------------------------------------------------------------
5073 | Returns the result of converting the quadruple-precision floating-point
5074 | value `a' to the double-precision floating-point format.  The conversion
5075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5076 | Arithmetic.
5077 *----------------------------------------------------------------------------*/
5078
5079 float64 float128_to_float64( float128 a STATUS_PARAM )
5080 {
5081     flag aSign;
5082     int32 aExp;
5083     uint64_t aSig0, aSig1;
5084
5085     aSig1 = extractFloat128Frac1( a );
5086     aSig0 = extractFloat128Frac0( a );
5087     aExp = extractFloat128Exp( a );
5088     aSign = extractFloat128Sign( a );
5089     if ( aExp == 0x7FFF ) {
5090         if ( aSig0 | aSig1 ) {
5091             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5092         }
5093         return packFloat64( aSign, 0x7FF, 0 );
5094     }
5095     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5096     aSig0 |= ( aSig1 != 0 );
5097     if ( aExp || aSig0 ) {
5098         aSig0 |= LIT64( 0x4000000000000000 );
5099         aExp -= 0x3C01;
5100     }
5101     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5102
5103 }
5104
5105 #ifdef FLOATX80
5106
5107 /*----------------------------------------------------------------------------
5108 | Returns the result of converting the quadruple-precision floating-point
5109 | value `a' to the extended double-precision floating-point format.  The
5110 | conversion is performed according to the IEC/IEEE Standard for Binary
5111 | Floating-Point Arithmetic.
5112 *----------------------------------------------------------------------------*/
5113
5114 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5115 {
5116     flag aSign;
5117     int32 aExp;
5118     uint64_t aSig0, aSig1;
5119
5120     aSig1 = extractFloat128Frac1( a );
5121     aSig0 = extractFloat128Frac0( a );
5122     aExp = extractFloat128Exp( a );
5123     aSign = extractFloat128Sign( a );
5124     if ( aExp == 0x7FFF ) {
5125         if ( aSig0 | aSig1 ) {
5126             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
5127         }
5128         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5129     }
5130     if ( aExp == 0 ) {
5131         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5132         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5133     }
5134     else {
5135         aSig0 |= LIT64( 0x0001000000000000 );
5136     }
5137     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5138     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5139
5140 }
5141
5142 #endif
5143
/*----------------------------------------------------------------------------
| Rounds the quadruple-precision floating-point value `a' to an integer, and
| returns the result as a quadruple-precision floating-point value.  The
| operation is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.  The current rounding mode is honoured, and the
| inexact flag is set whenever the result differs from `a'.
*----------------------------------------------------------------------------*/

float128 float128_round_to_int( float128 a STATUS_PARAM )
{
    flag aSign;
    int32 aExp;
    uint64_t lastBitMask, roundBitsMask;
    int8 roundingMode;
    float128 z;

    aExp = extractFloat128Exp( a );
    if ( 0x402F <= aExp ) {
        /* All fraction bits to be discarded lie in the low word. */
        if ( 0x406F <= aExp ) {
            /* Nothing to round: return `a' itself, or a propagated NaN. */
            if (    ( aExp == 0x7FFF )
                 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
               ) {
                return propagateFloat128NaN( a, a STATUS_VAR );
            }
            return a;
        }
        /* lastBitMask selects the units bit within the low word.  The shift
           is split in two so that aExp == 0x402F (a total shift of 64)
           yields 0 instead of an over-wide shift; 0 means the units bit is
           the lowest bit of the high word. */
        lastBitMask = 1;
        lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
        /* All bits below the units bit (all ones when lastBitMask is 0). */
        roundBitsMask = lastBitMask - 1;
        z = a;
        roundingMode = STATUS(float_rounding_mode);
        if ( roundingMode == float_round_nearest_even ) {
            if ( lastBitMask ) {
                /* Add one half unit; on an exact tie, clear the units bit
                   so the result is even. */
                add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
                if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
            }
            else {
                /* The half bit is the top bit of the low word. */
                if ( (int64_t) z.low < 0 ) {
                    ++z.high;
                    /* Exact tie: force the high word even. */
                    if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
                }
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            /* Directed rounding: bump the magnitude when the sign and the
               rounding direction differ as tested below (negative and not
               rounding up, or positive and rounding up). */
            if (   extractFloat128Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
            }
        }
        /* Discard the fraction bits. */
        z.low &= ~ roundBitsMask;
    }
    else {
        if ( aExp < 0x3FFF ) {
            /* Exponent below 0x3FFF: the result is a signed zero or a value
               packed with exponent 0x3FFF and zero fraction. */
            /* A zero of either sign is returned unchanged. */
            if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
            STATUS(float_exception_flags) |= float_flag_inexact;
            aSign = extractFloat128Sign( a );
            switch ( STATUS(float_rounding_mode) ) {
             case float_round_nearest_even:
                /* Exponent 0x3FFE with a nonzero fraction rounds away from
                   zero; anything else falls through to the signed zero. */
                if (    ( aExp == 0x3FFE )
                     && (   extractFloat128Frac0( a )
                          | extractFloat128Frac1( a ) )
                   ) {
                    return packFloat128( aSign, 0x3FFF, 0, 0 );
                }
                break;
             case float_round_down:
                return
                      aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
                    : packFloat128( 0, 0, 0, 0 );
             case float_round_up:
                return
                      aSign ? packFloat128( 1, 0, 0, 0 )
                    : packFloat128( 0, 0x3FFF, 0, 0 );
            }
            return packFloat128( aSign, 0, 0, 0 );
        }
        /* The units bit lies in the high word; the whole low word is
           fraction and is cleared in the result. */
        lastBitMask = 1;
        lastBitMask <<= 0x402F - aExp;
        roundBitsMask = lastBitMask - 1;
        z.low = 0;
        z.high = a.high;
        roundingMode = STATUS(float_rounding_mode);
        if ( roundingMode == float_round_nearest_even ) {
            z.high += lastBitMask>>1;
            /* Exact tie only if the low word is zero too; then make even. */
            if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
                z.high &= ~ lastBitMask;
            }
        }
        else if ( roundingMode != float_round_to_zero ) {
            if (   extractFloat128Sign( z )
                 ^ ( roundingMode == float_round_up ) ) {
                /* A nonzero low word acts as a sticky bit so the addition
                   below still carries into the units bit. */
                z.high |= ( a.low != 0 );
                z.high += roundBitsMask;
            }
        }
        z.high &= ~ roundBitsMask;
    }
    /* Any change to the encoding means fraction bits were discarded. */
    if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
        STATUS(float_exception_flags) |= float_flag_inexact;
    }
    return z;

}
5246
/*----------------------------------------------------------------------------
| Returns the result of adding the absolute values of the quadruple-precision
| floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
| before being returned.  `zSign' is ignored if the result is a NaN.
| The addition is performed according to the IEC/IEEE Standard for Binary
| Floating-Point Arithmetic.
|
| The operand with the smaller exponent is shifted right to align it with
| the other one; bits shifted out are collected in `zSig2' for rounding.
*----------------------------------------------------------------------------*/

static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
{
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
    int32 expDiff;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    if ( 0 < expDiff ) {
        /* `a' has the larger exponent. */
        if ( aExp == 0x7FFF ) {
            /* `a' is a NaN (propagate) or an infinity (returned as is). */
            if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            return a;
        }
        if ( bExp == 0 ) {
            /* `b' has exponent field 0 and no leading 1; it is aligned one
               position less than the raw exponent difference. */
            --expDiff;
        }
        else {
            bSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
        zExp = aExp;
    }
    else if ( expDiff < 0 ) {
        /* `b' has the larger exponent; mirror image of the case above. */
        if ( bExp == 0x7FFF ) {
            if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
            /* Infinity carrying the requested result sign. */
            return packFloat128( zSign, 0x7FFF, 0, 0 );
        }
        if ( aExp == 0 ) {
            ++expDiff;
        }
        else {
            aSig0 |= LIT64( 0x0001000000000000 );
        }
        shift128ExtraRightJamming(
            aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
        zExp = bExp;
    }
    else {
        /* Equal exponents. */
        if ( aExp == 0x7FFF ) {
            if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
                return propagateFloat128NaN( a, b STATUS_VAR );
            }
            return a;
        }
        add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
        if ( aExp == 0 ) {
            /* Both operands have exponent field 0.  In flush-to-zero mode a
               nonzero sum raises output_denormal and a signed zero is
               returned; otherwise the raw sum is packed with exponent 0. */
            if (STATUS(flush_to_zero)) {
                if (zSig0 | zSig1) {
                    float_raise(float_flag_output_denormal STATUS_VAR);
                }
                return packFloat128(zSign, 0, 0, 0);
            }
            return packFloat128( zSign, 0, zSig0, zSig1 );
        }
        zSig2 = 0;
        /* Neither leading 1 was included in the sum; together they add
           twice the leading-1 weight, i.e. the next bit up. */
        zSig0 |= LIT64( 0x0002000000000000 );
        zExp = aExp;
        goto shiftRight1;
    }
    /* Unequal exponents: the operand that was not shifted still lacks its
       leading 1.  It is OR-ed into aSig0 in both cases. */
    /* NOTE(review): this relies on that bit of aSig0 being clear when `a'
       was the shifted operand (shift count 0 only with exponent field 0,
       where no leading 1 was set) -- confirm. */
    aSig0 |= LIT64( 0x0001000000000000 );
    add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    /* If the sum did not carry past the leading-1 position it is rounded
       with zExp - 1; otherwise zExp is restored and the sum is shifted
       right by one bit first. */
    --zExp;
    if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
    ++zExp;
 shiftRight1:
    shift128ExtraRightJamming(
        zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
 roundAndPack:
    return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );

}
5332
/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the quadruple-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Both significands are first shifted left by 14 bits; the smaller operand is
| then aligned and subtracted from the larger one, and the result sign is
| flipped when `b' has the larger magnitude.
*----------------------------------------------------------------------------*/

static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
{
    int32 aExp, bExp, zExp;
    uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
    int32 expDiff;
    float128 z;

    aSig1 = extractFloat128Frac1( a );
    aSig0 = extractFloat128Frac0( a );
    aExp = extractFloat128Exp( a );
    bSig1 = extractFloat128Frac1( b );
    bSig0 = extractFloat128Frac0( b );
    bExp = extractFloat128Exp( b );
    expDiff = aExp - bExp;
    /* After this shift the leading-1 position is 0x4000000000000000 in the
       high word (see the OR operations below). */
    shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
    shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
    if ( 0 < expDiff ) goto aExpBigger;
    if ( expDiff < 0 ) goto bExpBigger;
    /* Equal exponents from here on. */
    if ( aExp == 0x7FFF ) {
        if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
            return propagateFloat128NaN( a, b STATUS_VAR );
        }
        /* Infinity minus infinity: invalid, result is the default NaN. */
        float_raise( float_flag_invalid STATUS_VAR);
        z.low = float128_default_nan_low;
        z.high = float128_default_nan_high;
        return z;
    }
    if ( aExp == 0 ) {
        /* Both exponent fields are 0: use exponent 1 for the result. */
        aExp = 1;
        bExp = 1;
    }
    /* Compare the significands, high word first, to find the larger one. */
    if ( bSig0 < aSig0 ) goto aBigger;
    if ( aSig0 < bSig0 ) goto bBigger;
    if ( bSig1 < aSig1 ) goto aBigger;
    if ( aSig1 < bSig1 ) goto bBigger;
    /* Exact cancellation: zero, with sign bit set only when rounding down. */
    return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
 bExpBigger:
    if ( bExp == 0x7FFF ) {
        if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        /* `b' is an infinity: result is an infinity with flipped sign. */
        return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
    }
    if ( aExp == 0 ) {
        /* `a' has exponent field 0 and no leading 1; it is aligned one
           position less than the raw exponent difference. */
        ++expDiff;
    }
    else {
        aSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
    bSig0 |= LIT64( 0x4000000000000000 );
 bBigger:
    /* |b| > |a|: compute b - a and flip the result sign. */
    sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
    zExp = bExp;
    zSign ^= 1;
    goto normalizeRoundAndPack;
 aExpBigger:
    if ( aExp == 0x7FFF ) {
        /* `a' is a NaN (propagate) or an infinity (returned as is). */
        if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
        return a;
    }
    if ( bExp == 0 ) {
        --expDiff;
    }
    else {
        bSig0 |= LIT64( 0x4000000000000000 );
    }
    shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
    aSig0 |= LIT64( 0x4000000000000000 );
 aBigger:
    /* |a| > |b|: compute a - b; the result keeps zSign. */
    sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
    zExp = aExp;
 normalizeRoundAndPack:
    --zExp;
    /* NOTE(review): the `- 14' presumably compensates for the 14-bit left
       shift applied to both significands above -- confirm against
       normalizeRoundAndPackFloat128(). */
    return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );

}
5416
5417 /*----------------------------------------------------------------------------
5418 | Returns the result of adding the quadruple-precision floating-point values
5419 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5420 | for Binary Floating-Point Arithmetic.
5421 *----------------------------------------------------------------------------*/
5422
5423 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5424 {
5425     flag aSign, bSign;
5426
5427     aSign = extractFloat128Sign( a );
5428     bSign = extractFloat128Sign( b );
5429     if ( aSign == bSign ) {
5430         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5431     }
5432     else {
5433         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5434     }
5435
5436 }
5437
5438 /*----------------------------------------------------------------------------
5439 | Returns the result of subtracting the quadruple-precision floating-point
5440 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5441 | Standard for Binary Floating-Point Arithmetic.
5442 *----------------------------------------------------------------------------*/
5443
5444 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5445 {
5446     flag aSign, bSign;
5447
5448     aSign = extractFloat128Sign( a );
5449     bSign = extractFloat128Sign( b );
5450     if ( aSign == bSign ) {
5451         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5452     }
5453     else {
5454         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5455     }
5456
5457 }
5458
5459 /*----------------------------------------------------------------------------
5460 | Returns the result of multiplying the quadruple-precision floating-point
5461 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5462 | Standard for Binary Floating-Point Arithmetic.
5463 *----------------------------------------------------------------------------*/
5464
5465 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5466 {
5467     flag aSign, bSign, zSign;
5468     int32 aExp, bExp, zExp;
5469     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5470     float128 z;
5471
5472     aSig1 = extractFloat128Frac1( a );
5473     aSig0 = extractFloat128Frac0( a );
5474     aExp = extractFloat128Exp( a );
5475     aSign = extractFloat128Sign( a );
5476     bSig1 = extractFloat128Frac1( b );
5477     bSig0 = extractFloat128Frac0( b );
5478     bExp = extractFloat128Exp( b );
5479     bSign = extractFloat128Sign( b );
5480     zSign = aSign ^ bSign;
5481     if ( aExp == 0x7FFF ) {
5482         if (    ( aSig0 | aSig1 )
5483              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5484             return propagateFloat128NaN( a, b STATUS_VAR );
5485         }
5486         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5487         return packFloat128( zSign, 0x7FFF, 0, 0 );
5488     }
5489     if ( bExp == 0x7FFF ) {
5490         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5491         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5492  invalid:
5493             float_raise( float_flag_invalid STATUS_VAR);
5494             z.low = float128_default_nan_low;
5495             z.high = float128_default_nan_high;
5496             return z;
5497         }
5498         return packFloat128( zSign, 0x7FFF, 0, 0 );
5499     }
5500     if ( aExp == 0 ) {
5501         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5502         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5503     }
5504     if ( bExp == 0 ) {
5505         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5506         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5507     }
5508     zExp = aExp + bExp - 0x4000;
5509     aSig0 |= LIT64( 0x0001000000000000 );
5510     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5511     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5512     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5513     zSig2 |= ( zSig3 != 0 );
5514     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5515         shift128ExtraRightJamming(
5516             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5517         ++zExp;
5518     }
5519     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5520
5521 }
5522
5523 /*----------------------------------------------------------------------------
5524 | Returns the result of dividing the quadruple-precision floating-point value
5525 | `a' by the corresponding value `b'.  The operation is performed according to
5526 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5527 *----------------------------------------------------------------------------*/
5528
5529 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5530 {
5531     flag aSign, bSign, zSign;
5532     int32 aExp, bExp, zExp;
5533     uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5534     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5535     float128 z;
5536
5537     aSig1 = extractFloat128Frac1( a );
5538     aSig0 = extractFloat128Frac0( a );
5539     aExp = extractFloat128Exp( a );
5540     aSign = extractFloat128Sign( a );
5541     bSig1 = extractFloat128Frac1( b );
5542     bSig0 = extractFloat128Frac0( b );
5543     bExp = extractFloat128Exp( b );
5544     bSign = extractFloat128Sign( b );
5545     zSign = aSign ^ bSign;
5546     if ( aExp == 0x7FFF ) {
5547         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5548         if ( bExp == 0x7FFF ) {
5549             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5550             goto invalid;
5551         }
5552         return packFloat128( zSign, 0x7FFF, 0, 0 );
5553     }
5554     if ( bExp == 0x7FFF ) {
5555         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5556         return packFloat128( zSign, 0, 0, 0 );
5557     }
5558     if ( bExp == 0 ) {
5559         if ( ( bSig0 | bSig1 ) == 0 ) {
5560             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5561  invalid:
5562                 float_raise( float_flag_invalid STATUS_VAR);
5563                 z.low = float128_default_nan_low;
5564                 z.high = float128_default_nan_high;
5565                 return z;
5566             }
5567             float_raise( float_flag_divbyzero STATUS_VAR);
5568             return packFloat128( zSign, 0x7FFF, 0, 0 );
5569         }
5570         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5571     }
5572     if ( aExp == 0 ) {
5573         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5574         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5575     }
5576     zExp = aExp - bExp + 0x3FFD;
5577     shortShift128Left(
5578         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5579     shortShift128Left(
5580         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5581     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5582         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5583         ++zExp;
5584     }
5585     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5586     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5587     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5588     while ( (int64_t) rem0 < 0 ) {
5589         --zSig0;
5590         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5591     }
5592     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5593     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5594         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5595         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5596         while ( (int64_t) rem1 < 0 ) {
5597             --zSig1;
5598             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5599         }
5600         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5601     }
5602     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5603     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5604
5605 }
5606
5607 /*----------------------------------------------------------------------------
5608 | Returns the remainder of the quadruple-precision floating-point value `a'
5609 | with respect to the corresponding value `b'.  The operation is performed
5610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5611 *----------------------------------------------------------------------------*/
5612
5613 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5614 {
5615     flag aSign, zSign;
5616     int32 aExp, bExp, expDiff;
5617     uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5618     uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
5619     int64_t sigMean0;
5620     float128 z;
5621
5622     aSig1 = extractFloat128Frac1( a );
5623     aSig0 = extractFloat128Frac0( a );
5624     aExp = extractFloat128Exp( a );
5625     aSign = extractFloat128Sign( a );
5626     bSig1 = extractFloat128Frac1( b );
5627     bSig0 = extractFloat128Frac0( b );
5628     bExp = extractFloat128Exp( b );
5629     if ( aExp == 0x7FFF ) {
5630         if (    ( aSig0 | aSig1 )
5631              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5632             return propagateFloat128NaN( a, b STATUS_VAR );
5633         }
5634         goto invalid;
5635     }
5636     if ( bExp == 0x7FFF ) {
5637         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5638         return a;
5639     }
5640     if ( bExp == 0 ) {
5641         if ( ( bSig0 | bSig1 ) == 0 ) {
5642  invalid:
5643             float_raise( float_flag_invalid STATUS_VAR);
5644             z.low = float128_default_nan_low;
5645             z.high = float128_default_nan_high;
5646             return z;
5647         }
5648         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5649     }
5650     if ( aExp == 0 ) {
5651         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5652         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5653     }
5654     expDiff = aExp - bExp;
5655     if ( expDiff < -1 ) return a;
5656     shortShift128Left(
5657         aSig0 | LIT64( 0x0001000000000000 ),
5658         aSig1,
5659         15 - ( expDiff < 0 ),
5660         &aSig0,
5661         &aSig1
5662     );
5663     shortShift128Left(
5664         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5665     q = le128( bSig0, bSig1, aSig0, aSig1 );
5666     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5667     expDiff -= 64;
5668     while ( 0 < expDiff ) {
5669         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5670         q = ( 4 < q ) ? q - 4 : 0;
5671         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5672         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5673         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5674         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5675         expDiff -= 61;
5676     }
5677     if ( -64 < expDiff ) {
5678         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5679         q = ( 4 < q ) ? q - 4 : 0;
5680         q >>= - expDiff;
5681         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5682         expDiff += 52;
5683         if ( expDiff < 0 ) {
5684             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5685         }
5686         else {
5687             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5688         }
5689         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5690         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5691     }
5692     else {
5693         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5694         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5695     }
5696     do {
5697         alternateASig0 = aSig0;
5698         alternateASig1 = aSig1;
5699         ++q;
5700         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5701     } while ( 0 <= (int64_t) aSig0 );
5702     add128(
5703         aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
5704     if (    ( sigMean0 < 0 )
5705          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5706         aSig0 = alternateASig0;
5707         aSig1 = alternateASig1;
5708     }
5709     zSign = ( (int64_t) aSig0 < 0 );
5710     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5711     return
5712         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5713
5714 }
5715
5716 /*----------------------------------------------------------------------------
5717 | Returns the square root of the quadruple-precision floating-point value `a'.
5718 | The operation is performed according to the IEC/IEEE Standard for Binary
5719 | Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/
5721
5722 float128 float128_sqrt( float128 a STATUS_PARAM )
5723 {
5724     flag aSign;
5725     int32 aExp, zExp;
5726     uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5727     uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5728     float128 z;
5729
5730     aSig1 = extractFloat128Frac1( a );
5731     aSig0 = extractFloat128Frac0( a );
5732     aExp = extractFloat128Exp( a );
5733     aSign = extractFloat128Sign( a );
5734     if ( aExp == 0x7FFF ) {
5735         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5736         if ( ! aSign ) return a;
5737         goto invalid;
5738     }
5739     if ( aSign ) {
5740         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5741  invalid:
5742         float_raise( float_flag_invalid STATUS_VAR);
5743         z.low = float128_default_nan_low;
5744         z.high = float128_default_nan_high;
5745         return z;
5746     }
5747     if ( aExp == 0 ) {
5748         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5749         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5750     }
5751     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5752     aSig0 |= LIT64( 0x0001000000000000 );
5753     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5754     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5755     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5756     doubleZSig0 = zSig0<<1;
5757     mul64To128( zSig0, zSig0, &term0, &term1 );
5758     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5759     while ( (int64_t) rem0 < 0 ) {
5760         --zSig0;
5761         doubleZSig0 -= 2;
5762         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5763     }
5764     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5765     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5766         if ( zSig1 == 0 ) zSig1 = 1;
5767         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5768         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5769         mul64To128( zSig1, zSig1, &term2, &term3 );
5770         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5771         while ( (int64_t) rem1 < 0 ) {
5772             --zSig1;
5773             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5774             term3 |= 1;
5775             term2 |= doubleZSig0;
5776             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5777         }
5778         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5779     }
5780     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5781     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5782
5783 }
5784
5785 /*----------------------------------------------------------------------------
5786 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5787 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5788 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5789 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5790 *----------------------------------------------------------------------------*/
5791
5792 int float128_eq( float128 a, float128 b STATUS_PARAM )
5793 {
5794
5795     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5796               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5797          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5798               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5799        ) {
5800         float_raise( float_flag_invalid STATUS_VAR);
5801         return 0;
5802     }
5803     return
5804            ( a.low == b.low )
5805         && (    ( a.high == b.high )
5806              || (    ( a.low == 0 )
5807                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5808            );
5809
5810 }
5811
5812 /*----------------------------------------------------------------------------
5813 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5814 | or equal to the corresponding value `b', and 0 otherwise.  The invalid
5815 | exception is raised if either operand is a NaN.  The comparison is performed
5816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5817 *----------------------------------------------------------------------------*/
5818
5819 int float128_le( float128 a, float128 b STATUS_PARAM )
5820 {
5821     flag aSign, bSign;
5822
5823     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5824               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5825          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5826               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5827        ) {
5828         float_raise( float_flag_invalid STATUS_VAR);
5829         return 0;
5830     }
5831     aSign = extractFloat128Sign( a );
5832     bSign = extractFloat128Sign( b );
5833     if ( aSign != bSign ) {
5834         return
5835                aSign
5836             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5837                  == 0 );
5838     }
5839     return
5840           aSign ? le128( b.high, b.low, a.high, a.low )
5841         : le128( a.high, a.low, b.high, b.low );
5842
5843 }
5844
5845 /*----------------------------------------------------------------------------
5846 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5847 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5848 | raised if either operand is a NaN.  The comparison is performed according
5849 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5850 *----------------------------------------------------------------------------*/
5851
5852 int float128_lt( float128 a, float128 b STATUS_PARAM )
5853 {
5854     flag aSign, bSign;
5855
5856     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5857               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5858          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5859               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5860        ) {
5861         float_raise( float_flag_invalid STATUS_VAR);
5862         return 0;
5863     }
5864     aSign = extractFloat128Sign( a );
5865     bSign = extractFloat128Sign( b );
5866     if ( aSign != bSign ) {
5867         return
5868                aSign
5869             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5870                  != 0 );
5871     }
5872     return
5873           aSign ? lt128( b.high, b.low, a.high, a.low )
5874         : lt128( a.high, a.low, b.high, b.low );
5875
5876 }
5877
5878 /*----------------------------------------------------------------------------
5879 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5880 | be compared, and 0 otherwise.  The invalid exception is raised if either
5881 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5882 | Standard for Binary Floating-Point Arithmetic.
5883 *----------------------------------------------------------------------------*/
5884
5885 int float128_unordered( float128 a, float128 b STATUS_PARAM )
5886 {
5887     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5888               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5889          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5890               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5891        ) {
5892         float_raise( float_flag_invalid STATUS_VAR);
5893         return 1;
5894     }
5895     return 0;
5896 }
5897
5898 /*----------------------------------------------------------------------------
5899 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5900 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5901 | exception.  The comparison is performed according to the IEC/IEEE Standard
5902 | for Binary Floating-Point Arithmetic.
5903 *----------------------------------------------------------------------------*/
5904
5905 int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
5906 {
5907
5908     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5909               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5910          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5911               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5912        ) {
5913         if (    float128_is_signaling_nan( a )
5914              || float128_is_signaling_nan( b ) ) {
5915             float_raise( float_flag_invalid STATUS_VAR);
5916         }
5917         return 0;
5918     }
5919     return
5920            ( a.low == b.low )
5921         && (    ( a.high == b.high )
5922              || (    ( a.low == 0 )
5923                   && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5924            );
5925
5926 }
5927
5928 /*----------------------------------------------------------------------------
5929 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5930 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5931 | cause an exception.  Otherwise, the comparison is performed according to the
5932 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5933 *----------------------------------------------------------------------------*/
5934
5935 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5936 {
5937     flag aSign, bSign;
5938
5939     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5940               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5941          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5942               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5943        ) {
5944         if (    float128_is_signaling_nan( a )
5945              || float128_is_signaling_nan( b ) ) {
5946             float_raise( float_flag_invalid STATUS_VAR);
5947         }
5948         return 0;
5949     }
5950     aSign = extractFloat128Sign( a );
5951     bSign = extractFloat128Sign( b );
5952     if ( aSign != bSign ) {
5953         return
5954                aSign
5955             || (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5956                  == 0 );
5957     }
5958     return
5959           aSign ? le128( b.high, b.low, a.high, a.low )
5960         : le128( a.high, a.low, b.high, b.low );
5961
5962 }
5963
5964 /*----------------------------------------------------------------------------
5965 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5966 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5967 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5968 | Standard for Binary Floating-Point Arithmetic.
5969 *----------------------------------------------------------------------------*/
5970
5971 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5972 {
5973     flag aSign, bSign;
5974
5975     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5976               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5977          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5978               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5979        ) {
5980         if (    float128_is_signaling_nan( a )
5981              || float128_is_signaling_nan( b ) ) {
5982             float_raise( float_flag_invalid STATUS_VAR);
5983         }
5984         return 0;
5985     }
5986     aSign = extractFloat128Sign( a );
5987     bSign = extractFloat128Sign( b );
5988     if ( aSign != bSign ) {
5989         return
5990                aSign
5991             && (    ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5992                  != 0 );
5993     }
5994     return
5995           aSign ? lt128( b.high, b.low, a.high, a.low )
5996         : lt128( a.high, a.low, b.high, b.low );
5997
5998 }
5999
6000 /*----------------------------------------------------------------------------
6001 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6002 | be compared, and 0 otherwise.  Quiet NaNs do not cause an exception.  The
6003 | comparison is performed according to the IEC/IEEE Standard for Binary
6004 | Floating-Point Arithmetic.
6005 *----------------------------------------------------------------------------*/
6006
6007 int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6008 {
6009     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
6010               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6011          || (    ( extractFloat128Exp( b ) == 0x7FFF )
6012               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6013        ) {
6014         if (    float128_is_signaling_nan( a )
6015              || float128_is_signaling_nan( b ) ) {
6016             float_raise( float_flag_invalid STATUS_VAR);
6017         }
6018         return 1;
6019     }
6020     return 0;
6021 }
6022
6023 #endif
6024
6025 /* misc functions */
6026 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
6027 {
6028     return int64_to_float32(a STATUS_VAR);
6029 }
6030
6031 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
6032 {
6033     return int64_to_float64(a STATUS_VAR);
6034 }
6035
6036 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
6037 {
6038     int64_t v;
6039     unsigned int res;
6040
6041     v = float32_to_int64(a STATUS_VAR);
6042     if (v < 0) {
6043         res = 0;
6044         float_raise( float_flag_invalid STATUS_VAR);
6045     } else if (v > 0xffffffff) {
6046         res = 0xffffffff;
6047         float_raise( float_flag_invalid STATUS_VAR);
6048     } else {
6049         res = v;
6050     }
6051     return res;
6052 }
6053
6054 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
6055 {
6056     int64_t v;
6057     unsigned int res;
6058
6059     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6060     if (v < 0) {
6061         res = 0;
6062         float_raise( float_flag_invalid STATUS_VAR);
6063     } else if (v > 0xffffffff) {
6064         res = 0xffffffff;
6065         float_raise( float_flag_invalid STATUS_VAR);
6066     } else {
6067         res = v;
6068     }
6069     return res;
6070 }
6071
6072 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
6073 {
6074     int64_t v;
6075     unsigned int res;
6076
6077     v = float32_to_int64_round_to_zero(a STATUS_VAR);
6078     if (v < 0) {
6079         res = 0;
6080         float_raise( float_flag_invalid STATUS_VAR);
6081     } else if (v > 0xffff) {
6082         res = 0xffff;
6083         float_raise( float_flag_invalid STATUS_VAR);
6084     } else {
6085         res = v;
6086     }
6087     return res;
6088 }
6089
6090 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
6091 {
6092     int64_t v;
6093     unsigned int res;
6094
6095     v = float64_to_int64(a STATUS_VAR);
6096     if (v < 0) {
6097         res = 0;
6098         float_raise( float_flag_invalid STATUS_VAR);
6099     } else if (v > 0xffffffff) {
6100         res = 0xffffffff;
6101         float_raise( float_flag_invalid STATUS_VAR);
6102     } else {
6103         res = v;
6104     }
6105     return res;
6106 }
6107
6108 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
6109 {
6110     int64_t v;
6111     unsigned int res;
6112
6113     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6114     if (v < 0) {
6115         res = 0;
6116         float_raise( float_flag_invalid STATUS_VAR);
6117     } else if (v > 0xffffffff) {
6118         res = 0xffffffff;
6119         float_raise( float_flag_invalid STATUS_VAR);
6120     } else {
6121         res = v;
6122     }
6123     return res;
6124 }
6125
6126 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
6127 {
6128     int64_t v;
6129     unsigned int res;
6130
6131     v = float64_to_int64_round_to_zero(a STATUS_VAR);
6132     if (v < 0) {
6133         res = 0;
6134         float_raise( float_flag_invalid STATUS_VAR);
6135     } else if (v > 0xffff) {
6136         res = 0xffff;
6137         float_raise( float_flag_invalid STATUS_VAR);
6138     } else {
6139         res = v;
6140     }
6141     return res;
6142 }
6143
6144 /* FIXME: This looks broken.  */
6145 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
6146 {
6147     int64_t v;
6148
6149     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6150     v += float64_val(a);
6151     v = float64_to_int64(make_float64(v) STATUS_VAR);
6152
6153     return v - INT64_MIN;
6154 }
6155
6156 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
6157 {
6158     int64_t v;
6159
6160     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
6161     v += float64_val(a);
6162     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
6163
6164     return v - INT64_MIN;
6165 }
6166
6167 #define COMPARE(s, nan_exp)                                                  \
6168 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
6169                                       int is_quiet STATUS_PARAM )            \
6170 {                                                                            \
6171     flag aSign, bSign;                                                       \
6172     uint ## s ## _t av, bv;                                                  \
6173     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
6174     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
6175                                                                              \
6176     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
6177          extractFloat ## s ## Frac( a ) ) ||                                 \
6178         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
6179           extractFloat ## s ## Frac( b ) )) {                                \
6180         if (!is_quiet ||                                                     \
6181             float ## s ## _is_signaling_nan( a ) ||                          \
6182             float ## s ## _is_signaling_nan( b ) ) {                         \
6183             float_raise( float_flag_invalid STATUS_VAR);                     \
6184         }                                                                    \
6185         return float_relation_unordered;                                     \
6186     }                                                                        \
6187     aSign = extractFloat ## s ## Sign( a );                                  \
6188     bSign = extractFloat ## s ## Sign( b );                                  \
6189     av = float ## s ## _val(a);                                              \
6190     bv = float ## s ## _val(b);                                              \
6191     if ( aSign != bSign ) {                                                  \
6192         if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) {                   \
6193             /* zero case */                                                  \
6194             return float_relation_equal;                                     \
6195         } else {                                                             \
6196             return 1 - (2 * aSign);                                          \
6197         }                                                                    \
6198     } else {                                                                 \
6199         if (av == bv) {                                                      \
6200             return float_relation_equal;                                     \
6201         } else {                                                             \
6202             return 1 - 2 * (aSign ^ ( av < bv ));                            \
6203         }                                                                    \
6204     }                                                                        \
6205 }                                                                            \
6206                                                                              \
6207 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
6208 {                                                                            \
6209     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
6210 }                                                                            \
6211                                                                              \
6212 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6213 {                                                                            \
6214     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6215 }
6216
6217 COMPARE(32, 0xff)
6218 COMPARE(64, 0x7ff)
6219
6220 INLINE int floatx80_compare_internal( floatx80 a, floatx80 b,
6221                                       int is_quiet STATUS_PARAM )
6222 {
6223     flag aSign, bSign;
6224
6225     if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
6226           ( extractFloatx80Frac( a )<<1 ) ) ||
6227         ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
6228           ( extractFloatx80Frac( b )<<1 ) )) {
6229         if (!is_quiet ||
6230             floatx80_is_signaling_nan( a ) ||
6231             floatx80_is_signaling_nan( b ) ) {
6232             float_raise( float_flag_invalid STATUS_VAR);
6233         }
6234         return float_relation_unordered;
6235     }
6236     aSign = extractFloatx80Sign( a );
6237     bSign = extractFloatx80Sign( b );
6238     if ( aSign != bSign ) {
6239
6240         if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
6241              ( ( a.low | b.low ) == 0 ) ) {
6242             /* zero case */
6243             return float_relation_equal;
6244         } else {
6245             return 1 - (2 * aSign);
6246         }
6247     } else {
6248         if (a.low == b.low && a.high == b.high) {
6249             return float_relation_equal;
6250         } else {
6251             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6252         }
6253     }
6254 }
6255
6256 int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
6257 {
6258     return floatx80_compare_internal(a, b, 0 STATUS_VAR);
6259 }
6260
6261 int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
6262 {
6263     return floatx80_compare_internal(a, b, 1 STATUS_VAR);
6264 }
6265
6266 INLINE int float128_compare_internal( float128 a, float128 b,
6267                                       int is_quiet STATUS_PARAM )
6268 {
6269     flag aSign, bSign;
6270
6271     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6272           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6273         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6274           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6275         if (!is_quiet ||
6276             float128_is_signaling_nan( a ) ||
6277             float128_is_signaling_nan( b ) ) {
6278             float_raise( float_flag_invalid STATUS_VAR);
6279         }
6280         return float_relation_unordered;
6281     }
6282     aSign = extractFloat128Sign( a );
6283     bSign = extractFloat128Sign( b );
6284     if ( aSign != bSign ) {
6285         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6286             /* zero case */
6287             return float_relation_equal;
6288         } else {
6289             return 1 - (2 * aSign);
6290         }
6291     } else {
6292         if (a.low == b.low && a.high == b.high) {
6293             return float_relation_equal;
6294         } else {
6295             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6296         }
6297     }
6298 }
6299
6300 int float128_compare( float128 a, float128 b STATUS_PARAM )
6301 {
6302     return float128_compare_internal(a, b, 0 STATUS_VAR);
6303 }
6304
6305 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6306 {
6307     return float128_compare_internal(a, b, 1 STATUS_VAR);
6308 }
6309
6310 /* min() and max() functions. These can't be implemented as
6311  * 'compare and pick one input' because that would mishandle
6312  * NaNs and +0 vs -0.
6313  */
6314 #define MINMAX(s, nan_exp)                                              \
6315 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b,     \
6316                                         int ismin STATUS_PARAM )        \
6317 {                                                                       \
6318     flag aSign, bSign;                                                  \
6319     uint ## s ## _t av, bv;                                             \
6320     a = float ## s ## _squash_input_denormal(a STATUS_VAR);             \
6321     b = float ## s ## _squash_input_denormal(b STATUS_VAR);             \
6322     if (float ## s ## _is_any_nan(a) ||                                 \
6323         float ## s ## _is_any_nan(b)) {                                 \
6324         return propagateFloat ## s ## NaN(a, b STATUS_VAR);             \
6325     }                                                                   \
6326     aSign = extractFloat ## s ## Sign(a);                               \
6327     bSign = extractFloat ## s ## Sign(b);                               \
6328     av = float ## s ## _val(a);                                         \
6329     bv = float ## s ## _val(b);                                         \
6330     if (aSign != bSign) {                                               \
6331         if (ismin) {                                                    \
6332             return aSign ? a : b;                                       \
6333         } else {                                                        \
6334             return aSign ? b : a;                                       \
6335         }                                                               \
6336     } else {                                                            \
6337         if (ismin) {                                                    \
6338             return (aSign ^ (av < bv)) ? a : b;                         \
6339         } else {                                                        \
6340             return (aSign ^ (av < bv)) ? b : a;                         \
6341         }                                                               \
6342     }                                                                   \
6343 }                                                                       \
6344                                                                         \
6345 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM)  \
6346 {                                                                       \
6347     return float ## s ## _minmax(a, b, 1 STATUS_VAR);                   \
6348 }                                                                       \
6349                                                                         \
6350 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM)  \
6351 {                                                                       \
6352     return float ## s ## _minmax(a, b, 0 STATUS_VAR);                   \
6353 }
6354
6355 MINMAX(32, 0xff)
6356 MINMAX(64, 0x7ff)
6357
6358
6359 /* Multiply A by 2 raised to the power N.  */
6360 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6361 {
6362     flag aSign;
6363     int16_t aExp;
6364     uint32_t aSig;
6365
6366     a = float32_squash_input_denormal(a STATUS_VAR);
6367     aSig = extractFloat32Frac( a );
6368     aExp = extractFloat32Exp( a );
6369     aSign = extractFloat32Sign( a );
6370
6371     if ( aExp == 0xFF ) {
6372         if ( aSig ) {
6373             return propagateFloat32NaN( a, a STATUS_VAR );
6374         }
6375         return a;
6376     }
6377     if ( aExp != 0 )
6378         aSig |= 0x00800000;
6379     else if ( aSig == 0 )
6380         return a;
6381
6382     if (n > 0x200) {
6383         n = 0x200;
6384     } else if (n < -0x200) {
6385         n = -0x200;
6386     }
6387
6388     aExp += n - 1;
6389     aSig <<= 7;
6390     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6391 }
6392
6393 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6394 {
6395     flag aSign;
6396     int16_t aExp;
6397     uint64_t aSig;
6398
6399     a = float64_squash_input_denormal(a STATUS_VAR);
6400     aSig = extractFloat64Frac( a );
6401     aExp = extractFloat64Exp( a );
6402     aSign = extractFloat64Sign( a );
6403
6404     if ( aExp == 0x7FF ) {
6405         if ( aSig ) {
6406             return propagateFloat64NaN( a, a STATUS_VAR );
6407         }
6408         return a;
6409     }
6410     if ( aExp != 0 )
6411         aSig |= LIT64( 0x0010000000000000 );
6412     else if ( aSig == 0 )
6413         return a;
6414
6415     if (n > 0x1000) {
6416         n = 0x1000;
6417     } else if (n < -0x1000) {
6418         n = -0x1000;
6419     }
6420
6421     aExp += n - 1;
6422     aSig <<= 10;
6423     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6424 }
6425
6426 #ifdef FLOATX80
6427 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6428 {
6429     flag aSign;
6430     int32_t aExp;
6431     uint64_t aSig;
6432
6433     aSig = extractFloatx80Frac( a );
6434     aExp = extractFloatx80Exp( a );
6435     aSign = extractFloatx80Sign( a );
6436
6437     if ( aExp == 0x7FFF ) {
6438         if ( aSig<<1 ) {
6439             return propagateFloatx80NaN( a, a STATUS_VAR );
6440         }
6441         return a;
6442     }
6443
6444     if (aExp == 0 && aSig == 0)
6445         return a;
6446
6447     if (n > 0x10000) {
6448         n = 0x10000;
6449     } else if (n < -0x10000) {
6450         n = -0x10000;
6451     }
6452
6453     aExp += n;
6454     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6455                                           aSign, aExp, aSig, 0 STATUS_VAR );
6456 }
6457 #endif
6458
6459 #ifdef FLOAT128
6460 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6461 {
6462     flag aSign;
6463     int32_t aExp;
6464     uint64_t aSig0, aSig1;
6465
6466     aSig1 = extractFloat128Frac1( a );
6467     aSig0 = extractFloat128Frac0( a );
6468     aExp = extractFloat128Exp( a );
6469     aSign = extractFloat128Sign( a );
6470     if ( aExp == 0x7FFF ) {
6471         if ( aSig0 | aSig1 ) {
6472             return propagateFloat128NaN( a, a STATUS_VAR );
6473         }
6474         return a;
6475     }
6476     if ( aExp != 0 )
6477         aSig0 |= LIT64( 0x0001000000000000 );
6478     else if ( aSig0 == 0 && aSig1 == 0 )
6479         return a;
6480
6481     if (n > 0x10000) {
6482         n = 0x10000;
6483     } else if (n < -0x10000) {
6484         n = -0x10000;
6485     }
6486
6487     aExp += n - 1;
6488     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6489                                           STATUS_VAR );
6490
6491 }
6492 #endif