fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "softfloat.h"
  34
  35 /*----------------------------------------------------------------------------
  36 | Primitive arithmetic functions, including multi-word arithmetic, and
  37 | division and square root approximations.  (Can be specialized to target if
  38 | desired.)
  39 *----------------------------------------------------------------------------*/
  40 #include "softfloat-macros.h"
  41
  42 /*----------------------------------------------------------------------------
  43 | Functions and definitions to determine:  (1) whether tininess for underflow
  44 | is detected before or after rounding by default, (2) what (if anything)
  45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 | are propagated from function inputs to output.  These details are target-
  48 | specific.
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-specialize.h"
  51
  52 void set_float_rounding_mode(int val STATUS_PARAM)
  53 {
  54     STATUS(float_rounding_mode) = val;
  55 }
  56
  57 void set_float_exception_flags(int val STATUS_PARAM)
  58 {
  59     STATUS(float_exception_flags) = val;
  60 }
  61
  62 #ifdef FLOATX80
  63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  64 {
  65     STATUS(floatx80_rounding_precision) = val;
  66 }
  67 #endif
  68
  69 /*----------------------------------------------------------------------------
  70 | Returns the fraction bits of the half-precision floating-point value `a'.
  71 *----------------------------------------------------------------------------*/
  72
  73 INLINE uint32_t extractFloat16Frac(float16 a)
  74 {
  75     return float16_val(a) & 0x3ff;
  76 }
  77
  78 /*----------------------------------------------------------------------------
  79 | Returns the exponent bits of the half-precision floating-point value `a'.
  80 *----------------------------------------------------------------------------*/
  81
  82 INLINE int16 extractFloat16Exp(float16 a)
  83 {
  84     return (float16_val(a) >> 10) & 0x1f;
  85 }
  86
  87 /*----------------------------------------------------------------------------
  88 | Returns the sign bit of the single-precision floating-point value `a'.
  89 *----------------------------------------------------------------------------*/
  90
  91 INLINE flag extractFloat16Sign(float16 a)
  92 {
  93     return float16_val(a)>>15;
  94 }
  95
  96 /*----------------------------------------------------------------------------
  97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  98 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  99 | input.  If `zSign' is 1, the input is negated before being converted to an
 100 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
 101 | is simply rounded to an integer, with the inexact exception raised if the
 102 | input cannot be represented exactly as an integer.  However, if the fixed-
 103 | point input is too large, the invalid exception is raised and the largest
 104 | positive or negative integer is returned.
 105 *----------------------------------------------------------------------------*/
 106
 107 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
 108 {
 109     int8 roundingMode;
 110     flag roundNearestEven;
 111     int8 roundIncrement, roundBits;
 112     int32 z;
 113
 114     roundingMode = STATUS(float_rounding_mode);
 115     roundNearestEven = ( roundingMode == float_round_nearest_even );
 116     roundIncrement = 0x40;
 117     if ( ! roundNearestEven ) {
 118         if ( roundingMode == float_round_to_zero ) {
 119             roundIncrement = 0;
 120         }
 121         else {
 122             roundIncrement = 0x7F;
 123             if ( zSign ) {
 124                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 125             }
 126             else {
 127                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 128             }
 129         }
 130     }
 131     roundBits = absZ & 0x7F;
 132     absZ = ( absZ + roundIncrement )>>7;
 133     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 134     z = absZ;
 135     if ( zSign ) z = - z;
 136     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 137         float_raise( float_flag_invalid STATUS_VAR);
 138         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 139     }
 140     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 141     return z;
 142
 143 }
 144
 145 /*----------------------------------------------------------------------------
 146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 147 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 148 | and returns the properly rounded 64-bit integer corresponding to the input.
 149 | If `zSign' is 1, the input is negated before being converted to an integer.
 150 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 151 | the inexact exception raised if the input cannot be represented exactly as
 152 | an integer.  However, if the fixed-point input is too large, the invalid
 153 | exception is raised and the largest positive or negative integer is
 154 | returned.
 155 *----------------------------------------------------------------------------*/
 156
 157 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 158 {
 159     int8 roundingMode;
 160     flag roundNearestEven, increment;
 161     int64 z;
 162
 163     roundingMode = STATUS(float_rounding_mode);
 164     roundNearestEven = ( roundingMode == float_round_nearest_even );
 165     increment = ( (sbits64) absZ1 < 0 );
 166     if ( ! roundNearestEven ) {
 167         if ( roundingMode == float_round_to_zero ) {
 168             increment = 0;
 169         }
 170         else {
 171             if ( zSign ) {
 172                 increment = ( roundingMode == float_round_down ) && absZ1;
 173             }
 174             else {
 175                 increment = ( roundingMode == float_round_up ) && absZ1;
 176             }
 177         }
 178     }
 179     if ( increment ) {
 180         ++absZ0;
 181         if ( absZ0 == 0 ) goto overflow;
 182         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 183     }
 184     z = absZ0;
 185     if ( zSign ) z = - z;
 186     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 187  overflow:
 188         float_raise( float_flag_invalid STATUS_VAR);
 189         return
 190               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 191             : LIT64( 0x7FFFFFFFFFFFFFFF );
 192     }
 193     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 194     return z;
 195
 196 }
 197
 198 /*----------------------------------------------------------------------------
 199 | Returns the fraction bits of the single-precision floating-point value `a'.
 200 *----------------------------------------------------------------------------*/
 201
 202 INLINE bits32 extractFloat32Frac( float32 a )
 203 {
 204
 205     return float32_val(a) & 0x007FFFFF;
 206
 207 }
 208
 209 /*----------------------------------------------------------------------------
 210 | Returns the exponent bits of the single-precision floating-point value `a'.
 211 *----------------------------------------------------------------------------*/
 212
 213 INLINE int16 extractFloat32Exp( float32 a )
 214 {
 215
 216     return ( float32_val(a)>>23 ) & 0xFF;
 217
 218 }
 219
 220 /*----------------------------------------------------------------------------
 221 | Returns the sign bit of the single-precision floating-point value `a'.
 222 *----------------------------------------------------------------------------*/
 223
 224 INLINE flag extractFloat32Sign( float32 a )
 225 {
 226
 227     return float32_val(a)>>31;
 228
 229 }
 230
 231 /*----------------------------------------------------------------------------
 232 | If `a' is denormal and we are in flush-to-zero mode then set the
 233 | input-denormal exception and return zero. Otherwise just return the value.
 234 *----------------------------------------------------------------------------*/
 235 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 236 {
 237     if (STATUS(flush_inputs_to_zero)) {
 238         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 239             float_raise(float_flag_input_denormal STATUS_VAR);
 240             return make_float32(float32_val(a) & 0x80000000);
 241         }
 242     }
 243     return a;
 244 }
 245
 246 /*----------------------------------------------------------------------------
 247 | Normalizes the subnormal single-precision floating-point value represented
 248 | by the denormalized significand `aSig'.  The normalized exponent and
 249 | significand are stored at the locations pointed to by `zExpPtr' and
 250 | `zSigPtr', respectively.
 251 *----------------------------------------------------------------------------*/
 252
 253 static void
 254  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 255 {
 256     int8 shiftCount;
 257
 258     shiftCount = countLeadingZeros32( aSig ) - 8;
 259     *zSigPtr = aSig<<shiftCount;
 260     *zExpPtr = 1 - shiftCount;
 261
 262 }
 263
 264 /*----------------------------------------------------------------------------
 265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 266 | single-precision floating-point value, returning the result.  After being
 267 | shifted into the proper positions, the three fields are simply added
 268 | together to form the result.  This means that any integer portion of `zSig'
 269 | will be added into the exponent.  Since a properly normalized significand
 270 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 271 | than the desired result exponent whenever `zSig' is a complete, normalized
 272 | significand.
 273 *----------------------------------------------------------------------------*/
 274
 275 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 276 {
 277
 278     return make_float32(
 279           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 280
 281 }
 282
 283 /*----------------------------------------------------------------------------
 284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 285 | and significand `zSig', and returns the proper single-precision floating-
 286 | point value corresponding to the abstract input.  Ordinarily, the abstract
 287 | value is simply rounded and packed into the single-precision format, with
 288 | the inexact exception raised if the abstract input cannot be represented
 289 | exactly.  However, if the abstract value is too large, the overflow and
 290 | inexact exceptions are raised and an infinity or maximal finite value is
 291 | returned.  If the abstract value is too small, the input value is rounded to
 292 | a subnormal number, and the underflow and inexact exceptions are raised if
 293 | the abstract input cannot be represented exactly as a subnormal single-
 294 | precision floating-point number.
 295 |     The input significand `zSig' has its binary point between bits 30
 296 | and 29, which is 7 bits to the left of the usual location.  This shifted
 297 | significand must be normalized or smaller.  If `zSig' is not normalized,
 298 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 299 | and it must not require rounding.  In the usual case that `zSig' is
 300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 301 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 302 | Binary Floating-Point Arithmetic.
 303 *----------------------------------------------------------------------------*/
 304
 305 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 306 {
 307     int8 roundingMode;
 308     flag roundNearestEven;
 309     int8 roundIncrement, roundBits;
 310     flag isTiny;
 311
 312     roundingMode = STATUS(float_rounding_mode);
 313     roundNearestEven = ( roundingMode == float_round_nearest_even );
 314     roundIncrement = 0x40;
 315     if ( ! roundNearestEven ) {
 316         if ( roundingMode == float_round_to_zero ) {
 317             roundIncrement = 0;
 318         }
 319         else {
 320             roundIncrement = 0x7F;
 321             if ( zSign ) {
 322                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 323             }
 324             else {
 325                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 326             }
 327         }
 328     }
 329     roundBits = zSig & 0x7F;
 330     if ( 0xFD <= (bits16) zExp ) {
 331         if (    ( 0xFD < zExp )
 332              || (    ( zExp == 0xFD )
 333                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 334            ) {
 335             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 336             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 337         }
 338         if ( zExp < 0 ) {
 339             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 340             isTiny =
 341                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 342                 || ( zExp < -1 )
 343                 || ( zSig + roundIncrement < 0x80000000 );
 344             shift32RightJamming( zSig, - zExp, &zSig );
 345             zExp = 0;
 346             roundBits = zSig & 0x7F;
 347             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 348         }
 349     }
 350     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 351     zSig = ( zSig + roundIncrement )>>7;
 352     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 353     if ( zSig == 0 ) zExp = 0;
 354     return packFloat32( zSign, zExp, zSig );
 355
 356 }
 357
 358 /*----------------------------------------------------------------------------
 359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 360 | and significand `zSig', and returns the proper single-precision floating-
 361 | point value corresponding to the abstract input.  This routine is just like
 362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 364 | floating-point exponent.
 365 *----------------------------------------------------------------------------*/
 366
 367 static float32
 368  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 369 {
 370     int8 shiftCount;
 371
 372     shiftCount = countLeadingZeros32( zSig ) - 1;
 373     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 374
 375 }
 376
 377 /*----------------------------------------------------------------------------
 378 | Returns the fraction bits of the double-precision floating-point value `a'.
 379 *----------------------------------------------------------------------------*/
 380
 381 INLINE bits64 extractFloat64Frac( float64 a )
 382 {
 383
 384     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 385
 386 }
 387
 388 /*----------------------------------------------------------------------------
 389 | Returns the exponent bits of the double-precision floating-point value `a'.
 390 *----------------------------------------------------------------------------*/
 391
 392 INLINE int16 extractFloat64Exp( float64 a )
 393 {
 394
 395     return ( float64_val(a)>>52 ) & 0x7FF;
 396
 397 }
 398
 399 /*----------------------------------------------------------------------------
 400 | Returns the sign bit of the double-precision floating-point value `a'.
 401 *----------------------------------------------------------------------------*/
 402
 403 INLINE flag extractFloat64Sign( float64 a )
 404 {
 405
 406     return float64_val(a)>>63;
 407
 408 }
 409
 410 /*----------------------------------------------------------------------------
 411 | If `a' is denormal and we are in flush-to-zero mode then set the
 412 | input-denormal exception and return zero. Otherwise just return the value.
 413 *----------------------------------------------------------------------------*/
 414 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 415 {
 416     if (STATUS(flush_inputs_to_zero)) {
 417         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 418             float_raise(float_flag_input_denormal STATUS_VAR);
 419             return make_float64(float64_val(a) & (1ULL << 63));
 420         }
 421     }
 422     return a;
 423 }
 424
 425 /*----------------------------------------------------------------------------
 426 | Normalizes the subnormal double-precision floating-point value represented
 427 | by the denormalized significand `aSig'.  The normalized exponent and
 428 | significand are stored at the locations pointed to by `zExpPtr' and
 429 | `zSigPtr', respectively.
 430 *----------------------------------------------------------------------------*/
 431
 432 static void
 433  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 434 {
 435     int8 shiftCount;
 436
 437     shiftCount = countLeadingZeros64( aSig ) - 11;
 438     *zSigPtr = aSig<<shiftCount;
 439     *zExpPtr = 1 - shiftCount;
 440
 441 }
 442
 443 /*----------------------------------------------------------------------------
 444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 445 | double-precision floating-point value, returning the result.  After being
 446 | shifted into the proper positions, the three fields are simply added
 447 | together to form the result.  This means that any integer portion of `zSig'
 448 | will be added into the exponent.  Since a properly normalized significand
 449 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 450 | than the desired result exponent whenever `zSig' is a complete, normalized
 451 | significand.
 452 *----------------------------------------------------------------------------*/
 453
 454 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 455 {
 456
 457     return make_float64(
 458         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 459
 460 }
 461
 462 /*----------------------------------------------------------------------------
 463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 464 | and significand `zSig', and returns the proper double-precision floating-
 465 | point value corresponding to the abstract input.  Ordinarily, the abstract
 466 | value is simply rounded and packed into the double-precision format, with
 467 | the inexact exception raised if the abstract input cannot be represented
 468 | exactly.  However, if the abstract value is too large, the overflow and
 469 | inexact exceptions are raised and an infinity or maximal finite value is
 470 | returned.  If the abstract value is too small, the input value is rounded
 471 | to a subnormal number, and the underflow and inexact exceptions are raised
 472 | if the abstract input cannot be represented exactly as a subnormal double-
 473 | precision floating-point number.
 474 |     The input significand `zSig' has its binary point between bits 62
 475 | and 61, which is 10 bits to the left of the usual location.  This shifted
 476 | significand must be normalized or smaller.  If `zSig' is not normalized,
 477 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 478 | and it must not require rounding.  In the usual case that `zSig' is
 479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 480 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 481 | Binary Floating-Point Arithmetic.
 482 *----------------------------------------------------------------------------*/
 483
 484 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 485 {
 486     int8 roundingMode;
 487     flag roundNearestEven;
 488     int16 roundIncrement, roundBits;
 489     flag isTiny;
 490
 491     roundingMode = STATUS(float_rounding_mode);
 492     roundNearestEven = ( roundingMode == float_round_nearest_even );
 493     roundIncrement = 0x200;
 494     if ( ! roundNearestEven ) {
 495         if ( roundingMode == float_round_to_zero ) {
 496             roundIncrement = 0;
 497         }
 498         else {
 499             roundIncrement = 0x3FF;
 500             if ( zSign ) {
 501                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 502             }
 503             else {
 504                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 505             }
 506         }
 507     }
 508     roundBits = zSig & 0x3FF;
 509     if ( 0x7FD <= (bits16) zExp ) {
 510         if (    ( 0x7FD < zExp )
 511              || (    ( zExp == 0x7FD )
 512                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 513            ) {
 514             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 515             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 516         }
 517         if ( zExp < 0 ) {
 518             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 519             isTiny =
 520                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 521                 || ( zExp < -1 )
 522                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 523             shift64RightJamming( zSig, - zExp, &zSig );
 524             zExp = 0;
 525             roundBits = zSig & 0x3FF;
 526             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 527         }
 528     }
 529     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 530     zSig = ( zSig + roundIncrement )>>10;
 531     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 532     if ( zSig == 0 ) zExp = 0;
 533     return packFloat64( zSign, zExp, zSig );
 534
 535 }
 536
 537 /*----------------------------------------------------------------------------
 538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 539 | and significand `zSig', and returns the proper double-precision floating-
 540 | point value corresponding to the abstract input.  This routine is just like
 541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 543 | floating-point exponent.
 544 *----------------------------------------------------------------------------*/
 545
 546 static float64
 547  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 548 {
 549     int8 shiftCount;
 550
 551     shiftCount = countLeadingZeros64( zSig ) - 1;
 552     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 553
 554 }
 555
 556 #ifdef FLOATX80
 557
 558 /*----------------------------------------------------------------------------
 559 | Returns the fraction bits of the extended double-precision floating-point
 560 | value `a'.
 561 *----------------------------------------------------------------------------*/
 562
 563 INLINE bits64 extractFloatx80Frac( floatx80 a )
 564 {
 565
 566     return a.low;
 567
 568 }
 569
 570 /*----------------------------------------------------------------------------
 571 | Returns the exponent bits of the extended double-precision floating-point
 572 | value `a'.
 573 *----------------------------------------------------------------------------*/
 574
 575 INLINE int32 extractFloatx80Exp( floatx80 a )
 576 {
 577
 578     return a.high & 0x7FFF;
 579
 580 }
 581
 582 /*----------------------------------------------------------------------------
 583 | Returns the sign bit of the extended double-precision floating-point value
 584 | `a'.
 585 *----------------------------------------------------------------------------*/
 586
 587 INLINE flag extractFloatx80Sign( floatx80 a )
 588 {
 589
 590     return a.high>>15;
 591
 592 }
 593
 594 /*----------------------------------------------------------------------------
 595 | Normalizes the subnormal extended double-precision floating-point value
 596 | represented by the denormalized significand `aSig'.  The normalized exponent
 597 | and significand are stored at the locations pointed to by `zExpPtr' and
 598 | `zSigPtr', respectively.
 599 *----------------------------------------------------------------------------*/
 600
 601 static void
 602  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 603 {
 604     int8 shiftCount;
 605
 606     shiftCount = countLeadingZeros64( aSig );
 607     *zSigPtr = aSig<<shiftCount;
 608     *zExpPtr = 1 - shiftCount;
 609
 610 }
 611
 612 /*----------------------------------------------------------------------------
 613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 614 | extended double-precision floating-point value, returning the result.
 615 *----------------------------------------------------------------------------*/
 616
 617 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 618 {
 619     floatx80 z;
 620
 621     z.low = zSig;
 622     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 623     return z;
 624
 625 }
 626
 627 /*----------------------------------------------------------------------------
 628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 629 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 630 | and returns the proper extended double-precision floating-point value
 631 | corresponding to the abstract input.  Ordinarily, the abstract value is
 632 | rounded and packed into the extended double-precision format, with the
 633 | inexact exception raised if the abstract input cannot be represented
 634 | exactly.  However, if the abstract value is too large, the overflow and
 635 | inexact exceptions are raised and an infinity or maximal finite value is
 636 | returned.  If the abstract value is too small, the input value is rounded to
 637 | a subnormal number, and the underflow and inexact exceptions are raised if
 638 | the abstract input cannot be represented exactly as a subnormal extended
 639 | double-precision floating-point number.
 640 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 641 | number of bits as single or double precision, respectively.  Otherwise, the
 642 | result is rounded to the full precision of the extended double-precision
 643 | format.
 644 |     The input significand must be normalized or smaller.  If the input
 645 | significand is not normalized, `zExp' must be 0; in that case, the result
 646 | returned is a subnormal number, and it must not require rounding.  The
 647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 648 | Floating-Point Arithmetic.
 649 *----------------------------------------------------------------------------*/
 650
 651 static floatx80
 652  roundAndPackFloatx80(
 653      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 654  STATUS_PARAM)
 655 {
 656     int8 roundingMode;
 657     flag roundNearestEven, increment, isTiny;
 658     int64 roundIncrement, roundMask, roundBits;
 659
 660     roundingMode = STATUS(float_rounding_mode);
 661     roundNearestEven = ( roundingMode == float_round_nearest_even );
 662     if ( roundingPrecision == 80 ) goto precision80;
 663     if ( roundingPrecision == 64 ) {
 664         roundIncrement = LIT64( 0x0000000000000400 );
 665         roundMask = LIT64( 0x00000000000007FF );
 666     }
 667     else if ( roundingPrecision == 32 ) {
 668         roundIncrement = LIT64( 0x0000008000000000 );
 669         roundMask = LIT64( 0x000000FFFFFFFFFF );
 670     }
 671     else {
 672         goto precision80;
 673     }
 674     zSig0 |= ( zSig1 != 0 );
 675     if ( ! roundNearestEven ) {
 676         if ( roundingMode == float_round_to_zero ) {
 677             roundIncrement = 0;
 678         }
 679         else {
 680             roundIncrement = roundMask;
 681             if ( zSign ) {
 682                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 683             }
 684             else {
 685                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 686             }
 687         }
 688     }
 689     roundBits = zSig0 & roundMask;
 690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 691         if (    ( 0x7FFE < zExp )
 692              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 693            ) {
 694             goto overflow;
 695         }
 696         if ( zExp <= 0 ) {
 697             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 698             isTiny =
 699                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 700                 || ( zExp < 0 )
 701                 || ( zSig0 <= zSig0 + roundIncrement );
 702             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 703             zExp = 0;
 704             roundBits = zSig0 & roundMask;
 705             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 706             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 707             zSig0 += roundIncrement;
 708             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 709             roundIncrement = roundMask + 1;
 710             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 711                 roundMask |= roundIncrement;
 712             }
 713             zSig0 &= ~ roundMask;
 714             return packFloatx80( zSign, zExp, zSig0 );
 715         }
 716     }
 717     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 718     zSig0 += roundIncrement;
 719     if ( zSig0 < roundIncrement ) {
 720         ++zExp;
 721         zSig0 = LIT64( 0x8000000000000000 );
 722     }
 723     roundIncrement = roundMask + 1;
 724     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 725         roundMask |= roundIncrement;
 726     }
 727     zSig0 &= ~ roundMask;
 728     if ( zSig0 == 0 ) zExp = 0;
 729     return packFloatx80( zSign, zExp, zSig0 );
 730  precision80:
 731     increment = ( (sbits64) zSig1 < 0 );
 732     if ( ! roundNearestEven ) {
 733         if ( roundingMode == float_round_to_zero ) {
 734             increment = 0;
 735         }
 736         else {
 737             if ( zSign ) {
 738                 increment = ( roundingMode == float_round_down ) && zSig1;
 739             }
 740             else {
 741                 increment = ( roundingMode == float_round_up ) && zSig1;
 742             }
 743         }
 744     }
 745     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 746         if (    ( 0x7FFE < zExp )
 747              || (    ( zExp == 0x7FFE )
 748                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 749                   && increment
 750                 )
 751            ) {
 752             roundMask = 0;
 753  overflow:
 754             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 755             if (    ( roundingMode == float_round_to_zero )
 756                  || ( zSign && ( roundingMode == float_round_up ) )
 757                  || ( ! zSign && ( roundingMode == float_round_down ) )
 758                ) {
 759                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 760             }
 761             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 762         }
 763         if ( zExp <= 0 ) {
 764             isTiny =
 765                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 766                 || ( zExp < 0 )
 767                 || ! increment
 768                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 769             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 770             zExp = 0;
 771             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 772             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 773             if ( roundNearestEven ) {
 774                 increment = ( (sbits64) zSig1 < 0 );
 775             }
 776             else {
 777                 if ( zSign ) {
 778                     increment = ( roundingMode == float_round_down ) && zSig1;
 779                 }
 780                 else {
 781                     increment = ( roundingMode == float_round_up ) && zSig1;
 782                 }
 783             }
 784             if ( increment ) {
 785                 ++zSig0;
 786                 zSig0 &=
 787                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 788                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 789             }
 790             return packFloatx80( zSign, zExp, zSig0 );
 791         }
 792     }
 793     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 794     if ( increment ) {
 795         ++zSig0;
 796         if ( zSig0 == 0 ) {
 797             ++zExp;
 798             zSig0 = LIT64( 0x8000000000000000 );
 799         }
 800         else {
 801             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 802         }
 803     }
 804     else {
 805         if ( zSig0 == 0 ) zExp = 0;
 806     }
 807     return packFloatx80( zSign, zExp, zSig0 );
 808
 809 }
 810
 811 /*----------------------------------------------------------------------------
 812 | Takes an abstract floating-point value having sign `zSign', exponent
 813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 814 | and returns the proper extended double-precision floating-point value
 815 | corresponding to the abstract input.  This routine is just like
 816 | `roundAndPackFloatx80' except that the input significand does not have to be
 817 | normalized.
 818 *----------------------------------------------------------------------------*/
 819
 820 static floatx80
 821  normalizeRoundAndPackFloatx80(
 822      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 823  STATUS_PARAM)
 824 {
 825     int8 shiftCount;
 826
 827     if ( zSig0 == 0 ) {
 828         zSig0 = zSig1;
 829         zSig1 = 0;
 830         zExp -= 64;
 831     }
 832     shiftCount = countLeadingZeros64( zSig0 );
 833     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 834     zExp -= shiftCount;
 835     return
 836         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 837
 838 }
 839
 840 #endif
 841
 842 #ifdef FLOAT128
 843
 844 /*----------------------------------------------------------------------------
 845 | Returns the least-significant 64 fraction bits of the quadruple-precision
 846 | floating-point value `a'.
 847 *----------------------------------------------------------------------------*/
 848
 849 INLINE bits64 extractFloat128Frac1( float128 a )
 850 {
 851
 852     return a.low;
 853
 854 }
 855
 856 /*----------------------------------------------------------------------------
 857 | Returns the most-significant 48 fraction bits of the quadruple-precision
 858 | floating-point value `a'.
 859 *----------------------------------------------------------------------------*/
 860
 861 INLINE bits64 extractFloat128Frac0( float128 a )
 862 {
 863
 864     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 865
 866 }
 867
 868 /*----------------------------------------------------------------------------
 869 | Returns the exponent bits of the quadruple-precision floating-point value
 870 | `a'.
 871 *----------------------------------------------------------------------------*/
 872
 873 INLINE int32 extractFloat128Exp( float128 a )
 874 {
 875
 876     return ( a.high>>48 ) & 0x7FFF;
 877
 878 }
 879
 880 /*----------------------------------------------------------------------------
 881 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 882 *----------------------------------------------------------------------------*/
 883
 884 INLINE flag extractFloat128Sign( float128 a )
 885 {
 886
 887     return a.high>>63;
 888
 889 }
 890
 891 /*----------------------------------------------------------------------------
 892 | Normalizes the subnormal quadruple-precision floating-point value
 893 | represented by the denormalized significand formed by the concatenation of
 894 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 895 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 896 | significand are stored at the location pointed to by `zSig0Ptr', and the
 897 | least significant 64 bits of the normalized significand are stored at the
 898 | location pointed to by `zSig1Ptr'.
 899 *----------------------------------------------------------------------------*/
 900
 901 static void
 902  normalizeFloat128Subnormal(
 903      bits64 aSig0,
 904      bits64 aSig1,
 905      int32 *zExpPtr,
 906      bits64 *zSig0Ptr,
 907      bits64 *zSig1Ptr
 908  )
 909 {
 910     int8 shiftCount;
 911
 912     if ( aSig0 == 0 ) {
 913         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 914         if ( shiftCount < 0 ) {
 915             *zSig0Ptr = aSig1>>( - shiftCount );
 916             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 917         }
 918         else {
 919             *zSig0Ptr = aSig1<<shiftCount;
 920             *zSig1Ptr = 0;
 921         }
 922         *zExpPtr = - shiftCount - 63;
 923     }
 924     else {
 925         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 926         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 927         *zExpPtr = 1 - shiftCount;
 928     }
 929
 930 }
 931
 932 /*----------------------------------------------------------------------------
 933 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 935 | floating-point value, returning the result.  After being shifted into the
 936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 937 | added together to form the most significant 32 bits of the result.  This
 938 | means that any integer portion of `zSig0' will be added into the exponent.
 939 | Since a properly normalized significand will have an integer portion equal
 940 | to 1, the `zExp' input should be 1 less than the desired result exponent
 941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 942 | significand.
 943 *----------------------------------------------------------------------------*/
 944
 945 INLINE float128
 946  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 947 {
 948     float128 z;
 949
 950     z.low = zSig1;
 951     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 952     return z;
 953
 954 }
 955
 956 /*----------------------------------------------------------------------------
 957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 958 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 959 | and `zSig2', and returns the proper quadruple-precision floating-point value
 960 | corresponding to the abstract input.  Ordinarily, the abstract value is
 961 | simply rounded and packed into the quadruple-precision format, with the
 962 | inexact exception raised if the abstract input cannot be represented
 963 | exactly.  However, if the abstract value is too large, the overflow and
 964 | inexact exceptions are raised and an infinity or maximal finite value is
 965 | returned.  If the abstract value is too small, the input value is rounded to
 966 | a subnormal number, and the underflow and inexact exceptions are raised if
 967 | the abstract input cannot be represented exactly as a subnormal quadruple-
 968 | precision floating-point number.
 969 |     The input significand must be normalized or smaller.  If the input
 970 | significand is not normalized, `zExp' must be 0; in that case, the result
 971 | returned is a subnormal number, and it must not require rounding.  In the
 972 | usual case that the input significand is normalized, `zExp' must be 1 less
 973 | than the ``true'' floating-point exponent.  The handling of underflow and
 974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 975 *----------------------------------------------------------------------------*/
 976
 977 static float128
 978  roundAndPackFloat128(
 979      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 980 {
 981     int8 roundingMode;
 982     flag roundNearestEven, increment, isTiny;
 983
 984     roundingMode = STATUS(float_rounding_mode);
 985     roundNearestEven = ( roundingMode == float_round_nearest_even );
 986     increment = ( (sbits64) zSig2 < 0 );
 987     if ( ! roundNearestEven ) {
 988         if ( roundingMode == float_round_to_zero ) {
 989             increment = 0;
 990         }
 991         else {
 992             if ( zSign ) {
 993                 increment = ( roundingMode == float_round_down ) && zSig2;
 994             }
 995             else {
 996                 increment = ( roundingMode == float_round_up ) && zSig2;
 997             }
 998         }
 999     }
1000     if ( 0x7FFD <= (bits32) zExp ) {
1001         if (    ( 0x7FFD < zExp )
1002              || (    ( zExp == 0x7FFD )
1003                   && eq128(
1004                          LIT64( 0x0001FFFFFFFFFFFF ),
1005                          LIT64( 0xFFFFFFFFFFFFFFFF ),
1006                          zSig0,
1007                          zSig1
1008                      )
1009                   && increment
1010                 )
1011            ) {
1012             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1013             if (    ( roundingMode == float_round_to_zero )
1014                  || ( zSign && ( roundingMode == float_round_up ) )
1015                  || ( ! zSign && ( roundingMode == float_round_down ) )
1016                ) {
1017                 return
1018                     packFloat128(
1019                         zSign,
1020                         0x7FFE,
1021                         LIT64( 0x0000FFFFFFFFFFFF ),
1022                         LIT64( 0xFFFFFFFFFFFFFFFF )
1023                     );
1024             }
1025             return packFloat128( zSign, 0x7FFF, 0, 0 );
1026         }
1027         if ( zExp < 0 ) {
1028             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1029             isTiny =
1030                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1031                 || ( zExp < -1 )
1032                 || ! increment
1033                 || lt128(
1034                        zSig0,
1035                        zSig1,
1036                        LIT64( 0x0001FFFFFFFFFFFF ),
1037                        LIT64( 0xFFFFFFFFFFFFFFFF )
1038                    );
1039             shift128ExtraRightJamming(
1040                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1041             zExp = 0;
1042             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1043             if ( roundNearestEven ) {
1044                 increment = ( (sbits64) zSig2 < 0 );
1045             }
1046             else {
1047                 if ( zSign ) {
1048                     increment = ( roundingMode == float_round_down ) && zSig2;
1049                 }
1050                 else {
1051                     increment = ( roundingMode == float_round_up ) && zSig2;
1052                 }
1053             }
1054         }
1055     }
1056     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1057     if ( increment ) {
1058         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1059         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1060     }
1061     else {
1062         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1063     }
1064     return packFloat128( zSign, zExp, zSig0, zSig1 );
1065
1066 }
1067
1068 /*----------------------------------------------------------------------------
1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1071 | returns the proper quadruple-precision floating-point value corresponding
1072 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1073 | except that the input significand has fewer bits and does not have to be
1074 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1075 | point exponent.
1076 *----------------------------------------------------------------------------*/
1077
1078 static float128
1079  normalizeRoundAndPackFloat128(
1080      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1081 {
1082     int8 shiftCount;
1083     bits64 zSig2;
1084
1085     if ( zSig0 == 0 ) {
1086         zSig0 = zSig1;
1087         zSig1 = 0;
1088         zExp -= 64;
1089     }
1090     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1091     if ( 0 <= shiftCount ) {
1092         zSig2 = 0;
1093         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1094     }
1095     else {
1096         shift128ExtraRightJamming(
1097             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1098     }
1099     zExp -= shiftCount;
1100     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1101
1102 }
1103
1104 #endif
1105
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a'
1108 | to the single-precision floating-point format.  The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1111
1112 float32 int32_to_float32( int32 a STATUS_PARAM )
1113 {
1114     flag zSign;
1115
1116     if ( a == 0 ) return float32_zero;
1117     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1118     zSign = ( a < 0 );
1119     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1120
1121 }
1122
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the double-precision floating-point format.  The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1128
1129 float64 int32_to_float64( int32 a STATUS_PARAM )
1130 {
1131     flag zSign;
1132     uint32 absA;
1133     int8 shiftCount;
1134     bits64 zSig;
1135
1136     if ( a == 0 ) return float64_zero;
1137     zSign = ( a < 0 );
1138     absA = zSign ? - a : a;
1139     shiftCount = countLeadingZeros32( absA ) + 21;
1140     zSig = absA;
1141     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1142
1143 }
1144
1145 #ifdef FLOATX80
1146
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a'
1149 | to the extended double-precision floating-point format.  The conversion
1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1151 | Arithmetic.
1152 *----------------------------------------------------------------------------*/
1153
1154 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1155 {
1156     flag zSign;
1157     uint32 absA;
1158     int8 shiftCount;
1159     bits64 zSig;
1160
1161     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1162     zSign = ( a < 0 );
1163     absA = zSign ? - a : a;
1164     shiftCount = countLeadingZeros32( absA ) + 32;
1165     zSig = absA;
1166     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1167
1168 }
1169
1170 #endif
1171
1172 #ifdef FLOAT128
1173
1174 /*----------------------------------------------------------------------------
1175 | Returns the result of converting the 32-bit two's complement integer `a' to
1176 | the quadruple-precision floating-point format.  The conversion is performed
1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178 *----------------------------------------------------------------------------*/
1179
1180 float128 int32_to_float128( int32 a STATUS_PARAM )
1181 {
1182     flag zSign;
1183     uint32 absA;
1184     int8 shiftCount;
1185     bits64 zSig0;
1186
1187     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1188     zSign = ( a < 0 );
1189     absA = zSign ? - a : a;
1190     shiftCount = countLeadingZeros32( absA ) + 17;
1191     zSig0 = absA;
1192     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1193
1194 }
1195
1196 #endif
1197
1198 /*----------------------------------------------------------------------------
1199 | Returns the result of converting the 64-bit two's complement integer `a'
1200 | to the single-precision floating-point format.  The conversion is performed
1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202 *----------------------------------------------------------------------------*/
1203
1204 float32 int64_to_float32( int64 a STATUS_PARAM )
1205 {
1206     flag zSign;
1207     uint64 absA;
1208     int8 shiftCount;
1209
1210     if ( a == 0 ) return float32_zero;
1211     zSign = ( a < 0 );
1212     absA = zSign ? - a : a;
1213     shiftCount = countLeadingZeros64( absA ) - 40;
1214     if ( 0 <= shiftCount ) {
1215         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1216     }
1217     else {
1218         shiftCount += 7;
1219         if ( shiftCount < 0 ) {
1220             shift64RightJamming( absA, - shiftCount, &absA );
1221         }
1222         else {
1223             absA <<= shiftCount;
1224         }
1225         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1226     }
1227
1228 }
1229
1230 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1231 {
1232     int8 shiftCount;
1233
1234     if ( a == 0 ) return float32_zero;
1235     shiftCount = countLeadingZeros64( a ) - 40;
1236     if ( 0 <= shiftCount ) {
1237         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1238     }
1239     else {
1240         shiftCount += 7;
1241         if ( shiftCount < 0 ) {
1242             shift64RightJamming( a, - shiftCount, &a );
1243         }
1244         else {
1245             a <<= shiftCount;
1246         }
1247         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1248     }
1249 }
1250
1251 /*----------------------------------------------------------------------------
1252 | Returns the result of converting the 64-bit two's complement integer `a'
1253 | to the double-precision floating-point format.  The conversion is performed
1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 *----------------------------------------------------------------------------*/
1256
1257 float64 int64_to_float64( int64 a STATUS_PARAM )
1258 {
1259     flag zSign;
1260
1261     if ( a == 0 ) return float64_zero;
1262     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1263         return packFloat64( 1, 0x43E, 0 );
1264     }
1265     zSign = ( a < 0 );
1266     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1267
1268 }
1269
1270 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1271 {
1272     if ( a == 0 ) return float64_zero;
1273     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1274
1275 }
1276
1277 #ifdef FLOATX80
1278
1279 /*----------------------------------------------------------------------------
1280 | Returns the result of converting the 64-bit two's complement integer `a'
1281 | to the extended double-precision floating-point format.  The conversion
1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1283 | Arithmetic.
1284 *----------------------------------------------------------------------------*/
1285
1286 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1287 {
1288     flag zSign;
1289     uint64 absA;
1290     int8 shiftCount;
1291
1292     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1293     zSign = ( a < 0 );
1294     absA = zSign ? - a : a;
1295     shiftCount = countLeadingZeros64( absA );
1296     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1297
1298 }
1299
1300 #endif
1301
1302 #ifdef FLOAT128
1303
1304 /*----------------------------------------------------------------------------
1305 | Returns the result of converting the 64-bit two's complement integer `a' to
1306 | the quadruple-precision floating-point format.  The conversion is performed
1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308 *----------------------------------------------------------------------------*/
1309
1310 float128 int64_to_float128( int64 a STATUS_PARAM )
1311 {
1312     flag zSign;
1313     uint64 absA;
1314     int8 shiftCount;
1315     int32 zExp;
1316     bits64 zSig0, zSig1;
1317
1318     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1319     zSign = ( a < 0 );
1320     absA = zSign ? - a : a;
1321     shiftCount = countLeadingZeros64( absA ) + 49;
1322     zExp = 0x406E - shiftCount;
1323     if ( 64 <= shiftCount ) {
1324         zSig1 = 0;
1325         zSig0 = absA;
1326         shiftCount -= 64;
1327     }
1328     else {
1329         zSig1 = absA;
1330         zSig0 = 0;
1331     }
1332     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1333     return packFloat128( zSign, zExp, zSig0, zSig1 );
1334
1335 }
1336
1337 #endif
1338
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format.  The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic---which means in particular that the conversion is rounded
1344 | according to the current rounding mode.  If `a' is a NaN, the largest
1345 | positive integer is returned.  Otherwise, if the conversion overflows, the
1346 | largest integer with the same sign as `a' is returned.
1347 *----------------------------------------------------------------------------*/
1348
1349 int32 float32_to_int32( float32 a STATUS_PARAM )
1350 {
1351     flag aSign;
1352     int16 aExp, shiftCount;
1353     bits32 aSig;
1354     bits64 aSig64;
1355
1356     a = float32_squash_input_denormal(a STATUS_VAR);
1357     aSig = extractFloat32Frac( a );
1358     aExp = extractFloat32Exp( a );
1359     aSign = extractFloat32Sign( a );
1360     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1361     if ( aExp ) aSig |= 0x00800000;
1362     shiftCount = 0xAF - aExp;
1363     aSig64 = aSig;
1364     aSig64 <<= 32;
1365     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1366     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1367
1368 }
1369
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the single-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format.  The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic, except that the conversion is always rounded toward zero.
1375 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1376 | the conversion overflows, the largest integer with the same sign as `a' is
1377 | returned.
1378 *----------------------------------------------------------------------------*/
1379
1380 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1381 {
1382     flag aSign;
1383     int16 aExp, shiftCount;
1384     bits32 aSig;
1385     int32 z;
1386     a = float32_squash_input_denormal(a STATUS_VAR);
1387
1388     aSig = extractFloat32Frac( a );
1389     aExp = extractFloat32Exp( a );
1390     aSign = extractFloat32Sign( a );
1391     shiftCount = aExp - 0x9E;
1392     if ( 0 <= shiftCount ) {
1393         if ( float32_val(a) != 0xCF000000 ) {
1394             float_raise( float_flag_invalid STATUS_VAR);
1395             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1396         }
1397         return (sbits32) 0x80000000;
1398     }
1399     else if ( aExp <= 0x7E ) {
1400         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1401         return 0;
1402     }
1403     aSig = ( aSig | 0x00800000 )<<8;
1404     z = aSig>>( - shiftCount );
1405     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1406         STATUS(float_exception_flags) |= float_flag_inexact;
1407     }
1408     if ( aSign ) z = - z;
1409     return z;
1410
1411 }
1412
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the single-precision floating-point value
1415 | `a' to the 16-bit two's complement integer format.  The conversion is
1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic, except that the conversion is always rounded toward zero.
1418 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1419 | the conversion overflows, the largest integer with the same sign as `a' is
1420 | returned.
1421 *----------------------------------------------------------------------------*/
1422
1423 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1424 {
1425     flag aSign;
1426     int16 aExp, shiftCount;
1427     bits32 aSig;
1428     int32 z;
1429
1430     aSig = extractFloat32Frac( a );
1431     aExp = extractFloat32Exp( a );
1432     aSign = extractFloat32Sign( a );
1433     shiftCount = aExp - 0x8E;
1434     if ( 0 <= shiftCount ) {
1435         if ( float32_val(a) != 0xC7000000 ) {
1436             float_raise( float_flag_invalid STATUS_VAR);
1437             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1438                 return 0x7FFF;
1439             }
1440         }
1441         return (sbits32) 0xffff8000;
1442     }
1443     else if ( aExp <= 0x7E ) {
1444         if ( aExp | aSig ) {
1445             STATUS(float_exception_flags) |= float_flag_inexact;
1446         }
1447         return 0;
1448     }
1449     shiftCount -= 0x10;
1450     aSig = ( aSig | 0x00800000 )<<8;
1451     z = aSig>>( - shiftCount );
1452     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1453         STATUS(float_exception_flags) |= float_flag_inexact;
1454     }
1455     if ( aSign ) {
1456         z = - z;
1457     }
1458     return z;
1459
1460 }
1461
1462 /*----------------------------------------------------------------------------
1463 | Returns the result of converting the single-precision floating-point value
1464 | `a' to the 64-bit two's complement integer format.  The conversion is
1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 | Arithmetic---which means in particular that the conversion is rounded
1467 | according to the current rounding mode.  If `a' is a NaN, the largest
1468 | positive integer is returned.  Otherwise, if the conversion overflows, the
1469 | largest integer with the same sign as `a' is returned.
1470 *----------------------------------------------------------------------------*/
1471
1472 int64 float32_to_int64( float32 a STATUS_PARAM )
1473 {
1474     flag aSign;
1475     int16 aExp, shiftCount;
1476     bits32 aSig;
1477     bits64 aSig64, aSigExtra;
1478     a = float32_squash_input_denormal(a STATUS_VAR);
1479
1480     aSig = extractFloat32Frac( a );
1481     aExp = extractFloat32Exp( a );
1482     aSign = extractFloat32Sign( a );
1483     shiftCount = 0xBE - aExp;
1484     if ( shiftCount < 0 ) {
1485         float_raise( float_flag_invalid STATUS_VAR);
1486         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1487             return LIT64( 0x7FFFFFFFFFFFFFFF );
1488         }
1489         return (sbits64) LIT64( 0x8000000000000000 );
1490     }
1491     if ( aExp ) aSig |= 0x00800000;
1492     aSig64 = aSig;
1493     aSig64 <<= 40;
1494     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1495     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1496
1497 }
1498
1499 /*----------------------------------------------------------------------------
1500 | Returns the result of converting the single-precision floating-point value
1501 | `a' to the 64-bit two's complement integer format.  The conversion is
1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 | Arithmetic, except that the conversion is always rounded toward zero.  If
1504 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1505 | conversion overflows, the largest integer with the same sign as `a' is
1506 | returned.
1507 *----------------------------------------------------------------------------*/
1508
1509 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1510 {
1511     flag aSign;
1512     int16 aExp, shiftCount;
1513     bits32 aSig;
1514     bits64 aSig64;
1515     int64 z;
1516     a = float32_squash_input_denormal(a STATUS_VAR);
1517
1518     aSig = extractFloat32Frac( a );
1519     aExp = extractFloat32Exp( a );
1520     aSign = extractFloat32Sign( a );
1521     shiftCount = aExp - 0xBE;
1522     if ( 0 <= shiftCount ) {
1523         if ( float32_val(a) != 0xDF000000 ) {
1524             float_raise( float_flag_invalid STATUS_VAR);
1525             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1526                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1527             }
1528         }
1529         return (sbits64) LIT64( 0x8000000000000000 );
1530     }
1531     else if ( aExp <= 0x7E ) {
1532         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1533         return 0;
1534     }
1535     aSig64 = aSig | 0x00800000;
1536     aSig64 <<= 40;
1537     z = aSig64>>( - shiftCount );
1538     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1539         STATUS(float_exception_flags) |= float_flag_inexact;
1540     }
1541     if ( aSign ) z = - z;
1542     return z;
1543
1544 }
1545
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the double-precision floating-point format.  The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1550 | Arithmetic.
1551 *----------------------------------------------------------------------------*/
1552
1553 float64 float32_to_float64( float32 a STATUS_PARAM )
1554 {
1555     flag aSign;
1556     int16 aExp;
1557     bits32 aSig;
1558     a = float32_squash_input_denormal(a STATUS_VAR);
1559
1560     aSig = extractFloat32Frac( a );
1561     aExp = extractFloat32Exp( a );
1562     aSign = extractFloat32Sign( a );
1563     if ( aExp == 0xFF ) {
1564         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1565         return packFloat64( aSign, 0x7FF, 0 );
1566     }
1567     if ( aExp == 0 ) {
1568         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1569         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1570         --aExp;
1571     }
1572     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1573
1574 }
1575
1576 #ifdef FLOATX80
1577
1578 /*----------------------------------------------------------------------------
1579 | Returns the result of converting the single-precision floating-point value
1580 | `a' to the extended double-precision floating-point format.  The conversion
1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1582 | Arithmetic.
1583 *----------------------------------------------------------------------------*/
1584
1585 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1586 {
1587     flag aSign;
1588     int16 aExp;
1589     bits32 aSig;
1590
1591     a = float32_squash_input_denormal(a STATUS_VAR);
1592     aSig = extractFloat32Frac( a );
1593     aExp = extractFloat32Exp( a );
1594     aSign = extractFloat32Sign( a );
1595     if ( aExp == 0xFF ) {
1596         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1597         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1598     }
1599     if ( aExp == 0 ) {
1600         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1601         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1602     }
1603     aSig |= 0x00800000;
1604     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1605
1606 }
1607
1608 #endif
1609
1610 #ifdef FLOAT128
1611
1612 /*----------------------------------------------------------------------------
1613 | Returns the result of converting the single-precision floating-point value
1614 | `a' to the double-precision floating-point format.  The conversion is
1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1616 | Arithmetic.
1617 *----------------------------------------------------------------------------*/
1618
1619 float128 float32_to_float128( float32 a STATUS_PARAM )
1620 {
1621     flag aSign;
1622     int16 aExp;
1623     bits32 aSig;
1624
1625     a = float32_squash_input_denormal(a STATUS_VAR);
1626     aSig = extractFloat32Frac( a );
1627     aExp = extractFloat32Exp( a );
1628     aSign = extractFloat32Sign( a );
1629     if ( aExp == 0xFF ) {
1630         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1631         return packFloat128( aSign, 0x7FFF, 0, 0 );
1632     }
1633     if ( aExp == 0 ) {
1634         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1635         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1636         --aExp;
1637     }
1638     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1639
1640 }
1641
1642 #endif
1643
1644 /*----------------------------------------------------------------------------
1645 | Rounds the single-precision floating-point value `a' to an integer, and
1646 | returns the result as a single-precision floating-point value.  The
1647 | operation is performed according to the IEC/IEEE Standard for Binary
1648 | Floating-Point Arithmetic.
1649 *----------------------------------------------------------------------------*/
1650
1651 float32 float32_round_to_int( float32 a STATUS_PARAM)
1652 {
1653     flag aSign;
1654     int16 aExp;
1655     bits32 lastBitMask, roundBitsMask;
1656     int8 roundingMode;
1657     bits32 z;
1658     a = float32_squash_input_denormal(a STATUS_VAR);
1659
1660     aExp = extractFloat32Exp( a );
1661     if ( 0x96 <= aExp ) {
1662         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1663             return propagateFloat32NaN( a, a STATUS_VAR );
1664         }
1665         return a;
1666     }
1667     if ( aExp <= 0x7E ) {
1668         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1669         STATUS(float_exception_flags) |= float_flag_inexact;
1670         aSign = extractFloat32Sign( a );
1671         switch ( STATUS(float_rounding_mode) ) {
1672          case float_round_nearest_even:
1673             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1674                 return packFloat32( aSign, 0x7F, 0 );
1675             }
1676             break;
1677          case float_round_down:
1678             return make_float32(aSign ? 0xBF800000 : 0);
1679          case float_round_up:
1680             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1681         }
1682         return packFloat32( aSign, 0, 0 );
1683     }
1684     lastBitMask = 1;
1685     lastBitMask <<= 0x96 - aExp;
1686     roundBitsMask = lastBitMask - 1;
1687     z = float32_val(a);
1688     roundingMode = STATUS(float_rounding_mode);
1689     if ( roundingMode == float_round_nearest_even ) {
1690         z += lastBitMask>>1;
1691         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1692     }
1693     else if ( roundingMode != float_round_to_zero ) {
1694         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1695             z += roundBitsMask;
1696         }
1697     }
1698     z &= ~ roundBitsMask;
1699     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1700     return make_float32(z);
1701
1702 }
1703
1704 /*----------------------------------------------------------------------------
1705 | Returns the result of adding the absolute values of the single-precision
1706 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1707 | before being returned.  `zSign' is ignored if the result is a NaN.
1708 | The addition is performed according to the IEC/IEEE Standard for Binary
1709 | Floating-Point Arithmetic.
1710 *----------------------------------------------------------------------------*/
1711
1712 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1713 {
1714     int16 aExp, bExp, zExp;
1715     bits32 aSig, bSig, zSig;
1716     int16 expDiff;
1717
1718     aSig = extractFloat32Frac( a );
1719     aExp = extractFloat32Exp( a );
1720     bSig = extractFloat32Frac( b );
1721     bExp = extractFloat32Exp( b );
1722     expDiff = aExp - bExp;
1723     aSig <<= 6;
1724     bSig <<= 6;
1725     if ( 0 < expDiff ) {
1726         if ( aExp == 0xFF ) {
1727             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1728             return a;
1729         }
1730         if ( bExp == 0 ) {
1731             --expDiff;
1732         }
1733         else {
1734             bSig |= 0x20000000;
1735         }
1736         shift32RightJamming( bSig, expDiff, &bSig );
1737         zExp = aExp;
1738     }
1739     else if ( expDiff < 0 ) {
1740         if ( bExp == 0xFF ) {
1741             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1742             return packFloat32( zSign, 0xFF, 0 );
1743         }
1744         if ( aExp == 0 ) {
1745             ++expDiff;
1746         }
1747         else {
1748             aSig |= 0x20000000;
1749         }
1750         shift32RightJamming( aSig, - expDiff, &aSig );
1751         zExp = bExp;
1752     }
1753     else {
1754         if ( aExp == 0xFF ) {
1755             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1756             return a;
1757         }
1758         if ( aExp == 0 ) {
1759             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1760             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1761         }
1762         zSig = 0x40000000 + aSig + bSig;
1763         zExp = aExp;
1764         goto roundAndPack;
1765     }
1766     aSig |= 0x20000000;
1767     zSig = ( aSig + bSig )<<1;
1768     --zExp;
1769     if ( (sbits32) zSig < 0 ) {
1770         zSig = aSig + bSig;
1771         ++zExp;
1772     }
1773  roundAndPack:
1774     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1775
1776 }
1777
1778 /*----------------------------------------------------------------------------
1779 | Returns the result of subtracting the absolute values of the single-
1780 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1781 | difference is negated before being returned.  `zSign' is ignored if the
1782 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1783 | Standard for Binary Floating-Point Arithmetic.
1784 *----------------------------------------------------------------------------*/
1785
1786 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1787 {
1788     int16 aExp, bExp, zExp;
1789     bits32 aSig, bSig, zSig;
1790     int16 expDiff;
1791
1792     aSig = extractFloat32Frac( a );
1793     aExp = extractFloat32Exp( a );
1794     bSig = extractFloat32Frac( b );
1795     bExp = extractFloat32Exp( b );
1796     expDiff = aExp - bExp;
1797     aSig <<= 7;
1798     bSig <<= 7;
1799     if ( 0 < expDiff ) goto aExpBigger;
1800     if ( expDiff < 0 ) goto bExpBigger;
1801     if ( aExp == 0xFF ) {
1802         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1803         float_raise( float_flag_invalid STATUS_VAR);
1804         return float32_default_nan;
1805     }
1806     if ( aExp == 0 ) {
1807         aExp = 1;
1808         bExp = 1;
1809     }
1810     if ( bSig < aSig ) goto aBigger;
1811     if ( aSig < bSig ) goto bBigger;
1812     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1813  bExpBigger:
1814     if ( bExp == 0xFF ) {
1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1816         return packFloat32( zSign ^ 1, 0xFF, 0 );
1817     }
1818     if ( aExp == 0 ) {
1819         ++expDiff;
1820     }
1821     else {
1822         aSig |= 0x40000000;
1823     }
1824     shift32RightJamming( aSig, - expDiff, &aSig );
1825     bSig |= 0x40000000;
1826  bBigger:
1827     zSig = bSig - aSig;
1828     zExp = bExp;
1829     zSign ^= 1;
1830     goto normalizeRoundAndPack;
1831  aExpBigger:
1832     if ( aExp == 0xFF ) {
1833         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1834         return a;
1835     }
1836     if ( bExp == 0 ) {
1837         --expDiff;
1838     }
1839     else {
1840         bSig |= 0x40000000;
1841     }
1842     shift32RightJamming( bSig, expDiff, &bSig );
1843     aSig |= 0x40000000;
1844  aBigger:
1845     zSig = aSig - bSig;
1846     zExp = aExp;
1847  normalizeRoundAndPack:
1848     --zExp;
1849     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1850
1851 }
1852
1853 /*----------------------------------------------------------------------------
1854 | Returns the result of adding the single-precision floating-point values `a'
1855 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1856 | Binary Floating-Point Arithmetic.
1857 *----------------------------------------------------------------------------*/
1858
1859 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1860 {
1861     flag aSign, bSign;
1862     a = float32_squash_input_denormal(a STATUS_VAR);
1863     b = float32_squash_input_denormal(b STATUS_VAR);
1864
1865     aSign = extractFloat32Sign( a );
1866     bSign = extractFloat32Sign( b );
1867     if ( aSign == bSign ) {
1868         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1869     }
1870     else {
1871         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1872     }
1873
1874 }
1875
1876 /*----------------------------------------------------------------------------
1877 | Returns the result of subtracting the single-precision floating-point values
1878 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1879 | for Binary Floating-Point Arithmetic.
1880 *----------------------------------------------------------------------------*/
1881
1882 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1883 {
1884     flag aSign, bSign;
1885     a = float32_squash_input_denormal(a STATUS_VAR);
1886     b = float32_squash_input_denormal(b STATUS_VAR);
1887
1888     aSign = extractFloat32Sign( a );
1889     bSign = extractFloat32Sign( b );
1890     if ( aSign == bSign ) {
1891         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1892     }
1893     else {
1894         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1895     }
1896
1897 }
1898
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of multiplying the single-precision floating-point values
1901 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1902 | for Binary Floating-Point Arithmetic.
1903 *----------------------------------------------------------------------------*/
1904
1905 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1906 {
1907     flag aSign, bSign, zSign;
1908     int16 aExp, bExp, zExp;
1909     bits32 aSig, bSig;
1910     bits64 zSig64;
1911     bits32 zSig;
1912
1913     a = float32_squash_input_denormal(a STATUS_VAR);
1914     b = float32_squash_input_denormal(b STATUS_VAR);
1915
1916     aSig = extractFloat32Frac( a );
1917     aExp = extractFloat32Exp( a );
1918     aSign = extractFloat32Sign( a );
1919     bSig = extractFloat32Frac( b );
1920     bExp = extractFloat32Exp( b );
1921     bSign = extractFloat32Sign( b );
1922     zSign = aSign ^ bSign;
1923     if ( aExp == 0xFF ) {
1924         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1925             return propagateFloat32NaN( a, b STATUS_VAR );
1926         }
1927         if ( ( bExp | bSig ) == 0 ) {
1928             float_raise( float_flag_invalid STATUS_VAR);
1929             return float32_default_nan;
1930         }
1931         return packFloat32( zSign, 0xFF, 0 );
1932     }
1933     if ( bExp == 0xFF ) {
1934         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1935         if ( ( aExp | aSig ) == 0 ) {
1936             float_raise( float_flag_invalid STATUS_VAR);
1937             return float32_default_nan;
1938         }
1939         return packFloat32( zSign, 0xFF, 0 );
1940     }
1941     if ( aExp == 0 ) {
1942         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1943         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1944     }
1945     if ( bExp == 0 ) {
1946         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1947         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1948     }
1949     zExp = aExp + bExp - 0x7F;
1950     aSig = ( aSig | 0x00800000 )<<7;
1951     bSig = ( bSig | 0x00800000 )<<8;
1952     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1953     zSig = zSig64;
1954     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1955         zSig <<= 1;
1956         --zExp;
1957     }
1958     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1959
1960 }
1961
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of dividing the single-precision floating-point value `a'
1964 | by the corresponding value `b'.  The operation is performed according to the
1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966 *----------------------------------------------------------------------------*/
1967
1968 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1969 {
1970     flag aSign, bSign, zSign;
1971     int16 aExp, bExp, zExp;
1972     bits32 aSig, bSig, zSig;
1973     a = float32_squash_input_denormal(a STATUS_VAR);
1974     b = float32_squash_input_denormal(b STATUS_VAR);
1975
1976     aSig = extractFloat32Frac( a );
1977     aExp = extractFloat32Exp( a );
1978     aSign = extractFloat32Sign( a );
1979     bSig = extractFloat32Frac( b );
1980     bExp = extractFloat32Exp( b );
1981     bSign = extractFloat32Sign( b );
1982     zSign = aSign ^ bSign;
1983     if ( aExp == 0xFF ) {
1984         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1985         if ( bExp == 0xFF ) {
1986             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1987             float_raise( float_flag_invalid STATUS_VAR);
1988             return float32_default_nan;
1989         }
1990         return packFloat32( zSign, 0xFF, 0 );
1991     }
1992     if ( bExp == 0xFF ) {
1993         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1994         return packFloat32( zSign, 0, 0 );
1995     }
1996     if ( bExp == 0 ) {
1997         if ( bSig == 0 ) {
1998             if ( ( aExp | aSig ) == 0 ) {
1999                 float_raise( float_flag_invalid STATUS_VAR);
2000                 return float32_default_nan;
2001             }
2002             float_raise( float_flag_divbyzero STATUS_VAR);
2003             return packFloat32( zSign, 0xFF, 0 );
2004         }
2005         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2006     }
2007     if ( aExp == 0 ) {
2008         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2009         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2010     }
2011     zExp = aExp - bExp + 0x7D;
2012     aSig = ( aSig | 0x00800000 )<<7;
2013     bSig = ( bSig | 0x00800000 )<<8;
2014     if ( bSig <= ( aSig + aSig ) ) {
2015         aSig >>= 1;
2016         ++zExp;
2017     }
2018     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
2019     if ( ( zSig & 0x3F ) == 0 ) {
2020         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
2021     }
2022     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2023
2024 }
2025
2026 /*----------------------------------------------------------------------------
2027 | Returns the remainder of the single-precision floating-point value `a'
2028 | with respect to the corresponding value `b'.  The operation is performed
2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030 *----------------------------------------------------------------------------*/
2031
2032 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2033 {
2034     flag aSign, zSign;
2035     int16 aExp, bExp, expDiff;
2036     bits32 aSig, bSig;
2037     bits32 q;
2038     bits64 aSig64, bSig64, q64;
2039     bits32 alternateASig;
2040     sbits32 sigMean;
2041     a = float32_squash_input_denormal(a STATUS_VAR);
2042     b = float32_squash_input_denormal(b STATUS_VAR);
2043
2044     aSig = extractFloat32Frac( a );
2045     aExp = extractFloat32Exp( a );
2046     aSign = extractFloat32Sign( a );
2047     bSig = extractFloat32Frac( b );
2048     bExp = extractFloat32Exp( b );
2049     if ( aExp == 0xFF ) {
2050         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2051             return propagateFloat32NaN( a, b STATUS_VAR );
2052         }
2053         float_raise( float_flag_invalid STATUS_VAR);
2054         return float32_default_nan;
2055     }
2056     if ( bExp == 0xFF ) {
2057         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2058         return a;
2059     }
2060     if ( bExp == 0 ) {
2061         if ( bSig == 0 ) {
2062             float_raise( float_flag_invalid STATUS_VAR);
2063             return float32_default_nan;
2064         }
2065         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2066     }
2067     if ( aExp == 0 ) {
2068         if ( aSig == 0 ) return a;
2069         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2070     }
2071     expDiff = aExp - bExp;
2072     aSig |= 0x00800000;
2073     bSig |= 0x00800000;
2074     if ( expDiff < 32 ) {
2075         aSig <<= 8;
2076         bSig <<= 8;
2077         if ( expDiff < 0 ) {
2078             if ( expDiff < -1 ) return a;
2079             aSig >>= 1;
2080         }
2081         q = ( bSig <= aSig );
2082         if ( q ) aSig -= bSig;
2083         if ( 0 < expDiff ) {
2084             q = ( ( (bits64) aSig )<<32 ) / bSig;
2085             q >>= 32 - expDiff;
2086             bSig >>= 2;
2087             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2088         }
2089         else {
2090             aSig >>= 2;
2091             bSig >>= 2;
2092         }
2093     }
2094     else {
2095         if ( bSig <= aSig ) aSig -= bSig;
2096         aSig64 = ( (bits64) aSig )<<40;
2097         bSig64 = ( (bits64) bSig )<<40;
2098         expDiff -= 64;
2099         while ( 0 < expDiff ) {
2100             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2101             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2102             aSig64 = - ( ( bSig * q64 )<<38 );
2103             expDiff -= 62;
2104         }
2105         expDiff += 64;
2106         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2107         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2108         q = q64>>( 64 - expDiff );
2109         bSig <<= 6;
2110         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2111     }
2112     do {
2113         alternateASig = aSig;
2114         ++q;
2115         aSig -= bSig;
2116     } while ( 0 <= (sbits32) aSig );
2117     sigMean = aSig + alternateASig;
2118     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2119         aSig = alternateASig;
2120     }
2121     zSign = ( (sbits32) aSig < 0 );
2122     if ( zSign ) aSig = - aSig;
2123     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2124
2125 }
2126
2127 /*----------------------------------------------------------------------------
2128 | Returns the square root of the single-precision floating-point value `a'.
2129 | The operation is performed according to the IEC/IEEE Standard for Binary
2130 | Floating-Point Arithmetic.
2131 *----------------------------------------------------------------------------*/
2132
2133 float32 float32_sqrt( float32 a STATUS_PARAM )
2134 {
2135     flag aSign;
2136     int16 aExp, zExp;
2137     bits32 aSig, zSig;
2138     bits64 rem, term;
2139     a = float32_squash_input_denormal(a STATUS_VAR);
2140
2141     aSig = extractFloat32Frac( a );
2142     aExp = extractFloat32Exp( a );
2143     aSign = extractFloat32Sign( a );
2144     if ( aExp == 0xFF ) {
2145         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2146         if ( ! aSign ) return a;
2147         float_raise( float_flag_invalid STATUS_VAR);
2148         return float32_default_nan;
2149     }
2150     if ( aSign ) {
2151         if ( ( aExp | aSig ) == 0 ) return a;
2152         float_raise( float_flag_invalid STATUS_VAR);
2153         return float32_default_nan;
2154     }
2155     if ( aExp == 0 ) {
2156         if ( aSig == 0 ) return float32_zero;
2157         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2158     }
2159     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2160     aSig = ( aSig | 0x00800000 )<<8;
2161     zSig = estimateSqrt32( aExp, aSig ) + 2;
2162     if ( ( zSig & 0x7F ) <= 5 ) {
2163         if ( zSig < 2 ) {
2164             zSig = 0x7FFFFFFF;
2165             goto roundAndPack;
2166         }
2167         aSig >>= aExp & 1;
2168         term = ( (bits64) zSig ) * zSig;
2169         rem = ( ( (bits64) aSig )<<32 ) - term;
2170         while ( (sbits64) rem < 0 ) {
2171             --zSig;
2172             rem += ( ( (bits64) zSig )<<1 ) | 1;
2173         }
2174         zSig |= ( rem != 0 );
2175     }
2176     shift32RightJamming( zSig, 1, &zSig );
2177  roundAndPack:
2178     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2179
2180 }
2181
2182 /*----------------------------------------------------------------------------
2183 | Returns the binary exponential of the single-precision floating-point value
2184 | `a'. The operation is performed according to the IEC/IEEE Standard for
2185 | Binary Floating-Point Arithmetic.
2186 |
2187 | Uses the following identities:
2188 |
2189 | 1. -------------------------------------------------------------------------
2190 |      x    x*ln(2)
2191 |     2  = e
2192 |
2193 | 2. -------------------------------------------------------------------------
2194 |                      2     3     4     5           n
2195 |      x        x     x     x     x     x           x
2196 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2197 |               1!    2!    3!    4!    5!          n!
2198 *----------------------------------------------------------------------------*/
2199
2200 static const float64 float32_exp2_coefficients[15] =
2201 {
2202     make_float64( 0x3ff0000000000000ll ), /*  1 */
2203     make_float64( 0x3fe0000000000000ll ), /*  2 */
2204     make_float64( 0x3fc5555555555555ll ), /*  3 */
2205     make_float64( 0x3fa5555555555555ll ), /*  4 */
2206     make_float64( 0x3f81111111111111ll ), /*  5 */
2207     make_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2208     make_float64( 0x3f2a01a01a01a01all ), /*  7 */
2209     make_float64( 0x3efa01a01a01a01all ), /*  8 */
2210     make_float64( 0x3ec71de3a556c734ll ), /*  9 */
2211     make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2212     make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2213     make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2214     make_float64( 0x3de6124613a86d09ll ), /* 13 */
2215     make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2216     make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2217 };
2218
2219 float32 float32_exp2( float32 a STATUS_PARAM )
2220 {
2221     flag aSign;
2222     int16 aExp;
2223     bits32 aSig;
2224     float64 r, x, xn;
2225     int i;
2226     a = float32_squash_input_denormal(a STATUS_VAR);
2227
2228     aSig = extractFloat32Frac( a );
2229     aExp = extractFloat32Exp( a );
2230     aSign = extractFloat32Sign( a );
2231
2232     if ( aExp == 0xFF) {
2233         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2234         return (aSign) ? float32_zero : a;
2235     }
2236     if (aExp == 0) {
2237         if (aSig == 0) return float32_one;
2238     }
2239
2240     float_raise( float_flag_inexact STATUS_VAR);
2241
2242     /* ******************************* */
2243     /* using float64 for approximation */
2244     /* ******************************* */
2245     x = float32_to_float64(a STATUS_VAR);
2246     x = float64_mul(x, float64_ln2 STATUS_VAR);
2247
2248     xn = x;
2249     r = float64_one;
2250     for (i = 0 ; i < 15 ; i++) {
2251         float64 f;
2252
2253         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2254         r = float64_add(r, f STATUS_VAR);
2255
2256         xn = float64_mul(xn, x STATUS_VAR);
2257     }
2258
2259     return float64_to_float32(r, status);
2260 }
2261
2262 /*----------------------------------------------------------------------------
2263 | Returns the binary log of the single-precision floating-point value `a'.
2264 | The operation is performed according to the IEC/IEEE Standard for Binary
2265 | Floating-Point Arithmetic.
2266 *----------------------------------------------------------------------------*/
2267 float32 float32_log2( float32 a STATUS_PARAM )
2268 {
2269     flag aSign, zSign;
2270     int16 aExp;
2271     bits32 aSig, zSig, i;
2272
2273     a = float32_squash_input_denormal(a STATUS_VAR);
2274     aSig = extractFloat32Frac( a );
2275     aExp = extractFloat32Exp( a );
2276     aSign = extractFloat32Sign( a );
2277
2278     if ( aExp == 0 ) {
2279         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2280         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2281     }
2282     if ( aSign ) {
2283         float_raise( float_flag_invalid STATUS_VAR);
2284         return float32_default_nan;
2285     }
2286     if ( aExp == 0xFF ) {
2287         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2288         return a;
2289     }
2290
2291     aExp -= 0x7F;
2292     aSig |= 0x00800000;
2293     zSign = aExp < 0;
2294     zSig = aExp << 23;
2295
2296     for (i = 1 << 22; i > 0; i >>= 1) {
2297         aSig = ( (bits64)aSig * aSig ) >> 23;
2298         if ( aSig & 0x01000000 ) {
2299             aSig >>= 1;
2300             zSig |= i;
2301         }
2302     }
2303
2304     if ( zSign )
2305         zSig = -zSig;
2306
2307     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2308 }
2309
2310 /*----------------------------------------------------------------------------
2311 | Returns 1 if the single-precision floating-point value `a' is equal to
2312 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314 *----------------------------------------------------------------------------*/
2315
2316 int float32_eq( float32 a, float32 b STATUS_PARAM )
2317 {
2318     a = float32_squash_input_denormal(a STATUS_VAR);
2319     b = float32_squash_input_denormal(b STATUS_VAR);
2320
2321     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2323        ) {
2324         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2325             float_raise( float_flag_invalid STATUS_VAR);
2326         }
2327         return 0;
2328     }
2329     return ( float32_val(a) == float32_val(b) ) ||
2330             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2331
2332 }
2333
2334 /*----------------------------------------------------------------------------
2335 | Returns 1 if the single-precision floating-point value `a' is less than
2336 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2338 | Arithmetic.
2339 *----------------------------------------------------------------------------*/
2340
2341 int float32_le( float32 a, float32 b STATUS_PARAM )
2342 {
2343     flag aSign, bSign;
2344     bits32 av, bv;
2345     a = float32_squash_input_denormal(a STATUS_VAR);
2346     b = float32_squash_input_denormal(b STATUS_VAR);
2347
2348     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2349          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2350        ) {
2351         float_raise( float_flag_invalid STATUS_VAR);
2352         return 0;
2353     }
2354     aSign = extractFloat32Sign( a );
2355     bSign = extractFloat32Sign( b );
2356     av = float32_val(a);
2357     bv = float32_val(b);
2358     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2359     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2360
2361 }
2362
2363 /*----------------------------------------------------------------------------
2364 | Returns 1 if the single-precision floating-point value `a' is less than
2365 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2368
2369 int float32_lt( float32 a, float32 b STATUS_PARAM )
2370 {
2371     flag aSign, bSign;
2372     bits32 av, bv;
2373     a = float32_squash_input_denormal(a STATUS_VAR);
2374     b = float32_squash_input_denormal(b STATUS_VAR);
2375
2376     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2377          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2378        ) {
2379         float_raise( float_flag_invalid STATUS_VAR);
2380         return 0;
2381     }
2382     aSign = extractFloat32Sign( a );
2383     bSign = extractFloat32Sign( b );
2384     av = float32_val(a);
2385     bv = float32_val(b);
2386     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2387     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2388
2389 }
2390
2391 /*----------------------------------------------------------------------------
2392 | Returns 1 if the single-precision floating-point value `a' is equal to
2393 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2394 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396 *----------------------------------------------------------------------------*/
2397
2398 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2399 {
2400     bits32 av, bv;
2401     a = float32_squash_input_denormal(a STATUS_VAR);
2402     b = float32_squash_input_denormal(b STATUS_VAR);
2403
2404     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2405          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2406        ) {
2407         float_raise( float_flag_invalid STATUS_VAR);
2408         return 0;
2409     }
2410     av = float32_val(a);
2411     bv = float32_val(b);
2412     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2413
2414 }
2415
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is less than or
2418 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2419 | cause an exception.  Otherwise, the comparison is performed according to the
2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421 *----------------------------------------------------------------------------*/
2422
2423 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2424 {
2425     flag aSign, bSign;
2426     bits32 av, bv;
2427     a = float32_squash_input_denormal(a STATUS_VAR);
2428     b = float32_squash_input_denormal(b STATUS_VAR);
2429
2430     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2431          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2432        ) {
2433         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2434             float_raise( float_flag_invalid STATUS_VAR);
2435         }
2436         return 0;
2437     }
2438     aSign = extractFloat32Sign( a );
2439     bSign = extractFloat32Sign( b );
2440     av = float32_val(a);
2441     bv = float32_val(b);
2442     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2443     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2444
2445 }
2446
2447 /*----------------------------------------------------------------------------
2448 | Returns 1 if the single-precision floating-point value `a' is less than
2449 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2450 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2451 | Standard for Binary Floating-Point Arithmetic.
2452 *----------------------------------------------------------------------------*/
2453
2454 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2455 {
2456     flag aSign, bSign;
2457     bits32 av, bv;
2458     a = float32_squash_input_denormal(a STATUS_VAR);
2459     b = float32_squash_input_denormal(b STATUS_VAR);
2460
2461     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2462          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2463        ) {
2464         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2465             float_raise( float_flag_invalid STATUS_VAR);
2466         }
2467         return 0;
2468     }
2469     aSign = extractFloat32Sign( a );
2470     bSign = extractFloat32Sign( b );
2471     av = float32_val(a);
2472     bv = float32_val(b);
2473     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2474     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2475
2476 }
2477
2478 /*----------------------------------------------------------------------------
2479 | Returns the result of converting the double-precision floating-point value
2480 | `a' to the 32-bit two's complement integer format.  The conversion is
2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2482 | Arithmetic---which means in particular that the conversion is rounded
2483 | according to the current rounding mode.  If `a' is a NaN, the largest
2484 | positive integer is returned.  Otherwise, if the conversion overflows, the
2485 | largest integer with the same sign as `a' is returned.
2486 *----------------------------------------------------------------------------*/
2487
2488 int32 float64_to_int32( float64 a STATUS_PARAM )
2489 {
2490     flag aSign;
2491     int16 aExp, shiftCount;
2492     bits64 aSig;
2493     a = float64_squash_input_denormal(a STATUS_VAR);
2494
2495     aSig = extractFloat64Frac( a );
2496     aExp = extractFloat64Exp( a );
2497     aSign = extractFloat64Sign( a );
2498     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2499     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2500     shiftCount = 0x42C - aExp;
2501     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2502     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2503
2504 }
2505
2506 /*----------------------------------------------------------------------------
2507 | Returns the result of converting the double-precision floating-point value
2508 | `a' to the 32-bit two's complement integer format.  The conversion is
2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2510 | Arithmetic, except that the conversion is always rounded toward zero.
2511 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2512 | the conversion overflows, the largest integer with the same sign as `a' is
2513 | returned.
2514 *----------------------------------------------------------------------------*/
2515
2516 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2517 {
2518     flag aSign;
2519     int16 aExp, shiftCount;
2520     bits64 aSig, savedASig;
2521     int32 z;
2522     a = float64_squash_input_denormal(a STATUS_VAR);
2523
2524     aSig = extractFloat64Frac( a );
2525     aExp = extractFloat64Exp( a );
2526     aSign = extractFloat64Sign( a );
2527     if ( 0x41E < aExp ) {
2528         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2529         goto invalid;
2530     }
2531     else if ( aExp < 0x3FF ) {
2532         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2533         return 0;
2534     }
2535     aSig |= LIT64( 0x0010000000000000 );
2536     shiftCount = 0x433 - aExp;
2537     savedASig = aSig;
2538     aSig >>= shiftCount;
2539     z = aSig;
2540     if ( aSign ) z = - z;
2541     if ( ( z < 0 ) ^ aSign ) {
2542  invalid:
2543         float_raise( float_flag_invalid STATUS_VAR);
2544         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2545     }
2546     if ( ( aSig<<shiftCount ) != savedASig ) {
2547         STATUS(float_exception_flags) |= float_flag_inexact;
2548     }
2549     return z;
2550
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 16-bit two's complement integer format.  The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2560 | returned.
2561 *----------------------------------------------------------------------------*/
2562
2563 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2564 {
2565     flag aSign;
2566     int16 aExp, shiftCount;
2567     bits64 aSig, savedASig;
2568     int32 z;
2569
2570     aSig = extractFloat64Frac( a );
2571     aExp = extractFloat64Exp( a );
2572     aSign = extractFloat64Sign( a );
2573     if ( 0x40E < aExp ) {
2574         if ( ( aExp == 0x7FF ) && aSig ) {
2575             aSign = 0;
2576         }
2577         goto invalid;
2578     }
2579     else if ( aExp < 0x3FF ) {
2580         if ( aExp || aSig ) {
2581             STATUS(float_exception_flags) |= float_flag_inexact;
2582         }
2583         return 0;
2584     }
2585     aSig |= LIT64( 0x0010000000000000 );
2586     shiftCount = 0x433 - aExp;
2587     savedASig = aSig;
2588     aSig >>= shiftCount;
2589     z = aSig;
2590     if ( aSign ) {
2591         z = - z;
2592     }
2593     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2594  invalid:
2595         float_raise( float_flag_invalid STATUS_VAR);
2596         return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2597     }
2598     if ( ( aSig<<shiftCount ) != savedASig ) {
2599         STATUS(float_exception_flags) |= float_flag_inexact;
2600     }
2601     return z;
2602 }
2603
2604 /*----------------------------------------------------------------------------
2605 | Returns the result of converting the double-precision floating-point value
2606 | `a' to the 64-bit two's complement integer format.  The conversion is
2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2608 | Arithmetic---which means in particular that the conversion is rounded
2609 | according to the current rounding mode.  If `a' is a NaN, the largest
2610 | positive integer is returned.  Otherwise, if the conversion overflows, the
2611 | largest integer with the same sign as `a' is returned.
2612 *----------------------------------------------------------------------------*/
2613
2614 int64 float64_to_int64( float64 a STATUS_PARAM )
2615 {
2616     flag aSign;
2617     int16 aExp, shiftCount;
2618     bits64 aSig, aSigExtra;
2619     a = float64_squash_input_denormal(a STATUS_VAR);
2620
2621     aSig = extractFloat64Frac( a );
2622     aExp = extractFloat64Exp( a );
2623     aSign = extractFloat64Sign( a );
2624     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2625     shiftCount = 0x433 - aExp;
2626     if ( shiftCount <= 0 ) {
2627         if ( 0x43E < aExp ) {
2628             float_raise( float_flag_invalid STATUS_VAR);
2629             if (    ! aSign
2630                  || (    ( aExp == 0x7FF )
2631                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2632                ) {
2633                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2634             }
2635             return (sbits64) LIT64( 0x8000000000000000 );
2636         }
2637         aSigExtra = 0;
2638         aSig <<= - shiftCount;
2639     }
2640     else {
2641         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2642     }
2643     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2644
2645 }
2646
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the double-precision floating-point value
2649 | `a' to the 64-bit two's complement integer format.  The conversion is
2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic, except that the conversion is always rounded toward zero.
2652 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2653 | the conversion overflows, the largest integer with the same sign as `a' is
2654 | returned.
2655 *----------------------------------------------------------------------------*/
2656
2657 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2658 {
2659     flag aSign;
2660     int16 aExp, shiftCount;
2661     bits64 aSig;
2662     int64 z;
2663     a = float64_squash_input_denormal(a STATUS_VAR);
2664
2665     aSig = extractFloat64Frac( a );
2666     aExp = extractFloat64Exp( a );
2667     aSign = extractFloat64Sign( a );
2668     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2669     shiftCount = aExp - 0x433;
2670     if ( 0 <= shiftCount ) {
2671         if ( 0x43E <= aExp ) {
2672             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2673                 float_raise( float_flag_invalid STATUS_VAR);
2674                 if (    ! aSign
2675                      || (    ( aExp == 0x7FF )
2676                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2677                    ) {
2678                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2679                 }
2680             }
2681             return (sbits64) LIT64( 0x8000000000000000 );
2682         }
2683         z = aSig<<shiftCount;
2684     }
2685     else {
2686         if ( aExp < 0x3FE ) {
2687             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2688             return 0;
2689         }
2690         z = aSig>>( - shiftCount );
2691         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2692             STATUS(float_exception_flags) |= float_flag_inexact;
2693         }
2694     }
2695     if ( aSign ) z = - z;
2696     return z;
2697
2698 }
2699
2700 /*----------------------------------------------------------------------------
2701 | Returns the result of converting the double-precision floating-point value
2702 | `a' to the single-precision floating-point format.  The conversion is
2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2704 | Arithmetic.
2705 *----------------------------------------------------------------------------*/
2706
2707 float32 float64_to_float32( float64 a STATUS_PARAM )
2708 {
2709     flag aSign;
2710     int16 aExp;
2711     bits64 aSig;
2712     bits32 zSig;
2713     a = float64_squash_input_denormal(a STATUS_VAR);
2714
2715     aSig = extractFloat64Frac( a );
2716     aExp = extractFloat64Exp( a );
2717     aSign = extractFloat64Sign( a );
2718     if ( aExp == 0x7FF ) {
2719         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2720         return packFloat32( aSign, 0xFF, 0 );
2721     }
2722     shift64RightJamming( aSig, 22, &aSig );
2723     zSig = aSig;
2724     if ( aExp || zSig ) {
2725         zSig |= 0x40000000;
2726         aExp -= 0x381;
2727     }
2728     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2729
2730 }
2731
2732
2733 /*----------------------------------------------------------------------------
2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735 | half-precision floating-point value, returning the result.  After being
2736 | shifted into the proper positions, the three fields are simply added
2737 | together to form the result.  This means that any integer portion of `zSig'
2738 | will be added into the exponent.  Since a properly normalized significand
2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2740 | than the desired result exponent whenever `zSig' is a complete, normalized
2741 | significand.
2742 *----------------------------------------------------------------------------*/
2743 static float16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2744 {
2745     return make_float16(
2746         (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig);
2747 }
2748
2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2751
2752 float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
2753 {
2754     flag aSign;
2755     int16 aExp;
2756     bits32 aSig;
2757
2758     aSign = extractFloat16Sign(a);
2759     aExp = extractFloat16Exp(a);
2760     aSig = extractFloat16Frac(a);
2761
2762     if (aExp == 0x1f && ieee) {
2763         if (aSig) {
2764             /* Make sure correct exceptions are raised.  */
2765             float32ToCommonNaN(a STATUS_VAR);
2766             aSig |= 0x200;
2767         }
2768         return packFloat32(aSign, 0xff, aSig << 13);
2769     }
2770     if (aExp == 0) {
2771         int8 shiftCount;
2772
2773         if (aSig == 0) {
2774             return packFloat32(aSign, 0, 0);
2775         }
2776
2777         shiftCount = countLeadingZeros32( aSig ) - 21;
2778         aSig = aSig << shiftCount;
2779         aExp = -shiftCount;
2780     }
2781     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2782 }
2783
2784 float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
2785 {
2786     flag aSign;
2787     int16 aExp;
2788     bits32 aSig;
2789     bits32 mask;
2790     bits32 increment;
2791     int8 roundingMode;
2792     a = float32_squash_input_denormal(a STATUS_VAR);
2793
2794     aSig = extractFloat32Frac( a );
2795     aExp = extractFloat32Exp( a );
2796     aSign = extractFloat32Sign( a );
2797     if ( aExp == 0xFF ) {
2798         if (aSig) {
2799             /* Make sure correct exceptions are raised.  */
2800             float32ToCommonNaN(a STATUS_VAR);
2801             aSig |= 0x00400000;
2802         }
2803         return packFloat16(aSign, 0x1f, aSig >> 13);
2804     }
2805     if (aExp == 0 && aSign == 0) {
2806         return packFloat16(aSign, 0, 0);
2807     }
2808     /* Decimal point between bits 22 and 23.  */
2809     aSig |= 0x00800000;
2810     aExp -= 0x7f;
2811     if (aExp < -14) {
2812         mask = 0x007fffff;
2813         if (aExp < -24) {
2814             aExp = -25;
2815         } else {
2816             mask >>= 24 + aExp;
2817         }
2818     } else {
2819         mask = 0x00001fff;
2820     }
2821     if (aSig & mask) {
2822         float_raise( float_flag_underflow STATUS_VAR );
2823         roundingMode = STATUS(float_rounding_mode);
2824         switch (roundingMode) {
2825         case float_round_nearest_even:
2826             increment = (mask + 1) >> 1;
2827             if ((aSig & mask) == increment) {
2828                 increment = aSig & (increment << 1);
2829             }
2830             break;
2831         case float_round_up:
2832             increment = aSign ? 0 : mask;
2833             break;
2834         case float_round_down:
2835             increment = aSign ? mask : 0;
2836             break;
2837         default: /* round_to_zero */
2838             increment = 0;
2839             break;
2840         }
2841         aSig += increment;
2842         if (aSig >= 0x01000000) {
2843             aSig >>= 1;
2844             aExp++;
2845         }
2846     } else if (aExp < -14
2847           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2848         float_raise( float_flag_underflow STATUS_VAR);
2849     }
2850
2851     if (ieee) {
2852         if (aExp > 15) {
2853             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2854             return packFloat16(aSign, 0x1f, 0);
2855         }
2856     } else {
2857         if (aExp > 16) {
2858             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2859             return packFloat16(aSign, 0x1f, 0x3ff);
2860         }
2861     }
2862     if (aExp < -24) {
2863         return packFloat16(aSign, 0, 0);
2864     }
2865     if (aExp < -14) {
2866         aSig >>= -14 - aExp;
2867         aExp = -14;
2868     }
2869     return packFloat16(aSign, aExp + 14, aSig >> 13);
2870 }
2871
2872 #ifdef FLOATX80
2873
2874 /*----------------------------------------------------------------------------
2875 | Returns the result of converting the double-precision floating-point value
2876 | `a' to the extended double-precision floating-point format.  The conversion
2877 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2878 | Arithmetic.
2879 *----------------------------------------------------------------------------*/
2880
2881 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2882 {
2883     flag aSign;
2884     int16 aExp;
2885     bits64 aSig;
2886
2887     a = float64_squash_input_denormal(a STATUS_VAR);
2888     aSig = extractFloat64Frac( a );
2889     aExp = extractFloat64Exp( a );
2890     aSign = extractFloat64Sign( a );
2891     if ( aExp == 0x7FF ) {
2892         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2893         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2894     }
2895     if ( aExp == 0 ) {
2896         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2897         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2898     }
2899     return
2900         packFloatx80(
2901             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2902
2903 }
2904
2905 #endif
2906
2907 #ifdef FLOAT128
2908
2909 /*----------------------------------------------------------------------------
2910 | Returns the result of converting the double-precision floating-point value
2911 | `a' to the quadruple-precision floating-point format.  The conversion is
2912 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2913 | Arithmetic.
2914 *----------------------------------------------------------------------------*/
2915
2916 float128 float64_to_float128( float64 a STATUS_PARAM )
2917 {
2918     flag aSign;
2919     int16 aExp;
2920     bits64 aSig, zSig0, zSig1;
2921
2922     a = float64_squash_input_denormal(a STATUS_VAR);
2923     aSig = extractFloat64Frac( a );
2924     aExp = extractFloat64Exp( a );
2925     aSign = extractFloat64Sign( a );
2926     if ( aExp == 0x7FF ) {
2927         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2928         return packFloat128( aSign, 0x7FFF, 0, 0 );
2929     }
2930     if ( aExp == 0 ) {
2931         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2932         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2933         --aExp;
2934     }
2935     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2936     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2937
2938 }
2939
2940 #endif
2941
2942 /*----------------------------------------------------------------------------
2943 | Rounds the double-precision floating-point value `a' to an integer, and
2944 | returns the result as a double-precision floating-point value.  The
2945 | operation is performed according to the IEC/IEEE Standard for Binary
2946 | Floating-Point Arithmetic.
2947 *----------------------------------------------------------------------------*/
2948
2949 float64 float64_round_to_int( float64 a STATUS_PARAM )
2950 {
2951     flag aSign;
2952     int16 aExp;
2953     bits64 lastBitMask, roundBitsMask;
2954     int8 roundingMode;
2955     bits64 z;
2956     a = float64_squash_input_denormal(a STATUS_VAR);
2957
2958     aExp = extractFloat64Exp( a );
2959     if ( 0x433 <= aExp ) {
2960         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2961             return propagateFloat64NaN( a, a STATUS_VAR );
2962         }
2963         return a;
2964     }
2965     if ( aExp < 0x3FF ) {
2966         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2967         STATUS(float_exception_flags) |= float_flag_inexact;
2968         aSign = extractFloat64Sign( a );
2969         switch ( STATUS(float_rounding_mode) ) {
2970          case float_round_nearest_even:
2971             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2972                 return packFloat64( aSign, 0x3FF, 0 );
2973             }
2974             break;
2975          case float_round_down:
2976             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2977          case float_round_up:
2978             return make_float64(
2979             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2980         }
2981         return packFloat64( aSign, 0, 0 );
2982     }
2983     lastBitMask = 1;
2984     lastBitMask <<= 0x433 - aExp;
2985     roundBitsMask = lastBitMask - 1;
2986     z = float64_val(a);
2987     roundingMode = STATUS(float_rounding_mode);
2988     if ( roundingMode == float_round_nearest_even ) {
2989         z += lastBitMask>>1;
2990         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2991     }
2992     else if ( roundingMode != float_round_to_zero ) {
2993         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2994             z += roundBitsMask;
2995         }
2996     }
2997     z &= ~ roundBitsMask;
2998     if ( z != float64_val(a) )
2999         STATUS(float_exception_flags) |= float_flag_inexact;
3000     return make_float64(z);
3001
3002 }
3003
3004 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3005 {
3006     int oldmode;
3007     float64 res;
3008     oldmode = STATUS(float_rounding_mode);
3009     STATUS(float_rounding_mode) = float_round_to_zero;
3010     res = float64_round_to_int(a STATUS_VAR);
3011     STATUS(float_rounding_mode) = oldmode;
3012     return res;
3013 }
3014
3015 /*----------------------------------------------------------------------------
3016 | Returns the result of adding the absolute values of the double-precision
3017 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
3018 | before being returned.  `zSign' is ignored if the result is a NaN.
3019 | The addition is performed according to the IEC/IEEE Standard for Binary
3020 | Floating-Point Arithmetic.
3021 *----------------------------------------------------------------------------*/
3022
3023 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3024 {
3025     int16 aExp, bExp, zExp;
3026     bits64 aSig, bSig, zSig;
3027     int16 expDiff;
3028
3029     aSig = extractFloat64Frac( a );
3030     aExp = extractFloat64Exp( a );
3031     bSig = extractFloat64Frac( b );
3032     bExp = extractFloat64Exp( b );
3033     expDiff = aExp - bExp;
3034     aSig <<= 9;
3035     bSig <<= 9;
3036     if ( 0 < expDiff ) {
3037         if ( aExp == 0x7FF ) {
3038             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3039             return a;
3040         }
3041         if ( bExp == 0 ) {
3042             --expDiff;
3043         }
3044         else {
3045             bSig |= LIT64( 0x2000000000000000 );
3046         }
3047         shift64RightJamming( bSig, expDiff, &bSig );
3048         zExp = aExp;
3049     }
3050     else if ( expDiff < 0 ) {
3051         if ( bExp == 0x7FF ) {
3052             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3053             return packFloat64( zSign, 0x7FF, 0 );
3054         }
3055         if ( aExp == 0 ) {
3056             ++expDiff;
3057         }
3058         else {
3059             aSig |= LIT64( 0x2000000000000000 );
3060         }
3061         shift64RightJamming( aSig, - expDiff, &aSig );
3062         zExp = bExp;
3063     }
3064     else {
3065         if ( aExp == 0x7FF ) {
3066             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3067             return a;
3068         }
3069         if ( aExp == 0 ) {
3070             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3071             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3072         }
3073         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3074         zExp = aExp;
3075         goto roundAndPack;
3076     }
3077     aSig |= LIT64( 0x2000000000000000 );
3078     zSig = ( aSig + bSig )<<1;
3079     --zExp;
3080     if ( (sbits64) zSig < 0 ) {
3081         zSig = aSig + bSig;
3082         ++zExp;
3083     }
3084  roundAndPack:
3085     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3086
3087 }
3088
3089 /*----------------------------------------------------------------------------
3090 | Returns the result of subtracting the absolute values of the double-
3091 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3092 | difference is negated before being returned.  `zSign' is ignored if the
3093 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3094 | Standard for Binary Floating-Point Arithmetic.
3095 *----------------------------------------------------------------------------*/
3096
3097 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3098 {
3099     int16 aExp, bExp, zExp;
3100     bits64 aSig, bSig, zSig;
3101     int16 expDiff;
3102
3103     aSig = extractFloat64Frac( a );
3104     aExp = extractFloat64Exp( a );
3105     bSig = extractFloat64Frac( b );
3106     bExp = extractFloat64Exp( b );
3107     expDiff = aExp - bExp;
3108     aSig <<= 10;
3109     bSig <<= 10;
3110     if ( 0 < expDiff ) goto aExpBigger;
3111     if ( expDiff < 0 ) goto bExpBigger;
3112     if ( aExp == 0x7FF ) {
3113         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3114         float_raise( float_flag_invalid STATUS_VAR);
3115         return float64_default_nan;
3116     }
3117     if ( aExp == 0 ) {
3118         aExp = 1;
3119         bExp = 1;
3120     }
3121     if ( bSig < aSig ) goto aBigger;
3122     if ( aSig < bSig ) goto bBigger;
3123     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3124  bExpBigger:
3125     if ( bExp == 0x7FF ) {
3126         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3127         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3128     }
3129     if ( aExp == 0 ) {
3130         ++expDiff;
3131     }
3132     else {
3133         aSig |= LIT64( 0x4000000000000000 );
3134     }
3135     shift64RightJamming( aSig, - expDiff, &aSig );
3136     bSig |= LIT64( 0x4000000000000000 );
3137  bBigger:
3138     zSig = bSig - aSig;
3139     zExp = bExp;
3140     zSign ^= 1;
3141     goto normalizeRoundAndPack;
3142  aExpBigger:
3143     if ( aExp == 0x7FF ) {
3144         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3145         return a;
3146     }
3147     if ( bExp == 0 ) {
3148         --expDiff;
3149     }
3150     else {
3151         bSig |= LIT64( 0x4000000000000000 );
3152     }
3153     shift64RightJamming( bSig, expDiff, &bSig );
3154     aSig |= LIT64( 0x4000000000000000 );
3155  aBigger:
3156     zSig = aSig - bSig;
3157     zExp = aExp;
3158  normalizeRoundAndPack:
3159     --zExp;
3160     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3161
3162 }
3163
3164 /*----------------------------------------------------------------------------
3165 | Returns the result of adding the double-precision floating-point values `a'
3166 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3167 | Binary Floating-Point Arithmetic.
3168 *----------------------------------------------------------------------------*/
3169
3170 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3171 {
3172     flag aSign, bSign;
3173     a = float64_squash_input_denormal(a STATUS_VAR);
3174     b = float64_squash_input_denormal(b STATUS_VAR);
3175
3176     aSign = extractFloat64Sign( a );
3177     bSign = extractFloat64Sign( b );
3178     if ( aSign == bSign ) {
3179         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3180     }
3181     else {
3182         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3183     }
3184
3185 }
3186
3187 /*----------------------------------------------------------------------------
3188 | Returns the result of subtracting the double-precision floating-point values
3189 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3190 | for Binary Floating-Point Arithmetic.
3191 *----------------------------------------------------------------------------*/
3192
3193 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3194 {
3195     flag aSign, bSign;
3196     a = float64_squash_input_denormal(a STATUS_VAR);
3197     b = float64_squash_input_denormal(b STATUS_VAR);
3198
3199     aSign = extractFloat64Sign( a );
3200     bSign = extractFloat64Sign( b );
3201     if ( aSign == bSign ) {
3202         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3203     }
3204     else {
3205         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3206     }
3207
3208 }
3209
3210 /*----------------------------------------------------------------------------
3211 | Returns the result of multiplying the double-precision floating-point values
3212 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3213 | for Binary Floating-Point Arithmetic.
3214 *----------------------------------------------------------------------------*/
3215
3216 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3217 {
3218     flag aSign, bSign, zSign;
3219     int16 aExp, bExp, zExp;
3220     bits64 aSig, bSig, zSig0, zSig1;
3221
3222     a = float64_squash_input_denormal(a STATUS_VAR);
3223     b = float64_squash_input_denormal(b STATUS_VAR);
3224
3225     aSig = extractFloat64Frac( a );
3226     aExp = extractFloat64Exp( a );
3227     aSign = extractFloat64Sign( a );
3228     bSig = extractFloat64Frac( b );
3229     bExp = extractFloat64Exp( b );
3230     bSign = extractFloat64Sign( b );
3231     zSign = aSign ^ bSign;
3232     if ( aExp == 0x7FF ) {
3233         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3234             return propagateFloat64NaN( a, b STATUS_VAR );
3235         }
3236         if ( ( bExp | bSig ) == 0 ) {
3237             float_raise( float_flag_invalid STATUS_VAR);
3238             return float64_default_nan;
3239         }
3240         return packFloat64( zSign, 0x7FF, 0 );
3241     }
3242     if ( bExp == 0x7FF ) {
3243         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3244         if ( ( aExp | aSig ) == 0 ) {
3245             float_raise( float_flag_invalid STATUS_VAR);
3246             return float64_default_nan;
3247         }
3248         return packFloat64( zSign, 0x7FF, 0 );
3249     }
3250     if ( aExp == 0 ) {
3251         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3252         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3253     }
3254     if ( bExp == 0 ) {
3255         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3256         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3257     }
3258     zExp = aExp + bExp - 0x3FF;
3259     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3260     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3261     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3262     zSig0 |= ( zSig1 != 0 );
3263     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3264         zSig0 <<= 1;
3265         --zExp;
3266     }
3267     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3268
3269 }
3270
3271 /*----------------------------------------------------------------------------
3272 | Returns the result of dividing the double-precision floating-point value `a'
3273 | by the corresponding value `b'.  The operation is performed according to
3274 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3275 *----------------------------------------------------------------------------*/
3276
3277 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3278 {
3279     flag aSign, bSign, zSign;
3280     int16 aExp, bExp, zExp;
3281     bits64 aSig, bSig, zSig;
3282     bits64 rem0, rem1;
3283     bits64 term0, term1;
3284     a = float64_squash_input_denormal(a STATUS_VAR);
3285     b = float64_squash_input_denormal(b STATUS_VAR);
3286
3287     aSig = extractFloat64Frac( a );
3288     aExp = extractFloat64Exp( a );
3289     aSign = extractFloat64Sign( a );
3290     bSig = extractFloat64Frac( b );
3291     bExp = extractFloat64Exp( b );
3292     bSign = extractFloat64Sign( b );
3293     zSign = aSign ^ bSign;
3294     if ( aExp == 0x7FF ) {
3295         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3296         if ( bExp == 0x7FF ) {
3297             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3298             float_raise( float_flag_invalid STATUS_VAR);
3299             return float64_default_nan;
3300         }
3301         return packFloat64( zSign, 0x7FF, 0 );
3302     }
3303     if ( bExp == 0x7FF ) {
3304         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3305         return packFloat64( zSign, 0, 0 );
3306     }
3307     if ( bExp == 0 ) {
3308         if ( bSig == 0 ) {
3309             if ( ( aExp | aSig ) == 0 ) {
3310                 float_raise( float_flag_invalid STATUS_VAR);
3311                 return float64_default_nan;
3312             }
3313             float_raise( float_flag_divbyzero STATUS_VAR);
3314             return packFloat64( zSign, 0x7FF, 0 );
3315         }
3316         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3317     }
3318     if ( aExp == 0 ) {
3319         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3320         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3321     }
3322     zExp = aExp - bExp + 0x3FD;
3323     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3324     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3325     if ( bSig <= ( aSig + aSig ) ) {
3326         aSig >>= 1;
3327         ++zExp;
3328     }
3329     zSig = estimateDiv128To64( aSig, 0, bSig );
3330     if ( ( zSig & 0x1FF ) <= 2 ) {
3331         mul64To128( bSig, zSig, &term0, &term1 );
3332         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3333         while ( (sbits64) rem0 < 0 ) {
3334             --zSig;
3335             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3336         }
3337         zSig |= ( rem1 != 0 );
3338     }
3339     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3340
3341 }
3342
3343 /*----------------------------------------------------------------------------
3344 | Returns the remainder of the double-precision floating-point value `a'
3345 | with respect to the corresponding value `b'.  The operation is performed
3346 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3347 *----------------------------------------------------------------------------*/
3348
3349 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3350 {
3351     flag aSign, zSign;
3352     int16 aExp, bExp, expDiff;
3353     bits64 aSig, bSig;
3354     bits64 q, alternateASig;
3355     sbits64 sigMean;
3356
3357     a = float64_squash_input_denormal(a STATUS_VAR);
3358     b = float64_squash_input_denormal(b STATUS_VAR);
3359     aSig = extractFloat64Frac( a );
3360     aExp = extractFloat64Exp( a );
3361     aSign = extractFloat64Sign( a );
3362     bSig = extractFloat64Frac( b );
3363     bExp = extractFloat64Exp( b );
3364     if ( aExp == 0x7FF ) {
3365         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3366             return propagateFloat64NaN( a, b STATUS_VAR );
3367         }
3368         float_raise( float_flag_invalid STATUS_VAR);
3369         return float64_default_nan;
3370     }
3371     if ( bExp == 0x7FF ) {
3372         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3373         return a;
3374     }
3375     if ( bExp == 0 ) {
3376         if ( bSig == 0 ) {
3377             float_raise( float_flag_invalid STATUS_VAR);
3378             return float64_default_nan;
3379         }
3380         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3381     }
3382     if ( aExp == 0 ) {
3383         if ( aSig == 0 ) return a;
3384         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3385     }
3386     expDiff = aExp - bExp;
3387     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3388     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3389     if ( expDiff < 0 ) {
3390         if ( expDiff < -1 ) return a;
3391         aSig >>= 1;
3392     }
3393     q = ( bSig <= aSig );
3394     if ( q ) aSig -= bSig;
3395     expDiff -= 64;
3396     while ( 0 < expDiff ) {
3397         q = estimateDiv128To64( aSig, 0, bSig );
3398         q = ( 2 < q ) ? q - 2 : 0;
3399         aSig = - ( ( bSig>>2 ) * q );
3400         expDiff -= 62;
3401     }
3402     expDiff += 64;
3403     if ( 0 < expDiff ) {
3404         q = estimateDiv128To64( aSig, 0, bSig );
3405         q = ( 2 < q ) ? q - 2 : 0;
3406         q >>= 64 - expDiff;
3407         bSig >>= 2;
3408         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3409     }
3410     else {
3411         aSig >>= 2;
3412         bSig >>= 2;
3413     }
3414     do {
3415         alternateASig = aSig;
3416         ++q;
3417         aSig -= bSig;
3418     } while ( 0 <= (sbits64) aSig );
3419     sigMean = aSig + alternateASig;
3420     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3421         aSig = alternateASig;
3422     }
3423     zSign = ( (sbits64) aSig < 0 );
3424     if ( zSign ) aSig = - aSig;
3425     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3426
3427 }
3428
3429 /*----------------------------------------------------------------------------
3430 | Returns the square root of the double-precision floating-point value `a'.
3431 | The operation is performed according to the IEC/IEEE Standard for Binary
3432 | Floating-Point Arithmetic.
3433 *----------------------------------------------------------------------------*/
3434
3435 float64 float64_sqrt( float64 a STATUS_PARAM )
3436 {
3437     flag aSign;
3438     int16 aExp, zExp;
3439     bits64 aSig, zSig, doubleZSig;
3440     bits64 rem0, rem1, term0, term1;
3441     a = float64_squash_input_denormal(a STATUS_VAR);
3442
3443     aSig = extractFloat64Frac( a );
3444     aExp = extractFloat64Exp( a );
3445     aSign = extractFloat64Sign( a );
3446     if ( aExp == 0x7FF ) {
3447         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3448         if ( ! aSign ) return a;
3449         float_raise( float_flag_invalid STATUS_VAR);
3450         return float64_default_nan;
3451     }
3452     if ( aSign ) {
3453         if ( ( aExp | aSig ) == 0 ) return a;
3454         float_raise( float_flag_invalid STATUS_VAR);
3455         return float64_default_nan;
3456     }
3457     if ( aExp == 0 ) {
3458         if ( aSig == 0 ) return float64_zero;
3459         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3460     }
3461     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3462     aSig |= LIT64( 0x0010000000000000 );
3463     zSig = estimateSqrt32( aExp, aSig>>21 );
3464     aSig <<= 9 - ( aExp & 1 );
3465     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3466     if ( ( zSig & 0x1FF ) <= 5 ) {
3467         doubleZSig = zSig<<1;
3468         mul64To128( zSig, zSig, &term0, &term1 );
3469         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3470         while ( (sbits64) rem0 < 0 ) {
3471             --zSig;
3472             doubleZSig -= 2;
3473             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3474         }
3475         zSig |= ( ( rem0 | rem1 ) != 0 );
3476     }
3477     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3478
3479 }
3480
3481 /*----------------------------------------------------------------------------
3482 | Returns the binary log of the double-precision floating-point value `a'.
3483 | The operation is performed according to the IEC/IEEE Standard for Binary
3484 | Floating-Point Arithmetic.
3485 *----------------------------------------------------------------------------*/
3486 float64 float64_log2( float64 a STATUS_PARAM )
3487 {
3488     flag aSign, zSign;
3489     int16 aExp;
3490     bits64 aSig, aSig0, aSig1, zSig, i;
3491     a = float64_squash_input_denormal(a STATUS_VAR);
3492
3493     aSig = extractFloat64Frac( a );
3494     aExp = extractFloat64Exp( a );
3495     aSign = extractFloat64Sign( a );
3496
3497     if ( aExp == 0 ) {
3498         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3499         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3500     }
3501     if ( aSign ) {
3502         float_raise( float_flag_invalid STATUS_VAR);
3503         return float64_default_nan;
3504     }
3505     if ( aExp == 0x7FF ) {
3506         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3507         return a;
3508     }
3509
3510     aExp -= 0x3FF;
3511     aSig |= LIT64( 0x0010000000000000 );
3512     zSign = aExp < 0;
3513     zSig = (bits64)aExp << 52;
3514     for (i = 1LL << 51; i > 0; i >>= 1) {
3515         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3516         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3517         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3518             aSig >>= 1;
3519             zSig |= i;
3520         }
3521     }
3522
3523     if ( zSign )
3524         zSig = -zSig;
3525     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3526 }
3527
3528 /*----------------------------------------------------------------------------
3529 | Returns 1 if the double-precision floating-point value `a' is equal to the
3530 | corresponding value `b', and 0 otherwise.  The comparison is performed
3531 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3532 *----------------------------------------------------------------------------*/
3533
3534 int float64_eq( float64 a, float64 b STATUS_PARAM )
3535 {
3536     bits64 av, bv;
3537     a = float64_squash_input_denormal(a STATUS_VAR);
3538     b = float64_squash_input_denormal(b STATUS_VAR);
3539
3540     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3541          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3542        ) {
3543         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3544             float_raise( float_flag_invalid STATUS_VAR);
3545         }
3546         return 0;
3547     }
3548     av = float64_val(a);
3549     bv = float64_val(b);
3550     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3551
3552 }
3553
3554 /*----------------------------------------------------------------------------
3555 | Returns 1 if the double-precision floating-point value `a' is less than or
3556 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3557 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3558 | Arithmetic.
3559 *----------------------------------------------------------------------------*/
3560
3561 int float64_le( float64 a, float64 b STATUS_PARAM )
3562 {
3563     flag aSign, bSign;
3564     bits64 av, bv;
3565     a = float64_squash_input_denormal(a STATUS_VAR);
3566     b = float64_squash_input_denormal(b STATUS_VAR);
3567
3568     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3569          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3570        ) {
3571         float_raise( float_flag_invalid STATUS_VAR);
3572         return 0;
3573     }
3574     aSign = extractFloat64Sign( a );
3575     bSign = extractFloat64Sign( b );
3576     av = float64_val(a);
3577     bv = float64_val(b);
3578     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3579     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3580
3581 }
3582
3583 /*----------------------------------------------------------------------------
3584 | Returns 1 if the double-precision floating-point value `a' is less than
3585 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3586 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3587 *----------------------------------------------------------------------------*/
3588
3589 int float64_lt( float64 a, float64 b STATUS_PARAM )
3590 {
3591     flag aSign, bSign;
3592     bits64 av, bv;
3593
3594     a = float64_squash_input_denormal(a STATUS_VAR);
3595     b = float64_squash_input_denormal(b STATUS_VAR);
3596     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3597          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3598        ) {
3599         float_raise( float_flag_invalid STATUS_VAR);
3600         return 0;
3601     }
3602     aSign = extractFloat64Sign( a );
3603     bSign = extractFloat64Sign( b );
3604     av = float64_val(a);
3605     bv = float64_val(b);
3606     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3607     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3608
3609 }
3610
3611 /*----------------------------------------------------------------------------
3612 | Returns 1 if the double-precision floating-point value `a' is equal to the
3613 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3614 | if either operand is a NaN.  Otherwise, the comparison is performed
3615 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3616 *----------------------------------------------------------------------------*/
3617
3618 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3619 {
3620     bits64 av, bv;
3621     a = float64_squash_input_denormal(a STATUS_VAR);
3622     b = float64_squash_input_denormal(b STATUS_VAR);
3623
3624     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3625          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3626        ) {
3627         float_raise( float_flag_invalid STATUS_VAR);
3628         return 0;
3629     }
3630     av = float64_val(a);
3631     bv = float64_val(b);
3632     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3633
3634 }
3635
3636 /*----------------------------------------------------------------------------
3637 | Returns 1 if the double-precision floating-point value `a' is less than or
3638 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3639 | cause an exception.  Otherwise, the comparison is performed according to the
3640 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3641 *----------------------------------------------------------------------------*/
3642
3643 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3644 {
3645     flag aSign, bSign;
3646     bits64 av, bv;
3647     a = float64_squash_input_denormal(a STATUS_VAR);
3648     b = float64_squash_input_denormal(b STATUS_VAR);
3649
3650     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3651          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3652        ) {
3653         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3654             float_raise( float_flag_invalid STATUS_VAR);
3655         }
3656         return 0;
3657     }
3658     aSign = extractFloat64Sign( a );
3659     bSign = extractFloat64Sign( b );
3660     av = float64_val(a);
3661     bv = float64_val(b);
3662     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3663     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3664
3665 }
3666
3667 /*----------------------------------------------------------------------------
3668 | Returns 1 if the double-precision floating-point value `a' is less than
3669 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3670 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3671 | Standard for Binary Floating-Point Arithmetic.
3672 *----------------------------------------------------------------------------*/
3673
3674 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3675 {
3676     flag aSign, bSign;
3677     bits64 av, bv;
3678     a = float64_squash_input_denormal(a STATUS_VAR);
3679     b = float64_squash_input_denormal(b STATUS_VAR);
3680
3681     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3682          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3683        ) {
3684         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3685             float_raise( float_flag_invalid STATUS_VAR);
3686         }
3687         return 0;
3688     }
3689     aSign = extractFloat64Sign( a );
3690     bSign = extractFloat64Sign( b );
3691     av = float64_val(a);
3692     bv = float64_val(b);
3693     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3694     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3695
3696 }
3697
3698 #ifdef FLOATX80
3699
3700 /*----------------------------------------------------------------------------
3701 | Returns the result of converting the extended double-precision floating-
3702 | point value `a' to the 32-bit two's complement integer format.  The
3703 | conversion is performed according to the IEC/IEEE Standard for Binary
3704 | Floating-Point Arithmetic---which means in particular that the conversion
3705 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3706 | largest positive integer is returned.  Otherwise, if the conversion
3707 | overflows, the largest integer with the same sign as `a' is returned.
3708 *----------------------------------------------------------------------------*/
3709
3710 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3711 {
3712     flag aSign;
3713     int32 aExp, shiftCount;
3714     bits64 aSig;
3715
3716     aSig = extractFloatx80Frac( a );
3717     aExp = extractFloatx80Exp( a );
3718     aSign = extractFloatx80Sign( a );
3719     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3720     shiftCount = 0x4037 - aExp;
3721     if ( shiftCount <= 0 ) shiftCount = 1;
3722     shift64RightJamming( aSig, shiftCount, &aSig );
3723     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3724
3725 }
3726
3727 /*----------------------------------------------------------------------------
3728 | Returns the result of converting the extended double-precision floating-
3729 | point value `a' to the 32-bit two's complement integer format.  The
3730 | conversion is performed according to the IEC/IEEE Standard for Binary
3731 | Floating-Point Arithmetic, except that the conversion is always rounded
3732 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3733 | Otherwise, if the conversion overflows, the largest integer with the same
3734 | sign as `a' is returned.
3735 *----------------------------------------------------------------------------*/
3736
3737 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3738 {
3739     flag aSign;
3740     int32 aExp, shiftCount;
3741     bits64 aSig, savedASig;
3742     int32 z;
3743
3744     aSig = extractFloatx80Frac( a );
3745     aExp = extractFloatx80Exp( a );
3746     aSign = extractFloatx80Sign( a );
3747     if ( 0x401E < aExp ) {
3748         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3749         goto invalid;
3750     }
3751     else if ( aExp < 0x3FFF ) {
3752         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3753         return 0;
3754     }
3755     shiftCount = 0x403E - aExp;
3756     savedASig = aSig;
3757     aSig >>= shiftCount;
3758     z = aSig;
3759     if ( aSign ) z = - z;
3760     if ( ( z < 0 ) ^ aSign ) {
3761  invalid:
3762         float_raise( float_flag_invalid STATUS_VAR);
3763         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3764     }
3765     if ( ( aSig<<shiftCount ) != savedASig ) {
3766         STATUS(float_exception_flags) |= float_flag_inexact;
3767     }
3768     return z;
3769
3770 }
3771
3772 /*----------------------------------------------------------------------------
3773 | Returns the result of converting the extended double-precision floating-
3774 | point value `a' to the 64-bit two's complement integer format.  The
3775 | conversion is performed according to the IEC/IEEE Standard for Binary
3776 | Floating-Point Arithmetic---which means in particular that the conversion
3777 | is rounded according to the current rounding mode.  If `a' is a NaN,
3778 | the largest positive integer is returned.  Otherwise, if the conversion
3779 | overflows, the largest integer with the same sign as `a' is returned.
3780 *----------------------------------------------------------------------------*/
3781
3782 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3783 {
3784     flag aSign;
3785     int32 aExp, shiftCount;
3786     bits64 aSig, aSigExtra;
3787
3788     aSig = extractFloatx80Frac( a );
3789     aExp = extractFloatx80Exp( a );
3790     aSign = extractFloatx80Sign( a );
3791     shiftCount = 0x403E - aExp;
3792     if ( shiftCount <= 0 ) {
3793         if ( shiftCount ) {
3794             float_raise( float_flag_invalid STATUS_VAR);
3795             if (    ! aSign
3796                  || (    ( aExp == 0x7FFF )
3797                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3798                ) {
3799                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3800             }
3801             return (sbits64) LIT64( 0x8000000000000000 );
3802         }
3803         aSigExtra = 0;
3804     }
3805     else {
3806         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3807     }
3808     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3809
3810 }
3811
3812 /*----------------------------------------------------------------------------
3813 | Returns the result of converting the extended double-precision floating-
3814 | point value `a' to the 64-bit two's complement integer format.  The
3815 | conversion is performed according to the IEC/IEEE Standard for Binary
3816 | Floating-Point Arithmetic, except that the conversion is always rounded
3817 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3818 | Otherwise, if the conversion overflows, the largest integer with the same
3819 | sign as `a' is returned.
3820 *----------------------------------------------------------------------------*/
3821
3822 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3823 {
3824     flag aSign;
3825     int32 aExp, shiftCount;
3826     bits64 aSig;
3827     int64 z;
3828
3829     aSig = extractFloatx80Frac( a );
3830     aExp = extractFloatx80Exp( a );
3831     aSign = extractFloatx80Sign( a );
3832     shiftCount = aExp - 0x403E;
3833     if ( 0 <= shiftCount ) {
3834         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3835         if ( ( a.high != 0xC03E ) || aSig ) {
3836             float_raise( float_flag_invalid STATUS_VAR);
3837             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3838                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3839             }
3840         }
3841         return (sbits64) LIT64( 0x8000000000000000 );
3842     }
3843     else if ( aExp < 0x3FFF ) {
3844         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3845         return 0;
3846     }
3847     z = aSig>>( - shiftCount );
3848     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3849         STATUS(float_exception_flags) |= float_flag_inexact;
3850     }
3851     if ( aSign ) z = - z;
3852     return z;
3853
3854 }
3855
3856 /*----------------------------------------------------------------------------
3857 | Returns the result of converting the extended double-precision floating-
3858 | point value `a' to the single-precision floating-point format.  The
3859 | conversion is performed according to the IEC/IEEE Standard for Binary
3860 | Floating-Point Arithmetic.
3861 *----------------------------------------------------------------------------*/
3862
3863 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3864 {
3865     flag aSign;
3866     int32 aExp;
3867     bits64 aSig;
3868
3869     aSig = extractFloatx80Frac( a );
3870     aExp = extractFloatx80Exp( a );
3871     aSign = extractFloatx80Sign( a );
3872     if ( aExp == 0x7FFF ) {
3873         if ( (bits64) ( aSig<<1 ) ) {
3874             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3875         }
3876         return packFloat32( aSign, 0xFF, 0 );
3877     }
3878     shift64RightJamming( aSig, 33, &aSig );
3879     if ( aExp || aSig ) aExp -= 0x3F81;
3880     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3881
3882 }
3883
3884 /*----------------------------------------------------------------------------
3885 | Returns the result of converting the extended double-precision floating-
3886 | point value `a' to the double-precision floating-point format.  The
3887 | conversion is performed according to the IEC/IEEE Standard for Binary
3888 | Floating-Point Arithmetic.
3889 *----------------------------------------------------------------------------*/
3890
3891 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3892 {
3893     flag aSign;
3894     int32 aExp;
3895     bits64 aSig, zSig;
3896
3897     aSig = extractFloatx80Frac( a );
3898     aExp = extractFloatx80Exp( a );
3899     aSign = extractFloatx80Sign( a );
3900     if ( aExp == 0x7FFF ) {
3901         if ( (bits64) ( aSig<<1 ) ) {
3902             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3903         }
3904         return packFloat64( aSign, 0x7FF, 0 );
3905     }
3906     shift64RightJamming( aSig, 1, &zSig );
3907     if ( aExp || aSig ) aExp -= 0x3C01;
3908     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3909
3910 }
3911
3912 #ifdef FLOAT128
3913
3914 /*----------------------------------------------------------------------------
3915 | Returns the result of converting the extended double-precision floating-
3916 | point value `a' to the quadruple-precision floating-point format.  The
3917 | conversion is performed according to the IEC/IEEE Standard for Binary
3918 | Floating-Point Arithmetic.
3919 *----------------------------------------------------------------------------*/
3920
3921 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3922 {
3923     flag aSign;
3924     int16 aExp;
3925     bits64 aSig, zSig0, zSig1;
3926
3927     aSig = extractFloatx80Frac( a );
3928     aExp = extractFloatx80Exp( a );
3929     aSign = extractFloatx80Sign( a );
3930     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3931         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3932     }
3933     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3934     return packFloat128( aSign, aExp, zSig0, zSig1 );
3935
3936 }
3937
3938 #endif
3939
3940 /*----------------------------------------------------------------------------
3941 | Rounds the extended double-precision floating-point value `a' to an integer,
3942 | and returns the result as an extended quadruple-precision floating-point
3943 | value.  The operation is performed according to the IEC/IEEE Standard for
3944 | Binary Floating-Point Arithmetic.
3945 *----------------------------------------------------------------------------*/
3946
3947 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3948 {
3949     flag aSign;
3950     int32 aExp;
3951     bits64 lastBitMask, roundBitsMask;
3952     int8 roundingMode;
3953     floatx80 z;
3954
3955     aExp = extractFloatx80Exp( a );
3956     if ( 0x403E <= aExp ) {
3957         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3958             return propagateFloatx80NaN( a, a STATUS_VAR );
3959         }
3960         return a;
3961     }
3962     if ( aExp < 0x3FFF ) {
3963         if (    ( aExp == 0 )
3964              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3965             return a;
3966         }
3967         STATUS(float_exception_flags) |= float_flag_inexact;
3968         aSign = extractFloatx80Sign( a );
3969         switch ( STATUS(float_rounding_mode) ) {
3970          case float_round_nearest_even:
3971             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3972                ) {
3973                 return
3974                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3975             }
3976             break;
3977          case float_round_down:
3978             return
3979                   aSign ?
3980                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3981                 : packFloatx80( 0, 0, 0 );
3982          case float_round_up:
3983             return
3984                   aSign ? packFloatx80( 1, 0, 0 )
3985                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3986         }
3987         return packFloatx80( aSign, 0, 0 );
3988     }
3989     lastBitMask = 1;
3990     lastBitMask <<= 0x403E - aExp;
3991     roundBitsMask = lastBitMask - 1;
3992     z = a;
3993     roundingMode = STATUS(float_rounding_mode);
3994     if ( roundingMode == float_round_nearest_even ) {
3995         z.low += lastBitMask>>1;
3996         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3997     }
3998     else if ( roundingMode != float_round_to_zero ) {
3999         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
4000             z.low += roundBitsMask;
4001         }
4002     }
4003     z.low &= ~ roundBitsMask;
4004     if ( z.low == 0 ) {
4005         ++z.high;
4006         z.low = LIT64( 0x8000000000000000 );
4007     }
4008     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4009     return z;
4010
4011 }
4012
4013 /*----------------------------------------------------------------------------
4014 | Returns the result of adding the absolute values of the extended double-
4015 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
4016 | negated before being returned.  `zSign' is ignored if the result is a NaN.
4017 | The addition is performed according to the IEC/IEEE Standard for Binary
4018 | Floating-Point Arithmetic.
4019 *----------------------------------------------------------------------------*/
4020
4021 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4022 {
4023     int32 aExp, bExp, zExp;
4024     bits64 aSig, bSig, zSig0, zSig1;
4025     int32 expDiff;
4026
4027     aSig = extractFloatx80Frac( a );
4028     aExp = extractFloatx80Exp( a );
4029     bSig = extractFloatx80Frac( b );
4030     bExp = extractFloatx80Exp( b );
4031     expDiff = aExp - bExp;
4032     if ( 0 < expDiff ) {
4033         if ( aExp == 0x7FFF ) {
4034             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4035             return a;
4036         }
4037         if ( bExp == 0 ) --expDiff;
4038         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4039         zExp = aExp;
4040     }
4041     else if ( expDiff < 0 ) {
4042         if ( bExp == 0x7FFF ) {
4043             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4044             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4045         }
4046         if ( aExp == 0 ) ++expDiff;
4047         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4048         zExp = bExp;
4049     }
4050     else {
4051         if ( aExp == 0x7FFF ) {
4052             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4053                 return propagateFloatx80NaN( a, b STATUS_VAR );
4054             }
4055             return a;
4056         }
4057         zSig1 = 0;
4058         zSig0 = aSig + bSig;
4059         if ( aExp == 0 ) {
4060             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4061             goto roundAndPack;
4062         }
4063         zExp = aExp;
4064         goto shiftRight1;
4065     }
4066     zSig0 = aSig + bSig;
4067     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4068  shiftRight1:
4069     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4070     zSig0 |= LIT64( 0x8000000000000000 );
4071     ++zExp;
4072  roundAndPack:
4073     return
4074         roundAndPackFloatx80(
4075             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4076
4077 }
4078
4079 /*----------------------------------------------------------------------------
4080 | Returns the result of subtracting the absolute values of the extended
4081 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4082 | difference is negated before being returned.  `zSign' is ignored if the
4083 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4084 | Standard for Binary Floating-Point Arithmetic.
4085 *----------------------------------------------------------------------------*/
4086
4087 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4088 {
4089     int32 aExp, bExp, zExp;
4090     bits64 aSig, bSig, zSig0, zSig1;
4091     int32 expDiff;
4092     floatx80 z;
4093
4094     aSig = extractFloatx80Frac( a );
4095     aExp = extractFloatx80Exp( a );
4096     bSig = extractFloatx80Frac( b );
4097     bExp = extractFloatx80Exp( b );
4098     expDiff = aExp - bExp;
4099     if ( 0 < expDiff ) goto aExpBigger;
4100     if ( expDiff < 0 ) goto bExpBigger;
4101     if ( aExp == 0x7FFF ) {
4102         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4103             return propagateFloatx80NaN( a, b STATUS_VAR );
4104         }
4105         float_raise( float_flag_invalid STATUS_VAR);
4106         z.low = floatx80_default_nan_low;
4107         z.high = floatx80_default_nan_high;
4108         return z;
4109     }
4110     if ( aExp == 0 ) {
4111         aExp = 1;
4112         bExp = 1;
4113     }
4114     zSig1 = 0;
4115     if ( bSig < aSig ) goto aBigger;
4116     if ( aSig < bSig ) goto bBigger;
4117     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4118  bExpBigger:
4119     if ( bExp == 0x7FFF ) {
4120         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4121         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4122     }
4123     if ( aExp == 0 ) ++expDiff;
4124     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4125  bBigger:
4126     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4127     zExp = bExp;
4128     zSign ^= 1;
4129     goto normalizeRoundAndPack;
4130  aExpBigger:
4131     if ( aExp == 0x7FFF ) {
4132         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4133         return a;
4134     }
4135     if ( bExp == 0 ) --expDiff;
4136     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4137  aBigger:
4138     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4139     zExp = aExp;
4140  normalizeRoundAndPack:
4141     return
4142         normalizeRoundAndPackFloatx80(
4143             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4144
4145 }
4146
4147 /*----------------------------------------------------------------------------
4148 | Returns the result of adding the extended double-precision floating-point
4149 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4150 | Standard for Binary Floating-Point Arithmetic.
4151 *----------------------------------------------------------------------------*/
4152
4153 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4154 {
4155     flag aSign, bSign;
4156
4157     aSign = extractFloatx80Sign( a );
4158     bSign = extractFloatx80Sign( b );
4159     if ( aSign == bSign ) {
4160         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4161     }
4162     else {
4163         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4164     }
4165
4166 }
4167
4168 /*----------------------------------------------------------------------------
4169 | Returns the result of subtracting the extended double-precision floating-
4170 | point values `a' and `b'.  The operation is performed according to the
4171 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4172 *----------------------------------------------------------------------------*/
4173
4174 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4175 {
4176     flag aSign, bSign;
4177
4178     aSign = extractFloatx80Sign( a );
4179     bSign = extractFloatx80Sign( b );
4180     if ( aSign == bSign ) {
4181         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4182     }
4183     else {
4184         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4185     }
4186
4187 }
4188
4189 /*----------------------------------------------------------------------------
4190 | Returns the result of multiplying the extended double-precision floating-
4191 | point values `a' and `b'.  The operation is performed according to the
4192 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4193 *----------------------------------------------------------------------------*/
4194
4195 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4196 {
4197     flag aSign, bSign, zSign;
4198     int32 aExp, bExp, zExp;
4199     bits64 aSig, bSig, zSig0, zSig1;
4200     floatx80 z;
4201
4202     aSig = extractFloatx80Frac( a );
4203     aExp = extractFloatx80Exp( a );
4204     aSign = extractFloatx80Sign( a );
4205     bSig = extractFloatx80Frac( b );
4206     bExp = extractFloatx80Exp( b );
4207     bSign = extractFloatx80Sign( b );
4208     zSign = aSign ^ bSign;
4209     if ( aExp == 0x7FFF ) {
4210         if (    (bits64) ( aSig<<1 )
4211              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4212             return propagateFloatx80NaN( a, b STATUS_VAR );
4213         }
4214         if ( ( bExp | bSig ) == 0 ) goto invalid;
4215         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4216     }
4217     if ( bExp == 0x7FFF ) {
4218         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4219         if ( ( aExp | aSig ) == 0 ) {
4220  invalid:
4221             float_raise( float_flag_invalid STATUS_VAR);
4222             z.low = floatx80_default_nan_low;
4223             z.high = floatx80_default_nan_high;
4224             return z;
4225         }
4226         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4227     }
4228     if ( aExp == 0 ) {
4229         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4230         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4231     }
4232     if ( bExp == 0 ) {
4233         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4234         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4235     }
4236     zExp = aExp + bExp - 0x3FFE;
4237     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4238     if ( 0 < (sbits64) zSig0 ) {
4239         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4240         --zExp;
4241     }
4242     return
4243         roundAndPackFloatx80(
4244             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4245
4246 }
4247
4248 /*----------------------------------------------------------------------------
4249 | Returns the result of dividing the extended double-precision floating-point
4250 | value `a' by the corresponding value `b'.  The operation is performed
4251 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4252 *----------------------------------------------------------------------------*/
4253
4254 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4255 {
4256     flag aSign, bSign, zSign;
4257     int32 aExp, bExp, zExp;
4258     bits64 aSig, bSig, zSig0, zSig1;
4259     bits64 rem0, rem1, rem2, term0, term1, term2;
4260     floatx80 z;
4261
4262     aSig = extractFloatx80Frac( a );
4263     aExp = extractFloatx80Exp( a );
4264     aSign = extractFloatx80Sign( a );
4265     bSig = extractFloatx80Frac( b );
4266     bExp = extractFloatx80Exp( b );
4267     bSign = extractFloatx80Sign( b );
4268     zSign = aSign ^ bSign;
4269     if ( aExp == 0x7FFF ) {
4270         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4271         if ( bExp == 0x7FFF ) {
4272             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4273             goto invalid;
4274         }
4275         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4276     }
4277     if ( bExp == 0x7FFF ) {
4278         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4279         return packFloatx80( zSign, 0, 0 );
4280     }
4281     if ( bExp == 0 ) {
4282         if ( bSig == 0 ) {
4283             if ( ( aExp | aSig ) == 0 ) {
4284  invalid:
4285                 float_raise( float_flag_invalid STATUS_VAR);
4286                 z.low = floatx80_default_nan_low;
4287                 z.high = floatx80_default_nan_high;
4288                 return z;
4289             }
4290             float_raise( float_flag_divbyzero STATUS_VAR);
4291             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4292         }
4293         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4294     }
4295     if ( aExp == 0 ) {
4296         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4297         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4298     }
4299     zExp = aExp - bExp + 0x3FFE;
4300     rem1 = 0;
4301     if ( bSig <= aSig ) {
4302         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4303         ++zExp;
4304     }
4305     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4306     mul64To128( bSig, zSig0, &term0, &term1 );
4307     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4308     while ( (sbits64) rem0 < 0 ) {
4309         --zSig0;
4310         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4311     }
4312     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4313     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4314         mul64To128( bSig, zSig1, &term1, &term2 );
4315         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4316         while ( (sbits64) rem1 < 0 ) {
4317             --zSig1;
4318             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4319         }
4320         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4321     }
4322     return
4323         roundAndPackFloatx80(
4324             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4325
4326 }
4327
4328 /*----------------------------------------------------------------------------
4329 | Returns the remainder of the extended double-precision floating-point value
4330 | `a' with respect to the corresponding value `b'.  The operation is performed
4331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4332 *----------------------------------------------------------------------------*/
4333
4334 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4335 {
4336     flag aSign, zSign;
4337     int32 aExp, bExp, expDiff;
4338     bits64 aSig0, aSig1, bSig;
4339     bits64 q, term0, term1, alternateASig0, alternateASig1;
4340     floatx80 z;
4341
4342     aSig0 = extractFloatx80Frac( a );
4343     aExp = extractFloatx80Exp( a );
4344     aSign = extractFloatx80Sign( a );
4345     bSig = extractFloatx80Frac( b );
4346     bExp = extractFloatx80Exp( b );
4347     if ( aExp == 0x7FFF ) {
4348         if (    (bits64) ( aSig0<<1 )
4349              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4350             return propagateFloatx80NaN( a, b STATUS_VAR );
4351         }
4352         goto invalid;
4353     }
4354     if ( bExp == 0x7FFF ) {
4355         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4356         return a;
4357     }
4358     if ( bExp == 0 ) {
4359         if ( bSig == 0 ) {
4360  invalid:
4361             float_raise( float_flag_invalid STATUS_VAR);
4362             z.low = floatx80_default_nan_low;
4363             z.high = floatx80_default_nan_high;
4364             return z;
4365         }
4366         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4367     }
4368     if ( aExp == 0 ) {
4369         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4370         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4371     }
4372     bSig |= LIT64( 0x8000000000000000 );
4373     zSign = aSign;
4374     expDiff = aExp - bExp;
4375     aSig1 = 0;
4376     if ( expDiff < 0 ) {
4377         if ( expDiff < -1 ) return a;
4378         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4379         expDiff = 0;
4380     }
4381     q = ( bSig <= aSig0 );
4382     if ( q ) aSig0 -= bSig;
4383     expDiff -= 64;
4384     while ( 0 < expDiff ) {
4385         q = estimateDiv128To64( aSig0, aSig1, bSig );
4386         q = ( 2 < q ) ? q - 2 : 0;
4387         mul64To128( bSig, q, &term0, &term1 );
4388         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4389         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4390         expDiff -= 62;
4391     }
4392     expDiff += 64;
4393     if ( 0 < expDiff ) {
4394         q = estimateDiv128To64( aSig0, aSig1, bSig );
4395         q = ( 2 < q ) ? q - 2 : 0;
4396         q >>= 64 - expDiff;
4397         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4398         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4399         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4400         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4401             ++q;
4402             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4403         }
4404     }
4405     else {
4406         term1 = 0;
4407         term0 = bSig;
4408     }
4409     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4410     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4411          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4412               && ( q & 1 ) )
4413        ) {
4414         aSig0 = alternateASig0;
4415         aSig1 = alternateASig1;
4416         zSign = ! zSign;
4417     }
4418     return
4419         normalizeRoundAndPackFloatx80(
4420             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4421
4422 }
4423
4424 /*----------------------------------------------------------------------------
4425 | Returns the square root of the extended double-precision floating-point
4426 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4427 | for Binary Floating-Point Arithmetic.
4428 *----------------------------------------------------------------------------*/
4429
4430 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4431 {
4432     flag aSign;
4433     int32 aExp, zExp;
4434     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4435     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4436     floatx80 z;
4437
4438     aSig0 = extractFloatx80Frac( a );
4439     aExp = extractFloatx80Exp( a );
4440     aSign = extractFloatx80Sign( a );
4441     if ( aExp == 0x7FFF ) {
4442         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4443         if ( ! aSign ) return a;
4444         goto invalid;
4445     }
4446     if ( aSign ) {
4447         if ( ( aExp | aSig0 ) == 0 ) return a;
4448  invalid:
4449         float_raise( float_flag_invalid STATUS_VAR);
4450         z.low = floatx80_default_nan_low;
4451         z.high = floatx80_default_nan_high;
4452         return z;
4453     }
4454     if ( aExp == 0 ) {
4455         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4456         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4457     }
4458     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4459     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4460     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4461     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4462     doubleZSig0 = zSig0<<1;
4463     mul64To128( zSig0, zSig0, &term0, &term1 );
4464     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4465     while ( (sbits64) rem0 < 0 ) {
4466         --zSig0;
4467         doubleZSig0 -= 2;
4468         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4469     }
4470     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4471     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4472         if ( zSig1 == 0 ) zSig1 = 1;
4473         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4474         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4475         mul64To128( zSig1, zSig1, &term2, &term3 );
4476         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4477         while ( (sbits64) rem1 < 0 ) {
4478             --zSig1;
4479             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4480             term3 |= 1;
4481             term2 |= doubleZSig0;
4482             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4483         }
4484         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4485     }
4486     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4487     zSig0 |= doubleZSig0;
4488     return
4489         roundAndPackFloatx80(
4490             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4491
4492 }
4493
4494 /*----------------------------------------------------------------------------
4495 | Returns 1 if the extended double-precision floating-point value `a' is
4496 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4497 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4498 | Arithmetic.
4499 *----------------------------------------------------------------------------*/
4500
4501 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4502 {
4503
4504     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4505               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4506          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4507               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4508        ) {
4509         if (    floatx80_is_signaling_nan( a )
4510              || floatx80_is_signaling_nan( b ) ) {
4511             float_raise( float_flag_invalid STATUS_VAR);
4512         }
4513         return 0;
4514     }
4515     return
4516            ( a.low == b.low )
4517         && (    ( a.high == b.high )
4518              || (    ( a.low == 0 )
4519                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4520            );
4521
4522 }
4523
4524 /*----------------------------------------------------------------------------
4525 | Returns 1 if the extended double-precision floating-point value `a' is
4526 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4527 | comparison is performed according to the IEC/IEEE Standard for Binary
4528 | Floating-Point Arithmetic.
4529 *----------------------------------------------------------------------------*/
4530
4531 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4532 {
4533     flag aSign, bSign;
4534
4535     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4536               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4537          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4538               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4539        ) {
4540         float_raise( float_flag_invalid STATUS_VAR);
4541         return 0;
4542     }
4543     aSign = extractFloatx80Sign( a );
4544     bSign = extractFloatx80Sign( b );
4545     if ( aSign != bSign ) {
4546         return
4547                aSign
4548             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4549                  == 0 );
4550     }
4551     return
4552           aSign ? le128( b.high, b.low, a.high, a.low )
4553         : le128( a.high, a.low, b.high, b.low );
4554
4555 }
4556
4557 /*----------------------------------------------------------------------------
4558 | Returns 1 if the extended double-precision floating-point value `a' is
4559 | less than the corresponding value `b', and 0 otherwise.  The comparison
4560 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4561 | Arithmetic.
4562 *----------------------------------------------------------------------------*/
4563
4564 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4565 {
4566     flag aSign, bSign;
4567
4568     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4569               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4570          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4571               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4572        ) {
4573         float_raise( float_flag_invalid STATUS_VAR);
4574         return 0;
4575     }
4576     aSign = extractFloatx80Sign( a );
4577     bSign = extractFloatx80Sign( b );
4578     if ( aSign != bSign ) {
4579         return
4580                aSign
4581             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4582                  != 0 );
4583     }
4584     return
4585           aSign ? lt128( b.high, b.low, a.high, a.low )
4586         : lt128( a.high, a.low, b.high, b.low );
4587
4588 }
4589
4590 /*----------------------------------------------------------------------------
4591 | Returns 1 if the extended double-precision floating-point value `a' is equal
4592 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4593 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4594 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4595 *----------------------------------------------------------------------------*/
4596
4597 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4598 {
4599
4600     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4601               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4602          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4603               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4604        ) {
4605         float_raise( float_flag_invalid STATUS_VAR);
4606         return 0;
4607     }
4608     return
4609            ( a.low == b.low )
4610         && (    ( a.high == b.high )
4611              || (    ( a.low == 0 )
4612                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4613            );
4614
4615 }
4616
4617 /*----------------------------------------------------------------------------
4618 | Returns 1 if the extended double-precision floating-point value `a' is less
4619 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4620 | do not cause an exception.  Otherwise, the comparison is performed according
4621 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4622 *----------------------------------------------------------------------------*/
4623
4624 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4625 {
4626     flag aSign, bSign;
4627
4628     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4629               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4630          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4631               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4632        ) {
4633         if (    floatx80_is_signaling_nan( a )
4634              || floatx80_is_signaling_nan( b ) ) {
4635             float_raise( float_flag_invalid STATUS_VAR);
4636         }
4637         return 0;
4638     }
4639     aSign = extractFloatx80Sign( a );
4640     bSign = extractFloatx80Sign( b );
4641     if ( aSign != bSign ) {
4642         return
4643                aSign
4644             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4645                  == 0 );
4646     }
4647     return
4648           aSign ? le128( b.high, b.low, a.high, a.low )
4649         : le128( a.high, a.low, b.high, b.low );
4650
4651 }
4652
4653 /*----------------------------------------------------------------------------
4654 | Returns 1 if the extended double-precision floating-point value `a' is less
4655 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4656 | an exception.  Otherwise, the comparison is performed according to the
4657 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4658 *----------------------------------------------------------------------------*/
4659
4660 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4661 {
4662     flag aSign, bSign;
4663
4664     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4665               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4666          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4667               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4668        ) {
4669         if (    floatx80_is_signaling_nan( a )
4670              || floatx80_is_signaling_nan( b ) ) {
4671             float_raise( float_flag_invalid STATUS_VAR);
4672         }
4673         return 0;
4674     }
4675     aSign = extractFloatx80Sign( a );
4676     bSign = extractFloatx80Sign( b );
4677     if ( aSign != bSign ) {
4678         return
4679                aSign
4680             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4681                  != 0 );
4682     }
4683     return
4684           aSign ? lt128( b.high, b.low, a.high, a.low )
4685         : lt128( a.high, a.low, b.high, b.low );
4686
4687 }
4688
4689 #endif
4690
4691 #ifdef FLOAT128
4692
4693 /*----------------------------------------------------------------------------
4694 | Returns the result of converting the quadruple-precision floating-point
4695 | value `a' to the 32-bit two's complement integer format.  The conversion
4696 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4697 | Arithmetic---which means in particular that the conversion is rounded
4698 | according to the current rounding mode.  If `a' is a NaN, the largest
4699 | positive integer is returned.  Otherwise, if the conversion overflows, the
4700 | largest integer with the same sign as `a' is returned.
4701 *----------------------------------------------------------------------------*/
4702
4703 int32 float128_to_int32( float128 a STATUS_PARAM )
4704 {
4705     flag aSign;
4706     int32 aExp, shiftCount;
4707     bits64 aSig0, aSig1;
4708
4709     aSig1 = extractFloat128Frac1( a );
4710     aSig0 = extractFloat128Frac0( a );
4711     aExp = extractFloat128Exp( a );
4712     aSign = extractFloat128Sign( a );
4713     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4714     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4715     aSig0 |= ( aSig1 != 0 );
4716     shiftCount = 0x4028 - aExp;
4717     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4718     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4719
4720 }
4721
4722 /*----------------------------------------------------------------------------
4723 | Returns the result of converting the quadruple-precision floating-point
4724 | value `a' to the 32-bit two's complement integer format.  The conversion
4725 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4726 | Arithmetic, except that the conversion is always rounded toward zero.  If
4727 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4728 | conversion overflows, the largest integer with the same sign as `a' is
4729 | returned.
4730 *----------------------------------------------------------------------------*/
4731
4732 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4733 {
4734     flag aSign;
4735     int32 aExp, shiftCount;
4736     bits64 aSig0, aSig1, savedASig;
4737     int32 z;
4738
4739     aSig1 = extractFloat128Frac1( a );
4740     aSig0 = extractFloat128Frac0( a );
4741     aExp = extractFloat128Exp( a );
4742     aSign = extractFloat128Sign( a );
4743     aSig0 |= ( aSig1 != 0 );
4744     if ( 0x401E < aExp ) {
4745         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4746         goto invalid;
4747     }
4748     else if ( aExp < 0x3FFF ) {
4749         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4750         return 0;
4751     }
4752     aSig0 |= LIT64( 0x0001000000000000 );
4753     shiftCount = 0x402F - aExp;
4754     savedASig = aSig0;
4755     aSig0 >>= shiftCount;
4756     z = aSig0;
4757     if ( aSign ) z = - z;
4758     if ( ( z < 0 ) ^ aSign ) {
4759  invalid:
4760         float_raise( float_flag_invalid STATUS_VAR);
4761         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4762     }
4763     if ( ( aSig0<<shiftCount ) != savedASig ) {
4764         STATUS(float_exception_flags) |= float_flag_inexact;
4765     }
4766     return z;
4767
4768 }
4769
4770 /*----------------------------------------------------------------------------
4771 | Returns the result of converting the quadruple-precision floating-point
4772 | value `a' to the 64-bit two's complement integer format.  The conversion
4773 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4774 | Arithmetic---which means in particular that the conversion is rounded
4775 | according to the current rounding mode.  If `a' is a NaN, the largest
4776 | positive integer is returned.  Otherwise, if the conversion overflows, the
4777 | largest integer with the same sign as `a' is returned.
4778 *----------------------------------------------------------------------------*/
4779
4780 int64 float128_to_int64( float128 a STATUS_PARAM )
4781 {
4782     flag aSign;
4783     int32 aExp, shiftCount;
4784     bits64 aSig0, aSig1;
4785
4786     aSig1 = extractFloat128Frac1( a );
4787     aSig0 = extractFloat128Frac0( a );
4788     aExp = extractFloat128Exp( a );
4789     aSign = extractFloat128Sign( a );
4790     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4791     shiftCount = 0x402F - aExp;
4792     if ( shiftCount <= 0 ) {
4793         if ( 0x403E < aExp ) {
4794             float_raise( float_flag_invalid STATUS_VAR);
4795             if (    ! aSign
4796                  || (    ( aExp == 0x7FFF )
4797                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4798                     )
4799                ) {
4800                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4801             }
4802             return (sbits64) LIT64( 0x8000000000000000 );
4803         }
4804         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4805     }
4806     else {
4807         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4808     }
4809     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4810
4811 }
4812
4813 /*----------------------------------------------------------------------------
4814 | Returns the result of converting the quadruple-precision floating-point
4815 | value `a' to the 64-bit two's complement integer format.  The conversion
4816 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4817 | Arithmetic, except that the conversion is always rounded toward zero.
4818 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4819 | the conversion overflows, the largest integer with the same sign as `a' is
4820 | returned.
4821 *----------------------------------------------------------------------------*/
4822
4823 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4824 {
4825     flag aSign;
4826     int32 aExp, shiftCount;
4827     bits64 aSig0, aSig1;
4828     int64 z;
4829
4830     aSig1 = extractFloat128Frac1( a );
4831     aSig0 = extractFloat128Frac0( a );
4832     aExp = extractFloat128Exp( a );
4833     aSign = extractFloat128Sign( a );
4834     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4835     shiftCount = aExp - 0x402F;
4836     if ( 0 < shiftCount ) {
4837         if ( 0x403E <= aExp ) {
4838             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4839             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4840                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4841                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4842             }
4843             else {
4844                 float_raise( float_flag_invalid STATUS_VAR);
4845                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4846                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4847                 }
4848             }
4849             return (sbits64) LIT64( 0x8000000000000000 );
4850         }
4851         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4852         if ( (bits64) ( aSig1<<shiftCount ) ) {
4853             STATUS(float_exception_flags) |= float_flag_inexact;
4854         }
4855     }
4856     else {
4857         if ( aExp < 0x3FFF ) {
4858             if ( aExp | aSig0 | aSig1 ) {
4859                 STATUS(float_exception_flags) |= float_flag_inexact;
4860             }
4861             return 0;
4862         }
4863         z = aSig0>>( - shiftCount );
4864         if (    aSig1
4865              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4866             STATUS(float_exception_flags) |= float_flag_inexact;
4867         }
4868     }
4869     if ( aSign ) z = - z;
4870     return z;
4871
4872 }
4873
4874 /*----------------------------------------------------------------------------
4875 | Returns the result of converting the quadruple-precision floating-point
4876 | value `a' to the single-precision floating-point format.  The conversion
4877 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4878 | Arithmetic.
4879 *----------------------------------------------------------------------------*/
4880
4881 float32 float128_to_float32( float128 a STATUS_PARAM )
4882 {
4883     flag aSign;
4884     int32 aExp;
4885     bits64 aSig0, aSig1;
4886     bits32 zSig;
4887
4888     aSig1 = extractFloat128Frac1( a );
4889     aSig0 = extractFloat128Frac0( a );
4890     aExp = extractFloat128Exp( a );
4891     aSign = extractFloat128Sign( a );
4892     if ( aExp == 0x7FFF ) {
4893         if ( aSig0 | aSig1 ) {
4894             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4895         }
4896         return packFloat32( aSign, 0xFF, 0 );
4897     }
4898     aSig0 |= ( aSig1 != 0 );
4899     shift64RightJamming( aSig0, 18, &aSig0 );
4900     zSig = aSig0;
4901     if ( aExp || zSig ) {
4902         zSig |= 0x40000000;
4903         aExp -= 0x3F81;
4904     }
4905     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4906
4907 }
4908
4909 /*----------------------------------------------------------------------------
4910 | Returns the result of converting the quadruple-precision floating-point
4911 | value `a' to the double-precision floating-point format.  The conversion
4912 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4913 | Arithmetic.
4914 *----------------------------------------------------------------------------*/
4915
4916 float64 float128_to_float64( float128 a STATUS_PARAM )
4917 {
4918     flag aSign;
4919     int32 aExp;
4920     bits64 aSig0, aSig1;
4921
4922     aSig1 = extractFloat128Frac1( a );
4923     aSig0 = extractFloat128Frac0( a );
4924     aExp = extractFloat128Exp( a );
4925     aSign = extractFloat128Sign( a );
4926     if ( aExp == 0x7FFF ) {
4927         if ( aSig0 | aSig1 ) {
4928             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4929         }
4930         return packFloat64( aSign, 0x7FF, 0 );
4931     }
4932     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4933     aSig0 |= ( aSig1 != 0 );
4934     if ( aExp || aSig0 ) {
4935         aSig0 |= LIT64( 0x4000000000000000 );
4936         aExp -= 0x3C01;
4937     }
4938     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4939
4940 }
4941
4942 #ifdef FLOATX80
4943
4944 /*----------------------------------------------------------------------------
4945 | Returns the result of converting the quadruple-precision floating-point
4946 | value `a' to the extended double-precision floating-point format.  The
4947 | conversion is performed according to the IEC/IEEE Standard for Binary
4948 | Floating-Point Arithmetic.
4949 *----------------------------------------------------------------------------*/
4950
4951 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4952 {
4953     flag aSign;
4954     int32 aExp;
4955     bits64 aSig0, aSig1;
4956
4957     aSig1 = extractFloat128Frac1( a );
4958     aSig0 = extractFloat128Frac0( a );
4959     aExp = extractFloat128Exp( a );
4960     aSign = extractFloat128Sign( a );
4961     if ( aExp == 0x7FFF ) {
4962         if ( aSig0 | aSig1 ) {
4963             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4964         }
4965         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4966     }
4967     if ( aExp == 0 ) {
4968         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4969         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4970     }
4971     else {
4972         aSig0 |= LIT64( 0x0001000000000000 );
4973     }
4974     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4975     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4976
4977 }
4978
4979 #endif
4980
4981 /*----------------------------------------------------------------------------
4982 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4983 | returns the result as a quadruple-precision floating-point value.  The
4984 | operation is performed according to the IEC/IEEE Standard for Binary
4985 | Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4987
4988 float128 float128_round_to_int( float128 a STATUS_PARAM )
4989 {
4990     flag aSign;
4991     int32 aExp;
4992     bits64 lastBitMask, roundBitsMask;
4993     int8 roundingMode;
4994     float128 z;
4995
4996     aExp = extractFloat128Exp( a );
4997     if ( 0x402F <= aExp ) {
4998         if ( 0x406F <= aExp ) {
4999             if (    ( aExp == 0x7FFF )
5000                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5001                ) {
5002                 return propagateFloat128NaN( a, a STATUS_VAR );
5003             }
5004             return a;
5005         }
5006         lastBitMask = 1;
5007         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5008         roundBitsMask = lastBitMask - 1;
5009         z = a;
5010         roundingMode = STATUS(float_rounding_mode);
5011         if ( roundingMode == float_round_nearest_even ) {
5012             if ( lastBitMask ) {
5013                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5014                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5015             }
5016             else {
5017                 if ( (sbits64) z.low < 0 ) {
5018                     ++z.high;
5019                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
5020                 }
5021             }
5022         }
5023         else if ( roundingMode != float_round_to_zero ) {
5024             if (   extractFloat128Sign( z )
5025                  ^ ( roundingMode == float_round_up ) ) {
5026                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
5027             }
5028         }
5029         z.low &= ~ roundBitsMask;
5030     }
5031     else {
5032         if ( aExp < 0x3FFF ) {
5033             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5034             STATUS(float_exception_flags) |= float_flag_inexact;
5035             aSign = extractFloat128Sign( a );
5036             switch ( STATUS(float_rounding_mode) ) {
5037              case float_round_nearest_even:
5038                 if (    ( aExp == 0x3FFE )
5039                      && (   extractFloat128Frac0( a )
5040                           | extractFloat128Frac1( a ) )
5041                    ) {
5042                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5043                 }
5044                 break;
5045              case float_round_down:
5046                 return
5047                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5048                     : packFloat128( 0, 0, 0, 0 );
5049              case float_round_up:
5050                 return
5051                       aSign ? packFloat128( 1, 0, 0, 0 )
5052                     : packFloat128( 0, 0x3FFF, 0, 0 );
5053             }
5054             return packFloat128( aSign, 0, 0, 0 );
5055         }
5056         lastBitMask = 1;
5057         lastBitMask <<= 0x402F - aExp;
5058         roundBitsMask = lastBitMask - 1;
5059         z.low = 0;
5060         z.high = a.high;
5061         roundingMode = STATUS(float_rounding_mode);
5062         if ( roundingMode == float_round_nearest_even ) {
5063             z.high += lastBitMask>>1;
5064             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5065                 z.high &= ~ lastBitMask;
5066             }
5067         }
5068         else if ( roundingMode != float_round_to_zero ) {
5069             if (   extractFloat128Sign( z )
5070                  ^ ( roundingMode == float_round_up ) ) {
5071                 z.high |= ( a.low != 0 );
5072                 z.high += roundBitsMask;
5073             }
5074         }
5075         z.high &= ~ roundBitsMask;
5076     }
5077     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5078         STATUS(float_exception_flags) |= float_flag_inexact;
5079     }
5080     return z;
5081
5082 }
5083
5084 /*----------------------------------------------------------------------------
5085 | Returns the result of adding the absolute values of the quadruple-precision
5086 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5087 | before being returned.  `zSign' is ignored if the result is a NaN.
5088 | The addition is performed according to the IEC/IEEE Standard for Binary
5089 | Floating-Point Arithmetic.
5090 *----------------------------------------------------------------------------*/
5091
5092 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5093 {
5094     int32 aExp, bExp, zExp;
5095     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5096     int32 expDiff;
5097
5098     aSig1 = extractFloat128Frac1( a );
5099     aSig0 = extractFloat128Frac0( a );
5100     aExp = extractFloat128Exp( a );
5101     bSig1 = extractFloat128Frac1( b );
5102     bSig0 = extractFloat128Frac0( b );
5103     bExp = extractFloat128Exp( b );
5104     expDiff = aExp - bExp;
5105     if ( 0 < expDiff ) {
5106         if ( aExp == 0x7FFF ) {
5107             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5108             return a;
5109         }
5110         if ( bExp == 0 ) {
5111             --expDiff;
5112         }
5113         else {
5114             bSig0 |= LIT64( 0x0001000000000000 );
5115         }
5116         shift128ExtraRightJamming(
5117             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5118         zExp = aExp;
5119     }
5120     else if ( expDiff < 0 ) {
5121         if ( bExp == 0x7FFF ) {
5122             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5123             return packFloat128( zSign, 0x7FFF, 0, 0 );
5124         }
5125         if ( aExp == 0 ) {
5126             ++expDiff;
5127         }
5128         else {
5129             aSig0 |= LIT64( 0x0001000000000000 );
5130         }
5131         shift128ExtraRightJamming(
5132             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5133         zExp = bExp;
5134     }
5135     else {
5136         if ( aExp == 0x7FFF ) {
5137             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5138                 return propagateFloat128NaN( a, b STATUS_VAR );
5139             }
5140             return a;
5141         }
5142         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5143         if ( aExp == 0 ) {
5144             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5145             return packFloat128( zSign, 0, zSig0, zSig1 );
5146         }
5147         zSig2 = 0;
5148         zSig0 |= LIT64( 0x0002000000000000 );
5149         zExp = aExp;
5150         goto shiftRight1;
5151     }
5152     aSig0 |= LIT64( 0x0001000000000000 );
5153     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5154     --zExp;
5155     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5156     ++zExp;
5157  shiftRight1:
5158     shift128ExtraRightJamming(
5159         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5160  roundAndPack:
5161     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5162
5163 }
5164
5165 /*----------------------------------------------------------------------------
5166 | Returns the result of subtracting the absolute values of the quadruple-
5167 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5168 | difference is negated before being returned.  `zSign' is ignored if the
5169 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5170 | Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5172
5173 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5174 {
5175     int32 aExp, bExp, zExp;
5176     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5177     int32 expDiff;
5178     float128 z;
5179
5180     aSig1 = extractFloat128Frac1( a );
5181     aSig0 = extractFloat128Frac0( a );
5182     aExp = extractFloat128Exp( a );
5183     bSig1 = extractFloat128Frac1( b );
5184     bSig0 = extractFloat128Frac0( b );
5185     bExp = extractFloat128Exp( b );
5186     expDiff = aExp - bExp;
5187     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5188     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5189     if ( 0 < expDiff ) goto aExpBigger;
5190     if ( expDiff < 0 ) goto bExpBigger;
5191     if ( aExp == 0x7FFF ) {
5192         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5193             return propagateFloat128NaN( a, b STATUS_VAR );
5194         }
5195         float_raise( float_flag_invalid STATUS_VAR);
5196         z.low = float128_default_nan_low;
5197         z.high = float128_default_nan_high;
5198         return z;
5199     }
5200     if ( aExp == 0 ) {
5201         aExp = 1;
5202         bExp = 1;
5203     }
5204     if ( bSig0 < aSig0 ) goto aBigger;
5205     if ( aSig0 < bSig0 ) goto bBigger;
5206     if ( bSig1 < aSig1 ) goto aBigger;
5207     if ( aSig1 < bSig1 ) goto bBigger;
5208     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5209  bExpBigger:
5210     if ( bExp == 0x7FFF ) {
5211         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5212         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5213     }
5214     if ( aExp == 0 ) {
5215         ++expDiff;
5216     }
5217     else {
5218         aSig0 |= LIT64( 0x4000000000000000 );
5219     }
5220     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5221     bSig0 |= LIT64( 0x4000000000000000 );
5222  bBigger:
5223     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5224     zExp = bExp;
5225     zSign ^= 1;
5226     goto normalizeRoundAndPack;
5227  aExpBigger:
5228     if ( aExp == 0x7FFF ) {
5229         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5230         return a;
5231     }
5232     if ( bExp == 0 ) {
5233         --expDiff;
5234     }
5235     else {
5236         bSig0 |= LIT64( 0x4000000000000000 );
5237     }
5238     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5239     aSig0 |= LIT64( 0x4000000000000000 );
5240  aBigger:
5241     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5242     zExp = aExp;
5243  normalizeRoundAndPack:
5244     --zExp;
5245     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5246
5247 }
5248
5249 /*----------------------------------------------------------------------------
5250 | Returns the result of adding the quadruple-precision floating-point values
5251 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5252 | for Binary Floating-Point Arithmetic.
5253 *----------------------------------------------------------------------------*/
5254
5255 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5256 {
5257     flag aSign, bSign;
5258
5259     aSign = extractFloat128Sign( a );
5260     bSign = extractFloat128Sign( b );
5261     if ( aSign == bSign ) {
5262         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5263     }
5264     else {
5265         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5266     }
5267
5268 }
5269
5270 /*----------------------------------------------------------------------------
5271 | Returns the result of subtracting the quadruple-precision floating-point
5272 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5273 | Standard for Binary Floating-Point Arithmetic.
5274 *----------------------------------------------------------------------------*/
5275
5276 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5277 {
5278     flag aSign, bSign;
5279
5280     aSign = extractFloat128Sign( a );
5281     bSign = extractFloat128Sign( b );
5282     if ( aSign == bSign ) {
5283         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5284     }
5285     else {
5286         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5287     }
5288
5289 }
5290
5291 /*----------------------------------------------------------------------------
5292 | Returns the result of multiplying the quadruple-precision floating-point
5293 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5294 | Standard for Binary Floating-Point Arithmetic.
5295 *----------------------------------------------------------------------------*/
5296
5297 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5298 {
5299     flag aSign, bSign, zSign;
5300     int32 aExp, bExp, zExp;
5301     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5302     float128 z;
5303
5304     aSig1 = extractFloat128Frac1( a );
5305     aSig0 = extractFloat128Frac0( a );
5306     aExp = extractFloat128Exp( a );
5307     aSign = extractFloat128Sign( a );
5308     bSig1 = extractFloat128Frac1( b );
5309     bSig0 = extractFloat128Frac0( b );
5310     bExp = extractFloat128Exp( b );
5311     bSign = extractFloat128Sign( b );
5312     zSign = aSign ^ bSign;
5313     if ( aExp == 0x7FFF ) {
5314         if (    ( aSig0 | aSig1 )
5315              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5316             return propagateFloat128NaN( a, b STATUS_VAR );
5317         }
5318         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5319         return packFloat128( zSign, 0x7FFF, 0, 0 );
5320     }
5321     if ( bExp == 0x7FFF ) {
5322         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5323         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5324  invalid:
5325             float_raise( float_flag_invalid STATUS_VAR);
5326             z.low = float128_default_nan_low;
5327             z.high = float128_default_nan_high;
5328             return z;
5329         }
5330         return packFloat128( zSign, 0x7FFF, 0, 0 );
5331     }
5332     if ( aExp == 0 ) {
5333         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5334         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5335     }
5336     if ( bExp == 0 ) {
5337         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5338         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5339     }
5340     zExp = aExp + bExp - 0x4000;
5341     aSig0 |= LIT64( 0x0001000000000000 );
5342     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5343     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5344     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5345     zSig2 |= ( zSig3 != 0 );
5346     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5347         shift128ExtraRightJamming(
5348             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5349         ++zExp;
5350     }
5351     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5352
5353 }
5354
5355 /*----------------------------------------------------------------------------
5356 | Returns the result of dividing the quadruple-precision floating-point value
5357 | `a' by the corresponding value `b'.  The operation is performed according to
5358 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5359 *----------------------------------------------------------------------------*/
5360
5361 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5362 {
5363     flag aSign, bSign, zSign;
5364     int32 aExp, bExp, zExp;
5365     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5366     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5367     float128 z;
5368
5369     aSig1 = extractFloat128Frac1( a );
5370     aSig0 = extractFloat128Frac0( a );
5371     aExp = extractFloat128Exp( a );
5372     aSign = extractFloat128Sign( a );
5373     bSig1 = extractFloat128Frac1( b );
5374     bSig0 = extractFloat128Frac0( b );
5375     bExp = extractFloat128Exp( b );
5376     bSign = extractFloat128Sign( b );
5377     zSign = aSign ^ bSign;
5378     if ( aExp == 0x7FFF ) {
5379         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5380         if ( bExp == 0x7FFF ) {
5381             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5382             goto invalid;
5383         }
5384         return packFloat128( zSign, 0x7FFF, 0, 0 );
5385     }
5386     if ( bExp == 0x7FFF ) {
5387         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5388         return packFloat128( zSign, 0, 0, 0 );
5389     }
5390     if ( bExp == 0 ) {
5391         if ( ( bSig0 | bSig1 ) == 0 ) {
5392             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5393  invalid:
5394                 float_raise( float_flag_invalid STATUS_VAR);
5395                 z.low = float128_default_nan_low;
5396                 z.high = float128_default_nan_high;
5397                 return z;
5398             }
5399             float_raise( float_flag_divbyzero STATUS_VAR);
5400             return packFloat128( zSign, 0x7FFF, 0, 0 );
5401         }
5402         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5403     }
5404     if ( aExp == 0 ) {
5405         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5406         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5407     }
5408     zExp = aExp - bExp + 0x3FFD;
5409     shortShift128Left(
5410         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5411     shortShift128Left(
5412         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5413     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5414         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5415         ++zExp;
5416     }
5417     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5418     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5419     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5420     while ( (sbits64) rem0 < 0 ) {
5421         --zSig0;
5422         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5423     }
5424     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5425     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5426         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5427         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5428         while ( (sbits64) rem1 < 0 ) {
5429             --zSig1;
5430             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5431         }
5432         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5433     }
5434     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5435     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5436
5437 }
5438
5439 /*----------------------------------------------------------------------------
5440 | Returns the remainder of the quadruple-precision floating-point value `a'
5441 | with respect to the corresponding value `b'.  The operation is performed
5442 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5443 *----------------------------------------------------------------------------*/
5444
5445 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5446 {
5447     flag aSign, zSign;
5448     int32 aExp, bExp, expDiff;
5449     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5450     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5451     sbits64 sigMean0;
5452     float128 z;
5453
5454     aSig1 = extractFloat128Frac1( a );
5455     aSig0 = extractFloat128Frac0( a );
5456     aExp = extractFloat128Exp( a );
5457     aSign = extractFloat128Sign( a );
5458     bSig1 = extractFloat128Frac1( b );
5459     bSig0 = extractFloat128Frac0( b );
5460     bExp = extractFloat128Exp( b );
5461     if ( aExp == 0x7FFF ) {
5462         if (    ( aSig0 | aSig1 )
5463              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5464             return propagateFloat128NaN( a, b STATUS_VAR );
5465         }
5466         goto invalid;
5467     }
5468     if ( bExp == 0x7FFF ) {
5469         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5470         return a;
5471     }
5472     if ( bExp == 0 ) {
5473         if ( ( bSig0 | bSig1 ) == 0 ) {
5474  invalid:
5475             float_raise( float_flag_invalid STATUS_VAR);
5476             z.low = float128_default_nan_low;
5477             z.high = float128_default_nan_high;
5478             return z;
5479         }
5480         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5481     }
5482     if ( aExp == 0 ) {
5483         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5484         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5485     }
5486     expDiff = aExp - bExp;
5487     if ( expDiff < -1 ) return a;
5488     shortShift128Left(
5489         aSig0 | LIT64( 0x0001000000000000 ),
5490         aSig1,
5491         15 - ( expDiff < 0 ),
5492         &aSig0,
5493         &aSig1
5494     );
5495     shortShift128Left(
5496         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5497     q = le128( bSig0, bSig1, aSig0, aSig1 );
5498     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5499     expDiff -= 64;
5500     while ( 0 < expDiff ) {
5501         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5502         q = ( 4 < q ) ? q - 4 : 0;
5503         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5504         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5505         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5506         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5507         expDiff -= 61;
5508     }
5509     if ( -64 < expDiff ) {
5510         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5511         q = ( 4 < q ) ? q - 4 : 0;
5512         q >>= - expDiff;
5513         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5514         expDiff += 52;
5515         if ( expDiff < 0 ) {
5516             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5517         }
5518         else {
5519             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5520         }
5521         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5522         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5523     }
5524     else {
5525         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5526         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5527     }
5528     do {
5529         alternateASig0 = aSig0;
5530         alternateASig1 = aSig1;
5531         ++q;
5532         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5533     } while ( 0 <= (sbits64) aSig0 );
5534     add128(
5535         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5536     if (    ( sigMean0 < 0 )
5537          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5538         aSig0 = alternateASig0;
5539         aSig1 = alternateASig1;
5540     }
5541     zSign = ( (sbits64) aSig0 < 0 );
5542     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5543     return
5544         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5545
5546 }
5547
5548 /*----------------------------------------------------------------------------
5549 | Returns the square root of the quadruple-precision floating-point value `a'.
5550 | The operation is performed according to the IEC/IEEE Standard for Binary
5551 | Floating-Point Arithmetic.
5552 *----------------------------------------------------------------------------*/
5553
5554 float128 float128_sqrt( float128 a STATUS_PARAM )
5555 {
5556     flag aSign;
5557     int32 aExp, zExp;
5558     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5559     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5560     float128 z;
5561
5562     aSig1 = extractFloat128Frac1( a );
5563     aSig0 = extractFloat128Frac0( a );
5564     aExp = extractFloat128Exp( a );
5565     aSign = extractFloat128Sign( a );
5566     if ( aExp == 0x7FFF ) {
5567         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5568         if ( ! aSign ) return a;
5569         goto invalid;
5570     }
5571     if ( aSign ) {
5572         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5573  invalid:
5574         float_raise( float_flag_invalid STATUS_VAR);
5575         z.low = float128_default_nan_low;
5576         z.high = float128_default_nan_high;
5577         return z;
5578     }
5579     if ( aExp == 0 ) {
5580         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5581         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5582     }
5583     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5584     aSig0 |= LIT64( 0x0001000000000000 );
5585     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5586     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5587     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5588     doubleZSig0 = zSig0<<1;
5589     mul64To128( zSig0, zSig0, &term0, &term1 );
5590     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5591     while ( (sbits64) rem0 < 0 ) {
5592         --zSig0;
5593         doubleZSig0 -= 2;
5594         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5595     }
5596     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5597     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5598         if ( zSig1 == 0 ) zSig1 = 1;
5599         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5600         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5601         mul64To128( zSig1, zSig1, &term2, &term3 );
5602         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5603         while ( (sbits64) rem1 < 0 ) {
5604             --zSig1;
5605             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5606             term3 |= 1;
5607             term2 |= doubleZSig0;
5608             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5609         }
5610         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5611     }
5612     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5613     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5614
5615 }
5616
5617 /*----------------------------------------------------------------------------
5618 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5619 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5620 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5621 *----------------------------------------------------------------------------*/
5622
5623 int float128_eq( float128 a, float128 b STATUS_PARAM )
5624 {
5625
5626     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5627               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5628          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5629               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5630        ) {
5631         if (    float128_is_signaling_nan( a )
5632              || float128_is_signaling_nan( b ) ) {
5633             float_raise( float_flag_invalid STATUS_VAR);
5634         }
5635         return 0;
5636     }
5637     return
5638            ( a.low == b.low )
5639         && (    ( a.high == b.high )
5640              || (    ( a.low == 0 )
5641                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5642            );
5643
5644 }
5645
5646 /*----------------------------------------------------------------------------
5647 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5648 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5650 | Arithmetic.
5651 *----------------------------------------------------------------------------*/
5652
5653 int float128_le( float128 a, float128 b STATUS_PARAM )
5654 {
5655     flag aSign, bSign;
5656
5657     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5658               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5659          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5660               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5661        ) {
5662         float_raise( float_flag_invalid STATUS_VAR);
5663         return 0;
5664     }
5665     aSign = extractFloat128Sign( a );
5666     bSign = extractFloat128Sign( b );
5667     if ( aSign != bSign ) {
5668         return
5669                aSign
5670             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5671                  == 0 );
5672     }
5673     return
5674           aSign ? le128( b.high, b.low, a.high, a.low )
5675         : le128( a.high, a.low, b.high, b.low );
5676
5677 }
5678
5679 /*----------------------------------------------------------------------------
5680 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5681 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5682 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5683 *----------------------------------------------------------------------------*/
5684
5685 int float128_lt( float128 a, float128 b STATUS_PARAM )
5686 {
5687     flag aSign, bSign;
5688
5689     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5690               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5691          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5692               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5693        ) {
5694         float_raise( float_flag_invalid STATUS_VAR);
5695         return 0;
5696     }
5697     aSign = extractFloat128Sign( a );
5698     bSign = extractFloat128Sign( b );
5699     if ( aSign != bSign ) {
5700         return
5701                aSign
5702             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5703                  != 0 );
5704     }
5705     return
5706           aSign ? lt128( b.high, b.low, a.high, a.low )
5707         : lt128( a.high, a.low, b.high, b.low );
5708
5709 }
5710
5711 /*----------------------------------------------------------------------------
5712 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5713 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5714 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5715 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5716 *----------------------------------------------------------------------------*/
5717
5718 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5719 {
5720
5721     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5722               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5723          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5724               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5725        ) {
5726         float_raise( float_flag_invalid STATUS_VAR);
5727         return 0;
5728     }
5729     return
5730            ( a.low == b.low )
5731         && (    ( a.high == b.high )
5732              || (    ( a.low == 0 )
5733                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5734            );
5735
5736 }
5737
5738 /*----------------------------------------------------------------------------
5739 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5740 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5741 | cause an exception.  Otherwise, the comparison is performed according to the
5742 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5743 *----------------------------------------------------------------------------*/
5744
5745 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5746 {
5747     flag aSign, bSign;
5748
5749     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5750               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5751          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5752               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5753        ) {
5754         if (    float128_is_signaling_nan( a )
5755              || float128_is_signaling_nan( b ) ) {
5756             float_raise( float_flag_invalid STATUS_VAR);
5757         }
5758         return 0;
5759     }
5760     aSign = extractFloat128Sign( a );
5761     bSign = extractFloat128Sign( b );
5762     if ( aSign != bSign ) {
5763         return
5764                aSign
5765             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5766                  == 0 );
5767     }
5768     return
5769           aSign ? le128( b.high, b.low, a.high, a.low )
5770         : le128( a.high, a.low, b.high, b.low );
5771
5772 }
5773
5774 /*----------------------------------------------------------------------------
5775 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5776 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5777 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5778 | Standard for Binary Floating-Point Arithmetic.
5779 *----------------------------------------------------------------------------*/
5780
5781 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5782 {
5783     flag aSign, bSign;
5784
5785     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5786               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5787          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5788               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5789        ) {
5790         if (    float128_is_signaling_nan( a )
5791              || float128_is_signaling_nan( b ) ) {
5792             float_raise( float_flag_invalid STATUS_VAR);
5793         }
5794         return 0;
5795     }
5796     aSign = extractFloat128Sign( a );
5797     bSign = extractFloat128Sign( b );
5798     if ( aSign != bSign ) {
5799         return
5800                aSign
5801             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5802                  != 0 );
5803     }
5804     return
5805           aSign ? lt128( b.high, b.low, a.high, a.low )
5806         : lt128( a.high, a.low, b.high, b.low );
5807
5808 }
5809
5810 #endif
5811
5812 /* misc functions */
5813 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5814 {
5815     return int64_to_float32(a STATUS_VAR);
5816 }
5817
5818 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5819 {
5820     return int64_to_float64(a STATUS_VAR);
5821 }
5822
5823 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5824 {
5825     int64_t v;
5826     unsigned int res;
5827
5828     v = float32_to_int64(a STATUS_VAR);
5829     if (v < 0) {
5830         res = 0;
5831         float_raise( float_flag_invalid STATUS_VAR);
5832     } else if (v > 0xffffffff) {
5833         res = 0xffffffff;
5834         float_raise( float_flag_invalid STATUS_VAR);
5835     } else {
5836         res = v;
5837     }
5838     return res;
5839 }
5840
5841 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5842 {
5843     int64_t v;
5844     unsigned int res;
5845
5846     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5847     if (v < 0) {
5848         res = 0;
5849         float_raise( float_flag_invalid STATUS_VAR);
5850     } else if (v > 0xffffffff) {
5851         res = 0xffffffff;
5852         float_raise( float_flag_invalid STATUS_VAR);
5853     } else {
5854         res = v;
5855     }
5856     return res;
5857 }
5858
5859 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5860 {
5861     int64_t v;
5862     unsigned int res;
5863
5864     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5865     if (v < 0) {
5866         res = 0;
5867         float_raise( float_flag_invalid STATUS_VAR);
5868     } else if (v > 0xffff) {
5869         res = 0xffff;
5870         float_raise( float_flag_invalid STATUS_VAR);
5871     } else {
5872         res = v;
5873     }
5874     return res;
5875 }
5876
5877 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5878 {
5879     int64_t v;
5880     unsigned int res;
5881
5882     v = float64_to_int64(a STATUS_VAR);
5883     if (v < 0) {
5884         res = 0;
5885         float_raise( float_flag_invalid STATUS_VAR);
5886     } else if (v > 0xffffffff) {
5887         res = 0xffffffff;
5888         float_raise( float_flag_invalid STATUS_VAR);
5889     } else {
5890         res = v;
5891     }
5892     return res;
5893 }
5894
5895 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5896 {
5897     int64_t v;
5898     unsigned int res;
5899
5900     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5901     if (v < 0) {
5902         res = 0;
5903         float_raise( float_flag_invalid STATUS_VAR);
5904     } else if (v > 0xffffffff) {
5905         res = 0xffffffff;
5906         float_raise( float_flag_invalid STATUS_VAR);
5907     } else {
5908         res = v;
5909     }
5910     return res;
5911 }
5912
5913 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5914 {
5915     int64_t v;
5916     unsigned int res;
5917
5918     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5919     if (v < 0) {
5920         res = 0;
5921         float_raise( float_flag_invalid STATUS_VAR);
5922     } else if (v > 0xffff) {
5923         res = 0xffff;
5924         float_raise( float_flag_invalid STATUS_VAR);
5925     } else {
5926         res = v;
5927     }
5928     return res;
5929 }
5930
5931 /* FIXME: This looks broken.  */
5932 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5933 {
5934     int64_t v;
5935
5936     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5937     v += float64_val(a);
5938     v = float64_to_int64(make_float64(v) STATUS_VAR);
5939
5940     return v - INT64_MIN;
5941 }
5942
5943 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5944 {
5945     int64_t v;
5946
5947     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5948     v += float64_val(a);
5949     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5950
5951     return v - INT64_MIN;
5952 }
5953
5954 #define COMPARE(s, nan_exp)                                                  \
5955 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5956                                       int is_quiet STATUS_PARAM )            \
5957 {                                                                            \
5958     flag aSign, bSign;                                                       \
5959     bits ## s av, bv;                                                        \
5960     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
5961     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
5962                                                                              \
5963     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5964          extractFloat ## s ## Frac( a ) ) ||                                 \
5965         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5966           extractFloat ## s ## Frac( b ) )) {                                \
5967         if (!is_quiet ||                                                     \
5968             float ## s ## _is_signaling_nan( a ) ||                          \
5969             float ## s ## _is_signaling_nan( b ) ) {                         \
5970             float_raise( float_flag_invalid STATUS_VAR);                     \
5971         }                                                                    \
5972         return float_relation_unordered;                                     \
5973     }                                                                        \
5974     aSign = extractFloat ## s ## Sign( a );                                  \
5975     bSign = extractFloat ## s ## Sign( b );                                  \
5976     av = float ## s ## _val(a);                                              \
5977     bv = float ## s ## _val(b);                                              \
5978     if ( aSign != bSign ) {                                                  \
5979         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5980             /* zero case */                                                  \
5981             return float_relation_equal;                                     \
5982         } else {                                                             \
5983             return 1 - (2 * aSign);                                          \
5984         }                                                                    \
5985     } else {                                                                 \
5986         if (av == bv) {                                                      \
5987             return float_relation_equal;                                     \
5988         } else {                                                             \
5989             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5990         }                                                                    \
5991     }                                                                        \
5992 }                                                                            \
5993                                                                              \
5994 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5995 {                                                                            \
5996     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5997 }                                                                            \
5998                                                                              \
5999 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
6000 {                                                                            \
6001     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
6002 }
6003
6004 COMPARE(32, 0xff)
6005 COMPARE(64, 0x7ff)
6006
6007 INLINE int float128_compare_internal( float128 a, float128 b,
6008                                       int is_quiet STATUS_PARAM )
6009 {
6010     flag aSign, bSign;
6011
6012     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
6013           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
6014         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
6015           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
6016         if (!is_quiet ||
6017             float128_is_signaling_nan( a ) ||
6018             float128_is_signaling_nan( b ) ) {
6019             float_raise( float_flag_invalid STATUS_VAR);
6020         }
6021         return float_relation_unordered;
6022     }
6023     aSign = extractFloat128Sign( a );
6024     bSign = extractFloat128Sign( b );
6025     if ( aSign != bSign ) {
6026         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
6027             /* zero case */
6028             return float_relation_equal;
6029         } else {
6030             return 1 - (2 * aSign);
6031         }
6032     } else {
6033         if (a.low == b.low && a.high == b.high) {
6034             return float_relation_equal;
6035         } else {
6036             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6037         }
6038     }
6039 }
6040
6041 int float128_compare( float128 a, float128 b STATUS_PARAM )
6042 {
6043     return float128_compare_internal(a, b, 0 STATUS_VAR);
6044 }
6045
6046 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6047 {
6048     return float128_compare_internal(a, b, 1 STATUS_VAR);
6049 }
6050
6051 /* Multiply A by 2 raised to the power N.  */
6052 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6053 {
6054     flag aSign;
6055     int16 aExp;
6056     bits32 aSig;
6057
6058     a = float32_squash_input_denormal(a STATUS_VAR);
6059     aSig = extractFloat32Frac( a );
6060     aExp = extractFloat32Exp( a );
6061     aSign = extractFloat32Sign( a );
6062
6063     if ( aExp == 0xFF ) {
6064         return a;
6065     }
6066     if ( aExp != 0 )
6067         aSig |= 0x00800000;
6068     else if ( aSig == 0 )
6069         return a;
6070
6071     aExp += n - 1;
6072     aSig <<= 7;
6073     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6074 }
6075
6076 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6077 {
6078     flag aSign;
6079     int16 aExp;
6080     bits64 aSig;
6081
6082     a = float64_squash_input_denormal(a STATUS_VAR);
6083     aSig = extractFloat64Frac( a );
6084     aExp = extractFloat64Exp( a );
6085     aSign = extractFloat64Sign( a );
6086
6087     if ( aExp == 0x7FF ) {
6088         return a;
6089     }
6090     if ( aExp != 0 )
6091         aSig |= LIT64( 0x0010000000000000 );
6092     else if ( aSig == 0 )
6093         return a;
6094
6095     aExp += n - 1;
6096     aSig <<= 10;
6097     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6098 }
6099
6100 #ifdef FLOATX80
6101 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6102 {
6103     flag aSign;
6104     int16 aExp;
6105     bits64 aSig;
6106
6107     aSig = extractFloatx80Frac( a );
6108     aExp = extractFloatx80Exp( a );
6109     aSign = extractFloatx80Sign( a );
6110
6111     if ( aExp == 0x7FF ) {
6112         return a;
6113     }
6114     if (aExp == 0 && aSig == 0)
6115         return a;
6116
6117     aExp += n;
6118     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6119                                           aSign, aExp, aSig, 0 STATUS_VAR );
6120 }
6121 #endif
6122
6123 #ifdef FLOAT128
6124 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6125 {
6126     flag aSign;
6127     int32 aExp;
6128     bits64 aSig0, aSig1;
6129
6130     aSig1 = extractFloat128Frac1( a );
6131     aSig0 = extractFloat128Frac0( a );
6132     aExp = extractFloat128Exp( a );
6133     aSign = extractFloat128Sign( a );
6134     if ( aExp == 0x7FFF ) {
6135         return a;
6136     }
6137     if ( aExp != 0 )
6138         aSig0 |= LIT64( 0x0001000000000000 );
6139     else if ( aSig0 == 0 && aSig1 == 0 )
6140         return a;
6141
6142     aExp += n - 1;
6143     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6144                                           STATUS_VAR );
6145
6146 }
6147 #endif