fpu/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "softfloat.h"
  34
  35 /*----------------------------------------------------------------------------
  36 | Primitive arithmetic functions, including multi-word arithmetic, and
  37 | division and square root approximations.  (Can be specialized to target if
  38 | desired.)
  39 *----------------------------------------------------------------------------*/
  40 #include "softfloat-macros.h"
  41
  42 /*----------------------------------------------------------------------------
  43 | Functions and definitions to determine:  (1) whether tininess for underflow
  44 | is detected before or after rounding by default, (2) what (if anything)
  45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  47 | are propagated from function inputs to output.  These details are target-
  48 | specific.
  49 *----------------------------------------------------------------------------*/
  50 #include "softfloat-specialize.h"
  51
  52 void set_float_rounding_mode(int val STATUS_PARAM)
  53 {
  54     STATUS(float_rounding_mode) = val;
  55 }
  56
  57 void set_float_exception_flags(int val STATUS_PARAM)
  58 {
  59     STATUS(float_exception_flags) = val;
  60 }
  61
  62 #ifdef FLOATX80
  63 void set_floatx80_rounding_precision(int val STATUS_PARAM)
  64 {
  65     STATUS(floatx80_rounding_precision) = val;
  66 }
  67 #endif
  68
  69 /*----------------------------------------------------------------------------
  70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  72 | input.  If `zSign' is 1, the input is negated before being converted to an
  73 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  74 | is simply rounded to an integer, with the inexact exception raised if the
  75 | input cannot be represented exactly as an integer.  However, if the fixed-
  76 | point input is too large, the invalid exception is raised and the largest
  77 | positive or negative integer is returned.
  78 *----------------------------------------------------------------------------*/
  79
  80 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
  81 {
  82     int8 roundingMode;
  83     flag roundNearestEven;
  84     int8 roundIncrement, roundBits;
  85     int32 z;
  86
  87     roundingMode = STATUS(float_rounding_mode);
  88     roundNearestEven = ( roundingMode == float_round_nearest_even );
  89     roundIncrement = 0x40;
  90     if ( ! roundNearestEven ) {
  91         if ( roundingMode == float_round_to_zero ) {
  92             roundIncrement = 0;
  93         }
  94         else {
  95             roundIncrement = 0x7F;
  96             if ( zSign ) {
  97                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  98             }
  99             else {
 100                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 101             }
 102         }
 103     }
 104     roundBits = absZ & 0x7F;
 105     absZ = ( absZ + roundIncrement )>>7;
 106     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 107     z = absZ;
 108     if ( zSign ) z = - z;
 109     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 110         float_raise( float_flag_invalid STATUS_VAR);
 111         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 112     }
 113     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 114     return z;
 115
 116 }
 117
 118 /*----------------------------------------------------------------------------
 119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 121 | and returns the properly rounded 64-bit integer corresponding to the input.
 122 | If `zSign' is 1, the input is negated before being converted to an integer.
 123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 124 | the inexact exception raised if the input cannot be represented exactly as
 125 | an integer.  However, if the fixed-point input is too large, the invalid
 126 | exception is raised and the largest positive or negative integer is
 127 | returned.
 128 *----------------------------------------------------------------------------*/
 129
 130 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
 131 {
 132     int8 roundingMode;
 133     flag roundNearestEven, increment;
 134     int64 z;
 135
 136     roundingMode = STATUS(float_rounding_mode);
 137     roundNearestEven = ( roundingMode == float_round_nearest_even );
 138     increment = ( (sbits64) absZ1 < 0 );
 139     if ( ! roundNearestEven ) {
 140         if ( roundingMode == float_round_to_zero ) {
 141             increment = 0;
 142         }
 143         else {
 144             if ( zSign ) {
 145                 increment = ( roundingMode == float_round_down ) && absZ1;
 146             }
 147             else {
 148                 increment = ( roundingMode == float_round_up ) && absZ1;
 149             }
 150         }
 151     }
 152     if ( increment ) {
 153         ++absZ0;
 154         if ( absZ0 == 0 ) goto overflow;
 155         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 156     }
 157     z = absZ0;
 158     if ( zSign ) z = - z;
 159     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 160  overflow:
 161         float_raise( float_flag_invalid STATUS_VAR);
 162         return
 163               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 164             : LIT64( 0x7FFFFFFFFFFFFFFF );
 165     }
 166     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 167     return z;
 168
 169 }
 170
 171 /*----------------------------------------------------------------------------
 172 | Returns the fraction bits of the single-precision floating-point value `a'.
 173 *----------------------------------------------------------------------------*/
 174
 175 INLINE bits32 extractFloat32Frac( float32 a )
 176 {
 177
 178     return float32_val(a) & 0x007FFFFF;
 179
 180 }
 181
 182 /*----------------------------------------------------------------------------
 183 | Returns the exponent bits of the single-precision floating-point value `a'.
 184 *----------------------------------------------------------------------------*/
 185
 186 INLINE int16 extractFloat32Exp( float32 a )
 187 {
 188
 189     return ( float32_val(a)>>23 ) & 0xFF;
 190
 191 }
 192
 193 /*----------------------------------------------------------------------------
 194 | Returns the sign bit of the single-precision floating-point value `a'.
 195 *----------------------------------------------------------------------------*/
 196
 197 INLINE flag extractFloat32Sign( float32 a )
 198 {
 199
 200     return float32_val(a)>>31;
 201
 202 }
 203
 204 /*----------------------------------------------------------------------------
 205 | If `a' is denormal and we are in flush-to-zero mode then set the
 206 | input-denormal exception and return zero. Otherwise just return the value.
 207 *----------------------------------------------------------------------------*/
 208 static float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
 209 {
 210     if (STATUS(flush_inputs_to_zero)) {
 211         if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
 212             float_raise(float_flag_input_denormal STATUS_VAR);
 213             return make_float32(float32_val(a) & 0x80000000);
 214         }
 215     }
 216     return a;
 217 }
 218
 219 /*----------------------------------------------------------------------------
 220 | Normalizes the subnormal single-precision floating-point value represented
 221 | by the denormalized significand `aSig'.  The normalized exponent and
 222 | significand are stored at the locations pointed to by `zExpPtr' and
 223 | `zSigPtr', respectively.
 224 *----------------------------------------------------------------------------*/
 225
 226 static void
 227  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 228 {
 229     int8 shiftCount;
 230
 231     shiftCount = countLeadingZeros32( aSig ) - 8;
 232     *zSigPtr = aSig<<shiftCount;
 233     *zExpPtr = 1 - shiftCount;
 234
 235 }
 236
 237 /*----------------------------------------------------------------------------
 238 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 239 | single-precision floating-point value, returning the result.  After being
 240 | shifted into the proper positions, the three fields are simply added
 241 | together to form the result.  This means that any integer portion of `zSig'
 242 | will be added into the exponent.  Since a properly normalized significand
 243 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 244 | than the desired result exponent whenever `zSig' is a complete, normalized
 245 | significand.
 246 *----------------------------------------------------------------------------*/
 247
 248 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 249 {
 250
 251     return make_float32(
 252           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
 253
 254 }
 255
 256 /*----------------------------------------------------------------------------
 257 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 258 | and significand `zSig', and returns the proper single-precision floating-
 259 | point value corresponding to the abstract input.  Ordinarily, the abstract
 260 | value is simply rounded and packed into the single-precision format, with
 261 | the inexact exception raised if the abstract input cannot be represented
 262 | exactly.  However, if the abstract value is too large, the overflow and
 263 | inexact exceptions are raised and an infinity or maximal finite value is
 264 | returned.  If the abstract value is too small, the input value is rounded to
 265 | a subnormal number, and the underflow and inexact exceptions are raised if
 266 | the abstract input cannot be represented exactly as a subnormal single-
 267 | precision floating-point number.
 268 |     The input significand `zSig' has its binary point between bits 30
 269 | and 29, which is 7 bits to the left of the usual location.  This shifted
 270 | significand must be normalized or smaller.  If `zSig' is not normalized,
 271 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 272 | and it must not require rounding.  In the usual case that `zSig' is
 273 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 274 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 275 | Binary Floating-Point Arithmetic.
 276 *----------------------------------------------------------------------------*/
 277
 278 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 279 {
 280     int8 roundingMode;
 281     flag roundNearestEven;
 282     int8 roundIncrement, roundBits;
 283     flag isTiny;
 284
 285     roundingMode = STATUS(float_rounding_mode);
 286     roundNearestEven = ( roundingMode == float_round_nearest_even );
 287     roundIncrement = 0x40;
 288     if ( ! roundNearestEven ) {
 289         if ( roundingMode == float_round_to_zero ) {
 290             roundIncrement = 0;
 291         }
 292         else {
 293             roundIncrement = 0x7F;
 294             if ( zSign ) {
 295                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 296             }
 297             else {
 298                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 299             }
 300         }
 301     }
 302     roundBits = zSig & 0x7F;
 303     if ( 0xFD <= (bits16) zExp ) {
 304         if (    ( 0xFD < zExp )
 305              || (    ( zExp == 0xFD )
 306                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 307            ) {
 308             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 309             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
 310         }
 311         if ( zExp < 0 ) {
 312             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
 313             isTiny =
 314                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 315                 || ( zExp < -1 )
 316                 || ( zSig + roundIncrement < 0x80000000 );
 317             shift32RightJamming( zSig, - zExp, &zSig );
 318             zExp = 0;
 319             roundBits = zSig & 0x7F;
 320             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 321         }
 322     }
 323     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 324     zSig = ( zSig + roundIncrement )>>7;
 325     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 326     if ( zSig == 0 ) zExp = 0;
 327     return packFloat32( zSign, zExp, zSig );
 328
 329 }
 330
 331 /*----------------------------------------------------------------------------
 332 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 333 | and significand `zSig', and returns the proper single-precision floating-
 334 | point value corresponding to the abstract input.  This routine is just like
 335 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 336 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 337 | floating-point exponent.
 338 *----------------------------------------------------------------------------*/
 339
 340 static float32
 341  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
 342 {
 343     int8 shiftCount;
 344
 345     shiftCount = countLeadingZeros32( zSig ) - 1;
 346     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 347
 348 }
 349
 350 /*----------------------------------------------------------------------------
 351 | Returns the fraction bits of the double-precision floating-point value `a'.
 352 *----------------------------------------------------------------------------*/
 353
 354 INLINE bits64 extractFloat64Frac( float64 a )
 355 {
 356
 357     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
 358
 359 }
 360
 361 /*----------------------------------------------------------------------------
 362 | Returns the exponent bits of the double-precision floating-point value `a'.
 363 *----------------------------------------------------------------------------*/
 364
 365 INLINE int16 extractFloat64Exp( float64 a )
 366 {
 367
 368     return ( float64_val(a)>>52 ) & 0x7FF;
 369
 370 }
 371
 372 /*----------------------------------------------------------------------------
 373 | Returns the sign bit of the double-precision floating-point value `a'.
 374 *----------------------------------------------------------------------------*/
 375
 376 INLINE flag extractFloat64Sign( float64 a )
 377 {
 378
 379     return float64_val(a)>>63;
 380
 381 }
 382
 383 /*----------------------------------------------------------------------------
 384 | If `a' is denormal and we are in flush-to-zero mode then set the
 385 | input-denormal exception and return zero. Otherwise just return the value.
 386 *----------------------------------------------------------------------------*/
 387 static float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
 388 {
 389     if (STATUS(flush_inputs_to_zero)) {
 390         if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
 391             float_raise(float_flag_input_denormal STATUS_VAR);
 392             return make_float64(float64_val(a) & (1ULL << 63));
 393         }
 394     }
 395     return a;
 396 }
 397
 398 /*----------------------------------------------------------------------------
 399 | Normalizes the subnormal double-precision floating-point value represented
 400 | by the denormalized significand `aSig'.  The normalized exponent and
 401 | significand are stored at the locations pointed to by `zExpPtr' and
 402 | `zSigPtr', respectively.
 403 *----------------------------------------------------------------------------*/
 404
 405 static void
 406  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 407 {
 408     int8 shiftCount;
 409
 410     shiftCount = countLeadingZeros64( aSig ) - 11;
 411     *zSigPtr = aSig<<shiftCount;
 412     *zExpPtr = 1 - shiftCount;
 413
 414 }
 415
 416 /*----------------------------------------------------------------------------
 417 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 418 | double-precision floating-point value, returning the result.  After being
 419 | shifted into the proper positions, the three fields are simply added
 420 | together to form the result.  This means that any integer portion of `zSig'
 421 | will be added into the exponent.  Since a properly normalized significand
 422 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 423 | than the desired result exponent whenever `zSig' is a complete, normalized
 424 | significand.
 425 *----------------------------------------------------------------------------*/
 426
 427 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 428 {
 429
 430     return make_float64(
 431         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
 432
 433 }
 434
 435 /*----------------------------------------------------------------------------
 436 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 437 | and significand `zSig', and returns the proper double-precision floating-
 438 | point value corresponding to the abstract input.  Ordinarily, the abstract
 439 | value is simply rounded and packed into the double-precision format, with
 440 | the inexact exception raised if the abstract input cannot be represented
 441 | exactly.  However, if the abstract value is too large, the overflow and
 442 | inexact exceptions are raised and an infinity or maximal finite value is
 443 | returned.  If the abstract value is too small, the input value is rounded
 444 | to a subnormal number, and the underflow and inexact exceptions are raised
 445 | if the abstract input cannot be represented exactly as a subnormal double-
 446 | precision floating-point number.
 447 |     The input significand `zSig' has its binary point between bits 62
 448 | and 61, which is 10 bits to the left of the usual location.  This shifted
 449 | significand must be normalized or smaller.  If `zSig' is not normalized,
 450 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 451 | and it must not require rounding.  In the usual case that `zSig' is
 452 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 453 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 454 | Binary Floating-Point Arithmetic.
 455 *----------------------------------------------------------------------------*/
 456
 457 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 458 {
 459     int8 roundingMode;
 460     flag roundNearestEven;
 461     int16 roundIncrement, roundBits;
 462     flag isTiny;
 463
 464     roundingMode = STATUS(float_rounding_mode);
 465     roundNearestEven = ( roundingMode == float_round_nearest_even );
 466     roundIncrement = 0x200;
 467     if ( ! roundNearestEven ) {
 468         if ( roundingMode == float_round_to_zero ) {
 469             roundIncrement = 0;
 470         }
 471         else {
 472             roundIncrement = 0x3FF;
 473             if ( zSign ) {
 474                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 475             }
 476             else {
 477                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 478             }
 479         }
 480     }
 481     roundBits = zSig & 0x3FF;
 482     if ( 0x7FD <= (bits16) zExp ) {
 483         if (    ( 0x7FD < zExp )
 484              || (    ( zExp == 0x7FD )
 485                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 486            ) {
 487             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 488             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
 489         }
 490         if ( zExp < 0 ) {
 491             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
 492             isTiny =
 493                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 494                 || ( zExp < -1 )
 495                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 496             shift64RightJamming( zSig, - zExp, &zSig );
 497             zExp = 0;
 498             roundBits = zSig & 0x3FF;
 499             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 500         }
 501     }
 502     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 503     zSig = ( zSig + roundIncrement )>>10;
 504     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 505     if ( zSig == 0 ) zExp = 0;
 506     return packFloat64( zSign, zExp, zSig );
 507
 508 }
 509
 510 /*----------------------------------------------------------------------------
 511 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 512 | and significand `zSig', and returns the proper double-precision floating-
 513 | point value corresponding to the abstract input.  This routine is just like
 514 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 515 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 516 | floating-point exponent.
 517 *----------------------------------------------------------------------------*/
 518
 519 static float64
 520  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
 521 {
 522     int8 shiftCount;
 523
 524     shiftCount = countLeadingZeros64( zSig ) - 1;
 525     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
 526
 527 }
 528
 529 #ifdef FLOATX80
 530
 531 /*----------------------------------------------------------------------------
 532 | Returns the fraction bits of the extended double-precision floating-point
 533 | value `a'.
 534 *----------------------------------------------------------------------------*/
 535
 536 INLINE bits64 extractFloatx80Frac( floatx80 a )
 537 {
 538
 539     return a.low;
 540
 541 }
 542
 543 /*----------------------------------------------------------------------------
 544 | Returns the exponent bits of the extended double-precision floating-point
 545 | value `a'.
 546 *----------------------------------------------------------------------------*/
 547
 548 INLINE int32 extractFloatx80Exp( floatx80 a )
 549 {
 550
 551     return a.high & 0x7FFF;
 552
 553 }
 554
 555 /*----------------------------------------------------------------------------
 556 | Returns the sign bit of the extended double-precision floating-point value
 557 | `a'.
 558 *----------------------------------------------------------------------------*/
 559
 560 INLINE flag extractFloatx80Sign( floatx80 a )
 561 {
 562
 563     return a.high>>15;
 564
 565 }
 566
 567 /*----------------------------------------------------------------------------
 568 | Normalizes the subnormal extended double-precision floating-point value
 569 | represented by the denormalized significand `aSig'.  The normalized exponent
 570 | and significand are stored at the locations pointed to by `zExpPtr' and
 571 | `zSigPtr', respectively.
 572 *----------------------------------------------------------------------------*/
 573
 574 static void
 575  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 576 {
 577     int8 shiftCount;
 578
 579     shiftCount = countLeadingZeros64( aSig );
 580     *zSigPtr = aSig<<shiftCount;
 581     *zExpPtr = 1 - shiftCount;
 582
 583 }
 584
 585 /*----------------------------------------------------------------------------
 586 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 587 | extended double-precision floating-point value, returning the result.
 588 *----------------------------------------------------------------------------*/
 589
 590 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 591 {
 592     floatx80 z;
 593
 594     z.low = zSig;
 595     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 596     return z;
 597
 598 }
 599
 600 /*----------------------------------------------------------------------------
 601 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 602 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 603 | and returns the proper extended double-precision floating-point value
 604 | corresponding to the abstract input.  Ordinarily, the abstract value is
 605 | rounded and packed into the extended double-precision format, with the
 606 | inexact exception raised if the abstract input cannot be represented
 607 | exactly.  However, if the abstract value is too large, the overflow and
 608 | inexact exceptions are raised and an infinity or maximal finite value is
 609 | returned.  If the abstract value is too small, the input value is rounded to
 610 | a subnormal number, and the underflow and inexact exceptions are raised if
 611 | the abstract input cannot be represented exactly as a subnormal extended
 612 | double-precision floating-point number.
 613 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 614 | number of bits as single or double precision, respectively.  Otherwise, the
 615 | result is rounded to the full precision of the extended double-precision
 616 | format.
 617 |     The input significand must be normalized or smaller.  If the input
 618 | significand is not normalized, `zExp' must be 0; in that case, the result
 619 | returned is a subnormal number, and it must not require rounding.  The
 620 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 621 | Floating-Point Arithmetic.
 622 *----------------------------------------------------------------------------*/
 623
 624 static floatx80
 625  roundAndPackFloatx80(
 626      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 627  STATUS_PARAM)
 628 {
 629     int8 roundingMode;
 630     flag roundNearestEven, increment, isTiny;
 631     int64 roundIncrement, roundMask, roundBits;
 632
 633     roundingMode = STATUS(float_rounding_mode);
 634     roundNearestEven = ( roundingMode == float_round_nearest_even );
 635     if ( roundingPrecision == 80 ) goto precision80;
 636     if ( roundingPrecision == 64 ) {
 637         roundIncrement = LIT64( 0x0000000000000400 );
 638         roundMask = LIT64( 0x00000000000007FF );
 639     }
 640     else if ( roundingPrecision == 32 ) {
 641         roundIncrement = LIT64( 0x0000008000000000 );
 642         roundMask = LIT64( 0x000000FFFFFFFFFF );
 643     }
 644     else {
 645         goto precision80;
 646     }
 647     zSig0 |= ( zSig1 != 0 );
 648     if ( ! roundNearestEven ) {
 649         if ( roundingMode == float_round_to_zero ) {
 650             roundIncrement = 0;
 651         }
 652         else {
 653             roundIncrement = roundMask;
 654             if ( zSign ) {
 655                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 656             }
 657             else {
 658                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 659             }
 660         }
 661     }
 662     roundBits = zSig0 & roundMask;
 663     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 664         if (    ( 0x7FFE < zExp )
 665              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 666            ) {
 667             goto overflow;
 668         }
 669         if ( zExp <= 0 ) {
 670             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
 671             isTiny =
 672                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 673                 || ( zExp < 0 )
 674                 || ( zSig0 <= zSig0 + roundIncrement );
 675             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 676             zExp = 0;
 677             roundBits = zSig0 & roundMask;
 678             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
 679             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 680             zSig0 += roundIncrement;
 681             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 682             roundIncrement = roundMask + 1;
 683             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 684                 roundMask |= roundIncrement;
 685             }
 686             zSig0 &= ~ roundMask;
 687             return packFloatx80( zSign, zExp, zSig0 );
 688         }
 689     }
 690     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
 691     zSig0 += roundIncrement;
 692     if ( zSig0 < roundIncrement ) {
 693         ++zExp;
 694         zSig0 = LIT64( 0x8000000000000000 );
 695     }
 696     roundIncrement = roundMask + 1;
 697     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 698         roundMask |= roundIncrement;
 699     }
 700     zSig0 &= ~ roundMask;
 701     if ( zSig0 == 0 ) zExp = 0;
 702     return packFloatx80( zSign, zExp, zSig0 );
 703  precision80:
 704     increment = ( (sbits64) zSig1 < 0 );
 705     if ( ! roundNearestEven ) {
 706         if ( roundingMode == float_round_to_zero ) {
 707             increment = 0;
 708         }
 709         else {
 710             if ( zSign ) {
 711                 increment = ( roundingMode == float_round_down ) && zSig1;
 712             }
 713             else {
 714                 increment = ( roundingMode == float_round_up ) && zSig1;
 715             }
 716         }
 717     }
 718     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 719         if (    ( 0x7FFE < zExp )
 720              || (    ( zExp == 0x7FFE )
 721                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 722                   && increment
 723                 )
 724            ) {
 725             roundMask = 0;
 726  overflow:
 727             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 728             if (    ( roundingMode == float_round_to_zero )
 729                  || ( zSign && ( roundingMode == float_round_up ) )
 730                  || ( ! zSign && ( roundingMode == float_round_down ) )
 731                ) {
 732                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 733             }
 734             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 735         }
 736         if ( zExp <= 0 ) {
 737             isTiny =
 738                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
 739                 || ( zExp < 0 )
 740                 || ! increment
 741                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 742             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 743             zExp = 0;
 744             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
 745             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 746             if ( roundNearestEven ) {
 747                 increment = ( (sbits64) zSig1 < 0 );
 748             }
 749             else {
 750                 if ( zSign ) {
 751                     increment = ( roundingMode == float_round_down ) && zSig1;
 752                 }
 753                 else {
 754                     increment = ( roundingMode == float_round_up ) && zSig1;
 755                 }
 756             }
 757             if ( increment ) {
 758                 ++zSig0;
 759                 zSig0 &=
 760                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 761                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 762             }
 763             return packFloatx80( zSign, zExp, zSig0 );
 764         }
 765     }
 766     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
 767     if ( increment ) {
 768         ++zSig0;
 769         if ( zSig0 == 0 ) {
 770             ++zExp;
 771             zSig0 = LIT64( 0x8000000000000000 );
 772         }
 773         else {
 774             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 775         }
 776     }
 777     else {
 778         if ( zSig0 == 0 ) zExp = 0;
 779     }
 780     return packFloatx80( zSign, zExp, zSig0 );
 781
 782 }
 783
 784 /*----------------------------------------------------------------------------
 785 | Takes an abstract floating-point value having sign `zSign', exponent
 786 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 787 | and returns the proper extended double-precision floating-point value
 788 | corresponding to the abstract input.  This routine is just like
 789 | `roundAndPackFloatx80' except that the input significand does not have to be
 790 | normalized.
 791 *----------------------------------------------------------------------------*/
 792
 793 static floatx80
 794  normalizeRoundAndPackFloatx80(
 795      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 796  STATUS_PARAM)
 797 {
 798     int8 shiftCount;
 799
 800     if ( zSig0 == 0 ) {
 801         zSig0 = zSig1;
 802         zSig1 = 0;
 803         zExp -= 64;
 804     }
 805     shiftCount = countLeadingZeros64( zSig0 );
 806     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 807     zExp -= shiftCount;
 808     return
 809         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
 810
 811 }
 812
 813 #endif
 814
 815 #ifdef FLOAT128
 816
 817 /*----------------------------------------------------------------------------
 818 | Returns the least-significant 64 fraction bits of the quadruple-precision
 819 | floating-point value `a'.
 820 *----------------------------------------------------------------------------*/
 821
 822 INLINE bits64 extractFloat128Frac1( float128 a )
 823 {
 824
 825     return a.low;
 826
 827 }
 828
 829 /*----------------------------------------------------------------------------
 830 | Returns the most-significant 48 fraction bits of the quadruple-precision
 831 | floating-point value `a'.
 832 *----------------------------------------------------------------------------*/
 833
 834 INLINE bits64 extractFloat128Frac0( float128 a )
 835 {
 836
 837     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 838
 839 }
 840
 841 /*----------------------------------------------------------------------------
 842 | Returns the exponent bits of the quadruple-precision floating-point value
 843 | `a'.
 844 *----------------------------------------------------------------------------*/
 845
 846 INLINE int32 extractFloat128Exp( float128 a )
 847 {
 848
 849     return ( a.high>>48 ) & 0x7FFF;
 850
 851 }
 852
 853 /*----------------------------------------------------------------------------
 854 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 855 *----------------------------------------------------------------------------*/
 856
 857 INLINE flag extractFloat128Sign( float128 a )
 858 {
 859
 860     return a.high>>63;
 861
 862 }
 863
 864 /*----------------------------------------------------------------------------
 865 | Normalizes the subnormal quadruple-precision floating-point value
 866 | represented by the denormalized significand formed by the concatenation of
 867 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 868 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 869 | significand are stored at the location pointed to by `zSig0Ptr', and the
 870 | least significant 64 bits of the normalized significand are stored at the
 871 | location pointed to by `zSig1Ptr'.
 872 *----------------------------------------------------------------------------*/
 873
 874 static void
 875  normalizeFloat128Subnormal(
 876      bits64 aSig0,
 877      bits64 aSig1,
 878      int32 *zExpPtr,
 879      bits64 *zSig0Ptr,
 880      bits64 *zSig1Ptr
 881  )
 882 {
 883     int8 shiftCount;
 884
 885     if ( aSig0 == 0 ) {
 886         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 887         if ( shiftCount < 0 ) {
 888             *zSig0Ptr = aSig1>>( - shiftCount );
 889             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 890         }
 891         else {
 892             *zSig0Ptr = aSig1<<shiftCount;
 893             *zSig1Ptr = 0;
 894         }
 895         *zExpPtr = - shiftCount - 63;
 896     }
 897     else {
 898         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 899         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 900         *zExpPtr = 1 - shiftCount;
 901     }
 902
 903 }
 904
 905 /*----------------------------------------------------------------------------
 906 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 907 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 908 | floating-point value, returning the result.  After being shifted into the
 909 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 910 | added together to form the most significant 32 bits of the result.  This
 911 | means that any integer portion of `zSig0' will be added into the exponent.
 912 | Since a properly normalized significand will have an integer portion equal
 913 | to 1, the `zExp' input should be 1 less than the desired result exponent
 914 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 915 | significand.
 916 *----------------------------------------------------------------------------*/
 917
 918 INLINE float128
 919  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 920 {
 921     float128 z;
 922
 923     z.low = zSig1;
 924     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 925     return z;
 926
 927 }
 928
 929 /*----------------------------------------------------------------------------
 930 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 931 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 932 | and `zSig2', and returns the proper quadruple-precision floating-point value
 933 | corresponding to the abstract input.  Ordinarily, the abstract value is
 934 | simply rounded and packed into the quadruple-precision format, with the
 935 | inexact exception raised if the abstract input cannot be represented
 936 | exactly.  However, if the abstract value is too large, the overflow and
 937 | inexact exceptions are raised and an infinity or maximal finite value is
 938 | returned.  If the abstract value is too small, the input value is rounded to
 939 | a subnormal number, and the underflow and inexact exceptions are raised if
 940 | the abstract input cannot be represented exactly as a subnormal quadruple-
 941 | precision floating-point number.
 942 |     The input significand must be normalized or smaller.  If the input
 943 | significand is not normalized, `zExp' must be 0; in that case, the result
 944 | returned is a subnormal number, and it must not require rounding.  In the
 945 | usual case that the input significand is normalized, `zExp' must be 1 less
 946 | than the ``true'' floating-point exponent.  The handling of underflow and
 947 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 948 *----------------------------------------------------------------------------*/
 949
 950 static float128
 951  roundAndPackFloat128(
 952      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
 953 {
 954     int8 roundingMode;
 955     flag roundNearestEven, increment, isTiny;
 956
 957     roundingMode = STATUS(float_rounding_mode);
 958     roundNearestEven = ( roundingMode == float_round_nearest_even );
 959     increment = ( (sbits64) zSig2 < 0 );
 960     if ( ! roundNearestEven ) {
 961         if ( roundingMode == float_round_to_zero ) {
 962             increment = 0;
 963         }
 964         else {
 965             if ( zSign ) {
 966                 increment = ( roundingMode == float_round_down ) && zSig2;
 967             }
 968             else {
 969                 increment = ( roundingMode == float_round_up ) && zSig2;
 970             }
 971         }
 972     }
 973     if ( 0x7FFD <= (bits32) zExp ) {
 974         if (    ( 0x7FFD < zExp )
 975              || (    ( zExp == 0x7FFD )
 976                   && eq128(
 977                          LIT64( 0x0001FFFFFFFFFFFF ),
 978                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 979                          zSig0,
 980                          zSig1
 981                      )
 982                   && increment
 983                 )
 984            ) {
 985             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
 986             if (    ( roundingMode == float_round_to_zero )
 987                  || ( zSign && ( roundingMode == float_round_up ) )
 988                  || ( ! zSign && ( roundingMode == float_round_down ) )
 989                ) {
 990                 return
 991                     packFloat128(
 992                         zSign,
 993                         0x7FFE,
 994                         LIT64( 0x0000FFFFFFFFFFFF ),
 995                         LIT64( 0xFFFFFFFFFFFFFFFF )
 996                     );
 997             }
 998             return packFloat128( zSign, 0x7FFF, 0, 0 );
 999         }
1000         if ( zExp < 0 ) {
1001             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
1002             isTiny =
1003                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1004                 || ( zExp < -1 )
1005                 || ! increment
1006                 || lt128(
1007                        zSig0,
1008                        zSig1,
1009                        LIT64( 0x0001FFFFFFFFFFFF ),
1010                        LIT64( 0xFFFFFFFFFFFFFFFF )
1011                    );
1012             shift128ExtraRightJamming(
1013                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1014             zExp = 0;
1015             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
1016             if ( roundNearestEven ) {
1017                 increment = ( (sbits64) zSig2 < 0 );
1018             }
1019             else {
1020                 if ( zSign ) {
1021                     increment = ( roundingMode == float_round_down ) && zSig2;
1022                 }
1023                 else {
1024                     increment = ( roundingMode == float_round_up ) && zSig2;
1025                 }
1026             }
1027         }
1028     }
1029     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1030     if ( increment ) {
1031         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1032         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1033     }
1034     else {
1035         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1036     }
1037     return packFloat128( zSign, zExp, zSig0, zSig1 );
1038
1039 }
1040
1041 /*----------------------------------------------------------------------------
1042 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1043 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1044 | returns the proper quadruple-precision floating-point value corresponding
1045 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1046 | except that the input significand has fewer bits and does not have to be
1047 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1048 | point exponent.
1049 *----------------------------------------------------------------------------*/
1050
1051 static float128
1052  normalizeRoundAndPackFloat128(
1053      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
1054 {
1055     int8 shiftCount;
1056     bits64 zSig2;
1057
1058     if ( zSig0 == 0 ) {
1059         zSig0 = zSig1;
1060         zSig1 = 0;
1061         zExp -= 64;
1062     }
1063     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1064     if ( 0 <= shiftCount ) {
1065         zSig2 = 0;
1066         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1067     }
1068     else {
1069         shift128ExtraRightJamming(
1070             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1071     }
1072     zExp -= shiftCount;
1073     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1074
1075 }
1076
1077 #endif
1078
1079 /*----------------------------------------------------------------------------
1080 | Returns the result of converting the 32-bit two's complement integer `a'
1081 | to the single-precision floating-point format.  The conversion is performed
1082 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1083 *----------------------------------------------------------------------------*/
1084
1085 float32 int32_to_float32( int32 a STATUS_PARAM )
1086 {
1087     flag zSign;
1088
1089     if ( a == 0 ) return float32_zero;
1090     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1091     zSign = ( a < 0 );
1092     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1093
1094 }
1095
1096 /*----------------------------------------------------------------------------
1097 | Returns the result of converting the 32-bit two's complement integer `a'
1098 | to the double-precision floating-point format.  The conversion is performed
1099 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1100 *----------------------------------------------------------------------------*/
1101
1102 float64 int32_to_float64( int32 a STATUS_PARAM )
1103 {
1104     flag zSign;
1105     uint32 absA;
1106     int8 shiftCount;
1107     bits64 zSig;
1108
1109     if ( a == 0 ) return float64_zero;
1110     zSign = ( a < 0 );
1111     absA = zSign ? - a : a;
1112     shiftCount = countLeadingZeros32( absA ) + 21;
1113     zSig = absA;
1114     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1115
1116 }
1117
1118 #ifdef FLOATX80
1119
1120 /*----------------------------------------------------------------------------
1121 | Returns the result of converting the 32-bit two's complement integer `a'
1122 | to the extended double-precision floating-point format.  The conversion
1123 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1124 | Arithmetic.
1125 *----------------------------------------------------------------------------*/
1126
1127 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
1128 {
1129     flag zSign;
1130     uint32 absA;
1131     int8 shiftCount;
1132     bits64 zSig;
1133
1134     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1135     zSign = ( a < 0 );
1136     absA = zSign ? - a : a;
1137     shiftCount = countLeadingZeros32( absA ) + 32;
1138     zSig = absA;
1139     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1140
1141 }
1142
1143 #endif
1144
1145 #ifdef FLOAT128
1146
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a' to
1149 | the quadruple-precision floating-point format.  The conversion is performed
1150 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1151 *----------------------------------------------------------------------------*/
1152
1153 float128 int32_to_float128( int32 a STATUS_PARAM )
1154 {
1155     flag zSign;
1156     uint32 absA;
1157     int8 shiftCount;
1158     bits64 zSig0;
1159
1160     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1161     zSign = ( a < 0 );
1162     absA = zSign ? - a : a;
1163     shiftCount = countLeadingZeros32( absA ) + 17;
1164     zSig0 = absA;
1165     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1166
1167 }
1168
1169 #endif
1170
1171 /*----------------------------------------------------------------------------
1172 | Returns the result of converting the 64-bit two's complement integer `a'
1173 | to the single-precision floating-point format.  The conversion is performed
1174 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1175 *----------------------------------------------------------------------------*/
1176
1177 float32 int64_to_float32( int64 a STATUS_PARAM )
1178 {
1179     flag zSign;
1180     uint64 absA;
1181     int8 shiftCount;
1182
1183     if ( a == 0 ) return float32_zero;
1184     zSign = ( a < 0 );
1185     absA = zSign ? - a : a;
1186     shiftCount = countLeadingZeros64( absA ) - 40;
1187     if ( 0 <= shiftCount ) {
1188         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1189     }
1190     else {
1191         shiftCount += 7;
1192         if ( shiftCount < 0 ) {
1193             shift64RightJamming( absA, - shiftCount, &absA );
1194         }
1195         else {
1196             absA <<= shiftCount;
1197         }
1198         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1199     }
1200
1201 }
1202
1203 float32 uint64_to_float32( uint64 a STATUS_PARAM )
1204 {
1205     int8 shiftCount;
1206
1207     if ( a == 0 ) return float32_zero;
1208     shiftCount = countLeadingZeros64( a ) - 40;
1209     if ( 0 <= shiftCount ) {
1210         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
1211     }
1212     else {
1213         shiftCount += 7;
1214         if ( shiftCount < 0 ) {
1215             shift64RightJamming( a, - shiftCount, &a );
1216         }
1217         else {
1218             a <<= shiftCount;
1219         }
1220         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
1221     }
1222 }
1223
1224 /*----------------------------------------------------------------------------
1225 | Returns the result of converting the 64-bit two's complement integer `a'
1226 | to the double-precision floating-point format.  The conversion is performed
1227 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1228 *----------------------------------------------------------------------------*/
1229
1230 float64 int64_to_float64( int64 a STATUS_PARAM )
1231 {
1232     flag zSign;
1233
1234     if ( a == 0 ) return float64_zero;
1235     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1236         return packFloat64( 1, 0x43E, 0 );
1237     }
1238     zSign = ( a < 0 );
1239     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1240
1241 }
1242
1243 float64 uint64_to_float64( uint64 a STATUS_PARAM )
1244 {
1245     if ( a == 0 ) return float64_zero;
1246     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
1247
1248 }
1249
1250 #ifdef FLOATX80
1251
1252 /*----------------------------------------------------------------------------
1253 | Returns the result of converting the 64-bit two's complement integer `a'
1254 | to the extended double-precision floating-point format.  The conversion
1255 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1256 | Arithmetic.
1257 *----------------------------------------------------------------------------*/
1258
1259 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
1260 {
1261     flag zSign;
1262     uint64 absA;
1263     int8 shiftCount;
1264
1265     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1266     zSign = ( a < 0 );
1267     absA = zSign ? - a : a;
1268     shiftCount = countLeadingZeros64( absA );
1269     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1270
1271 }
1272
1273 #endif
1274
1275 #ifdef FLOAT128
1276
1277 /*----------------------------------------------------------------------------
1278 | Returns the result of converting the 64-bit two's complement integer `a' to
1279 | the quadruple-precision floating-point format.  The conversion is performed
1280 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1281 *----------------------------------------------------------------------------*/
1282
1283 float128 int64_to_float128( int64 a STATUS_PARAM )
1284 {
1285     flag zSign;
1286     uint64 absA;
1287     int8 shiftCount;
1288     int32 zExp;
1289     bits64 zSig0, zSig1;
1290
1291     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1292     zSign = ( a < 0 );
1293     absA = zSign ? - a : a;
1294     shiftCount = countLeadingZeros64( absA ) + 49;
1295     zExp = 0x406E - shiftCount;
1296     if ( 64 <= shiftCount ) {
1297         zSig1 = 0;
1298         zSig0 = absA;
1299         shiftCount -= 64;
1300     }
1301     else {
1302         zSig1 = absA;
1303         zSig0 = 0;
1304     }
1305     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1306     return packFloat128( zSign, zExp, zSig0, zSig1 );
1307
1308 }
1309
1310 #endif
1311
1312 /*----------------------------------------------------------------------------
1313 | Returns the result of converting the single-precision floating-point value
1314 | `a' to the 32-bit two's complement integer format.  The conversion is
1315 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1316 | Arithmetic---which means in particular that the conversion is rounded
1317 | according to the current rounding mode.  If `a' is a NaN, the largest
1318 | positive integer is returned.  Otherwise, if the conversion overflows, the
1319 | largest integer with the same sign as `a' is returned.
1320 *----------------------------------------------------------------------------*/
1321
1322 int32 float32_to_int32( float32 a STATUS_PARAM )
1323 {
1324     flag aSign;
1325     int16 aExp, shiftCount;
1326     bits32 aSig;
1327     bits64 aSig64;
1328
1329     a = float32_squash_input_denormal(a STATUS_VAR);
1330     aSig = extractFloat32Frac( a );
1331     aExp = extractFloat32Exp( a );
1332     aSign = extractFloat32Sign( a );
1333     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1334     if ( aExp ) aSig |= 0x00800000;
1335     shiftCount = 0xAF - aExp;
1336     aSig64 = aSig;
1337     aSig64 <<= 32;
1338     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1339     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1340
1341 }
1342
1343 /*----------------------------------------------------------------------------
1344 | Returns the result of converting the single-precision floating-point value
1345 | `a' to the 32-bit two's complement integer format.  The conversion is
1346 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1347 | Arithmetic, except that the conversion is always rounded toward zero.
1348 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1349 | the conversion overflows, the largest integer with the same sign as `a' is
1350 | returned.
1351 *----------------------------------------------------------------------------*/
1352
1353 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1354 {
1355     flag aSign;
1356     int16 aExp, shiftCount;
1357     bits32 aSig;
1358     int32 z;
1359     a = float32_squash_input_denormal(a STATUS_VAR);
1360
1361     aSig = extractFloat32Frac( a );
1362     aExp = extractFloat32Exp( a );
1363     aSign = extractFloat32Sign( a );
1364     shiftCount = aExp - 0x9E;
1365     if ( 0 <= shiftCount ) {
1366         if ( float32_val(a) != 0xCF000000 ) {
1367             float_raise( float_flag_invalid STATUS_VAR);
1368             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1369         }
1370         return (sbits32) 0x80000000;
1371     }
1372     else if ( aExp <= 0x7E ) {
1373         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1374         return 0;
1375     }
1376     aSig = ( aSig | 0x00800000 )<<8;
1377     z = aSig>>( - shiftCount );
1378     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1379         STATUS(float_exception_flags) |= float_flag_inexact;
1380     }
1381     if ( aSign ) z = - z;
1382     return z;
1383
1384 }
1385
1386 /*----------------------------------------------------------------------------
1387 | Returns the result of converting the single-precision floating-point value
1388 | `a' to the 16-bit two's complement integer format.  The conversion is
1389 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1390 | Arithmetic, except that the conversion is always rounded toward zero.
1391 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1392 | the conversion overflows, the largest integer with the same sign as `a' is
1393 | returned.
1394 *----------------------------------------------------------------------------*/
1395
1396 int16 float32_to_int16_round_to_zero( float32 a STATUS_PARAM )
1397 {
1398     flag aSign;
1399     int16 aExp, shiftCount;
1400     bits32 aSig;
1401     int32 z;
1402
1403     aSig = extractFloat32Frac( a );
1404     aExp = extractFloat32Exp( a );
1405     aSign = extractFloat32Sign( a );
1406     shiftCount = aExp - 0x8E;
1407     if ( 0 <= shiftCount ) {
1408         if ( float32_val(a) != 0xC7000000 ) {
1409             float_raise( float_flag_invalid STATUS_VAR);
1410             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1411                 return 0x7FFF;
1412             }
1413         }
1414         return (sbits32) 0xffff8000;
1415     }
1416     else if ( aExp <= 0x7E ) {
1417         if ( aExp | aSig ) {
1418             STATUS(float_exception_flags) |= float_flag_inexact;
1419         }
1420         return 0;
1421     }
1422     shiftCount -= 0x10;
1423     aSig = ( aSig | 0x00800000 )<<8;
1424     z = aSig>>( - shiftCount );
1425     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1426         STATUS(float_exception_flags) |= float_flag_inexact;
1427     }
1428     if ( aSign ) {
1429         z = - z;
1430     }
1431     return z;
1432
1433 }
1434
1435 /*----------------------------------------------------------------------------
1436 | Returns the result of converting the single-precision floating-point value
1437 | `a' to the 64-bit two's complement integer format.  The conversion is
1438 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1439 | Arithmetic---which means in particular that the conversion is rounded
1440 | according to the current rounding mode.  If `a' is a NaN, the largest
1441 | positive integer is returned.  Otherwise, if the conversion overflows, the
1442 | largest integer with the same sign as `a' is returned.
1443 *----------------------------------------------------------------------------*/
1444
1445 int64 float32_to_int64( float32 a STATUS_PARAM )
1446 {
1447     flag aSign;
1448     int16 aExp, shiftCount;
1449     bits32 aSig;
1450     bits64 aSig64, aSigExtra;
1451     a = float32_squash_input_denormal(a STATUS_VAR);
1452
1453     aSig = extractFloat32Frac( a );
1454     aExp = extractFloat32Exp( a );
1455     aSign = extractFloat32Sign( a );
1456     shiftCount = 0xBE - aExp;
1457     if ( shiftCount < 0 ) {
1458         float_raise( float_flag_invalid STATUS_VAR);
1459         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1460             return LIT64( 0x7FFFFFFFFFFFFFFF );
1461         }
1462         return (sbits64) LIT64( 0x8000000000000000 );
1463     }
1464     if ( aExp ) aSig |= 0x00800000;
1465     aSig64 = aSig;
1466     aSig64 <<= 40;
1467     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1468     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1469
1470 }
1471
1472 /*----------------------------------------------------------------------------
1473 | Returns the result of converting the single-precision floating-point value
1474 | `a' to the 64-bit two's complement integer format.  The conversion is
1475 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1476 | Arithmetic, except that the conversion is always rounded toward zero.  If
1477 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1478 | conversion overflows, the largest integer with the same sign as `a' is
1479 | returned.
1480 *----------------------------------------------------------------------------*/
1481
1482 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1483 {
1484     flag aSign;
1485     int16 aExp, shiftCount;
1486     bits32 aSig;
1487     bits64 aSig64;
1488     int64 z;
1489     a = float32_squash_input_denormal(a STATUS_VAR);
1490
1491     aSig = extractFloat32Frac( a );
1492     aExp = extractFloat32Exp( a );
1493     aSign = extractFloat32Sign( a );
1494     shiftCount = aExp - 0xBE;
1495     if ( 0 <= shiftCount ) {
1496         if ( float32_val(a) != 0xDF000000 ) {
1497             float_raise( float_flag_invalid STATUS_VAR);
1498             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1499                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1500             }
1501         }
1502         return (sbits64) LIT64( 0x8000000000000000 );
1503     }
1504     else if ( aExp <= 0x7E ) {
1505         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1506         return 0;
1507     }
1508     aSig64 = aSig | 0x00800000;
1509     aSig64 <<= 40;
1510     z = aSig64>>( - shiftCount );
1511     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1512         STATUS(float_exception_flags) |= float_flag_inexact;
1513     }
1514     if ( aSign ) z = - z;
1515     return z;
1516
1517 }
1518
1519 /*----------------------------------------------------------------------------
1520 | Returns the result of converting the single-precision floating-point value
1521 | `a' to the double-precision floating-point format.  The conversion is
1522 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1523 | Arithmetic.
1524 *----------------------------------------------------------------------------*/
1525
1526 float64 float32_to_float64( float32 a STATUS_PARAM )
1527 {
1528     flag aSign;
1529     int16 aExp;
1530     bits32 aSig;
1531     a = float32_squash_input_denormal(a STATUS_VAR);
1532
1533     aSig = extractFloat32Frac( a );
1534     aExp = extractFloat32Exp( a );
1535     aSign = extractFloat32Sign( a );
1536     if ( aExp == 0xFF ) {
1537         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
1538         return packFloat64( aSign, 0x7FF, 0 );
1539     }
1540     if ( aExp == 0 ) {
1541         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1542         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1543         --aExp;
1544     }
1545     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1546
1547 }
1548
1549 #ifdef FLOATX80
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of converting the single-precision floating-point value
1553 | `a' to the extended double-precision floating-point format.  The conversion
1554 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1555 | Arithmetic.
1556 *----------------------------------------------------------------------------*/
1557
1558 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1559 {
1560     flag aSign;
1561     int16 aExp;
1562     bits32 aSig;
1563
1564     a = float32_squash_input_denormal(a STATUS_VAR);
1565     aSig = extractFloat32Frac( a );
1566     aExp = extractFloat32Exp( a );
1567     aSign = extractFloat32Sign( a );
1568     if ( aExp == 0xFF ) {
1569         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
1570         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1571     }
1572     if ( aExp == 0 ) {
1573         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1574         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1575     }
1576     aSig |= 0x00800000;
1577     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1578
1579 }
1580
1581 #endif
1582
1583 #ifdef FLOAT128
1584
1585 /*----------------------------------------------------------------------------
1586 | Returns the result of converting the single-precision floating-point value
1587 | `a' to the double-precision floating-point format.  The conversion is
1588 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1589 | Arithmetic.
1590 *----------------------------------------------------------------------------*/
1591
1592 float128 float32_to_float128( float32 a STATUS_PARAM )
1593 {
1594     flag aSign;
1595     int16 aExp;
1596     bits32 aSig;
1597
1598     a = float32_squash_input_denormal(a STATUS_VAR);
1599     aSig = extractFloat32Frac( a );
1600     aExp = extractFloat32Exp( a );
1601     aSign = extractFloat32Sign( a );
1602     if ( aExp == 0xFF ) {
1603         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
1604         return packFloat128( aSign, 0x7FFF, 0, 0 );
1605     }
1606     if ( aExp == 0 ) {
1607         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1608         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1609         --aExp;
1610     }
1611     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1612
1613 }
1614
1615 #endif
1616
1617 /*----------------------------------------------------------------------------
1618 | Rounds the single-precision floating-point value `a' to an integer, and
1619 | returns the result as a single-precision floating-point value.  The
1620 | operation is performed according to the IEC/IEEE Standard for Binary
1621 | Floating-Point Arithmetic.
1622 *----------------------------------------------------------------------------*/
1623
1624 float32 float32_round_to_int( float32 a STATUS_PARAM)
1625 {
1626     flag aSign;
1627     int16 aExp;
1628     bits32 lastBitMask, roundBitsMask;
1629     int8 roundingMode;
1630     bits32 z;
1631     a = float32_squash_input_denormal(a STATUS_VAR);
1632
1633     aExp = extractFloat32Exp( a );
1634     if ( 0x96 <= aExp ) {
1635         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1636             return propagateFloat32NaN( a, a STATUS_VAR );
1637         }
1638         return a;
1639     }
1640     if ( aExp <= 0x7E ) {
1641         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
1642         STATUS(float_exception_flags) |= float_flag_inexact;
1643         aSign = extractFloat32Sign( a );
1644         switch ( STATUS(float_rounding_mode) ) {
1645          case float_round_nearest_even:
1646             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1647                 return packFloat32( aSign, 0x7F, 0 );
1648             }
1649             break;
1650          case float_round_down:
1651             return make_float32(aSign ? 0xBF800000 : 0);
1652          case float_round_up:
1653             return make_float32(aSign ? 0x80000000 : 0x3F800000);
1654         }
1655         return packFloat32( aSign, 0, 0 );
1656     }
1657     lastBitMask = 1;
1658     lastBitMask <<= 0x96 - aExp;
1659     roundBitsMask = lastBitMask - 1;
1660     z = float32_val(a);
1661     roundingMode = STATUS(float_rounding_mode);
1662     if ( roundingMode == float_round_nearest_even ) {
1663         z += lastBitMask>>1;
1664         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1665     }
1666     else if ( roundingMode != float_round_to_zero ) {
1667         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
1668             z += roundBitsMask;
1669         }
1670     }
1671     z &= ~ roundBitsMask;
1672     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1673     return make_float32(z);
1674
1675 }
1676
1677 /*----------------------------------------------------------------------------
1678 | Returns the result of adding the absolute values of the single-precision
1679 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1680 | before being returned.  `zSign' is ignored if the result is a NaN.
1681 | The addition is performed according to the IEC/IEEE Standard for Binary
1682 | Floating-Point Arithmetic.
1683 *----------------------------------------------------------------------------*/
1684
1685 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1686 {
1687     int16 aExp, bExp, zExp;
1688     bits32 aSig, bSig, zSig;
1689     int16 expDiff;
1690
1691     aSig = extractFloat32Frac( a );
1692     aExp = extractFloat32Exp( a );
1693     bSig = extractFloat32Frac( b );
1694     bExp = extractFloat32Exp( b );
1695     expDiff = aExp - bExp;
1696     aSig <<= 6;
1697     bSig <<= 6;
1698     if ( 0 < expDiff ) {
1699         if ( aExp == 0xFF ) {
1700             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1701             return a;
1702         }
1703         if ( bExp == 0 ) {
1704             --expDiff;
1705         }
1706         else {
1707             bSig |= 0x20000000;
1708         }
1709         shift32RightJamming( bSig, expDiff, &bSig );
1710         zExp = aExp;
1711     }
1712     else if ( expDiff < 0 ) {
1713         if ( bExp == 0xFF ) {
1714             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1715             return packFloat32( zSign, 0xFF, 0 );
1716         }
1717         if ( aExp == 0 ) {
1718             ++expDiff;
1719         }
1720         else {
1721             aSig |= 0x20000000;
1722         }
1723         shift32RightJamming( aSig, - expDiff, &aSig );
1724         zExp = bExp;
1725     }
1726     else {
1727         if ( aExp == 0xFF ) {
1728             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1729             return a;
1730         }
1731         if ( aExp == 0 ) {
1732             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
1733             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1734         }
1735         zSig = 0x40000000 + aSig + bSig;
1736         zExp = aExp;
1737         goto roundAndPack;
1738     }
1739     aSig |= 0x20000000;
1740     zSig = ( aSig + bSig )<<1;
1741     --zExp;
1742     if ( (sbits32) zSig < 0 ) {
1743         zSig = aSig + bSig;
1744         ++zExp;
1745     }
1746  roundAndPack:
1747     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1748
1749 }
1750
1751 /*----------------------------------------------------------------------------
1752 | Returns the result of subtracting the absolute values of the single-
1753 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1754 | difference is negated before being returned.  `zSign' is ignored if the
1755 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1756 | Standard for Binary Floating-Point Arithmetic.
1757 *----------------------------------------------------------------------------*/
1758
1759 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1760 {
1761     int16 aExp, bExp, zExp;
1762     bits32 aSig, bSig, zSig;
1763     int16 expDiff;
1764
1765     aSig = extractFloat32Frac( a );
1766     aExp = extractFloat32Exp( a );
1767     bSig = extractFloat32Frac( b );
1768     bExp = extractFloat32Exp( b );
1769     expDiff = aExp - bExp;
1770     aSig <<= 7;
1771     bSig <<= 7;
1772     if ( 0 < expDiff ) goto aExpBigger;
1773     if ( expDiff < 0 ) goto bExpBigger;
1774     if ( aExp == 0xFF ) {
1775         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1776         float_raise( float_flag_invalid STATUS_VAR);
1777         return float32_default_nan;
1778     }
1779     if ( aExp == 0 ) {
1780         aExp = 1;
1781         bExp = 1;
1782     }
1783     if ( bSig < aSig ) goto aBigger;
1784     if ( aSig < bSig ) goto bBigger;
1785     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
1786  bExpBigger:
1787     if ( bExp == 0xFF ) {
1788         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1789         return packFloat32( zSign ^ 1, 0xFF, 0 );
1790     }
1791     if ( aExp == 0 ) {
1792         ++expDiff;
1793     }
1794     else {
1795         aSig |= 0x40000000;
1796     }
1797     shift32RightJamming( aSig, - expDiff, &aSig );
1798     bSig |= 0x40000000;
1799  bBigger:
1800     zSig = bSig - aSig;
1801     zExp = bExp;
1802     zSign ^= 1;
1803     goto normalizeRoundAndPack;
1804  aExpBigger:
1805     if ( aExp == 0xFF ) {
1806         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1807         return a;
1808     }
1809     if ( bExp == 0 ) {
1810         --expDiff;
1811     }
1812     else {
1813         bSig |= 0x40000000;
1814     }
1815     shift32RightJamming( bSig, expDiff, &bSig );
1816     aSig |= 0x40000000;
1817  aBigger:
1818     zSig = aSig - bSig;
1819     zExp = aExp;
1820  normalizeRoundAndPack:
1821     --zExp;
1822     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1823
1824 }
1825
1826 /*----------------------------------------------------------------------------
1827 | Returns the result of adding the single-precision floating-point values `a'
1828 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1829 | Binary Floating-Point Arithmetic.
1830 *----------------------------------------------------------------------------*/
1831
1832 float32 float32_add( float32 a, float32 b STATUS_PARAM )
1833 {
1834     flag aSign, bSign;
1835     a = float32_squash_input_denormal(a STATUS_VAR);
1836     b = float32_squash_input_denormal(b STATUS_VAR);
1837
1838     aSign = extractFloat32Sign( a );
1839     bSign = extractFloat32Sign( b );
1840     if ( aSign == bSign ) {
1841         return addFloat32Sigs( a, b, aSign STATUS_VAR);
1842     }
1843     else {
1844         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1845     }
1846
1847 }
1848
1849 /*----------------------------------------------------------------------------
1850 | Returns the result of subtracting the single-precision floating-point values
1851 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1852 | for Binary Floating-Point Arithmetic.
1853 *----------------------------------------------------------------------------*/
1854
1855 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
1856 {
1857     flag aSign, bSign;
1858     a = float32_squash_input_denormal(a STATUS_VAR);
1859     b = float32_squash_input_denormal(b STATUS_VAR);
1860
1861     aSign = extractFloat32Sign( a );
1862     bSign = extractFloat32Sign( b );
1863     if ( aSign == bSign ) {
1864         return subFloat32Sigs( a, b, aSign STATUS_VAR );
1865     }
1866     else {
1867         return addFloat32Sigs( a, b, aSign STATUS_VAR );
1868     }
1869
1870 }
1871
1872 /*----------------------------------------------------------------------------
1873 | Returns the result of multiplying the single-precision floating-point values
1874 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1875 | for Binary Floating-Point Arithmetic.
1876 *----------------------------------------------------------------------------*/
1877
1878 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
1879 {
1880     flag aSign, bSign, zSign;
1881     int16 aExp, bExp, zExp;
1882     bits32 aSig, bSig;
1883     bits64 zSig64;
1884     bits32 zSig;
1885
1886     a = float32_squash_input_denormal(a STATUS_VAR);
1887     b = float32_squash_input_denormal(b STATUS_VAR);
1888
1889     aSig = extractFloat32Frac( a );
1890     aExp = extractFloat32Exp( a );
1891     aSign = extractFloat32Sign( a );
1892     bSig = extractFloat32Frac( b );
1893     bExp = extractFloat32Exp( b );
1894     bSign = extractFloat32Sign( b );
1895     zSign = aSign ^ bSign;
1896     if ( aExp == 0xFF ) {
1897         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1898             return propagateFloat32NaN( a, b STATUS_VAR );
1899         }
1900         if ( ( bExp | bSig ) == 0 ) {
1901             float_raise( float_flag_invalid STATUS_VAR);
1902             return float32_default_nan;
1903         }
1904         return packFloat32( zSign, 0xFF, 0 );
1905     }
1906     if ( bExp == 0xFF ) {
1907         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1908         if ( ( aExp | aSig ) == 0 ) {
1909             float_raise( float_flag_invalid STATUS_VAR);
1910             return float32_default_nan;
1911         }
1912         return packFloat32( zSign, 0xFF, 0 );
1913     }
1914     if ( aExp == 0 ) {
1915         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1916         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1917     }
1918     if ( bExp == 0 ) {
1919         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1920         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1921     }
1922     zExp = aExp + bExp - 0x7F;
1923     aSig = ( aSig | 0x00800000 )<<7;
1924     bSig = ( bSig | 0x00800000 )<<8;
1925     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1926     zSig = zSig64;
1927     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1928         zSig <<= 1;
1929         --zExp;
1930     }
1931     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1932
1933 }
1934
1935 /*----------------------------------------------------------------------------
1936 | Returns the result of dividing the single-precision floating-point value `a'
1937 | by the corresponding value `b'.  The operation is performed according to the
1938 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1939 *----------------------------------------------------------------------------*/
1940
1941 float32 float32_div( float32 a, float32 b STATUS_PARAM )
1942 {
1943     flag aSign, bSign, zSign;
1944     int16 aExp, bExp, zExp;
1945     bits32 aSig, bSig, zSig;
1946     a = float32_squash_input_denormal(a STATUS_VAR);
1947     b = float32_squash_input_denormal(b STATUS_VAR);
1948
1949     aSig = extractFloat32Frac( a );
1950     aExp = extractFloat32Exp( a );
1951     aSign = extractFloat32Sign( a );
1952     bSig = extractFloat32Frac( b );
1953     bExp = extractFloat32Exp( b );
1954     bSign = extractFloat32Sign( b );
1955     zSign = aSign ^ bSign;
1956     if ( aExp == 0xFF ) {
1957         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1958         if ( bExp == 0xFF ) {
1959             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1960             float_raise( float_flag_invalid STATUS_VAR);
1961             return float32_default_nan;
1962         }
1963         return packFloat32( zSign, 0xFF, 0 );
1964     }
1965     if ( bExp == 0xFF ) {
1966         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1967         return packFloat32( zSign, 0, 0 );
1968     }
1969     if ( bExp == 0 ) {
1970         if ( bSig == 0 ) {
1971             if ( ( aExp | aSig ) == 0 ) {
1972                 float_raise( float_flag_invalid STATUS_VAR);
1973                 return float32_default_nan;
1974             }
1975             float_raise( float_flag_divbyzero STATUS_VAR);
1976             return packFloat32( zSign, 0xFF, 0 );
1977         }
1978         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1979     }
1980     if ( aExp == 0 ) {
1981         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1982         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1983     }
1984     zExp = aExp - bExp + 0x7D;
1985     aSig = ( aSig | 0x00800000 )<<7;
1986     bSig = ( bSig | 0x00800000 )<<8;
1987     if ( bSig <= ( aSig + aSig ) ) {
1988         aSig >>= 1;
1989         ++zExp;
1990     }
1991     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1992     if ( ( zSig & 0x3F ) == 0 ) {
1993         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1994     }
1995     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1996
1997 }
1998
1999 /*----------------------------------------------------------------------------
2000 | Returns the remainder of the single-precision floating-point value `a'
2001 | with respect to the corresponding value `b'.  The operation is performed
2002 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2003 *----------------------------------------------------------------------------*/
2004
2005 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2006 {
2007     flag aSign, zSign;
2008     int16 aExp, bExp, expDiff;
2009     bits32 aSig, bSig;
2010     bits32 q;
2011     bits64 aSig64, bSig64, q64;
2012     bits32 alternateASig;
2013     sbits32 sigMean;
2014     a = float32_squash_input_denormal(a STATUS_VAR);
2015     b = float32_squash_input_denormal(b STATUS_VAR);
2016
2017     aSig = extractFloat32Frac( a );
2018     aExp = extractFloat32Exp( a );
2019     aSign = extractFloat32Sign( a );
2020     bSig = extractFloat32Frac( b );
2021     bExp = extractFloat32Exp( b );
2022     if ( aExp == 0xFF ) {
2023         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2024             return propagateFloat32NaN( a, b STATUS_VAR );
2025         }
2026         float_raise( float_flag_invalid STATUS_VAR);
2027         return float32_default_nan;
2028     }
2029     if ( bExp == 0xFF ) {
2030         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2031         return a;
2032     }
2033     if ( bExp == 0 ) {
2034         if ( bSig == 0 ) {
2035             float_raise( float_flag_invalid STATUS_VAR);
2036             return float32_default_nan;
2037         }
2038         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2039     }
2040     if ( aExp == 0 ) {
2041         if ( aSig == 0 ) return a;
2042         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2043     }
2044     expDiff = aExp - bExp;
2045     aSig |= 0x00800000;
2046     bSig |= 0x00800000;
2047     if ( expDiff < 32 ) {
2048         aSig <<= 8;
2049         bSig <<= 8;
2050         if ( expDiff < 0 ) {
2051             if ( expDiff < -1 ) return a;
2052             aSig >>= 1;
2053         }
2054         q = ( bSig <= aSig );
2055         if ( q ) aSig -= bSig;
2056         if ( 0 < expDiff ) {
2057             q = ( ( (bits64) aSig )<<32 ) / bSig;
2058             q >>= 32 - expDiff;
2059             bSig >>= 2;
2060             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2061         }
2062         else {
2063             aSig >>= 2;
2064             bSig >>= 2;
2065         }
2066     }
2067     else {
2068         if ( bSig <= aSig ) aSig -= bSig;
2069         aSig64 = ( (bits64) aSig )<<40;
2070         bSig64 = ( (bits64) bSig )<<40;
2071         expDiff -= 64;
2072         while ( 0 < expDiff ) {
2073             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2074             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2075             aSig64 = - ( ( bSig * q64 )<<38 );
2076             expDiff -= 62;
2077         }
2078         expDiff += 64;
2079         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2080         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2081         q = q64>>( 64 - expDiff );
2082         bSig <<= 6;
2083         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2084     }
2085     do {
2086         alternateASig = aSig;
2087         ++q;
2088         aSig -= bSig;
2089     } while ( 0 <= (sbits32) aSig );
2090     sigMean = aSig + alternateASig;
2091     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2092         aSig = alternateASig;
2093     }
2094     zSign = ( (sbits32) aSig < 0 );
2095     if ( zSign ) aSig = - aSig;
2096     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2097
2098 }
2099
2100 /*----------------------------------------------------------------------------
2101 | Returns the square root of the single-precision floating-point value `a'.
2102 | The operation is performed according to the IEC/IEEE Standard for Binary
2103 | Floating-Point Arithmetic.
2104 *----------------------------------------------------------------------------*/
2105
2106 float32 float32_sqrt( float32 a STATUS_PARAM )
2107 {
2108     flag aSign;
2109     int16 aExp, zExp;
2110     bits32 aSig, zSig;
2111     bits64 rem, term;
2112     a = float32_squash_input_denormal(a STATUS_VAR);
2113
2114     aSig = extractFloat32Frac( a );
2115     aExp = extractFloat32Exp( a );
2116     aSign = extractFloat32Sign( a );
2117     if ( aExp == 0xFF ) {
2118         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2119         if ( ! aSign ) return a;
2120         float_raise( float_flag_invalid STATUS_VAR);
2121         return float32_default_nan;
2122     }
2123     if ( aSign ) {
2124         if ( ( aExp | aSig ) == 0 ) return a;
2125         float_raise( float_flag_invalid STATUS_VAR);
2126         return float32_default_nan;
2127     }
2128     if ( aExp == 0 ) {
2129         if ( aSig == 0 ) return float32_zero;
2130         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2131     }
2132     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2133     aSig = ( aSig | 0x00800000 )<<8;
2134     zSig = estimateSqrt32( aExp, aSig ) + 2;
2135     if ( ( zSig & 0x7F ) <= 5 ) {
2136         if ( zSig < 2 ) {
2137             zSig = 0x7FFFFFFF;
2138             goto roundAndPack;
2139         }
2140         aSig >>= aExp & 1;
2141         term = ( (bits64) zSig ) * zSig;
2142         rem = ( ( (bits64) aSig )<<32 ) - term;
2143         while ( (sbits64) rem < 0 ) {
2144             --zSig;
2145             rem += ( ( (bits64) zSig )<<1 ) | 1;
2146         }
2147         zSig |= ( rem != 0 );
2148     }
2149     shift32RightJamming( zSig, 1, &zSig );
2150  roundAndPack:
2151     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2152
2153 }
2154
2155 /*----------------------------------------------------------------------------
2156 | Returns the binary exponential of the single-precision floating-point value
2157 | `a'. The operation is performed according to the IEC/IEEE Standard for
2158 | Binary Floating-Point Arithmetic.
2159 |
2160 | Uses the following identities:
2161 |
2162 | 1. -------------------------------------------------------------------------
2163 |      x    x*ln(2)
2164 |     2  = e
2165 |
2166 | 2. -------------------------------------------------------------------------
2167 |                      2     3     4     5           n
2168 |      x        x     x     x     x     x           x
2169 |     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2170 |               1!    2!    3!    4!    5!          n!
2171 *----------------------------------------------------------------------------*/
2172
2173 static const float64 float32_exp2_coefficients[15] =
2174 {
2175     make_float64( 0x3ff0000000000000ll ), /*  1 */
2176     make_float64( 0x3fe0000000000000ll ), /*  2 */
2177     make_float64( 0x3fc5555555555555ll ), /*  3 */
2178     make_float64( 0x3fa5555555555555ll ), /*  4 */
2179     make_float64( 0x3f81111111111111ll ), /*  5 */
2180     make_float64( 0x3f56c16c16c16c17ll ), /*  6 */
2181     make_float64( 0x3f2a01a01a01a01all ), /*  7 */
2182     make_float64( 0x3efa01a01a01a01all ), /*  8 */
2183     make_float64( 0x3ec71de3a556c734ll ), /*  9 */
2184     make_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2185     make_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2186     make_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2187     make_float64( 0x3de6124613a86d09ll ), /* 13 */
2188     make_float64( 0x3da93974a8c07c9dll ), /* 14 */
2189     make_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2190 };
2191
2192 float32 float32_exp2( float32 a STATUS_PARAM )
2193 {
2194     flag aSign;
2195     int16 aExp;
2196     bits32 aSig;
2197     float64 r, x, xn;
2198     int i;
2199     a = float32_squash_input_denormal(a STATUS_VAR);
2200
2201     aSig = extractFloat32Frac( a );
2202     aExp = extractFloat32Exp( a );
2203     aSign = extractFloat32Sign( a );
2204
2205     if ( aExp == 0xFF) {
2206         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2207         return (aSign) ? float32_zero : a;
2208     }
2209     if (aExp == 0) {
2210         if (aSig == 0) return float32_one;
2211     }
2212
2213     float_raise( float_flag_inexact STATUS_VAR);
2214
2215     /* ******************************* */
2216     /* using float64 for approximation */
2217     /* ******************************* */
2218     x = float32_to_float64(a STATUS_VAR);
2219     x = float64_mul(x, float64_ln2 STATUS_VAR);
2220
2221     xn = x;
2222     r = float64_one;
2223     for (i = 0 ; i < 15 ; i++) {
2224         float64 f;
2225
2226         f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2227         r = float64_add(r, f STATUS_VAR);
2228
2229         xn = float64_mul(xn, x STATUS_VAR);
2230     }
2231
2232     return float64_to_float32(r, status);
2233 }
2234
2235 /*----------------------------------------------------------------------------
2236 | Returns the binary log of the single-precision floating-point value `a'.
2237 | The operation is performed according to the IEC/IEEE Standard for Binary
2238 | Floating-Point Arithmetic.
2239 *----------------------------------------------------------------------------*/
2240 float32 float32_log2( float32 a STATUS_PARAM )
2241 {
2242     flag aSign, zSign;
2243     int16 aExp;
2244     bits32 aSig, zSig, i;
2245
2246     a = float32_squash_input_denormal(a STATUS_VAR);
2247     aSig = extractFloat32Frac( a );
2248     aExp = extractFloat32Exp( a );
2249     aSign = extractFloat32Sign( a );
2250
2251     if ( aExp == 0 ) {
2252         if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2253         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2254     }
2255     if ( aSign ) {
2256         float_raise( float_flag_invalid STATUS_VAR);
2257         return float32_default_nan;
2258     }
2259     if ( aExp == 0xFF ) {
2260         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2261         return a;
2262     }
2263
2264     aExp -= 0x7F;
2265     aSig |= 0x00800000;
2266     zSign = aExp < 0;
2267     zSig = aExp << 23;
2268
2269     for (i = 1 << 22; i > 0; i >>= 1) {
2270         aSig = ( (bits64)aSig * aSig ) >> 23;
2271         if ( aSig & 0x01000000 ) {
2272             aSig >>= 1;
2273             zSig |= i;
2274         }
2275     }
2276
2277     if ( zSign )
2278         zSig = -zSig;
2279
2280     return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2281 }
2282
2283 /*----------------------------------------------------------------------------
2284 | Returns 1 if the single-precision floating-point value `a' is equal to
2285 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2286 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2287 *----------------------------------------------------------------------------*/
2288
2289 int float32_eq( float32 a, float32 b STATUS_PARAM )
2290 {
2291     a = float32_squash_input_denormal(a STATUS_VAR);
2292     b = float32_squash_input_denormal(b STATUS_VAR);
2293
2294     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2295          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2296        ) {
2297         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2298             float_raise( float_flag_invalid STATUS_VAR);
2299         }
2300         return 0;
2301     }
2302     return ( float32_val(a) == float32_val(b) ) ||
2303             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2304
2305 }
2306
2307 /*----------------------------------------------------------------------------
2308 | Returns 1 if the single-precision floating-point value `a' is less than
2309 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2310 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2311 | Arithmetic.
2312 *----------------------------------------------------------------------------*/
2313
2314 int float32_le( float32 a, float32 b STATUS_PARAM )
2315 {
2316     flag aSign, bSign;
2317     bits32 av, bv;
2318     a = float32_squash_input_denormal(a STATUS_VAR);
2319     b = float32_squash_input_denormal(b STATUS_VAR);
2320
2321     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2322          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2323        ) {
2324         float_raise( float_flag_invalid STATUS_VAR);
2325         return 0;
2326     }
2327     aSign = extractFloat32Sign( a );
2328     bSign = extractFloat32Sign( b );
2329     av = float32_val(a);
2330     bv = float32_val(b);
2331     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2332     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2333
2334 }
2335
2336 /*----------------------------------------------------------------------------
2337 | Returns 1 if the single-precision floating-point value `a' is less than
2338 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2339 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2340 *----------------------------------------------------------------------------*/
2341
2342 int float32_lt( float32 a, float32 b STATUS_PARAM )
2343 {
2344     flag aSign, bSign;
2345     bits32 av, bv;
2346     a = float32_squash_input_denormal(a STATUS_VAR);
2347     b = float32_squash_input_denormal(b STATUS_VAR);
2348
2349     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2350          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2351        ) {
2352         float_raise( float_flag_invalid STATUS_VAR);
2353         return 0;
2354     }
2355     aSign = extractFloat32Sign( a );
2356     bSign = extractFloat32Sign( b );
2357     av = float32_val(a);
2358     bv = float32_val(b);
2359     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2360     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2361
2362 }
2363
2364 /*----------------------------------------------------------------------------
2365 | Returns 1 if the single-precision floating-point value `a' is equal to
2366 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2367 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2368 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2369 *----------------------------------------------------------------------------*/
2370
2371 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
2372 {
2373     bits32 av, bv;
2374     a = float32_squash_input_denormal(a STATUS_VAR);
2375     b = float32_squash_input_denormal(b STATUS_VAR);
2376
2377     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2378          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2379        ) {
2380         float_raise( float_flag_invalid STATUS_VAR);
2381         return 0;
2382     }
2383     av = float32_val(a);
2384     bv = float32_val(b);
2385     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2386
2387 }
2388
2389 /*----------------------------------------------------------------------------
2390 | Returns 1 if the single-precision floating-point value `a' is less than or
2391 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2392 | cause an exception.  Otherwise, the comparison is performed according to the
2393 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2394 *----------------------------------------------------------------------------*/
2395
2396 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
2397 {
2398     flag aSign, bSign;
2399     bits32 av, bv;
2400     a = float32_squash_input_denormal(a STATUS_VAR);
2401     b = float32_squash_input_denormal(b STATUS_VAR);
2402
2403     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2404          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2405        ) {
2406         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2407             float_raise( float_flag_invalid STATUS_VAR);
2408         }
2409         return 0;
2410     }
2411     aSign = extractFloat32Sign( a );
2412     bSign = extractFloat32Sign( b );
2413     av = float32_val(a);
2414     bv = float32_val(b);
2415     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
2416     return ( av == bv ) || ( aSign ^ ( av < bv ) );
2417
2418 }
2419
2420 /*----------------------------------------------------------------------------
2421 | Returns 1 if the single-precision floating-point value `a' is less than
2422 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2423 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2424 | Standard for Binary Floating-Point Arithmetic.
2425 *----------------------------------------------------------------------------*/
2426
2427 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
2428 {
2429     flag aSign, bSign;
2430     bits32 av, bv;
2431     a = float32_squash_input_denormal(a STATUS_VAR);
2432     b = float32_squash_input_denormal(b STATUS_VAR);
2433
2434     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2435          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2436        ) {
2437         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2438             float_raise( float_flag_invalid STATUS_VAR);
2439         }
2440         return 0;
2441     }
2442     aSign = extractFloat32Sign( a );
2443     bSign = extractFloat32Sign( b );
2444     av = float32_val(a);
2445     bv = float32_val(b);
2446     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
2447     return ( av != bv ) && ( aSign ^ ( av < bv ) );
2448
2449 }
2450
2451 /*----------------------------------------------------------------------------
2452 | Returns the result of converting the double-precision floating-point value
2453 | `a' to the 32-bit two's complement integer format.  The conversion is
2454 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2455 | Arithmetic---which means in particular that the conversion is rounded
2456 | according to the current rounding mode.  If `a' is a NaN, the largest
2457 | positive integer is returned.  Otherwise, if the conversion overflows, the
2458 | largest integer with the same sign as `a' is returned.
2459 *----------------------------------------------------------------------------*/
2460
2461 int32 float64_to_int32( float64 a STATUS_PARAM )
2462 {
2463     flag aSign;
2464     int16 aExp, shiftCount;
2465     bits64 aSig;
2466     a = float64_squash_input_denormal(a STATUS_VAR);
2467
2468     aSig = extractFloat64Frac( a );
2469     aExp = extractFloat64Exp( a );
2470     aSign = extractFloat64Sign( a );
2471     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2472     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2473     shiftCount = 0x42C - aExp;
2474     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2475     return roundAndPackInt32( aSign, aSig STATUS_VAR );
2476
2477 }
2478
2479 /*----------------------------------------------------------------------------
2480 | Returns the result of converting the double-precision floating-point value
2481 | `a' to the 32-bit two's complement integer format.  The conversion is
2482 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2483 | Arithmetic, except that the conversion is always rounded toward zero.
2484 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2485 | the conversion overflows, the largest integer with the same sign as `a' is
2486 | returned.
2487 *----------------------------------------------------------------------------*/
2488
2489 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2490 {
2491     flag aSign;
2492     int16 aExp, shiftCount;
2493     bits64 aSig, savedASig;
2494     int32 z;
2495     a = float64_squash_input_denormal(a STATUS_VAR);
2496
2497     aSig = extractFloat64Frac( a );
2498     aExp = extractFloat64Exp( a );
2499     aSign = extractFloat64Sign( a );
2500     if ( 0x41E < aExp ) {
2501         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2502         goto invalid;
2503     }
2504     else if ( aExp < 0x3FF ) {
2505         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2506         return 0;
2507     }
2508     aSig |= LIT64( 0x0010000000000000 );
2509     shiftCount = 0x433 - aExp;
2510     savedASig = aSig;
2511     aSig >>= shiftCount;
2512     z = aSig;
2513     if ( aSign ) z = - z;
2514     if ( ( z < 0 ) ^ aSign ) {
2515  invalid:
2516         float_raise( float_flag_invalid STATUS_VAR);
2517         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2518     }
2519     if ( ( aSig<<shiftCount ) != savedASig ) {
2520         STATUS(float_exception_flags) |= float_flag_inexact;
2521     }
2522     return z;
2523
2524 }
2525
2526 /*----------------------------------------------------------------------------
2527 | Returns the result of converting the double-precision floating-point value
2528 | `a' to the 16-bit two's complement integer format.  The conversion is
2529 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2530 | Arithmetic, except that the conversion is always rounded toward zero.
2531 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2532 | the conversion overflows, the largest integer with the same sign as `a' is
2533 | returned.
2534 *----------------------------------------------------------------------------*/
2535
2536 int16 float64_to_int16_round_to_zero( float64 a STATUS_PARAM )
2537 {
2538     flag aSign;
2539     int16 aExp, shiftCount;
2540     bits64 aSig, savedASig;
2541     int32 z;
2542
2543     aSig = extractFloat64Frac( a );
2544     aExp = extractFloat64Exp( a );
2545     aSign = extractFloat64Sign( a );
2546     if ( 0x40E < aExp ) {
2547         if ( ( aExp == 0x7FF ) && aSig ) {
2548             aSign = 0;
2549         }
2550         goto invalid;
2551     }
2552     else if ( aExp < 0x3FF ) {
2553         if ( aExp || aSig ) {
2554             STATUS(float_exception_flags) |= float_flag_inexact;
2555         }
2556         return 0;
2557     }
2558     aSig |= LIT64( 0x0010000000000000 );
2559     shiftCount = 0x433 - aExp;
2560     savedASig = aSig;
2561     aSig >>= shiftCount;
2562     z = aSig;
2563     if ( aSign ) {
2564         z = - z;
2565     }
2566     if ( ( (int16_t)z < 0 ) ^ aSign ) {
2567  invalid:
2568         float_raise( float_flag_invalid STATUS_VAR);
2569         return aSign ? (sbits32) 0xffff8000 : 0x7FFF;
2570     }
2571     if ( ( aSig<<shiftCount ) != savedASig ) {
2572         STATUS(float_exception_flags) |= float_flag_inexact;
2573     }
2574     return z;
2575 }
2576
2577 /*----------------------------------------------------------------------------
2578 | Returns the result of converting the double-precision floating-point value
2579 | `a' to the 64-bit two's complement integer format.  The conversion is
2580 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2581 | Arithmetic---which means in particular that the conversion is rounded
2582 | according to the current rounding mode.  If `a' is a NaN, the largest
2583 | positive integer is returned.  Otherwise, if the conversion overflows, the
2584 | largest integer with the same sign as `a' is returned.
2585 *----------------------------------------------------------------------------*/
2586
2587 int64 float64_to_int64( float64 a STATUS_PARAM )
2588 {
2589     flag aSign;
2590     int16 aExp, shiftCount;
2591     bits64 aSig, aSigExtra;
2592     a = float64_squash_input_denormal(a STATUS_VAR);
2593
2594     aSig = extractFloat64Frac( a );
2595     aExp = extractFloat64Exp( a );
2596     aSign = extractFloat64Sign( a );
2597     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2598     shiftCount = 0x433 - aExp;
2599     if ( shiftCount <= 0 ) {
2600         if ( 0x43E < aExp ) {
2601             float_raise( float_flag_invalid STATUS_VAR);
2602             if (    ! aSign
2603                  || (    ( aExp == 0x7FF )
2604                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2605                ) {
2606                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2607             }
2608             return (sbits64) LIT64( 0x8000000000000000 );
2609         }
2610         aSigExtra = 0;
2611         aSig <<= - shiftCount;
2612     }
2613     else {
2614         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2615     }
2616     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
2617
2618 }
2619
2620 /*----------------------------------------------------------------------------
2621 | Returns the result of converting the double-precision floating-point value
2622 | `a' to the 64-bit two's complement integer format.  The conversion is
2623 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2624 | Arithmetic, except that the conversion is always rounded toward zero.
2625 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2626 | the conversion overflows, the largest integer with the same sign as `a' is
2627 | returned.
2628 *----------------------------------------------------------------------------*/
2629
2630 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
2631 {
2632     flag aSign;
2633     int16 aExp, shiftCount;
2634     bits64 aSig;
2635     int64 z;
2636     a = float64_squash_input_denormal(a STATUS_VAR);
2637
2638     aSig = extractFloat64Frac( a );
2639     aExp = extractFloat64Exp( a );
2640     aSign = extractFloat64Sign( a );
2641     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2642     shiftCount = aExp - 0x433;
2643     if ( 0 <= shiftCount ) {
2644         if ( 0x43E <= aExp ) {
2645             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
2646                 float_raise( float_flag_invalid STATUS_VAR);
2647                 if (    ! aSign
2648                      || (    ( aExp == 0x7FF )
2649                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2650                    ) {
2651                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2652                 }
2653             }
2654             return (sbits64) LIT64( 0x8000000000000000 );
2655         }
2656         z = aSig<<shiftCount;
2657     }
2658     else {
2659         if ( aExp < 0x3FE ) {
2660             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2661             return 0;
2662         }
2663         z = aSig>>( - shiftCount );
2664         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2665             STATUS(float_exception_flags) |= float_flag_inexact;
2666         }
2667     }
2668     if ( aSign ) z = - z;
2669     return z;
2670
2671 }
2672
2673 /*----------------------------------------------------------------------------
2674 | Returns the result of converting the double-precision floating-point value
2675 | `a' to the single-precision floating-point format.  The conversion is
2676 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2677 | Arithmetic.
2678 *----------------------------------------------------------------------------*/
2679
2680 float32 float64_to_float32( float64 a STATUS_PARAM )
2681 {
2682     flag aSign;
2683     int16 aExp;
2684     bits64 aSig;
2685     bits32 zSig;
2686     a = float64_squash_input_denormal(a STATUS_VAR);
2687
2688     aSig = extractFloat64Frac( a );
2689     aExp = extractFloat64Exp( a );
2690     aSign = extractFloat64Sign( a );
2691     if ( aExp == 0x7FF ) {
2692         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
2693         return packFloat32( aSign, 0xFF, 0 );
2694     }
2695     shift64RightJamming( aSig, 22, &aSig );
2696     zSig = aSig;
2697     if ( aExp || zSig ) {
2698         zSig |= 0x40000000;
2699         aExp -= 0x381;
2700     }
2701     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
2702
2703 }
2704
2705
2706 /*----------------------------------------------------------------------------
2707 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2708 | half-precision floating-point value, returning the result.  After being
2709 | shifted into the proper positions, the three fields are simply added
2710 | together to form the result.  This means that any integer portion of `zSig'
2711 | will be added into the exponent.  Since a properly normalized significand
2712 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2713 | than the desired result exponent whenever `zSig' is a complete, normalized
2714 | significand.
2715 *----------------------------------------------------------------------------*/
2716 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
2717 {
2718     return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
2719 }
2720
2721 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2722    The latter gains extra exponent range by omitting the NaN/Inf encodings.  */
2723
2724 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
2725 {
2726     flag aSign;
2727     int16 aExp;
2728     bits32 aSig;
2729
2730     aSign = a >> 15;
2731     aExp = (a >> 10) & 0x1f;
2732     aSig = a & 0x3ff;
2733
2734     if (aExp == 0x1f && ieee) {
2735         if (aSig) {
2736             /* Make sure correct exceptions are raised.  */
2737             float32ToCommonNaN(a STATUS_VAR);
2738             aSig |= 0x200;
2739         }
2740         return packFloat32(aSign, 0xff, aSig << 13);
2741     }
2742     if (aExp == 0) {
2743         int8 shiftCount;
2744
2745         if (aSig == 0) {
2746             return packFloat32(aSign, 0, 0);
2747         }
2748
2749         shiftCount = countLeadingZeros32( aSig ) - 21;
2750         aSig = aSig << shiftCount;
2751         aExp = -shiftCount;
2752     }
2753     return packFloat32( aSign, aExp + 0x70, aSig << 13);
2754 }
2755
2756 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
2757 {
2758     flag aSign;
2759     int16 aExp;
2760     bits32 aSig;
2761     bits32 mask;
2762     bits32 increment;
2763     int8 roundingMode;
2764     a = float32_squash_input_denormal(a STATUS_VAR);
2765
2766     aSig = extractFloat32Frac( a );
2767     aExp = extractFloat32Exp( a );
2768     aSign = extractFloat32Sign( a );
2769     if ( aExp == 0xFF ) {
2770         if (aSig) {
2771             /* Make sure correct exceptions are raised.  */
2772             float32ToCommonNaN(a STATUS_VAR);
2773             aSig |= 0x00400000;
2774         }
2775         return packFloat16(aSign, 0x1f, aSig >> 13);
2776     }
2777     if (aExp == 0 && aSign == 0) {
2778         return packFloat16(aSign, 0, 0);
2779     }
2780     /* Decimal point between bits 22 and 23.  */
2781     aSig |= 0x00800000;
2782     aExp -= 0x7f;
2783     if (aExp < -14) {
2784         mask = 0x007fffff;
2785         if (aExp < -24) {
2786             aExp = -25;
2787         } else {
2788             mask >>= 24 + aExp;
2789         }
2790     } else {
2791         mask = 0x00001fff;
2792     }
2793     if (aSig & mask) {
2794         float_raise( float_flag_underflow STATUS_VAR );
2795         roundingMode = STATUS(float_rounding_mode);
2796         switch (roundingMode) {
2797         case float_round_nearest_even:
2798             increment = (mask + 1) >> 1;
2799             if ((aSig & mask) == increment) {
2800                 increment = aSig & (increment << 1);
2801             }
2802             break;
2803         case float_round_up:
2804             increment = aSign ? 0 : mask;
2805             break;
2806         case float_round_down:
2807             increment = aSign ? mask : 0;
2808             break;
2809         default: /* round_to_zero */
2810             increment = 0;
2811             break;
2812         }
2813         aSig += increment;
2814         if (aSig >= 0x01000000) {
2815             aSig >>= 1;
2816             aExp++;
2817         }
2818     } else if (aExp < -14
2819           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
2820         float_raise( float_flag_underflow STATUS_VAR);
2821     }
2822
2823     if (ieee) {
2824         if (aExp > 15) {
2825             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2826             return packFloat16(aSign, 0x1f, 0);
2827         }
2828     } else {
2829         if (aExp > 16) {
2830             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
2831             return packFloat16(aSign, 0x1f, 0x3ff);
2832         }
2833     }
2834     if (aExp < -24) {
2835         return packFloat16(aSign, 0, 0);
2836     }
2837     if (aExp < -14) {
2838         aSig >>= -14 - aExp;
2839         aExp = -14;
2840     }
2841     return packFloat16(aSign, aExp + 14, aSig >> 13);
2842 }
2843
2844 #ifdef FLOATX80
2845
2846 /*----------------------------------------------------------------------------
2847 | Returns the result of converting the double-precision floating-point value
2848 | `a' to the extended double-precision floating-point format.  The conversion
2849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2850 | Arithmetic.
2851 *----------------------------------------------------------------------------*/
2852
2853 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
2854 {
2855     flag aSign;
2856     int16 aExp;
2857     bits64 aSig;
2858
2859     a = float64_squash_input_denormal(a STATUS_VAR);
2860     aSig = extractFloat64Frac( a );
2861     aExp = extractFloat64Exp( a );
2862     aSign = extractFloat64Sign( a );
2863     if ( aExp == 0x7FF ) {
2864         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
2865         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2866     }
2867     if ( aExp == 0 ) {
2868         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2869         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2870     }
2871     return
2872         packFloatx80(
2873             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2874
2875 }
2876
2877 #endif
2878
2879 #ifdef FLOAT128
2880
2881 /*----------------------------------------------------------------------------
2882 | Returns the result of converting the double-precision floating-point value
2883 | `a' to the quadruple-precision floating-point format.  The conversion is
2884 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2885 | Arithmetic.
2886 *----------------------------------------------------------------------------*/
2887
2888 float128 float64_to_float128( float64 a STATUS_PARAM )
2889 {
2890     flag aSign;
2891     int16 aExp;
2892     bits64 aSig, zSig0, zSig1;
2893
2894     a = float64_squash_input_denormal(a STATUS_VAR);
2895     aSig = extractFloat64Frac( a );
2896     aExp = extractFloat64Exp( a );
2897     aSign = extractFloat64Sign( a );
2898     if ( aExp == 0x7FF ) {
2899         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
2900         return packFloat128( aSign, 0x7FFF, 0, 0 );
2901     }
2902     if ( aExp == 0 ) {
2903         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2904         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2905         --aExp;
2906     }
2907     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2908     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2909
2910 }
2911
2912 #endif
2913
2914 /*----------------------------------------------------------------------------
2915 | Rounds the double-precision floating-point value `a' to an integer, and
2916 | returns the result as a double-precision floating-point value.  The
2917 | operation is performed according to the IEC/IEEE Standard for Binary
2918 | Floating-Point Arithmetic.
2919 *----------------------------------------------------------------------------*/
2920
2921 float64 float64_round_to_int( float64 a STATUS_PARAM )
2922 {
2923     flag aSign;
2924     int16 aExp;
2925     bits64 lastBitMask, roundBitsMask;
2926     int8 roundingMode;
2927     bits64 z;
2928     a = float64_squash_input_denormal(a STATUS_VAR);
2929
2930     aExp = extractFloat64Exp( a );
2931     if ( 0x433 <= aExp ) {
2932         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2933             return propagateFloat64NaN( a, a STATUS_VAR );
2934         }
2935         return a;
2936     }
2937     if ( aExp < 0x3FF ) {
2938         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
2939         STATUS(float_exception_flags) |= float_flag_inexact;
2940         aSign = extractFloat64Sign( a );
2941         switch ( STATUS(float_rounding_mode) ) {
2942          case float_round_nearest_even:
2943             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2944                 return packFloat64( aSign, 0x3FF, 0 );
2945             }
2946             break;
2947          case float_round_down:
2948             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
2949          case float_round_up:
2950             return make_float64(
2951             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2952         }
2953         return packFloat64( aSign, 0, 0 );
2954     }
2955     lastBitMask = 1;
2956     lastBitMask <<= 0x433 - aExp;
2957     roundBitsMask = lastBitMask - 1;
2958     z = float64_val(a);
2959     roundingMode = STATUS(float_rounding_mode);
2960     if ( roundingMode == float_round_nearest_even ) {
2961         z += lastBitMask>>1;
2962         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2963     }
2964     else if ( roundingMode != float_round_to_zero ) {
2965         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
2966             z += roundBitsMask;
2967         }
2968     }
2969     z &= ~ roundBitsMask;
2970     if ( z != float64_val(a) )
2971         STATUS(float_exception_flags) |= float_flag_inexact;
2972     return make_float64(z);
2973
2974 }
2975
2976 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
2977 {
2978     int oldmode;
2979     float64 res;
2980     oldmode = STATUS(float_rounding_mode);
2981     STATUS(float_rounding_mode) = float_round_to_zero;
2982     res = float64_round_to_int(a STATUS_VAR);
2983     STATUS(float_rounding_mode) = oldmode;
2984     return res;
2985 }
2986
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of adding the absolute values of the double-precision
2989 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2990 | before being returned.  `zSign' is ignored if the result is a NaN.
2991 | The addition is performed according to the IEC/IEEE Standard for Binary
2992 | Floating-Point Arithmetic.
2993 *----------------------------------------------------------------------------*/
2994
2995 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
2996 {
2997     int16 aExp, bExp, zExp;
2998     bits64 aSig, bSig, zSig;
2999     int16 expDiff;
3000
3001     aSig = extractFloat64Frac( a );
3002     aExp = extractFloat64Exp( a );
3003     bSig = extractFloat64Frac( b );
3004     bExp = extractFloat64Exp( b );
3005     expDiff = aExp - bExp;
3006     aSig <<= 9;
3007     bSig <<= 9;
3008     if ( 0 < expDiff ) {
3009         if ( aExp == 0x7FF ) {
3010             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3011             return a;
3012         }
3013         if ( bExp == 0 ) {
3014             --expDiff;
3015         }
3016         else {
3017             bSig |= LIT64( 0x2000000000000000 );
3018         }
3019         shift64RightJamming( bSig, expDiff, &bSig );
3020         zExp = aExp;
3021     }
3022     else if ( expDiff < 0 ) {
3023         if ( bExp == 0x7FF ) {
3024             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3025             return packFloat64( zSign, 0x7FF, 0 );
3026         }
3027         if ( aExp == 0 ) {
3028             ++expDiff;
3029         }
3030         else {
3031             aSig |= LIT64( 0x2000000000000000 );
3032         }
3033         shift64RightJamming( aSig, - expDiff, &aSig );
3034         zExp = bExp;
3035     }
3036     else {
3037         if ( aExp == 0x7FF ) {
3038             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3039             return a;
3040         }
3041         if ( aExp == 0 ) {
3042             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
3043             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3044         }
3045         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3046         zExp = aExp;
3047         goto roundAndPack;
3048     }
3049     aSig |= LIT64( 0x2000000000000000 );
3050     zSig = ( aSig + bSig )<<1;
3051     --zExp;
3052     if ( (sbits64) zSig < 0 ) {
3053         zSig = aSig + bSig;
3054         ++zExp;
3055     }
3056  roundAndPack:
3057     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3058
3059 }
3060
3061 /*----------------------------------------------------------------------------
3062 | Returns the result of subtracting the absolute values of the double-
3063 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
3064 | difference is negated before being returned.  `zSign' is ignored if the
3065 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3066 | Standard for Binary Floating-Point Arithmetic.
3067 *----------------------------------------------------------------------------*/
3068
3069 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3070 {
3071     int16 aExp, bExp, zExp;
3072     bits64 aSig, bSig, zSig;
3073     int16 expDiff;
3074
3075     aSig = extractFloat64Frac( a );
3076     aExp = extractFloat64Exp( a );
3077     bSig = extractFloat64Frac( b );
3078     bExp = extractFloat64Exp( b );
3079     expDiff = aExp - bExp;
3080     aSig <<= 10;
3081     bSig <<= 10;
3082     if ( 0 < expDiff ) goto aExpBigger;
3083     if ( expDiff < 0 ) goto bExpBigger;
3084     if ( aExp == 0x7FF ) {
3085         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3086         float_raise( float_flag_invalid STATUS_VAR);
3087         return float64_default_nan;
3088     }
3089     if ( aExp == 0 ) {
3090         aExp = 1;
3091         bExp = 1;
3092     }
3093     if ( bSig < aSig ) goto aBigger;
3094     if ( aSig < bSig ) goto bBigger;
3095     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3096  bExpBigger:
3097     if ( bExp == 0x7FF ) {
3098         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3099         return packFloat64( zSign ^ 1, 0x7FF, 0 );
3100     }
3101     if ( aExp == 0 ) {
3102         ++expDiff;
3103     }
3104     else {
3105         aSig |= LIT64( 0x4000000000000000 );
3106     }
3107     shift64RightJamming( aSig, - expDiff, &aSig );
3108     bSig |= LIT64( 0x4000000000000000 );
3109  bBigger:
3110     zSig = bSig - aSig;
3111     zExp = bExp;
3112     zSign ^= 1;
3113     goto normalizeRoundAndPack;
3114  aExpBigger:
3115     if ( aExp == 0x7FF ) {
3116         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3117         return a;
3118     }
3119     if ( bExp == 0 ) {
3120         --expDiff;
3121     }
3122     else {
3123         bSig |= LIT64( 0x4000000000000000 );
3124     }
3125     shift64RightJamming( bSig, expDiff, &bSig );
3126     aSig |= LIT64( 0x4000000000000000 );
3127  aBigger:
3128     zSig = aSig - bSig;
3129     zExp = aExp;
3130  normalizeRoundAndPack:
3131     --zExp;
3132     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3133
3134 }
3135
3136 /*----------------------------------------------------------------------------
3137 | Returns the result of adding the double-precision floating-point values `a'
3138 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
3139 | Binary Floating-Point Arithmetic.
3140 *----------------------------------------------------------------------------*/
3141
3142 float64 float64_add( float64 a, float64 b STATUS_PARAM )
3143 {
3144     flag aSign, bSign;
3145     a = float64_squash_input_denormal(a STATUS_VAR);
3146     b = float64_squash_input_denormal(b STATUS_VAR);
3147
3148     aSign = extractFloat64Sign( a );
3149     bSign = extractFloat64Sign( b );
3150     if ( aSign == bSign ) {
3151         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3152     }
3153     else {
3154         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3155     }
3156
3157 }
3158
3159 /*----------------------------------------------------------------------------
3160 | Returns the result of subtracting the double-precision floating-point values
3161 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3162 | for Binary Floating-Point Arithmetic.
3163 *----------------------------------------------------------------------------*/
3164
3165 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3166 {
3167     flag aSign, bSign;
3168     a = float64_squash_input_denormal(a STATUS_VAR);
3169     b = float64_squash_input_denormal(b STATUS_VAR);
3170
3171     aSign = extractFloat64Sign( a );
3172     bSign = extractFloat64Sign( b );
3173     if ( aSign == bSign ) {
3174         return subFloat64Sigs( a, b, aSign STATUS_VAR );
3175     }
3176     else {
3177         return addFloat64Sigs( a, b, aSign STATUS_VAR );
3178     }
3179
3180 }
3181
3182 /*----------------------------------------------------------------------------
3183 | Returns the result of multiplying the double-precision floating-point values
3184 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
3185 | for Binary Floating-Point Arithmetic.
3186 *----------------------------------------------------------------------------*/
3187
3188 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3189 {
3190     flag aSign, bSign, zSign;
3191     int16 aExp, bExp, zExp;
3192     bits64 aSig, bSig, zSig0, zSig1;
3193
3194     a = float64_squash_input_denormal(a STATUS_VAR);
3195     b = float64_squash_input_denormal(b STATUS_VAR);
3196
3197     aSig = extractFloat64Frac( a );
3198     aExp = extractFloat64Exp( a );
3199     aSign = extractFloat64Sign( a );
3200     bSig = extractFloat64Frac( b );
3201     bExp = extractFloat64Exp( b );
3202     bSign = extractFloat64Sign( b );
3203     zSign = aSign ^ bSign;
3204     if ( aExp == 0x7FF ) {
3205         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3206             return propagateFloat64NaN( a, b STATUS_VAR );
3207         }
3208         if ( ( bExp | bSig ) == 0 ) {
3209             float_raise( float_flag_invalid STATUS_VAR);
3210             return float64_default_nan;
3211         }
3212         return packFloat64( zSign, 0x7FF, 0 );
3213     }
3214     if ( bExp == 0x7FF ) {
3215         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3216         if ( ( aExp | aSig ) == 0 ) {
3217             float_raise( float_flag_invalid STATUS_VAR);
3218             return float64_default_nan;
3219         }
3220         return packFloat64( zSign, 0x7FF, 0 );
3221     }
3222     if ( aExp == 0 ) {
3223         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3224         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3225     }
3226     if ( bExp == 0 ) {
3227         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3228         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3229     }
3230     zExp = aExp + bExp - 0x3FF;
3231     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3232     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3233     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3234     zSig0 |= ( zSig1 != 0 );
3235     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
3236         zSig0 <<= 1;
3237         --zExp;
3238     }
3239     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3240
3241 }
3242
3243 /*----------------------------------------------------------------------------
3244 | Returns the result of dividing the double-precision floating-point value `a'
3245 | by the corresponding value `b'.  The operation is performed according to
3246 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3247 *----------------------------------------------------------------------------*/
3248
3249 float64 float64_div( float64 a, float64 b STATUS_PARAM )
3250 {
3251     flag aSign, bSign, zSign;
3252     int16 aExp, bExp, zExp;
3253     bits64 aSig, bSig, zSig;
3254     bits64 rem0, rem1;
3255     bits64 term0, term1;
3256     a = float64_squash_input_denormal(a STATUS_VAR);
3257     b = float64_squash_input_denormal(b STATUS_VAR);
3258
3259     aSig = extractFloat64Frac( a );
3260     aExp = extractFloat64Exp( a );
3261     aSign = extractFloat64Sign( a );
3262     bSig = extractFloat64Frac( b );
3263     bExp = extractFloat64Exp( b );
3264     bSign = extractFloat64Sign( b );
3265     zSign = aSign ^ bSign;
3266     if ( aExp == 0x7FF ) {
3267         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3268         if ( bExp == 0x7FF ) {
3269             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3270             float_raise( float_flag_invalid STATUS_VAR);
3271             return float64_default_nan;
3272         }
3273         return packFloat64( zSign, 0x7FF, 0 );
3274     }
3275     if ( bExp == 0x7FF ) {
3276         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3277         return packFloat64( zSign, 0, 0 );
3278     }
3279     if ( bExp == 0 ) {
3280         if ( bSig == 0 ) {
3281             if ( ( aExp | aSig ) == 0 ) {
3282                 float_raise( float_flag_invalid STATUS_VAR);
3283                 return float64_default_nan;
3284             }
3285             float_raise( float_flag_divbyzero STATUS_VAR);
3286             return packFloat64( zSign, 0x7FF, 0 );
3287         }
3288         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3289     }
3290     if ( aExp == 0 ) {
3291         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3292         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3293     }
3294     zExp = aExp - bExp + 0x3FD;
3295     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3296     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3297     if ( bSig <= ( aSig + aSig ) ) {
3298         aSig >>= 1;
3299         ++zExp;
3300     }
3301     zSig = estimateDiv128To64( aSig, 0, bSig );
3302     if ( ( zSig & 0x1FF ) <= 2 ) {
3303         mul64To128( bSig, zSig, &term0, &term1 );
3304         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3305         while ( (sbits64) rem0 < 0 ) {
3306             --zSig;
3307             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3308         }
3309         zSig |= ( rem1 != 0 );
3310     }
3311     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3312
3313 }
3314
3315 /*----------------------------------------------------------------------------
3316 | Returns the remainder of the double-precision floating-point value `a'
3317 | with respect to the corresponding value `b'.  The operation is performed
3318 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3319 *----------------------------------------------------------------------------*/
3320
3321 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3322 {
3323     flag aSign, zSign;
3324     int16 aExp, bExp, expDiff;
3325     bits64 aSig, bSig;
3326     bits64 q, alternateASig;
3327     sbits64 sigMean;
3328
3329     a = float64_squash_input_denormal(a STATUS_VAR);
3330     b = float64_squash_input_denormal(b STATUS_VAR);
3331     aSig = extractFloat64Frac( a );
3332     aExp = extractFloat64Exp( a );
3333     aSign = extractFloat64Sign( a );
3334     bSig = extractFloat64Frac( b );
3335     bExp = extractFloat64Exp( b );
3336     if ( aExp == 0x7FF ) {
3337         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3338             return propagateFloat64NaN( a, b STATUS_VAR );
3339         }
3340         float_raise( float_flag_invalid STATUS_VAR);
3341         return float64_default_nan;
3342     }
3343     if ( bExp == 0x7FF ) {
3344         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3345         return a;
3346     }
3347     if ( bExp == 0 ) {
3348         if ( bSig == 0 ) {
3349             float_raise( float_flag_invalid STATUS_VAR);
3350             return float64_default_nan;
3351         }
3352         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3353     }
3354     if ( aExp == 0 ) {
3355         if ( aSig == 0 ) return a;
3356         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3357     }
3358     expDiff = aExp - bExp;
3359     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
3360     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3361     if ( expDiff < 0 ) {
3362         if ( expDiff < -1 ) return a;
3363         aSig >>= 1;
3364     }
3365     q = ( bSig <= aSig );
3366     if ( q ) aSig -= bSig;
3367     expDiff -= 64;
3368     while ( 0 < expDiff ) {
3369         q = estimateDiv128To64( aSig, 0, bSig );
3370         q = ( 2 < q ) ? q - 2 : 0;
3371         aSig = - ( ( bSig>>2 ) * q );
3372         expDiff -= 62;
3373     }
3374     expDiff += 64;
3375     if ( 0 < expDiff ) {
3376         q = estimateDiv128To64( aSig, 0, bSig );
3377         q = ( 2 < q ) ? q - 2 : 0;
3378         q >>= 64 - expDiff;
3379         bSig >>= 2;
3380         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
3381     }
3382     else {
3383         aSig >>= 2;
3384         bSig >>= 2;
3385     }
3386     do {
3387         alternateASig = aSig;
3388         ++q;
3389         aSig -= bSig;
3390     } while ( 0 <= (sbits64) aSig );
3391     sigMean = aSig + alternateASig;
3392     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
3393         aSig = alternateASig;
3394     }
3395     zSign = ( (sbits64) aSig < 0 );
3396     if ( zSign ) aSig = - aSig;
3397     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
3398
3399 }
3400
3401 /*----------------------------------------------------------------------------
3402 | Returns the square root of the double-precision floating-point value `a'.
3403 | The operation is performed according to the IEC/IEEE Standard for Binary
3404 | Floating-Point Arithmetic.
3405 *----------------------------------------------------------------------------*/
3406
3407 float64 float64_sqrt( float64 a STATUS_PARAM )
3408 {
3409     flag aSign;
3410     int16 aExp, zExp;
3411     bits64 aSig, zSig, doubleZSig;
3412     bits64 rem0, rem1, term0, term1;
3413     a = float64_squash_input_denormal(a STATUS_VAR);
3414
3415     aSig = extractFloat64Frac( a );
3416     aExp = extractFloat64Exp( a );
3417     aSign = extractFloat64Sign( a );
3418     if ( aExp == 0x7FF ) {
3419         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
3420         if ( ! aSign ) return a;
3421         float_raise( float_flag_invalid STATUS_VAR);
3422         return float64_default_nan;
3423     }
3424     if ( aSign ) {
3425         if ( ( aExp | aSig ) == 0 ) return a;
3426         float_raise( float_flag_invalid STATUS_VAR);
3427         return float64_default_nan;
3428     }
3429     if ( aExp == 0 ) {
3430         if ( aSig == 0 ) return float64_zero;
3431         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3432     }
3433     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
3434     aSig |= LIT64( 0x0010000000000000 );
3435     zSig = estimateSqrt32( aExp, aSig>>21 );
3436     aSig <<= 9 - ( aExp & 1 );
3437     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
3438     if ( ( zSig & 0x1FF ) <= 5 ) {
3439         doubleZSig = zSig<<1;
3440         mul64To128( zSig, zSig, &term0, &term1 );
3441         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
3442         while ( (sbits64) rem0 < 0 ) {
3443             --zSig;
3444             doubleZSig -= 2;
3445             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
3446         }
3447         zSig |= ( ( rem0 | rem1 ) != 0 );
3448     }
3449     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
3450
3451 }
3452
3453 /*----------------------------------------------------------------------------
3454 | Returns the binary log of the double-precision floating-point value `a'.
3455 | The operation is performed according to the IEC/IEEE Standard for Binary
3456 | Floating-Point Arithmetic.
3457 *----------------------------------------------------------------------------*/
3458 float64 float64_log2( float64 a STATUS_PARAM )
3459 {
3460     flag aSign, zSign;
3461     int16 aExp;
3462     bits64 aSig, aSig0, aSig1, zSig, i;
3463     a = float64_squash_input_denormal(a STATUS_VAR);
3464
3465     aSig = extractFloat64Frac( a );
3466     aExp = extractFloat64Exp( a );
3467     aSign = extractFloat64Sign( a );
3468
3469     if ( aExp == 0 ) {
3470         if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
3471         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3472     }
3473     if ( aSign ) {
3474         float_raise( float_flag_invalid STATUS_VAR);
3475         return float64_default_nan;
3476     }
3477     if ( aExp == 0x7FF ) {
3478         if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
3479         return a;
3480     }
3481
3482     aExp -= 0x3FF;
3483     aSig |= LIT64( 0x0010000000000000 );
3484     zSign = aExp < 0;
3485     zSig = (bits64)aExp << 52;
3486     for (i = 1LL << 51; i > 0; i >>= 1) {
3487         mul64To128( aSig, aSig, &aSig0, &aSig1 );
3488         aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
3489         if ( aSig & LIT64( 0x0020000000000000 ) ) {
3490             aSig >>= 1;
3491             zSig |= i;
3492         }
3493     }
3494
3495     if ( zSign )
3496         zSig = -zSig;
3497     return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
3498 }
3499
3500 /*----------------------------------------------------------------------------
3501 | Returns 1 if the double-precision floating-point value `a' is equal to the
3502 | corresponding value `b', and 0 otherwise.  The comparison is performed
3503 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3504 *----------------------------------------------------------------------------*/
3505
3506 int float64_eq( float64 a, float64 b STATUS_PARAM )
3507 {
3508     bits64 av, bv;
3509     a = float64_squash_input_denormal(a STATUS_VAR);
3510     b = float64_squash_input_denormal(b STATUS_VAR);
3511
3512     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3513          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3514        ) {
3515         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3516             float_raise( float_flag_invalid STATUS_VAR);
3517         }
3518         return 0;
3519     }
3520     av = float64_val(a);
3521     bv = float64_val(b);
3522     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3523
3524 }
3525
3526 /*----------------------------------------------------------------------------
3527 | Returns 1 if the double-precision floating-point value `a' is less than or
3528 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3529 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3530 | Arithmetic.
3531 *----------------------------------------------------------------------------*/
3532
3533 int float64_le( float64 a, float64 b STATUS_PARAM )
3534 {
3535     flag aSign, bSign;
3536     bits64 av, bv;
3537     a = float64_squash_input_denormal(a STATUS_VAR);
3538     b = float64_squash_input_denormal(b STATUS_VAR);
3539
3540     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3541          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3542        ) {
3543         float_raise( float_flag_invalid STATUS_VAR);
3544         return 0;
3545     }
3546     aSign = extractFloat64Sign( a );
3547     bSign = extractFloat64Sign( b );
3548     av = float64_val(a);
3549     bv = float64_val(b);
3550     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3551     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3552
3553 }
3554
3555 /*----------------------------------------------------------------------------
3556 | Returns 1 if the double-precision floating-point value `a' is less than
3557 | the corresponding value `b', and 0 otherwise.  The comparison is performed
3558 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3559 *----------------------------------------------------------------------------*/
3560
3561 int float64_lt( float64 a, float64 b STATUS_PARAM )
3562 {
3563     flag aSign, bSign;
3564     bits64 av, bv;
3565
3566     a = float64_squash_input_denormal(a STATUS_VAR);
3567     b = float64_squash_input_denormal(b STATUS_VAR);
3568     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3569          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3570        ) {
3571         float_raise( float_flag_invalid STATUS_VAR);
3572         return 0;
3573     }
3574     aSign = extractFloat64Sign( a );
3575     bSign = extractFloat64Sign( b );
3576     av = float64_val(a);
3577     bv = float64_val(b);
3578     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3579     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3580
3581 }
3582
3583 /*----------------------------------------------------------------------------
3584 | Returns 1 if the double-precision floating-point value `a' is equal to the
3585 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3586 | if either operand is a NaN.  Otherwise, the comparison is performed
3587 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3588 *----------------------------------------------------------------------------*/
3589
3590 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
3591 {
3592     bits64 av, bv;
3593     a = float64_squash_input_denormal(a STATUS_VAR);
3594     b = float64_squash_input_denormal(b STATUS_VAR);
3595
3596     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3597          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3598        ) {
3599         float_raise( float_flag_invalid STATUS_VAR);
3600         return 0;
3601     }
3602     av = float64_val(a);
3603     bv = float64_val(b);
3604     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3605
3606 }
3607
3608 /*----------------------------------------------------------------------------
3609 | Returns 1 if the double-precision floating-point value `a' is less than or
3610 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3611 | cause an exception.  Otherwise, the comparison is performed according to the
3612 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3613 *----------------------------------------------------------------------------*/
3614
3615 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
3616 {
3617     flag aSign, bSign;
3618     bits64 av, bv;
3619     a = float64_squash_input_denormal(a STATUS_VAR);
3620     b = float64_squash_input_denormal(b STATUS_VAR);
3621
3622     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3623          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3624        ) {
3625         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3626             float_raise( float_flag_invalid STATUS_VAR);
3627         }
3628         return 0;
3629     }
3630     aSign = extractFloat64Sign( a );
3631     bSign = extractFloat64Sign( b );
3632     av = float64_val(a);
3633     bv = float64_val(b);
3634     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
3635     return ( av == bv ) || ( aSign ^ ( av < bv ) );
3636
3637 }
3638
3639 /*----------------------------------------------------------------------------
3640 | Returns 1 if the double-precision floating-point value `a' is less than
3641 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3642 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3643 | Standard for Binary Floating-Point Arithmetic.
3644 *----------------------------------------------------------------------------*/
3645
3646 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
3647 {
3648     flag aSign, bSign;
3649     bits64 av, bv;
3650     a = float64_squash_input_denormal(a STATUS_VAR);
3651     b = float64_squash_input_denormal(b STATUS_VAR);
3652
3653     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3654          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3655        ) {
3656         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3657             float_raise( float_flag_invalid STATUS_VAR);
3658         }
3659         return 0;
3660     }
3661     aSign = extractFloat64Sign( a );
3662     bSign = extractFloat64Sign( b );
3663     av = float64_val(a);
3664     bv = float64_val(b);
3665     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
3666     return ( av != bv ) && ( aSign ^ ( av < bv ) );
3667
3668 }
3669
3670 #ifdef FLOATX80
3671
3672 /*----------------------------------------------------------------------------
3673 | Returns the result of converting the extended double-precision floating-
3674 | point value `a' to the 32-bit two's complement integer format.  The
3675 | conversion is performed according to the IEC/IEEE Standard for Binary
3676 | Floating-Point Arithmetic---which means in particular that the conversion
3677 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3678 | largest positive integer is returned.  Otherwise, if the conversion
3679 | overflows, the largest integer with the same sign as `a' is returned.
3680 *----------------------------------------------------------------------------*/
3681
3682 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
3683 {
3684     flag aSign;
3685     int32 aExp, shiftCount;
3686     bits64 aSig;
3687
3688     aSig = extractFloatx80Frac( a );
3689     aExp = extractFloatx80Exp( a );
3690     aSign = extractFloatx80Sign( a );
3691     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3692     shiftCount = 0x4037 - aExp;
3693     if ( shiftCount <= 0 ) shiftCount = 1;
3694     shift64RightJamming( aSig, shiftCount, &aSig );
3695     return roundAndPackInt32( aSign, aSig STATUS_VAR );
3696
3697 }
3698
3699 /*----------------------------------------------------------------------------
3700 | Returns the result of converting the extended double-precision floating-
3701 | point value `a' to the 32-bit two's complement integer format.  The
3702 | conversion is performed according to the IEC/IEEE Standard for Binary
3703 | Floating-Point Arithmetic, except that the conversion is always rounded
3704 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3705 | Otherwise, if the conversion overflows, the largest integer with the same
3706 | sign as `a' is returned.
3707 *----------------------------------------------------------------------------*/
3708
3709 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
3710 {
3711     flag aSign;
3712     int32 aExp, shiftCount;
3713     bits64 aSig, savedASig;
3714     int32 z;
3715
3716     aSig = extractFloatx80Frac( a );
3717     aExp = extractFloatx80Exp( a );
3718     aSign = extractFloatx80Sign( a );
3719     if ( 0x401E < aExp ) {
3720         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3721         goto invalid;
3722     }
3723     else if ( aExp < 0x3FFF ) {
3724         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3725         return 0;
3726     }
3727     shiftCount = 0x403E - aExp;
3728     savedASig = aSig;
3729     aSig >>= shiftCount;
3730     z = aSig;
3731     if ( aSign ) z = - z;
3732     if ( ( z < 0 ) ^ aSign ) {
3733  invalid:
3734         float_raise( float_flag_invalid STATUS_VAR);
3735         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3736     }
3737     if ( ( aSig<<shiftCount ) != savedASig ) {
3738         STATUS(float_exception_flags) |= float_flag_inexact;
3739     }
3740     return z;
3741
3742 }
3743
3744 /*----------------------------------------------------------------------------
3745 | Returns the result of converting the extended double-precision floating-
3746 | point value `a' to the 64-bit two's complement integer format.  The
3747 | conversion is performed according to the IEC/IEEE Standard for Binary
3748 | Floating-Point Arithmetic---which means in particular that the conversion
3749 | is rounded according to the current rounding mode.  If `a' is a NaN,
3750 | the largest positive integer is returned.  Otherwise, if the conversion
3751 | overflows, the largest integer with the same sign as `a' is returned.
3752 *----------------------------------------------------------------------------*/
3753
3754 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
3755 {
3756     flag aSign;
3757     int32 aExp, shiftCount;
3758     bits64 aSig, aSigExtra;
3759
3760     aSig = extractFloatx80Frac( a );
3761     aExp = extractFloatx80Exp( a );
3762     aSign = extractFloatx80Sign( a );
3763     shiftCount = 0x403E - aExp;
3764     if ( shiftCount <= 0 ) {
3765         if ( shiftCount ) {
3766             float_raise( float_flag_invalid STATUS_VAR);
3767             if (    ! aSign
3768                  || (    ( aExp == 0x7FFF )
3769                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3770                ) {
3771                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3772             }
3773             return (sbits64) LIT64( 0x8000000000000000 );
3774         }
3775         aSigExtra = 0;
3776     }
3777     else {
3778         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3779     }
3780     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3781
3782 }
3783
3784 /*----------------------------------------------------------------------------
3785 | Returns the result of converting the extended double-precision floating-
3786 | point value `a' to the 64-bit two's complement integer format.  The
3787 | conversion is performed according to the IEC/IEEE Standard for Binary
3788 | Floating-Point Arithmetic, except that the conversion is always rounded
3789 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3790 | Otherwise, if the conversion overflows, the largest integer with the same
3791 | sign as `a' is returned.
3792 *----------------------------------------------------------------------------*/
3793
3794 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
3795 {
3796     flag aSign;
3797     int32 aExp, shiftCount;
3798     bits64 aSig;
3799     int64 z;
3800
3801     aSig = extractFloatx80Frac( a );
3802     aExp = extractFloatx80Exp( a );
3803     aSign = extractFloatx80Sign( a );
3804     shiftCount = aExp - 0x403E;
3805     if ( 0 <= shiftCount ) {
3806         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3807         if ( ( a.high != 0xC03E ) || aSig ) {
3808             float_raise( float_flag_invalid STATUS_VAR);
3809             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3810                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3811             }
3812         }
3813         return (sbits64) LIT64( 0x8000000000000000 );
3814     }
3815     else if ( aExp < 0x3FFF ) {
3816         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3817         return 0;
3818     }
3819     z = aSig>>( - shiftCount );
3820     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3821         STATUS(float_exception_flags) |= float_flag_inexact;
3822     }
3823     if ( aSign ) z = - z;
3824     return z;
3825
3826 }
3827
3828 /*----------------------------------------------------------------------------
3829 | Returns the result of converting the extended double-precision floating-
3830 | point value `a' to the single-precision floating-point format.  The
3831 | conversion is performed according to the IEC/IEEE Standard for Binary
3832 | Floating-Point Arithmetic.
3833 *----------------------------------------------------------------------------*/
3834
3835 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
3836 {
3837     flag aSign;
3838     int32 aExp;
3839     bits64 aSig;
3840
3841     aSig = extractFloatx80Frac( a );
3842     aExp = extractFloatx80Exp( a );
3843     aSign = extractFloatx80Sign( a );
3844     if ( aExp == 0x7FFF ) {
3845         if ( (bits64) ( aSig<<1 ) ) {
3846             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
3847         }
3848         return packFloat32( aSign, 0xFF, 0 );
3849     }
3850     shift64RightJamming( aSig, 33, &aSig );
3851     if ( aExp || aSig ) aExp -= 0x3F81;
3852     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
3853
3854 }
3855
3856 /*----------------------------------------------------------------------------
3857 | Returns the result of converting the extended double-precision floating-
3858 | point value `a' to the double-precision floating-point format.  The
3859 | conversion is performed according to the IEC/IEEE Standard for Binary
3860 | Floating-Point Arithmetic.
3861 *----------------------------------------------------------------------------*/
3862
3863 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
3864 {
3865     flag aSign;
3866     int32 aExp;
3867     bits64 aSig, zSig;
3868
3869     aSig = extractFloatx80Frac( a );
3870     aExp = extractFloatx80Exp( a );
3871     aSign = extractFloatx80Sign( a );
3872     if ( aExp == 0x7FFF ) {
3873         if ( (bits64) ( aSig<<1 ) ) {
3874             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
3875         }
3876         return packFloat64( aSign, 0x7FF, 0 );
3877     }
3878     shift64RightJamming( aSig, 1, &zSig );
3879     if ( aExp || aSig ) aExp -= 0x3C01;
3880     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
3881
3882 }
3883
3884 #ifdef FLOAT128
3885
3886 /*----------------------------------------------------------------------------
3887 | Returns the result of converting the extended double-precision floating-
3888 | point value `a' to the quadruple-precision floating-point format.  The
3889 | conversion is performed according to the IEC/IEEE Standard for Binary
3890 | Floating-Point Arithmetic.
3891 *----------------------------------------------------------------------------*/
3892
3893 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
3894 {
3895     flag aSign;
3896     int16 aExp;
3897     bits64 aSig, zSig0, zSig1;
3898
3899     aSig = extractFloatx80Frac( a );
3900     aExp = extractFloatx80Exp( a );
3901     aSign = extractFloatx80Sign( a );
3902     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3903         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
3904     }
3905     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3906     return packFloat128( aSign, aExp, zSig0, zSig1 );
3907
3908 }
3909
3910 #endif
3911
3912 /*----------------------------------------------------------------------------
3913 | Rounds the extended double-precision floating-point value `a' to an integer,
3914 | and returns the result as an extended quadruple-precision floating-point
3915 | value.  The operation is performed according to the IEC/IEEE Standard for
3916 | Binary Floating-Point Arithmetic.
3917 *----------------------------------------------------------------------------*/
3918
3919 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
3920 {
3921     flag aSign;
3922     int32 aExp;
3923     bits64 lastBitMask, roundBitsMask;
3924     int8 roundingMode;
3925     floatx80 z;
3926
3927     aExp = extractFloatx80Exp( a );
3928     if ( 0x403E <= aExp ) {
3929         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3930             return propagateFloatx80NaN( a, a STATUS_VAR );
3931         }
3932         return a;
3933     }
3934     if ( aExp < 0x3FFF ) {
3935         if (    ( aExp == 0 )
3936              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3937             return a;
3938         }
3939         STATUS(float_exception_flags) |= float_flag_inexact;
3940         aSign = extractFloatx80Sign( a );
3941         switch ( STATUS(float_rounding_mode) ) {
3942          case float_round_nearest_even:
3943             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3944                ) {
3945                 return
3946                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3947             }
3948             break;
3949          case float_round_down:
3950             return
3951                   aSign ?
3952                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3953                 : packFloatx80( 0, 0, 0 );
3954          case float_round_up:
3955             return
3956                   aSign ? packFloatx80( 1, 0, 0 )
3957                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3958         }
3959         return packFloatx80( aSign, 0, 0 );
3960     }
3961     lastBitMask = 1;
3962     lastBitMask <<= 0x403E - aExp;
3963     roundBitsMask = lastBitMask - 1;
3964     z = a;
3965     roundingMode = STATUS(float_rounding_mode);
3966     if ( roundingMode == float_round_nearest_even ) {
3967         z.low += lastBitMask>>1;
3968         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3969     }
3970     else if ( roundingMode != float_round_to_zero ) {
3971         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3972             z.low += roundBitsMask;
3973         }
3974     }
3975     z.low &= ~ roundBitsMask;
3976     if ( z.low == 0 ) {
3977         ++z.high;
3978         z.low = LIT64( 0x8000000000000000 );
3979     }
3980     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
3981     return z;
3982
3983 }
3984
3985 /*----------------------------------------------------------------------------
3986 | Returns the result of adding the absolute values of the extended double-
3987 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3988 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3989 | The addition is performed according to the IEC/IEEE Standard for Binary
3990 | Floating-Point Arithmetic.
3991 *----------------------------------------------------------------------------*/
3992
3993 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
3994 {
3995     int32 aExp, bExp, zExp;
3996     bits64 aSig, bSig, zSig0, zSig1;
3997     int32 expDiff;
3998
3999     aSig = extractFloatx80Frac( a );
4000     aExp = extractFloatx80Exp( a );
4001     bSig = extractFloatx80Frac( b );
4002     bExp = extractFloatx80Exp( b );
4003     expDiff = aExp - bExp;
4004     if ( 0 < expDiff ) {
4005         if ( aExp == 0x7FFF ) {
4006             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4007             return a;
4008         }
4009         if ( bExp == 0 ) --expDiff;
4010         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4011         zExp = aExp;
4012     }
4013     else if ( expDiff < 0 ) {
4014         if ( bExp == 0x7FFF ) {
4015             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4016             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4017         }
4018         if ( aExp == 0 ) ++expDiff;
4019         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4020         zExp = bExp;
4021     }
4022     else {
4023         if ( aExp == 0x7FFF ) {
4024             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4025                 return propagateFloatx80NaN( a, b STATUS_VAR );
4026             }
4027             return a;
4028         }
4029         zSig1 = 0;
4030         zSig0 = aSig + bSig;
4031         if ( aExp == 0 ) {
4032             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4033             goto roundAndPack;
4034         }
4035         zExp = aExp;
4036         goto shiftRight1;
4037     }
4038     zSig0 = aSig + bSig;
4039     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
4040  shiftRight1:
4041     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
4042     zSig0 |= LIT64( 0x8000000000000000 );
4043     ++zExp;
4044  roundAndPack:
4045     return
4046         roundAndPackFloatx80(
4047             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4048
4049 }
4050
4051 /*----------------------------------------------------------------------------
4052 | Returns the result of subtracting the absolute values of the extended
4053 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
4054 | difference is negated before being returned.  `zSign' is ignored if the
4055 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4056 | Standard for Binary Floating-Point Arithmetic.
4057 *----------------------------------------------------------------------------*/
4058
4059 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
4060 {
4061     int32 aExp, bExp, zExp;
4062     bits64 aSig, bSig, zSig0, zSig1;
4063     int32 expDiff;
4064     floatx80 z;
4065
4066     aSig = extractFloatx80Frac( a );
4067     aExp = extractFloatx80Exp( a );
4068     bSig = extractFloatx80Frac( b );
4069     bExp = extractFloatx80Exp( b );
4070     expDiff = aExp - bExp;
4071     if ( 0 < expDiff ) goto aExpBigger;
4072     if ( expDiff < 0 ) goto bExpBigger;
4073     if ( aExp == 0x7FFF ) {
4074         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
4075             return propagateFloatx80NaN( a, b STATUS_VAR );
4076         }
4077         float_raise( float_flag_invalid STATUS_VAR);
4078         z.low = floatx80_default_nan_low;
4079         z.high = floatx80_default_nan_high;
4080         return z;
4081     }
4082     if ( aExp == 0 ) {
4083         aExp = 1;
4084         bExp = 1;
4085     }
4086     zSig1 = 0;
4087     if ( bSig < aSig ) goto aBigger;
4088     if ( aSig < bSig ) goto bBigger;
4089     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
4090  bExpBigger:
4091     if ( bExp == 0x7FFF ) {
4092         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4093         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4094     }
4095     if ( aExp == 0 ) ++expDiff;
4096     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4097  bBigger:
4098     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
4099     zExp = bExp;
4100     zSign ^= 1;
4101     goto normalizeRoundAndPack;
4102  aExpBigger:
4103     if ( aExp == 0x7FFF ) {
4104         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4105         return a;
4106     }
4107     if ( bExp == 0 ) --expDiff;
4108     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4109  aBigger:
4110     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
4111     zExp = aExp;
4112  normalizeRoundAndPack:
4113     return
4114         normalizeRoundAndPackFloatx80(
4115             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4116
4117 }
4118
4119 /*----------------------------------------------------------------------------
4120 | Returns the result of adding the extended double-precision floating-point
4121 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4122 | Standard for Binary Floating-Point Arithmetic.
4123 *----------------------------------------------------------------------------*/
4124
4125 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
4126 {
4127     flag aSign, bSign;
4128
4129     aSign = extractFloatx80Sign( a );
4130     bSign = extractFloatx80Sign( b );
4131     if ( aSign == bSign ) {
4132         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4133     }
4134     else {
4135         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4136     }
4137
4138 }
4139
4140 /*----------------------------------------------------------------------------
4141 | Returns the result of subtracting the extended double-precision floating-
4142 | point values `a' and `b'.  The operation is performed according to the
4143 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4144 *----------------------------------------------------------------------------*/
4145
4146 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
4147 {
4148     flag aSign, bSign;
4149
4150     aSign = extractFloatx80Sign( a );
4151     bSign = extractFloatx80Sign( b );
4152     if ( aSign == bSign ) {
4153         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
4154     }
4155     else {
4156         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
4157     }
4158
4159 }
4160
4161 /*----------------------------------------------------------------------------
4162 | Returns the result of multiplying the extended double-precision floating-
4163 | point values `a' and `b'.  The operation is performed according to the
4164 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4165 *----------------------------------------------------------------------------*/
4166
4167 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
4168 {
4169     flag aSign, bSign, zSign;
4170     int32 aExp, bExp, zExp;
4171     bits64 aSig, bSig, zSig0, zSig1;
4172     floatx80 z;
4173
4174     aSig = extractFloatx80Frac( a );
4175     aExp = extractFloatx80Exp( a );
4176     aSign = extractFloatx80Sign( a );
4177     bSig = extractFloatx80Frac( b );
4178     bExp = extractFloatx80Exp( b );
4179     bSign = extractFloatx80Sign( b );
4180     zSign = aSign ^ bSign;
4181     if ( aExp == 0x7FFF ) {
4182         if (    (bits64) ( aSig<<1 )
4183              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4184             return propagateFloatx80NaN( a, b STATUS_VAR );
4185         }
4186         if ( ( bExp | bSig ) == 0 ) goto invalid;
4187         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4188     }
4189     if ( bExp == 0x7FFF ) {
4190         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4191         if ( ( aExp | aSig ) == 0 ) {
4192  invalid:
4193             float_raise( float_flag_invalid STATUS_VAR);
4194             z.low = floatx80_default_nan_low;
4195             z.high = floatx80_default_nan_high;
4196             return z;
4197         }
4198         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4199     }
4200     if ( aExp == 0 ) {
4201         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4202         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4203     }
4204     if ( bExp == 0 ) {
4205         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
4206         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4207     }
4208     zExp = aExp + bExp - 0x3FFE;
4209     mul64To128( aSig, bSig, &zSig0, &zSig1 );
4210     if ( 0 < (sbits64) zSig0 ) {
4211         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
4212         --zExp;
4213     }
4214     return
4215         roundAndPackFloatx80(
4216             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4217
4218 }
4219
4220 /*----------------------------------------------------------------------------
4221 | Returns the result of dividing the extended double-precision floating-point
4222 | value `a' by the corresponding value `b'.  The operation is performed
4223 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4224 *----------------------------------------------------------------------------*/
4225
4226 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
4227 {
4228     flag aSign, bSign, zSign;
4229     int32 aExp, bExp, zExp;
4230     bits64 aSig, bSig, zSig0, zSig1;
4231     bits64 rem0, rem1, rem2, term0, term1, term2;
4232     floatx80 z;
4233
4234     aSig = extractFloatx80Frac( a );
4235     aExp = extractFloatx80Exp( a );
4236     aSign = extractFloatx80Sign( a );
4237     bSig = extractFloatx80Frac( b );
4238     bExp = extractFloatx80Exp( b );
4239     bSign = extractFloatx80Sign( b );
4240     zSign = aSign ^ bSign;
4241     if ( aExp == 0x7FFF ) {
4242         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4243         if ( bExp == 0x7FFF ) {
4244             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4245             goto invalid;
4246         }
4247         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4248     }
4249     if ( bExp == 0x7FFF ) {
4250         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4251         return packFloatx80( zSign, 0, 0 );
4252     }
4253     if ( bExp == 0 ) {
4254         if ( bSig == 0 ) {
4255             if ( ( aExp | aSig ) == 0 ) {
4256  invalid:
4257                 float_raise( float_flag_invalid STATUS_VAR);
4258                 z.low = floatx80_default_nan_low;
4259                 z.high = floatx80_default_nan_high;
4260                 return z;
4261             }
4262             float_raise( float_flag_divbyzero STATUS_VAR);
4263             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4264         }
4265         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4266     }
4267     if ( aExp == 0 ) {
4268         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
4269         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
4270     }
4271     zExp = aExp - bExp + 0x3FFE;
4272     rem1 = 0;
4273     if ( bSig <= aSig ) {
4274         shift128Right( aSig, 0, 1, &aSig, &rem1 );
4275         ++zExp;
4276     }
4277     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
4278     mul64To128( bSig, zSig0, &term0, &term1 );
4279     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
4280     while ( (sbits64) rem0 < 0 ) {
4281         --zSig0;
4282         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4283     }
4284     zSig1 = estimateDiv128To64( rem1, 0, bSig );
4285     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
4286         mul64To128( bSig, zSig1, &term1, &term2 );
4287         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4288         while ( (sbits64) rem1 < 0 ) {
4289             --zSig1;
4290             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
4291         }
4292         zSig1 |= ( ( rem1 | rem2 ) != 0 );
4293     }
4294     return
4295         roundAndPackFloatx80(
4296             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
4297
4298 }
4299
4300 /*----------------------------------------------------------------------------
4301 | Returns the remainder of the extended double-precision floating-point value
4302 | `a' with respect to the corresponding value `b'.  The operation is performed
4303 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4304 *----------------------------------------------------------------------------*/
4305
4306 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
4307 {
4308     flag aSign, zSign;
4309     int32 aExp, bExp, expDiff;
4310     bits64 aSig0, aSig1, bSig;
4311     bits64 q, term0, term1, alternateASig0, alternateASig1;
4312     floatx80 z;
4313
4314     aSig0 = extractFloatx80Frac( a );
4315     aExp = extractFloatx80Exp( a );
4316     aSign = extractFloatx80Sign( a );
4317     bSig = extractFloatx80Frac( b );
4318     bExp = extractFloatx80Exp( b );
4319     if ( aExp == 0x7FFF ) {
4320         if (    (bits64) ( aSig0<<1 )
4321              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
4322             return propagateFloatx80NaN( a, b STATUS_VAR );
4323         }
4324         goto invalid;
4325     }
4326     if ( bExp == 0x7FFF ) {
4327         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
4328         return a;
4329     }
4330     if ( bExp == 0 ) {
4331         if ( bSig == 0 ) {
4332  invalid:
4333             float_raise( float_flag_invalid STATUS_VAR);
4334             z.low = floatx80_default_nan_low;
4335             z.high = floatx80_default_nan_high;
4336             return z;
4337         }
4338         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
4339     }
4340     if ( aExp == 0 ) {
4341         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
4342         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4343     }
4344     bSig |= LIT64( 0x8000000000000000 );
4345     zSign = aSign;
4346     expDiff = aExp - bExp;
4347     aSig1 = 0;
4348     if ( expDiff < 0 ) {
4349         if ( expDiff < -1 ) return a;
4350         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
4351         expDiff = 0;
4352     }
4353     q = ( bSig <= aSig0 );
4354     if ( q ) aSig0 -= bSig;
4355     expDiff -= 64;
4356     while ( 0 < expDiff ) {
4357         q = estimateDiv128To64( aSig0, aSig1, bSig );
4358         q = ( 2 < q ) ? q - 2 : 0;
4359         mul64To128( bSig, q, &term0, &term1 );
4360         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4361         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
4362         expDiff -= 62;
4363     }
4364     expDiff += 64;
4365     if ( 0 < expDiff ) {
4366         q = estimateDiv128To64( aSig0, aSig1, bSig );
4367         q = ( 2 < q ) ? q - 2 : 0;
4368         q >>= 64 - expDiff;
4369         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
4370         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4371         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
4372         while ( le128( term0, term1, aSig0, aSig1 ) ) {
4373             ++q;
4374             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
4375         }
4376     }
4377     else {
4378         term1 = 0;
4379         term0 = bSig;
4380     }
4381     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
4382     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
4383          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
4384               && ( q & 1 ) )
4385        ) {
4386         aSig0 = alternateASig0;
4387         aSig1 = alternateASig1;
4388         zSign = ! zSign;
4389     }
4390     return
4391         normalizeRoundAndPackFloatx80(
4392             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
4393
4394 }
4395
4396 /*----------------------------------------------------------------------------
4397 | Returns the square root of the extended double-precision floating-point
4398 | value `a'.  The operation is performed according to the IEC/IEEE Standard
4399 | for Binary Floating-Point Arithmetic.
4400 *----------------------------------------------------------------------------*/
4401
4402 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
4403 {
4404     flag aSign;
4405     int32 aExp, zExp;
4406     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
4407     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4408     floatx80 z;
4409
4410     aSig0 = extractFloatx80Frac( a );
4411     aExp = extractFloatx80Exp( a );
4412     aSign = extractFloatx80Sign( a );
4413     if ( aExp == 0x7FFF ) {
4414         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
4415         if ( ! aSign ) return a;
4416         goto invalid;
4417     }
4418     if ( aSign ) {
4419         if ( ( aExp | aSig0 ) == 0 ) return a;
4420  invalid:
4421         float_raise( float_flag_invalid STATUS_VAR);
4422         z.low = floatx80_default_nan_low;
4423         z.high = floatx80_default_nan_high;
4424         return z;
4425     }
4426     if ( aExp == 0 ) {
4427         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
4428         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
4429     }
4430     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
4431     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
4432     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
4433     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4434     doubleZSig0 = zSig0<<1;
4435     mul64To128( zSig0, zSig0, &term0, &term1 );
4436     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4437     while ( (sbits64) rem0 < 0 ) {
4438         --zSig0;
4439         doubleZSig0 -= 2;
4440         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4441     }
4442     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4443     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4444         if ( zSig1 == 0 ) zSig1 = 1;
4445         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4446         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4447         mul64To128( zSig1, zSig1, &term2, &term3 );
4448         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4449         while ( (sbits64) rem1 < 0 ) {
4450             --zSig1;
4451             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4452             term3 |= 1;
4453             term2 |= doubleZSig0;
4454             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4455         }
4456         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4457     }
4458     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
4459     zSig0 |= doubleZSig0;
4460     return
4461         roundAndPackFloatx80(
4462             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
4463
4464 }
4465
4466 /*----------------------------------------------------------------------------
4467 | Returns 1 if the extended double-precision floating-point value `a' is
4468 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
4469 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4470 | Arithmetic.
4471 *----------------------------------------------------------------------------*/
4472
4473 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
4474 {
4475
4476     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4477               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4478          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4479               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4480        ) {
4481         if (    floatx80_is_signaling_nan( a )
4482              || floatx80_is_signaling_nan( b ) ) {
4483             float_raise( float_flag_invalid STATUS_VAR);
4484         }
4485         return 0;
4486     }
4487     return
4488            ( a.low == b.low )
4489         && (    ( a.high == b.high )
4490              || (    ( a.low == 0 )
4491                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4492            );
4493
4494 }
4495
4496 /*----------------------------------------------------------------------------
4497 | Returns 1 if the extended double-precision floating-point value `a' is
4498 | less than or equal to the corresponding value `b', and 0 otherwise.  The
4499 | comparison is performed according to the IEC/IEEE Standard for Binary
4500 | Floating-Point Arithmetic.
4501 *----------------------------------------------------------------------------*/
4502
4503 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
4504 {
4505     flag aSign, bSign;
4506
4507     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4508               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4509          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4510               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4511        ) {
4512         float_raise( float_flag_invalid STATUS_VAR);
4513         return 0;
4514     }
4515     aSign = extractFloatx80Sign( a );
4516     bSign = extractFloatx80Sign( b );
4517     if ( aSign != bSign ) {
4518         return
4519                aSign
4520             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4521                  == 0 );
4522     }
4523     return
4524           aSign ? le128( b.high, b.low, a.high, a.low )
4525         : le128( a.high, a.low, b.high, b.low );
4526
4527 }
4528
4529 /*----------------------------------------------------------------------------
4530 | Returns 1 if the extended double-precision floating-point value `a' is
4531 | less than the corresponding value `b', and 0 otherwise.  The comparison
4532 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4533 | Arithmetic.
4534 *----------------------------------------------------------------------------*/
4535
4536 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
4537 {
4538     flag aSign, bSign;
4539
4540     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4541               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4542          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4543               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4544        ) {
4545         float_raise( float_flag_invalid STATUS_VAR);
4546         return 0;
4547     }
4548     aSign = extractFloatx80Sign( a );
4549     bSign = extractFloatx80Sign( b );
4550     if ( aSign != bSign ) {
4551         return
4552                aSign
4553             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4554                  != 0 );
4555     }
4556     return
4557           aSign ? lt128( b.high, b.low, a.high, a.low )
4558         : lt128( a.high, a.low, b.high, b.low );
4559
4560 }
4561
4562 /*----------------------------------------------------------------------------
4563 | Returns 1 if the extended double-precision floating-point value `a' is equal
4564 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
4565 | raised if either operand is a NaN.  Otherwise, the comparison is performed
4566 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4567 *----------------------------------------------------------------------------*/
4568
4569 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
4570 {
4571
4572     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4573               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4574          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4575               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4576        ) {
4577         float_raise( float_flag_invalid STATUS_VAR);
4578         return 0;
4579     }
4580     return
4581            ( a.low == b.low )
4582         && (    ( a.high == b.high )
4583              || (    ( a.low == 0 )
4584                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
4585            );
4586
4587 }
4588
4589 /*----------------------------------------------------------------------------
4590 | Returns 1 if the extended double-precision floating-point value `a' is less
4591 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
4592 | do not cause an exception.  Otherwise, the comparison is performed according
4593 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4594 *----------------------------------------------------------------------------*/
4595
4596 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4597 {
4598     flag aSign, bSign;
4599
4600     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4601               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4602          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4603               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4604        ) {
4605         if (    floatx80_is_signaling_nan( a )
4606              || floatx80_is_signaling_nan( b ) ) {
4607             float_raise( float_flag_invalid STATUS_VAR);
4608         }
4609         return 0;
4610     }
4611     aSign = extractFloatx80Sign( a );
4612     bSign = extractFloatx80Sign( b );
4613     if ( aSign != bSign ) {
4614         return
4615                aSign
4616             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4617                  == 0 );
4618     }
4619     return
4620           aSign ? le128( b.high, b.low, a.high, a.low )
4621         : le128( a.high, a.low, b.high, b.low );
4622
4623 }
4624
4625 /*----------------------------------------------------------------------------
4626 | Returns 1 if the extended double-precision floating-point value `a' is less
4627 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4628 | an exception.  Otherwise, the comparison is performed according to the
4629 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4630 *----------------------------------------------------------------------------*/
4631
4632 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
4633 {
4634     flag aSign, bSign;
4635
4636     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4637               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4638          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4639               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4640        ) {
4641         if (    floatx80_is_signaling_nan( a )
4642              || floatx80_is_signaling_nan( b ) ) {
4643             float_raise( float_flag_invalid STATUS_VAR);
4644         }
4645         return 0;
4646     }
4647     aSign = extractFloatx80Sign( a );
4648     bSign = extractFloatx80Sign( b );
4649     if ( aSign != bSign ) {
4650         return
4651                aSign
4652             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4653                  != 0 );
4654     }
4655     return
4656           aSign ? lt128( b.high, b.low, a.high, a.low )
4657         : lt128( a.high, a.low, b.high, b.low );
4658
4659 }
4660
4661 #endif
4662
4663 #ifdef FLOAT128
4664
4665 /*----------------------------------------------------------------------------
4666 | Returns the result of converting the quadruple-precision floating-point
4667 | value `a' to the 32-bit two's complement integer format.  The conversion
4668 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4669 | Arithmetic---which means in particular that the conversion is rounded
4670 | according to the current rounding mode.  If `a' is a NaN, the largest
4671 | positive integer is returned.  Otherwise, if the conversion overflows, the
4672 | largest integer with the same sign as `a' is returned.
4673 *----------------------------------------------------------------------------*/
4674
4675 int32 float128_to_int32( float128 a STATUS_PARAM )
4676 {
4677     flag aSign;
4678     int32 aExp, shiftCount;
4679     bits64 aSig0, aSig1;
4680
4681     aSig1 = extractFloat128Frac1( a );
4682     aSig0 = extractFloat128Frac0( a );
4683     aExp = extractFloat128Exp( a );
4684     aSign = extractFloat128Sign( a );
4685     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4686     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4687     aSig0 |= ( aSig1 != 0 );
4688     shiftCount = 0x4028 - aExp;
4689     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4690     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
4691
4692 }
4693
4694 /*----------------------------------------------------------------------------
4695 | Returns the result of converting the quadruple-precision floating-point
4696 | value `a' to the 32-bit two's complement integer format.  The conversion
4697 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4698 | Arithmetic, except that the conversion is always rounded toward zero.  If
4699 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4700 | conversion overflows, the largest integer with the same sign as `a' is
4701 | returned.
4702 *----------------------------------------------------------------------------*/
4703
4704 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
4705 {
4706     flag aSign;
4707     int32 aExp, shiftCount;
4708     bits64 aSig0, aSig1, savedASig;
4709     int32 z;
4710
4711     aSig1 = extractFloat128Frac1( a );
4712     aSig0 = extractFloat128Frac0( a );
4713     aExp = extractFloat128Exp( a );
4714     aSign = extractFloat128Sign( a );
4715     aSig0 |= ( aSig1 != 0 );
4716     if ( 0x401E < aExp ) {
4717         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4718         goto invalid;
4719     }
4720     else if ( aExp < 0x3FFF ) {
4721         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
4722         return 0;
4723     }
4724     aSig0 |= LIT64( 0x0001000000000000 );
4725     shiftCount = 0x402F - aExp;
4726     savedASig = aSig0;
4727     aSig0 >>= shiftCount;
4728     z = aSig0;
4729     if ( aSign ) z = - z;
4730     if ( ( z < 0 ) ^ aSign ) {
4731  invalid:
4732         float_raise( float_flag_invalid STATUS_VAR);
4733         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4734     }
4735     if ( ( aSig0<<shiftCount ) != savedASig ) {
4736         STATUS(float_exception_flags) |= float_flag_inexact;
4737     }
4738     return z;
4739
4740 }
4741
4742 /*----------------------------------------------------------------------------
4743 | Returns the result of converting the quadruple-precision floating-point
4744 | value `a' to the 64-bit two's complement integer format.  The conversion
4745 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4746 | Arithmetic---which means in particular that the conversion is rounded
4747 | according to the current rounding mode.  If `a' is a NaN, the largest
4748 | positive integer is returned.  Otherwise, if the conversion overflows, the
4749 | largest integer with the same sign as `a' is returned.
4750 *----------------------------------------------------------------------------*/
4751
4752 int64 float128_to_int64( float128 a STATUS_PARAM )
4753 {
4754     flag aSign;
4755     int32 aExp, shiftCount;
4756     bits64 aSig0, aSig1;
4757
4758     aSig1 = extractFloat128Frac1( a );
4759     aSig0 = extractFloat128Frac0( a );
4760     aExp = extractFloat128Exp( a );
4761     aSign = extractFloat128Sign( a );
4762     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4763     shiftCount = 0x402F - aExp;
4764     if ( shiftCount <= 0 ) {
4765         if ( 0x403E < aExp ) {
4766             float_raise( float_flag_invalid STATUS_VAR);
4767             if (    ! aSign
4768                  || (    ( aExp == 0x7FFF )
4769                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4770                     )
4771                ) {
4772                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4773             }
4774             return (sbits64) LIT64( 0x8000000000000000 );
4775         }
4776         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4777     }
4778     else {
4779         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4780     }
4781     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
4782
4783 }
4784
4785 /*----------------------------------------------------------------------------
4786 | Returns the result of converting the quadruple-precision floating-point
4787 | value `a' to the 64-bit two's complement integer format.  The conversion
4788 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4789 | Arithmetic, except that the conversion is always rounded toward zero.
4790 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4791 | the conversion overflows, the largest integer with the same sign as `a' is
4792 | returned.
4793 *----------------------------------------------------------------------------*/
4794
4795 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
4796 {
4797     flag aSign;
4798     int32 aExp, shiftCount;
4799     bits64 aSig0, aSig1;
4800     int64 z;
4801
4802     aSig1 = extractFloat128Frac1( a );
4803     aSig0 = extractFloat128Frac0( a );
4804     aExp = extractFloat128Exp( a );
4805     aSign = extractFloat128Sign( a );
4806     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4807     shiftCount = aExp - 0x402F;
4808     if ( 0 < shiftCount ) {
4809         if ( 0x403E <= aExp ) {
4810             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4811             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4812                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4813                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
4814             }
4815             else {
4816                 float_raise( float_flag_invalid STATUS_VAR);
4817                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4818                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4819                 }
4820             }
4821             return (sbits64) LIT64( 0x8000000000000000 );
4822         }
4823         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4824         if ( (bits64) ( aSig1<<shiftCount ) ) {
4825             STATUS(float_exception_flags) |= float_flag_inexact;
4826         }
4827     }
4828     else {
4829         if ( aExp < 0x3FFF ) {
4830             if ( aExp | aSig0 | aSig1 ) {
4831                 STATUS(float_exception_flags) |= float_flag_inexact;
4832             }
4833             return 0;
4834         }
4835         z = aSig0>>( - shiftCount );
4836         if (    aSig1
4837              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4838             STATUS(float_exception_flags) |= float_flag_inexact;
4839         }
4840     }
4841     if ( aSign ) z = - z;
4842     return z;
4843
4844 }
4845
4846 /*----------------------------------------------------------------------------
4847 | Returns the result of converting the quadruple-precision floating-point
4848 | value `a' to the single-precision floating-point format.  The conversion
4849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4850 | Arithmetic.
4851 *----------------------------------------------------------------------------*/
4852
4853 float32 float128_to_float32( float128 a STATUS_PARAM )
4854 {
4855     flag aSign;
4856     int32 aExp;
4857     bits64 aSig0, aSig1;
4858     bits32 zSig;
4859
4860     aSig1 = extractFloat128Frac1( a );
4861     aSig0 = extractFloat128Frac0( a );
4862     aExp = extractFloat128Exp( a );
4863     aSign = extractFloat128Sign( a );
4864     if ( aExp == 0x7FFF ) {
4865         if ( aSig0 | aSig1 ) {
4866             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
4867         }
4868         return packFloat32( aSign, 0xFF, 0 );
4869     }
4870     aSig0 |= ( aSig1 != 0 );
4871     shift64RightJamming( aSig0, 18, &aSig0 );
4872     zSig = aSig0;
4873     if ( aExp || zSig ) {
4874         zSig |= 0x40000000;
4875         aExp -= 0x3F81;
4876     }
4877     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
4878
4879 }
4880
4881 /*----------------------------------------------------------------------------
4882 | Returns the result of converting the quadruple-precision floating-point
4883 | value `a' to the double-precision floating-point format.  The conversion
4884 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885 | Arithmetic.
4886 *----------------------------------------------------------------------------*/
4887
4888 float64 float128_to_float64( float128 a STATUS_PARAM )
4889 {
4890     flag aSign;
4891     int32 aExp;
4892     bits64 aSig0, aSig1;
4893
4894     aSig1 = extractFloat128Frac1( a );
4895     aSig0 = extractFloat128Frac0( a );
4896     aExp = extractFloat128Exp( a );
4897     aSign = extractFloat128Sign( a );
4898     if ( aExp == 0x7FFF ) {
4899         if ( aSig0 | aSig1 ) {
4900             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
4901         }
4902         return packFloat64( aSign, 0x7FF, 0 );
4903     }
4904     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4905     aSig0 |= ( aSig1 != 0 );
4906     if ( aExp || aSig0 ) {
4907         aSig0 |= LIT64( 0x4000000000000000 );
4908         aExp -= 0x3C01;
4909     }
4910     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
4911
4912 }
4913
4914 #ifdef FLOATX80
4915
4916 /*----------------------------------------------------------------------------
4917 | Returns the result of converting the quadruple-precision floating-point
4918 | value `a' to the extended double-precision floating-point format.  The
4919 | conversion is performed according to the IEC/IEEE Standard for Binary
4920 | Floating-Point Arithmetic.
4921 *----------------------------------------------------------------------------*/
4922
4923 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
4924 {
4925     flag aSign;
4926     int32 aExp;
4927     bits64 aSig0, aSig1;
4928
4929     aSig1 = extractFloat128Frac1( a );
4930     aSig0 = extractFloat128Frac0( a );
4931     aExp = extractFloat128Exp( a );
4932     aSign = extractFloat128Sign( a );
4933     if ( aExp == 0x7FFF ) {
4934         if ( aSig0 | aSig1 ) {
4935             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
4936         }
4937         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4938     }
4939     if ( aExp == 0 ) {
4940         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4941         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4942     }
4943     else {
4944         aSig0 |= LIT64( 0x0001000000000000 );
4945     }
4946     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4947     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
4948
4949 }
4950
4951 #endif
4952
4953 /*----------------------------------------------------------------------------
4954 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4955 | returns the result as a quadruple-precision floating-point value.  The
4956 | operation is performed according to the IEC/IEEE Standard for Binary
4957 | Floating-Point Arithmetic.
4958 *----------------------------------------------------------------------------*/
4959
4960 float128 float128_round_to_int( float128 a STATUS_PARAM )
4961 {
4962     flag aSign;
4963     int32 aExp;
4964     bits64 lastBitMask, roundBitsMask;
4965     int8 roundingMode;
4966     float128 z;
4967
4968     aExp = extractFloat128Exp( a );
4969     if ( 0x402F <= aExp ) {
4970         if ( 0x406F <= aExp ) {
4971             if (    ( aExp == 0x7FFF )
4972                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4973                ) {
4974                 return propagateFloat128NaN( a, a STATUS_VAR );
4975             }
4976             return a;
4977         }
4978         lastBitMask = 1;
4979         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4980         roundBitsMask = lastBitMask - 1;
4981         z = a;
4982         roundingMode = STATUS(float_rounding_mode);
4983         if ( roundingMode == float_round_nearest_even ) {
4984             if ( lastBitMask ) {
4985                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4986                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4987             }
4988             else {
4989                 if ( (sbits64) z.low < 0 ) {
4990                     ++z.high;
4991                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4992                 }
4993             }
4994         }
4995         else if ( roundingMode != float_round_to_zero ) {
4996             if (   extractFloat128Sign( z )
4997                  ^ ( roundingMode == float_round_up ) ) {
4998                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4999             }
5000         }
5001         z.low &= ~ roundBitsMask;
5002     }
5003     else {
5004         if ( aExp < 0x3FFF ) {
5005             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
5006             STATUS(float_exception_flags) |= float_flag_inexact;
5007             aSign = extractFloat128Sign( a );
5008             switch ( STATUS(float_rounding_mode) ) {
5009              case float_round_nearest_even:
5010                 if (    ( aExp == 0x3FFE )
5011                      && (   extractFloat128Frac0( a )
5012                           | extractFloat128Frac1( a ) )
5013                    ) {
5014                     return packFloat128( aSign, 0x3FFF, 0, 0 );
5015                 }
5016                 break;
5017              case float_round_down:
5018                 return
5019                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
5020                     : packFloat128( 0, 0, 0, 0 );
5021              case float_round_up:
5022                 return
5023                       aSign ? packFloat128( 1, 0, 0, 0 )
5024                     : packFloat128( 0, 0x3FFF, 0, 0 );
5025             }
5026             return packFloat128( aSign, 0, 0, 0 );
5027         }
5028         lastBitMask = 1;
5029         lastBitMask <<= 0x402F - aExp;
5030         roundBitsMask = lastBitMask - 1;
5031         z.low = 0;
5032         z.high = a.high;
5033         roundingMode = STATUS(float_rounding_mode);
5034         if ( roundingMode == float_round_nearest_even ) {
5035             z.high += lastBitMask>>1;
5036             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
5037                 z.high &= ~ lastBitMask;
5038             }
5039         }
5040         else if ( roundingMode != float_round_to_zero ) {
5041             if (   extractFloat128Sign( z )
5042                  ^ ( roundingMode == float_round_up ) ) {
5043                 z.high |= ( a.low != 0 );
5044                 z.high += roundBitsMask;
5045             }
5046         }
5047         z.high &= ~ roundBitsMask;
5048     }
5049     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
5050         STATUS(float_exception_flags) |= float_flag_inexact;
5051     }
5052     return z;
5053
5054 }
5055
5056 /*----------------------------------------------------------------------------
5057 | Returns the result of adding the absolute values of the quadruple-precision
5058 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
5059 | before being returned.  `zSign' is ignored if the result is a NaN.
5060 | The addition is performed according to the IEC/IEEE Standard for Binary
5061 | Floating-Point Arithmetic.
5062 *----------------------------------------------------------------------------*/
5063
5064 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5065 {
5066     int32 aExp, bExp, zExp;
5067     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5068     int32 expDiff;
5069
5070     aSig1 = extractFloat128Frac1( a );
5071     aSig0 = extractFloat128Frac0( a );
5072     aExp = extractFloat128Exp( a );
5073     bSig1 = extractFloat128Frac1( b );
5074     bSig0 = extractFloat128Frac0( b );
5075     bExp = extractFloat128Exp( b );
5076     expDiff = aExp - bExp;
5077     if ( 0 < expDiff ) {
5078         if ( aExp == 0x7FFF ) {
5079             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5080             return a;
5081         }
5082         if ( bExp == 0 ) {
5083             --expDiff;
5084         }
5085         else {
5086             bSig0 |= LIT64( 0x0001000000000000 );
5087         }
5088         shift128ExtraRightJamming(
5089             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
5090         zExp = aExp;
5091     }
5092     else if ( expDiff < 0 ) {
5093         if ( bExp == 0x7FFF ) {
5094             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5095             return packFloat128( zSign, 0x7FFF, 0, 0 );
5096         }
5097         if ( aExp == 0 ) {
5098             ++expDiff;
5099         }
5100         else {
5101             aSig0 |= LIT64( 0x0001000000000000 );
5102         }
5103         shift128ExtraRightJamming(
5104             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
5105         zExp = bExp;
5106     }
5107     else {
5108         if ( aExp == 0x7FFF ) {
5109             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5110                 return propagateFloat128NaN( a, b STATUS_VAR );
5111             }
5112             return a;
5113         }
5114         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5115         if ( aExp == 0 ) {
5116             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
5117             return packFloat128( zSign, 0, zSig0, zSig1 );
5118         }
5119         zSig2 = 0;
5120         zSig0 |= LIT64( 0x0002000000000000 );
5121         zExp = aExp;
5122         goto shiftRight1;
5123     }
5124     aSig0 |= LIT64( 0x0001000000000000 );
5125     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5126     --zExp;
5127     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
5128     ++zExp;
5129  shiftRight1:
5130     shift128ExtraRightJamming(
5131         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5132  roundAndPack:
5133     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5134
5135 }
5136
5137 /*----------------------------------------------------------------------------
5138 | Returns the result of subtracting the absolute values of the quadruple-
5139 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
5140 | difference is negated before being returned.  `zSign' is ignored if the
5141 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
5142 | Standard for Binary Floating-Point Arithmetic.
5143 *----------------------------------------------------------------------------*/
5144
5145 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
5146 {
5147     int32 aExp, bExp, zExp;
5148     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
5149     int32 expDiff;
5150     float128 z;
5151
5152     aSig1 = extractFloat128Frac1( a );
5153     aSig0 = extractFloat128Frac0( a );
5154     aExp = extractFloat128Exp( a );
5155     bSig1 = extractFloat128Frac1( b );
5156     bSig0 = extractFloat128Frac0( b );
5157     bExp = extractFloat128Exp( b );
5158     expDiff = aExp - bExp;
5159     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5160     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
5161     if ( 0 < expDiff ) goto aExpBigger;
5162     if ( expDiff < 0 ) goto bExpBigger;
5163     if ( aExp == 0x7FFF ) {
5164         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
5165             return propagateFloat128NaN( a, b STATUS_VAR );
5166         }
5167         float_raise( float_flag_invalid STATUS_VAR);
5168         z.low = float128_default_nan_low;
5169         z.high = float128_default_nan_high;
5170         return z;
5171     }
5172     if ( aExp == 0 ) {
5173         aExp = 1;
5174         bExp = 1;
5175     }
5176     if ( bSig0 < aSig0 ) goto aBigger;
5177     if ( aSig0 < bSig0 ) goto bBigger;
5178     if ( bSig1 < aSig1 ) goto aBigger;
5179     if ( aSig1 < bSig1 ) goto bBigger;
5180     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
5181  bExpBigger:
5182     if ( bExp == 0x7FFF ) {
5183         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5184         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
5185     }
5186     if ( aExp == 0 ) {
5187         ++expDiff;
5188     }
5189     else {
5190         aSig0 |= LIT64( 0x4000000000000000 );
5191     }
5192     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5193     bSig0 |= LIT64( 0x4000000000000000 );
5194  bBigger:
5195     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
5196     zExp = bExp;
5197     zSign ^= 1;
5198     goto normalizeRoundAndPack;
5199  aExpBigger:
5200     if ( aExp == 0x7FFF ) {
5201         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5202         return a;
5203     }
5204     if ( bExp == 0 ) {
5205         --expDiff;
5206     }
5207     else {
5208         bSig0 |= LIT64( 0x4000000000000000 );
5209     }
5210     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
5211     aSig0 |= LIT64( 0x4000000000000000 );
5212  aBigger:
5213     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
5214     zExp = aExp;
5215  normalizeRoundAndPack:
5216     --zExp;
5217     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
5218
5219 }
5220
5221 /*----------------------------------------------------------------------------
5222 | Returns the result of adding the quadruple-precision floating-point values
5223 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
5224 | for Binary Floating-Point Arithmetic.
5225 *----------------------------------------------------------------------------*/
5226
5227 float128 float128_add( float128 a, float128 b STATUS_PARAM )
5228 {
5229     flag aSign, bSign;
5230
5231     aSign = extractFloat128Sign( a );
5232     bSign = extractFloat128Sign( b );
5233     if ( aSign == bSign ) {
5234         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5235     }
5236     else {
5237         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5238     }
5239
5240 }
5241
5242 /*----------------------------------------------------------------------------
5243 | Returns the result of subtracting the quadruple-precision floating-point
5244 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5245 | Standard for Binary Floating-Point Arithmetic.
5246 *----------------------------------------------------------------------------*/
5247
5248 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
5249 {
5250     flag aSign, bSign;
5251
5252     aSign = extractFloat128Sign( a );
5253     bSign = extractFloat128Sign( b );
5254     if ( aSign == bSign ) {
5255         return subFloat128Sigs( a, b, aSign STATUS_VAR );
5256     }
5257     else {
5258         return addFloat128Sigs( a, b, aSign STATUS_VAR );
5259     }
5260
5261 }
5262
5263 /*----------------------------------------------------------------------------
5264 | Returns the result of multiplying the quadruple-precision floating-point
5265 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
5266 | Standard for Binary Floating-Point Arithmetic.
5267 *----------------------------------------------------------------------------*/
5268
5269 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
5270 {
5271     flag aSign, bSign, zSign;
5272     int32 aExp, bExp, zExp;
5273     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
5274     float128 z;
5275
5276     aSig1 = extractFloat128Frac1( a );
5277     aSig0 = extractFloat128Frac0( a );
5278     aExp = extractFloat128Exp( a );
5279     aSign = extractFloat128Sign( a );
5280     bSig1 = extractFloat128Frac1( b );
5281     bSig0 = extractFloat128Frac0( b );
5282     bExp = extractFloat128Exp( b );
5283     bSign = extractFloat128Sign( b );
5284     zSign = aSign ^ bSign;
5285     if ( aExp == 0x7FFF ) {
5286         if (    ( aSig0 | aSig1 )
5287              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5288             return propagateFloat128NaN( a, b STATUS_VAR );
5289         }
5290         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
5291         return packFloat128( zSign, 0x7FFF, 0, 0 );
5292     }
5293     if ( bExp == 0x7FFF ) {
5294         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5295         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5296  invalid:
5297             float_raise( float_flag_invalid STATUS_VAR);
5298             z.low = float128_default_nan_low;
5299             z.high = float128_default_nan_high;
5300             return z;
5301         }
5302         return packFloat128( zSign, 0x7FFF, 0, 0 );
5303     }
5304     if ( aExp == 0 ) {
5305         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5306         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5307     }
5308     if ( bExp == 0 ) {
5309         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5310         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5311     }
5312     zExp = aExp + bExp - 0x4000;
5313     aSig0 |= LIT64( 0x0001000000000000 );
5314     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
5315     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
5316     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
5317     zSig2 |= ( zSig3 != 0 );
5318     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
5319         shift128ExtraRightJamming(
5320             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
5321         ++zExp;
5322     }
5323     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5324
5325 }
5326
5327 /*----------------------------------------------------------------------------
5328 | Returns the result of dividing the quadruple-precision floating-point value
5329 | `a' by the corresponding value `b'.  The operation is performed according to
5330 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5331 *----------------------------------------------------------------------------*/
5332
5333 float128 float128_div( float128 a, float128 b STATUS_PARAM )
5334 {
5335     flag aSign, bSign, zSign;
5336     int32 aExp, bExp, zExp;
5337     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
5338     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5339     float128 z;
5340
5341     aSig1 = extractFloat128Frac1( a );
5342     aSig0 = extractFloat128Frac0( a );
5343     aExp = extractFloat128Exp( a );
5344     aSign = extractFloat128Sign( a );
5345     bSig1 = extractFloat128Frac1( b );
5346     bSig0 = extractFloat128Frac0( b );
5347     bExp = extractFloat128Exp( b );
5348     bSign = extractFloat128Sign( b );
5349     zSign = aSign ^ bSign;
5350     if ( aExp == 0x7FFF ) {
5351         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5352         if ( bExp == 0x7FFF ) {
5353             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5354             goto invalid;
5355         }
5356         return packFloat128( zSign, 0x7FFF, 0, 0 );
5357     }
5358     if ( bExp == 0x7FFF ) {
5359         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5360         return packFloat128( zSign, 0, 0, 0 );
5361     }
5362     if ( bExp == 0 ) {
5363         if ( ( bSig0 | bSig1 ) == 0 ) {
5364             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
5365  invalid:
5366                 float_raise( float_flag_invalid STATUS_VAR);
5367                 z.low = float128_default_nan_low;
5368                 z.high = float128_default_nan_high;
5369                 return z;
5370             }
5371             float_raise( float_flag_divbyzero STATUS_VAR);
5372             return packFloat128( zSign, 0x7FFF, 0, 0 );
5373         }
5374         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5375     }
5376     if ( aExp == 0 ) {
5377         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
5378         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5379     }
5380     zExp = aExp - bExp + 0x3FFD;
5381     shortShift128Left(
5382         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
5383     shortShift128Left(
5384         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5385     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
5386         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
5387         ++zExp;
5388     }
5389     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
5390     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
5391     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
5392     while ( (sbits64) rem0 < 0 ) {
5393         --zSig0;
5394         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
5395     }
5396     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
5397     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
5398         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
5399         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
5400         while ( (sbits64) rem1 < 0 ) {
5401             --zSig1;
5402             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
5403         }
5404         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5405     }
5406     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
5407     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5408
5409 }
5410
5411 /*----------------------------------------------------------------------------
5412 | Returns the remainder of the quadruple-precision floating-point value `a'
5413 | with respect to the corresponding value `b'.  The operation is performed
5414 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5415 *----------------------------------------------------------------------------*/
5416
5417 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
5418 {
5419     flag aSign, zSign;
5420     int32 aExp, bExp, expDiff;
5421     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
5422     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
5423     sbits64 sigMean0;
5424     float128 z;
5425
5426     aSig1 = extractFloat128Frac1( a );
5427     aSig0 = extractFloat128Frac0( a );
5428     aExp = extractFloat128Exp( a );
5429     aSign = extractFloat128Sign( a );
5430     bSig1 = extractFloat128Frac1( b );
5431     bSig0 = extractFloat128Frac0( b );
5432     bExp = extractFloat128Exp( b );
5433     if ( aExp == 0x7FFF ) {
5434         if (    ( aSig0 | aSig1 )
5435              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
5436             return propagateFloat128NaN( a, b STATUS_VAR );
5437         }
5438         goto invalid;
5439     }
5440     if ( bExp == 0x7FFF ) {
5441         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
5442         return a;
5443     }
5444     if ( bExp == 0 ) {
5445         if ( ( bSig0 | bSig1 ) == 0 ) {
5446  invalid:
5447             float_raise( float_flag_invalid STATUS_VAR);
5448             z.low = float128_default_nan_low;
5449             z.high = float128_default_nan_high;
5450             return z;
5451         }
5452         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
5453     }
5454     if ( aExp == 0 ) {
5455         if ( ( aSig0 | aSig1 ) == 0 ) return a;
5456         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5457     }
5458     expDiff = aExp - bExp;
5459     if ( expDiff < -1 ) return a;
5460     shortShift128Left(
5461         aSig0 | LIT64( 0x0001000000000000 ),
5462         aSig1,
5463         15 - ( expDiff < 0 ),
5464         &aSig0,
5465         &aSig1
5466     );
5467     shortShift128Left(
5468         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
5469     q = le128( bSig0, bSig1, aSig0, aSig1 );
5470     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5471     expDiff -= 64;
5472     while ( 0 < expDiff ) {
5473         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5474         q = ( 4 < q ) ? q - 4 : 0;
5475         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5476         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
5477         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
5478         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
5479         expDiff -= 61;
5480     }
5481     if ( -64 < expDiff ) {
5482         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
5483         q = ( 4 < q ) ? q - 4 : 0;
5484         q >>= - expDiff;
5485         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5486         expDiff += 52;
5487         if ( expDiff < 0 ) {
5488             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
5489         }
5490         else {
5491             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
5492         }
5493         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
5494         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
5495     }
5496     else {
5497         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
5498         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
5499     }
5500     do {
5501         alternateASig0 = aSig0;
5502         alternateASig1 = aSig1;
5503         ++q;
5504         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
5505     } while ( 0 <= (sbits64) aSig0 );
5506     add128(
5507         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
5508     if (    ( sigMean0 < 0 )
5509          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
5510         aSig0 = alternateASig0;
5511         aSig1 = alternateASig1;
5512     }
5513     zSign = ( (sbits64) aSig0 < 0 );
5514     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
5515     return
5516         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
5517
5518 }
5519
5520 /*----------------------------------------------------------------------------
5521 | Returns the square root of the quadruple-precision floating-point value `a'.
5522 | The operation is performed according to the IEC/IEEE Standard for Binary
5523 | Floating-Point Arithmetic.
5524 *----------------------------------------------------------------------------*/
5525
5526 float128 float128_sqrt( float128 a STATUS_PARAM )
5527 {
5528     flag aSign;
5529     int32 aExp, zExp;
5530     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
5531     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5532     float128 z;
5533
5534     aSig1 = extractFloat128Frac1( a );
5535     aSig0 = extractFloat128Frac0( a );
5536     aExp = extractFloat128Exp( a );
5537     aSign = extractFloat128Sign( a );
5538     if ( aExp == 0x7FFF ) {
5539         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
5540         if ( ! aSign ) return a;
5541         goto invalid;
5542     }
5543     if ( aSign ) {
5544         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
5545  invalid:
5546         float_raise( float_flag_invalid STATUS_VAR);
5547         z.low = float128_default_nan_low;
5548         z.high = float128_default_nan_high;
5549         return z;
5550     }
5551     if ( aExp == 0 ) {
5552         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
5553         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5554     }
5555     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
5556     aSig0 |= LIT64( 0x0001000000000000 );
5557     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
5558     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
5559     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5560     doubleZSig0 = zSig0<<1;
5561     mul64To128( zSig0, zSig0, &term0, &term1 );
5562     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5563     while ( (sbits64) rem0 < 0 ) {
5564         --zSig0;
5565         doubleZSig0 -= 2;
5566         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5567     }
5568     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5569     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
5570         if ( zSig1 == 0 ) zSig1 = 1;
5571         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5572         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5573         mul64To128( zSig1, zSig1, &term2, &term3 );
5574         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5575         while ( (sbits64) rem1 < 0 ) {
5576             --zSig1;
5577             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5578             term3 |= 1;
5579             term2 |= doubleZSig0;
5580             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5581         }
5582         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5583     }
5584     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
5585     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
5586
5587 }
5588
5589 /*----------------------------------------------------------------------------
5590 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5591 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5592 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5593 *----------------------------------------------------------------------------*/
5594
5595 int float128_eq( float128 a, float128 b STATUS_PARAM )
5596 {
5597
5598     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5599               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5600          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5601               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5602        ) {
5603         if (    float128_is_signaling_nan( a )
5604              || float128_is_signaling_nan( b ) ) {
5605             float_raise( float_flag_invalid STATUS_VAR);
5606         }
5607         return 0;
5608     }
5609     return
5610            ( a.low == b.low )
5611         && (    ( a.high == b.high )
5612              || (    ( a.low == 0 )
5613                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5614            );
5615
5616 }
5617
5618 /*----------------------------------------------------------------------------
5619 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5620 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5621 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5622 | Arithmetic.
5623 *----------------------------------------------------------------------------*/
5624
5625 int float128_le( float128 a, float128 b STATUS_PARAM )
5626 {
5627     flag aSign, bSign;
5628
5629     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5630               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5631          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5632               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5633        ) {
5634         float_raise( float_flag_invalid STATUS_VAR);
5635         return 0;
5636     }
5637     aSign = extractFloat128Sign( a );
5638     bSign = extractFloat128Sign( b );
5639     if ( aSign != bSign ) {
5640         return
5641                aSign
5642             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5643                  == 0 );
5644     }
5645     return
5646           aSign ? le128( b.high, b.low, a.high, a.low )
5647         : le128( a.high, a.low, b.high, b.low );
5648
5649 }
5650
5651 /*----------------------------------------------------------------------------
5652 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5653 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5654 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5655 *----------------------------------------------------------------------------*/
5656
5657 int float128_lt( float128 a, float128 b STATUS_PARAM )
5658 {
5659     flag aSign, bSign;
5660
5661     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5662               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5663          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5664               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5665        ) {
5666         float_raise( float_flag_invalid STATUS_VAR);
5667         return 0;
5668     }
5669     aSign = extractFloat128Sign( a );
5670     bSign = extractFloat128Sign( b );
5671     if ( aSign != bSign ) {
5672         return
5673                aSign
5674             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5675                  != 0 );
5676     }
5677     return
5678           aSign ? lt128( b.high, b.low, a.high, a.low )
5679         : lt128( a.high, a.low, b.high, b.low );
5680
5681 }
5682
5683 /*----------------------------------------------------------------------------
5684 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5685 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5686 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5687 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5688 *----------------------------------------------------------------------------*/
5689
5690 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
5691 {
5692
5693     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5694               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5695          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5696               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5697        ) {
5698         float_raise( float_flag_invalid STATUS_VAR);
5699         return 0;
5700     }
5701     return
5702            ( a.low == b.low )
5703         && (    ( a.high == b.high )
5704              || (    ( a.low == 0 )
5705                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5706            );
5707
5708 }
5709
5710 /*----------------------------------------------------------------------------
5711 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5712 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5713 | cause an exception.  Otherwise, the comparison is performed according to the
5714 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5715 *----------------------------------------------------------------------------*/
5716
5717 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
5718 {
5719     flag aSign, bSign;
5720
5721     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5722               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5723          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5724               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5725        ) {
5726         if (    float128_is_signaling_nan( a )
5727              || float128_is_signaling_nan( b ) ) {
5728             float_raise( float_flag_invalid STATUS_VAR);
5729         }
5730         return 0;
5731     }
5732     aSign = extractFloat128Sign( a );
5733     bSign = extractFloat128Sign( b );
5734     if ( aSign != bSign ) {
5735         return
5736                aSign
5737             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5738                  == 0 );
5739     }
5740     return
5741           aSign ? le128( b.high, b.low, a.high, a.low )
5742         : le128( a.high, a.low, b.high, b.low );
5743
5744 }
5745
5746 /*----------------------------------------------------------------------------
5747 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5748 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5749 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5750 | Standard for Binary Floating-Point Arithmetic.
5751 *----------------------------------------------------------------------------*/
5752
5753 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
5754 {
5755     flag aSign, bSign;
5756
5757     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5758               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5759          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5760               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5761        ) {
5762         if (    float128_is_signaling_nan( a )
5763              || float128_is_signaling_nan( b ) ) {
5764             float_raise( float_flag_invalid STATUS_VAR);
5765         }
5766         return 0;
5767     }
5768     aSign = extractFloat128Sign( a );
5769     bSign = extractFloat128Sign( b );
5770     if ( aSign != bSign ) {
5771         return
5772                aSign
5773             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5774                  != 0 );
5775     }
5776     return
5777           aSign ? lt128( b.high, b.low, a.high, a.low )
5778         : lt128( a.high, a.low, b.high, b.low );
5779
5780 }
5781
5782 #endif
5783
5784 /* misc functions */
5785 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
5786 {
5787     return int64_to_float32(a STATUS_VAR);
5788 }
5789
5790 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
5791 {
5792     return int64_to_float64(a STATUS_VAR);
5793 }
5794
5795 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
5796 {
5797     int64_t v;
5798     unsigned int res;
5799
5800     v = float32_to_int64(a STATUS_VAR);
5801     if (v < 0) {
5802         res = 0;
5803         float_raise( float_flag_invalid STATUS_VAR);
5804     } else if (v > 0xffffffff) {
5805         res = 0xffffffff;
5806         float_raise( float_flag_invalid STATUS_VAR);
5807     } else {
5808         res = v;
5809     }
5810     return res;
5811 }
5812
5813 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
5814 {
5815     int64_t v;
5816     unsigned int res;
5817
5818     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5819     if (v < 0) {
5820         res = 0;
5821         float_raise( float_flag_invalid STATUS_VAR);
5822     } else if (v > 0xffffffff) {
5823         res = 0xffffffff;
5824         float_raise( float_flag_invalid STATUS_VAR);
5825     } else {
5826         res = v;
5827     }
5828     return res;
5829 }
5830
5831 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM )
5832 {
5833     int64_t v;
5834     unsigned int res;
5835
5836     v = float32_to_int64_round_to_zero(a STATUS_VAR);
5837     if (v < 0) {
5838         res = 0;
5839         float_raise( float_flag_invalid STATUS_VAR);
5840     } else if (v > 0xffff) {
5841         res = 0xffff;
5842         float_raise( float_flag_invalid STATUS_VAR);
5843     } else {
5844         res = v;
5845     }
5846     return res;
5847 }
5848
5849 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
5850 {
5851     int64_t v;
5852     unsigned int res;
5853
5854     v = float64_to_int64(a STATUS_VAR);
5855     if (v < 0) {
5856         res = 0;
5857         float_raise( float_flag_invalid STATUS_VAR);
5858     } else if (v > 0xffffffff) {
5859         res = 0xffffffff;
5860         float_raise( float_flag_invalid STATUS_VAR);
5861     } else {
5862         res = v;
5863     }
5864     return res;
5865 }
5866
5867 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
5868 {
5869     int64_t v;
5870     unsigned int res;
5871
5872     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5873     if (v < 0) {
5874         res = 0;
5875         float_raise( float_flag_invalid STATUS_VAR);
5876     } else if (v > 0xffffffff) {
5877         res = 0xffffffff;
5878         float_raise( float_flag_invalid STATUS_VAR);
5879     } else {
5880         res = v;
5881     }
5882     return res;
5883 }
5884
5885 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM )
5886 {
5887     int64_t v;
5888     unsigned int res;
5889
5890     v = float64_to_int64_round_to_zero(a STATUS_VAR);
5891     if (v < 0) {
5892         res = 0;
5893         float_raise( float_flag_invalid STATUS_VAR);
5894     } else if (v > 0xffff) {
5895         res = 0xffff;
5896         float_raise( float_flag_invalid STATUS_VAR);
5897     } else {
5898         res = v;
5899     }
5900     return res;
5901 }
5902
5903 /* FIXME: This looks broken.  */
5904 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
5905 {
5906     int64_t v;
5907
5908     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5909     v += float64_val(a);
5910     v = float64_to_int64(make_float64(v) STATUS_VAR);
5911
5912     return v - INT64_MIN;
5913 }
5914
5915 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
5916 {
5917     int64_t v;
5918
5919     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
5920     v += float64_val(a);
5921     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
5922
5923     return v - INT64_MIN;
5924 }
5925
5926 #define COMPARE(s, nan_exp)                                                  \
5927 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
5928                                       int is_quiet STATUS_PARAM )            \
5929 {                                                                            \
5930     flag aSign, bSign;                                                       \
5931     bits ## s av, bv;                                                        \
5932     a = float ## s ## _squash_input_denormal(a STATUS_VAR);                  \
5933     b = float ## s ## _squash_input_denormal(b STATUS_VAR);                  \
5934                                                                              \
5935     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
5936          extractFloat ## s ## Frac( a ) ) ||                                 \
5937         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
5938           extractFloat ## s ## Frac( b ) )) {                                \
5939         if (!is_quiet ||                                                     \
5940             float ## s ## _is_signaling_nan( a ) ||                          \
5941             float ## s ## _is_signaling_nan( b ) ) {                         \
5942             float_raise( float_flag_invalid STATUS_VAR);                     \
5943         }                                                                    \
5944         return float_relation_unordered;                                     \
5945     }                                                                        \
5946     aSign = extractFloat ## s ## Sign( a );                                  \
5947     bSign = extractFloat ## s ## Sign( b );                                  \
5948     av = float ## s ## _val(a);                                              \
5949     bv = float ## s ## _val(b);                                              \
5950     if ( aSign != bSign ) {                                                  \
5951         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
5952             /* zero case */                                                  \
5953             return float_relation_equal;                                     \
5954         } else {                                                             \
5955             return 1 - (2 * aSign);                                          \
5956         }                                                                    \
5957     } else {                                                                 \
5958         if (av == bv) {                                                      \
5959             return float_relation_equal;                                     \
5960         } else {                                                             \
5961             return 1 - 2 * (aSign ^ ( av < bv ));                            \
5962         }                                                                    \
5963     }                                                                        \
5964 }                                                                            \
5965                                                                              \
5966 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
5967 {                                                                            \
5968     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
5969 }                                                                            \
5970                                                                              \
5971 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
5972 {                                                                            \
5973     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
5974 }
5975
5976 COMPARE(32, 0xff)
5977 COMPARE(64, 0x7ff)
5978
5979 INLINE int float128_compare_internal( float128 a, float128 b,
5980                                       int is_quiet STATUS_PARAM )
5981 {
5982     flag aSign, bSign;
5983
5984     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
5985           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
5986         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
5987           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
5988         if (!is_quiet ||
5989             float128_is_signaling_nan( a ) ||
5990             float128_is_signaling_nan( b ) ) {
5991             float_raise( float_flag_invalid STATUS_VAR);
5992         }
5993         return float_relation_unordered;
5994     }
5995     aSign = extractFloat128Sign( a );
5996     bSign = extractFloat128Sign( b );
5997     if ( aSign != bSign ) {
5998         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
5999             /* zero case */
6000             return float_relation_equal;
6001         } else {
6002             return 1 - (2 * aSign);
6003         }
6004     } else {
6005         if (a.low == b.low && a.high == b.high) {
6006             return float_relation_equal;
6007         } else {
6008             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
6009         }
6010     }
6011 }
6012
6013 int float128_compare( float128 a, float128 b STATUS_PARAM )
6014 {
6015     return float128_compare_internal(a, b, 0 STATUS_VAR);
6016 }
6017
6018 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
6019 {
6020     return float128_compare_internal(a, b, 1 STATUS_VAR);
6021 }
6022
6023 /* Multiply A by 2 raised to the power N.  */
6024 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
6025 {
6026     flag aSign;
6027     int16 aExp;
6028     bits32 aSig;
6029
6030     a = float32_squash_input_denormal(a STATUS_VAR);
6031     aSig = extractFloat32Frac( a );
6032     aExp = extractFloat32Exp( a );
6033     aSign = extractFloat32Sign( a );
6034
6035     if ( aExp == 0xFF ) {
6036         return a;
6037     }
6038     if ( aExp != 0 )
6039         aSig |= 0x00800000;
6040     else if ( aSig == 0 )
6041         return a;
6042
6043     aExp += n - 1;
6044     aSig <<= 7;
6045     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
6046 }
6047
6048 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
6049 {
6050     flag aSign;
6051     int16 aExp;
6052     bits64 aSig;
6053
6054     a = float64_squash_input_denormal(a STATUS_VAR);
6055     aSig = extractFloat64Frac( a );
6056     aExp = extractFloat64Exp( a );
6057     aSign = extractFloat64Sign( a );
6058
6059     if ( aExp == 0x7FF ) {
6060         return a;
6061     }
6062     if ( aExp != 0 )
6063         aSig |= LIT64( 0x0010000000000000 );
6064     else if ( aSig == 0 )
6065         return a;
6066
6067     aExp += n - 1;
6068     aSig <<= 10;
6069     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
6070 }
6071
6072 #ifdef FLOATX80
6073 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
6074 {
6075     flag aSign;
6076     int16 aExp;
6077     bits64 aSig;
6078
6079     aSig = extractFloatx80Frac( a );
6080     aExp = extractFloatx80Exp( a );
6081     aSign = extractFloatx80Sign( a );
6082
6083     if ( aExp == 0x7FF ) {
6084         return a;
6085     }
6086     if (aExp == 0 && aSig == 0)
6087         return a;
6088
6089     aExp += n;
6090     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
6091                                           aSign, aExp, aSig, 0 STATUS_VAR );
6092 }
6093 #endif
6094
6095 #ifdef FLOAT128
6096 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
6097 {
6098     flag aSign;
6099     int32 aExp;
6100     bits64 aSig0, aSig1;
6101
6102     aSig1 = extractFloat128Frac1( a );
6103     aSig0 = extractFloat128Frac0( a );
6104     aExp = extractFloat128Exp( a );
6105     aSign = extractFloat128Sign( a );
6106     if ( aExp == 0x7FFF ) {
6107         return a;
6108     }
6109     if ( aExp != 0 )
6110         aSig0 |= LIT64( 0x0001000000000000 );
6111     else if ( aSig0 == 0 && aSig1 == 0 )
6112         return a;
6113
6114     aExp += n - 1;
6115     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
6116                                           STATUS_VAR );
6117
6118 }
6119 #endif