2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM
)
54 STATUS(float_rounding_mode
) = val
;
57 void set_float_exception_flags(int val STATUS_PARAM
)
59 STATUS(float_exception_flags
) = val
;
63 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
65 STATUS(floatx80_rounding_precision
) = val
;
69 /*----------------------------------------------------------------------------
70 | Returns the fraction bits of the half-precision floating-point value `a'.
71 *----------------------------------------------------------------------------*/
73 INLINE
uint32_t extractFloat16Frac(float16 a
)
75 return float16_val(a
) & 0x3ff;
78 /*----------------------------------------------------------------------------
79 | Returns the exponent bits of the half-precision floating-point value `a'.
80 *----------------------------------------------------------------------------*/
82 INLINE int16
extractFloat16Exp(float16 a
)
84 return (float16_val(a
) >> 10) & 0x1f;
87 /*----------------------------------------------------------------------------
88 | Returns the sign bit of the single-precision floating-point value `a'.
89 *----------------------------------------------------------------------------*/
91 INLINE flag
extractFloat16Sign(float16 a
)
93 return float16_val(a
)>>15;
96 /*----------------------------------------------------------------------------
97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
98 | and 7, and returns the properly rounded 32-bit integer corresponding to the
99 | input. If `zSign' is 1, the input is negated before being converted to an
100 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
101 | is simply rounded to an integer, with the inexact exception raised if the
102 | input cannot be represented exactly as an integer. However, if the fixed-
103 | point input is too large, the invalid exception is raised and the largest
104 | positive or negative integer is returned.
105 *----------------------------------------------------------------------------*/
107 static int32
roundAndPackInt32( flag zSign
, bits64 absZ STATUS_PARAM
)
110 flag roundNearestEven
;
111 int8 roundIncrement
, roundBits
;
114 roundingMode
= STATUS(float_rounding_mode
);
115 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
116 roundIncrement
= 0x40;
117 if ( ! roundNearestEven
) {
118 if ( roundingMode
== float_round_to_zero
) {
122 roundIncrement
= 0x7F;
124 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
127 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
131 roundBits
= absZ
& 0x7F;
132 absZ
= ( absZ
+ roundIncrement
)>>7;
133 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
135 if ( zSign
) z
= - z
;
136 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
137 float_raise( float_flag_invalid STATUS_VAR
);
138 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
140 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
145 /*----------------------------------------------------------------------------
146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
147 | `absZ1', with binary point between bits 63 and 64 (between the input words),
148 | and returns the properly rounded 64-bit integer corresponding to the input.
149 | If `zSign' is 1, the input is negated before being converted to an integer.
150 | Ordinarily, the fixed-point input is simply rounded to an integer, with
151 | the inexact exception raised if the input cannot be represented exactly as
152 | an integer. However, if the fixed-point input is too large, the invalid
153 | exception is raised and the largest positive or negative integer is
155 *----------------------------------------------------------------------------*/
157 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1 STATUS_PARAM
)
160 flag roundNearestEven
, increment
;
163 roundingMode
= STATUS(float_rounding_mode
);
164 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
165 increment
= ( (sbits64
) absZ1
< 0 );
166 if ( ! roundNearestEven
) {
167 if ( roundingMode
== float_round_to_zero
) {
172 increment
= ( roundingMode
== float_round_down
) && absZ1
;
175 increment
= ( roundingMode
== float_round_up
) && absZ1
;
181 if ( absZ0
== 0 ) goto overflow
;
182 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
185 if ( zSign
) z
= - z
;
186 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
188 float_raise( float_flag_invalid STATUS_VAR
);
190 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
191 : LIT64( 0x7FFFFFFFFFFFFFFF );
193 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
198 /*----------------------------------------------------------------------------
199 | Returns the fraction bits of the single-precision floating-point value `a'.
200 *----------------------------------------------------------------------------*/
202 INLINE bits32
extractFloat32Frac( float32 a
)
205 return float32_val(a
) & 0x007FFFFF;
209 /*----------------------------------------------------------------------------
210 | Returns the exponent bits of the single-precision floating-point value `a'.
211 *----------------------------------------------------------------------------*/
213 INLINE int16
extractFloat32Exp( float32 a
)
216 return ( float32_val(a
)>>23 ) & 0xFF;
220 /*----------------------------------------------------------------------------
221 | Returns the sign bit of the single-precision floating-point value `a'.
222 *----------------------------------------------------------------------------*/
224 INLINE flag
extractFloat32Sign( float32 a
)
227 return float32_val(a
)>>31;
231 /*----------------------------------------------------------------------------
232 | If `a' is denormal and we are in flush-to-zero mode then set the
233 | input-denormal exception and return zero. Otherwise just return the value.
234 *----------------------------------------------------------------------------*/
235 static float32
float32_squash_input_denormal(float32 a STATUS_PARAM
)
237 if (STATUS(flush_inputs_to_zero
)) {
238 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
239 float_raise(float_flag_input_denormal STATUS_VAR
);
240 return make_float32(float32_val(a
) & 0x80000000);
246 /*----------------------------------------------------------------------------
247 | Normalizes the subnormal single-precision floating-point value represented
248 | by the denormalized significand `aSig'. The normalized exponent and
249 | significand are stored at the locations pointed to by `zExpPtr' and
250 | `zSigPtr', respectively.
251 *----------------------------------------------------------------------------*/
254 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
258 shiftCount
= countLeadingZeros32( aSig
) - 8;
259 *zSigPtr
= aSig
<<shiftCount
;
260 *zExpPtr
= 1 - shiftCount
;
264 /*----------------------------------------------------------------------------
265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
266 | single-precision floating-point value, returning the result. After being
267 | shifted into the proper positions, the three fields are simply added
268 | together to form the result. This means that any integer portion of `zSig'
269 | will be added into the exponent. Since a properly normalized significand
270 | will have an integer portion equal to 1, the `zExp' input should be 1 less
271 | than the desired result exponent whenever `zSig' is a complete, normalized
273 *----------------------------------------------------------------------------*/
275 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
279 ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
);
283 /*----------------------------------------------------------------------------
284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
285 | and significand `zSig', and returns the proper single-precision floating-
286 | point value corresponding to the abstract input. Ordinarily, the abstract
287 | value is simply rounded and packed into the single-precision format, with
288 | the inexact exception raised if the abstract input cannot be represented
289 | exactly. However, if the abstract value is too large, the overflow and
290 | inexact exceptions are raised and an infinity or maximal finite value is
291 | returned. If the abstract value is too small, the input value is rounded to
292 | a subnormal number, and the underflow and inexact exceptions are raised if
293 | the abstract input cannot be represented exactly as a subnormal single-
294 | precision floating-point number.
295 | The input significand `zSig' has its binary point between bits 30
296 | and 29, which is 7 bits to the left of the usual location. This shifted
297 | significand must be normalized or smaller. If `zSig' is not normalized,
298 | `zExp' must be 0; in that case, the result returned is a subnormal number,
299 | and it must not require rounding. In the usual case that `zSig' is
300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
301 | The handling of underflow and overflow follows the IEC/IEEE Standard for
302 | Binary Floating-Point Arithmetic.
303 *----------------------------------------------------------------------------*/
305 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
308 flag roundNearestEven
;
309 int8 roundIncrement
, roundBits
;
312 roundingMode
= STATUS(float_rounding_mode
);
313 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
314 roundIncrement
= 0x40;
315 if ( ! roundNearestEven
) {
316 if ( roundingMode
== float_round_to_zero
) {
320 roundIncrement
= 0x7F;
322 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
325 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
329 roundBits
= zSig
& 0x7F;
330 if ( 0xFD <= (bits16
) zExp
) {
332 || ( ( zExp
== 0xFD )
333 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
335 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
336 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
339 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
341 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
343 || ( zSig
+ roundIncrement
< 0x80000000 );
344 shift32RightJamming( zSig
, - zExp
, &zSig
);
346 roundBits
= zSig
& 0x7F;
347 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
350 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
351 zSig
= ( zSig
+ roundIncrement
)>>7;
352 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
353 if ( zSig
== 0 ) zExp
= 0;
354 return packFloat32( zSign
, zExp
, zSig
);
358 /*----------------------------------------------------------------------------
359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360 | and significand `zSig', and returns the proper single-precision floating-
361 | point value corresponding to the abstract input. This routine is just like
362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
364 | floating-point exponent.
365 *----------------------------------------------------------------------------*/
368 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
372 shiftCount
= countLeadingZeros32( zSig
) - 1;
373 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
377 /*----------------------------------------------------------------------------
378 | Returns the fraction bits of the double-precision floating-point value `a'.
379 *----------------------------------------------------------------------------*/
381 INLINE bits64
extractFloat64Frac( float64 a
)
384 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
388 /*----------------------------------------------------------------------------
389 | Returns the exponent bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
392 INLINE int16
extractFloat64Exp( float64 a
)
395 return ( float64_val(a
)>>52 ) & 0x7FF;
399 /*----------------------------------------------------------------------------
400 | Returns the sign bit of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
403 INLINE flag
extractFloat64Sign( float64 a
)
406 return float64_val(a
)>>63;
410 /*----------------------------------------------------------------------------
411 | If `a' is denormal and we are in flush-to-zero mode then set the
412 | input-denormal exception and return zero. Otherwise just return the value.
413 *----------------------------------------------------------------------------*/
414 static float64
float64_squash_input_denormal(float64 a STATUS_PARAM
)
416 if (STATUS(flush_inputs_to_zero
)) {
417 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
418 float_raise(float_flag_input_denormal STATUS_VAR
);
419 return make_float64(float64_val(a
) & (1ULL << 63));
425 /*----------------------------------------------------------------------------
426 | Normalizes the subnormal double-precision floating-point value represented
427 | by the denormalized significand `aSig'. The normalized exponent and
428 | significand are stored at the locations pointed to by `zExpPtr' and
429 | `zSigPtr', respectively.
430 *----------------------------------------------------------------------------*/
433 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
437 shiftCount
= countLeadingZeros64( aSig
) - 11;
438 *zSigPtr
= aSig
<<shiftCount
;
439 *zExpPtr
= 1 - shiftCount
;
443 /*----------------------------------------------------------------------------
444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445 | double-precision floating-point value, returning the result. After being
446 | shifted into the proper positions, the three fields are simply added
447 | together to form the result. This means that any integer portion of `zSig'
448 | will be added into the exponent. Since a properly normalized significand
449 | will have an integer portion equal to 1, the `zExp' input should be 1 less
450 | than the desired result exponent whenever `zSig' is a complete, normalized
452 *----------------------------------------------------------------------------*/
454 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
458 ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
);
462 /*----------------------------------------------------------------------------
463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
464 | and significand `zSig', and returns the proper double-precision floating-
465 | point value corresponding to the abstract input. Ordinarily, the abstract
466 | value is simply rounded and packed into the double-precision format, with
467 | the inexact exception raised if the abstract input cannot be represented
468 | exactly. However, if the abstract value is too large, the overflow and
469 | inexact exceptions are raised and an infinity or maximal finite value is
470 | returned. If the abstract value is too small, the input value is rounded
471 | to a subnormal number, and the underflow and inexact exceptions are raised
472 | if the abstract input cannot be represented exactly as a subnormal double-
473 | precision floating-point number.
474 | The input significand `zSig' has its binary point between bits 62
475 | and 61, which is 10 bits to the left of the usual location. This shifted
476 | significand must be normalized or smaller. If `zSig' is not normalized,
477 | `zExp' must be 0; in that case, the result returned is a subnormal number,
478 | and it must not require rounding. In the usual case that `zSig' is
479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
480 | The handling of underflow and overflow follows the IEC/IEEE Standard for
481 | Binary Floating-Point Arithmetic.
482 *----------------------------------------------------------------------------*/
484 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
487 flag roundNearestEven
;
488 int16 roundIncrement
, roundBits
;
491 roundingMode
= STATUS(float_rounding_mode
);
492 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
493 roundIncrement
= 0x200;
494 if ( ! roundNearestEven
) {
495 if ( roundingMode
== float_round_to_zero
) {
499 roundIncrement
= 0x3FF;
501 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
504 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
508 roundBits
= zSig
& 0x3FF;
509 if ( 0x7FD <= (bits16
) zExp
) {
510 if ( ( 0x7FD < zExp
)
511 || ( ( zExp
== 0x7FD )
512 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
514 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
515 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
518 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
520 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
522 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
523 shift64RightJamming( zSig
, - zExp
, &zSig
);
525 roundBits
= zSig
& 0x3FF;
526 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
529 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
530 zSig
= ( zSig
+ roundIncrement
)>>10;
531 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
532 if ( zSig
== 0 ) zExp
= 0;
533 return packFloat64( zSign
, zExp
, zSig
);
537 /*----------------------------------------------------------------------------
538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 | and significand `zSig', and returns the proper double-precision floating-
540 | point value corresponding to the abstract input. This routine is just like
541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 | floating-point exponent.
544 *----------------------------------------------------------------------------*/
547 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
551 shiftCount
= countLeadingZeros64( zSig
) - 1;
552 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
558 /*----------------------------------------------------------------------------
559 | Returns the fraction bits of the extended double-precision floating-point
561 *----------------------------------------------------------------------------*/
563 INLINE bits64
extractFloatx80Frac( floatx80 a
)
570 /*----------------------------------------------------------------------------
571 | Returns the exponent bits of the extended double-precision floating-point
573 *----------------------------------------------------------------------------*/
575 INLINE int32
extractFloatx80Exp( floatx80 a
)
578 return a
.high
& 0x7FFF;
582 /*----------------------------------------------------------------------------
583 | Returns the sign bit of the extended double-precision floating-point value
585 *----------------------------------------------------------------------------*/
587 INLINE flag
extractFloatx80Sign( floatx80 a
)
594 /*----------------------------------------------------------------------------
595 | Normalizes the subnormal extended double-precision floating-point value
596 | represented by the denormalized significand `aSig'. The normalized exponent
597 | and significand are stored at the locations pointed to by `zExpPtr' and
598 | `zSigPtr', respectively.
599 *----------------------------------------------------------------------------*/
602 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
606 shiftCount
= countLeadingZeros64( aSig
);
607 *zSigPtr
= aSig
<<shiftCount
;
608 *zExpPtr
= 1 - shiftCount
;
612 /*----------------------------------------------------------------------------
613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
614 | extended double-precision floating-point value, returning the result.
615 *----------------------------------------------------------------------------*/
617 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
622 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
627 /*----------------------------------------------------------------------------
628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 | and returns the proper extended double-precision floating-point value
631 | corresponding to the abstract input. Ordinarily, the abstract value is
632 | rounded and packed into the extended double-precision format, with the
633 | inexact exception raised if the abstract input cannot be represented
634 | exactly. However, if the abstract value is too large, the overflow and
635 | inexact exceptions are raised and an infinity or maximal finite value is
636 | returned. If the abstract value is too small, the input value is rounded to
637 | a subnormal number, and the underflow and inexact exceptions are raised if
638 | the abstract input cannot be represented exactly as a subnormal extended
639 | double-precision floating-point number.
640 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 | number of bits as single or double precision, respectively. Otherwise, the
642 | result is rounded to the full precision of the extended double-precision
644 | The input significand must be normalized or smaller. If the input
645 | significand is not normalized, `zExp' must be 0; in that case, the result
646 | returned is a subnormal number, and it must not require rounding. The
647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 | Floating-Point Arithmetic.
649 *----------------------------------------------------------------------------*/
652 roundAndPackFloatx80(
653 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
657 flag roundNearestEven
, increment
, isTiny
;
658 int64 roundIncrement
, roundMask
, roundBits
;
660 roundingMode
= STATUS(float_rounding_mode
);
661 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
662 if ( roundingPrecision
== 80 ) goto precision80
;
663 if ( roundingPrecision
== 64 ) {
664 roundIncrement
= LIT64( 0x0000000000000400 );
665 roundMask
= LIT64( 0x00000000000007FF );
667 else if ( roundingPrecision
== 32 ) {
668 roundIncrement
= LIT64( 0x0000008000000000 );
669 roundMask
= LIT64( 0x000000FFFFFFFFFF );
674 zSig0
|= ( zSig1
!= 0 );
675 if ( ! roundNearestEven
) {
676 if ( roundingMode
== float_round_to_zero
) {
680 roundIncrement
= roundMask
;
682 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
685 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
689 roundBits
= zSig0
& roundMask
;
690 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
691 if ( ( 0x7FFE < zExp
)
692 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
697 if ( STATUS(flush_to_zero
) ) return packFloatx80( zSign
, 0, 0 );
699 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
701 || ( zSig0
<= zSig0
+ roundIncrement
);
702 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
704 roundBits
= zSig0
& roundMask
;
705 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
706 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
707 zSig0
+= roundIncrement
;
708 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
709 roundIncrement
= roundMask
+ 1;
710 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
711 roundMask
|= roundIncrement
;
713 zSig0
&= ~ roundMask
;
714 return packFloatx80( zSign
, zExp
, zSig0
);
717 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
718 zSig0
+= roundIncrement
;
719 if ( zSig0
< roundIncrement
) {
721 zSig0
= LIT64( 0x8000000000000000 );
723 roundIncrement
= roundMask
+ 1;
724 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
725 roundMask
|= roundIncrement
;
727 zSig0
&= ~ roundMask
;
728 if ( zSig0
== 0 ) zExp
= 0;
729 return packFloatx80( zSign
, zExp
, zSig0
);
731 increment
= ( (sbits64
) zSig1
< 0 );
732 if ( ! roundNearestEven
) {
733 if ( roundingMode
== float_round_to_zero
) {
738 increment
= ( roundingMode
== float_round_down
) && zSig1
;
741 increment
= ( roundingMode
== float_round_up
) && zSig1
;
745 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
746 if ( ( 0x7FFE < zExp
)
747 || ( ( zExp
== 0x7FFE )
748 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
754 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
755 if ( ( roundingMode
== float_round_to_zero
)
756 || ( zSign
&& ( roundingMode
== float_round_up
) )
757 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
759 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
761 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
765 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
768 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
769 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
771 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
772 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
773 if ( roundNearestEven
) {
774 increment
= ( (sbits64
) zSig1
< 0 );
778 increment
= ( roundingMode
== float_round_down
) && zSig1
;
781 increment
= ( roundingMode
== float_round_up
) && zSig1
;
787 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
788 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
790 return packFloatx80( zSign
, zExp
, zSig0
);
793 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
798 zSig0
= LIT64( 0x8000000000000000 );
801 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
805 if ( zSig0
== 0 ) zExp
= 0;
807 return packFloatx80( zSign
, zExp
, zSig0
);
811 /*----------------------------------------------------------------------------
812 | Takes an abstract floating-point value having sign `zSign', exponent
813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 | and returns the proper extended double-precision floating-point value
815 | corresponding to the abstract input. This routine is just like
816 | `roundAndPackFloatx80' except that the input significand does not have to be
818 *----------------------------------------------------------------------------*/
821 normalizeRoundAndPackFloatx80(
822 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
832 shiftCount
= countLeadingZeros64( zSig0
);
833 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
836 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
844 /*----------------------------------------------------------------------------
845 | Returns the least-significant 64 fraction bits of the quadruple-precision
846 | floating-point value `a'.
847 *----------------------------------------------------------------------------*/
849 INLINE bits64
extractFloat128Frac1( float128 a
)
856 /*----------------------------------------------------------------------------
857 | Returns the most-significant 48 fraction bits of the quadruple-precision
858 | floating-point value `a'.
859 *----------------------------------------------------------------------------*/
861 INLINE bits64
extractFloat128Frac0( float128 a
)
864 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
868 /*----------------------------------------------------------------------------
869 | Returns the exponent bits of the quadruple-precision floating-point value
871 *----------------------------------------------------------------------------*/
873 INLINE int32
extractFloat128Exp( float128 a
)
876 return ( a
.high
>>48 ) & 0x7FFF;
880 /*----------------------------------------------------------------------------
881 | Returns the sign bit of the quadruple-precision floating-point value `a'.
882 *----------------------------------------------------------------------------*/
884 INLINE flag
extractFloat128Sign( float128 a
)
891 /*----------------------------------------------------------------------------
892 | Normalizes the subnormal quadruple-precision floating-point value
893 | represented by the denormalized significand formed by the concatenation of
894 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
895 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896 | significand are stored at the location pointed to by `zSig0Ptr', and the
897 | least significant 64 bits of the normalized significand are stored at the
898 | location pointed to by `zSig1Ptr'.
899 *----------------------------------------------------------------------------*/
902 normalizeFloat128Subnormal(
913 shiftCount
= countLeadingZeros64( aSig1
) - 15;
914 if ( shiftCount
< 0 ) {
915 *zSig0Ptr
= aSig1
>>( - shiftCount
);
916 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
919 *zSig0Ptr
= aSig1
<<shiftCount
;
922 *zExpPtr
= - shiftCount
- 63;
925 shiftCount
= countLeadingZeros64( aSig0
) - 15;
926 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
927 *zExpPtr
= 1 - shiftCount
;
932 /*----------------------------------------------------------------------------
933 | Packs the sign `zSign', the exponent `zExp', and the significand formed
934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
935 | floating-point value, returning the result. After being shifted into the
936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
937 | added together to form the most significant 32 bits of the result. This
938 | means that any integer portion of `zSig0' will be added into the exponent.
939 | Since a properly normalized significand will have an integer portion equal
940 | to 1, the `zExp' input should be 1 less than the desired result exponent
941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
943 *----------------------------------------------------------------------------*/
946 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
951 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
956 /*----------------------------------------------------------------------------
957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
958 | and extended significand formed by the concatenation of `zSig0', `zSig1',
959 | and `zSig2', and returns the proper quadruple-precision floating-point value
960 | corresponding to the abstract input. Ordinarily, the abstract value is
961 | simply rounded and packed into the quadruple-precision format, with the
962 | inexact exception raised if the abstract input cannot be represented
963 | exactly. However, if the abstract value is too large, the overflow and
964 | inexact exceptions are raised and an infinity or maximal finite value is
965 | returned. If the abstract value is too small, the input value is rounded to
966 | a subnormal number, and the underflow and inexact exceptions are raised if
967 | the abstract input cannot be represented exactly as a subnormal quadruple-
968 | precision floating-point number.
969 | The input significand must be normalized or smaller. If the input
970 | significand is not normalized, `zExp' must be 0; in that case, the result
971 | returned is a subnormal number, and it must not require rounding. In the
972 | usual case that the input significand is normalized, `zExp' must be 1 less
973 | than the ``true'' floating-point exponent. The handling of underflow and
974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
975 *----------------------------------------------------------------------------*/
978 roundAndPackFloat128(
979 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2 STATUS_PARAM
)
982 flag roundNearestEven
, increment
, isTiny
;
984 roundingMode
= STATUS(float_rounding_mode
);
985 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
986 increment
= ( (sbits64
) zSig2
< 0 );
987 if ( ! roundNearestEven
) {
988 if ( roundingMode
== float_round_to_zero
) {
993 increment
= ( roundingMode
== float_round_down
) && zSig2
;
996 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1000 if ( 0x7FFD <= (bits32
) zExp
) {
1001 if ( ( 0x7FFD < zExp
)
1002 || ( ( zExp
== 0x7FFD )
1004 LIT64( 0x0001FFFFFFFFFFFF ),
1005 LIT64( 0xFFFFFFFFFFFFFFFF ),
1012 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
1013 if ( ( roundingMode
== float_round_to_zero
)
1014 || ( zSign
&& ( roundingMode
== float_round_up
) )
1015 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1021 LIT64( 0x0000FFFFFFFFFFFF ),
1022 LIT64( 0xFFFFFFFFFFFFFFFF )
1025 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1028 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
1030 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
1036 LIT64( 0x0001FFFFFFFFFFFF ),
1037 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 shift128ExtraRightJamming(
1040 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1042 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
1043 if ( roundNearestEven
) {
1044 increment
= ( (sbits64
) zSig2
< 0 );
1048 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1051 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1056 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1058 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1059 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1062 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1064 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1068 /*----------------------------------------------------------------------------
1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1071 | returns the proper quadruple-precision floating-point value corresponding
1072 | to the abstract input. This routine is just like `roundAndPackFloat128'
1073 | except that the input significand has fewer bits and does not have to be
1074 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1076 *----------------------------------------------------------------------------*/
1079 normalizeRoundAndPackFloat128(
1080 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1 STATUS_PARAM
)
1090 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1091 if ( 0 <= shiftCount
) {
1093 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1096 shift128ExtraRightJamming(
1097 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1100 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a'
1108 | to the single-precision floating-point format. The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1112 float32
int32_to_float32( int32 a STATUS_PARAM
)
1116 if ( a
== 0 ) return float32_zero
;
1117 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1119 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the double-precision floating-point format. The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 float64
int32_to_float64( int32 a STATUS_PARAM
)
1136 if ( a
== 0 ) return float64_zero
;
1138 absA
= zSign
? - a
: a
;
1139 shiftCount
= countLeadingZeros32( absA
) + 21;
1141 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a'
1149 | to the extended double-precision floating-point format. The conversion
1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1152 *----------------------------------------------------------------------------*/
1154 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1161 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1163 absA
= zSign
? - a
: a
;
1164 shiftCount
= countLeadingZeros32( absA
) + 32;
1166 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1174 /*----------------------------------------------------------------------------
1175 | Returns the result of converting the 32-bit two's complement integer `a' to
1176 | the quadruple-precision floating-point format. The conversion is performed
1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178 *----------------------------------------------------------------------------*/
1180 float128
int32_to_float128( int32 a STATUS_PARAM
)
1187 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1189 absA
= zSign
? - a
: a
;
1190 shiftCount
= countLeadingZeros32( absA
) + 17;
1192 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1198 /*----------------------------------------------------------------------------
1199 | Returns the result of converting the 64-bit two's complement integer `a'
1200 | to the single-precision floating-point format. The conversion is performed
1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202 *----------------------------------------------------------------------------*/
1204 float32
int64_to_float32( int64 a STATUS_PARAM
)
1210 if ( a
== 0 ) return float32_zero
;
1212 absA
= zSign
? - a
: a
;
1213 shiftCount
= countLeadingZeros64( absA
) - 40;
1214 if ( 0 <= shiftCount
) {
1215 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1219 if ( shiftCount
< 0 ) {
1220 shift64RightJamming( absA
, - shiftCount
, &absA
);
1223 absA
<<= shiftCount
;
1225 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1230 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1234 if ( a
== 0 ) return float32_zero
;
1235 shiftCount
= countLeadingZeros64( a
) - 40;
1236 if ( 0 <= shiftCount
) {
1237 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1241 if ( shiftCount
< 0 ) {
1242 shift64RightJamming( a
, - shiftCount
, &a
);
1247 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1251 /*----------------------------------------------------------------------------
1252 | Returns the result of converting the 64-bit two's complement integer `a'
1253 | to the double-precision floating-point format. The conversion is performed
1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 *----------------------------------------------------------------------------*/
1257 float64
int64_to_float64( int64 a STATUS_PARAM
)
1261 if ( a
== 0 ) return float64_zero
;
1262 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1263 return packFloat64( 1, 0x43E, 0 );
1266 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1270 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1272 if ( a
== 0 ) return float64_zero
;
1273 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1279 /*----------------------------------------------------------------------------
1280 | Returns the result of converting the 64-bit two's complement integer `a'
1281 | to the extended double-precision floating-point format. The conversion
1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1284 *----------------------------------------------------------------------------*/
1286 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1292 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1294 absA
= zSign
? - a
: a
;
1295 shiftCount
= countLeadingZeros64( absA
);
1296 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1304 /*----------------------------------------------------------------------------
1305 | Returns the result of converting the 64-bit two's complement integer `a' to
1306 | the quadruple-precision floating-point format. The conversion is performed
1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308 *----------------------------------------------------------------------------*/
1310 float128
int64_to_float128( int64 a STATUS_PARAM
)
1316 bits64 zSig0
, zSig1
;
1318 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1320 absA
= zSign
? - a
: a
;
1321 shiftCount
= countLeadingZeros64( absA
) + 49;
1322 zExp
= 0x406E - shiftCount
;
1323 if ( 64 <= shiftCount
) {
1332 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1333 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format. The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic---which means in particular that the conversion is rounded
1344 | according to the current rounding mode. If `a' is a NaN, the largest
1345 | positive integer is returned. Otherwise, if the conversion overflows, the
1346 | largest integer with the same sign as `a' is returned.
1347 *----------------------------------------------------------------------------*/
1349 int32
float32_to_int32( float32 a STATUS_PARAM
)
1352 int16 aExp
, shiftCount
;
1356 a
= float32_squash_input_denormal(a STATUS_VAR
);
1357 aSig
= extractFloat32Frac( a
);
1358 aExp
= extractFloat32Exp( a
);
1359 aSign
= extractFloat32Sign( a
);
1360 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1361 if ( aExp
) aSig
|= 0x00800000;
1362 shiftCount
= 0xAF - aExp
;
1365 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1366 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the single-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format. The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic, except that the conversion is always rounded toward zero.
1375 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1376 | the conversion overflows, the largest integer with the same sign as `a' is
1378 *----------------------------------------------------------------------------*/
1380 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1383 int16 aExp
, shiftCount
;
1386 a
= float32_squash_input_denormal(a STATUS_VAR
);
1388 aSig
= extractFloat32Frac( a
);
1389 aExp
= extractFloat32Exp( a
);
1390 aSign
= extractFloat32Sign( a
);
1391 shiftCount
= aExp
- 0x9E;
1392 if ( 0 <= shiftCount
) {
1393 if ( float32_val(a
) != 0xCF000000 ) {
1394 float_raise( float_flag_invalid STATUS_VAR
);
1395 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1397 return (sbits32
) 0x80000000;
1399 else if ( aExp
<= 0x7E ) {
1400 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1403 aSig
= ( aSig
| 0x00800000 )<<8;
1404 z
= aSig
>>( - shiftCount
);
1405 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1406 STATUS(float_exception_flags
) |= float_flag_inexact
;
1408 if ( aSign
) z
= - z
;
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the single-precision floating-point value
1415 | `a' to the 16-bit two's complement integer format. The conversion is
1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic, except that the conversion is always rounded toward zero.
1418 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1419 | the conversion overflows, the largest integer with the same sign as `a' is
1421 *----------------------------------------------------------------------------*/
1423 int16
float32_to_int16_round_to_zero( float32 a STATUS_PARAM
)
1426 int16 aExp
, shiftCount
;
1430 aSig
= extractFloat32Frac( a
);
1431 aExp
= extractFloat32Exp( a
);
1432 aSign
= extractFloat32Sign( a
);
1433 shiftCount
= aExp
- 0x8E;
1434 if ( 0 <= shiftCount
) {
1435 if ( float32_val(a
) != 0xC7000000 ) {
1436 float_raise( float_flag_invalid STATUS_VAR
);
1437 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1441 return (sbits32
) 0xffff8000;
1443 else if ( aExp
<= 0x7E ) {
1444 if ( aExp
| aSig
) {
1445 STATUS(float_exception_flags
) |= float_flag_inexact
;
1450 aSig
= ( aSig
| 0x00800000 )<<8;
1451 z
= aSig
>>( - shiftCount
);
1452 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1453 STATUS(float_exception_flags
) |= float_flag_inexact
;
1462 /*----------------------------------------------------------------------------
1463 | Returns the result of converting the single-precision floating-point value
1464 | `a' to the 64-bit two's complement integer format. The conversion is
1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 | Arithmetic---which means in particular that the conversion is rounded
1467 | according to the current rounding mode. If `a' is a NaN, the largest
1468 | positive integer is returned. Otherwise, if the conversion overflows, the
1469 | largest integer with the same sign as `a' is returned.
1470 *----------------------------------------------------------------------------*/
1472 int64
float32_to_int64( float32 a STATUS_PARAM
)
1475 int16 aExp
, shiftCount
;
1477 bits64 aSig64
, aSigExtra
;
1478 a
= float32_squash_input_denormal(a STATUS_VAR
);
1480 aSig
= extractFloat32Frac( a
);
1481 aExp
= extractFloat32Exp( a
);
1482 aSign
= extractFloat32Sign( a
);
1483 shiftCount
= 0xBE - aExp
;
1484 if ( shiftCount
< 0 ) {
1485 float_raise( float_flag_invalid STATUS_VAR
);
1486 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1487 return LIT64( 0x7FFFFFFFFFFFFFFF );
1489 return (sbits64
) LIT64( 0x8000000000000000 );
1491 if ( aExp
) aSig
|= 0x00800000;
1494 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1495 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1499 /*----------------------------------------------------------------------------
1500 | Returns the result of converting the single-precision floating-point value
1501 | `a' to the 64-bit two's complement integer format. The conversion is
1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 | Arithmetic, except that the conversion is always rounded toward zero. If
1504 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1505 | conversion overflows, the largest integer with the same sign as `a' is
1507 *----------------------------------------------------------------------------*/
1509 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1512 int16 aExp
, shiftCount
;
1516 a
= float32_squash_input_denormal(a STATUS_VAR
);
1518 aSig
= extractFloat32Frac( a
);
1519 aExp
= extractFloat32Exp( a
);
1520 aSign
= extractFloat32Sign( a
);
1521 shiftCount
= aExp
- 0xBE;
1522 if ( 0 <= shiftCount
) {
1523 if ( float32_val(a
) != 0xDF000000 ) {
1524 float_raise( float_flag_invalid STATUS_VAR
);
1525 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1526 return LIT64( 0x7FFFFFFFFFFFFFFF );
1529 return (sbits64
) LIT64( 0x8000000000000000 );
1531 else if ( aExp
<= 0x7E ) {
1532 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1535 aSig64
= aSig
| 0x00800000;
1537 z
= aSig64
>>( - shiftCount
);
1538 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1539 STATUS(float_exception_flags
) |= float_flag_inexact
;
1541 if ( aSign
) z
= - z
;
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the double-precision floating-point format. The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1551 *----------------------------------------------------------------------------*/
1553 float64
float32_to_float64( float32 a STATUS_PARAM
)
1558 a
= float32_squash_input_denormal(a STATUS_VAR
);
1560 aSig
= extractFloat32Frac( a
);
1561 aExp
= extractFloat32Exp( a
);
1562 aSign
= extractFloat32Sign( a
);
1563 if ( aExp
== 0xFF ) {
1564 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1565 return packFloat64( aSign
, 0x7FF, 0 );
1568 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1569 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1572 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1578 /*----------------------------------------------------------------------------
1579 | Returns the result of converting the single-precision floating-point value
1580 | `a' to the extended double-precision floating-point format. The conversion
1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1583 *----------------------------------------------------------------------------*/
1585 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1591 a
= float32_squash_input_denormal(a STATUS_VAR
);
1592 aSig
= extractFloat32Frac( a
);
1593 aExp
= extractFloat32Exp( a
);
1594 aSign
= extractFloat32Sign( a
);
1595 if ( aExp
== 0xFF ) {
1596 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1597 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1600 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1601 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1604 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1612 /*----------------------------------------------------------------------------
1613 | Returns the result of converting the single-precision floating-point value
1614 | `a' to the double-precision floating-point format. The conversion is
1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1617 *----------------------------------------------------------------------------*/
1619 float128
float32_to_float128( float32 a STATUS_PARAM
)
1625 a
= float32_squash_input_denormal(a STATUS_VAR
);
1626 aSig
= extractFloat32Frac( a
);
1627 aExp
= extractFloat32Exp( a
);
1628 aSign
= extractFloat32Sign( a
);
1629 if ( aExp
== 0xFF ) {
1630 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1631 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1634 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1635 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1638 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1644 /*----------------------------------------------------------------------------
1645 | Rounds the single-precision floating-point value `a' to an integer, and
1646 | returns the result as a single-precision floating-point value. The
1647 | operation is performed according to the IEC/IEEE Standard for Binary
1648 | Floating-Point Arithmetic.
1649 *----------------------------------------------------------------------------*/
1651 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1655 bits32 lastBitMask
, roundBitsMask
;
1658 a
= float32_squash_input_denormal(a STATUS_VAR
);
1660 aExp
= extractFloat32Exp( a
);
1661 if ( 0x96 <= aExp
) {
1662 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1663 return propagateFloat32NaN( a
, a STATUS_VAR
);
1667 if ( aExp
<= 0x7E ) {
1668 if ( (bits32
) ( float32_val(a
)<<1 ) == 0 ) return a
;
1669 STATUS(float_exception_flags
) |= float_flag_inexact
;
1670 aSign
= extractFloat32Sign( a
);
1671 switch ( STATUS(float_rounding_mode
) ) {
1672 case float_round_nearest_even
:
1673 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1674 return packFloat32( aSign
, 0x7F, 0 );
1677 case float_round_down
:
1678 return make_float32(aSign
? 0xBF800000 : 0);
1679 case float_round_up
:
1680 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1682 return packFloat32( aSign
, 0, 0 );
1685 lastBitMask
<<= 0x96 - aExp
;
1686 roundBitsMask
= lastBitMask
- 1;
1688 roundingMode
= STATUS(float_rounding_mode
);
1689 if ( roundingMode
== float_round_nearest_even
) {
1690 z
+= lastBitMask
>>1;
1691 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1693 else if ( roundingMode
!= float_round_to_zero
) {
1694 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1698 z
&= ~ roundBitsMask
;
1699 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1700 return make_float32(z
);
1704 /*----------------------------------------------------------------------------
1705 | Returns the result of adding the absolute values of the single-precision
1706 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1707 | before being returned. `zSign' is ignored if the result is a NaN.
1708 | The addition is performed according to the IEC/IEEE Standard for Binary
1709 | Floating-Point Arithmetic.
1710 *----------------------------------------------------------------------------*/
1712 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1714 int16 aExp
, bExp
, zExp
;
1715 bits32 aSig
, bSig
, zSig
;
1718 aSig
= extractFloat32Frac( a
);
1719 aExp
= extractFloat32Exp( a
);
1720 bSig
= extractFloat32Frac( b
);
1721 bExp
= extractFloat32Exp( b
);
1722 expDiff
= aExp
- bExp
;
1725 if ( 0 < expDiff
) {
1726 if ( aExp
== 0xFF ) {
1727 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1736 shift32RightJamming( bSig
, expDiff
, &bSig
);
1739 else if ( expDiff
< 0 ) {
1740 if ( bExp
== 0xFF ) {
1741 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1742 return packFloat32( zSign
, 0xFF, 0 );
1750 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1754 if ( aExp
== 0xFF ) {
1755 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1759 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
1760 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1762 zSig
= 0x40000000 + aSig
+ bSig
;
1767 zSig
= ( aSig
+ bSig
)<<1;
1769 if ( (sbits32
) zSig
< 0 ) {
1774 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1778 /*----------------------------------------------------------------------------
1779 | Returns the result of subtracting the absolute values of the single-
1780 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1781 | difference is negated before being returned. `zSign' is ignored if the
1782 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1783 | Standard for Binary Floating-Point Arithmetic.
1784 *----------------------------------------------------------------------------*/
1786 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1788 int16 aExp
, bExp
, zExp
;
1789 bits32 aSig
, bSig
, zSig
;
1792 aSig
= extractFloat32Frac( a
);
1793 aExp
= extractFloat32Exp( a
);
1794 bSig
= extractFloat32Frac( b
);
1795 bExp
= extractFloat32Exp( b
);
1796 expDiff
= aExp
- bExp
;
1799 if ( 0 < expDiff
) goto aExpBigger
;
1800 if ( expDiff
< 0 ) goto bExpBigger
;
1801 if ( aExp
== 0xFF ) {
1802 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1803 float_raise( float_flag_invalid STATUS_VAR
);
1804 return float32_default_nan
;
1810 if ( bSig
< aSig
) goto aBigger
;
1811 if ( aSig
< bSig
) goto bBigger
;
1812 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1814 if ( bExp
== 0xFF ) {
1815 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1816 return packFloat32( zSign
^ 1, 0xFF, 0 );
1824 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1830 goto normalizeRoundAndPack
;
1832 if ( aExp
== 0xFF ) {
1833 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1842 shift32RightJamming( bSig
, expDiff
, &bSig
);
1847 normalizeRoundAndPack
:
1849 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1853 /*----------------------------------------------------------------------------
1854 | Returns the result of adding the single-precision floating-point values `a'
1855 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1856 | Binary Floating-Point Arithmetic.
1857 *----------------------------------------------------------------------------*/
1859 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1862 a
= float32_squash_input_denormal(a STATUS_VAR
);
1863 b
= float32_squash_input_denormal(b STATUS_VAR
);
1865 aSign
= extractFloat32Sign( a
);
1866 bSign
= extractFloat32Sign( b
);
1867 if ( aSign
== bSign
) {
1868 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1871 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1876 /*----------------------------------------------------------------------------
1877 | Returns the result of subtracting the single-precision floating-point values
1878 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1879 | for Binary Floating-Point Arithmetic.
1880 *----------------------------------------------------------------------------*/
1882 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1885 a
= float32_squash_input_denormal(a STATUS_VAR
);
1886 b
= float32_squash_input_denormal(b STATUS_VAR
);
1888 aSign
= extractFloat32Sign( a
);
1889 bSign
= extractFloat32Sign( b
);
1890 if ( aSign
== bSign
) {
1891 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1894 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of multiplying the single-precision floating-point values
1901 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1902 | for Binary Floating-Point Arithmetic.
1903 *----------------------------------------------------------------------------*/
1905 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1907 flag aSign
, bSign
, zSign
;
1908 int16 aExp
, bExp
, zExp
;
1913 a
= float32_squash_input_denormal(a STATUS_VAR
);
1914 b
= float32_squash_input_denormal(b STATUS_VAR
);
1916 aSig
= extractFloat32Frac( a
);
1917 aExp
= extractFloat32Exp( a
);
1918 aSign
= extractFloat32Sign( a
);
1919 bSig
= extractFloat32Frac( b
);
1920 bExp
= extractFloat32Exp( b
);
1921 bSign
= extractFloat32Sign( b
);
1922 zSign
= aSign
^ bSign
;
1923 if ( aExp
== 0xFF ) {
1924 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1925 return propagateFloat32NaN( a
, b STATUS_VAR
);
1927 if ( ( bExp
| bSig
) == 0 ) {
1928 float_raise( float_flag_invalid STATUS_VAR
);
1929 return float32_default_nan
;
1931 return packFloat32( zSign
, 0xFF, 0 );
1933 if ( bExp
== 0xFF ) {
1934 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1935 if ( ( aExp
| aSig
) == 0 ) {
1936 float_raise( float_flag_invalid STATUS_VAR
);
1937 return float32_default_nan
;
1939 return packFloat32( zSign
, 0xFF, 0 );
1942 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1943 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1946 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1947 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1949 zExp
= aExp
+ bExp
- 0x7F;
1950 aSig
= ( aSig
| 0x00800000 )<<7;
1951 bSig
= ( bSig
| 0x00800000 )<<8;
1952 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1954 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1958 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of dividing the single-precision floating-point value `a'
1964 | by the corresponding value `b'. The operation is performed according to the
1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966 *----------------------------------------------------------------------------*/
1968 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1970 flag aSign
, bSign
, zSign
;
1971 int16 aExp
, bExp
, zExp
;
1972 bits32 aSig
, bSig
, zSig
;
1973 a
= float32_squash_input_denormal(a STATUS_VAR
);
1974 b
= float32_squash_input_denormal(b STATUS_VAR
);
1976 aSig
= extractFloat32Frac( a
);
1977 aExp
= extractFloat32Exp( a
);
1978 aSign
= extractFloat32Sign( a
);
1979 bSig
= extractFloat32Frac( b
);
1980 bExp
= extractFloat32Exp( b
);
1981 bSign
= extractFloat32Sign( b
);
1982 zSign
= aSign
^ bSign
;
1983 if ( aExp
== 0xFF ) {
1984 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1985 if ( bExp
== 0xFF ) {
1986 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1987 float_raise( float_flag_invalid STATUS_VAR
);
1988 return float32_default_nan
;
1990 return packFloat32( zSign
, 0xFF, 0 );
1992 if ( bExp
== 0xFF ) {
1993 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1994 return packFloat32( zSign
, 0, 0 );
1998 if ( ( aExp
| aSig
) == 0 ) {
1999 float_raise( float_flag_invalid STATUS_VAR
);
2000 return float32_default_nan
;
2002 float_raise( float_flag_divbyzero STATUS_VAR
);
2003 return packFloat32( zSign
, 0xFF, 0 );
2005 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2008 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2009 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2011 zExp
= aExp
- bExp
+ 0x7D;
2012 aSig
= ( aSig
| 0x00800000 )<<7;
2013 bSig
= ( bSig
| 0x00800000 )<<8;
2014 if ( bSig
<= ( aSig
+ aSig
) ) {
2018 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2019 if ( ( zSig
& 0x3F ) == 0 ) {
2020 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
2022 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2026 /*----------------------------------------------------------------------------
2027 | Returns the remainder of the single-precision floating-point value `a'
2028 | with respect to the corresponding value `b'. The operation is performed
2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030 *----------------------------------------------------------------------------*/
2032 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
2035 int16 aExp
, bExp
, expDiff
;
2038 bits64 aSig64
, bSig64
, q64
;
2039 bits32 alternateASig
;
2041 a
= float32_squash_input_denormal(a STATUS_VAR
);
2042 b
= float32_squash_input_denormal(b STATUS_VAR
);
2044 aSig
= extractFloat32Frac( a
);
2045 aExp
= extractFloat32Exp( a
);
2046 aSign
= extractFloat32Sign( a
);
2047 bSig
= extractFloat32Frac( b
);
2048 bExp
= extractFloat32Exp( b
);
2049 if ( aExp
== 0xFF ) {
2050 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2051 return propagateFloat32NaN( a
, b STATUS_VAR
);
2053 float_raise( float_flag_invalid STATUS_VAR
);
2054 return float32_default_nan
;
2056 if ( bExp
== 0xFF ) {
2057 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2062 float_raise( float_flag_invalid STATUS_VAR
);
2063 return float32_default_nan
;
2065 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2068 if ( aSig
== 0 ) return a
;
2069 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2071 expDiff
= aExp
- bExp
;
2074 if ( expDiff
< 32 ) {
2077 if ( expDiff
< 0 ) {
2078 if ( expDiff
< -1 ) return a
;
2081 q
= ( bSig
<= aSig
);
2082 if ( q
) aSig
-= bSig
;
2083 if ( 0 < expDiff
) {
2084 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2087 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2095 if ( bSig
<= aSig
) aSig
-= bSig
;
2096 aSig64
= ( (bits64
) aSig
)<<40;
2097 bSig64
= ( (bits64
) bSig
)<<40;
2099 while ( 0 < expDiff
) {
2100 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2101 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2102 aSig64
= - ( ( bSig
* q64
)<<38 );
2106 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2107 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2108 q
= q64
>>( 64 - expDiff
);
2110 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2113 alternateASig
= aSig
;
2116 } while ( 0 <= (sbits32
) aSig
);
2117 sigMean
= aSig
+ alternateASig
;
2118 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2119 aSig
= alternateASig
;
2121 zSign
= ( (sbits32
) aSig
< 0 );
2122 if ( zSign
) aSig
= - aSig
;
2123 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2127 /*----------------------------------------------------------------------------
2128 | Returns the square root of the single-precision floating-point value `a'.
2129 | The operation is performed according to the IEC/IEEE Standard for Binary
2130 | Floating-Point Arithmetic.
2131 *----------------------------------------------------------------------------*/
2133 float32
float32_sqrt( float32 a STATUS_PARAM
)
2139 a
= float32_squash_input_denormal(a STATUS_VAR
);
2141 aSig
= extractFloat32Frac( a
);
2142 aExp
= extractFloat32Exp( a
);
2143 aSign
= extractFloat32Sign( a
);
2144 if ( aExp
== 0xFF ) {
2145 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2146 if ( ! aSign
) return a
;
2147 float_raise( float_flag_invalid STATUS_VAR
);
2148 return float32_default_nan
;
2151 if ( ( aExp
| aSig
) == 0 ) return a
;
2152 float_raise( float_flag_invalid STATUS_VAR
);
2153 return float32_default_nan
;
2156 if ( aSig
== 0 ) return float32_zero
;
2157 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2159 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2160 aSig
= ( aSig
| 0x00800000 )<<8;
2161 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2162 if ( ( zSig
& 0x7F ) <= 5 ) {
2168 term
= ( (bits64
) zSig
) * zSig
;
2169 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2170 while ( (sbits64
) rem
< 0 ) {
2172 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2174 zSig
|= ( rem
!= 0 );
2176 shift32RightJamming( zSig
, 1, &zSig
);
2178 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2182 /*----------------------------------------------------------------------------
2183 | Returns the binary exponential of the single-precision floating-point value
2184 | `a'. The operation is performed according to the IEC/IEEE Standard for
2185 | Binary Floating-Point Arithmetic.
2187 | Uses the following identities:
2189 | 1. -------------------------------------------------------------------------
2193 | 2. -------------------------------------------------------------------------
2196 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2198 *----------------------------------------------------------------------------*/
2200 static const float64 float32_exp2_coefficients
[15] =
2202 make_float64( 0x3ff0000000000000ll
), /* 1 */
2203 make_float64( 0x3fe0000000000000ll
), /* 2 */
2204 make_float64( 0x3fc5555555555555ll
), /* 3 */
2205 make_float64( 0x3fa5555555555555ll
), /* 4 */
2206 make_float64( 0x3f81111111111111ll
), /* 5 */
2207 make_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2208 make_float64( 0x3f2a01a01a01a01all
), /* 7 */
2209 make_float64( 0x3efa01a01a01a01all
), /* 8 */
2210 make_float64( 0x3ec71de3a556c734ll
), /* 9 */
2211 make_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2212 make_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2213 make_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2214 make_float64( 0x3de6124613a86d09ll
), /* 13 */
2215 make_float64( 0x3da93974a8c07c9dll
), /* 14 */
2216 make_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2219 float32
float32_exp2( float32 a STATUS_PARAM
)
2226 a
= float32_squash_input_denormal(a STATUS_VAR
);
2228 aSig
= extractFloat32Frac( a
);
2229 aExp
= extractFloat32Exp( a
);
2230 aSign
= extractFloat32Sign( a
);
2232 if ( aExp
== 0xFF) {
2233 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2234 return (aSign
) ? float32_zero
: a
;
2237 if (aSig
== 0) return float32_one
;
2240 float_raise( float_flag_inexact STATUS_VAR
);
2242 /* ******************************* */
2243 /* using float64 for approximation */
2244 /* ******************************* */
2245 x
= float32_to_float64(a STATUS_VAR
);
2246 x
= float64_mul(x
, float64_ln2 STATUS_VAR
);
2250 for (i
= 0 ; i
< 15 ; i
++) {
2253 f
= float64_mul(xn
, float32_exp2_coefficients
[i
] STATUS_VAR
);
2254 r
= float64_add(r
, f STATUS_VAR
);
2256 xn
= float64_mul(xn
, x STATUS_VAR
);
2259 return float64_to_float32(r
, status
);
2262 /*----------------------------------------------------------------------------
2263 | Returns the binary log of the single-precision floating-point value `a'.
2264 | The operation is performed according to the IEC/IEEE Standard for Binary
2265 | Floating-Point Arithmetic.
2266 *----------------------------------------------------------------------------*/
2267 float32
float32_log2( float32 a STATUS_PARAM
)
2271 bits32 aSig
, zSig
, i
;
2273 a
= float32_squash_input_denormal(a STATUS_VAR
);
2274 aSig
= extractFloat32Frac( a
);
2275 aExp
= extractFloat32Exp( a
);
2276 aSign
= extractFloat32Sign( a
);
2279 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2280 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2283 float_raise( float_flag_invalid STATUS_VAR
);
2284 return float32_default_nan
;
2286 if ( aExp
== 0xFF ) {
2287 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2296 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2297 aSig
= ( (bits64
)aSig
* aSig
) >> 23;
2298 if ( aSig
& 0x01000000 ) {
2307 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2310 /*----------------------------------------------------------------------------
2311 | Returns 1 if the single-precision floating-point value `a' is equal to
2312 | the corresponding value `b', and 0 otherwise. The comparison is performed
2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314 *----------------------------------------------------------------------------*/
2316 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2318 a
= float32_squash_input_denormal(a STATUS_VAR
);
2319 b
= float32_squash_input_denormal(b STATUS_VAR
);
2321 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2322 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2324 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2325 float_raise( float_flag_invalid STATUS_VAR
);
2329 return ( float32_val(a
) == float32_val(b
) ) ||
2330 ( (bits32
) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2334 /*----------------------------------------------------------------------------
2335 | Returns 1 if the single-precision floating-point value `a' is less than
2336 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2339 *----------------------------------------------------------------------------*/
2341 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2345 a
= float32_squash_input_denormal(a STATUS_VAR
);
2346 b
= float32_squash_input_denormal(b STATUS_VAR
);
2348 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2349 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2351 float_raise( float_flag_invalid STATUS_VAR
);
2354 aSign
= extractFloat32Sign( a
);
2355 bSign
= extractFloat32Sign( b
);
2356 av
= float32_val(a
);
2357 bv
= float32_val(b
);
2358 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2359 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2363 /*----------------------------------------------------------------------------
2364 | Returns 1 if the single-precision floating-point value `a' is less than
2365 | the corresponding value `b', and 0 otherwise. The comparison is performed
2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2369 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2373 a
= float32_squash_input_denormal(a STATUS_VAR
);
2374 b
= float32_squash_input_denormal(b STATUS_VAR
);
2376 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2377 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2379 float_raise( float_flag_invalid STATUS_VAR
);
2382 aSign
= extractFloat32Sign( a
);
2383 bSign
= extractFloat32Sign( b
);
2384 av
= float32_val(a
);
2385 bv
= float32_val(b
);
2386 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2387 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2391 /*----------------------------------------------------------------------------
2392 | Returns 1 if the single-precision floating-point value `a' is equal to
2393 | the corresponding value `b', and 0 otherwise. The invalid exception is
2394 | raised if either operand is a NaN. Otherwise, the comparison is performed
2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396 *----------------------------------------------------------------------------*/
2398 int float32_eq_signaling( float32 a
, float32 b STATUS_PARAM
)
2401 a
= float32_squash_input_denormal(a STATUS_VAR
);
2402 b
= float32_squash_input_denormal(b STATUS_VAR
);
2404 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2405 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2407 float_raise( float_flag_invalid STATUS_VAR
);
2410 av
= float32_val(a
);
2411 bv
= float32_val(b
);
2412 return ( av
== bv
) || ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is less than or
2418 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2419 | cause an exception. Otherwise, the comparison is performed according to the
2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421 *----------------------------------------------------------------------------*/
2423 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2427 a
= float32_squash_input_denormal(a STATUS_VAR
);
2428 b
= float32_squash_input_denormal(b STATUS_VAR
);
2430 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2431 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2433 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2434 float_raise( float_flag_invalid STATUS_VAR
);
2438 aSign
= extractFloat32Sign( a
);
2439 bSign
= extractFloat32Sign( b
);
2440 av
= float32_val(a
);
2441 bv
= float32_val(b
);
2442 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2443 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2447 /*----------------------------------------------------------------------------
2448 | Returns 1 if the single-precision floating-point value `a' is less than
2449 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2450 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2451 | Standard for Binary Floating-Point Arithmetic.
2452 *----------------------------------------------------------------------------*/
2454 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2458 a
= float32_squash_input_denormal(a STATUS_VAR
);
2459 b
= float32_squash_input_denormal(b STATUS_VAR
);
2461 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2462 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2464 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2465 float_raise( float_flag_invalid STATUS_VAR
);
2469 aSign
= extractFloat32Sign( a
);
2470 bSign
= extractFloat32Sign( b
);
2471 av
= float32_val(a
);
2472 bv
= float32_val(b
);
2473 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2474 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2478 /*----------------------------------------------------------------------------
2479 | Returns the result of converting the double-precision floating-point value
2480 | `a' to the 32-bit two's complement integer format. The conversion is
2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2482 | Arithmetic---which means in particular that the conversion is rounded
2483 | according to the current rounding mode. If `a' is a NaN, the largest
2484 | positive integer is returned. Otherwise, if the conversion overflows, the
2485 | largest integer with the same sign as `a' is returned.
2486 *----------------------------------------------------------------------------*/
2488 int32
float64_to_int32( float64 a STATUS_PARAM
)
2491 int16 aExp
, shiftCount
;
2493 a
= float64_squash_input_denormal(a STATUS_VAR
);
2495 aSig
= extractFloat64Frac( a
);
2496 aExp
= extractFloat64Exp( a
);
2497 aSign
= extractFloat64Sign( a
);
2498 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2499 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2500 shiftCount
= 0x42C - aExp
;
2501 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2502 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2506 /*----------------------------------------------------------------------------
2507 | Returns the result of converting the double-precision floating-point value
2508 | `a' to the 32-bit two's complement integer format. The conversion is
2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2510 | Arithmetic, except that the conversion is always rounded toward zero.
2511 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2512 | the conversion overflows, the largest integer with the same sign as `a' is
2514 *----------------------------------------------------------------------------*/
2516 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2519 int16 aExp
, shiftCount
;
2520 bits64 aSig
, savedASig
;
2522 a
= float64_squash_input_denormal(a STATUS_VAR
);
2524 aSig
= extractFloat64Frac( a
);
2525 aExp
= extractFloat64Exp( a
);
2526 aSign
= extractFloat64Sign( a
);
2527 if ( 0x41E < aExp
) {
2528 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2531 else if ( aExp
< 0x3FF ) {
2532 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2535 aSig
|= LIT64( 0x0010000000000000 );
2536 shiftCount
= 0x433 - aExp
;
2538 aSig
>>= shiftCount
;
2540 if ( aSign
) z
= - z
;
2541 if ( ( z
< 0 ) ^ aSign
) {
2543 float_raise( float_flag_invalid STATUS_VAR
);
2544 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2546 if ( ( aSig
<<shiftCount
) != savedASig
) {
2547 STATUS(float_exception_flags
) |= float_flag_inexact
;
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 16-bit two's complement integer format. The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2561 *----------------------------------------------------------------------------*/
2563 int16
float64_to_int16_round_to_zero( float64 a STATUS_PARAM
)
2566 int16 aExp
, shiftCount
;
2567 bits64 aSig
, savedASig
;
2570 aSig
= extractFloat64Frac( a
);
2571 aExp
= extractFloat64Exp( a
);
2572 aSign
= extractFloat64Sign( a
);
2573 if ( 0x40E < aExp
) {
2574 if ( ( aExp
== 0x7FF ) && aSig
) {
2579 else if ( aExp
< 0x3FF ) {
2580 if ( aExp
|| aSig
) {
2581 STATUS(float_exception_flags
) |= float_flag_inexact
;
2585 aSig
|= LIT64( 0x0010000000000000 );
2586 shiftCount
= 0x433 - aExp
;
2588 aSig
>>= shiftCount
;
2593 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
2595 float_raise( float_flag_invalid STATUS_VAR
);
2596 return aSign
? (sbits32
) 0xffff8000 : 0x7FFF;
2598 if ( ( aSig
<<shiftCount
) != savedASig
) {
2599 STATUS(float_exception_flags
) |= float_flag_inexact
;
2604 /*----------------------------------------------------------------------------
2605 | Returns the result of converting the double-precision floating-point value
2606 | `a' to the 64-bit two's complement integer format. The conversion is
2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2608 | Arithmetic---which means in particular that the conversion is rounded
2609 | according to the current rounding mode. If `a' is a NaN, the largest
2610 | positive integer is returned. Otherwise, if the conversion overflows, the
2611 | largest integer with the same sign as `a' is returned.
2612 *----------------------------------------------------------------------------*/
2614 int64
float64_to_int64( float64 a STATUS_PARAM
)
2617 int16 aExp
, shiftCount
;
2618 bits64 aSig
, aSigExtra
;
2619 a
= float64_squash_input_denormal(a STATUS_VAR
);
2621 aSig
= extractFloat64Frac( a
);
2622 aExp
= extractFloat64Exp( a
);
2623 aSign
= extractFloat64Sign( a
);
2624 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2625 shiftCount
= 0x433 - aExp
;
2626 if ( shiftCount
<= 0 ) {
2627 if ( 0x43E < aExp
) {
2628 float_raise( float_flag_invalid STATUS_VAR
);
2630 || ( ( aExp
== 0x7FF )
2631 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2633 return LIT64( 0x7FFFFFFFFFFFFFFF );
2635 return (sbits64
) LIT64( 0x8000000000000000 );
2638 aSig
<<= - shiftCount
;
2641 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2643 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the double-precision floating-point value
2649 | `a' to the 64-bit two's complement integer format. The conversion is
2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic, except that the conversion is always rounded toward zero.
2652 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2653 | the conversion overflows, the largest integer with the same sign as `a' is
2655 *----------------------------------------------------------------------------*/
2657 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2660 int16 aExp
, shiftCount
;
2663 a
= float64_squash_input_denormal(a STATUS_VAR
);
2665 aSig
= extractFloat64Frac( a
);
2666 aExp
= extractFloat64Exp( a
);
2667 aSign
= extractFloat64Sign( a
);
2668 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2669 shiftCount
= aExp
- 0x433;
2670 if ( 0 <= shiftCount
) {
2671 if ( 0x43E <= aExp
) {
2672 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
2673 float_raise( float_flag_invalid STATUS_VAR
);
2675 || ( ( aExp
== 0x7FF )
2676 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2678 return LIT64( 0x7FFFFFFFFFFFFFFF );
2681 return (sbits64
) LIT64( 0x8000000000000000 );
2683 z
= aSig
<<shiftCount
;
2686 if ( aExp
< 0x3FE ) {
2687 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2690 z
= aSig
>>( - shiftCount
);
2691 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2692 STATUS(float_exception_flags
) |= float_flag_inexact
;
2695 if ( aSign
) z
= - z
;
2700 /*----------------------------------------------------------------------------
2701 | Returns the result of converting the double-precision floating-point value
2702 | `a' to the single-precision floating-point format. The conversion is
2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2705 *----------------------------------------------------------------------------*/
2707 float32
float64_to_float32( float64 a STATUS_PARAM
)
2713 a
= float64_squash_input_denormal(a STATUS_VAR
);
2715 aSig
= extractFloat64Frac( a
);
2716 aExp
= extractFloat64Exp( a
);
2717 aSign
= extractFloat64Sign( a
);
2718 if ( aExp
== 0x7FF ) {
2719 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2720 return packFloat32( aSign
, 0xFF, 0 );
2722 shift64RightJamming( aSig
, 22, &aSig
);
2724 if ( aExp
|| zSig
) {
2728 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2733 /*----------------------------------------------------------------------------
2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735 | half-precision floating-point value, returning the result. After being
2736 | shifted into the proper positions, the three fields are simply added
2737 | together to form the result. This means that any integer portion of `zSig'
2738 | will be added into the exponent. Since a properly normalized significand
2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2740 | than the desired result exponent whenever `zSig' is a complete, normalized
2742 *----------------------------------------------------------------------------*/
2743 static float16
packFloat16(flag zSign
, int16 zExp
, bits16 zSig
)
2745 return make_float16(
2746 (((bits32
)zSign
) << 15) + (((bits32
)zExp
) << 10) + zSig
);
2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2752 float32
float16_to_float32(float16 a
, flag ieee STATUS_PARAM
)
2758 aSign
= extractFloat16Sign(a
);
2759 aExp
= extractFloat16Exp(a
);
2760 aSig
= extractFloat16Frac(a
);
2762 if (aExp
== 0x1f && ieee
) {
2764 /* Make sure correct exceptions are raised. */
2765 float32ToCommonNaN(a STATUS_VAR
);
2768 return packFloat32(aSign
, 0xff, aSig
<< 13);
2774 return packFloat32(aSign
, 0, 0);
2777 shiftCount
= countLeadingZeros32( aSig
) - 21;
2778 aSig
= aSig
<< shiftCount
;
2781 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
2784 float16
float32_to_float16(float32 a
, flag ieee STATUS_PARAM
)
2792 a
= float32_squash_input_denormal(a STATUS_VAR
);
2794 aSig
= extractFloat32Frac( a
);
2795 aExp
= extractFloat32Exp( a
);
2796 aSign
= extractFloat32Sign( a
);
2797 if ( aExp
== 0xFF ) {
2799 /* Input is a NaN */
2800 float16 r
= commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2802 return packFloat16(aSign
, 0, 0);
2808 float_raise(float_flag_invalid STATUS_VAR
);
2809 return packFloat16(aSign
, 0x1f, 0x3ff);
2811 return packFloat16(aSign
, 0x1f, 0);
2813 if (aExp
== 0 && aSig
== 0) {
2814 return packFloat16(aSign
, 0, 0);
2816 /* Decimal point between bits 22 and 23. */
2828 float_raise( float_flag_underflow STATUS_VAR
);
2829 roundingMode
= STATUS(float_rounding_mode
);
2830 switch (roundingMode
) {
2831 case float_round_nearest_even
:
2832 increment
= (mask
+ 1) >> 1;
2833 if ((aSig
& mask
) == increment
) {
2834 increment
= aSig
& (increment
<< 1);
2837 case float_round_up
:
2838 increment
= aSign
? 0 : mask
;
2840 case float_round_down
:
2841 increment
= aSign
? mask
: 0;
2843 default: /* round_to_zero */
2848 if (aSig
>= 0x01000000) {
2852 } else if (aExp
< -14
2853 && STATUS(float_detect_tininess
) == float_tininess_before_rounding
) {
2854 float_raise( float_flag_underflow STATUS_VAR
);
2859 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2860 return packFloat16(aSign
, 0x1f, 0);
2864 float_raise(float_flag_invalid
| float_flag_inexact STATUS_VAR
);
2865 return packFloat16(aSign
, 0x1f, 0x3ff);
2869 return packFloat16(aSign
, 0, 0);
2872 aSig
>>= -14 - aExp
;
2875 return packFloat16(aSign
, aExp
+ 14, aSig
>> 13);
2880 /*----------------------------------------------------------------------------
2881 | Returns the result of converting the double-precision floating-point value
2882 | `a' to the extended double-precision floating-point format. The conversion
2883 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2885 *----------------------------------------------------------------------------*/
2887 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2893 a
= float64_squash_input_denormal(a STATUS_VAR
);
2894 aSig
= extractFloat64Frac( a
);
2895 aExp
= extractFloat64Exp( a
);
2896 aSign
= extractFloat64Sign( a
);
2897 if ( aExp
== 0x7FF ) {
2898 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2899 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2902 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2903 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2907 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2915 /*----------------------------------------------------------------------------
2916 | Returns the result of converting the double-precision floating-point value
2917 | `a' to the quadruple-precision floating-point format. The conversion is
2918 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2920 *----------------------------------------------------------------------------*/
2922 float128
float64_to_float128( float64 a STATUS_PARAM
)
2926 bits64 aSig
, zSig0
, zSig1
;
2928 a
= float64_squash_input_denormal(a STATUS_VAR
);
2929 aSig
= extractFloat64Frac( a
);
2930 aExp
= extractFloat64Exp( a
);
2931 aSign
= extractFloat64Sign( a
);
2932 if ( aExp
== 0x7FF ) {
2933 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2934 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2937 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2938 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2941 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2942 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2948 /*----------------------------------------------------------------------------
2949 | Rounds the double-precision floating-point value `a' to an integer, and
2950 | returns the result as a double-precision floating-point value. The
2951 | operation is performed according to the IEC/IEEE Standard for Binary
2952 | Floating-Point Arithmetic.
2953 *----------------------------------------------------------------------------*/
2955 float64
float64_round_to_int( float64 a STATUS_PARAM
)
2959 bits64 lastBitMask
, roundBitsMask
;
2962 a
= float64_squash_input_denormal(a STATUS_VAR
);
2964 aExp
= extractFloat64Exp( a
);
2965 if ( 0x433 <= aExp
) {
2966 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2967 return propagateFloat64NaN( a
, a STATUS_VAR
);
2971 if ( aExp
< 0x3FF ) {
2972 if ( (bits64
) ( float64_val(a
)<<1 ) == 0 ) return a
;
2973 STATUS(float_exception_flags
) |= float_flag_inexact
;
2974 aSign
= extractFloat64Sign( a
);
2975 switch ( STATUS(float_rounding_mode
) ) {
2976 case float_round_nearest_even
:
2977 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2978 return packFloat64( aSign
, 0x3FF, 0 );
2981 case float_round_down
:
2982 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
2983 case float_round_up
:
2984 return make_float64(
2985 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2987 return packFloat64( aSign
, 0, 0 );
2990 lastBitMask
<<= 0x433 - aExp
;
2991 roundBitsMask
= lastBitMask
- 1;
2993 roundingMode
= STATUS(float_rounding_mode
);
2994 if ( roundingMode
== float_round_nearest_even
) {
2995 z
+= lastBitMask
>>1;
2996 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2998 else if ( roundingMode
!= float_round_to_zero
) {
2999 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
3003 z
&= ~ roundBitsMask
;
3004 if ( z
!= float64_val(a
) )
3005 STATUS(float_exception_flags
) |= float_flag_inexact
;
3006 return make_float64(z
);
3010 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
3014 oldmode
= STATUS(float_rounding_mode
);
3015 STATUS(float_rounding_mode
) = float_round_to_zero
;
3016 res
= float64_round_to_int(a STATUS_VAR
);
3017 STATUS(float_rounding_mode
) = oldmode
;
3021 /*----------------------------------------------------------------------------
3022 | Returns the result of adding the absolute values of the double-precision
3023 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3024 | before being returned. `zSign' is ignored if the result is a NaN.
3025 | The addition is performed according to the IEC/IEEE Standard for Binary
3026 | Floating-Point Arithmetic.
3027 *----------------------------------------------------------------------------*/
3029 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3031 int16 aExp
, bExp
, zExp
;
3032 bits64 aSig
, bSig
, zSig
;
3035 aSig
= extractFloat64Frac( a
);
3036 aExp
= extractFloat64Exp( a
);
3037 bSig
= extractFloat64Frac( b
);
3038 bExp
= extractFloat64Exp( b
);
3039 expDiff
= aExp
- bExp
;
3042 if ( 0 < expDiff
) {
3043 if ( aExp
== 0x7FF ) {
3044 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3051 bSig
|= LIT64( 0x2000000000000000 );
3053 shift64RightJamming( bSig
, expDiff
, &bSig
);
3056 else if ( expDiff
< 0 ) {
3057 if ( bExp
== 0x7FF ) {
3058 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3059 return packFloat64( zSign
, 0x7FF, 0 );
3065 aSig
|= LIT64( 0x2000000000000000 );
3067 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3071 if ( aExp
== 0x7FF ) {
3072 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3076 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
3077 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3079 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3083 aSig
|= LIT64( 0x2000000000000000 );
3084 zSig
= ( aSig
+ bSig
)<<1;
3086 if ( (sbits64
) zSig
< 0 ) {
3091 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3095 /*----------------------------------------------------------------------------
3096 | Returns the result of subtracting the absolute values of the double-
3097 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3098 | difference is negated before being returned. `zSign' is ignored if the
3099 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3100 | Standard for Binary Floating-Point Arithmetic.
3101 *----------------------------------------------------------------------------*/
3103 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3105 int16 aExp
, bExp
, zExp
;
3106 bits64 aSig
, bSig
, zSig
;
3109 aSig
= extractFloat64Frac( a
);
3110 aExp
= extractFloat64Exp( a
);
3111 bSig
= extractFloat64Frac( b
);
3112 bExp
= extractFloat64Exp( b
);
3113 expDiff
= aExp
- bExp
;
3116 if ( 0 < expDiff
) goto aExpBigger
;
3117 if ( expDiff
< 0 ) goto bExpBigger
;
3118 if ( aExp
== 0x7FF ) {
3119 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3120 float_raise( float_flag_invalid STATUS_VAR
);
3121 return float64_default_nan
;
3127 if ( bSig
< aSig
) goto aBigger
;
3128 if ( aSig
< bSig
) goto bBigger
;
3129 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3131 if ( bExp
== 0x7FF ) {
3132 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3133 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3139 aSig
|= LIT64( 0x4000000000000000 );
3141 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3142 bSig
|= LIT64( 0x4000000000000000 );
3147 goto normalizeRoundAndPack
;
3149 if ( aExp
== 0x7FF ) {
3150 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3157 bSig
|= LIT64( 0x4000000000000000 );
3159 shift64RightJamming( bSig
, expDiff
, &bSig
);
3160 aSig
|= LIT64( 0x4000000000000000 );
3164 normalizeRoundAndPack
:
3166 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3170 /*----------------------------------------------------------------------------
3171 | Returns the result of adding the double-precision floating-point values `a'
3172 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3173 | Binary Floating-Point Arithmetic.
3174 *----------------------------------------------------------------------------*/
3176 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
3179 a
= float64_squash_input_denormal(a STATUS_VAR
);
3180 b
= float64_squash_input_denormal(b STATUS_VAR
);
3182 aSign
= extractFloat64Sign( a
);
3183 bSign
= extractFloat64Sign( b
);
3184 if ( aSign
== bSign
) {
3185 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3188 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3193 /*----------------------------------------------------------------------------
3194 | Returns the result of subtracting the double-precision floating-point values
3195 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3196 | for Binary Floating-Point Arithmetic.
3197 *----------------------------------------------------------------------------*/
3199 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
3202 a
= float64_squash_input_denormal(a STATUS_VAR
);
3203 b
= float64_squash_input_denormal(b STATUS_VAR
);
3205 aSign
= extractFloat64Sign( a
);
3206 bSign
= extractFloat64Sign( b
);
3207 if ( aSign
== bSign
) {
3208 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3211 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3216 /*----------------------------------------------------------------------------
3217 | Returns the result of multiplying the double-precision floating-point values
3218 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3219 | for Binary Floating-Point Arithmetic.
3220 *----------------------------------------------------------------------------*/
3222 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
3224 flag aSign
, bSign
, zSign
;
3225 int16 aExp
, bExp
, zExp
;
3226 bits64 aSig
, bSig
, zSig0
, zSig1
;
3228 a
= float64_squash_input_denormal(a STATUS_VAR
);
3229 b
= float64_squash_input_denormal(b STATUS_VAR
);
3231 aSig
= extractFloat64Frac( a
);
3232 aExp
= extractFloat64Exp( a
);
3233 aSign
= extractFloat64Sign( a
);
3234 bSig
= extractFloat64Frac( b
);
3235 bExp
= extractFloat64Exp( b
);
3236 bSign
= extractFloat64Sign( b
);
3237 zSign
= aSign
^ bSign
;
3238 if ( aExp
== 0x7FF ) {
3239 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3240 return propagateFloat64NaN( a
, b STATUS_VAR
);
3242 if ( ( bExp
| bSig
) == 0 ) {
3243 float_raise( float_flag_invalid STATUS_VAR
);
3244 return float64_default_nan
;
3246 return packFloat64( zSign
, 0x7FF, 0 );
3248 if ( bExp
== 0x7FF ) {
3249 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3250 if ( ( aExp
| aSig
) == 0 ) {
3251 float_raise( float_flag_invalid STATUS_VAR
);
3252 return float64_default_nan
;
3254 return packFloat64( zSign
, 0x7FF, 0 );
3257 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3258 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3261 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3262 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3264 zExp
= aExp
+ bExp
- 0x3FF;
3265 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3266 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3267 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3268 zSig0
|= ( zSig1
!= 0 );
3269 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
3273 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
3277 /*----------------------------------------------------------------------------
3278 | Returns the result of dividing the double-precision floating-point value `a'
3279 | by the corresponding value `b'. The operation is performed according to
3280 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3281 *----------------------------------------------------------------------------*/
3283 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
3285 flag aSign
, bSign
, zSign
;
3286 int16 aExp
, bExp
, zExp
;
3287 bits64 aSig
, bSig
, zSig
;
3289 bits64 term0
, term1
;
3290 a
= float64_squash_input_denormal(a STATUS_VAR
);
3291 b
= float64_squash_input_denormal(b STATUS_VAR
);
3293 aSig
= extractFloat64Frac( a
);
3294 aExp
= extractFloat64Exp( a
);
3295 aSign
= extractFloat64Sign( a
);
3296 bSig
= extractFloat64Frac( b
);
3297 bExp
= extractFloat64Exp( b
);
3298 bSign
= extractFloat64Sign( b
);
3299 zSign
= aSign
^ bSign
;
3300 if ( aExp
== 0x7FF ) {
3301 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3302 if ( bExp
== 0x7FF ) {
3303 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3304 float_raise( float_flag_invalid STATUS_VAR
);
3305 return float64_default_nan
;
3307 return packFloat64( zSign
, 0x7FF, 0 );
3309 if ( bExp
== 0x7FF ) {
3310 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3311 return packFloat64( zSign
, 0, 0 );
3315 if ( ( aExp
| aSig
) == 0 ) {
3316 float_raise( float_flag_invalid STATUS_VAR
);
3317 return float64_default_nan
;
3319 float_raise( float_flag_divbyzero STATUS_VAR
);
3320 return packFloat64( zSign
, 0x7FF, 0 );
3322 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3325 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3326 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3328 zExp
= aExp
- bExp
+ 0x3FD;
3329 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3330 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3331 if ( bSig
<= ( aSig
+ aSig
) ) {
3335 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3336 if ( ( zSig
& 0x1FF ) <= 2 ) {
3337 mul64To128( bSig
, zSig
, &term0
, &term1
);
3338 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3339 while ( (sbits64
) rem0
< 0 ) {
3341 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3343 zSig
|= ( rem1
!= 0 );
3345 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3349 /*----------------------------------------------------------------------------
3350 | Returns the remainder of the double-precision floating-point value `a'
3351 | with respect to the corresponding value `b'. The operation is performed
3352 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3353 *----------------------------------------------------------------------------*/
3355 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3358 int16 aExp
, bExp
, expDiff
;
3360 bits64 q
, alternateASig
;
3363 a
= float64_squash_input_denormal(a STATUS_VAR
);
3364 b
= float64_squash_input_denormal(b STATUS_VAR
);
3365 aSig
= extractFloat64Frac( a
);
3366 aExp
= extractFloat64Exp( a
);
3367 aSign
= extractFloat64Sign( a
);
3368 bSig
= extractFloat64Frac( b
);
3369 bExp
= extractFloat64Exp( b
);
3370 if ( aExp
== 0x7FF ) {
3371 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3372 return propagateFloat64NaN( a
, b STATUS_VAR
);
3374 float_raise( float_flag_invalid STATUS_VAR
);
3375 return float64_default_nan
;
3377 if ( bExp
== 0x7FF ) {
3378 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3383 float_raise( float_flag_invalid STATUS_VAR
);
3384 return float64_default_nan
;
3386 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3389 if ( aSig
== 0 ) return a
;
3390 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3392 expDiff
= aExp
- bExp
;
3393 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3394 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3395 if ( expDiff
< 0 ) {
3396 if ( expDiff
< -1 ) return a
;
3399 q
= ( bSig
<= aSig
);
3400 if ( q
) aSig
-= bSig
;
3402 while ( 0 < expDiff
) {
3403 q
= estimateDiv128To64( aSig
, 0, bSig
);
3404 q
= ( 2 < q
) ? q
- 2 : 0;
3405 aSig
= - ( ( bSig
>>2 ) * q
);
3409 if ( 0 < expDiff
) {
3410 q
= estimateDiv128To64( aSig
, 0, bSig
);
3411 q
= ( 2 < q
) ? q
- 2 : 0;
3414 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3421 alternateASig
= aSig
;
3424 } while ( 0 <= (sbits64
) aSig
);
3425 sigMean
= aSig
+ alternateASig
;
3426 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3427 aSig
= alternateASig
;
3429 zSign
= ( (sbits64
) aSig
< 0 );
3430 if ( zSign
) aSig
= - aSig
;
3431 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
3435 /*----------------------------------------------------------------------------
3436 | Returns the square root of the double-precision floating-point value `a'.
3437 | The operation is performed according to the IEC/IEEE Standard for Binary
3438 | Floating-Point Arithmetic.
3439 *----------------------------------------------------------------------------*/
3441 float64
float64_sqrt( float64 a STATUS_PARAM
)
3445 bits64 aSig
, zSig
, doubleZSig
;
3446 bits64 rem0
, rem1
, term0
, term1
;
3447 a
= float64_squash_input_denormal(a STATUS_VAR
);
3449 aSig
= extractFloat64Frac( a
);
3450 aExp
= extractFloat64Exp( a
);
3451 aSign
= extractFloat64Sign( a
);
3452 if ( aExp
== 0x7FF ) {
3453 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
3454 if ( ! aSign
) return a
;
3455 float_raise( float_flag_invalid STATUS_VAR
);
3456 return float64_default_nan
;
3459 if ( ( aExp
| aSig
) == 0 ) return a
;
3460 float_raise( float_flag_invalid STATUS_VAR
);
3461 return float64_default_nan
;
3464 if ( aSig
== 0 ) return float64_zero
;
3465 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3467 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3468 aSig
|= LIT64( 0x0010000000000000 );
3469 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3470 aSig
<<= 9 - ( aExp
& 1 );
3471 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3472 if ( ( zSig
& 0x1FF ) <= 5 ) {
3473 doubleZSig
= zSig
<<1;
3474 mul64To128( zSig
, zSig
, &term0
, &term1
);
3475 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3476 while ( (sbits64
) rem0
< 0 ) {
3479 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3481 zSig
|= ( ( rem0
| rem1
) != 0 );
3483 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
3487 /*----------------------------------------------------------------------------
3488 | Returns the binary log of the double-precision floating-point value `a'.
3489 | The operation is performed according to the IEC/IEEE Standard for Binary
3490 | Floating-Point Arithmetic.
3491 *----------------------------------------------------------------------------*/
3492 float64
float64_log2( float64 a STATUS_PARAM
)
3496 bits64 aSig
, aSig0
, aSig1
, zSig
, i
;
3497 a
= float64_squash_input_denormal(a STATUS_VAR
);
3499 aSig
= extractFloat64Frac( a
);
3500 aExp
= extractFloat64Exp( a
);
3501 aSign
= extractFloat64Sign( a
);
3504 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
3505 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3508 float_raise( float_flag_invalid STATUS_VAR
);
3509 return float64_default_nan
;
3511 if ( aExp
== 0x7FF ) {
3512 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
3517 aSig
|= LIT64( 0x0010000000000000 );
3519 zSig
= (bits64
)aExp
<< 52;
3520 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
3521 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
3522 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
3523 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
3531 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
3534 /*----------------------------------------------------------------------------
3535 | Returns 1 if the double-precision floating-point value `a' is equal to the
3536 | corresponding value `b', and 0 otherwise. The comparison is performed
3537 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3538 *----------------------------------------------------------------------------*/
3540 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
3543 a
= float64_squash_input_denormal(a STATUS_VAR
);
3544 b
= float64_squash_input_denormal(b STATUS_VAR
);
3546 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3547 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3549 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3550 float_raise( float_flag_invalid STATUS_VAR
);
3554 av
= float64_val(a
);
3555 bv
= float64_val(b
);
3556 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3560 /*----------------------------------------------------------------------------
3561 | Returns 1 if the double-precision floating-point value `a' is less than or
3562 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3563 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3565 *----------------------------------------------------------------------------*/
3567 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3571 a
= float64_squash_input_denormal(a STATUS_VAR
);
3572 b
= float64_squash_input_denormal(b STATUS_VAR
);
3574 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3575 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3577 float_raise( float_flag_invalid STATUS_VAR
);
3580 aSign
= extractFloat64Sign( a
);
3581 bSign
= extractFloat64Sign( b
);
3582 av
= float64_val(a
);
3583 bv
= float64_val(b
);
3584 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3585 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3589 /*----------------------------------------------------------------------------
3590 | Returns 1 if the double-precision floating-point value `a' is less than
3591 | the corresponding value `b', and 0 otherwise. The comparison is performed
3592 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3593 *----------------------------------------------------------------------------*/
3595 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3600 a
= float64_squash_input_denormal(a STATUS_VAR
);
3601 b
= float64_squash_input_denormal(b STATUS_VAR
);
3602 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3603 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3605 float_raise( float_flag_invalid STATUS_VAR
);
3608 aSign
= extractFloat64Sign( a
);
3609 bSign
= extractFloat64Sign( b
);
3610 av
= float64_val(a
);
3611 bv
= float64_val(b
);
3612 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3613 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3617 /*----------------------------------------------------------------------------
3618 | Returns 1 if the double-precision floating-point value `a' is equal to the
3619 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3620 | if either operand is a NaN. Otherwise, the comparison is performed
3621 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3622 *----------------------------------------------------------------------------*/
3624 int float64_eq_signaling( float64 a
, float64 b STATUS_PARAM
)
3627 a
= float64_squash_input_denormal(a STATUS_VAR
);
3628 b
= float64_squash_input_denormal(b STATUS_VAR
);
3630 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3631 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3633 float_raise( float_flag_invalid STATUS_VAR
);
3636 av
= float64_val(a
);
3637 bv
= float64_val(b
);
3638 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3642 /*----------------------------------------------------------------------------
3643 | Returns 1 if the double-precision floating-point value `a' is less than or
3644 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3645 | cause an exception. Otherwise, the comparison is performed according to the
3646 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3647 *----------------------------------------------------------------------------*/
3649 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3653 a
= float64_squash_input_denormal(a STATUS_VAR
);
3654 b
= float64_squash_input_denormal(b STATUS_VAR
);
3656 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3657 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3659 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3660 float_raise( float_flag_invalid STATUS_VAR
);
3664 aSign
= extractFloat64Sign( a
);
3665 bSign
= extractFloat64Sign( b
);
3666 av
= float64_val(a
);
3667 bv
= float64_val(b
);
3668 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3669 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3673 /*----------------------------------------------------------------------------
3674 | Returns 1 if the double-precision floating-point value `a' is less than
3675 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3676 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3677 | Standard for Binary Floating-Point Arithmetic.
3678 *----------------------------------------------------------------------------*/
3680 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3684 a
= float64_squash_input_denormal(a STATUS_VAR
);
3685 b
= float64_squash_input_denormal(b STATUS_VAR
);
3687 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3688 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3690 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3691 float_raise( float_flag_invalid STATUS_VAR
);
3695 aSign
= extractFloat64Sign( a
);
3696 bSign
= extractFloat64Sign( b
);
3697 av
= float64_val(a
);
3698 bv
= float64_val(b
);
3699 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3700 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3706 /*----------------------------------------------------------------------------
3707 | Returns the result of converting the extended double-precision floating-
3708 | point value `a' to the 32-bit two's complement integer format. The
3709 | conversion is performed according to the IEC/IEEE Standard for Binary
3710 | Floating-Point Arithmetic---which means in particular that the conversion
3711 | is rounded according to the current rounding mode. If `a' is a NaN, the
3712 | largest positive integer is returned. Otherwise, if the conversion
3713 | overflows, the largest integer with the same sign as `a' is returned.
3714 *----------------------------------------------------------------------------*/
3716 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3719 int32 aExp
, shiftCount
;
3722 aSig
= extractFloatx80Frac( a
);
3723 aExp
= extractFloatx80Exp( a
);
3724 aSign
= extractFloatx80Sign( a
);
3725 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3726 shiftCount
= 0x4037 - aExp
;
3727 if ( shiftCount
<= 0 ) shiftCount
= 1;
3728 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3729 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3733 /*----------------------------------------------------------------------------
3734 | Returns the result of converting the extended double-precision floating-
3735 | point value `a' to the 32-bit two's complement integer format. The
3736 | conversion is performed according to the IEC/IEEE Standard for Binary
3737 | Floating-Point Arithmetic, except that the conversion is always rounded
3738 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3739 | Otherwise, if the conversion overflows, the largest integer with the same
3740 | sign as `a' is returned.
3741 *----------------------------------------------------------------------------*/
3743 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3746 int32 aExp
, shiftCount
;
3747 bits64 aSig
, savedASig
;
3750 aSig
= extractFloatx80Frac( a
);
3751 aExp
= extractFloatx80Exp( a
);
3752 aSign
= extractFloatx80Sign( a
);
3753 if ( 0x401E < aExp
) {
3754 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3757 else if ( aExp
< 0x3FFF ) {
3758 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3761 shiftCount
= 0x403E - aExp
;
3763 aSig
>>= shiftCount
;
3765 if ( aSign
) z
= - z
;
3766 if ( ( z
< 0 ) ^ aSign
) {
3768 float_raise( float_flag_invalid STATUS_VAR
);
3769 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3771 if ( ( aSig
<<shiftCount
) != savedASig
) {
3772 STATUS(float_exception_flags
) |= float_flag_inexact
;
3778 /*----------------------------------------------------------------------------
3779 | Returns the result of converting the extended double-precision floating-
3780 | point value `a' to the 64-bit two's complement integer format. The
3781 | conversion is performed according to the IEC/IEEE Standard for Binary
3782 | Floating-Point Arithmetic---which means in particular that the conversion
3783 | is rounded according to the current rounding mode. If `a' is a NaN,
3784 | the largest positive integer is returned. Otherwise, if the conversion
3785 | overflows, the largest integer with the same sign as `a' is returned.
3786 *----------------------------------------------------------------------------*/
3788 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3791 int32 aExp
, shiftCount
;
3792 bits64 aSig
, aSigExtra
;
3794 aSig
= extractFloatx80Frac( a
);
3795 aExp
= extractFloatx80Exp( a
);
3796 aSign
= extractFloatx80Sign( a
);
3797 shiftCount
= 0x403E - aExp
;
3798 if ( shiftCount
<= 0 ) {
3800 float_raise( float_flag_invalid STATUS_VAR
);
3802 || ( ( aExp
== 0x7FFF )
3803 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3805 return LIT64( 0x7FFFFFFFFFFFFFFF );
3807 return (sbits64
) LIT64( 0x8000000000000000 );
3812 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3814 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3818 /*----------------------------------------------------------------------------
3819 | Returns the result of converting the extended double-precision floating-
3820 | point value `a' to the 64-bit two's complement integer format. The
3821 | conversion is performed according to the IEC/IEEE Standard for Binary
3822 | Floating-Point Arithmetic, except that the conversion is always rounded
3823 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3824 | Otherwise, if the conversion overflows, the largest integer with the same
3825 | sign as `a' is returned.
3826 *----------------------------------------------------------------------------*/
3828 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3831 int32 aExp
, shiftCount
;
3835 aSig
= extractFloatx80Frac( a
);
3836 aExp
= extractFloatx80Exp( a
);
3837 aSign
= extractFloatx80Sign( a
);
3838 shiftCount
= aExp
- 0x403E;
3839 if ( 0 <= shiftCount
) {
3840 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3841 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3842 float_raise( float_flag_invalid STATUS_VAR
);
3843 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3844 return LIT64( 0x7FFFFFFFFFFFFFFF );
3847 return (sbits64
) LIT64( 0x8000000000000000 );
3849 else if ( aExp
< 0x3FFF ) {
3850 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3853 z
= aSig
>>( - shiftCount
);
3854 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3855 STATUS(float_exception_flags
) |= float_flag_inexact
;
3857 if ( aSign
) z
= - z
;
3862 /*----------------------------------------------------------------------------
3863 | Returns the result of converting the extended double-precision floating-
3864 | point value `a' to the single-precision floating-point format. The
3865 | conversion is performed according to the IEC/IEEE Standard for Binary
3866 | Floating-Point Arithmetic.
3867 *----------------------------------------------------------------------------*/
3869 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3875 aSig
= extractFloatx80Frac( a
);
3876 aExp
= extractFloatx80Exp( a
);
3877 aSign
= extractFloatx80Sign( a
);
3878 if ( aExp
== 0x7FFF ) {
3879 if ( (bits64
) ( aSig
<<1 ) ) {
3880 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3882 return packFloat32( aSign
, 0xFF, 0 );
3884 shift64RightJamming( aSig
, 33, &aSig
);
3885 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3886 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
3890 /*----------------------------------------------------------------------------
3891 | Returns the result of converting the extended double-precision floating-
3892 | point value `a' to the double-precision floating-point format. The
3893 | conversion is performed according to the IEC/IEEE Standard for Binary
3894 | Floating-Point Arithmetic.
3895 *----------------------------------------------------------------------------*/
3897 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
3903 aSig
= extractFloatx80Frac( a
);
3904 aExp
= extractFloatx80Exp( a
);
3905 aSign
= extractFloatx80Sign( a
);
3906 if ( aExp
== 0x7FFF ) {
3907 if ( (bits64
) ( aSig
<<1 ) ) {
3908 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3910 return packFloat64( aSign
, 0x7FF, 0 );
3912 shift64RightJamming( aSig
, 1, &zSig
);
3913 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3914 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
3920 /*----------------------------------------------------------------------------
3921 | Returns the result of converting the extended double-precision floating-
3922 | point value `a' to the quadruple-precision floating-point format. The
3923 | conversion is performed according to the IEC/IEEE Standard for Binary
3924 | Floating-Point Arithmetic.
3925 *----------------------------------------------------------------------------*/
3927 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
3931 bits64 aSig
, zSig0
, zSig1
;
3933 aSig
= extractFloatx80Frac( a
);
3934 aExp
= extractFloatx80Exp( a
);
3935 aSign
= extractFloatx80Sign( a
);
3936 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3937 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3939 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3940 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3946 /*----------------------------------------------------------------------------
3947 | Rounds the extended double-precision floating-point value `a' to an integer,
3948 | and returns the result as an extended quadruple-precision floating-point
3949 | value. The operation is performed according to the IEC/IEEE Standard for
3950 | Binary Floating-Point Arithmetic.
3951 *----------------------------------------------------------------------------*/
3953 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
3957 bits64 lastBitMask
, roundBitsMask
;
3961 aExp
= extractFloatx80Exp( a
);
3962 if ( 0x403E <= aExp
) {
3963 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3964 return propagateFloatx80NaN( a
, a STATUS_VAR
);
3968 if ( aExp
< 0x3FFF ) {
3970 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3973 STATUS(float_exception_flags
) |= float_flag_inexact
;
3974 aSign
= extractFloatx80Sign( a
);
3975 switch ( STATUS(float_rounding_mode
) ) {
3976 case float_round_nearest_even
:
3977 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3980 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3983 case float_round_down
:
3986 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3987 : packFloatx80( 0, 0, 0 );
3988 case float_round_up
:
3990 aSign
? packFloatx80( 1, 0, 0 )
3991 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3993 return packFloatx80( aSign
, 0, 0 );
3996 lastBitMask
<<= 0x403E - aExp
;
3997 roundBitsMask
= lastBitMask
- 1;
3999 roundingMode
= STATUS(float_rounding_mode
);
4000 if ( roundingMode
== float_round_nearest_even
) {
4001 z
.low
+= lastBitMask
>>1;
4002 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4004 else if ( roundingMode
!= float_round_to_zero
) {
4005 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
4006 z
.low
+= roundBitsMask
;
4009 z
.low
&= ~ roundBitsMask
;
4012 z
.low
= LIT64( 0x8000000000000000 );
4014 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4019 /*----------------------------------------------------------------------------
4020 | Returns the result of adding the absolute values of the extended double-
4021 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4022 | negated before being returned. `zSign' is ignored if the result is a NaN.
4023 | The addition is performed according to the IEC/IEEE Standard for Binary
4024 | Floating-Point Arithmetic.
4025 *----------------------------------------------------------------------------*/
4027 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4029 int32 aExp
, bExp
, zExp
;
4030 bits64 aSig
, bSig
, zSig0
, zSig1
;
4033 aSig
= extractFloatx80Frac( a
);
4034 aExp
= extractFloatx80Exp( a
);
4035 bSig
= extractFloatx80Frac( b
);
4036 bExp
= extractFloatx80Exp( b
);
4037 expDiff
= aExp
- bExp
;
4038 if ( 0 < expDiff
) {
4039 if ( aExp
== 0x7FFF ) {
4040 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4043 if ( bExp
== 0 ) --expDiff
;
4044 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4047 else if ( expDiff
< 0 ) {
4048 if ( bExp
== 0x7FFF ) {
4049 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4050 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4052 if ( aExp
== 0 ) ++expDiff
;
4053 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4057 if ( aExp
== 0x7FFF ) {
4058 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
4059 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4064 zSig0
= aSig
+ bSig
;
4066 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4072 zSig0
= aSig
+ bSig
;
4073 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
4075 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4076 zSig0
|= LIT64( 0x8000000000000000 );
4080 roundAndPackFloatx80(
4081 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4085 /*----------------------------------------------------------------------------
4086 | Returns the result of subtracting the absolute values of the extended
4087 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4088 | difference is negated before being returned. `zSign' is ignored if the
4089 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4090 | Standard for Binary Floating-Point Arithmetic.
4091 *----------------------------------------------------------------------------*/
4093 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4095 int32 aExp
, bExp
, zExp
;
4096 bits64 aSig
, bSig
, zSig0
, zSig1
;
4100 aSig
= extractFloatx80Frac( a
);
4101 aExp
= extractFloatx80Exp( a
);
4102 bSig
= extractFloatx80Frac( b
);
4103 bExp
= extractFloatx80Exp( b
);
4104 expDiff
= aExp
- bExp
;
4105 if ( 0 < expDiff
) goto aExpBigger
;
4106 if ( expDiff
< 0 ) goto bExpBigger
;
4107 if ( aExp
== 0x7FFF ) {
4108 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
4109 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4111 float_raise( float_flag_invalid STATUS_VAR
);
4112 z
.low
= floatx80_default_nan_low
;
4113 z
.high
= floatx80_default_nan_high
;
4121 if ( bSig
< aSig
) goto aBigger
;
4122 if ( aSig
< bSig
) goto bBigger
;
4123 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
4125 if ( bExp
== 0x7FFF ) {
4126 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4127 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4129 if ( aExp
== 0 ) ++expDiff
;
4130 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4132 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4135 goto normalizeRoundAndPack
;
4137 if ( aExp
== 0x7FFF ) {
4138 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4141 if ( bExp
== 0 ) --expDiff
;
4142 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4144 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4146 normalizeRoundAndPack
:
4148 normalizeRoundAndPackFloatx80(
4149 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4153 /*----------------------------------------------------------------------------
4154 | Returns the result of adding the extended double-precision floating-point
4155 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4156 | Standard for Binary Floating-Point Arithmetic.
4157 *----------------------------------------------------------------------------*/
4159 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
4163 aSign
= extractFloatx80Sign( a
);
4164 bSign
= extractFloatx80Sign( b
);
4165 if ( aSign
== bSign
) {
4166 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4169 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4174 /*----------------------------------------------------------------------------
4175 | Returns the result of subtracting the extended double-precision floating-
4176 | point values `a' and `b'. The operation is performed according to the
4177 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4178 *----------------------------------------------------------------------------*/
4180 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
4184 aSign
= extractFloatx80Sign( a
);
4185 bSign
= extractFloatx80Sign( b
);
4186 if ( aSign
== bSign
) {
4187 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4190 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4195 /*----------------------------------------------------------------------------
4196 | Returns the result of multiplying the extended double-precision floating-
4197 | point values `a' and `b'. The operation is performed according to the
4198 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4199 *----------------------------------------------------------------------------*/
4201 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
4203 flag aSign
, bSign
, zSign
;
4204 int32 aExp
, bExp
, zExp
;
4205 bits64 aSig
, bSig
, zSig0
, zSig1
;
4208 aSig
= extractFloatx80Frac( a
);
4209 aExp
= extractFloatx80Exp( a
);
4210 aSign
= extractFloatx80Sign( a
);
4211 bSig
= extractFloatx80Frac( b
);
4212 bExp
= extractFloatx80Exp( b
);
4213 bSign
= extractFloatx80Sign( b
);
4214 zSign
= aSign
^ bSign
;
4215 if ( aExp
== 0x7FFF ) {
4216 if ( (bits64
) ( aSig
<<1 )
4217 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
4218 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4220 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
4221 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4223 if ( bExp
== 0x7FFF ) {
4224 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4225 if ( ( aExp
| aSig
) == 0 ) {
4227 float_raise( float_flag_invalid STATUS_VAR
);
4228 z
.low
= floatx80_default_nan_low
;
4229 z
.high
= floatx80_default_nan_high
;
4232 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4235 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4236 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4239 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4240 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4242 zExp
= aExp
+ bExp
- 0x3FFE;
4243 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4244 if ( 0 < (sbits64
) zSig0
) {
4245 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4249 roundAndPackFloatx80(
4250 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4254 /*----------------------------------------------------------------------------
4255 | Returns the result of dividing the extended double-precision floating-point
4256 | value `a' by the corresponding value `b'. The operation is performed
4257 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4258 *----------------------------------------------------------------------------*/
4260 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
4262 flag aSign
, bSign
, zSign
;
4263 int32 aExp
, bExp
, zExp
;
4264 bits64 aSig
, bSig
, zSig0
, zSig1
;
4265 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
4268 aSig
= extractFloatx80Frac( a
);
4269 aExp
= extractFloatx80Exp( a
);
4270 aSign
= extractFloatx80Sign( a
);
4271 bSig
= extractFloatx80Frac( b
);
4272 bExp
= extractFloatx80Exp( b
);
4273 bSign
= extractFloatx80Sign( b
);
4274 zSign
= aSign
^ bSign
;
4275 if ( aExp
== 0x7FFF ) {
4276 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4277 if ( bExp
== 0x7FFF ) {
4278 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4281 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4283 if ( bExp
== 0x7FFF ) {
4284 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4285 return packFloatx80( zSign
, 0, 0 );
4289 if ( ( aExp
| aSig
) == 0 ) {
4291 float_raise( float_flag_invalid STATUS_VAR
);
4292 z
.low
= floatx80_default_nan_low
;
4293 z
.high
= floatx80_default_nan_high
;
4296 float_raise( float_flag_divbyzero STATUS_VAR
);
4297 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4299 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4302 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4303 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4305 zExp
= aExp
- bExp
+ 0x3FFE;
4307 if ( bSig
<= aSig
) {
4308 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
4311 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
4312 mul64To128( bSig
, zSig0
, &term0
, &term1
);
4313 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
4314 while ( (sbits64
) rem0
< 0 ) {
4316 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4318 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
4319 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
4320 mul64To128( bSig
, zSig1
, &term1
, &term2
);
4321 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4322 while ( (sbits64
) rem1
< 0 ) {
4324 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
4326 zSig1
|= ( ( rem1
| rem2
) != 0 );
4329 roundAndPackFloatx80(
4330 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4334 /*----------------------------------------------------------------------------
4335 | Returns the remainder of the extended double-precision floating-point value
4336 | `a' with respect to the corresponding value `b'. The operation is performed
4337 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4338 *----------------------------------------------------------------------------*/
4340 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
4343 int32 aExp
, bExp
, expDiff
;
4344 bits64 aSig0
, aSig1
, bSig
;
4345 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
4348 aSig0
= extractFloatx80Frac( a
);
4349 aExp
= extractFloatx80Exp( a
);
4350 aSign
= extractFloatx80Sign( a
);
4351 bSig
= extractFloatx80Frac( b
);
4352 bExp
= extractFloatx80Exp( b
);
4353 if ( aExp
== 0x7FFF ) {
4354 if ( (bits64
) ( aSig0
<<1 )
4355 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
4356 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4360 if ( bExp
== 0x7FFF ) {
4361 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4367 float_raise( float_flag_invalid STATUS_VAR
);
4368 z
.low
= floatx80_default_nan_low
;
4369 z
.high
= floatx80_default_nan_high
;
4372 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4375 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
4376 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4378 bSig
|= LIT64( 0x8000000000000000 );
4380 expDiff
= aExp
- bExp
;
4382 if ( expDiff
< 0 ) {
4383 if ( expDiff
< -1 ) return a
;
4384 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
4387 q
= ( bSig
<= aSig0
);
4388 if ( q
) aSig0
-= bSig
;
4390 while ( 0 < expDiff
) {
4391 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4392 q
= ( 2 < q
) ? q
- 2 : 0;
4393 mul64To128( bSig
, q
, &term0
, &term1
);
4394 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4395 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
4399 if ( 0 < expDiff
) {
4400 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4401 q
= ( 2 < q
) ? q
- 2 : 0;
4403 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
4404 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4405 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
4406 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
4408 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4415 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
4416 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4417 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4420 aSig0
= alternateASig0
;
4421 aSig1
= alternateASig1
;
4425 normalizeRoundAndPackFloatx80(
4426 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
4430 /*----------------------------------------------------------------------------
4431 | Returns the square root of the extended double-precision floating-point
4432 | value `a'. The operation is performed according to the IEC/IEEE Standard
4433 | for Binary Floating-Point Arithmetic.
4434 *----------------------------------------------------------------------------*/
4436 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
4440 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
4441 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4444 aSig0
= extractFloatx80Frac( a
);
4445 aExp
= extractFloatx80Exp( a
);
4446 aSign
= extractFloatx80Sign( a
);
4447 if ( aExp
== 0x7FFF ) {
4448 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
4449 if ( ! aSign
) return a
;
4453 if ( ( aExp
| aSig0
) == 0 ) return a
;
4455 float_raise( float_flag_invalid STATUS_VAR
);
4456 z
.low
= floatx80_default_nan_low
;
4457 z
.high
= floatx80_default_nan_high
;
4461 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4462 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4464 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4465 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4466 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4467 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4468 doubleZSig0
= zSig0
<<1;
4469 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4470 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4471 while ( (sbits64
) rem0
< 0 ) {
4474 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4476 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4477 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4478 if ( zSig1
== 0 ) zSig1
= 1;
4479 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4480 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4481 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4482 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4483 while ( (sbits64
) rem1
< 0 ) {
4485 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4487 term2
|= doubleZSig0
;
4488 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4490 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4492 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4493 zSig0
|= doubleZSig0
;
4495 roundAndPackFloatx80(
4496 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
4500 /*----------------------------------------------------------------------------
4501 | Returns 1 if the extended double-precision floating-point value `a' is
4502 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4503 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4505 *----------------------------------------------------------------------------*/
4507 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
4510 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4511 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4512 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4513 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4515 if ( floatx80_is_signaling_nan( a
)
4516 || floatx80_is_signaling_nan( b
) ) {
4517 float_raise( float_flag_invalid STATUS_VAR
);
4523 && ( ( a
.high
== b
.high
)
4525 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4530 /*----------------------------------------------------------------------------
4531 | Returns 1 if the extended double-precision floating-point value `a' is
4532 | less than or equal to the corresponding value `b', and 0 otherwise. The
4533 | comparison is performed according to the IEC/IEEE Standard for Binary
4534 | Floating-Point Arithmetic.
4535 *----------------------------------------------------------------------------*/
4537 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
4541 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4542 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4543 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4544 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4546 float_raise( float_flag_invalid STATUS_VAR
);
4549 aSign
= extractFloatx80Sign( a
);
4550 bSign
= extractFloatx80Sign( b
);
4551 if ( aSign
!= bSign
) {
4554 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4558 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4559 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4563 /*----------------------------------------------------------------------------
4564 | Returns 1 if the extended double-precision floating-point value `a' is
4565 | less than the corresponding value `b', and 0 otherwise. The comparison
4566 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4568 *----------------------------------------------------------------------------*/
4570 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
4574 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4575 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4576 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4577 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4579 float_raise( float_flag_invalid STATUS_VAR
);
4582 aSign
= extractFloatx80Sign( a
);
4583 bSign
= extractFloatx80Sign( b
);
4584 if ( aSign
!= bSign
) {
4587 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4591 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4592 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4596 /*----------------------------------------------------------------------------
4597 | Returns 1 if the extended double-precision floating-point value `a' is equal
4598 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4599 | raised if either operand is a NaN. Otherwise, the comparison is performed
4600 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4601 *----------------------------------------------------------------------------*/
4603 int floatx80_eq_signaling( floatx80 a
, floatx80 b STATUS_PARAM
)
4606 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4607 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4608 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4609 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4611 float_raise( float_flag_invalid STATUS_VAR
);
4616 && ( ( a
.high
== b
.high
)
4618 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4623 /*----------------------------------------------------------------------------
4624 | Returns 1 if the extended double-precision floating-point value `a' is less
4625 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4626 | do not cause an exception. Otherwise, the comparison is performed according
4627 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4628 *----------------------------------------------------------------------------*/
4630 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4634 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4635 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4636 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4637 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4639 if ( floatx80_is_signaling_nan( a
)
4640 || floatx80_is_signaling_nan( b
) ) {
4641 float_raise( float_flag_invalid STATUS_VAR
);
4645 aSign
= extractFloatx80Sign( a
);
4646 bSign
= extractFloatx80Sign( b
);
4647 if ( aSign
!= bSign
) {
4650 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4654 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4655 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4659 /*----------------------------------------------------------------------------
4660 | Returns 1 if the extended double-precision floating-point value `a' is less
4661 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4662 | an exception. Otherwise, the comparison is performed according to the
4663 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4664 *----------------------------------------------------------------------------*/
4666 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4670 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4671 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4672 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4673 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4675 if ( floatx80_is_signaling_nan( a
)
4676 || floatx80_is_signaling_nan( b
) ) {
4677 float_raise( float_flag_invalid STATUS_VAR
);
4681 aSign
= extractFloatx80Sign( a
);
4682 bSign
= extractFloatx80Sign( b
);
4683 if ( aSign
!= bSign
) {
4686 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4690 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4691 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4699 /*----------------------------------------------------------------------------
4700 | Returns the result of converting the quadruple-precision floating-point
4701 | value `a' to the 32-bit two's complement integer format. The conversion
4702 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4703 | Arithmetic---which means in particular that the conversion is rounded
4704 | according to the current rounding mode. If `a' is a NaN, the largest
4705 | positive integer is returned. Otherwise, if the conversion overflows, the
4706 | largest integer with the same sign as `a' is returned.
4707 *----------------------------------------------------------------------------*/
4709 int32
float128_to_int32( float128 a STATUS_PARAM
)
4712 int32 aExp
, shiftCount
;
4713 bits64 aSig0
, aSig1
;
4715 aSig1
= extractFloat128Frac1( a
);
4716 aSig0
= extractFloat128Frac0( a
);
4717 aExp
= extractFloat128Exp( a
);
4718 aSign
= extractFloat128Sign( a
);
4719 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4720 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4721 aSig0
|= ( aSig1
!= 0 );
4722 shiftCount
= 0x4028 - aExp
;
4723 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4724 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4728 /*----------------------------------------------------------------------------
4729 | Returns the result of converting the quadruple-precision floating-point
4730 | value `a' to the 32-bit two's complement integer format. The conversion
4731 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4732 | Arithmetic, except that the conversion is always rounded toward zero. If
4733 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4734 | conversion overflows, the largest integer with the same sign as `a' is
4736 *----------------------------------------------------------------------------*/
4738 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4741 int32 aExp
, shiftCount
;
4742 bits64 aSig0
, aSig1
, savedASig
;
4745 aSig1
= extractFloat128Frac1( a
);
4746 aSig0
= extractFloat128Frac0( a
);
4747 aExp
= extractFloat128Exp( a
);
4748 aSign
= extractFloat128Sign( a
);
4749 aSig0
|= ( aSig1
!= 0 );
4750 if ( 0x401E < aExp
) {
4751 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4754 else if ( aExp
< 0x3FFF ) {
4755 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4758 aSig0
|= LIT64( 0x0001000000000000 );
4759 shiftCount
= 0x402F - aExp
;
4761 aSig0
>>= shiftCount
;
4763 if ( aSign
) z
= - z
;
4764 if ( ( z
< 0 ) ^ aSign
) {
4766 float_raise( float_flag_invalid STATUS_VAR
);
4767 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4769 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4770 STATUS(float_exception_flags
) |= float_flag_inexact
;
4776 /*----------------------------------------------------------------------------
4777 | Returns the result of converting the quadruple-precision floating-point
4778 | value `a' to the 64-bit two's complement integer format. The conversion
4779 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4780 | Arithmetic---which means in particular that the conversion is rounded
4781 | according to the current rounding mode. If `a' is a NaN, the largest
4782 | positive integer is returned. Otherwise, if the conversion overflows, the
4783 | largest integer with the same sign as `a' is returned.
4784 *----------------------------------------------------------------------------*/
4786 int64
float128_to_int64( float128 a STATUS_PARAM
)
4789 int32 aExp
, shiftCount
;
4790 bits64 aSig0
, aSig1
;
4792 aSig1
= extractFloat128Frac1( a
);
4793 aSig0
= extractFloat128Frac0( a
);
4794 aExp
= extractFloat128Exp( a
);
4795 aSign
= extractFloat128Sign( a
);
4796 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4797 shiftCount
= 0x402F - aExp
;
4798 if ( shiftCount
<= 0 ) {
4799 if ( 0x403E < aExp
) {
4800 float_raise( float_flag_invalid STATUS_VAR
);
4802 || ( ( aExp
== 0x7FFF )
4803 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4806 return LIT64( 0x7FFFFFFFFFFFFFFF );
4808 return (sbits64
) LIT64( 0x8000000000000000 );
4810 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4813 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4815 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4819 /*----------------------------------------------------------------------------
4820 | Returns the result of converting the quadruple-precision floating-point
4821 | value `a' to the 64-bit two's complement integer format. The conversion
4822 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4823 | Arithmetic, except that the conversion is always rounded toward zero.
4824 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4825 | the conversion overflows, the largest integer with the same sign as `a' is
4827 *----------------------------------------------------------------------------*/
4829 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4832 int32 aExp
, shiftCount
;
4833 bits64 aSig0
, aSig1
;
4836 aSig1
= extractFloat128Frac1( a
);
4837 aSig0
= extractFloat128Frac0( a
);
4838 aExp
= extractFloat128Exp( a
);
4839 aSign
= extractFloat128Sign( a
);
4840 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4841 shiftCount
= aExp
- 0x402F;
4842 if ( 0 < shiftCount
) {
4843 if ( 0x403E <= aExp
) {
4844 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4845 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4846 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4847 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4850 float_raise( float_flag_invalid STATUS_VAR
);
4851 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4852 return LIT64( 0x7FFFFFFFFFFFFFFF );
4855 return (sbits64
) LIT64( 0x8000000000000000 );
4857 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4858 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4859 STATUS(float_exception_flags
) |= float_flag_inexact
;
4863 if ( aExp
< 0x3FFF ) {
4864 if ( aExp
| aSig0
| aSig1
) {
4865 STATUS(float_exception_flags
) |= float_flag_inexact
;
4869 z
= aSig0
>>( - shiftCount
);
4871 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4872 STATUS(float_exception_flags
) |= float_flag_inexact
;
4875 if ( aSign
) z
= - z
;
4880 /*----------------------------------------------------------------------------
4881 | Returns the result of converting the quadruple-precision floating-point
4882 | value `a' to the single-precision floating-point format. The conversion
4883 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4885 *----------------------------------------------------------------------------*/
4887 float32
float128_to_float32( float128 a STATUS_PARAM
)
4891 bits64 aSig0
, aSig1
;
4894 aSig1
= extractFloat128Frac1( a
);
4895 aSig0
= extractFloat128Frac0( a
);
4896 aExp
= extractFloat128Exp( a
);
4897 aSign
= extractFloat128Sign( a
);
4898 if ( aExp
== 0x7FFF ) {
4899 if ( aSig0
| aSig1
) {
4900 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4902 return packFloat32( aSign
, 0xFF, 0 );
4904 aSig0
|= ( aSig1
!= 0 );
4905 shift64RightJamming( aSig0
, 18, &aSig0
);
4907 if ( aExp
|| zSig
) {
4911 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
4915 /*----------------------------------------------------------------------------
4916 | Returns the result of converting the quadruple-precision floating-point
4917 | value `a' to the double-precision floating-point format. The conversion
4918 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4920 *----------------------------------------------------------------------------*/
4922 float64
float128_to_float64( float128 a STATUS_PARAM
)
4926 bits64 aSig0
, aSig1
;
4928 aSig1
= extractFloat128Frac1( a
);
4929 aSig0
= extractFloat128Frac0( a
);
4930 aExp
= extractFloat128Exp( a
);
4931 aSign
= extractFloat128Sign( a
);
4932 if ( aExp
== 0x7FFF ) {
4933 if ( aSig0
| aSig1
) {
4934 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4936 return packFloat64( aSign
, 0x7FF, 0 );
4938 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4939 aSig0
|= ( aSig1
!= 0 );
4940 if ( aExp
|| aSig0
) {
4941 aSig0
|= LIT64( 0x4000000000000000 );
4944 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
4950 /*----------------------------------------------------------------------------
4951 | Returns the result of converting the quadruple-precision floating-point
4952 | value `a' to the extended double-precision floating-point format. The
4953 | conversion is performed according to the IEC/IEEE Standard for Binary
4954 | Floating-Point Arithmetic.
4955 *----------------------------------------------------------------------------*/
4957 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
4961 bits64 aSig0
, aSig1
;
4963 aSig1
= extractFloat128Frac1( a
);
4964 aSig0
= extractFloat128Frac0( a
);
4965 aExp
= extractFloat128Exp( a
);
4966 aSign
= extractFloat128Sign( a
);
4967 if ( aExp
== 0x7FFF ) {
4968 if ( aSig0
| aSig1
) {
4969 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4971 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4974 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4975 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4978 aSig0
|= LIT64( 0x0001000000000000 );
4980 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4981 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
4987 /*----------------------------------------------------------------------------
4988 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4989 | returns the result as a quadruple-precision floating-point value. The
4990 | operation is performed according to the IEC/IEEE Standard for Binary
4991 | Floating-Point Arithmetic.
4992 *----------------------------------------------------------------------------*/
4994 float128
float128_round_to_int( float128 a STATUS_PARAM
)
4998 bits64 lastBitMask
, roundBitsMask
;
5002 aExp
= extractFloat128Exp( a
);
5003 if ( 0x402F <= aExp
) {
5004 if ( 0x406F <= aExp
) {
5005 if ( ( aExp
== 0x7FFF )
5006 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5008 return propagateFloat128NaN( a
, a STATUS_VAR
);
5013 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
5014 roundBitsMask
= lastBitMask
- 1;
5016 roundingMode
= STATUS(float_rounding_mode
);
5017 if ( roundingMode
== float_round_nearest_even
) {
5018 if ( lastBitMask
) {
5019 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
5020 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
5023 if ( (sbits64
) z
.low
< 0 ) {
5025 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
5029 else if ( roundingMode
!= float_round_to_zero
) {
5030 if ( extractFloat128Sign( z
)
5031 ^ ( roundingMode
== float_round_up
) ) {
5032 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
5035 z
.low
&= ~ roundBitsMask
;
5038 if ( aExp
< 0x3FFF ) {
5039 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
5040 STATUS(float_exception_flags
) |= float_flag_inexact
;
5041 aSign
= extractFloat128Sign( a
);
5042 switch ( STATUS(float_rounding_mode
) ) {
5043 case float_round_nearest_even
:
5044 if ( ( aExp
== 0x3FFE )
5045 && ( extractFloat128Frac0( a
)
5046 | extractFloat128Frac1( a
) )
5048 return packFloat128( aSign
, 0x3FFF, 0, 0 );
5051 case float_round_down
:
5053 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
5054 : packFloat128( 0, 0, 0, 0 );
5055 case float_round_up
:
5057 aSign
? packFloat128( 1, 0, 0, 0 )
5058 : packFloat128( 0, 0x3FFF, 0, 0 );
5060 return packFloat128( aSign
, 0, 0, 0 );
5063 lastBitMask
<<= 0x402F - aExp
;
5064 roundBitsMask
= lastBitMask
- 1;
5067 roundingMode
= STATUS(float_rounding_mode
);
5068 if ( roundingMode
== float_round_nearest_even
) {
5069 z
.high
+= lastBitMask
>>1;
5070 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
5071 z
.high
&= ~ lastBitMask
;
5074 else if ( roundingMode
!= float_round_to_zero
) {
5075 if ( extractFloat128Sign( z
)
5076 ^ ( roundingMode
== float_round_up
) ) {
5077 z
.high
|= ( a
.low
!= 0 );
5078 z
.high
+= roundBitsMask
;
5081 z
.high
&= ~ roundBitsMask
;
5083 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
5084 STATUS(float_exception_flags
) |= float_flag_inexact
;
5090 /*----------------------------------------------------------------------------
5091 | Returns the result of adding the absolute values of the quadruple-precision
5092 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5093 | before being returned. `zSign' is ignored if the result is a NaN.
5094 | The addition is performed according to the IEC/IEEE Standard for Binary
5095 | Floating-Point Arithmetic.
5096 *----------------------------------------------------------------------------*/
5098 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5100 int32 aExp
, bExp
, zExp
;
5101 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5104 aSig1
= extractFloat128Frac1( a
);
5105 aSig0
= extractFloat128Frac0( a
);
5106 aExp
= extractFloat128Exp( a
);
5107 bSig1
= extractFloat128Frac1( b
);
5108 bSig0
= extractFloat128Frac0( b
);
5109 bExp
= extractFloat128Exp( b
);
5110 expDiff
= aExp
- bExp
;
5111 if ( 0 < expDiff
) {
5112 if ( aExp
== 0x7FFF ) {
5113 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5120 bSig0
|= LIT64( 0x0001000000000000 );
5122 shift128ExtraRightJamming(
5123 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
5126 else if ( expDiff
< 0 ) {
5127 if ( bExp
== 0x7FFF ) {
5128 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5129 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5135 aSig0
|= LIT64( 0x0001000000000000 );
5137 shift128ExtraRightJamming(
5138 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
5142 if ( aExp
== 0x7FFF ) {
5143 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5144 return propagateFloat128NaN( a
, b STATUS_VAR
);
5148 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5150 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
5151 return packFloat128( zSign
, 0, zSig0
, zSig1
);
5154 zSig0
|= LIT64( 0x0002000000000000 );
5158 aSig0
|= LIT64( 0x0001000000000000 );
5159 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5161 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
5164 shift128ExtraRightJamming(
5165 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5167 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5171 /*----------------------------------------------------------------------------
5172 | Returns the result of subtracting the absolute values of the quadruple-
5173 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5174 | difference is negated before being returned. `zSign' is ignored if the
5175 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5176 | Standard for Binary Floating-Point Arithmetic.
5177 *----------------------------------------------------------------------------*/
5179 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5181 int32 aExp
, bExp
, zExp
;
5182 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
5186 aSig1
= extractFloat128Frac1( a
);
5187 aSig0
= extractFloat128Frac0( a
);
5188 aExp
= extractFloat128Exp( a
);
5189 bSig1
= extractFloat128Frac1( b
);
5190 bSig0
= extractFloat128Frac0( b
);
5191 bExp
= extractFloat128Exp( b
);
5192 expDiff
= aExp
- bExp
;
5193 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5194 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
5195 if ( 0 < expDiff
) goto aExpBigger
;
5196 if ( expDiff
< 0 ) goto bExpBigger
;
5197 if ( aExp
== 0x7FFF ) {
5198 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5199 return propagateFloat128NaN( a
, b STATUS_VAR
);
5201 float_raise( float_flag_invalid STATUS_VAR
);
5202 z
.low
= float128_default_nan_low
;
5203 z
.high
= float128_default_nan_high
;
5210 if ( bSig0
< aSig0
) goto aBigger
;
5211 if ( aSig0
< bSig0
) goto bBigger
;
5212 if ( bSig1
< aSig1
) goto aBigger
;
5213 if ( aSig1
< bSig1
) goto bBigger
;
5214 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
5216 if ( bExp
== 0x7FFF ) {
5217 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5218 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
5224 aSig0
|= LIT64( 0x4000000000000000 );
5226 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5227 bSig0
|= LIT64( 0x4000000000000000 );
5229 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5232 goto normalizeRoundAndPack
;
5234 if ( aExp
== 0x7FFF ) {
5235 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5242 bSig0
|= LIT64( 0x4000000000000000 );
5244 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
5245 aSig0
|= LIT64( 0x4000000000000000 );
5247 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5249 normalizeRoundAndPack
:
5251 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
5255 /*----------------------------------------------------------------------------
5256 | Returns the result of adding the quadruple-precision floating-point values
5257 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5258 | for Binary Floating-Point Arithmetic.
5259 *----------------------------------------------------------------------------*/
5261 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
5265 aSign
= extractFloat128Sign( a
);
5266 bSign
= extractFloat128Sign( b
);
5267 if ( aSign
== bSign
) {
5268 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5271 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5276 /*----------------------------------------------------------------------------
5277 | Returns the result of subtracting the quadruple-precision floating-point
5278 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5279 | Standard for Binary Floating-Point Arithmetic.
5280 *----------------------------------------------------------------------------*/
5282 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
5286 aSign
= extractFloat128Sign( a
);
5287 bSign
= extractFloat128Sign( b
);
5288 if ( aSign
== bSign
) {
5289 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5292 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5297 /*----------------------------------------------------------------------------
5298 | Returns the result of multiplying the quadruple-precision floating-point
5299 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5300 | Standard for Binary Floating-Point Arithmetic.
5301 *----------------------------------------------------------------------------*/
5303 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
5305 flag aSign
, bSign
, zSign
;
5306 int32 aExp
, bExp
, zExp
;
5307 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
5310 aSig1
= extractFloat128Frac1( a
);
5311 aSig0
= extractFloat128Frac0( a
);
5312 aExp
= extractFloat128Exp( a
);
5313 aSign
= extractFloat128Sign( a
);
5314 bSig1
= extractFloat128Frac1( b
);
5315 bSig0
= extractFloat128Frac0( b
);
5316 bExp
= extractFloat128Exp( b
);
5317 bSign
= extractFloat128Sign( b
);
5318 zSign
= aSign
^ bSign
;
5319 if ( aExp
== 0x7FFF ) {
5320 if ( ( aSig0
| aSig1
)
5321 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5322 return propagateFloat128NaN( a
, b STATUS_VAR
);
5324 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
5325 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5327 if ( bExp
== 0x7FFF ) {
5328 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5329 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5331 float_raise( float_flag_invalid STATUS_VAR
);
5332 z
.low
= float128_default_nan_low
;
5333 z
.high
= float128_default_nan_high
;
5336 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5339 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5340 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5343 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5344 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5346 zExp
= aExp
+ bExp
- 0x4000;
5347 aSig0
|= LIT64( 0x0001000000000000 );
5348 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
5349 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
5350 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5351 zSig2
|= ( zSig3
!= 0 );
5352 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
5353 shift128ExtraRightJamming(
5354 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5357 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5361 /*----------------------------------------------------------------------------
5362 | Returns the result of dividing the quadruple-precision floating-point value
5363 | `a' by the corresponding value `b'. The operation is performed according to
5364 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5365 *----------------------------------------------------------------------------*/
5367 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
5369 flag aSign
, bSign
, zSign
;
5370 int32 aExp
, bExp
, zExp
;
5371 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5372 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5375 aSig1
= extractFloat128Frac1( a
);
5376 aSig0
= extractFloat128Frac0( a
);
5377 aExp
= extractFloat128Exp( a
);
5378 aSign
= extractFloat128Sign( a
);
5379 bSig1
= extractFloat128Frac1( b
);
5380 bSig0
= extractFloat128Frac0( b
);
5381 bExp
= extractFloat128Exp( b
);
5382 bSign
= extractFloat128Sign( b
);
5383 zSign
= aSign
^ bSign
;
5384 if ( aExp
== 0x7FFF ) {
5385 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5386 if ( bExp
== 0x7FFF ) {
5387 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5390 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5392 if ( bExp
== 0x7FFF ) {
5393 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5394 return packFloat128( zSign
, 0, 0, 0 );
5397 if ( ( bSig0
| bSig1
) == 0 ) {
5398 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5400 float_raise( float_flag_invalid STATUS_VAR
);
5401 z
.low
= float128_default_nan_low
;
5402 z
.high
= float128_default_nan_high
;
5405 float_raise( float_flag_divbyzero STATUS_VAR
);
5406 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5408 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5411 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5412 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5414 zExp
= aExp
- bExp
+ 0x3FFD;
5416 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5418 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5419 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5420 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5423 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5424 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5425 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5426 while ( (sbits64
) rem0
< 0 ) {
5428 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5430 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5431 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5432 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5433 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5434 while ( (sbits64
) rem1
< 0 ) {
5436 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5438 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5440 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5441 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5445 /*----------------------------------------------------------------------------
5446 | Returns the remainder of the quadruple-precision floating-point value `a'
5447 | with respect to the corresponding value `b'. The operation is performed
5448 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5449 *----------------------------------------------------------------------------*/
5451 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
5454 int32 aExp
, bExp
, expDiff
;
5455 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5456 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5460 aSig1
= extractFloat128Frac1( a
);
5461 aSig0
= extractFloat128Frac0( a
);
5462 aExp
= extractFloat128Exp( a
);
5463 aSign
= extractFloat128Sign( a
);
5464 bSig1
= extractFloat128Frac1( b
);
5465 bSig0
= extractFloat128Frac0( b
);
5466 bExp
= extractFloat128Exp( b
);
5467 if ( aExp
== 0x7FFF ) {
5468 if ( ( aSig0
| aSig1
)
5469 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5470 return propagateFloat128NaN( a
, b STATUS_VAR
);
5474 if ( bExp
== 0x7FFF ) {
5475 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5479 if ( ( bSig0
| bSig1
) == 0 ) {
5481 float_raise( float_flag_invalid STATUS_VAR
);
5482 z
.low
= float128_default_nan_low
;
5483 z
.high
= float128_default_nan_high
;
5486 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5489 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5490 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5492 expDiff
= aExp
- bExp
;
5493 if ( expDiff
< -1 ) return a
;
5495 aSig0
| LIT64( 0x0001000000000000 ),
5497 15 - ( expDiff
< 0 ),
5502 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5503 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5504 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5506 while ( 0 < expDiff
) {
5507 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5508 q
= ( 4 < q
) ? q
- 4 : 0;
5509 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5510 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5511 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5512 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5515 if ( -64 < expDiff
) {
5516 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5517 q
= ( 4 < q
) ? q
- 4 : 0;
5519 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5521 if ( expDiff
< 0 ) {
5522 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5525 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5527 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5528 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5531 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5532 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5535 alternateASig0
= aSig0
;
5536 alternateASig1
= aSig1
;
5538 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5539 } while ( 0 <= (sbits64
) aSig0
);
5541 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (bits64
*)&sigMean0
, &sigMean1
);
5542 if ( ( sigMean0
< 0 )
5543 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5544 aSig0
= alternateASig0
;
5545 aSig1
= alternateASig1
;
5547 zSign
= ( (sbits64
) aSig0
< 0 );
5548 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5550 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
5554 /*----------------------------------------------------------------------------
5555 | Returns the square root of the quadruple-precision floating-point value `a'.
5556 | The operation is performed according to the IEC/IEEE Standard for Binary
5557 | Floating-Point Arithmetic.
5558 *----------------------------------------------------------------------------*/
5560 float128
float128_sqrt( float128 a STATUS_PARAM
)
5564 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5565 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5568 aSig1
= extractFloat128Frac1( a
);
5569 aSig0
= extractFloat128Frac0( a
);
5570 aExp
= extractFloat128Exp( a
);
5571 aSign
= extractFloat128Sign( a
);
5572 if ( aExp
== 0x7FFF ) {
5573 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
5574 if ( ! aSign
) return a
;
5578 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5580 float_raise( float_flag_invalid STATUS_VAR
);
5581 z
.low
= float128_default_nan_low
;
5582 z
.high
= float128_default_nan_high
;
5586 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5587 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5589 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5590 aSig0
|= LIT64( 0x0001000000000000 );
5591 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5592 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5593 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5594 doubleZSig0
= zSig0
<<1;
5595 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5596 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5597 while ( (sbits64
) rem0
< 0 ) {
5600 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5602 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5603 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5604 if ( zSig1
== 0 ) zSig1
= 1;
5605 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5606 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5607 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5608 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5609 while ( (sbits64
) rem1
< 0 ) {
5611 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5613 term2
|= doubleZSig0
;
5614 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5616 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5618 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5619 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5623 /*----------------------------------------------------------------------------
5624 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5625 | the corresponding value `b', and 0 otherwise. The comparison is performed
5626 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5627 *----------------------------------------------------------------------------*/
5629 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5632 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5633 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5634 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5635 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5637 if ( float128_is_signaling_nan( a
)
5638 || float128_is_signaling_nan( b
) ) {
5639 float_raise( float_flag_invalid STATUS_VAR
);
5645 && ( ( a
.high
== b
.high
)
5647 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5652 /*----------------------------------------------------------------------------
5653 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5654 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5655 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5657 *----------------------------------------------------------------------------*/
5659 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5663 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5664 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5665 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5666 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5668 float_raise( float_flag_invalid STATUS_VAR
);
5671 aSign
= extractFloat128Sign( a
);
5672 bSign
= extractFloat128Sign( b
);
5673 if ( aSign
!= bSign
) {
5676 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5680 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5681 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5685 /*----------------------------------------------------------------------------
5686 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5687 | the corresponding value `b', and 0 otherwise. The comparison is performed
5688 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5689 *----------------------------------------------------------------------------*/
5691 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5695 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5696 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5697 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5698 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5700 float_raise( float_flag_invalid STATUS_VAR
);
5703 aSign
= extractFloat128Sign( a
);
5704 bSign
= extractFloat128Sign( b
);
5705 if ( aSign
!= bSign
) {
5708 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5712 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5713 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5717 /*----------------------------------------------------------------------------
5718 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5719 | the corresponding value `b', and 0 otherwise. The invalid exception is
5720 | raised if either operand is a NaN. Otherwise, the comparison is performed
5721 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5722 *----------------------------------------------------------------------------*/
5724 int float128_eq_signaling( float128 a
, float128 b STATUS_PARAM
)
5727 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5728 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5729 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5730 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5732 float_raise( float_flag_invalid STATUS_VAR
);
5737 && ( ( a
.high
== b
.high
)
5739 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5744 /*----------------------------------------------------------------------------
5745 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5746 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5747 | cause an exception. Otherwise, the comparison is performed according to the
5748 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5749 *----------------------------------------------------------------------------*/
5751 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5755 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5756 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5757 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5758 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5760 if ( float128_is_signaling_nan( a
)
5761 || float128_is_signaling_nan( b
) ) {
5762 float_raise( float_flag_invalid STATUS_VAR
);
5766 aSign
= extractFloat128Sign( a
);
5767 bSign
= extractFloat128Sign( b
);
5768 if ( aSign
!= bSign
) {
5771 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5775 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5776 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5780 /*----------------------------------------------------------------------------
5781 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5782 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5783 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5784 | Standard for Binary Floating-Point Arithmetic.
5785 *----------------------------------------------------------------------------*/
5787 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5791 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5792 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5793 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5794 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5796 if ( float128_is_signaling_nan( a
)
5797 || float128_is_signaling_nan( b
) ) {
5798 float_raise( float_flag_invalid STATUS_VAR
);
5802 aSign
= extractFloat128Sign( a
);
5803 bSign
= extractFloat128Sign( b
);
5804 if ( aSign
!= bSign
) {
5807 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5811 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5812 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5818 /* misc functions */
5819 float32
uint32_to_float32( unsigned int a STATUS_PARAM
)
5821 return int64_to_float32(a STATUS_VAR
);
5824 float64
uint32_to_float64( unsigned int a STATUS_PARAM
)
5826 return int64_to_float64(a STATUS_VAR
);
5829 unsigned int float32_to_uint32( float32 a STATUS_PARAM
)
5834 v
= float32_to_int64(a STATUS_VAR
);
5837 float_raise( float_flag_invalid STATUS_VAR
);
5838 } else if (v
> 0xffffffff) {
5840 float_raise( float_flag_invalid STATUS_VAR
);
5847 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
5852 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5855 float_raise( float_flag_invalid STATUS_VAR
);
5856 } else if (v
> 0xffffffff) {
5858 float_raise( float_flag_invalid STATUS_VAR
);
5865 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM
)
5870 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5873 float_raise( float_flag_invalid STATUS_VAR
);
5874 } else if (v
> 0xffff) {
5876 float_raise( float_flag_invalid STATUS_VAR
);
5883 unsigned int float64_to_uint32( float64 a STATUS_PARAM
)
5888 v
= float64_to_int64(a STATUS_VAR
);
5891 float_raise( float_flag_invalid STATUS_VAR
);
5892 } else if (v
> 0xffffffff) {
5894 float_raise( float_flag_invalid STATUS_VAR
);
5901 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
5906 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5909 float_raise( float_flag_invalid STATUS_VAR
);
5910 } else if (v
> 0xffffffff) {
5912 float_raise( float_flag_invalid STATUS_VAR
);
5919 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM
)
5924 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5927 float_raise( float_flag_invalid STATUS_VAR
);
5928 } else if (v
> 0xffff) {
5930 float_raise( float_flag_invalid STATUS_VAR
);
5937 /* FIXME: This looks broken. */
5938 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
5942 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5943 v
+= float64_val(a
);
5944 v
= float64_to_int64(make_float64(v
) STATUS_VAR
);
5946 return v
- INT64_MIN
;
5949 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
5953 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5954 v
+= float64_val(a
);
5955 v
= float64_to_int64_round_to_zero(make_float64(v
) STATUS_VAR
);
5957 return v
- INT64_MIN
;
5960 #define COMPARE(s, nan_exp) \
5961 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5962 int is_quiet STATUS_PARAM ) \
5964 flag aSign, bSign; \
5966 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
5967 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
5969 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5970 extractFloat ## s ## Frac( a ) ) || \
5971 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5972 extractFloat ## s ## Frac( b ) )) { \
5974 float ## s ## _is_signaling_nan( a ) || \
5975 float ## s ## _is_signaling_nan( b ) ) { \
5976 float_raise( float_flag_invalid STATUS_VAR); \
5978 return float_relation_unordered; \
5980 aSign = extractFloat ## s ## Sign( a ); \
5981 bSign = extractFloat ## s ## Sign( b ); \
5982 av = float ## s ## _val(a); \
5983 bv = float ## s ## _val(b); \
5984 if ( aSign != bSign ) { \
5985 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5987 return float_relation_equal; \
5989 return 1 - (2 * aSign); \
5993 return float_relation_equal; \
5995 return 1 - 2 * (aSign ^ ( av < bv )); \
6000 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6002 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6005 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6007 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6013 INLINE
int float128_compare_internal( float128 a
, float128 b
,
6014 int is_quiet STATUS_PARAM
)
6018 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6019 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6020 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6021 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6023 float128_is_signaling_nan( a
) ||
6024 float128_is_signaling_nan( b
) ) {
6025 float_raise( float_flag_invalid STATUS_VAR
);
6027 return float_relation_unordered
;
6029 aSign
= extractFloat128Sign( a
);
6030 bSign
= extractFloat128Sign( b
);
6031 if ( aSign
!= bSign
) {
6032 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6034 return float_relation_equal
;
6036 return 1 - (2 * aSign
);
6039 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6040 return float_relation_equal
;
6042 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6047 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
6049 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
6052 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
6054 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
6057 /* Multiply A by 2 raised to the power N. */
6058 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
6064 a
= float32_squash_input_denormal(a STATUS_VAR
);
6065 aSig
= extractFloat32Frac( a
);
6066 aExp
= extractFloat32Exp( a
);
6067 aSign
= extractFloat32Sign( a
);
6069 if ( aExp
== 0xFF ) {
6074 else if ( aSig
== 0 )
6079 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
6082 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
6088 a
= float64_squash_input_denormal(a STATUS_VAR
);
6089 aSig
= extractFloat64Frac( a
);
6090 aExp
= extractFloat64Exp( a
);
6091 aSign
= extractFloat64Sign( a
);
6093 if ( aExp
== 0x7FF ) {
6097 aSig
|= LIT64( 0x0010000000000000 );
6098 else if ( aSig
== 0 )
6103 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
6107 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
6113 aSig
= extractFloatx80Frac( a
);
6114 aExp
= extractFloatx80Exp( a
);
6115 aSign
= extractFloatx80Sign( a
);
6117 if ( aExp
== 0x7FF ) {
6120 if (aExp
== 0 && aSig
== 0)
6124 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
6125 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
6130 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
6134 bits64 aSig0
, aSig1
;
6136 aSig1
= extractFloat128Frac1( a
);
6137 aSig0
= extractFloat128Frac0( a
);
6138 aExp
= extractFloat128Exp( a
);
6139 aSign
= extractFloat128Sign( a
);
6140 if ( aExp
== 0x7FFF ) {
6144 aSig0
|= LIT64( 0x0001000000000000 );
6145 else if ( aSig0
== 0 && aSig1
== 0 )
6149 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1