1 /* $NetBSD: softfloat.c,v 1.3 2013/01/10 08:16:11 matt Exp $ */
4 * This version hacked for use with gcc -msoft-float by bjh21.
5 * (Mostly a case of #ifdefing out things GCC doesn't need or provides
10 * Things you may want to define:
12 * SOFTFLOAT_FOR_GCC - build only those functions necessary for GCC (with
13 * -msoft-float) to work. Include "softfloat-for-gcc.h" to get them
18 * This differs from the standard bits32/softfloat.c in that float64
19 * is defined to be a 64-bit integer rather than a structure. The
20 * structure is float64s, with translation between the two going via
25 ===============================================================================
27 This C source file is part of the SoftFloat IEC/IEEE Floating-Point
28 Arithmetic Package, Release 2a.
30 Written by John R. Hauser. This work was made possible in part by the
31 International Computer Science Institute, located at Suite 600, 1947 Center
32 Street, Berkeley, California 94704. Funding was partially provided by the
33 National Science Foundation under grant MIP-9311980. The original version
34 of this code was written as part of a project to build a fixed-point vector
35 processor in collaboration with the University of California at Berkeley,
36 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
37 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
38 arithmetic/SoftFloat.html'.
40 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
41 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
42 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
43 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
44 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
46 Derivative works are acceptable, even for commercial purposes, so long as
47 (1) they include prominent notice that the work is derivative, and (2) they
48 include prominent notice akin to these four paragraphs for those parts of
49 this code that are retained.
51 ===============================================================================
54 #if defined(LIBC_SCCS) && !defined(lint)
55 __RCSID("$NetBSD: softfloat.c,v 1.3 2013/01/10 08:16:11 matt Exp $");
56 #endif /* LIBC_SCCS and not lint */
58 #ifdef SOFTFLOAT_FOR_GCC
59 #include "softfloat-for-gcc.h"
63 #include "softfloat.h"
66 * Conversions between floats as stored in memory and floats as
69 #ifndef FLOAT64_DEMANGLE
70 #define FLOAT64_DEMANGLE(a) (a)
72 #ifndef FLOAT64_MANGLE
73 #define FLOAT64_MANGLE(a) (a)
77 -------------------------------------------------------------------------------
78 Floating-point rounding mode and exception flags.
79 -------------------------------------------------------------------------------
81 #ifndef set_float_rounding_mode
82 fp_rnd float_rounding_mode
= float_round_nearest_even
;
83 fp_except float_exception_flags
= 0;
85 #ifndef set_float_exception_inexact_flag
86 #define set_float_exception_inexact_flag() \
87 ((void)(float_exception_flags |= float_flag_inexact))
91 -------------------------------------------------------------------------------
92 Primitive arithmetic functions, including multi-word arithmetic, and
93 division and square root approximations. (Can be specialized to target if
95 -------------------------------------------------------------------------------
97 #include "softfloat-macros"
100 -------------------------------------------------------------------------------
101 Functions and definitions to determine: (1) whether tininess for underflow
102 is detected before or after rounding by default, (2) what (if anything)
103 happens when exceptions are raised, (3) how signaling NaNs are distinguished
104 from quiet NaNs, (4) the default generated quiet NaNs, and (4) how NaNs
105 are propagated from function inputs to output. These details are target-
107 -------------------------------------------------------------------------------
109 #include "softfloat-specialize"
112 -------------------------------------------------------------------------------
113 Returns the fraction bits of the single-precision floating-point value `a'.
114 -------------------------------------------------------------------------------
116 INLINE bits32
extractFloat32Frac( float32 a
)
119 return a
& 0x007FFFFF;
124 -------------------------------------------------------------------------------
125 Returns the exponent bits of the single-precision floating-point value `a'.
126 -------------------------------------------------------------------------------
128 INLINE int16
extractFloat32Exp( float32 a
)
131 return ( a
>>23 ) & 0xFF;
136 -------------------------------------------------------------------------------
137 Returns the sign bit of the single-precision floating-point value `a'.
138 -------------------------------------------------------------------------------
140 INLINE flag
extractFloat32Sign( float32 a
)
148 -------------------------------------------------------------------------------
149 Normalizes the subnormal single-precision floating-point value represented
150 by the denormalized significand `aSig'. The normalized exponent and
151 significand are stored at the locations pointed to by `zExpPtr' and
152 `zSigPtr', respectively.
153 -------------------------------------------------------------------------------
156 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
160 shiftCount
= countLeadingZeros32( aSig
) - 8;
161 *zSigPtr
= aSig
<<shiftCount
;
162 *zExpPtr
= 1 - shiftCount
;
167 -------------------------------------------------------------------------------
168 Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
169 single-precision floating-point value, returning the result. After being
170 shifted into the proper positions, the three fields are simply added
171 together to form the result. This means that any integer portion of `zSig'
172 will be added into the exponent. Since a properly normalized significand
173 will have an integer portion equal to 1, the `zExp' input should be 1 less
174 than the desired result exponent whenever `zSig' is a complete, normalized
176 -------------------------------------------------------------------------------
178 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
181 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
186 -------------------------------------------------------------------------------
187 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
188 and significand `zSig', and returns the proper single-precision floating-
189 point value corresponding to the abstract input. Ordinarily, the abstract
190 value is simply rounded and packed into the single-precision format, with
191 the inexact exception raised if the abstract input cannot be represented
192 exactly. However, if the abstract value is too large, the overflow and
193 inexact exceptions are raised and an infinity or maximal finite value is
194 returned. If the abstract value is too small, the input value is rounded to
195 a subnormal number, and the underflow and inexact exceptions are raised if
196 the abstract input cannot be represented exactly as a subnormal single-
197 precision floating-point number.
198 The input significand `zSig' has its binary point between bits 30
199 and 29, which is 7 bits to the left of the usual location. This shifted
200 significand must be normalized or smaller. If `zSig' is not normalized,
201 `zExp' must be 0; in that case, the result returned is a subnormal number,
202 and it must not require rounding. In the usual case that `zSig' is
203 normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
204 The handling of underflow and overflow follows the IEC/IEEE Standard for
205 Binary Floating-Point Arithmetic.
206 -------------------------------------------------------------------------------
208 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
211 flag roundNearestEven
;
212 int8 roundIncrement
, roundBits
;
215 roundingMode
= float_rounding_mode
;
216 roundNearestEven
= roundingMode
== float_round_nearest_even
;
217 roundIncrement
= 0x40;
218 if ( ! roundNearestEven
) {
219 if ( roundingMode
== float_round_to_zero
) {
223 roundIncrement
= 0x7F;
225 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
228 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
232 roundBits
= zSig
& 0x7F;
233 if ( 0xFD <= (bits16
) zExp
) {
235 || ( ( zExp
== 0xFD )
236 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
238 float_raise( float_flag_overflow
| float_flag_inexact
);
239 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
243 ( float_detect_tininess
== float_tininess_before_rounding
)
245 || ( zSig
+ roundIncrement
< (uint32
)0x80000000 );
246 shift32RightJamming( zSig
, - zExp
, &zSig
);
248 roundBits
= zSig
& 0x7F;
249 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
252 if ( roundBits
) set_float_exception_inexact_flag();
253 zSig
= ( zSig
+ roundIncrement
)>>7;
254 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
255 if ( zSig
== 0 ) zExp
= 0;
256 return packFloat32( zSign
, zExp
, zSig
);
261 -------------------------------------------------------------------------------
262 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
263 and significand `zSig', and returns the proper single-precision floating-
264 point value corresponding to the abstract input. This routine is just like
265 `roundAndPackFloat32' except that `zSig' does not have to be normalized.
266 Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
267 floating-point exponent.
268 -------------------------------------------------------------------------------
271 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
275 shiftCount
= countLeadingZeros32( zSig
) - 1;
276 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
281 -------------------------------------------------------------------------------
282 Returns the least-significant 32 fraction bits of the double-precision
283 floating-point value `a'.
284 -------------------------------------------------------------------------------
286 INLINE bits32
extractFloat64Frac1( float64 a
)
289 return (bits32
)(FLOAT64_DEMANGLE(a
) & LIT64(0x00000000FFFFFFFF));
294 -------------------------------------------------------------------------------
295 Returns the most-significant 20 fraction bits of the double-precision
296 floating-point value `a'.
297 -------------------------------------------------------------------------------
299 INLINE bits32
extractFloat64Frac0( float64 a
)
302 return (bits32
)((FLOAT64_DEMANGLE(a
) >> 32) & 0x000FFFFF);
307 -------------------------------------------------------------------------------
308 Returns the exponent bits of the double-precision floating-point value `a'.
309 -------------------------------------------------------------------------------
311 INLINE int16
extractFloat64Exp( float64 a
)
314 return (int16
)((FLOAT64_DEMANGLE(a
) >> 52) & 0x7FF);
319 -------------------------------------------------------------------------------
320 Returns the sign bit of the double-precision floating-point value `a'.
321 -------------------------------------------------------------------------------
323 INLINE flag
extractFloat64Sign( float64 a
)
326 return (flag
)(FLOAT64_DEMANGLE(a
) >> 63);
331 -------------------------------------------------------------------------------
332 Normalizes the subnormal double-precision floating-point value represented
333 by the denormalized significand formed by the concatenation of `aSig0' and
334 `aSig1'. The normalized exponent is stored at the location pointed to by
335 `zExpPtr'. The most significant 21 bits of the normalized significand are
336 stored at the location pointed to by `zSig0Ptr', and the least significant
337 32 bits of the normalized significand are stored at the location pointed to
339 -------------------------------------------------------------------------------
342 normalizeFloat64Subnormal(
353 shiftCount
= countLeadingZeros32( aSig1
) - 11;
354 if ( shiftCount
< 0 ) {
355 *zSig0Ptr
= aSig1
>>( - shiftCount
);
356 *zSig1Ptr
= aSig1
<<( shiftCount
& 31 );
359 *zSig0Ptr
= aSig1
<<shiftCount
;
362 *zExpPtr
= - shiftCount
- 31;
365 shiftCount
= countLeadingZeros32( aSig0
) - 11;
366 shortShift64Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
367 *zExpPtr
= 1 - shiftCount
;
373 -------------------------------------------------------------------------------
374 Packs the sign `zSign', the exponent `zExp', and the significand formed by
375 the concatenation of `zSig0' and `zSig1' into a double-precision floating-
376 point value, returning the result. After being shifted into the proper
377 positions, the three fields `zSign', `zExp', and `zSig0' are simply added
378 together to form the most significant 32 bits of the result. This means
379 that any integer portion of `zSig0' will be added into the exponent. Since
380 a properly normalized significand will have an integer portion equal to 1,
381 the `zExp' input should be 1 less than the desired result exponent whenever
382 `zSig0' and `zSig1' concatenated form a complete, normalized significand.
383 -------------------------------------------------------------------------------
386 packFloat64( flag zSign
, int16 zExp
, bits32 zSig0
, bits32 zSig1
)
389 return FLOAT64_MANGLE( ( ( (bits64
) zSign
)<<63 ) +
390 ( ( (bits64
) zExp
)<<52 ) +
391 ( ( (bits64
) zSig0
)<<32 ) + zSig1
);
397 -------------------------------------------------------------------------------
398 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
399 and extended significand formed by the concatenation of `zSig0', `zSig1',
400 and `zSig2', and returns the proper double-precision floating-point value
401 corresponding to the abstract input. Ordinarily, the abstract value is
402 simply rounded and packed into the double-precision format, with the inexact
403 exception raised if the abstract input cannot be represented exactly.
404 However, if the abstract value is too large, the overflow and inexact
405 exceptions are raised and an infinity or maximal finite value is returned.
406 If the abstract value is too small, the input value is rounded to a
407 subnormal number, and the underflow and inexact exceptions are raised if the
408 abstract input cannot be represented exactly as a subnormal double-precision
409 floating-point number.
410 The input significand must be normalized or smaller. If the input
411 significand is not normalized, `zExp' must be 0; in that case, the result
412 returned is a subnormal number, and it must not require rounding. In the
413 usual case that the input significand is normalized, `zExp' must be 1 less
414 than the ``true'' floating-point exponent. The handling of underflow and
415 overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
416 -------------------------------------------------------------------------------
420 flag zSign
, int16 zExp
, bits32 zSig0
, bits32 zSig1
, bits32 zSig2
)
423 flag roundNearestEven
, increment
, isTiny
;
425 roundingMode
= float_rounding_mode
;
426 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
427 increment
= ( (sbits32
) zSig2
< 0 );
428 if ( ! roundNearestEven
) {
429 if ( roundingMode
== float_round_to_zero
) {
434 increment
= ( roundingMode
== float_round_down
) && zSig2
;
437 increment
= ( roundingMode
== float_round_up
) && zSig2
;
441 if ( 0x7FD <= (bits16
) zExp
) {
442 if ( ( 0x7FD < zExp
)
443 || ( ( zExp
== 0x7FD )
444 && eq64( 0x001FFFFF, 0xFFFFFFFF, zSig0
, zSig1
)
448 float_raise( float_flag_overflow
| float_flag_inexact
);
449 if ( ( roundingMode
== float_round_to_zero
)
450 || ( zSign
&& ( roundingMode
== float_round_up
) )
451 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
453 return packFloat64( zSign
, 0x7FE, 0x000FFFFF, 0xFFFFFFFF );
455 return packFloat64( zSign
, 0x7FF, 0, 0 );
459 ( float_detect_tininess
== float_tininess_before_rounding
)
462 || lt64( zSig0
, zSig1
, 0x001FFFFF, 0xFFFFFFFF );
463 shift64ExtraRightJamming(
464 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
466 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
467 if ( roundNearestEven
) {
468 increment
= ( (sbits32
) zSig2
< 0 );
472 increment
= ( roundingMode
== float_round_down
) && zSig2
;
475 increment
= ( roundingMode
== float_round_up
) && zSig2
;
480 if ( zSig2
) set_float_exception_inexact_flag();
482 add64( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
483 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
486 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
488 return packFloat64( zSign
, zExp
, zSig0
, zSig1
);
493 -------------------------------------------------------------------------------
494 Takes an abstract floating-point value having sign `zSign', exponent `zExp',
495 and significand formed by the concatenation of `zSig0' and `zSig1', and
496 returns the proper double-precision floating-point value corresponding
497 to the abstract input. This routine is just like `roundAndPackFloat64'
498 except that the input significand has fewer bits and does not have to be
499 normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
501 -------------------------------------------------------------------------------
504 normalizeRoundAndPackFloat64(
505 flag zSign
, int16 zExp
, bits32 zSig0
, bits32 zSig1
)
515 shiftCount
= countLeadingZeros32( zSig0
) - 11;
516 if ( 0 <= shiftCount
) {
518 shortShift64Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
521 shift64ExtraRightJamming(
522 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
525 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
530 -------------------------------------------------------------------------------
531 Returns the result of converting the 32-bit two's complement integer `a' to
532 the single-precision floating-point format. The conversion is performed
533 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
534 -------------------------------------------------------------------------------
536 float32
int32_to_float32( int32 a
)
540 if ( a
== 0 ) return 0;
541 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
543 return normalizeRoundAndPackFloat32(zSign
, 0x9C, (uint32
)(zSign
? - a
: a
));
548 -------------------------------------------------------------------------------
549 Returns the result of converting the 32-bit two's complement integer `a' to
550 the double-precision floating-point format. The conversion is performed
551 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
552 -------------------------------------------------------------------------------
554 float64
int32_to_float64( int32 a
)
561 if ( a
== 0 ) return packFloat64( 0, 0, 0, 0 );
563 absA
= zSign
? - a
: a
;
564 shiftCount
= countLeadingZeros32( absA
) - 11;
565 if ( 0 <= shiftCount
) {
566 zSig0
= absA
<<shiftCount
;
570 shift64Right( absA
, 0, - shiftCount
, &zSig0
, &zSig1
);
572 return packFloat64( zSign
, 0x412 - shiftCount
, zSig0
, zSig1
);
576 #ifndef SOFTFLOAT_FOR_GCC
578 -------------------------------------------------------------------------------
579 Returns the result of converting the single-precision floating-point value
580 `a' to the 32-bit two's complement integer format. The conversion is
581 performed according to the IEC/IEEE Standard for Binary Floating-Point
582 Arithmetic---which means in particular that the conversion is rounded
583 according to the current rounding mode. If `a' is a NaN, the largest
584 positive integer is returned. Otherwise, if the conversion overflows, the
585 largest integer with the same sign as `a' is returned.
586 -------------------------------------------------------------------------------
588 int32
float32_to_int32( float32 a
)
591 int16 aExp
, shiftCount
;
592 bits32 aSig
, aSigExtra
;
596 aSig
= extractFloat32Frac( a
);
597 aExp
= extractFloat32Exp( a
);
598 aSign
= extractFloat32Sign( a
);
599 shiftCount
= aExp
- 0x96;
600 if ( 0 <= shiftCount
) {
601 if ( 0x9E <= aExp
) {
602 if ( a
!= 0xCF000000 ) {
603 float_raise( float_flag_invalid
);
604 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
608 return (sbits32
) 0x80000000;
610 z
= ( aSig
| 0x00800000 )<<shiftCount
;
611 if ( aSign
) z
= - z
;
615 aSigExtra
= aExp
| aSig
;
620 aSigExtra
= aSig
<<( shiftCount
& 31 );
621 z
= aSig
>>( - shiftCount
);
623 if ( aSigExtra
) set_float_exception_inexact_flag();
624 roundingMode
= float_rounding_mode
;
625 if ( roundingMode
== float_round_nearest_even
) {
626 if ( (sbits32
) aSigExtra
< 0 ) {
628 if ( (bits32
) ( aSigExtra
<<1 ) == 0 ) z
&= ~1;
630 if ( aSign
) z
= - z
;
633 aSigExtra
= ( aSigExtra
!= 0 );
635 z
+= ( roundingMode
== float_round_down
) & aSigExtra
;
639 z
+= ( roundingMode
== float_round_up
) & aSigExtra
;
649 -------------------------------------------------------------------------------
650 Returns the result of converting the single-precision floating-point value
651 `a' to the 32-bit two's complement integer format. The conversion is
652 performed according to the IEC/IEEE Standard for Binary Floating-Point
653 Arithmetic, except that the conversion is always rounded toward zero.
654 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
655 the conversion overflows, the largest integer with the same sign as `a' is
657 -------------------------------------------------------------------------------
659 int32
float32_to_int32_round_to_zero( float32 a
)
662 int16 aExp
, shiftCount
;
666 aSig
= extractFloat32Frac( a
);
667 aExp
= extractFloat32Exp( a
);
668 aSign
= extractFloat32Sign( a
);
669 shiftCount
= aExp
- 0x9E;
670 if ( 0 <= shiftCount
) {
671 if ( a
!= 0xCF000000 ) {
672 float_raise( float_flag_invalid
);
673 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
675 return (sbits32
) 0x80000000;
677 else if ( aExp
<= 0x7E ) {
678 if ( aExp
| aSig
) set_float_exception_inexact_flag();
681 aSig
= ( aSig
| 0x00800000 )<<8;
682 z
= aSig
>>( - shiftCount
);
683 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
684 set_float_exception_inexact_flag();
686 if ( aSign
) z
= - z
;
692 -------------------------------------------------------------------------------
693 Returns the result of converting the single-precision floating-point value
694 `a' to the double-precision floating-point format. The conversion is
695 performed according to the IEC/IEEE Standard for Binary Floating-Point
697 -------------------------------------------------------------------------------
699 float64
float32_to_float64( float32 a
)
703 bits32 aSig
, zSig0
, zSig1
;
705 aSig
= extractFloat32Frac( a
);
706 aExp
= extractFloat32Exp( a
);
707 aSign
= extractFloat32Sign( a
);
708 if ( aExp
== 0xFF ) {
709 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
710 return packFloat64( aSign
, 0x7FF, 0, 0 );
713 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0, 0 );
714 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
717 shift64Right( aSig
, 0, 3, &zSig0
, &zSig1
);
718 return packFloat64( aSign
, aExp
+ 0x380, zSig0
, zSig1
);
722 #ifndef SOFTFLOAT_FOR_GCC
724 -------------------------------------------------------------------------------
725 Rounds the single-precision floating-point value `a' to an integer,
726 and returns the result as a single-precision floating-point value. The
727 operation is performed according to the IEC/IEEE Standard for Binary
728 Floating-Point Arithmetic.
729 -------------------------------------------------------------------------------
731 float32
float32_round_to_int( float32 a
)
735 bits32 lastBitMask
, roundBitsMask
;
739 aExp
= extractFloat32Exp( a
);
740 if ( 0x96 <= aExp
) {
741 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
742 return propagateFloat32NaN( a
, a
);
746 if ( aExp
<= 0x7E ) {
747 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
748 set_float_exception_inexact_flag();
749 aSign
= extractFloat32Sign( a
);
750 switch ( float_rounding_mode
) {
751 case float_round_nearest_even
:
752 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
753 return packFloat32( aSign
, 0x7F, 0 );
756 case float_round_to_zero
:
758 case float_round_down
:
759 return aSign
? 0xBF800000 : 0;
761 return aSign
? 0x80000000 : 0x3F800000;
763 return packFloat32( aSign
, 0, 0 );
766 lastBitMask
<<= 0x96 - aExp
;
767 roundBitsMask
= lastBitMask
- 1;
769 roundingMode
= float_rounding_mode
;
770 if ( roundingMode
== float_round_nearest_even
) {
772 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
774 else if ( roundingMode
!= float_round_to_zero
) {
775 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
779 z
&= ~ roundBitsMask
;
780 if ( z
!= a
) set_float_exception_inexact_flag();
787 -------------------------------------------------------------------------------
788 Returns the result of adding the absolute values of the single-precision
789 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
790 before being returned. `zSign' is ignored if the result is a NaN.
791 The addition is performed according to the IEC/IEEE Standard for Binary
792 Floating-Point Arithmetic.
793 -------------------------------------------------------------------------------
795 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
797 int16 aExp
, bExp
, zExp
;
798 bits32 aSig
, bSig
, zSig
;
801 aSig
= extractFloat32Frac( a
);
802 aExp
= extractFloat32Exp( a
);
803 bSig
= extractFloat32Frac( b
);
804 bExp
= extractFloat32Exp( b
);
805 expDiff
= aExp
- bExp
;
809 if ( aExp
== 0xFF ) {
810 if ( aSig
) return propagateFloat32NaN( a
, b
);
819 shift32RightJamming( bSig
, expDiff
, &bSig
);
822 else if ( expDiff
< 0 ) {
823 if ( bExp
== 0xFF ) {
824 if ( bSig
) return propagateFloat32NaN( a
, b
);
825 return packFloat32( zSign
, 0xFF, 0 );
833 shift32RightJamming( aSig
, - expDiff
, &aSig
);
837 if ( aExp
== 0xFF ) {
838 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
841 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
842 zSig
= 0x40000000 + aSig
+ bSig
;
847 zSig
= ( aSig
+ bSig
)<<1;
849 if ( (sbits32
) zSig
< 0 ) {
854 return roundAndPackFloat32( zSign
, zExp
, zSig
);
859 -------------------------------------------------------------------------------
860 Returns the result of subtracting the absolute values of the single-
861 precision floating-point values `a' and `b'. If `zSign' is 1, the
862 difference is negated before being returned. `zSign' is ignored if the
863 result is a NaN. The subtraction is performed according to the IEC/IEEE
864 Standard for Binary Floating-Point Arithmetic.
865 -------------------------------------------------------------------------------
867 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
869 int16 aExp
, bExp
, zExp
;
870 bits32 aSig
, bSig
, zSig
;
873 aSig
= extractFloat32Frac( a
);
874 aExp
= extractFloat32Exp( a
);
875 bSig
= extractFloat32Frac( b
);
876 bExp
= extractFloat32Exp( b
);
877 expDiff
= aExp
- bExp
;
880 if ( 0 < expDiff
) goto aExpBigger
;
881 if ( expDiff
< 0 ) goto bExpBigger
;
882 if ( aExp
== 0xFF ) {
883 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
884 float_raise( float_flag_invalid
);
885 return float32_default_nan
;
891 if ( bSig
< aSig
) goto aBigger
;
892 if ( aSig
< bSig
) goto bBigger
;
893 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
895 if ( bExp
== 0xFF ) {
896 if ( bSig
) return propagateFloat32NaN( a
, b
);
897 return packFloat32( zSign
^ 1, 0xFF, 0 );
905 shift32RightJamming( aSig
, - expDiff
, &aSig
);
911 goto normalizeRoundAndPack
;
913 if ( aExp
== 0xFF ) {
914 if ( aSig
) return propagateFloat32NaN( a
, b
);
923 shift32RightJamming( bSig
, expDiff
, &bSig
);
928 normalizeRoundAndPack
:
930 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
935 -------------------------------------------------------------------------------
936 Returns the result of adding the single-precision floating-point values `a'
937 and `b'. The operation is performed according to the IEC/IEEE Standard for
938 Binary Floating-Point Arithmetic.
939 -------------------------------------------------------------------------------
941 float32
float32_add( float32 a
, float32 b
)
945 aSign
= extractFloat32Sign( a
);
946 bSign
= extractFloat32Sign( b
);
947 if ( aSign
== bSign
) {
948 return addFloat32Sigs( a
, b
, aSign
);
951 return subFloat32Sigs( a
, b
, aSign
);
957 -------------------------------------------------------------------------------
958 Returns the result of subtracting the single-precision floating-point values
959 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
960 for Binary Floating-Point Arithmetic.
961 -------------------------------------------------------------------------------
963 float32
float32_sub( float32 a
, float32 b
)
967 aSign
= extractFloat32Sign( a
);
968 bSign
= extractFloat32Sign( b
);
969 if ( aSign
== bSign
) {
970 return subFloat32Sigs( a
, b
, aSign
);
973 return addFloat32Sigs( a
, b
, aSign
);
979 -------------------------------------------------------------------------------
980 Returns the result of multiplying the single-precision floating-point values
981 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
982 for Binary Floating-Point Arithmetic.
983 -------------------------------------------------------------------------------
985 float32
float32_mul( float32 a
, float32 b
)
987 flag aSign
, bSign
, zSign
;
988 int16 aExp
, bExp
, zExp
;
989 bits32 aSig
, bSig
, zSig0
, zSig1
;
991 aSig
= extractFloat32Frac( a
);
992 aExp
= extractFloat32Exp( a
);
993 aSign
= extractFloat32Sign( a
);
994 bSig
= extractFloat32Frac( b
);
995 bExp
= extractFloat32Exp( b
);
996 bSign
= extractFloat32Sign( b
);
997 zSign
= aSign
^ bSign
;
998 if ( aExp
== 0xFF ) {
999 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1000 return propagateFloat32NaN( a
, b
);
1002 if ( ( bExp
| bSig
) == 0 ) {
1003 float_raise( float_flag_invalid
);
1004 return float32_default_nan
;
1006 return packFloat32( zSign
, 0xFF, 0 );
1008 if ( bExp
== 0xFF ) {
1009 if ( bSig
) return propagateFloat32NaN( a
, b
);
1010 if ( ( aExp
| aSig
) == 0 ) {
1011 float_raise( float_flag_invalid
);
1012 return float32_default_nan
;
1014 return packFloat32( zSign
, 0xFF, 0 );
1017 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1018 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1021 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1022 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1024 zExp
= aExp
+ bExp
- 0x7F;
1025 aSig
= ( aSig
| 0x00800000 )<<7;
1026 bSig
= ( bSig
| 0x00800000 )<<8;
1027 mul32To64( aSig
, bSig
, &zSig0
, &zSig1
);
1028 zSig0
|= ( zSig1
!= 0 );
1029 if ( 0 <= (sbits32
) ( zSig0
<<1 ) ) {
1033 return roundAndPackFloat32( zSign
, zExp
, zSig0
);
1038 -------------------------------------------------------------------------------
1039 Returns the result of dividing the single-precision floating-point value `a'
1040 by the corresponding value `b'. The operation is performed according to the
1041 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1042 -------------------------------------------------------------------------------
1044 float32
float32_div( float32 a
, float32 b
)
1046 flag aSign
, bSign
, zSign
;
1047 int16 aExp
, bExp
, zExp
;
1048 bits32 aSig
, bSig
, zSig
, rem0
, rem1
, term0
, term1
;
1050 aSig
= extractFloat32Frac( a
);
1051 aExp
= extractFloat32Exp( a
);
1052 aSign
= extractFloat32Sign( a
);
1053 bSig
= extractFloat32Frac( b
);
1054 bExp
= extractFloat32Exp( b
);
1055 bSign
= extractFloat32Sign( b
);
1056 zSign
= aSign
^ bSign
;
1057 if ( aExp
== 0xFF ) {
1058 if ( aSig
) return propagateFloat32NaN( a
, b
);
1059 if ( bExp
== 0xFF ) {
1060 if ( bSig
) return propagateFloat32NaN( a
, b
);
1061 float_raise( float_flag_invalid
);
1062 return float32_default_nan
;
1064 return packFloat32( zSign
, 0xFF, 0 );
1066 if ( bExp
== 0xFF ) {
1067 if ( bSig
) return propagateFloat32NaN( a
, b
);
1068 return packFloat32( zSign
, 0, 0 );
1072 if ( ( aExp
| aSig
) == 0 ) {
1073 float_raise( float_flag_invalid
);
1074 return float32_default_nan
;
1076 float_raise( float_flag_divbyzero
);
1077 return packFloat32( zSign
, 0xFF, 0 );
1079 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1082 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1083 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1085 zExp
= aExp
- bExp
+ 0x7D;
1086 aSig
= ( aSig
| 0x00800000 )<<7;
1087 bSig
= ( bSig
| 0x00800000 )<<8;
1088 if ( bSig
<= ( aSig
+ aSig
) ) {
1092 zSig
= estimateDiv64To32( aSig
, 0, bSig
);
1093 if ( ( zSig
& 0x3F ) <= 2 ) {
1094 mul32To64( bSig
, zSig
, &term0
, &term1
);
1095 sub64( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
1096 while ( (sbits32
) rem0
< 0 ) {
1098 add64( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
1100 zSig
|= ( rem1
!= 0 );
1102 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1106 #ifndef SOFTFLOAT_FOR_GCC
1108 -------------------------------------------------------------------------------
1109 Returns the remainder of the single-precision floating-point value `a'
1110 with respect to the corresponding value `b'. The operation is performed
1111 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1112 -------------------------------------------------------------------------------
1114 float32
float32_rem( float32 a
, float32 b
)
1116 flag aSign
, bSign
, zSign
;
1117 int16 aExp
, bExp
, expDiff
;
1118 bits32 aSig
, bSig
, q
, allZero
, alternateASig
;
1121 aSig
= extractFloat32Frac( a
);
1122 aExp
= extractFloat32Exp( a
);
1123 aSign
= extractFloat32Sign( a
);
1124 bSig
= extractFloat32Frac( b
);
1125 bExp
= extractFloat32Exp( b
);
1126 bSign
= extractFloat32Sign( b
);
1127 if ( aExp
== 0xFF ) {
1128 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1129 return propagateFloat32NaN( a
, b
);
1131 float_raise( float_flag_invalid
);
1132 return float32_default_nan
;
1134 if ( bExp
== 0xFF ) {
1135 if ( bSig
) return propagateFloat32NaN( a
, b
);
1140 float_raise( float_flag_invalid
);
1141 return float32_default_nan
;
1143 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1146 if ( aSig
== 0 ) return a
;
1147 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1149 expDiff
= aExp
- bExp
;
1150 aSig
= ( aSig
| 0x00800000 )<<8;
1151 bSig
= ( bSig
| 0x00800000 )<<8;
1152 if ( expDiff
< 0 ) {
1153 if ( expDiff
< -1 ) return a
;
1156 q
= ( bSig
<= aSig
);
1157 if ( q
) aSig
-= bSig
;
1159 while ( 0 < expDiff
) {
1160 q
= estimateDiv64To32( aSig
, 0, bSig
);
1161 q
= ( 2 < q
) ? q
- 2 : 0;
1162 aSig
= - ( ( bSig
>>2 ) * q
);
1166 if ( 0 < expDiff
) {
1167 q
= estimateDiv64To32( aSig
, 0, bSig
);
1168 q
= ( 2 < q
) ? q
- 2 : 0;
1171 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1178 alternateASig
= aSig
;
1181 } while ( 0 <= (sbits32
) aSig
);
1182 sigMean
= aSig
+ alternateASig
;
1183 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1184 aSig
= alternateASig
;
1186 zSign
= ( (sbits32
) aSig
< 0 );
1187 if ( zSign
) aSig
= - aSig
;
1188 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
1193 #ifndef SOFTFLOAT_FOR_GCC
1195 -------------------------------------------------------------------------------
1196 Returns the square root of the single-precision floating-point value `a'.
1197 The operation is performed according to the IEC/IEEE Standard for Binary
1198 Floating-Point Arithmetic.
1199 -------------------------------------------------------------------------------
1201 float32
float32_sqrt( float32 a
)
1205 bits32 aSig
, zSig
, rem0
, rem1
, term0
, term1
;
1207 aSig
= extractFloat32Frac( a
);
1208 aExp
= extractFloat32Exp( a
);
1209 aSign
= extractFloat32Sign( a
);
1210 if ( aExp
== 0xFF ) {
1211 if ( aSig
) return propagateFloat32NaN( a
, 0 );
1212 if ( ! aSign
) return a
;
1213 float_raise( float_flag_invalid
);
1214 return float32_default_nan
;
1217 if ( ( aExp
| aSig
) == 0 ) return a
;
1218 float_raise( float_flag_invalid
);
1219 return float32_default_nan
;
1222 if ( aSig
== 0 ) return 0;
1223 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1225 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
1226 aSig
= ( aSig
| 0x00800000 )<<8;
1227 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
1228 if ( ( zSig
& 0x7F ) <= 5 ) {
1235 mul32To64( zSig
, zSig
, &term0
, &term1
);
1236 sub64( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
1237 while ( (sbits32
) rem0
< 0 ) {
1239 shortShift64Left( 0, zSig
, 1, &term0
, &term1
);
1241 add64( rem0
, rem1
, term0
, term1
, &rem0
, &rem1
);
1243 zSig
|= ( ( rem0
| rem1
) != 0 );
1246 shift32RightJamming( zSig
, 1, &zSig
);
1248 return roundAndPackFloat32( 0, zExp
, zSig
);
1254 -------------------------------------------------------------------------------
1255 Returns 1 if the single-precision floating-point value `a' is equal to
1256 the corresponding value `b', and 0 otherwise. The comparison is performed
1257 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1258 -------------------------------------------------------------------------------
1260 flag
float32_eq( float32 a
, float32 b
)
1263 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1264 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1266 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1267 float_raise( float_flag_invalid
);
1271 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1276 -------------------------------------------------------------------------------
1277 Returns 1 if the single-precision floating-point value `a' is less than
1278 or equal to the corresponding value `b', and 0 otherwise. The comparison
1279 is performed according to the IEC/IEEE Standard for Binary Floating-Point
1281 -------------------------------------------------------------------------------
1283 flag
float32_le( float32 a
, float32 b
)
1287 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1288 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1290 float_raise( float_flag_invalid
);
1293 aSign
= extractFloat32Sign( a
);
1294 bSign
= extractFloat32Sign( b
);
1295 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1296 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1301 -------------------------------------------------------------------------------
1302 Returns 1 if the single-precision floating-point value `a' is less than
1303 the corresponding value `b', and 0 otherwise. The comparison is performed
1304 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1305 -------------------------------------------------------------------------------
1307 flag
float32_lt( float32 a
, float32 b
)
1311 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1312 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1314 float_raise( float_flag_invalid
);
1317 aSign
= extractFloat32Sign( a
);
1318 bSign
= extractFloat32Sign( b
);
1319 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1320 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1324 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1326 -------------------------------------------------------------------------------
1327 Returns 1 if the single-precision floating-point value `a' is equal to
1328 the corresponding value `b', and 0 otherwise. The invalid exception is
1329 raised if either operand is a NaN. Otherwise, the comparison is performed
1330 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1331 -------------------------------------------------------------------------------
1333 flag
float32_eq_signaling( float32 a
, float32 b
)
1336 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1337 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1339 float_raise( float_flag_invalid
);
1342 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1347 -------------------------------------------------------------------------------
1348 Returns 1 if the single-precision floating-point value `a' is less than or
1349 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
1350 cause an exception. Otherwise, the comparison is performed according to the
1351 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1352 -------------------------------------------------------------------------------
1354 flag
float32_le_quiet( float32 a
, float32 b
)
1359 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1360 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1362 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1363 float_raise( float_flag_invalid
);
1367 aSign
= extractFloat32Sign( a
);
1368 bSign
= extractFloat32Sign( b
);
1369 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
1370 return ( a
== b
) || ( aSign
^ ( a
< b
) );
1375 -------------------------------------------------------------------------------
1376 Returns 1 if the single-precision floating-point value `a' is less than
1377 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
1378 exception. Otherwise, the comparison is performed according to the IEC/IEEE
1379 Standard for Binary Floating-Point Arithmetic.
1380 -------------------------------------------------------------------------------
1382 flag
float32_lt_quiet( float32 a
, float32 b
)
1386 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
1387 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
1389 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
1390 float_raise( float_flag_invalid
);
1394 aSign
= extractFloat32Sign( a
);
1395 bSign
= extractFloat32Sign( b
);
1396 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
1397 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
1400 #endif /* !SOFTFLOAT_FOR_GCC */
1402 #ifndef SOFTFLOAT_FOR_GCC /* Not needed */
1404 -------------------------------------------------------------------------------
1405 Returns the result of converting the double-precision floating-point value
1406 `a' to the 32-bit two's complement integer format. The conversion is
1407 performed according to the IEC/IEEE Standard for Binary Floating-Point
1408 Arithmetic---which means in particular that the conversion is rounded
1409 according to the current rounding mode. If `a' is a NaN, the largest
1410 positive integer is returned. Otherwise, if the conversion overflows, the
1411 largest integer with the same sign as `a' is returned.
1412 -------------------------------------------------------------------------------
1414 int32
float64_to_int32( float64 a
)
1417 int16 aExp
, shiftCount
;
1418 bits32 aSig0
, aSig1
, absZ
, aSigExtra
;
1422 aSig1
= extractFloat64Frac1( a
);
1423 aSig0
= extractFloat64Frac0( a
);
1424 aExp
= extractFloat64Exp( a
);
1425 aSign
= extractFloat64Sign( a
);
1426 shiftCount
= aExp
- 0x413;
1427 if ( 0 <= shiftCount
) {
1428 if ( 0x41E < aExp
) {
1429 if ( ( aExp
== 0x7FF ) && ( aSig0
| aSig1
) ) aSign
= 0;
1433 aSig0
| 0x00100000, aSig1
, shiftCount
, &absZ
, &aSigExtra
);
1434 if ( 0x80000000 < absZ
) goto invalid
;
1437 aSig1
= ( aSig1
!= 0 );
1438 if ( aExp
< 0x3FE ) {
1439 aSigExtra
= aExp
| aSig0
| aSig1
;
1443 aSig0
|= 0x00100000;
1444 aSigExtra
= ( aSig0
<<( shiftCount
& 31 ) ) | aSig1
;
1445 absZ
= aSig0
>>( - shiftCount
);
1448 roundingMode
= float_rounding_mode
;
1449 if ( roundingMode
== float_round_nearest_even
) {
1450 if ( (sbits32
) aSigExtra
< 0 ) {
1452 if ( (bits32
) ( aSigExtra
<<1 ) == 0 ) absZ
&= ~1;
1454 z
= aSign
? - absZ
: absZ
;
1457 aSigExtra
= ( aSigExtra
!= 0 );
1460 + ( ( roundingMode
== float_round_down
) & aSigExtra
) );
1463 z
= absZ
+ ( ( roundingMode
== float_round_up
) & aSigExtra
);
1466 if ( ( aSign
^ ( z
< 0 ) ) && z
) {
1468 float_raise( float_flag_invalid
);
1469 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
1471 if ( aSigExtra
) set_float_exception_inexact_flag();
1475 #endif /* !SOFTFLOAT_FOR_GCC */
1478 -------------------------------------------------------------------------------
1479 Returns the result of converting the double-precision floating-point value
1480 `a' to the 32-bit two's complement integer format. The conversion is
1481 performed according to the IEC/IEEE Standard for Binary Floating-Point
1482 Arithmetic, except that the conversion is always rounded toward zero.
1483 If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1484 the conversion overflows, the largest integer with the same sign as `a' is
1486 -------------------------------------------------------------------------------
1488 int32
float64_to_int32_round_to_zero( float64 a
)
1491 int16 aExp
, shiftCount
;
1492 bits32 aSig0
, aSig1
, absZ
, aSigExtra
;
1495 aSig1
= extractFloat64Frac1( a
);
1496 aSig0
= extractFloat64Frac0( a
);
1497 aExp
= extractFloat64Exp( a
);
1498 aSign
= extractFloat64Sign( a
);
1499 shiftCount
= aExp
- 0x413;
1500 if ( 0 <= shiftCount
) {
1501 if ( 0x41E < aExp
) {
1502 if ( ( aExp
== 0x7FF ) && ( aSig0
| aSig1
) ) aSign
= 0;
1506 aSig0
| 0x00100000, aSig1
, shiftCount
, &absZ
, &aSigExtra
);
1509 if ( aExp
< 0x3FF ) {
1510 if ( aExp
| aSig0
| aSig1
) {
1511 set_float_exception_inexact_flag();
1515 aSig0
|= 0x00100000;
1516 aSigExtra
= ( aSig0
<<( shiftCount
& 31 ) ) | aSig1
;
1517 absZ
= aSig0
>>( - shiftCount
);
1519 z
= aSign
? - absZ
: absZ
;
1520 if ( ( aSign
^ ( z
< 0 ) ) && z
) {
1522 float_raise( float_flag_invalid
);
1523 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
1525 if ( aSigExtra
) set_float_exception_inexact_flag();
1531 -------------------------------------------------------------------------------
1532 Returns the result of converting the double-precision floating-point value
1533 `a' to the single-precision floating-point format. The conversion is
1534 performed according to the IEC/IEEE Standard for Binary Floating-Point
1536 -------------------------------------------------------------------------------
1538 float32
float64_to_float32( float64 a
)
1542 bits32 aSig0
, aSig1
, zSig
;
1545 aSig1
= extractFloat64Frac1( a
);
1546 aSig0
= extractFloat64Frac0( a
);
1547 aExp
= extractFloat64Exp( a
);
1548 aSign
= extractFloat64Sign( a
);
1549 if ( aExp
== 0x7FF ) {
1550 if ( aSig0
| aSig1
) {
1551 return commonNaNToFloat32( float64ToCommonNaN( a
) );
1553 return packFloat32( aSign
, 0xFF, 0 );
1555 shift64RightJamming( aSig0
, aSig1
, 22, &allZero
, &zSig
);
1556 if ( aExp
) zSig
|= 0x40000000;
1557 return roundAndPackFloat32( aSign
, aExp
- 0x381, zSig
);
1561 #ifndef SOFTFLOAT_FOR_GCC
1563 -------------------------------------------------------------------------------
1564 Rounds the double-precision floating-point value `a' to an integer,
1565 and returns the result as a double-precision floating-point value. The
1566 operation is performed according to the IEC/IEEE Standard for Binary
1567 Floating-Point Arithmetic.
1568 -------------------------------------------------------------------------------
1570 float64
float64_round_to_int( float64 a
)
1574 bits32 lastBitMask
, roundBitsMask
;
1578 aExp
= extractFloat64Exp( a
);
1579 if ( 0x413 <= aExp
) {
1580 if ( 0x433 <= aExp
) {
1581 if ( ( aExp
== 0x7FF )
1582 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) ) {
1583 return propagateFloat64NaN( a
, a
);
1588 lastBitMask
= ( lastBitMask
<<( 0x432 - aExp
) )<<1;
1589 roundBitsMask
= lastBitMask
- 1;
1591 roundingMode
= float_rounding_mode
;
1592 if ( roundingMode
== float_round_nearest_even
) {
1593 if ( lastBitMask
) {
1594 add64( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
1595 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
1598 if ( (sbits32
) z
.low
< 0 ) {
1600 if ( (bits32
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
1604 else if ( roundingMode
!= float_round_to_zero
) {
1605 if ( extractFloat64Sign( z
)
1606 ^ ( roundingMode
== float_round_up
) ) {
1607 add64( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
1610 z
.low
&= ~ roundBitsMask
;
1613 if ( aExp
<= 0x3FE ) {
1614 if ( ( ( (bits32
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
1615 set_float_exception_inexact_flag();
1616 aSign
= extractFloat64Sign( a
);
1617 switch ( float_rounding_mode
) {
1618 case float_round_nearest_even
:
1619 if ( ( aExp
== 0x3FE )
1620 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) )
1622 return packFloat64( aSign
, 0x3FF, 0, 0 );
1625 case float_round_down
:
1627 aSign
? packFloat64( 1, 0x3FF, 0, 0 )
1628 : packFloat64( 0, 0, 0, 0 );
1629 case float_round_up
:
1631 aSign
? packFloat64( 1, 0, 0, 0 )
1632 : packFloat64( 0, 0x3FF, 0, 0 );
1634 return packFloat64( aSign
, 0, 0, 0 );
1637 lastBitMask
<<= 0x413 - aExp
;
1638 roundBitsMask
= lastBitMask
- 1;
1641 roundingMode
= float_rounding_mode
;
1642 if ( roundingMode
== float_round_nearest_even
) {
1643 z
.high
+= lastBitMask
>>1;
1644 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
1645 z
.high
&= ~ lastBitMask
;
1648 else if ( roundingMode
!= float_round_to_zero
) {
1649 if ( extractFloat64Sign( z
)
1650 ^ ( roundingMode
== float_round_up
) ) {
1651 z
.high
|= ( a
.low
!= 0 );
1652 z
.high
+= roundBitsMask
;
1655 z
.high
&= ~ roundBitsMask
;
1657 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
1658 set_float_exception_inexact_flag();
1666 -------------------------------------------------------------------------------
1667 Returns the result of adding the absolute values of the double-precision
1668 floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1669 before being returned. `zSign' is ignored if the result is a NaN.
1670 The addition is performed according to the IEC/IEEE Standard for Binary
1671 Floating-Point Arithmetic.
1672 -------------------------------------------------------------------------------
1674 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
1676 int16 aExp
, bExp
, zExp
;
1677 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
1680 aSig1
= extractFloat64Frac1( a
);
1681 aSig0
= extractFloat64Frac0( a
);
1682 aExp
= extractFloat64Exp( a
);
1683 bSig1
= extractFloat64Frac1( b
);
1684 bSig0
= extractFloat64Frac0( b
);
1685 bExp
= extractFloat64Exp( b
);
1686 expDiff
= aExp
- bExp
;
1687 if ( 0 < expDiff
) {
1688 if ( aExp
== 0x7FF ) {
1689 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, b
);
1696 bSig0
|= 0x00100000;
1698 shift64ExtraRightJamming(
1699 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
1702 else if ( expDiff
< 0 ) {
1703 if ( bExp
== 0x7FF ) {
1704 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1705 return packFloat64( zSign
, 0x7FF, 0, 0 );
1711 aSig0
|= 0x00100000;
1713 shift64ExtraRightJamming(
1714 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
1718 if ( aExp
== 0x7FF ) {
1719 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
1720 return propagateFloat64NaN( a
, b
);
1724 add64( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
1725 if ( aExp
== 0 ) return packFloat64( zSign
, 0, zSig0
, zSig1
);
1727 zSig0
|= 0x00200000;
1731 aSig0
|= 0x00100000;
1732 add64( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
1734 if ( zSig0
< 0x00200000 ) goto roundAndPack
;
1737 shift64ExtraRightJamming( zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
1739 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1744 -------------------------------------------------------------------------------
1745 Returns the result of subtracting the absolute values of the double-
1746 precision floating-point values `a' and `b'. If `zSign' is 1, the
1747 difference is negated before being returned. `zSign' is ignored if the
1748 result is a NaN. The subtraction is performed according to the IEC/IEEE
1749 Standard for Binary Floating-Point Arithmetic.
1750 -------------------------------------------------------------------------------
1752 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
1754 int16 aExp
, bExp
, zExp
;
1755 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
1758 aSig1
= extractFloat64Frac1( a
);
1759 aSig0
= extractFloat64Frac0( a
);
1760 aExp
= extractFloat64Exp( a
);
1761 bSig1
= extractFloat64Frac1( b
);
1762 bSig0
= extractFloat64Frac0( b
);
1763 bExp
= extractFloat64Exp( b
);
1764 expDiff
= aExp
- bExp
;
1765 shortShift64Left( aSig0
, aSig1
, 10, &aSig0
, &aSig1
);
1766 shortShift64Left( bSig0
, bSig1
, 10, &bSig0
, &bSig1
);
1767 if ( 0 < expDiff
) goto aExpBigger
;
1768 if ( expDiff
< 0 ) goto bExpBigger
;
1769 if ( aExp
== 0x7FF ) {
1770 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
1771 return propagateFloat64NaN( a
, b
);
1773 float_raise( float_flag_invalid
);
1774 return float64_default_nan
;
1780 if ( bSig0
< aSig0
) goto aBigger
;
1781 if ( aSig0
< bSig0
) goto bBigger
;
1782 if ( bSig1
< aSig1
) goto aBigger
;
1783 if ( aSig1
< bSig1
) goto bBigger
;
1784 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0, 0 );
1786 if ( bExp
== 0x7FF ) {
1787 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1788 return packFloat64( zSign
^ 1, 0x7FF, 0, 0 );
1794 aSig0
|= 0x40000000;
1796 shift64RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
1797 bSig0
|= 0x40000000;
1799 sub64( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
1802 goto normalizeRoundAndPack
;
1804 if ( aExp
== 0x7FF ) {
1805 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, b
);
1812 bSig0
|= 0x40000000;
1814 shift64RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
1815 aSig0
|= 0x40000000;
1817 sub64( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
1819 normalizeRoundAndPack
:
1821 return normalizeRoundAndPackFloat64( zSign
, zExp
- 10, zSig0
, zSig1
);
1826 -------------------------------------------------------------------------------
1827 Returns the result of adding the double-precision floating-point values `a'
1828 and `b'. The operation is performed according to the IEC/IEEE Standard for
1829 Binary Floating-Point Arithmetic.
1830 -------------------------------------------------------------------------------
1832 float64
float64_add( float64 a
, float64 b
)
1836 aSign
= extractFloat64Sign( a
);
1837 bSign
= extractFloat64Sign( b
);
1838 if ( aSign
== bSign
) {
1839 return addFloat64Sigs( a
, b
, aSign
);
1842 return subFloat64Sigs( a
, b
, aSign
);
1848 -------------------------------------------------------------------------------
1849 Returns the result of subtracting the double-precision floating-point values
1850 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1851 for Binary Floating-Point Arithmetic.
1852 -------------------------------------------------------------------------------
1854 float64
float64_sub( float64 a
, float64 b
)
1858 aSign
= extractFloat64Sign( a
);
1859 bSign
= extractFloat64Sign( b
);
1860 if ( aSign
== bSign
) {
1861 return subFloat64Sigs( a
, b
, aSign
);
1864 return addFloat64Sigs( a
, b
, aSign
);
1870 -------------------------------------------------------------------------------
1871 Returns the result of multiplying the double-precision floating-point values
1872 `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1873 for Binary Floating-Point Arithmetic.
1874 -------------------------------------------------------------------------------
1876 float64
float64_mul( float64 a
, float64 b
)
1878 flag aSign
, bSign
, zSign
;
1879 int16 aExp
, bExp
, zExp
;
1880 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
1882 aSig1
= extractFloat64Frac1( a
);
1883 aSig0
= extractFloat64Frac0( a
);
1884 aExp
= extractFloat64Exp( a
);
1885 aSign
= extractFloat64Sign( a
);
1886 bSig1
= extractFloat64Frac1( b
);
1887 bSig0
= extractFloat64Frac0( b
);
1888 bExp
= extractFloat64Exp( b
);
1889 bSign
= extractFloat64Sign( b
);
1890 zSign
= aSign
^ bSign
;
1891 if ( aExp
== 0x7FF ) {
1892 if ( ( aSig0
| aSig1
)
1893 || ( ( bExp
== 0x7FF ) && ( bSig0
| bSig1
) ) ) {
1894 return propagateFloat64NaN( a
, b
);
1896 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
1897 return packFloat64( zSign
, 0x7FF, 0, 0 );
1899 if ( bExp
== 0x7FF ) {
1900 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1901 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
1903 float_raise( float_flag_invalid
);
1904 return float64_default_nan
;
1906 return packFloat64( zSign
, 0x7FF, 0, 0 );
1909 if ( ( aSig0
| aSig1
) == 0 ) return packFloat64( zSign
, 0, 0, 0 );
1910 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
1913 if ( ( bSig0
| bSig1
) == 0 ) return packFloat64( zSign
, 0, 0, 0 );
1914 normalizeFloat64Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
1916 zExp
= aExp
+ bExp
- 0x400;
1917 aSig0
|= 0x00100000;
1918 shortShift64Left( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
1919 mul64To128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
1920 add64( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
1921 zSig2
|= ( zSig3
!= 0 );
1922 if ( 0x00200000 <= zSig0
) {
1923 shift64ExtraRightJamming(
1924 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
1927 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1932 -------------------------------------------------------------------------------
1933 Returns the result of dividing the double-precision floating-point value `a'
1934 by the corresponding value `b'. The operation is performed according to the
1935 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1936 -------------------------------------------------------------------------------
1938 float64
float64_div( float64 a
, float64 b
)
1940 flag aSign
, bSign
, zSign
;
1941 int16 aExp
, bExp
, zExp
;
1942 bits32 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
1943 bits32 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
1945 aSig1
= extractFloat64Frac1( a
);
1946 aSig0
= extractFloat64Frac0( a
);
1947 aExp
= extractFloat64Exp( a
);
1948 aSign
= extractFloat64Sign( a
);
1949 bSig1
= extractFloat64Frac1( b
);
1950 bSig0
= extractFloat64Frac0( b
);
1951 bExp
= extractFloat64Exp( b
);
1952 bSign
= extractFloat64Sign( b
);
1953 zSign
= aSign
^ bSign
;
1954 if ( aExp
== 0x7FF ) {
1955 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, b
);
1956 if ( bExp
== 0x7FF ) {
1957 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1960 return packFloat64( zSign
, 0x7FF, 0, 0 );
1962 if ( bExp
== 0x7FF ) {
1963 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
1964 return packFloat64( zSign
, 0, 0, 0 );
1967 if ( ( bSig0
| bSig1
) == 0 ) {
1968 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
1970 float_raise( float_flag_invalid
);
1971 return float64_default_nan
;
1973 float_raise( float_flag_divbyzero
);
1974 return packFloat64( zSign
, 0x7FF, 0, 0 );
1976 normalizeFloat64Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
1979 if ( ( aSig0
| aSig1
) == 0 ) return packFloat64( zSign
, 0, 0, 0 );
1980 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
1982 zExp
= aExp
- bExp
+ 0x3FD;
1983 shortShift64Left( aSig0
| 0x00100000, aSig1
, 11, &aSig0
, &aSig1
);
1984 shortShift64Left( bSig0
| 0x00100000, bSig1
, 11, &bSig0
, &bSig1
);
1985 if ( le64( bSig0
, bSig1
, aSig0
, aSig1
) ) {
1986 shift64Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
1989 zSig0
= estimateDiv64To32( aSig0
, aSig1
, bSig0
);
1990 mul64By32To96( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
1991 sub96( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
1992 while ( (sbits32
) rem0
< 0 ) {
1994 add96( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
1996 zSig1
= estimateDiv64To32( rem1
, rem2
, bSig0
);
1997 if ( ( zSig1
& 0x3FF ) <= 4 ) {
1998 mul64By32To96( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
1999 sub96( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
2000 while ( (sbits32
) rem1
< 0 ) {
2002 add96( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
2004 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
2006 shift64ExtraRightJamming( zSig0
, zSig1
, 0, 11, &zSig0
, &zSig1
, &zSig2
);
2007 return roundAndPackFloat64( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
2011 #ifndef SOFTFLOAT_FOR_GCC
2013 -------------------------------------------------------------------------------
2014 Returns the remainder of the double-precision floating-point value `a'
2015 with respect to the corresponding value `b'. The operation is performed
2016 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2017 -------------------------------------------------------------------------------
2019 float64
float64_rem( float64 a
, float64 b
)
2021 flag aSign
, bSign
, zSign
;
2022 int16 aExp
, bExp
, expDiff
;
2023 bits32 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
2024 bits32 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
2028 aSig1
= extractFloat64Frac1( a
);
2029 aSig0
= extractFloat64Frac0( a
);
2030 aExp
= extractFloat64Exp( a
);
2031 aSign
= extractFloat64Sign( a
);
2032 bSig1
= extractFloat64Frac1( b
);
2033 bSig0
= extractFloat64Frac0( b
);
2034 bExp
= extractFloat64Exp( b
);
2035 bSign
= extractFloat64Sign( b
);
2036 if ( aExp
== 0x7FF ) {
2037 if ( ( aSig0
| aSig1
)
2038 || ( ( bExp
== 0x7FF ) && ( bSig0
| bSig1
) ) ) {
2039 return propagateFloat64NaN( a
, b
);
2043 if ( bExp
== 0x7FF ) {
2044 if ( bSig0
| bSig1
) return propagateFloat64NaN( a
, b
);
2048 if ( ( bSig0
| bSig1
) == 0 ) {
2050 float_raise( float_flag_invalid
);
2051 return float64_default_nan
;
2053 normalizeFloat64Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
2056 if ( ( aSig0
| aSig1
) == 0 ) return a
;
2057 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2059 expDiff
= aExp
- bExp
;
2060 if ( expDiff
< -1 ) return a
;
2062 aSig0
| 0x00100000, aSig1
, 11 - ( expDiff
< 0 ), &aSig0
, &aSig1
);
2063 shortShift64Left( bSig0
| 0x00100000, bSig1
, 11, &bSig0
, &bSig1
);
2064 q
= le64( bSig0
, bSig1
, aSig0
, aSig1
);
2065 if ( q
) sub64( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
2067 while ( 0 < expDiff
) {
2068 q
= estimateDiv64To32( aSig0
, aSig1
, bSig0
);
2069 q
= ( 4 < q
) ? q
- 4 : 0;
2070 mul64By32To96( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
2071 shortShift96Left( term0
, term1
, term2
, 29, &term1
, &term2
, &allZero
);
2072 shortShift64Left( aSig0
, aSig1
, 29, &aSig0
, &allZero
);
2073 sub64( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
2076 if ( -32 < expDiff
) {
2077 q
= estimateDiv64To32( aSig0
, aSig1
, bSig0
);
2078 q
= ( 4 < q
) ? q
- 4 : 0;
2080 shift64Right( bSig0
, bSig1
, 8, &bSig0
, &bSig1
);
2082 if ( expDiff
< 0 ) {
2083 shift64Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
2086 shortShift64Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
2088 mul64By32To96( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
2089 sub64( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
2092 shift64Right( aSig0
, aSig1
, 8, &aSig0
, &aSig1
);
2093 shift64Right( bSig0
, bSig1
, 8, &bSig0
, &bSig1
);
2096 alternateASig0
= aSig0
;
2097 alternateASig1
= aSig1
;
2099 sub64( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
2100 } while ( 0 <= (sbits32
) aSig0
);
2102 aSig0
, aSig1
, alternateASig0
, alternateASig1
, &sigMean0
, &sigMean1
);
2103 if ( ( sigMean0
< 0 )
2104 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
2105 aSig0
= alternateASig0
;
2106 aSig1
= alternateASig1
;
2108 zSign
= ( (sbits32
) aSig0
< 0 );
2109 if ( zSign
) sub64( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
2111 normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
2116 #ifndef SOFTFLOAT_FOR_GCC
2118 -------------------------------------------------------------------------------
2119 Returns the square root of the double-precision floating-point value `a'.
2120 The operation is performed according to the IEC/IEEE Standard for Binary
2121 Floating-Point Arithmetic.
2122 -------------------------------------------------------------------------------
2124 float64
float64_sqrt( float64 a
)
2128 bits32 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
2129 bits32 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
2132 aSig1
= extractFloat64Frac1( a
);
2133 aSig0
= extractFloat64Frac0( a
);
2134 aExp
= extractFloat64Exp( a
);
2135 aSign
= extractFloat64Sign( a
);
2136 if ( aExp
== 0x7FF ) {
2137 if ( aSig0
| aSig1
) return propagateFloat64NaN( a
, a
);
2138 if ( ! aSign
) return a
;
2142 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
2144 float_raise( float_flag_invalid
);
2145 return float64_default_nan
;
2148 if ( ( aSig0
| aSig1
) == 0 ) return packFloat64( 0, 0, 0, 0 );
2149 normalizeFloat64Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2151 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2152 aSig0
|= 0x00100000;
2153 shortShift64Left( aSig0
, aSig1
, 11, &term0
, &term1
);
2154 zSig0
= ( estimateSqrt32( aExp
, term0
)>>1 ) + 1;
2155 if ( zSig0
== 0 ) zSig0
= 0x7FFFFFFF;
2156 doubleZSig0
= zSig0
+ zSig0
;
2157 shortShift64Left( aSig0
, aSig1
, 9 - ( aExp
& 1 ), &aSig0
, &aSig1
);
2158 mul32To64( zSig0
, zSig0
, &term0
, &term1
);
2159 sub64( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
2160 while ( (sbits32
) rem0
< 0 ) {
2163 add64( rem0
, rem1
, 0, doubleZSig0
| 1, &rem0
, &rem1
);
2165 zSig1
= estimateDiv64To32( rem1
, 0, doubleZSig0
);
2166 if ( ( zSig1
& 0x1FF ) <= 5 ) {
2167 if ( zSig1
== 0 ) zSig1
= 1;
2168 mul32To64( doubleZSig0
, zSig1
, &term1
, &term2
);
2169 sub64( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
2170 mul32To64( zSig1
, zSig1
, &term2
, &term3
);
2171 sub96( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
2172 while ( (sbits32
) rem1
< 0 ) {
2174 shortShift64Left( 0, zSig1
, 1, &term2
, &term3
);
2176 term2
|= doubleZSig0
;
2177 add96( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
2179 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
2181 shift64ExtraRightJamming( zSig0
, zSig1
, 0, 10, &zSig0
, &zSig1
, &zSig2
);
2182 return roundAndPackFloat64( 0, zExp
, zSig0
, zSig1
, zSig2
);
2188 -------------------------------------------------------------------------------
2189 Returns 1 if the double-precision floating-point value `a' is equal to
2190 the corresponding value `b', and 0 otherwise. The comparison is performed
2191 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2192 -------------------------------------------------------------------------------
2194 flag
float64_eq( float64 a
, float64 b
)
2197 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2198 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2199 || ( ( extractFloat64Exp( b
) == 0x7FF )
2200 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2202 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2203 float_raise( float_flag_invalid
);
2207 return ( a
== b
) ||
2208 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) == 0 );
2213 -------------------------------------------------------------------------------
2214 Returns 1 if the double-precision floating-point value `a' is less than
2215 or equal to the corresponding value `b', and 0 otherwise. The comparison
2216 is performed according to the IEC/IEEE Standard for Binary Floating-Point
2218 -------------------------------------------------------------------------------
2220 flag
float64_le( float64 a
, float64 b
)
2224 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2225 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2226 || ( ( extractFloat64Exp( b
) == 0x7FF )
2227 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2229 float_raise( float_flag_invalid
);
2232 aSign
= extractFloat64Sign( a
);
2233 bSign
= extractFloat64Sign( b
);
2234 if ( aSign
!= bSign
)
2236 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) ==
2238 return ( a
== b
) ||
2239 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
2243 -------------------------------------------------------------------------------
2244 Returns 1 if the double-precision floating-point value `a' is less than
2245 the corresponding value `b', and 0 otherwise. The comparison is performed
2246 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2247 -------------------------------------------------------------------------------
2249 flag
float64_lt( float64 a
, float64 b
)
2253 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2254 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2255 || ( ( extractFloat64Exp( b
) == 0x7FF )
2256 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2258 float_raise( float_flag_invalid
);
2261 aSign
= extractFloat64Sign( a
);
2262 bSign
= extractFloat64Sign( b
);
2263 if ( aSign
!= bSign
)
2265 ( (bits64
) ( ( FLOAT64_DEMANGLE(a
) | FLOAT64_DEMANGLE(b
) )<<1 ) !=
2267 return ( a
!= b
) &&
2268 ( aSign
^ ( FLOAT64_DEMANGLE(a
) < FLOAT64_DEMANGLE(b
) ) );
2272 #ifndef SOFTFLOAT_FOR_GCC
2274 -------------------------------------------------------------------------------
2275 Returns 1 if the double-precision floating-point value `a' is equal to
2276 the corresponding value `b', and 0 otherwise. The invalid exception is
2277 raised if either operand is a NaN. Otherwise, the comparison is performed
2278 according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2279 -------------------------------------------------------------------------------
2281 flag
float64_eq_signaling( float64 a
, float64 b
)
2284 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2285 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2286 || ( ( extractFloat64Exp( b
) == 0x7FF )
2287 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2289 float_raise( float_flag_invalid
);
2292 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2297 -------------------------------------------------------------------------------
2298 Returns 1 if the double-precision floating-point value `a' is less than or
2299 equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2300 cause an exception. Otherwise, the comparison is performed according to the
2301 IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2302 -------------------------------------------------------------------------------
2304 flag
float64_le_quiet( float64 a
, float64 b
)
2308 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2309 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2310 || ( ( extractFloat64Exp( b
) == 0x7FF )
2311 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2313 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2314 float_raise( float_flag_invalid
);
2318 aSign
= extractFloat64Sign( a
);
2319 bSign
= extractFloat64Sign( b
);
2320 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2321 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2326 -------------------------------------------------------------------------------
2327 Returns 1 if the double-precision floating-point value `a' is less than
2328 the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2329 exception. Otherwise, the comparison is performed according to the IEC/IEEE
2330 Standard for Binary Floating-Point Arithmetic.
2331 -------------------------------------------------------------------------------
2333 flag
float64_lt_quiet( float64 a
, float64 b
)
2337 if ( ( ( extractFloat64Exp( a
) == 0x7FF )
2338 && ( extractFloat64Frac0( a
) | extractFloat64Frac1( a
) ) )
2339 || ( ( extractFloat64Exp( b
) == 0x7FF )
2340 && ( extractFloat64Frac0( b
) | extractFloat64Frac1( b
) ) )
2342 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2343 float_raise( float_flag_invalid
);
2347 aSign
= extractFloat64Sign( a
);
2348 bSign
= extractFloat64Sign( b
);
2349 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2350 return ( a
!= b
) && ( aSign
^ ( a
< b
) );