2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM
)
54 STATUS(float_rounding_mode
) = val
;
57 void set_float_exception_flags(int val STATUS_PARAM
)
59 STATUS(float_exception_flags
) = val
;
63 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
65 STATUS(floatx80_rounding_precision
) = val
;
69 /*----------------------------------------------------------------------------
70 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
71 | and 7, and returns the properly rounded 32-bit integer corresponding to the
72 | input. If `zSign' is 1, the input is negated before being converted to an
73 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
74 | is simply rounded to an integer, with the inexact exception raised if the
75 | input cannot be represented exactly as an integer. However, if the fixed-
76 | point input is too large, the invalid exception is raised and the largest
77 | positive or negative integer is returned.
78 *----------------------------------------------------------------------------*/
80 static int32
roundAndPackInt32( flag zSign
, bits64 absZ STATUS_PARAM
)
83 flag roundNearestEven
;
84 int8 roundIncrement
, roundBits
;
87 roundingMode
= STATUS(float_rounding_mode
);
88 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
89 roundIncrement
= 0x40;
90 if ( ! roundNearestEven
) {
91 if ( roundingMode
== float_round_to_zero
) {
95 roundIncrement
= 0x7F;
97 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
100 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
104 roundBits
= absZ
& 0x7F;
105 absZ
= ( absZ
+ roundIncrement
)>>7;
106 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
108 if ( zSign
) z
= - z
;
109 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
110 float_raise( float_flag_invalid STATUS_VAR
);
111 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
113 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
118 /*----------------------------------------------------------------------------
119 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
120 | `absZ1', with binary point between bits 63 and 64 (between the input words),
121 | and returns the properly rounded 64-bit integer corresponding to the input.
122 | If `zSign' is 1, the input is negated before being converted to an integer.
123 | Ordinarily, the fixed-point input is simply rounded to an integer, with
124 | the inexact exception raised if the input cannot be represented exactly as
125 | an integer. However, if the fixed-point input is too large, the invalid
126 | exception is raised and the largest positive or negative integer is
128 *----------------------------------------------------------------------------*/
130 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1 STATUS_PARAM
)
133 flag roundNearestEven
, increment
;
136 roundingMode
= STATUS(float_rounding_mode
);
137 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
138 increment
= ( (sbits64
) absZ1
< 0 );
139 if ( ! roundNearestEven
) {
140 if ( roundingMode
== float_round_to_zero
) {
145 increment
= ( roundingMode
== float_round_down
) && absZ1
;
148 increment
= ( roundingMode
== float_round_up
) && absZ1
;
154 if ( absZ0
== 0 ) goto overflow
;
155 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
158 if ( zSign
) z
= - z
;
159 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
161 float_raise( float_flag_invalid STATUS_VAR
);
163 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
164 : LIT64( 0x7FFFFFFFFFFFFFFF );
166 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
171 /*----------------------------------------------------------------------------
172 | Returns the fraction bits of the single-precision floating-point value `a'.
173 *----------------------------------------------------------------------------*/
175 INLINE bits32
extractFloat32Frac( float32 a
)
178 return a
& 0x007FFFFF;
182 /*----------------------------------------------------------------------------
183 | Returns the exponent bits of the single-precision floating-point value `a'.
184 *----------------------------------------------------------------------------*/
186 INLINE int16
extractFloat32Exp( float32 a
)
189 return ( a
>>23 ) & 0xFF;
193 /*----------------------------------------------------------------------------
194 | Returns the sign bit of the single-precision floating-point value `a'.
195 *----------------------------------------------------------------------------*/
197 INLINE flag
extractFloat32Sign( float32 a
)
204 /*----------------------------------------------------------------------------
205 | Normalizes the subnormal single-precision floating-point value represented
206 | by the denormalized significand `aSig'. The normalized exponent and
207 | significand are stored at the locations pointed to by `zExpPtr' and
208 | `zSigPtr', respectively.
209 *----------------------------------------------------------------------------*/
212 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
216 shiftCount
= countLeadingZeros32( aSig
) - 8;
217 *zSigPtr
= aSig
<<shiftCount
;
218 *zExpPtr
= 1 - shiftCount
;
222 /*----------------------------------------------------------------------------
223 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
224 | single-precision floating-point value, returning the result. After being
225 | shifted into the proper positions, the three fields are simply added
226 | together to form the result. This means that any integer portion of `zSig'
227 | will be added into the exponent. Since a properly normalized significand
228 | will have an integer portion equal to 1, the `zExp' input should be 1 less
229 | than the desired result exponent whenever `zSig' is a complete, normalized
231 *----------------------------------------------------------------------------*/
233 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
236 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
240 /*----------------------------------------------------------------------------
241 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
242 | and significand `zSig', and returns the proper single-precision floating-
243 | point value corresponding to the abstract input. Ordinarily, the abstract
244 | value is simply rounded and packed into the single-precision format, with
245 | the inexact exception raised if the abstract input cannot be represented
246 | exactly. However, if the abstract value is too large, the overflow and
247 | inexact exceptions are raised and an infinity or maximal finite value is
248 | returned. If the abstract value is too small, the input value is rounded to
249 | a subnormal number, and the underflow and inexact exceptions are raised if
250 | the abstract input cannot be represented exactly as a subnormal single-
251 | precision floating-point number.
252 | The input significand `zSig' has its binary point between bits 30
253 | and 29, which is 7 bits to the left of the usual location. This shifted
254 | significand must be normalized or smaller. If `zSig' is not normalized,
255 | `zExp' must be 0; in that case, the result returned is a subnormal number,
256 | and it must not require rounding. In the usual case that `zSig' is
257 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
258 | The handling of underflow and overflow follows the IEC/IEEE Standard for
259 | Binary Floating-Point Arithmetic.
260 *----------------------------------------------------------------------------*/
262 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
265 flag roundNearestEven
;
266 int8 roundIncrement
, roundBits
;
269 roundingMode
= STATUS(float_rounding_mode
);
270 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
271 roundIncrement
= 0x40;
272 if ( ! roundNearestEven
) {
273 if ( roundingMode
== float_round_to_zero
) {
277 roundIncrement
= 0x7F;
279 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
282 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
286 roundBits
= zSig
& 0x7F;
287 if ( 0xFD <= (bits16
) zExp
) {
289 || ( ( zExp
== 0xFD )
290 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
292 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
293 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
297 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
299 || ( zSig
+ roundIncrement
< 0x80000000 );
300 shift32RightJamming( zSig
, - zExp
, &zSig
);
302 roundBits
= zSig
& 0x7F;
303 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
306 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
307 zSig
= ( zSig
+ roundIncrement
)>>7;
308 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
309 if ( zSig
== 0 ) zExp
= 0;
310 return packFloat32( zSign
, zExp
, zSig
);
314 /*----------------------------------------------------------------------------
315 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
316 | and significand `zSig', and returns the proper single-precision floating-
317 | point value corresponding to the abstract input. This routine is just like
318 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
319 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
320 | floating-point exponent.
321 *----------------------------------------------------------------------------*/
324 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
328 shiftCount
= countLeadingZeros32( zSig
) - 1;
329 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
333 /*----------------------------------------------------------------------------
334 | Returns the fraction bits of the double-precision floating-point value `a'.
335 *----------------------------------------------------------------------------*/
337 INLINE bits64
extractFloat64Frac( float64 a
)
340 return a
& LIT64( 0x000FFFFFFFFFFFFF );
344 /*----------------------------------------------------------------------------
345 | Returns the exponent bits of the double-precision floating-point value `a'.
346 *----------------------------------------------------------------------------*/
348 INLINE int16
extractFloat64Exp( float64 a
)
351 return ( a
>>52 ) & 0x7FF;
355 /*----------------------------------------------------------------------------
356 | Returns the sign bit of the double-precision floating-point value `a'.
357 *----------------------------------------------------------------------------*/
359 INLINE flag
extractFloat64Sign( float64 a
)
366 /*----------------------------------------------------------------------------
367 | Normalizes the subnormal double-precision floating-point value represented
368 | by the denormalized significand `aSig'. The normalized exponent and
369 | significand are stored at the locations pointed to by `zExpPtr' and
370 | `zSigPtr', respectively.
371 *----------------------------------------------------------------------------*/
374 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
378 shiftCount
= countLeadingZeros64( aSig
) - 11;
379 *zSigPtr
= aSig
<<shiftCount
;
380 *zExpPtr
= 1 - shiftCount
;
384 /*----------------------------------------------------------------------------
385 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
386 | double-precision floating-point value, returning the result. After being
387 | shifted into the proper positions, the three fields are simply added
388 | together to form the result. This means that any integer portion of `zSig'
389 | will be added into the exponent. Since a properly normalized significand
390 | will have an integer portion equal to 1, the `zExp' input should be 1 less
391 | than the desired result exponent whenever `zSig' is a complete, normalized
393 *----------------------------------------------------------------------------*/
395 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
398 return ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
;
402 /*----------------------------------------------------------------------------
403 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
404 | and significand `zSig', and returns the proper double-precision floating-
405 | point value corresponding to the abstract input. Ordinarily, the abstract
406 | value is simply rounded and packed into the double-precision format, with
407 | the inexact exception raised if the abstract input cannot be represented
408 | exactly. However, if the abstract value is too large, the overflow and
409 | inexact exceptions are raised and an infinity or maximal finite value is
410 | returned. If the abstract value is too small, the input value is rounded
411 | to a subnormal number, and the underflow and inexact exceptions are raised
412 | if the abstract input cannot be represented exactly as a subnormal double-
413 | precision floating-point number.
414 | The input significand `zSig' has its binary point between bits 62
415 | and 61, which is 10 bits to the left of the usual location. This shifted
416 | significand must be normalized or smaller. If `zSig' is not normalized,
417 | `zExp' must be 0; in that case, the result returned is a subnormal number,
418 | and it must not require rounding. In the usual case that `zSig' is
419 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
420 | The handling of underflow and overflow follows the IEC/IEEE Standard for
421 | Binary Floating-Point Arithmetic.
422 *----------------------------------------------------------------------------*/
424 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
427 flag roundNearestEven
;
428 int16 roundIncrement
, roundBits
;
431 roundingMode
= STATUS(float_rounding_mode
);
432 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
433 roundIncrement
= 0x200;
434 if ( ! roundNearestEven
) {
435 if ( roundingMode
== float_round_to_zero
) {
439 roundIncrement
= 0x3FF;
441 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
444 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
448 roundBits
= zSig
& 0x3FF;
449 if ( 0x7FD <= (bits16
) zExp
) {
450 if ( ( 0x7FD < zExp
)
451 || ( ( zExp
== 0x7FD )
452 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
454 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
455 return packFloat64( zSign
, 0x7FF, 0 ) - ( roundIncrement
== 0 );
459 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
461 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
462 shift64RightJamming( zSig
, - zExp
, &zSig
);
464 roundBits
= zSig
& 0x3FF;
465 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
468 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
469 zSig
= ( zSig
+ roundIncrement
)>>10;
470 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
471 if ( zSig
== 0 ) zExp
= 0;
472 return packFloat64( zSign
, zExp
, zSig
);
476 /*----------------------------------------------------------------------------
477 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
478 | and significand `zSig', and returns the proper double-precision floating-
479 | point value corresponding to the abstract input. This routine is just like
480 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
481 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
482 | floating-point exponent.
483 *----------------------------------------------------------------------------*/
486 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
490 shiftCount
= countLeadingZeros64( zSig
) - 1;
491 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
497 /*----------------------------------------------------------------------------
498 | Returns the fraction bits of the extended double-precision floating-point
500 *----------------------------------------------------------------------------*/
502 INLINE bits64
extractFloatx80Frac( floatx80 a
)
509 /*----------------------------------------------------------------------------
510 | Returns the exponent bits of the extended double-precision floating-point
512 *----------------------------------------------------------------------------*/
514 INLINE int32
extractFloatx80Exp( floatx80 a
)
517 return a
.high
& 0x7FFF;
521 /*----------------------------------------------------------------------------
522 | Returns the sign bit of the extended double-precision floating-point value
524 *----------------------------------------------------------------------------*/
526 INLINE flag
extractFloatx80Sign( floatx80 a
)
533 /*----------------------------------------------------------------------------
534 | Normalizes the subnormal extended double-precision floating-point value
535 | represented by the denormalized significand `aSig'. The normalized exponent
536 | and significand are stored at the locations pointed to by `zExpPtr' and
537 | `zSigPtr', respectively.
538 *----------------------------------------------------------------------------*/
541 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
545 shiftCount
= countLeadingZeros64( aSig
);
546 *zSigPtr
= aSig
<<shiftCount
;
547 *zExpPtr
= 1 - shiftCount
;
551 /*----------------------------------------------------------------------------
552 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
553 | extended double-precision floating-point value, returning the result.
554 *----------------------------------------------------------------------------*/
556 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
561 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
566 /*----------------------------------------------------------------------------
567 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
568 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
569 | and returns the proper extended double-precision floating-point value
570 | corresponding to the abstract input. Ordinarily, the abstract value is
571 | rounded and packed into the extended double-precision format, with the
572 | inexact exception raised if the abstract input cannot be represented
573 | exactly. However, if the abstract value is too large, the overflow and
574 | inexact exceptions are raised and an infinity or maximal finite value is
575 | returned. If the abstract value is too small, the input value is rounded to
576 | a subnormal number, and the underflow and inexact exceptions are raised if
577 | the abstract input cannot be represented exactly as a subnormal extended
578 | double-precision floating-point number.
579 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
580 | number of bits as single or double precision, respectively. Otherwise, the
581 | result is rounded to the full precision of the extended double-precision
583 | The input significand must be normalized or smaller. If the input
584 | significand is not normalized, `zExp' must be 0; in that case, the result
585 | returned is a subnormal number, and it must not require rounding. The
586 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
587 | Floating-Point Arithmetic.
588 *----------------------------------------------------------------------------*/
591 roundAndPackFloatx80(
592 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
596 flag roundNearestEven
, increment
, isTiny
;
597 int64 roundIncrement
, roundMask
, roundBits
;
599 roundingMode
= STATUS(float_rounding_mode
);
600 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
601 if ( roundingPrecision
== 80 ) goto precision80
;
602 if ( roundingPrecision
== 64 ) {
603 roundIncrement
= LIT64( 0x0000000000000400 );
604 roundMask
= LIT64( 0x00000000000007FF );
606 else if ( roundingPrecision
== 32 ) {
607 roundIncrement
= LIT64( 0x0000008000000000 );
608 roundMask
= LIT64( 0x000000FFFFFFFFFF );
613 zSig0
|= ( zSig1
!= 0 );
614 if ( ! roundNearestEven
) {
615 if ( roundingMode
== float_round_to_zero
) {
619 roundIncrement
= roundMask
;
621 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
624 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
628 roundBits
= zSig0
& roundMask
;
629 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
630 if ( ( 0x7FFE < zExp
)
631 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
637 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
639 || ( zSig0
<= zSig0
+ roundIncrement
);
640 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
642 roundBits
= zSig0
& roundMask
;
643 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
644 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
645 zSig0
+= roundIncrement
;
646 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
647 roundIncrement
= roundMask
+ 1;
648 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
649 roundMask
|= roundIncrement
;
651 zSig0
&= ~ roundMask
;
652 return packFloatx80( zSign
, zExp
, zSig0
);
655 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
656 zSig0
+= roundIncrement
;
657 if ( zSig0
< roundIncrement
) {
659 zSig0
= LIT64( 0x8000000000000000 );
661 roundIncrement
= roundMask
+ 1;
662 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
663 roundMask
|= roundIncrement
;
665 zSig0
&= ~ roundMask
;
666 if ( zSig0
== 0 ) zExp
= 0;
667 return packFloatx80( zSign
, zExp
, zSig0
);
669 increment
= ( (sbits64
) zSig1
< 0 );
670 if ( ! roundNearestEven
) {
671 if ( roundingMode
== float_round_to_zero
) {
676 increment
= ( roundingMode
== float_round_down
) && zSig1
;
679 increment
= ( roundingMode
== float_round_up
) && zSig1
;
683 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
684 if ( ( 0x7FFE < zExp
)
685 || ( ( zExp
== 0x7FFE )
686 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
692 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
693 if ( ( roundingMode
== float_round_to_zero
)
694 || ( zSign
&& ( roundingMode
== float_round_up
) )
695 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
697 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
699 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
703 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
706 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
707 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
709 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
710 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
711 if ( roundNearestEven
) {
712 increment
= ( (sbits64
) zSig1
< 0 );
716 increment
= ( roundingMode
== float_round_down
) && zSig1
;
719 increment
= ( roundingMode
== float_round_up
) && zSig1
;
725 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
726 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
728 return packFloatx80( zSign
, zExp
, zSig0
);
731 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
736 zSig0
= LIT64( 0x8000000000000000 );
739 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
743 if ( zSig0
== 0 ) zExp
= 0;
745 return packFloatx80( zSign
, zExp
, zSig0
);
749 /*----------------------------------------------------------------------------
750 | Takes an abstract floating-point value having sign `zSign', exponent
751 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
752 | and returns the proper extended double-precision floating-point value
753 | corresponding to the abstract input. This routine is just like
754 | `roundAndPackFloatx80' except that the input significand does not have to be
756 *----------------------------------------------------------------------------*/
759 normalizeRoundAndPackFloatx80(
760 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
770 shiftCount
= countLeadingZeros64( zSig0
);
771 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
774 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
782 /*----------------------------------------------------------------------------
783 | Returns the least-significant 64 fraction bits of the quadruple-precision
784 | floating-point value `a'.
785 *----------------------------------------------------------------------------*/
787 INLINE bits64
extractFloat128Frac1( float128 a
)
794 /*----------------------------------------------------------------------------
795 | Returns the most-significant 48 fraction bits of the quadruple-precision
796 | floating-point value `a'.
797 *----------------------------------------------------------------------------*/
799 INLINE bits64
extractFloat128Frac0( float128 a
)
802 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
806 /*----------------------------------------------------------------------------
807 | Returns the exponent bits of the quadruple-precision floating-point value
809 *----------------------------------------------------------------------------*/
811 INLINE int32
extractFloat128Exp( float128 a
)
814 return ( a
.high
>>48 ) & 0x7FFF;
818 /*----------------------------------------------------------------------------
819 | Returns the sign bit of the quadruple-precision floating-point value `a'.
820 *----------------------------------------------------------------------------*/
822 INLINE flag
extractFloat128Sign( float128 a
)
829 /*----------------------------------------------------------------------------
830 | Normalizes the subnormal quadruple-precision floating-point value
831 | represented by the denormalized significand formed by the concatenation of
832 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
833 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
834 | significand are stored at the location pointed to by `zSig0Ptr', and the
835 | least significant 64 bits of the normalized significand are stored at the
836 | location pointed to by `zSig1Ptr'.
837 *----------------------------------------------------------------------------*/
840 normalizeFloat128Subnormal(
851 shiftCount
= countLeadingZeros64( aSig1
) - 15;
852 if ( shiftCount
< 0 ) {
853 *zSig0Ptr
= aSig1
>>( - shiftCount
);
854 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
857 *zSig0Ptr
= aSig1
<<shiftCount
;
860 *zExpPtr
= - shiftCount
- 63;
863 shiftCount
= countLeadingZeros64( aSig0
) - 15;
864 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
865 *zExpPtr
= 1 - shiftCount
;
870 /*----------------------------------------------------------------------------
871 | Packs the sign `zSign', the exponent `zExp', and the significand formed
872 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
873 | floating-point value, returning the result. After being shifted into the
874 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
875 | added together to form the most significant 32 bits of the result. This
876 | means that any integer portion of `zSig0' will be added into the exponent.
877 | Since a properly normalized significand will have an integer portion equal
878 | to 1, the `zExp' input should be 1 less than the desired result exponent
879 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
881 *----------------------------------------------------------------------------*/
884 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
889 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
894 /*----------------------------------------------------------------------------
895 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
896 | and extended significand formed by the concatenation of `zSig0', `zSig1',
897 | and `zSig2', and returns the proper quadruple-precision floating-point value
898 | corresponding to the abstract input. Ordinarily, the abstract value is
899 | simply rounded and packed into the quadruple-precision format, with the
900 | inexact exception raised if the abstract input cannot be represented
901 | exactly. However, if the abstract value is too large, the overflow and
902 | inexact exceptions are raised and an infinity or maximal finite value is
903 | returned. If the abstract value is too small, the input value is rounded to
904 | a subnormal number, and the underflow and inexact exceptions are raised if
905 | the abstract input cannot be represented exactly as a subnormal quadruple-
906 | precision floating-point number.
907 | The input significand must be normalized or smaller. If the input
908 | significand is not normalized, `zExp' must be 0; in that case, the result
909 | returned is a subnormal number, and it must not require rounding. In the
910 | usual case that the input significand is normalized, `zExp' must be 1 less
911 | than the ``true'' floating-point exponent. The handling of underflow and
912 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
913 *----------------------------------------------------------------------------*/
916 roundAndPackFloat128(
917 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2 STATUS_PARAM
)
920 flag roundNearestEven
, increment
, isTiny
;
922 roundingMode
= STATUS(float_rounding_mode
);
923 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
924 increment
= ( (sbits64
) zSig2
< 0 );
925 if ( ! roundNearestEven
) {
926 if ( roundingMode
== float_round_to_zero
) {
931 increment
= ( roundingMode
== float_round_down
) && zSig2
;
934 increment
= ( roundingMode
== float_round_up
) && zSig2
;
938 if ( 0x7FFD <= (bits32
) zExp
) {
939 if ( ( 0x7FFD < zExp
)
940 || ( ( zExp
== 0x7FFD )
942 LIT64( 0x0001FFFFFFFFFFFF ),
943 LIT64( 0xFFFFFFFFFFFFFFFF ),
950 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
951 if ( ( roundingMode
== float_round_to_zero
)
952 || ( zSign
&& ( roundingMode
== float_round_up
) )
953 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
959 LIT64( 0x0000FFFFFFFFFFFF ),
960 LIT64( 0xFFFFFFFFFFFFFFFF )
963 return packFloat128( zSign
, 0x7FFF, 0, 0 );
967 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
973 LIT64( 0x0001FFFFFFFFFFFF ),
974 LIT64( 0xFFFFFFFFFFFFFFFF )
976 shift128ExtraRightJamming(
977 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
979 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
980 if ( roundNearestEven
) {
981 increment
= ( (sbits64
) zSig2
< 0 );
985 increment
= ( roundingMode
== float_round_down
) && zSig2
;
988 increment
= ( roundingMode
== float_round_up
) && zSig2
;
993 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
995 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
996 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
999 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1001 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1005 /*----------------------------------------------------------------------------
1006 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1007 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1008 | returns the proper quadruple-precision floating-point value corresponding
1009 | to the abstract input. This routine is just like `roundAndPackFloat128'
1010 | except that the input significand has fewer bits and does not have to be
1011 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1013 *----------------------------------------------------------------------------*/
1016 normalizeRoundAndPackFloat128(
1017 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1 STATUS_PARAM
)
1027 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1028 if ( 0 <= shiftCount
) {
1030 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1033 shift128ExtraRightJamming(
1034 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1037 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1043 /*----------------------------------------------------------------------------
1044 | Returns the result of converting the 32-bit two's complement integer `a'
1045 | to the single-precision floating-point format. The conversion is performed
1046 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1047 *----------------------------------------------------------------------------*/
1049 float32
int32_to_float32( int32 a STATUS_PARAM
)
1053 if ( a
== 0 ) return 0;
1054 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1056 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1060 /*----------------------------------------------------------------------------
1061 | Returns the result of converting the 32-bit two's complement integer `a'
1062 | to the double-precision floating-point format. The conversion is performed
1063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1064 *----------------------------------------------------------------------------*/
1066 float64
int32_to_float64( int32 a STATUS_PARAM
)
1073 if ( a
== 0 ) return 0;
1075 absA
= zSign
? - a
: a
;
1076 shiftCount
= countLeadingZeros32( absA
) + 21;
1078 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1084 /*----------------------------------------------------------------------------
1085 | Returns the result of converting the 32-bit two's complement integer `a'
1086 | to the extended double-precision floating-point format. The conversion
1087 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1089 *----------------------------------------------------------------------------*/
1091 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1098 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1100 absA
= zSign
? - a
: a
;
1101 shiftCount
= countLeadingZeros32( absA
) + 32;
1103 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1111 /*----------------------------------------------------------------------------
1112 | Returns the result of converting the 32-bit two's complement integer `a' to
1113 | the quadruple-precision floating-point format. The conversion is performed
1114 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1115 *----------------------------------------------------------------------------*/
1117 float128
int32_to_float128( int32 a STATUS_PARAM
)
1124 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1126 absA
= zSign
? - a
: a
;
1127 shiftCount
= countLeadingZeros32( absA
) + 17;
1129 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1135 /*----------------------------------------------------------------------------
1136 | Returns the result of converting the 64-bit two's complement integer `a'
1137 | to the single-precision floating-point format. The conversion is performed
1138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1139 *----------------------------------------------------------------------------*/
1141 float32
int64_to_float32( int64 a STATUS_PARAM
)
1147 if ( a
== 0 ) return 0;
1149 absA
= zSign
? - a
: a
;
1150 shiftCount
= countLeadingZeros64( absA
) - 40;
1151 if ( 0 <= shiftCount
) {
1152 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1156 if ( shiftCount
< 0 ) {
1157 shift64RightJamming( absA
, - shiftCount
, &absA
);
1160 absA
<<= shiftCount
;
1162 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1167 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1171 if ( a
== 0 ) return 0;
1172 shiftCount
= countLeadingZeros64( a
) - 40;
1173 if ( 0 <= shiftCount
) {
1174 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1178 if ( shiftCount
< 0 ) {
1179 shift64RightJamming( a
, - shiftCount
, &a
);
1184 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1188 /*----------------------------------------------------------------------------
1189 | Returns the result of converting the 64-bit two's complement integer `a'
1190 | to the double-precision floating-point format. The conversion is performed
1191 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1192 *----------------------------------------------------------------------------*/
1194 float64
int64_to_float64( int64 a STATUS_PARAM
)
1198 if ( a
== 0 ) return 0;
1199 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1200 return packFloat64( 1, 0x43E, 0 );
1203 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1207 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1209 if ( a
== 0 ) return 0;
1210 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1216 /*----------------------------------------------------------------------------
1217 | Returns the result of converting the 64-bit two's complement integer `a'
1218 | to the extended double-precision floating-point format. The conversion
1219 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1221 *----------------------------------------------------------------------------*/
1223 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1229 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1231 absA
= zSign
? - a
: a
;
1232 shiftCount
= countLeadingZeros64( absA
);
1233 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1241 /*----------------------------------------------------------------------------
1242 | Returns the result of converting the 64-bit two's complement integer `a' to
1243 | the quadruple-precision floating-point format. The conversion is performed
1244 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1245 *----------------------------------------------------------------------------*/
1247 float128
int64_to_float128( int64 a STATUS_PARAM
)
1253 bits64 zSig0
, zSig1
;
1255 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1257 absA
= zSign
? - a
: a
;
1258 shiftCount
= countLeadingZeros64( absA
) + 49;
1259 zExp
= 0x406E - shiftCount
;
1260 if ( 64 <= shiftCount
) {
1269 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1270 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1276 /*----------------------------------------------------------------------------
1277 | Returns the result of converting the single-precision floating-point value
1278 | `a' to the 32-bit two's complement integer format. The conversion is
1279 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1280 | Arithmetic---which means in particular that the conversion is rounded
1281 | according to the current rounding mode. If `a' is a NaN, the largest
1282 | positive integer is returned. Otherwise, if the conversion overflows, the
1283 | largest integer with the same sign as `a' is returned.
1284 *----------------------------------------------------------------------------*/
1286 int32
float32_to_int32( float32 a STATUS_PARAM
)
1289 int16 aExp
, shiftCount
;
1293 aSig
= extractFloat32Frac( a
);
1294 aExp
= extractFloat32Exp( a
);
1295 aSign
= extractFloat32Sign( a
);
1296 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1297 if ( aExp
) aSig
|= 0x00800000;
1298 shiftCount
= 0xAF - aExp
;
1301 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1302 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1306 /*----------------------------------------------------------------------------
1307 | Returns the result of converting the single-precision floating-point value
1308 | `a' to the 32-bit two's complement integer format. The conversion is
1309 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1310 | Arithmetic, except that the conversion is always rounded toward zero.
1311 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1312 | the conversion overflows, the largest integer with the same sign as `a' is
1314 *----------------------------------------------------------------------------*/
1316 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1319 int16 aExp
, shiftCount
;
1323 aSig
= extractFloat32Frac( a
);
1324 aExp
= extractFloat32Exp( a
);
1325 aSign
= extractFloat32Sign( a
);
1326 shiftCount
= aExp
- 0x9E;
1327 if ( 0 <= shiftCount
) {
1328 if ( a
!= 0xCF000000 ) {
1329 float_raise( float_flag_invalid STATUS_VAR
);
1330 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1332 return (sbits32
) 0x80000000;
1334 else if ( aExp
<= 0x7E ) {
1335 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1338 aSig
= ( aSig
| 0x00800000 )<<8;
1339 z
= aSig
>>( - shiftCount
);
1340 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1341 STATUS(float_exception_flags
) |= float_flag_inexact
;
1343 if ( aSign
) z
= - z
;
1348 /*----------------------------------------------------------------------------
1349 | Returns the result of converting the single-precision floating-point value
1350 | `a' to the 64-bit two's complement integer format. The conversion is
1351 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1352 | Arithmetic---which means in particular that the conversion is rounded
1353 | according to the current rounding mode. If `a' is a NaN, the largest
1354 | positive integer is returned. Otherwise, if the conversion overflows, the
1355 | largest integer with the same sign as `a' is returned.
1356 *----------------------------------------------------------------------------*/
1358 int64
float32_to_int64( float32 a STATUS_PARAM
)
1361 int16 aExp
, shiftCount
;
1363 bits64 aSig64
, aSigExtra
;
1365 aSig
= extractFloat32Frac( a
);
1366 aExp
= extractFloat32Exp( a
);
1367 aSign
= extractFloat32Sign( a
);
1368 shiftCount
= 0xBE - aExp
;
1369 if ( shiftCount
< 0 ) {
1370 float_raise( float_flag_invalid STATUS_VAR
);
1371 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1372 return LIT64( 0x7FFFFFFFFFFFFFFF );
1374 return (sbits64
) LIT64( 0x8000000000000000 );
1376 if ( aExp
) aSig
|= 0x00800000;
1379 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1380 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1384 /*----------------------------------------------------------------------------
1385 | Returns the result of converting the single-precision floating-point value
1386 | `a' to the 64-bit two's complement integer format. The conversion is
1387 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1388 | Arithmetic, except that the conversion is always rounded toward zero. If
1389 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1390 | conversion overflows, the largest integer with the same sign as `a' is
1392 *----------------------------------------------------------------------------*/
1394 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1397 int16 aExp
, shiftCount
;
1402 aSig
= extractFloat32Frac( a
);
1403 aExp
= extractFloat32Exp( a
);
1404 aSign
= extractFloat32Sign( a
);
1405 shiftCount
= aExp
- 0xBE;
1406 if ( 0 <= shiftCount
) {
1407 if ( a
!= 0xDF000000 ) {
1408 float_raise( float_flag_invalid STATUS_VAR
);
1409 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1410 return LIT64( 0x7FFFFFFFFFFFFFFF );
1413 return (sbits64
) LIT64( 0x8000000000000000 );
1415 else if ( aExp
<= 0x7E ) {
1416 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1419 aSig64
= aSig
| 0x00800000;
1421 z
= aSig64
>>( - shiftCount
);
1422 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1423 STATUS(float_exception_flags
) |= float_flag_inexact
;
1425 if ( aSign
) z
= - z
;
1430 /*----------------------------------------------------------------------------
1431 | Returns the result of converting the single-precision floating-point value
1432 | `a' to the double-precision floating-point format. The conversion is
1433 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1435 *----------------------------------------------------------------------------*/
1437 float64
float32_to_float64( float32 a STATUS_PARAM
)
1443 aSig
= extractFloat32Frac( a
);
1444 aExp
= extractFloat32Exp( a
);
1445 aSign
= extractFloat32Sign( a
);
1446 if ( aExp
== 0xFF ) {
1447 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
));
1448 return packFloat64( aSign
, 0x7FF, 0 );
1451 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1452 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1455 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the single-precision floating-point value
1463 | `a' to the extended double-precision floating-point format. The conversion
1464 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 *----------------------------------------------------------------------------*/
1468 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1474 aSig
= extractFloat32Frac( a
);
1475 aExp
= extractFloat32Exp( a
);
1476 aSign
= extractFloat32Sign( a
);
1477 if ( aExp
== 0xFF ) {
1478 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) );
1479 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1482 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1483 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1486 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1494 /*----------------------------------------------------------------------------
1495 | Returns the result of converting the single-precision floating-point value
1496 | `a' to the double-precision floating-point format. The conversion is
1497 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1499 *----------------------------------------------------------------------------*/
1501 float128
float32_to_float128( float32 a STATUS_PARAM
)
1507 aSig
= extractFloat32Frac( a
);
1508 aExp
= extractFloat32Exp( a
);
1509 aSign
= extractFloat32Sign( a
);
1510 if ( aExp
== 0xFF ) {
1511 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) );
1512 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1515 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1516 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1519 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1525 /*----------------------------------------------------------------------------
1526 | Rounds the single-precision floating-point value `a' to an integer, and
1527 | returns the result as a single-precision floating-point value. The
1528 | operation is performed according to the IEC/IEEE Standard for Binary
1529 | Floating-Point Arithmetic.
1530 *----------------------------------------------------------------------------*/
1532 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1536 bits32 lastBitMask
, roundBitsMask
;
1540 aExp
= extractFloat32Exp( a
);
1541 if ( 0x96 <= aExp
) {
1542 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1543 return propagateFloat32NaN( a
, a STATUS_VAR
);
1547 if ( aExp
<= 0x7E ) {
1548 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
1549 STATUS(float_exception_flags
) |= float_flag_inexact
;
1550 aSign
= extractFloat32Sign( a
);
1551 switch ( STATUS(float_rounding_mode
) ) {
1552 case float_round_nearest_even
:
1553 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1554 return packFloat32( aSign
, 0x7F, 0 );
1557 case float_round_down
:
1558 return aSign
? 0xBF800000 : 0;
1559 case float_round_up
:
1560 return aSign
? 0x80000000 : 0x3F800000;
1562 return packFloat32( aSign
, 0, 0 );
1565 lastBitMask
<<= 0x96 - aExp
;
1566 roundBitsMask
= lastBitMask
- 1;
1568 roundingMode
= STATUS(float_rounding_mode
);
1569 if ( roundingMode
== float_round_nearest_even
) {
1570 z
+= lastBitMask
>>1;
1571 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1573 else if ( roundingMode
!= float_round_to_zero
) {
1574 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1578 z
&= ~ roundBitsMask
;
1579 if ( z
!= a
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1584 /*----------------------------------------------------------------------------
1585 | Returns the result of adding the absolute values of the single-precision
1586 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1587 | before being returned. `zSign' is ignored if the result is a NaN.
1588 | The addition is performed according to the IEC/IEEE Standard for Binary
1589 | Floating-Point Arithmetic.
1590 *----------------------------------------------------------------------------*/
1592 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1594 int16 aExp
, bExp
, zExp
;
1595 bits32 aSig
, bSig
, zSig
;
1598 aSig
= extractFloat32Frac( a
);
1599 aExp
= extractFloat32Exp( a
);
1600 bSig
= extractFloat32Frac( b
);
1601 bExp
= extractFloat32Exp( b
);
1602 expDiff
= aExp
- bExp
;
1605 if ( 0 < expDiff
) {
1606 if ( aExp
== 0xFF ) {
1607 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1616 shift32RightJamming( bSig
, expDiff
, &bSig
);
1619 else if ( expDiff
< 0 ) {
1620 if ( bExp
== 0xFF ) {
1621 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1622 return packFloat32( zSign
, 0xFF, 0 );
1630 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1634 if ( aExp
== 0xFF ) {
1635 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1638 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1639 zSig
= 0x40000000 + aSig
+ bSig
;
1644 zSig
= ( aSig
+ bSig
)<<1;
1646 if ( (sbits32
) zSig
< 0 ) {
1651 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1655 /*----------------------------------------------------------------------------
1656 | Returns the result of subtracting the absolute values of the single-
1657 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1658 | difference is negated before being returned. `zSign' is ignored if the
1659 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1660 | Standard for Binary Floating-Point Arithmetic.
1661 *----------------------------------------------------------------------------*/
1663 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1665 int16 aExp
, bExp
, zExp
;
1666 bits32 aSig
, bSig
, zSig
;
1669 aSig
= extractFloat32Frac( a
);
1670 aExp
= extractFloat32Exp( a
);
1671 bSig
= extractFloat32Frac( b
);
1672 bExp
= extractFloat32Exp( b
);
1673 expDiff
= aExp
- bExp
;
1676 if ( 0 < expDiff
) goto aExpBigger
;
1677 if ( expDiff
< 0 ) goto bExpBigger
;
1678 if ( aExp
== 0xFF ) {
1679 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1680 float_raise( float_flag_invalid STATUS_VAR
);
1681 return float32_default_nan
;
1687 if ( bSig
< aSig
) goto aBigger
;
1688 if ( aSig
< bSig
) goto bBigger
;
1689 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1691 if ( bExp
== 0xFF ) {
1692 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1693 return packFloat32( zSign
^ 1, 0xFF, 0 );
1701 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1707 goto normalizeRoundAndPack
;
1709 if ( aExp
== 0xFF ) {
1710 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1719 shift32RightJamming( bSig
, expDiff
, &bSig
);
1724 normalizeRoundAndPack
:
1726 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1730 /*----------------------------------------------------------------------------
1731 | Returns the result of adding the single-precision floating-point values `a'
1732 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1733 | Binary Floating-Point Arithmetic.
1734 *----------------------------------------------------------------------------*/
1736 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1740 aSign
= extractFloat32Sign( a
);
1741 bSign
= extractFloat32Sign( b
);
1742 if ( aSign
== bSign
) {
1743 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1746 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1751 /*----------------------------------------------------------------------------
1752 | Returns the result of subtracting the single-precision floating-point values
1753 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1754 | for Binary Floating-Point Arithmetic.
1755 *----------------------------------------------------------------------------*/
1757 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1761 aSign
= extractFloat32Sign( a
);
1762 bSign
= extractFloat32Sign( b
);
1763 if ( aSign
== bSign
) {
1764 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1767 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1772 /*----------------------------------------------------------------------------
1773 | Returns the result of multiplying the single-precision floating-point values
1774 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1775 | for Binary Floating-Point Arithmetic.
1776 *----------------------------------------------------------------------------*/
1778 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1780 flag aSign
, bSign
, zSign
;
1781 int16 aExp
, bExp
, zExp
;
1786 aSig
= extractFloat32Frac( a
);
1787 aExp
= extractFloat32Exp( a
);
1788 aSign
= extractFloat32Sign( a
);
1789 bSig
= extractFloat32Frac( b
);
1790 bExp
= extractFloat32Exp( b
);
1791 bSign
= extractFloat32Sign( b
);
1792 zSign
= aSign
^ bSign
;
1793 if ( aExp
== 0xFF ) {
1794 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1795 return propagateFloat32NaN( a
, b STATUS_VAR
);
1797 if ( ( bExp
| bSig
) == 0 ) {
1798 float_raise( float_flag_invalid STATUS_VAR
);
1799 return float32_default_nan
;
1801 return packFloat32( zSign
, 0xFF, 0 );
1803 if ( bExp
== 0xFF ) {
1804 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1805 if ( ( aExp
| aSig
) == 0 ) {
1806 float_raise( float_flag_invalid STATUS_VAR
);
1807 return float32_default_nan
;
1809 return packFloat32( zSign
, 0xFF, 0 );
1812 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1813 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1816 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1817 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1819 zExp
= aExp
+ bExp
- 0x7F;
1820 aSig
= ( aSig
| 0x00800000 )<<7;
1821 bSig
= ( bSig
| 0x00800000 )<<8;
1822 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1824 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1828 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1832 /*----------------------------------------------------------------------------
1833 | Returns the result of dividing the single-precision floating-point value `a'
1834 | by the corresponding value `b'. The operation is performed according to the
1835 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1836 *----------------------------------------------------------------------------*/
1838 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1840 flag aSign
, bSign
, zSign
;
1841 int16 aExp
, bExp
, zExp
;
1842 bits32 aSig
, bSig
, zSig
;
1844 aSig
= extractFloat32Frac( a
);
1845 aExp
= extractFloat32Exp( a
);
1846 aSign
= extractFloat32Sign( a
);
1847 bSig
= extractFloat32Frac( b
);
1848 bExp
= extractFloat32Exp( b
);
1849 bSign
= extractFloat32Sign( b
);
1850 zSign
= aSign
^ bSign
;
1851 if ( aExp
== 0xFF ) {
1852 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1853 if ( bExp
== 0xFF ) {
1854 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1855 float_raise( float_flag_invalid STATUS_VAR
);
1856 return float32_default_nan
;
1858 return packFloat32( zSign
, 0xFF, 0 );
1860 if ( bExp
== 0xFF ) {
1861 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1862 return packFloat32( zSign
, 0, 0 );
1866 if ( ( aExp
| aSig
) == 0 ) {
1867 float_raise( float_flag_invalid STATUS_VAR
);
1868 return float32_default_nan
;
1870 float_raise( float_flag_divbyzero STATUS_VAR
);
1871 return packFloat32( zSign
, 0xFF, 0 );
1873 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1876 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1877 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1879 zExp
= aExp
- bExp
+ 0x7D;
1880 aSig
= ( aSig
| 0x00800000 )<<7;
1881 bSig
= ( bSig
| 0x00800000 )<<8;
1882 if ( bSig
<= ( aSig
+ aSig
) ) {
1886 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1887 if ( ( zSig
& 0x3F ) == 0 ) {
1888 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
1890 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1894 /*----------------------------------------------------------------------------
1895 | Returns the remainder of the single-precision floating-point value `a'
1896 | with respect to the corresponding value `b'. The operation is performed
1897 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1898 *----------------------------------------------------------------------------*/
1900 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
1902 flag aSign
, bSign
, zSign
;
1903 int16 aExp
, bExp
, expDiff
;
1906 bits64 aSig64
, bSig64
, q64
;
1907 bits32 alternateASig
;
1910 aSig
= extractFloat32Frac( a
);
1911 aExp
= extractFloat32Exp( a
);
1912 aSign
= extractFloat32Sign( a
);
1913 bSig
= extractFloat32Frac( b
);
1914 bExp
= extractFloat32Exp( b
);
1915 bSign
= extractFloat32Sign( b
);
1916 if ( aExp
== 0xFF ) {
1917 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1918 return propagateFloat32NaN( a
, b STATUS_VAR
);
1920 float_raise( float_flag_invalid STATUS_VAR
);
1921 return float32_default_nan
;
1923 if ( bExp
== 0xFF ) {
1924 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1929 float_raise( float_flag_invalid STATUS_VAR
);
1930 return float32_default_nan
;
1932 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1935 if ( aSig
== 0 ) return a
;
1936 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1938 expDiff
= aExp
- bExp
;
1941 if ( expDiff
< 32 ) {
1944 if ( expDiff
< 0 ) {
1945 if ( expDiff
< -1 ) return a
;
1948 q
= ( bSig
<= aSig
);
1949 if ( q
) aSig
-= bSig
;
1950 if ( 0 < expDiff
) {
1951 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1954 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1962 if ( bSig
<= aSig
) aSig
-= bSig
;
1963 aSig64
= ( (bits64
) aSig
)<<40;
1964 bSig64
= ( (bits64
) bSig
)<<40;
1966 while ( 0 < expDiff
) {
1967 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1968 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1969 aSig64
= - ( ( bSig
* q64
)<<38 );
1973 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1974 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1975 q
= q64
>>( 64 - expDiff
);
1977 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1980 alternateASig
= aSig
;
1983 } while ( 0 <= (sbits32
) aSig
);
1984 sigMean
= aSig
+ alternateASig
;
1985 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1986 aSig
= alternateASig
;
1988 zSign
= ( (sbits32
) aSig
< 0 );
1989 if ( zSign
) aSig
= - aSig
;
1990 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
1994 /*----------------------------------------------------------------------------
1995 | Returns the square root of the single-precision floating-point value `a'.
1996 | The operation is performed according to the IEC/IEEE Standard for Binary
1997 | Floating-Point Arithmetic.
1998 *----------------------------------------------------------------------------*/
2000 float32
float32_sqrt( float32 a STATUS_PARAM
)
2007 aSig
= extractFloat32Frac( a
);
2008 aExp
= extractFloat32Exp( a
);
2009 aSign
= extractFloat32Sign( a
);
2010 if ( aExp
== 0xFF ) {
2011 if ( aSig
) return propagateFloat32NaN( a
, 0 STATUS_VAR
);
2012 if ( ! aSign
) return a
;
2013 float_raise( float_flag_invalid STATUS_VAR
);
2014 return float32_default_nan
;
2017 if ( ( aExp
| aSig
) == 0 ) return a
;
2018 float_raise( float_flag_invalid STATUS_VAR
);
2019 return float32_default_nan
;
2022 if ( aSig
== 0 ) return 0;
2023 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2025 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2026 aSig
= ( aSig
| 0x00800000 )<<8;
2027 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2028 if ( ( zSig
& 0x7F ) <= 5 ) {
2034 term
= ( (bits64
) zSig
) * zSig
;
2035 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2036 while ( (sbits64
) rem
< 0 ) {
2038 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2040 zSig
|= ( rem
!= 0 );
2042 shift32RightJamming( zSig
, 1, &zSig
);
2044 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2048 /*----------------------------------------------------------------------------
2049 | Returns 1 if the single-precision floating-point value `a' is equal to
2050 | the corresponding value `b', and 0 otherwise. The comparison is performed
2051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2052 *----------------------------------------------------------------------------*/
2054 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2057 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2058 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2060 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2061 float_raise( float_flag_invalid STATUS_VAR
);
2065 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2069 /*----------------------------------------------------------------------------
2070 | Returns 1 if the single-precision floating-point value `a' is less than
2071 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2072 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2074 *----------------------------------------------------------------------------*/
2076 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2080 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2081 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2083 float_raise( float_flag_invalid STATUS_VAR
);
2086 aSign
= extractFloat32Sign( a
);
2087 bSign
= extractFloat32Sign( b
);
2088 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2089 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2093 /*----------------------------------------------------------------------------
2094 | Returns 1 if the single-precision floating-point value `a' is less than
2095 | the corresponding value `b', and 0 otherwise. The comparison is performed
2096 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2097 *----------------------------------------------------------------------------*/
2099 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2103 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2104 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2106 float_raise( float_flag_invalid STATUS_VAR
);
2109 aSign
= extractFloat32Sign( a
);
2110 bSign
= extractFloat32Sign( b
);
2111 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2112 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2116 /*----------------------------------------------------------------------------
2117 | Returns 1 if the single-precision floating-point value `a' is equal to
2118 | the corresponding value `b', and 0 otherwise. The invalid exception is
2119 | raised if either operand is a NaN. Otherwise, the comparison is performed
2120 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2121 *----------------------------------------------------------------------------*/
2123 int float32_eq_signaling( float32 a
, float32 b STATUS_PARAM
)
2126 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2127 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2129 float_raise( float_flag_invalid STATUS_VAR
);
2132 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2136 /*----------------------------------------------------------------------------
2137 | Returns 1 if the single-precision floating-point value `a' is less than or
2138 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2139 | cause an exception. Otherwise, the comparison is performed according to the
2140 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2141 *----------------------------------------------------------------------------*/
2143 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2147 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2148 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2150 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2151 float_raise( float_flag_invalid STATUS_VAR
);
2155 aSign
= extractFloat32Sign( a
);
2156 bSign
= extractFloat32Sign( b
);
2157 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2158 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2162 /*----------------------------------------------------------------------------
2163 | Returns 1 if the single-precision floating-point value `a' is less than
2164 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2165 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2166 | Standard for Binary Floating-Point Arithmetic.
2167 *----------------------------------------------------------------------------*/
2169 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2173 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2174 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2176 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2177 float_raise( float_flag_invalid STATUS_VAR
);
2181 aSign
= extractFloat32Sign( a
);
2182 bSign
= extractFloat32Sign( b
);
2183 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2184 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2188 /*----------------------------------------------------------------------------
2189 | Returns the result of converting the double-precision floating-point value
2190 | `a' to the 32-bit two's complement integer format. The conversion is
2191 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2192 | Arithmetic---which means in particular that the conversion is rounded
2193 | according to the current rounding mode. If `a' is a NaN, the largest
2194 | positive integer is returned. Otherwise, if the conversion overflows, the
2195 | largest integer with the same sign as `a' is returned.
2196 *----------------------------------------------------------------------------*/
2198 int32
float64_to_int32( float64 a STATUS_PARAM
)
2201 int16 aExp
, shiftCount
;
2204 aSig
= extractFloat64Frac( a
);
2205 aExp
= extractFloat64Exp( a
);
2206 aSign
= extractFloat64Sign( a
);
2207 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2208 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2209 shiftCount
= 0x42C - aExp
;
2210 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2211 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2215 /*----------------------------------------------------------------------------
2216 | Returns the result of converting the double-precision floating-point value
2217 | `a' to the 32-bit two's complement integer format. The conversion is
2218 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2219 | Arithmetic, except that the conversion is always rounded toward zero.
2220 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2221 | the conversion overflows, the largest integer with the same sign as `a' is
2223 *----------------------------------------------------------------------------*/
2225 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2228 int16 aExp
, shiftCount
;
2229 bits64 aSig
, savedASig
;
2232 aSig
= extractFloat64Frac( a
);
2233 aExp
= extractFloat64Exp( a
);
2234 aSign
= extractFloat64Sign( a
);
2235 if ( 0x41E < aExp
) {
2236 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2239 else if ( aExp
< 0x3FF ) {
2240 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2243 aSig
|= LIT64( 0x0010000000000000 );
2244 shiftCount
= 0x433 - aExp
;
2246 aSig
>>= shiftCount
;
2248 if ( aSign
) z
= - z
;
2249 if ( ( z
< 0 ) ^ aSign
) {
2251 float_raise( float_flag_invalid STATUS_VAR
);
2252 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2254 if ( ( aSig
<<shiftCount
) != savedASig
) {
2255 STATUS(float_exception_flags
) |= float_flag_inexact
;
2261 /*----------------------------------------------------------------------------
2262 | Returns the result of converting the double-precision floating-point value
2263 | `a' to the 64-bit two's complement integer format. The conversion is
2264 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2265 | Arithmetic---which means in particular that the conversion is rounded
2266 | according to the current rounding mode. If `a' is a NaN, the largest
2267 | positive integer is returned. Otherwise, if the conversion overflows, the
2268 | largest integer with the same sign as `a' is returned.
2269 *----------------------------------------------------------------------------*/
2271 int64
float64_to_int64( float64 a STATUS_PARAM
)
2274 int16 aExp
, shiftCount
;
2275 bits64 aSig
, aSigExtra
;
2277 aSig
= extractFloat64Frac( a
);
2278 aExp
= extractFloat64Exp( a
);
2279 aSign
= extractFloat64Sign( a
);
2280 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2281 shiftCount
= 0x433 - aExp
;
2282 if ( shiftCount
<= 0 ) {
2283 if ( 0x43E < aExp
) {
2284 float_raise( float_flag_invalid STATUS_VAR
);
2286 || ( ( aExp
== 0x7FF )
2287 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2289 return LIT64( 0x7FFFFFFFFFFFFFFF );
2291 return (sbits64
) LIT64( 0x8000000000000000 );
2294 aSig
<<= - shiftCount
;
2297 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2299 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2303 /*----------------------------------------------------------------------------
2304 | Returns the result of converting the double-precision floating-point value
2305 | `a' to the 64-bit two's complement integer format. The conversion is
2306 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2307 | Arithmetic, except that the conversion is always rounded toward zero.
2308 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2309 | the conversion overflows, the largest integer with the same sign as `a' is
2311 *----------------------------------------------------------------------------*/
2313 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2316 int16 aExp
, shiftCount
;
2320 aSig
= extractFloat64Frac( a
);
2321 aExp
= extractFloat64Exp( a
);
2322 aSign
= extractFloat64Sign( a
);
2323 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2324 shiftCount
= aExp
- 0x433;
2325 if ( 0 <= shiftCount
) {
2326 if ( 0x43E <= aExp
) {
2327 if ( a
!= LIT64( 0xC3E0000000000000 ) ) {
2328 float_raise( float_flag_invalid STATUS_VAR
);
2330 || ( ( aExp
== 0x7FF )
2331 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2333 return LIT64( 0x7FFFFFFFFFFFFFFF );
2336 return (sbits64
) LIT64( 0x8000000000000000 );
2338 z
= aSig
<<shiftCount
;
2341 if ( aExp
< 0x3FE ) {
2342 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2345 z
= aSig
>>( - shiftCount
);
2346 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2347 STATUS(float_exception_flags
) |= float_flag_inexact
;
2350 if ( aSign
) z
= - z
;
2355 /*----------------------------------------------------------------------------
2356 | Returns the result of converting the double-precision floating-point value
2357 | `a' to the single-precision floating-point format. The conversion is
2358 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 *----------------------------------------------------------------------------*/
2362 float32
float64_to_float32( float64 a STATUS_PARAM
)
2369 aSig
= extractFloat64Frac( a
);
2370 aExp
= extractFloat64Exp( a
);
2371 aSign
= extractFloat64Sign( a
);
2372 if ( aExp
== 0x7FF ) {
2373 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) );
2374 return packFloat32( aSign
, 0xFF, 0 );
2376 shift64RightJamming( aSig
, 22, &aSig
);
2378 if ( aExp
|| zSig
) {
2382 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2388 /*----------------------------------------------------------------------------
2389 | Returns the result of converting the double-precision floating-point value
2390 | `a' to the extended double-precision floating-point format. The conversion
2391 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2393 *----------------------------------------------------------------------------*/
2395 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2401 aSig
= extractFloat64Frac( a
);
2402 aExp
= extractFloat64Exp( a
);
2403 aSign
= extractFloat64Sign( a
);
2404 if ( aExp
== 0x7FF ) {
2405 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) );
2406 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2409 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2410 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2414 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2422 /*----------------------------------------------------------------------------
2423 | Returns the result of converting the double-precision floating-point value
2424 | `a' to the quadruple-precision floating-point format. The conversion is
2425 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2427 *----------------------------------------------------------------------------*/
2429 float128
float64_to_float128( float64 a STATUS_PARAM
)
2433 bits64 aSig
, zSig0
, zSig1
;
2435 aSig
= extractFloat64Frac( a
);
2436 aExp
= extractFloat64Exp( a
);
2437 aSign
= extractFloat64Sign( a
);
2438 if ( aExp
== 0x7FF ) {
2439 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) );
2440 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2443 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2444 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2447 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2448 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2454 /*----------------------------------------------------------------------------
2455 | Rounds the double-precision floating-point value `a' to an integer, and
2456 | returns the result as a double-precision floating-point value. The
2457 | operation is performed according to the IEC/IEEE Standard for Binary
2458 | Floating-Point Arithmetic.
2459 *----------------------------------------------------------------------------*/
2461 float64
float64_round_to_int( float64 a STATUS_PARAM
)
2465 bits64 lastBitMask
, roundBitsMask
;
2469 aExp
= extractFloat64Exp( a
);
2470 if ( 0x433 <= aExp
) {
2471 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2472 return propagateFloat64NaN( a
, a STATUS_VAR
);
2476 if ( aExp
< 0x3FF ) {
2477 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
2478 STATUS(float_exception_flags
) |= float_flag_inexact
;
2479 aSign
= extractFloat64Sign( a
);
2480 switch ( STATUS(float_rounding_mode
) ) {
2481 case float_round_nearest_even
:
2482 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2483 return packFloat64( aSign
, 0x3FF, 0 );
2486 case float_round_down
:
2487 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
2488 case float_round_up
:
2490 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2492 return packFloat64( aSign
, 0, 0 );
2495 lastBitMask
<<= 0x433 - aExp
;
2496 roundBitsMask
= lastBitMask
- 1;
2498 roundingMode
= STATUS(float_rounding_mode
);
2499 if ( roundingMode
== float_round_nearest_even
) {
2500 z
+= lastBitMask
>>1;
2501 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2503 else if ( roundingMode
!= float_round_to_zero
) {
2504 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2508 z
&= ~ roundBitsMask
;
2509 if ( z
!= a
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2514 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
2518 oldmode
= STATUS(float_rounding_mode
);
2519 STATUS(float_rounding_mode
) = float_round_to_zero
;
2520 res
= float64_round_to_int(a STATUS_VAR
);
2521 STATUS(float_rounding_mode
) = oldmode
;
2525 /*----------------------------------------------------------------------------
2526 | Returns the result of adding the absolute values of the double-precision
2527 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2528 | before being returned. `zSign' is ignored if the result is a NaN.
2529 | The addition is performed according to the IEC/IEEE Standard for Binary
2530 | Floating-Point Arithmetic.
2531 *----------------------------------------------------------------------------*/
2533 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
2535 int16 aExp
, bExp
, zExp
;
2536 bits64 aSig
, bSig
, zSig
;
2539 aSig
= extractFloat64Frac( a
);
2540 aExp
= extractFloat64Exp( a
);
2541 bSig
= extractFloat64Frac( b
);
2542 bExp
= extractFloat64Exp( b
);
2543 expDiff
= aExp
- bExp
;
2546 if ( 0 < expDiff
) {
2547 if ( aExp
== 0x7FF ) {
2548 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2555 bSig
|= LIT64( 0x2000000000000000 );
2557 shift64RightJamming( bSig
, expDiff
, &bSig
);
2560 else if ( expDiff
< 0 ) {
2561 if ( bExp
== 0x7FF ) {
2562 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2563 return packFloat64( zSign
, 0x7FF, 0 );
2569 aSig
|= LIT64( 0x2000000000000000 );
2571 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2575 if ( aExp
== 0x7FF ) {
2576 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2579 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2580 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2584 aSig
|= LIT64( 0x2000000000000000 );
2585 zSig
= ( aSig
+ bSig
)<<1;
2587 if ( (sbits64
) zSig
< 0 ) {
2592 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2596 /*----------------------------------------------------------------------------
2597 | Returns the result of subtracting the absolute values of the double-
2598 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2599 | difference is negated before being returned. `zSign' is ignored if the
2600 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2601 | Standard for Binary Floating-Point Arithmetic.
2602 *----------------------------------------------------------------------------*/
2604 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
2606 int16 aExp
, bExp
, zExp
;
2607 bits64 aSig
, bSig
, zSig
;
2610 aSig
= extractFloat64Frac( a
);
2611 aExp
= extractFloat64Exp( a
);
2612 bSig
= extractFloat64Frac( b
);
2613 bExp
= extractFloat64Exp( b
);
2614 expDiff
= aExp
- bExp
;
2617 if ( 0 < expDiff
) goto aExpBigger
;
2618 if ( expDiff
< 0 ) goto bExpBigger
;
2619 if ( aExp
== 0x7FF ) {
2620 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2621 float_raise( float_flag_invalid STATUS_VAR
);
2622 return float64_default_nan
;
2628 if ( bSig
< aSig
) goto aBigger
;
2629 if ( aSig
< bSig
) goto bBigger
;
2630 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
2632 if ( bExp
== 0x7FF ) {
2633 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2634 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2640 aSig
|= LIT64( 0x4000000000000000 );
2642 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2643 bSig
|= LIT64( 0x4000000000000000 );
2648 goto normalizeRoundAndPack
;
2650 if ( aExp
== 0x7FF ) {
2651 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2658 bSig
|= LIT64( 0x4000000000000000 );
2660 shift64RightJamming( bSig
, expDiff
, &bSig
);
2661 aSig
|= LIT64( 0x4000000000000000 );
2665 normalizeRoundAndPack
:
2667 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2671 /*----------------------------------------------------------------------------
2672 | Returns the result of adding the double-precision floating-point values `a'
2673 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2674 | Binary Floating-Point Arithmetic.
2675 *----------------------------------------------------------------------------*/
2677 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
2681 aSign
= extractFloat64Sign( a
);
2682 bSign
= extractFloat64Sign( b
);
2683 if ( aSign
== bSign
) {
2684 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2687 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2692 /*----------------------------------------------------------------------------
2693 | Returns the result of subtracting the double-precision floating-point values
2694 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2695 | for Binary Floating-Point Arithmetic.
2696 *----------------------------------------------------------------------------*/
2698 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
2702 aSign
= extractFloat64Sign( a
);
2703 bSign
= extractFloat64Sign( b
);
2704 if ( aSign
== bSign
) {
2705 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2708 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
2713 /*----------------------------------------------------------------------------
2714 | Returns the result of multiplying the double-precision floating-point values
2715 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2716 | for Binary Floating-Point Arithmetic.
2717 *----------------------------------------------------------------------------*/
2719 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
2721 flag aSign
, bSign
, zSign
;
2722 int16 aExp
, bExp
, zExp
;
2723 bits64 aSig
, bSig
, zSig0
, zSig1
;
2725 aSig
= extractFloat64Frac( a
);
2726 aExp
= extractFloat64Exp( a
);
2727 aSign
= extractFloat64Sign( a
);
2728 bSig
= extractFloat64Frac( b
);
2729 bExp
= extractFloat64Exp( b
);
2730 bSign
= extractFloat64Sign( b
);
2731 zSign
= aSign
^ bSign
;
2732 if ( aExp
== 0x7FF ) {
2733 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2734 return propagateFloat64NaN( a
, b STATUS_VAR
);
2736 if ( ( bExp
| bSig
) == 0 ) {
2737 float_raise( float_flag_invalid STATUS_VAR
);
2738 return float64_default_nan
;
2740 return packFloat64( zSign
, 0x7FF, 0 );
2742 if ( bExp
== 0x7FF ) {
2743 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2744 if ( ( aExp
| aSig
) == 0 ) {
2745 float_raise( float_flag_invalid STATUS_VAR
);
2746 return float64_default_nan
;
2748 return packFloat64( zSign
, 0x7FF, 0 );
2751 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2752 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2755 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2756 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2758 zExp
= aExp
+ bExp
- 0x3FF;
2759 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2760 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2761 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2762 zSig0
|= ( zSig1
!= 0 );
2763 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2767 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
2771 /*----------------------------------------------------------------------------
2772 | Returns the result of dividing the double-precision floating-point value `a'
2773 | by the corresponding value `b'. The operation is performed according to
2774 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2775 *----------------------------------------------------------------------------*/
2777 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
2779 flag aSign
, bSign
, zSign
;
2780 int16 aExp
, bExp
, zExp
;
2781 bits64 aSig
, bSig
, zSig
;
2783 bits64 term0
, term1
;
2785 aSig
= extractFloat64Frac( a
);
2786 aExp
= extractFloat64Exp( a
);
2787 aSign
= extractFloat64Sign( a
);
2788 bSig
= extractFloat64Frac( b
);
2789 bExp
= extractFloat64Exp( b
);
2790 bSign
= extractFloat64Sign( b
);
2791 zSign
= aSign
^ bSign
;
2792 if ( aExp
== 0x7FF ) {
2793 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2794 if ( bExp
== 0x7FF ) {
2795 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2796 float_raise( float_flag_invalid STATUS_VAR
);
2797 return float64_default_nan
;
2799 return packFloat64( zSign
, 0x7FF, 0 );
2801 if ( bExp
== 0x7FF ) {
2802 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2803 return packFloat64( zSign
, 0, 0 );
2807 if ( ( aExp
| aSig
) == 0 ) {
2808 float_raise( float_flag_invalid STATUS_VAR
);
2809 return float64_default_nan
;
2811 float_raise( float_flag_divbyzero STATUS_VAR
);
2812 return packFloat64( zSign
, 0x7FF, 0 );
2814 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2817 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2818 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2820 zExp
= aExp
- bExp
+ 0x3FD;
2821 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2822 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2823 if ( bSig
<= ( aSig
+ aSig
) ) {
2827 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2828 if ( ( zSig
& 0x1FF ) <= 2 ) {
2829 mul64To128( bSig
, zSig
, &term0
, &term1
);
2830 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2831 while ( (sbits64
) rem0
< 0 ) {
2833 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2835 zSig
|= ( rem1
!= 0 );
2837 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
2841 /*----------------------------------------------------------------------------
2842 | Returns the remainder of the double-precision floating-point value `a'
2843 | with respect to the corresponding value `b'. The operation is performed
2844 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2845 *----------------------------------------------------------------------------*/
2847 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
2849 flag aSign
, bSign
, zSign
;
2850 int16 aExp
, bExp
, expDiff
;
2852 bits64 q
, alternateASig
;
2855 aSig
= extractFloat64Frac( a
);
2856 aExp
= extractFloat64Exp( a
);
2857 aSign
= extractFloat64Sign( a
);
2858 bSig
= extractFloat64Frac( b
);
2859 bExp
= extractFloat64Exp( b
);
2860 bSign
= extractFloat64Sign( b
);
2861 if ( aExp
== 0x7FF ) {
2862 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2863 return propagateFloat64NaN( a
, b STATUS_VAR
);
2865 float_raise( float_flag_invalid STATUS_VAR
);
2866 return float64_default_nan
;
2868 if ( bExp
== 0x7FF ) {
2869 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
2874 float_raise( float_flag_invalid STATUS_VAR
);
2875 return float64_default_nan
;
2877 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2880 if ( aSig
== 0 ) return a
;
2881 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2883 expDiff
= aExp
- bExp
;
2884 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2885 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2886 if ( expDiff
< 0 ) {
2887 if ( expDiff
< -1 ) return a
;
2890 q
= ( bSig
<= aSig
);
2891 if ( q
) aSig
-= bSig
;
2893 while ( 0 < expDiff
) {
2894 q
= estimateDiv128To64( aSig
, 0, bSig
);
2895 q
= ( 2 < q
) ? q
- 2 : 0;
2896 aSig
= - ( ( bSig
>>2 ) * q
);
2900 if ( 0 < expDiff
) {
2901 q
= estimateDiv128To64( aSig
, 0, bSig
);
2902 q
= ( 2 < q
) ? q
- 2 : 0;
2905 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2912 alternateASig
= aSig
;
2915 } while ( 0 <= (sbits64
) aSig
);
2916 sigMean
= aSig
+ alternateASig
;
2917 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2918 aSig
= alternateASig
;
2920 zSign
= ( (sbits64
) aSig
< 0 );
2921 if ( zSign
) aSig
= - aSig
;
2922 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2926 /*----------------------------------------------------------------------------
2927 | Returns the square root of the double-precision floating-point value `a'.
2928 | The operation is performed according to the IEC/IEEE Standard for Binary
2929 | Floating-Point Arithmetic.
2930 *----------------------------------------------------------------------------*/
2932 float64
float64_sqrt( float64 a STATUS_PARAM
)
2936 bits64 aSig
, zSig
, doubleZSig
;
2937 bits64 rem0
, rem1
, term0
, term1
;
2939 aSig
= extractFloat64Frac( a
);
2940 aExp
= extractFloat64Exp( a
);
2941 aSign
= extractFloat64Sign( a
);
2942 if ( aExp
== 0x7FF ) {
2943 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
2944 if ( ! aSign
) return a
;
2945 float_raise( float_flag_invalid STATUS_VAR
);
2946 return float64_default_nan
;
2949 if ( ( aExp
| aSig
) == 0 ) return a
;
2950 float_raise( float_flag_invalid STATUS_VAR
);
2951 return float64_default_nan
;
2954 if ( aSig
== 0 ) return 0;
2955 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2957 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2958 aSig
|= LIT64( 0x0010000000000000 );
2959 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
2960 aSig
<<= 9 - ( aExp
& 1 );
2961 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
2962 if ( ( zSig
& 0x1FF ) <= 5 ) {
2963 doubleZSig
= zSig
<<1;
2964 mul64To128( zSig
, zSig
, &term0
, &term1
);
2965 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2966 while ( (sbits64
) rem0
< 0 ) {
2969 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
2971 zSig
|= ( ( rem0
| rem1
) != 0 );
2973 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
2977 /*----------------------------------------------------------------------------
2978 | Returns 1 if the double-precision floating-point value `a' is equal to the
2979 | corresponding value `b', and 0 otherwise. The comparison is performed
2980 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2981 *----------------------------------------------------------------------------*/
2983 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
2986 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2987 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2989 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2990 float_raise( float_flag_invalid STATUS_VAR
);
2994 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2998 /*----------------------------------------------------------------------------
2999 | Returns 1 if the double-precision floating-point value `a' is less than or
3000 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3001 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3003 *----------------------------------------------------------------------------*/
3005 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3009 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3010 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3012 float_raise( float_flag_invalid STATUS_VAR
);
3015 aSign
= extractFloat64Sign( a
);
3016 bSign
= extractFloat64Sign( b
);
3017 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3018 return ( a
== b
) || ( aSign
^ ( a
< b
) );
3022 /*----------------------------------------------------------------------------
3023 | Returns 1 if the double-precision floating-point value `a' is less than
3024 | the corresponding value `b', and 0 otherwise. The comparison is performed
3025 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3026 *----------------------------------------------------------------------------*/
3028 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3032 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3033 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3035 float_raise( float_flag_invalid STATUS_VAR
);
3038 aSign
= extractFloat64Sign( a
);
3039 bSign
= extractFloat64Sign( b
);
3040 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
3041 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
3045 /*----------------------------------------------------------------------------
3046 | Returns 1 if the double-precision floating-point value `a' is equal to the
3047 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3048 | if either operand is a NaN. Otherwise, the comparison is performed
3049 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3050 *----------------------------------------------------------------------------*/
3052 int float64_eq_signaling( float64 a
, float64 b STATUS_PARAM
)
3055 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3056 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3058 float_raise( float_flag_invalid STATUS_VAR
);
3061 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3065 /*----------------------------------------------------------------------------
3066 | Returns 1 if the double-precision floating-point value `a' is less than or
3067 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3068 | cause an exception. Otherwise, the comparison is performed according to the
3069 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3070 *----------------------------------------------------------------------------*/
3072 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3076 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3077 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3079 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3080 float_raise( float_flag_invalid STATUS_VAR
);
3084 aSign
= extractFloat64Sign( a
);
3085 bSign
= extractFloat64Sign( b
);
3086 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3087 return ( a
== b
) || ( aSign
^ ( a
< b
) );
3091 /*----------------------------------------------------------------------------
3092 | Returns 1 if the double-precision floating-point value `a' is less than
3093 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3094 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3095 | Standard for Binary Floating-Point Arithmetic.
3096 *----------------------------------------------------------------------------*/
3098 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3102 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3103 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3105 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3106 float_raise( float_flag_invalid STATUS_VAR
);
3110 aSign
= extractFloat64Sign( a
);
3111 bSign
= extractFloat64Sign( b
);
3112 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
3113 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
3119 /*----------------------------------------------------------------------------
3120 | Returns the result of converting the extended double-precision floating-
3121 | point value `a' to the 32-bit two's complement integer format. The
3122 | conversion is performed according to the IEC/IEEE Standard for Binary
3123 | Floating-Point Arithmetic---which means in particular that the conversion
3124 | is rounded according to the current rounding mode. If `a' is a NaN, the
3125 | largest positive integer is returned. Otherwise, if the conversion
3126 | overflows, the largest integer with the same sign as `a' is returned.
3127 *----------------------------------------------------------------------------*/
3129 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3132 int32 aExp
, shiftCount
;
3135 aSig
= extractFloatx80Frac( a
);
3136 aExp
= extractFloatx80Exp( a
);
3137 aSign
= extractFloatx80Sign( a
);
3138 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3139 shiftCount
= 0x4037 - aExp
;
3140 if ( shiftCount
<= 0 ) shiftCount
= 1;
3141 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3142 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3146 /*----------------------------------------------------------------------------
3147 | Returns the result of converting the extended double-precision floating-
3148 | point value `a' to the 32-bit two's complement integer format. The
3149 | conversion is performed according to the IEC/IEEE Standard for Binary
3150 | Floating-Point Arithmetic, except that the conversion is always rounded
3151 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3152 | Otherwise, if the conversion overflows, the largest integer with the same
3153 | sign as `a' is returned.
3154 *----------------------------------------------------------------------------*/
3156 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3159 int32 aExp
, shiftCount
;
3160 bits64 aSig
, savedASig
;
3163 aSig
= extractFloatx80Frac( a
);
3164 aExp
= extractFloatx80Exp( a
);
3165 aSign
= extractFloatx80Sign( a
);
3166 if ( 0x401E < aExp
) {
3167 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3170 else if ( aExp
< 0x3FFF ) {
3171 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3174 shiftCount
= 0x403E - aExp
;
3176 aSig
>>= shiftCount
;
3178 if ( aSign
) z
= - z
;
3179 if ( ( z
< 0 ) ^ aSign
) {
3181 float_raise( float_flag_invalid STATUS_VAR
);
3182 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3184 if ( ( aSig
<<shiftCount
) != savedASig
) {
3185 STATUS(float_exception_flags
) |= float_flag_inexact
;
3191 /*----------------------------------------------------------------------------
3192 | Returns the result of converting the extended double-precision floating-
3193 | point value `a' to the 64-bit two's complement integer format. The
3194 | conversion is performed according to the IEC/IEEE Standard for Binary
3195 | Floating-Point Arithmetic---which means in particular that the conversion
3196 | is rounded according to the current rounding mode. If `a' is a NaN,
3197 | the largest positive integer is returned. Otherwise, if the conversion
3198 | overflows, the largest integer with the same sign as `a' is returned.
3199 *----------------------------------------------------------------------------*/
3201 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3204 int32 aExp
, shiftCount
;
3205 bits64 aSig
, aSigExtra
;
3207 aSig
= extractFloatx80Frac( a
);
3208 aExp
= extractFloatx80Exp( a
);
3209 aSign
= extractFloatx80Sign( a
);
3210 shiftCount
= 0x403E - aExp
;
3211 if ( shiftCount
<= 0 ) {
3213 float_raise( float_flag_invalid STATUS_VAR
);
3215 || ( ( aExp
== 0x7FFF )
3216 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3218 return LIT64( 0x7FFFFFFFFFFFFFFF );
3220 return (sbits64
) LIT64( 0x8000000000000000 );
3225 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3227 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3231 /*----------------------------------------------------------------------------
3232 | Returns the result of converting the extended double-precision floating-
3233 | point value `a' to the 64-bit two's complement integer format. The
3234 | conversion is performed according to the IEC/IEEE Standard for Binary
3235 | Floating-Point Arithmetic, except that the conversion is always rounded
3236 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3237 | Otherwise, if the conversion overflows, the largest integer with the same
3238 | sign as `a' is returned.
3239 *----------------------------------------------------------------------------*/
3241 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3244 int32 aExp
, shiftCount
;
3248 aSig
= extractFloatx80Frac( a
);
3249 aExp
= extractFloatx80Exp( a
);
3250 aSign
= extractFloatx80Sign( a
);
3251 shiftCount
= aExp
- 0x403E;
3252 if ( 0 <= shiftCount
) {
3253 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3254 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3255 float_raise( float_flag_invalid STATUS_VAR
);
3256 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3257 return LIT64( 0x7FFFFFFFFFFFFFFF );
3260 return (sbits64
) LIT64( 0x8000000000000000 );
3262 else if ( aExp
< 0x3FFF ) {
3263 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3266 z
= aSig
>>( - shiftCount
);
3267 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3268 STATUS(float_exception_flags
) |= float_flag_inexact
;
3270 if ( aSign
) z
= - z
;
3275 /*----------------------------------------------------------------------------
3276 | Returns the result of converting the extended double-precision floating-
3277 | point value `a' to the single-precision floating-point format. The
3278 | conversion is performed according to the IEC/IEEE Standard for Binary
3279 | Floating-Point Arithmetic.
3280 *----------------------------------------------------------------------------*/
3282 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3288 aSig
= extractFloatx80Frac( a
);
3289 aExp
= extractFloatx80Exp( a
);
3290 aSign
= extractFloatx80Sign( a
);
3291 if ( aExp
== 0x7FFF ) {
3292 if ( (bits64
) ( aSig
<<1 ) ) {
3293 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) );
3295 return packFloat32( aSign
, 0xFF, 0 );
3297 shift64RightJamming( aSig
, 33, &aSig
);
3298 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3299 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
3303 /*----------------------------------------------------------------------------
3304 | Returns the result of converting the extended double-precision floating-
3305 | point value `a' to the double-precision floating-point format. The
3306 | conversion is performed according to the IEC/IEEE Standard for Binary
3307 | Floating-Point Arithmetic.
3308 *----------------------------------------------------------------------------*/
3310 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
3316 aSig
= extractFloatx80Frac( a
);
3317 aExp
= extractFloatx80Exp( a
);
3318 aSign
= extractFloatx80Sign( a
);
3319 if ( aExp
== 0x7FFF ) {
3320 if ( (bits64
) ( aSig
<<1 ) ) {
3321 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) );
3323 return packFloat64( aSign
, 0x7FF, 0 );
3325 shift64RightJamming( aSig
, 1, &zSig
);
3326 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3327 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
3333 /*----------------------------------------------------------------------------
3334 | Returns the result of converting the extended double-precision floating-
3335 | point value `a' to the quadruple-precision floating-point format. The
3336 | conversion is performed according to the IEC/IEEE Standard for Binary
3337 | Floating-Point Arithmetic.
3338 *----------------------------------------------------------------------------*/
3340 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
3344 bits64 aSig
, zSig0
, zSig1
;
3346 aSig
= extractFloatx80Frac( a
);
3347 aExp
= extractFloatx80Exp( a
);
3348 aSign
= extractFloatx80Sign( a
);
3349 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3350 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) );
3352 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3353 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3359 /*----------------------------------------------------------------------------
3360 | Rounds the extended double-precision floating-point value `a' to an integer,
3361 | and returns the result as an extended quadruple-precision floating-point
3362 | value. The operation is performed according to the IEC/IEEE Standard for
3363 | Binary Floating-Point Arithmetic.
3364 *----------------------------------------------------------------------------*/
3366 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
3370 bits64 lastBitMask
, roundBitsMask
;
3374 aExp
= extractFloatx80Exp( a
);
3375 if ( 0x403E <= aExp
) {
3376 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3377 return propagateFloatx80NaN( a
, a STATUS_VAR
);
3381 if ( aExp
< 0x3FFF ) {
3383 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3386 STATUS(float_exception_flags
) |= float_flag_inexact
;
3387 aSign
= extractFloatx80Sign( a
);
3388 switch ( STATUS(float_rounding_mode
) ) {
3389 case float_round_nearest_even
:
3390 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3393 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3396 case float_round_down
:
3399 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3400 : packFloatx80( 0, 0, 0 );
3401 case float_round_up
:
3403 aSign
? packFloatx80( 1, 0, 0 )
3404 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3406 return packFloatx80( aSign
, 0, 0 );
3409 lastBitMask
<<= 0x403E - aExp
;
3410 roundBitsMask
= lastBitMask
- 1;
3412 roundingMode
= STATUS(float_rounding_mode
);
3413 if ( roundingMode
== float_round_nearest_even
) {
3414 z
.low
+= lastBitMask
>>1;
3415 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3417 else if ( roundingMode
!= float_round_to_zero
) {
3418 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3419 z
.low
+= roundBitsMask
;
3422 z
.low
&= ~ roundBitsMask
;
3425 z
.low
= LIT64( 0x8000000000000000 );
3427 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3432 /*----------------------------------------------------------------------------
3433 | Returns the result of adding the absolute values of the extended double-
3434 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3435 | negated before being returned. `zSign' is ignored if the result is a NaN.
3436 | The addition is performed according to the IEC/IEEE Standard for Binary
3437 | Floating-Point Arithmetic.
3438 *----------------------------------------------------------------------------*/
3440 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
3442 int32 aExp
, bExp
, zExp
;
3443 bits64 aSig
, bSig
, zSig0
, zSig1
;
3446 aSig
= extractFloatx80Frac( a
);
3447 aExp
= extractFloatx80Exp( a
);
3448 bSig
= extractFloatx80Frac( b
);
3449 bExp
= extractFloatx80Exp( b
);
3450 expDiff
= aExp
- bExp
;
3451 if ( 0 < expDiff
) {
3452 if ( aExp
== 0x7FFF ) {
3453 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3456 if ( bExp
== 0 ) --expDiff
;
3457 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3460 else if ( expDiff
< 0 ) {
3461 if ( bExp
== 0x7FFF ) {
3462 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3463 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3465 if ( aExp
== 0 ) ++expDiff
;
3466 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3470 if ( aExp
== 0x7FFF ) {
3471 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3472 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3477 zSig0
= aSig
+ bSig
;
3479 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3485 zSig0
= aSig
+ bSig
;
3486 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3488 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3489 zSig0
|= LIT64( 0x8000000000000000 );
3493 roundAndPackFloatx80(
3494 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3498 /*----------------------------------------------------------------------------
3499 | Returns the result of subtracting the absolute values of the extended
3500 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3501 | difference is negated before being returned. `zSign' is ignored if the
3502 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3503 | Standard for Binary Floating-Point Arithmetic.
3504 *----------------------------------------------------------------------------*/
3506 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
3508 int32 aExp
, bExp
, zExp
;
3509 bits64 aSig
, bSig
, zSig0
, zSig1
;
3513 aSig
= extractFloatx80Frac( a
);
3514 aExp
= extractFloatx80Exp( a
);
3515 bSig
= extractFloatx80Frac( b
);
3516 bExp
= extractFloatx80Exp( b
);
3517 expDiff
= aExp
- bExp
;
3518 if ( 0 < expDiff
) goto aExpBigger
;
3519 if ( expDiff
< 0 ) goto bExpBigger
;
3520 if ( aExp
== 0x7FFF ) {
3521 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3522 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3524 float_raise( float_flag_invalid STATUS_VAR
);
3525 z
.low
= floatx80_default_nan_low
;
3526 z
.high
= floatx80_default_nan_high
;
3534 if ( bSig
< aSig
) goto aBigger
;
3535 if ( aSig
< bSig
) goto bBigger
;
3536 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3538 if ( bExp
== 0x7FFF ) {
3539 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3540 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3542 if ( aExp
== 0 ) ++expDiff
;
3543 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3545 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3548 goto normalizeRoundAndPack
;
3550 if ( aExp
== 0x7FFF ) {
3551 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3554 if ( bExp
== 0 ) --expDiff
;
3555 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3557 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3559 normalizeRoundAndPack
:
3561 normalizeRoundAndPackFloatx80(
3562 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3566 /*----------------------------------------------------------------------------
3567 | Returns the result of adding the extended double-precision floating-point
3568 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3569 | Standard for Binary Floating-Point Arithmetic.
3570 *----------------------------------------------------------------------------*/
3572 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
3576 aSign
= extractFloatx80Sign( a
);
3577 bSign
= extractFloatx80Sign( b
);
3578 if ( aSign
== bSign
) {
3579 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3582 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3587 /*----------------------------------------------------------------------------
3588 | Returns the result of subtracting the extended double-precision floating-
3589 | point values `a' and `b'. The operation is performed according to the
3590 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3591 *----------------------------------------------------------------------------*/
3593 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
3597 aSign
= extractFloatx80Sign( a
);
3598 bSign
= extractFloatx80Sign( b
);
3599 if ( aSign
== bSign
) {
3600 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3603 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
3608 /*----------------------------------------------------------------------------
3609 | Returns the result of multiplying the extended double-precision floating-
3610 | point values `a' and `b'. The operation is performed according to the
3611 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3612 *----------------------------------------------------------------------------*/
3614 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
3616 flag aSign
, bSign
, zSign
;
3617 int32 aExp
, bExp
, zExp
;
3618 bits64 aSig
, bSig
, zSig0
, zSig1
;
3621 aSig
= extractFloatx80Frac( a
);
3622 aExp
= extractFloatx80Exp( a
);
3623 aSign
= extractFloatx80Sign( a
);
3624 bSig
= extractFloatx80Frac( b
);
3625 bExp
= extractFloatx80Exp( b
);
3626 bSign
= extractFloatx80Sign( b
);
3627 zSign
= aSign
^ bSign
;
3628 if ( aExp
== 0x7FFF ) {
3629 if ( (bits64
) ( aSig
<<1 )
3630 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3631 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3633 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3634 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3636 if ( bExp
== 0x7FFF ) {
3637 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3638 if ( ( aExp
| aSig
) == 0 ) {
3640 float_raise( float_flag_invalid STATUS_VAR
);
3641 z
.low
= floatx80_default_nan_low
;
3642 z
.high
= floatx80_default_nan_high
;
3645 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3648 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3649 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3652 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3653 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3655 zExp
= aExp
+ bExp
- 0x3FFE;
3656 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3657 if ( 0 < (sbits64
) zSig0
) {
3658 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3662 roundAndPackFloatx80(
3663 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3667 /*----------------------------------------------------------------------------
3668 | Returns the result of dividing the extended double-precision floating-point
3669 | value `a' by the corresponding value `b'. The operation is performed
3670 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3671 *----------------------------------------------------------------------------*/
3673 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
3675 flag aSign
, bSign
, zSign
;
3676 int32 aExp
, bExp
, zExp
;
3677 bits64 aSig
, bSig
, zSig0
, zSig1
;
3678 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3681 aSig
= extractFloatx80Frac( a
);
3682 aExp
= extractFloatx80Exp( a
);
3683 aSign
= extractFloatx80Sign( a
);
3684 bSig
= extractFloatx80Frac( b
);
3685 bExp
= extractFloatx80Exp( b
);
3686 bSign
= extractFloatx80Sign( b
);
3687 zSign
= aSign
^ bSign
;
3688 if ( aExp
== 0x7FFF ) {
3689 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3690 if ( bExp
== 0x7FFF ) {
3691 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3694 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3696 if ( bExp
== 0x7FFF ) {
3697 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3698 return packFloatx80( zSign
, 0, 0 );
3702 if ( ( aExp
| aSig
) == 0 ) {
3704 float_raise( float_flag_invalid STATUS_VAR
);
3705 z
.low
= floatx80_default_nan_low
;
3706 z
.high
= floatx80_default_nan_high
;
3709 float_raise( float_flag_divbyzero STATUS_VAR
);
3710 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3712 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3715 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3716 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3718 zExp
= aExp
- bExp
+ 0x3FFE;
3720 if ( bSig
<= aSig
) {
3721 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3724 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3725 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3726 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3727 while ( (sbits64
) rem0
< 0 ) {
3729 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3731 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3732 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3733 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3734 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3735 while ( (sbits64
) rem1
< 0 ) {
3737 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3739 zSig1
|= ( ( rem1
| rem2
) != 0 );
3742 roundAndPackFloatx80(
3743 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
3747 /*----------------------------------------------------------------------------
3748 | Returns the remainder of the extended double-precision floating-point value
3749 | `a' with respect to the corresponding value `b'. The operation is performed
3750 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3751 *----------------------------------------------------------------------------*/
3753 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
3755 flag aSign
, bSign
, zSign
;
3756 int32 aExp
, bExp
, expDiff
;
3757 bits64 aSig0
, aSig1
, bSig
;
3758 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3761 aSig0
= extractFloatx80Frac( a
);
3762 aExp
= extractFloatx80Exp( a
);
3763 aSign
= extractFloatx80Sign( a
);
3764 bSig
= extractFloatx80Frac( b
);
3765 bExp
= extractFloatx80Exp( b
);
3766 bSign
= extractFloatx80Sign( b
);
3767 if ( aExp
== 0x7FFF ) {
3768 if ( (bits64
) ( aSig0
<<1 )
3769 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3770 return propagateFloatx80NaN( a
, b STATUS_VAR
);
3774 if ( bExp
== 0x7FFF ) {
3775 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
3781 float_raise( float_flag_invalid STATUS_VAR
);
3782 z
.low
= floatx80_default_nan_low
;
3783 z
.high
= floatx80_default_nan_high
;
3786 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3789 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3790 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3792 bSig
|= LIT64( 0x8000000000000000 );
3794 expDiff
= aExp
- bExp
;
3796 if ( expDiff
< 0 ) {
3797 if ( expDiff
< -1 ) return a
;
3798 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3801 q
= ( bSig
<= aSig0
);
3802 if ( q
) aSig0
-= bSig
;
3804 while ( 0 < expDiff
) {
3805 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3806 q
= ( 2 < q
) ? q
- 2 : 0;
3807 mul64To128( bSig
, q
, &term0
, &term1
);
3808 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3809 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3813 if ( 0 < expDiff
) {
3814 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3815 q
= ( 2 < q
) ? q
- 2 : 0;
3817 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3818 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3819 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3820 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3822 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3829 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3830 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3831 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3834 aSig0
= alternateASig0
;
3835 aSig1
= alternateASig1
;
3839 normalizeRoundAndPackFloatx80(
3840 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
3844 /*----------------------------------------------------------------------------
3845 | Returns the square root of the extended double-precision floating-point
3846 | value `a'. The operation is performed according to the IEC/IEEE Standard
3847 | for Binary Floating-Point Arithmetic.
3848 *----------------------------------------------------------------------------*/
3850 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
3854 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
3855 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3858 aSig0
= extractFloatx80Frac( a
);
3859 aExp
= extractFloatx80Exp( a
);
3860 aSign
= extractFloatx80Sign( a
);
3861 if ( aExp
== 0x7FFF ) {
3862 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
3863 if ( ! aSign
) return a
;
3867 if ( ( aExp
| aSig0
) == 0 ) return a
;
3869 float_raise( float_flag_invalid STATUS_VAR
);
3870 z
.low
= floatx80_default_nan_low
;
3871 z
.high
= floatx80_default_nan_high
;
3875 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
3876 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3878 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
3879 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
3880 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
3881 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
3882 doubleZSig0
= zSig0
<<1;
3883 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
3884 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
3885 while ( (sbits64
) rem0
< 0 ) {
3888 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
3890 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
3891 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3892 if ( zSig1
== 0 ) zSig1
= 1;
3893 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
3894 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3895 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
3896 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3897 while ( (sbits64
) rem1
< 0 ) {
3899 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
3901 term2
|= doubleZSig0
;
3902 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3904 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
3906 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
3907 zSig0
|= doubleZSig0
;
3909 roundAndPackFloatx80(
3910 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
3914 /*----------------------------------------------------------------------------
3915 | Returns 1 if the extended double-precision floating-point value `a' is
3916 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3917 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3919 *----------------------------------------------------------------------------*/
3921 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
3924 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3925 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3926 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3927 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3929 if ( floatx80_is_signaling_nan( a
)
3930 || floatx80_is_signaling_nan( b
) ) {
3931 float_raise( float_flag_invalid STATUS_VAR
);
3937 && ( ( a
.high
== b
.high
)
3939 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3944 /*----------------------------------------------------------------------------
3945 | Returns 1 if the extended double-precision floating-point value `a' is
3946 | less than or equal to the corresponding value `b', and 0 otherwise. The
3947 | comparison is performed according to the IEC/IEEE Standard for Binary
3948 | Floating-Point Arithmetic.
3949 *----------------------------------------------------------------------------*/
3951 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
3955 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3956 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3957 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3958 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3960 float_raise( float_flag_invalid STATUS_VAR
);
3963 aSign
= extractFloatx80Sign( a
);
3964 bSign
= extractFloatx80Sign( b
);
3965 if ( aSign
!= bSign
) {
3968 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3972 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3973 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3977 /*----------------------------------------------------------------------------
3978 | Returns 1 if the extended double-precision floating-point value `a' is
3979 | less than the corresponding value `b', and 0 otherwise. The comparison
3980 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3982 *----------------------------------------------------------------------------*/
3984 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
3988 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3989 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3990 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3991 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3993 float_raise( float_flag_invalid STATUS_VAR
);
3996 aSign
= extractFloatx80Sign( a
);
3997 bSign
= extractFloatx80Sign( b
);
3998 if ( aSign
!= bSign
) {
4001 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4005 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4006 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4010 /*----------------------------------------------------------------------------
4011 | Returns 1 if the extended double-precision floating-point value `a' is equal
4012 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4013 | raised if either operand is a NaN. Otherwise, the comparison is performed
4014 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4015 *----------------------------------------------------------------------------*/
4017 int floatx80_eq_signaling( floatx80 a
, floatx80 b STATUS_PARAM
)
4020 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4021 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4022 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4023 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4025 float_raise( float_flag_invalid STATUS_VAR
);
4030 && ( ( a
.high
== b
.high
)
4032 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4037 /*----------------------------------------------------------------------------
4038 | Returns 1 if the extended double-precision floating-point value `a' is less
4039 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4040 | do not cause an exception. Otherwise, the comparison is performed according
4041 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4042 *----------------------------------------------------------------------------*/
4044 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4048 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4049 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4050 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4051 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4053 if ( floatx80_is_signaling_nan( a
)
4054 || floatx80_is_signaling_nan( b
) ) {
4055 float_raise( float_flag_invalid STATUS_VAR
);
4059 aSign
= extractFloatx80Sign( a
);
4060 bSign
= extractFloatx80Sign( b
);
4061 if ( aSign
!= bSign
) {
4064 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4068 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4069 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4073 /*----------------------------------------------------------------------------
4074 | Returns 1 if the extended double-precision floating-point value `a' is less
4075 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4076 | an exception. Otherwise, the comparison is performed according to the
4077 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4078 *----------------------------------------------------------------------------*/
4080 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4084 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4085 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4086 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4087 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4089 if ( floatx80_is_signaling_nan( a
)
4090 || floatx80_is_signaling_nan( b
) ) {
4091 float_raise( float_flag_invalid STATUS_VAR
);
4095 aSign
= extractFloatx80Sign( a
);
4096 bSign
= extractFloatx80Sign( b
);
4097 if ( aSign
!= bSign
) {
4100 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4104 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4105 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4113 /*----------------------------------------------------------------------------
4114 | Returns the result of converting the quadruple-precision floating-point
4115 | value `a' to the 32-bit two's complement integer format. The conversion
4116 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4117 | Arithmetic---which means in particular that the conversion is rounded
4118 | according to the current rounding mode. If `a' is a NaN, the largest
4119 | positive integer is returned. Otherwise, if the conversion overflows, the
4120 | largest integer with the same sign as `a' is returned.
4121 *----------------------------------------------------------------------------*/
4123 int32
float128_to_int32( float128 a STATUS_PARAM
)
4126 int32 aExp
, shiftCount
;
4127 bits64 aSig0
, aSig1
;
4129 aSig1
= extractFloat128Frac1( a
);
4130 aSig0
= extractFloat128Frac0( a
);
4131 aExp
= extractFloat128Exp( a
);
4132 aSign
= extractFloat128Sign( a
);
4133 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4134 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4135 aSig0
|= ( aSig1
!= 0 );
4136 shiftCount
= 0x4028 - aExp
;
4137 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4138 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4142 /*----------------------------------------------------------------------------
4143 | Returns the result of converting the quadruple-precision floating-point
4144 | value `a' to the 32-bit two's complement integer format. The conversion
4145 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4146 | Arithmetic, except that the conversion is always rounded toward zero. If
4147 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4148 | conversion overflows, the largest integer with the same sign as `a' is
4150 *----------------------------------------------------------------------------*/
4152 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4155 int32 aExp
, shiftCount
;
4156 bits64 aSig0
, aSig1
, savedASig
;
4159 aSig1
= extractFloat128Frac1( a
);
4160 aSig0
= extractFloat128Frac0( a
);
4161 aExp
= extractFloat128Exp( a
);
4162 aSign
= extractFloat128Sign( a
);
4163 aSig0
|= ( aSig1
!= 0 );
4164 if ( 0x401E < aExp
) {
4165 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4168 else if ( aExp
< 0x3FFF ) {
4169 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4172 aSig0
|= LIT64( 0x0001000000000000 );
4173 shiftCount
= 0x402F - aExp
;
4175 aSig0
>>= shiftCount
;
4177 if ( aSign
) z
= - z
;
4178 if ( ( z
< 0 ) ^ aSign
) {
4180 float_raise( float_flag_invalid STATUS_VAR
);
4181 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4183 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4184 STATUS(float_exception_flags
) |= float_flag_inexact
;
4190 /*----------------------------------------------------------------------------
4191 | Returns the result of converting the quadruple-precision floating-point
4192 | value `a' to the 64-bit two's complement integer format. The conversion
4193 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4194 | Arithmetic---which means in particular that the conversion is rounded
4195 | according to the current rounding mode. If `a' is a NaN, the largest
4196 | positive integer is returned. Otherwise, if the conversion overflows, the
4197 | largest integer with the same sign as `a' is returned.
4198 *----------------------------------------------------------------------------*/
4200 int64
float128_to_int64( float128 a STATUS_PARAM
)
4203 int32 aExp
, shiftCount
;
4204 bits64 aSig0
, aSig1
;
4206 aSig1
= extractFloat128Frac1( a
);
4207 aSig0
= extractFloat128Frac0( a
);
4208 aExp
= extractFloat128Exp( a
);
4209 aSign
= extractFloat128Sign( a
);
4210 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4211 shiftCount
= 0x402F - aExp
;
4212 if ( shiftCount
<= 0 ) {
4213 if ( 0x403E < aExp
) {
4214 float_raise( float_flag_invalid STATUS_VAR
);
4216 || ( ( aExp
== 0x7FFF )
4217 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4220 return LIT64( 0x7FFFFFFFFFFFFFFF );
4222 return (sbits64
) LIT64( 0x8000000000000000 );
4224 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4227 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4229 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4233 /*----------------------------------------------------------------------------
4234 | Returns the result of converting the quadruple-precision floating-point
4235 | value `a' to the 64-bit two's complement integer format. The conversion
4236 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4237 | Arithmetic, except that the conversion is always rounded toward zero.
4238 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4239 | the conversion overflows, the largest integer with the same sign as `a' is
4241 *----------------------------------------------------------------------------*/
4243 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4246 int32 aExp
, shiftCount
;
4247 bits64 aSig0
, aSig1
;
4250 aSig1
= extractFloat128Frac1( a
);
4251 aSig0
= extractFloat128Frac0( a
);
4252 aExp
= extractFloat128Exp( a
);
4253 aSign
= extractFloat128Sign( a
);
4254 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4255 shiftCount
= aExp
- 0x402F;
4256 if ( 0 < shiftCount
) {
4257 if ( 0x403E <= aExp
) {
4258 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4259 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4260 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4261 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4264 float_raise( float_flag_invalid STATUS_VAR
);
4265 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4266 return LIT64( 0x7FFFFFFFFFFFFFFF );
4269 return (sbits64
) LIT64( 0x8000000000000000 );
4271 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4272 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4273 STATUS(float_exception_flags
) |= float_flag_inexact
;
4277 if ( aExp
< 0x3FFF ) {
4278 if ( aExp
| aSig0
| aSig1
) {
4279 STATUS(float_exception_flags
) |= float_flag_inexact
;
4283 z
= aSig0
>>( - shiftCount
);
4285 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4286 STATUS(float_exception_flags
) |= float_flag_inexact
;
4289 if ( aSign
) z
= - z
;
4294 /*----------------------------------------------------------------------------
4295 | Returns the result of converting the quadruple-precision floating-point
4296 | value `a' to the single-precision floating-point format. The conversion
4297 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4299 *----------------------------------------------------------------------------*/
4301 float32
float128_to_float32( float128 a STATUS_PARAM
)
4305 bits64 aSig0
, aSig1
;
4308 aSig1
= extractFloat128Frac1( a
);
4309 aSig0
= extractFloat128Frac0( a
);
4310 aExp
= extractFloat128Exp( a
);
4311 aSign
= extractFloat128Sign( a
);
4312 if ( aExp
== 0x7FFF ) {
4313 if ( aSig0
| aSig1
) {
4314 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) );
4316 return packFloat32( aSign
, 0xFF, 0 );
4318 aSig0
|= ( aSig1
!= 0 );
4319 shift64RightJamming( aSig0
, 18, &aSig0
);
4321 if ( aExp
|| zSig
) {
4325 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
4329 /*----------------------------------------------------------------------------
4330 | Returns the result of converting the quadruple-precision floating-point
4331 | value `a' to the double-precision floating-point format. The conversion
4332 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4334 *----------------------------------------------------------------------------*/
4336 float64
float128_to_float64( float128 a STATUS_PARAM
)
4340 bits64 aSig0
, aSig1
;
4342 aSig1
= extractFloat128Frac1( a
);
4343 aSig0
= extractFloat128Frac0( a
);
4344 aExp
= extractFloat128Exp( a
);
4345 aSign
= extractFloat128Sign( a
);
4346 if ( aExp
== 0x7FFF ) {
4347 if ( aSig0
| aSig1
) {
4348 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) );
4350 return packFloat64( aSign
, 0x7FF, 0 );
4352 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4353 aSig0
|= ( aSig1
!= 0 );
4354 if ( aExp
|| aSig0
) {
4355 aSig0
|= LIT64( 0x4000000000000000 );
4358 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
4364 /*----------------------------------------------------------------------------
4365 | Returns the result of converting the quadruple-precision floating-point
4366 | value `a' to the extended double-precision floating-point format. The
4367 | conversion is performed according to the IEC/IEEE Standard for Binary
4368 | Floating-Point Arithmetic.
4369 *----------------------------------------------------------------------------*/
4371 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
4375 bits64 aSig0
, aSig1
;
4377 aSig1
= extractFloat128Frac1( a
);
4378 aSig0
= extractFloat128Frac0( a
);
4379 aExp
= extractFloat128Exp( a
);
4380 aSign
= extractFloat128Sign( a
);
4381 if ( aExp
== 0x7FFF ) {
4382 if ( aSig0
| aSig1
) {
4383 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) );
4385 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4388 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4389 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4392 aSig0
|= LIT64( 0x0001000000000000 );
4394 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4395 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
4401 /*----------------------------------------------------------------------------
4402 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4403 | returns the result as a quadruple-precision floating-point value. The
4404 | operation is performed according to the IEC/IEEE Standard for Binary
4405 | Floating-Point Arithmetic.
4406 *----------------------------------------------------------------------------*/
4408 float128
float128_round_to_int( float128 a STATUS_PARAM
)
4412 bits64 lastBitMask
, roundBitsMask
;
4416 aExp
= extractFloat128Exp( a
);
4417 if ( 0x402F <= aExp
) {
4418 if ( 0x406F <= aExp
) {
4419 if ( ( aExp
== 0x7FFF )
4420 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4422 return propagateFloat128NaN( a
, a STATUS_VAR
);
4427 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4428 roundBitsMask
= lastBitMask
- 1;
4430 roundingMode
= STATUS(float_rounding_mode
);
4431 if ( roundingMode
== float_round_nearest_even
) {
4432 if ( lastBitMask
) {
4433 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4434 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4437 if ( (sbits64
) z
.low
< 0 ) {
4439 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4443 else if ( roundingMode
!= float_round_to_zero
) {
4444 if ( extractFloat128Sign( z
)
4445 ^ ( roundingMode
== float_round_up
) ) {
4446 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4449 z
.low
&= ~ roundBitsMask
;
4452 if ( aExp
< 0x3FFF ) {
4453 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4454 STATUS(float_exception_flags
) |= float_flag_inexact
;
4455 aSign
= extractFloat128Sign( a
);
4456 switch ( STATUS(float_rounding_mode
) ) {
4457 case float_round_nearest_even
:
4458 if ( ( aExp
== 0x3FFE )
4459 && ( extractFloat128Frac0( a
)
4460 | extractFloat128Frac1( a
) )
4462 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4465 case float_round_down
:
4467 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4468 : packFloat128( 0, 0, 0, 0 );
4469 case float_round_up
:
4471 aSign
? packFloat128( 1, 0, 0, 0 )
4472 : packFloat128( 0, 0x3FFF, 0, 0 );
4474 return packFloat128( aSign
, 0, 0, 0 );
4477 lastBitMask
<<= 0x402F - aExp
;
4478 roundBitsMask
= lastBitMask
- 1;
4481 roundingMode
= STATUS(float_rounding_mode
);
4482 if ( roundingMode
== float_round_nearest_even
) {
4483 z
.high
+= lastBitMask
>>1;
4484 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4485 z
.high
&= ~ lastBitMask
;
4488 else if ( roundingMode
!= float_round_to_zero
) {
4489 if ( extractFloat128Sign( z
)
4490 ^ ( roundingMode
== float_round_up
) ) {
4491 z
.high
|= ( a
.low
!= 0 );
4492 z
.high
+= roundBitsMask
;
4495 z
.high
&= ~ roundBitsMask
;
4497 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4498 STATUS(float_exception_flags
) |= float_flag_inexact
;
4504 /*----------------------------------------------------------------------------
4505 | Returns the result of adding the absolute values of the quadruple-precision
4506 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4507 | before being returned. `zSign' is ignored if the result is a NaN.
4508 | The addition is performed according to the IEC/IEEE Standard for Binary
4509 | Floating-Point Arithmetic.
4510 *----------------------------------------------------------------------------*/
4512 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
4514 int32 aExp
, bExp
, zExp
;
4515 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4518 aSig1
= extractFloat128Frac1( a
);
4519 aSig0
= extractFloat128Frac0( a
);
4520 aExp
= extractFloat128Exp( a
);
4521 bSig1
= extractFloat128Frac1( b
);
4522 bSig0
= extractFloat128Frac0( b
);
4523 bExp
= extractFloat128Exp( b
);
4524 expDiff
= aExp
- bExp
;
4525 if ( 0 < expDiff
) {
4526 if ( aExp
== 0x7FFF ) {
4527 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4534 bSig0
|= LIT64( 0x0001000000000000 );
4536 shift128ExtraRightJamming(
4537 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4540 else if ( expDiff
< 0 ) {
4541 if ( bExp
== 0x7FFF ) {
4542 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4543 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4549 aSig0
|= LIT64( 0x0001000000000000 );
4551 shift128ExtraRightJamming(
4552 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4556 if ( aExp
== 0x7FFF ) {
4557 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4558 return propagateFloat128NaN( a
, b STATUS_VAR
);
4562 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4563 if ( aExp
== 0 ) return packFloat128( zSign
, 0, zSig0
, zSig1
);
4565 zSig0
|= LIT64( 0x0002000000000000 );
4569 aSig0
|= LIT64( 0x0001000000000000 );
4570 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4572 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4575 shift128ExtraRightJamming(
4576 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4578 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
4582 /*----------------------------------------------------------------------------
4583 | Returns the result of subtracting the absolute values of the quadruple-
4584 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4585 | difference is negated before being returned. `zSign' is ignored if the
4586 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4587 | Standard for Binary Floating-Point Arithmetic.
4588 *----------------------------------------------------------------------------*/
4590 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
4592 int32 aExp
, bExp
, zExp
;
4593 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4597 aSig1
= extractFloat128Frac1( a
);
4598 aSig0
= extractFloat128Frac0( a
);
4599 aExp
= extractFloat128Exp( a
);
4600 bSig1
= extractFloat128Frac1( b
);
4601 bSig0
= extractFloat128Frac0( b
);
4602 bExp
= extractFloat128Exp( b
);
4603 expDiff
= aExp
- bExp
;
4604 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4605 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4606 if ( 0 < expDiff
) goto aExpBigger
;
4607 if ( expDiff
< 0 ) goto bExpBigger
;
4608 if ( aExp
== 0x7FFF ) {
4609 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4610 return propagateFloat128NaN( a
, b STATUS_VAR
);
4612 float_raise( float_flag_invalid STATUS_VAR
);
4613 z
.low
= float128_default_nan_low
;
4614 z
.high
= float128_default_nan_high
;
4621 if ( bSig0
< aSig0
) goto aBigger
;
4622 if ( aSig0
< bSig0
) goto bBigger
;
4623 if ( bSig1
< aSig1
) goto aBigger
;
4624 if ( aSig1
< bSig1
) goto bBigger
;
4625 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
4627 if ( bExp
== 0x7FFF ) {
4628 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4629 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4635 aSig0
|= LIT64( 0x4000000000000000 );
4637 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4638 bSig0
|= LIT64( 0x4000000000000000 );
4640 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4643 goto normalizeRoundAndPack
;
4645 if ( aExp
== 0x7FFF ) {
4646 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4653 bSig0
|= LIT64( 0x4000000000000000 );
4655 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4656 aSig0
|= LIT64( 0x4000000000000000 );
4658 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4660 normalizeRoundAndPack
:
4662 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
4666 /*----------------------------------------------------------------------------
4667 | Returns the result of adding the quadruple-precision floating-point values
4668 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4669 | for Binary Floating-Point Arithmetic.
4670 *----------------------------------------------------------------------------*/
4672 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
4676 aSign
= extractFloat128Sign( a
);
4677 bSign
= extractFloat128Sign( b
);
4678 if ( aSign
== bSign
) {
4679 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4682 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4687 /*----------------------------------------------------------------------------
4688 | Returns the result of subtracting the quadruple-precision floating-point
4689 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4690 | Standard for Binary Floating-Point Arithmetic.
4691 *----------------------------------------------------------------------------*/
4693 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
4697 aSign
= extractFloat128Sign( a
);
4698 bSign
= extractFloat128Sign( b
);
4699 if ( aSign
== bSign
) {
4700 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4703 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
4708 /*----------------------------------------------------------------------------
4709 | Returns the result of multiplying the quadruple-precision floating-point
4710 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4711 | Standard for Binary Floating-Point Arithmetic.
4712 *----------------------------------------------------------------------------*/
4714 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
4716 flag aSign
, bSign
, zSign
;
4717 int32 aExp
, bExp
, zExp
;
4718 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
4721 aSig1
= extractFloat128Frac1( a
);
4722 aSig0
= extractFloat128Frac0( a
);
4723 aExp
= extractFloat128Exp( a
);
4724 aSign
= extractFloat128Sign( a
);
4725 bSig1
= extractFloat128Frac1( b
);
4726 bSig0
= extractFloat128Frac0( b
);
4727 bExp
= extractFloat128Exp( b
);
4728 bSign
= extractFloat128Sign( b
);
4729 zSign
= aSign
^ bSign
;
4730 if ( aExp
== 0x7FFF ) {
4731 if ( ( aSig0
| aSig1
)
4732 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4733 return propagateFloat128NaN( a
, b STATUS_VAR
);
4735 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
4736 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4738 if ( bExp
== 0x7FFF ) {
4739 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4740 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4742 float_raise( float_flag_invalid STATUS_VAR
);
4743 z
.low
= float128_default_nan_low
;
4744 z
.high
= float128_default_nan_high
;
4747 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4750 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4751 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4754 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4755 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4757 zExp
= aExp
+ bExp
- 0x4000;
4758 aSig0
|= LIT64( 0x0001000000000000 );
4759 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
4760 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
4761 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4762 zSig2
|= ( zSig3
!= 0 );
4763 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
4764 shift128ExtraRightJamming(
4765 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4768 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
4772 /*----------------------------------------------------------------------------
4773 | Returns the result of dividing the quadruple-precision floating-point value
4774 | `a' by the corresponding value `b'. The operation is performed according to
4775 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4776 *----------------------------------------------------------------------------*/
4778 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
4780 flag aSign
, bSign
, zSign
;
4781 int32 aExp
, bExp
, zExp
;
4782 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4783 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4786 aSig1
= extractFloat128Frac1( a
);
4787 aSig0
= extractFloat128Frac0( a
);
4788 aExp
= extractFloat128Exp( a
);
4789 aSign
= extractFloat128Sign( a
);
4790 bSig1
= extractFloat128Frac1( b
);
4791 bSig0
= extractFloat128Frac0( b
);
4792 bExp
= extractFloat128Exp( b
);
4793 bSign
= extractFloat128Sign( b
);
4794 zSign
= aSign
^ bSign
;
4795 if ( aExp
== 0x7FFF ) {
4796 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4797 if ( bExp
== 0x7FFF ) {
4798 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4801 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4803 if ( bExp
== 0x7FFF ) {
4804 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4805 return packFloat128( zSign
, 0, 0, 0 );
4808 if ( ( bSig0
| bSig1
) == 0 ) {
4809 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4811 float_raise( float_flag_invalid STATUS_VAR
);
4812 z
.low
= float128_default_nan_low
;
4813 z
.high
= float128_default_nan_high
;
4816 float_raise( float_flag_divbyzero STATUS_VAR
);
4817 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4819 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4822 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4823 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4825 zExp
= aExp
- bExp
+ 0x3FFD;
4827 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
4829 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
4830 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
4831 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
4834 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4835 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
4836 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
4837 while ( (sbits64
) rem0
< 0 ) {
4839 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
4841 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
4842 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
4843 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
4844 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
4845 while ( (sbits64
) rem1
< 0 ) {
4847 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
4849 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4851 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
4852 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
4856 /*----------------------------------------------------------------------------
4857 | Returns the remainder of the quadruple-precision floating-point value `a'
4858 | with respect to the corresponding value `b'. The operation is performed
4859 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4860 *----------------------------------------------------------------------------*/
4862 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
4864 flag aSign
, bSign
, zSign
;
4865 int32 aExp
, bExp
, expDiff
;
4866 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
4867 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
4871 aSig1
= extractFloat128Frac1( a
);
4872 aSig0
= extractFloat128Frac0( a
);
4873 aExp
= extractFloat128Exp( a
);
4874 aSign
= extractFloat128Sign( a
);
4875 bSig1
= extractFloat128Frac1( b
);
4876 bSig0
= extractFloat128Frac0( b
);
4877 bExp
= extractFloat128Exp( b
);
4878 bSign
= extractFloat128Sign( b
);
4879 if ( aExp
== 0x7FFF ) {
4880 if ( ( aSig0
| aSig1
)
4881 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4882 return propagateFloat128NaN( a
, b STATUS_VAR
);
4886 if ( bExp
== 0x7FFF ) {
4887 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
4891 if ( ( bSig0
| bSig1
) == 0 ) {
4893 float_raise( float_flag_invalid STATUS_VAR
);
4894 z
.low
= float128_default_nan_low
;
4895 z
.high
= float128_default_nan_high
;
4898 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4901 if ( ( aSig0
| aSig1
) == 0 ) return a
;
4902 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4904 expDiff
= aExp
- bExp
;
4905 if ( expDiff
< -1 ) return a
;
4907 aSig0
| LIT64( 0x0001000000000000 ),
4909 15 - ( expDiff
< 0 ),
4914 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
4915 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
4916 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
4918 while ( 0 < expDiff
) {
4919 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4920 q
= ( 4 < q
) ? q
- 4 : 0;
4921 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
4922 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
4923 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
4924 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
4927 if ( -64 < expDiff
) {
4928 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4929 q
= ( 4 < q
) ? q
- 4 : 0;
4931 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
4933 if ( expDiff
< 0 ) {
4934 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4937 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
4939 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
4940 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
4943 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
4944 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
4947 alternateASig0
= aSig0
;
4948 alternateASig1
= aSig1
;
4950 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
4951 } while ( 0 <= (sbits64
) aSig0
);
4953 aSig0
, aSig1
, alternateASig0
, alternateASig1
, &sigMean0
, &sigMean1
);
4954 if ( ( sigMean0
< 0 )
4955 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
4956 aSig0
= alternateASig0
;
4957 aSig1
= alternateASig1
;
4959 zSign
= ( (sbits64
) aSig0
< 0 );
4960 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
4962 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
4966 /*----------------------------------------------------------------------------
4967 | Returns the square root of the quadruple-precision floating-point value `a'.
4968 | The operation is performed according to the IEC/IEEE Standard for Binary
4969 | Floating-Point Arithmetic.
4970 *----------------------------------------------------------------------------*/
4972 float128
float128_sqrt( float128 a STATUS_PARAM
)
4976 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
4977 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4980 aSig1
= extractFloat128Frac1( a
);
4981 aSig0
= extractFloat128Frac0( a
);
4982 aExp
= extractFloat128Exp( a
);
4983 aSign
= extractFloat128Sign( a
);
4984 if ( aExp
== 0x7FFF ) {
4985 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
4986 if ( ! aSign
) return a
;
4990 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
4992 float_raise( float_flag_invalid STATUS_VAR
);
4993 z
.low
= float128_default_nan_low
;
4994 z
.high
= float128_default_nan_high
;
4998 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
4999 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5001 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5002 aSig0
|= LIT64( 0x0001000000000000 );
5003 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5004 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5005 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5006 doubleZSig0
= zSig0
<<1;
5007 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5008 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5009 while ( (sbits64
) rem0
< 0 ) {
5012 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5014 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5015 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5016 if ( zSig1
== 0 ) zSig1
= 1;
5017 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5018 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5019 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5020 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5021 while ( (sbits64
) rem1
< 0 ) {
5023 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5025 term2
|= doubleZSig0
;
5026 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5028 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5030 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5031 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5035 /*----------------------------------------------------------------------------
5036 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5037 | the corresponding value `b', and 0 otherwise. The comparison is performed
5038 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5039 *----------------------------------------------------------------------------*/
5041 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5044 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5045 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5046 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5047 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5049 if ( float128_is_signaling_nan( a
)
5050 || float128_is_signaling_nan( b
) ) {
5051 float_raise( float_flag_invalid STATUS_VAR
);
5057 && ( ( a
.high
== b
.high
)
5059 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5064 /*----------------------------------------------------------------------------
5065 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5066 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5067 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5069 *----------------------------------------------------------------------------*/
5071 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5075 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5076 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5077 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5078 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5080 float_raise( float_flag_invalid STATUS_VAR
);
5083 aSign
= extractFloat128Sign( a
);
5084 bSign
= extractFloat128Sign( b
);
5085 if ( aSign
!= bSign
) {
5088 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5092 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5093 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5097 /*----------------------------------------------------------------------------
5098 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5099 | the corresponding value `b', and 0 otherwise. The comparison is performed
5100 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5101 *----------------------------------------------------------------------------*/
5103 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5107 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5108 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5109 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5110 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5112 float_raise( float_flag_invalid STATUS_VAR
);
5115 aSign
= extractFloat128Sign( a
);
5116 bSign
= extractFloat128Sign( b
);
5117 if ( aSign
!= bSign
) {
5120 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5124 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5125 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5129 /*----------------------------------------------------------------------------
5130 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5131 | the corresponding value `b', and 0 otherwise. The invalid exception is
5132 | raised if either operand is a NaN. Otherwise, the comparison is performed
5133 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5134 *----------------------------------------------------------------------------*/
5136 int float128_eq_signaling( float128 a
, float128 b STATUS_PARAM
)
5139 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5140 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5141 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5142 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5144 float_raise( float_flag_invalid STATUS_VAR
);
5149 && ( ( a
.high
== b
.high
)
5151 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5156 /*----------------------------------------------------------------------------
5157 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5158 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5159 | cause an exception. Otherwise, the comparison is performed according to the
5160 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5161 *----------------------------------------------------------------------------*/
5163 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5167 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5168 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5169 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5170 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5172 if ( float128_is_signaling_nan( a
)
5173 || float128_is_signaling_nan( b
) ) {
5174 float_raise( float_flag_invalid STATUS_VAR
);
5178 aSign
= extractFloat128Sign( a
);
5179 bSign
= extractFloat128Sign( b
);
5180 if ( aSign
!= bSign
) {
5183 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5187 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5188 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5192 /*----------------------------------------------------------------------------
5193 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5194 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5195 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5196 | Standard for Binary Floating-Point Arithmetic.
5197 *----------------------------------------------------------------------------*/
5199 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5203 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5204 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5205 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5206 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5208 if ( float128_is_signaling_nan( a
)
5209 || float128_is_signaling_nan( b
) ) {
5210 float_raise( float_flag_invalid STATUS_VAR
);
5214 aSign
= extractFloat128Sign( a
);
5215 bSign
= extractFloat128Sign( b
);
5216 if ( aSign
!= bSign
) {
5219 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5223 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5224 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5230 /* misc functions */
5231 float32
uint32_to_float32( unsigned int a STATUS_PARAM
)
5233 return int64_to_float32(a STATUS_VAR
);
5236 float64
uint32_to_float64( unsigned int a STATUS_PARAM
)
5238 return int64_to_float64(a STATUS_VAR
);
5241 unsigned int float32_to_uint32( float32 a STATUS_PARAM
)
5246 v
= float32_to_int64(a STATUS_VAR
);
5249 float_raise( float_flag_invalid STATUS_VAR
);
5250 } else if (v
> 0xffffffff) {
5252 float_raise( float_flag_invalid STATUS_VAR
);
5259 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
5264 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5267 float_raise( float_flag_invalid STATUS_VAR
);
5268 } else if (v
> 0xffffffff) {
5270 float_raise( float_flag_invalid STATUS_VAR
);
5277 unsigned int float64_to_uint32( float64 a STATUS_PARAM
)
5282 v
= float64_to_int64(a STATUS_VAR
);
5285 float_raise( float_flag_invalid STATUS_VAR
);
5286 } else if (v
> 0xffffffff) {
5288 float_raise( float_flag_invalid STATUS_VAR
);
5295 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
5300 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5303 float_raise( float_flag_invalid STATUS_VAR
);
5304 } else if (v
> 0xffffffff) {
5306 float_raise( float_flag_invalid STATUS_VAR
);
5313 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
5317 v
= int64_to_float64(INT64_MIN STATUS_VAR
);
5318 v
= float64_to_int64((a
+ v
) STATUS_VAR
);
5320 return v
- INT64_MIN
;
5323 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
5327 v
= int64_to_float64(INT64_MIN STATUS_VAR
);
5328 v
= float64_to_int64_round_to_zero((a
+ v
) STATUS_VAR
);
5330 return v
- INT64_MIN
;
5333 #define COMPARE(s, nan_exp) \
5334 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5335 int is_quiet STATUS_PARAM ) \
5337 flag aSign, bSign; \
5339 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5340 extractFloat ## s ## Frac( a ) ) || \
5341 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5342 extractFloat ## s ## Frac( b ) )) { \
5344 float ## s ## _is_signaling_nan( a ) || \
5345 float ## s ## _is_signaling_nan( b ) ) { \
5346 float_raise( float_flag_invalid STATUS_VAR); \
5348 return float_relation_unordered; \
5350 aSign = extractFloat ## s ## Sign( a ); \
5351 bSign = extractFloat ## s ## Sign( b ); \
5352 if ( aSign != bSign ) { \
5353 if ( (bits ## s) ( ( a | b )<<1 ) == 0 ) { \
5355 return float_relation_equal; \
5357 return 1 - (2 * aSign); \
5361 return float_relation_equal; \
5363 return 1 - 2 * (aSign ^ ( a < b )); \
5368 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5370 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5373 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
5375 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \