2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
33 #include "softfloat.h"
35 /*----------------------------------------------------------------------------
36 | Primitive arithmetic functions, including multi-word arithmetic, and
37 | division and square root approximations. (Can be specialized to target if
39 *----------------------------------------------------------------------------*/
40 #include "softfloat-macros.h"
42 /*----------------------------------------------------------------------------
43 | Functions and definitions to determine: (1) whether tininess for underflow
44 | is detected before or after rounding by default, (2) what (if anything)
45 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
46 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
47 | are propagated from function inputs to output. These details are target-
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-specialize.h"
52 void set_float_rounding_mode(int val STATUS_PARAM
)
54 STATUS(float_rounding_mode
) = val
;
57 void set_float_exception_flags(int val STATUS_PARAM
)
59 STATUS(float_exception_flags
) = val
;
63 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
65 STATUS(floatx80_rounding_precision
) = val
;
69 /*----------------------------------------------------------------------------
70 | Returns the fraction bits of the half-precision floating-point value `a'.
71 *----------------------------------------------------------------------------*/
73 INLINE
uint32_t extractFloat16Frac(float16 a
)
75 return float16_val(a
) & 0x3ff;
78 /*----------------------------------------------------------------------------
79 | Returns the exponent bits of the half-precision floating-point value `a'.
80 *----------------------------------------------------------------------------*/
82 INLINE int16
extractFloat16Exp(float16 a
)
84 return (float16_val(a
) >> 10) & 0x1f;
87 /*----------------------------------------------------------------------------
88 | Returns the sign bit of the single-precision floating-point value `a'.
89 *----------------------------------------------------------------------------*/
91 INLINE flag
extractFloat16Sign(float16 a
)
93 return float16_val(a
)>>15;
96 /*----------------------------------------------------------------------------
97 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
98 | and 7, and returns the properly rounded 32-bit integer corresponding to the
99 | input. If `zSign' is 1, the input is negated before being converted to an
100 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
101 | is simply rounded to an integer, with the inexact exception raised if the
102 | input cannot be represented exactly as an integer. However, if the fixed-
103 | point input is too large, the invalid exception is raised and the largest
104 | positive or negative integer is returned.
105 *----------------------------------------------------------------------------*/
107 static int32
roundAndPackInt32( flag zSign
, bits64 absZ STATUS_PARAM
)
110 flag roundNearestEven
;
111 int8 roundIncrement
, roundBits
;
114 roundingMode
= STATUS(float_rounding_mode
);
115 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
116 roundIncrement
= 0x40;
117 if ( ! roundNearestEven
) {
118 if ( roundingMode
== float_round_to_zero
) {
122 roundIncrement
= 0x7F;
124 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
127 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
131 roundBits
= absZ
& 0x7F;
132 absZ
= ( absZ
+ roundIncrement
)>>7;
133 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
135 if ( zSign
) z
= - z
;
136 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
137 float_raise( float_flag_invalid STATUS_VAR
);
138 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
140 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
145 /*----------------------------------------------------------------------------
146 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
147 | `absZ1', with binary point between bits 63 and 64 (between the input words),
148 | and returns the properly rounded 64-bit integer corresponding to the input.
149 | If `zSign' is 1, the input is negated before being converted to an integer.
150 | Ordinarily, the fixed-point input is simply rounded to an integer, with
151 | the inexact exception raised if the input cannot be represented exactly as
152 | an integer. However, if the fixed-point input is too large, the invalid
153 | exception is raised and the largest positive or negative integer is
155 *----------------------------------------------------------------------------*/
157 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1 STATUS_PARAM
)
160 flag roundNearestEven
, increment
;
163 roundingMode
= STATUS(float_rounding_mode
);
164 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
165 increment
= ( (sbits64
) absZ1
< 0 );
166 if ( ! roundNearestEven
) {
167 if ( roundingMode
== float_round_to_zero
) {
172 increment
= ( roundingMode
== float_round_down
) && absZ1
;
175 increment
= ( roundingMode
== float_round_up
) && absZ1
;
181 if ( absZ0
== 0 ) goto overflow
;
182 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
185 if ( zSign
) z
= - z
;
186 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
188 float_raise( float_flag_invalid STATUS_VAR
);
190 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
191 : LIT64( 0x7FFFFFFFFFFFFFFF );
193 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
198 /*----------------------------------------------------------------------------
199 | Returns the fraction bits of the single-precision floating-point value `a'.
200 *----------------------------------------------------------------------------*/
202 INLINE bits32
extractFloat32Frac( float32 a
)
205 return float32_val(a
) & 0x007FFFFF;
209 /*----------------------------------------------------------------------------
210 | Returns the exponent bits of the single-precision floating-point value `a'.
211 *----------------------------------------------------------------------------*/
213 INLINE int16
extractFloat32Exp( float32 a
)
216 return ( float32_val(a
)>>23 ) & 0xFF;
220 /*----------------------------------------------------------------------------
221 | Returns the sign bit of the single-precision floating-point value `a'.
222 *----------------------------------------------------------------------------*/
224 INLINE flag
extractFloat32Sign( float32 a
)
227 return float32_val(a
)>>31;
231 /*----------------------------------------------------------------------------
232 | If `a' is denormal and we are in flush-to-zero mode then set the
233 | input-denormal exception and return zero. Otherwise just return the value.
234 *----------------------------------------------------------------------------*/
235 static float32
float32_squash_input_denormal(float32 a STATUS_PARAM
)
237 if (STATUS(flush_inputs_to_zero
)) {
238 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
239 float_raise(float_flag_input_denormal STATUS_VAR
);
240 return make_float32(float32_val(a
) & 0x80000000);
246 /*----------------------------------------------------------------------------
247 | Normalizes the subnormal single-precision floating-point value represented
248 | by the denormalized significand `aSig'. The normalized exponent and
249 | significand are stored at the locations pointed to by `zExpPtr' and
250 | `zSigPtr', respectively.
251 *----------------------------------------------------------------------------*/
254 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
258 shiftCount
= countLeadingZeros32( aSig
) - 8;
259 *zSigPtr
= aSig
<<shiftCount
;
260 *zExpPtr
= 1 - shiftCount
;
264 /*----------------------------------------------------------------------------
265 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
266 | single-precision floating-point value, returning the result. After being
267 | shifted into the proper positions, the three fields are simply added
268 | together to form the result. This means that any integer portion of `zSig'
269 | will be added into the exponent. Since a properly normalized significand
270 | will have an integer portion equal to 1, the `zExp' input should be 1 less
271 | than the desired result exponent whenever `zSig' is a complete, normalized
273 *----------------------------------------------------------------------------*/
275 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
279 ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
);
283 /*----------------------------------------------------------------------------
284 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
285 | and significand `zSig', and returns the proper single-precision floating-
286 | point value corresponding to the abstract input. Ordinarily, the abstract
287 | value is simply rounded and packed into the single-precision format, with
288 | the inexact exception raised if the abstract input cannot be represented
289 | exactly. However, if the abstract value is too large, the overflow and
290 | inexact exceptions are raised and an infinity or maximal finite value is
291 | returned. If the abstract value is too small, the input value is rounded to
292 | a subnormal number, and the underflow and inexact exceptions are raised if
293 | the abstract input cannot be represented exactly as a subnormal single-
294 | precision floating-point number.
295 | The input significand `zSig' has its binary point between bits 30
296 | and 29, which is 7 bits to the left of the usual location. This shifted
297 | significand must be normalized or smaller. If `zSig' is not normalized,
298 | `zExp' must be 0; in that case, the result returned is a subnormal number,
299 | and it must not require rounding. In the usual case that `zSig' is
300 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
301 | The handling of underflow and overflow follows the IEC/IEEE Standard for
302 | Binary Floating-Point Arithmetic.
303 *----------------------------------------------------------------------------*/
305 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
308 flag roundNearestEven
;
309 int8 roundIncrement
, roundBits
;
312 roundingMode
= STATUS(float_rounding_mode
);
313 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
314 roundIncrement
= 0x40;
315 if ( ! roundNearestEven
) {
316 if ( roundingMode
== float_round_to_zero
) {
320 roundIncrement
= 0x7F;
322 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
325 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
329 roundBits
= zSig
& 0x7F;
330 if ( 0xFD <= (bits16
) zExp
) {
332 || ( ( zExp
== 0xFD )
333 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
335 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
336 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
339 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
341 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
343 || ( zSig
+ roundIncrement
< 0x80000000 );
344 shift32RightJamming( zSig
, - zExp
, &zSig
);
346 roundBits
= zSig
& 0x7F;
347 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
350 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
351 zSig
= ( zSig
+ roundIncrement
)>>7;
352 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
353 if ( zSig
== 0 ) zExp
= 0;
354 return packFloat32( zSign
, zExp
, zSig
);
358 /*----------------------------------------------------------------------------
359 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
360 | and significand `zSig', and returns the proper single-precision floating-
361 | point value corresponding to the abstract input. This routine is just like
362 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
363 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
364 | floating-point exponent.
365 *----------------------------------------------------------------------------*/
368 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig STATUS_PARAM
)
372 shiftCount
= countLeadingZeros32( zSig
) - 1;
373 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
377 /*----------------------------------------------------------------------------
378 | Returns the fraction bits of the double-precision floating-point value `a'.
379 *----------------------------------------------------------------------------*/
381 INLINE bits64
extractFloat64Frac( float64 a
)
384 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
388 /*----------------------------------------------------------------------------
389 | Returns the exponent bits of the double-precision floating-point value `a'.
390 *----------------------------------------------------------------------------*/
392 INLINE int16
extractFloat64Exp( float64 a
)
395 return ( float64_val(a
)>>52 ) & 0x7FF;
399 /*----------------------------------------------------------------------------
400 | Returns the sign bit of the double-precision floating-point value `a'.
401 *----------------------------------------------------------------------------*/
403 INLINE flag
extractFloat64Sign( float64 a
)
406 return float64_val(a
)>>63;
410 /*----------------------------------------------------------------------------
411 | If `a' is denormal and we are in flush-to-zero mode then set the
412 | input-denormal exception and return zero. Otherwise just return the value.
413 *----------------------------------------------------------------------------*/
414 static float64
float64_squash_input_denormal(float64 a STATUS_PARAM
)
416 if (STATUS(flush_inputs_to_zero
)) {
417 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
418 float_raise(float_flag_input_denormal STATUS_VAR
);
419 return make_float64(float64_val(a
) & (1ULL << 63));
425 /*----------------------------------------------------------------------------
426 | Normalizes the subnormal double-precision floating-point value represented
427 | by the denormalized significand `aSig'. The normalized exponent and
428 | significand are stored at the locations pointed to by `zExpPtr' and
429 | `zSigPtr', respectively.
430 *----------------------------------------------------------------------------*/
433 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
437 shiftCount
= countLeadingZeros64( aSig
) - 11;
438 *zSigPtr
= aSig
<<shiftCount
;
439 *zExpPtr
= 1 - shiftCount
;
443 /*----------------------------------------------------------------------------
444 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
445 | double-precision floating-point value, returning the result. After being
446 | shifted into the proper positions, the three fields are simply added
447 | together to form the result. This means that any integer portion of `zSig'
448 | will be added into the exponent. Since a properly normalized significand
449 | will have an integer portion equal to 1, the `zExp' input should be 1 less
450 | than the desired result exponent whenever `zSig' is a complete, normalized
452 *----------------------------------------------------------------------------*/
454 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
458 ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
);
462 /*----------------------------------------------------------------------------
463 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
464 | and significand `zSig', and returns the proper double-precision floating-
465 | point value corresponding to the abstract input. Ordinarily, the abstract
466 | value is simply rounded and packed into the double-precision format, with
467 | the inexact exception raised if the abstract input cannot be represented
468 | exactly. However, if the abstract value is too large, the overflow and
469 | inexact exceptions are raised and an infinity or maximal finite value is
470 | returned. If the abstract value is too small, the input value is rounded
471 | to a subnormal number, and the underflow and inexact exceptions are raised
472 | if the abstract input cannot be represented exactly as a subnormal double-
473 | precision floating-point number.
474 | The input significand `zSig' has its binary point between bits 62
475 | and 61, which is 10 bits to the left of the usual location. This shifted
476 | significand must be normalized or smaller. If `zSig' is not normalized,
477 | `zExp' must be 0; in that case, the result returned is a subnormal number,
478 | and it must not require rounding. In the usual case that `zSig' is
479 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
480 | The handling of underflow and overflow follows the IEC/IEEE Standard for
481 | Binary Floating-Point Arithmetic.
482 *----------------------------------------------------------------------------*/
484 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
487 flag roundNearestEven
;
488 int16 roundIncrement
, roundBits
;
491 roundingMode
= STATUS(float_rounding_mode
);
492 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
493 roundIncrement
= 0x200;
494 if ( ! roundNearestEven
) {
495 if ( roundingMode
== float_round_to_zero
) {
499 roundIncrement
= 0x3FF;
501 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
504 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
508 roundBits
= zSig
& 0x3FF;
509 if ( 0x7FD <= (bits16
) zExp
) {
510 if ( ( 0x7FD < zExp
)
511 || ( ( zExp
== 0x7FD )
512 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
514 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
515 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
518 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
520 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
522 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
523 shift64RightJamming( zSig
, - zExp
, &zSig
);
525 roundBits
= zSig
& 0x3FF;
526 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
529 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
530 zSig
= ( zSig
+ roundIncrement
)>>10;
531 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
532 if ( zSig
== 0 ) zExp
= 0;
533 return packFloat64( zSign
, zExp
, zSig
);
537 /*----------------------------------------------------------------------------
538 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
539 | and significand `zSig', and returns the proper double-precision floating-
540 | point value corresponding to the abstract input. This routine is just like
541 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
542 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
543 | floating-point exponent.
544 *----------------------------------------------------------------------------*/
547 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig STATUS_PARAM
)
551 shiftCount
= countLeadingZeros64( zSig
) - 1;
552 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
558 /*----------------------------------------------------------------------------
559 | Returns the fraction bits of the extended double-precision floating-point
561 *----------------------------------------------------------------------------*/
563 INLINE bits64
extractFloatx80Frac( floatx80 a
)
570 /*----------------------------------------------------------------------------
571 | Returns the exponent bits of the extended double-precision floating-point
573 *----------------------------------------------------------------------------*/
575 INLINE int32
extractFloatx80Exp( floatx80 a
)
578 return a
.high
& 0x7FFF;
582 /*----------------------------------------------------------------------------
583 | Returns the sign bit of the extended double-precision floating-point value
585 *----------------------------------------------------------------------------*/
587 INLINE flag
extractFloatx80Sign( floatx80 a
)
594 /*----------------------------------------------------------------------------
595 | Normalizes the subnormal extended double-precision floating-point value
596 | represented by the denormalized significand `aSig'. The normalized exponent
597 | and significand are stored at the locations pointed to by `zExpPtr' and
598 | `zSigPtr', respectively.
599 *----------------------------------------------------------------------------*/
602 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
606 shiftCount
= countLeadingZeros64( aSig
);
607 *zSigPtr
= aSig
<<shiftCount
;
608 *zExpPtr
= 1 - shiftCount
;
612 /*----------------------------------------------------------------------------
613 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
614 | extended double-precision floating-point value, returning the result.
615 *----------------------------------------------------------------------------*/
617 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
622 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
627 /*----------------------------------------------------------------------------
628 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
629 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
630 | and returns the proper extended double-precision floating-point value
631 | corresponding to the abstract input. Ordinarily, the abstract value is
632 | rounded and packed into the extended double-precision format, with the
633 | inexact exception raised if the abstract input cannot be represented
634 | exactly. However, if the abstract value is too large, the overflow and
635 | inexact exceptions are raised and an infinity or maximal finite value is
636 | returned. If the abstract value is too small, the input value is rounded to
637 | a subnormal number, and the underflow and inexact exceptions are raised if
638 | the abstract input cannot be represented exactly as a subnormal extended
639 | double-precision floating-point number.
640 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
641 | number of bits as single or double precision, respectively. Otherwise, the
642 | result is rounded to the full precision of the extended double-precision
644 | The input significand must be normalized or smaller. If the input
645 | significand is not normalized, `zExp' must be 0; in that case, the result
646 | returned is a subnormal number, and it must not require rounding. The
647 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
648 | Floating-Point Arithmetic.
649 *----------------------------------------------------------------------------*/
652 roundAndPackFloatx80(
653 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
657 flag roundNearestEven
, increment
, isTiny
;
658 int64 roundIncrement
, roundMask
, roundBits
;
660 roundingMode
= STATUS(float_rounding_mode
);
661 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
662 if ( roundingPrecision
== 80 ) goto precision80
;
663 if ( roundingPrecision
== 64 ) {
664 roundIncrement
= LIT64( 0x0000000000000400 );
665 roundMask
= LIT64( 0x00000000000007FF );
667 else if ( roundingPrecision
== 32 ) {
668 roundIncrement
= LIT64( 0x0000008000000000 );
669 roundMask
= LIT64( 0x000000FFFFFFFFFF );
674 zSig0
|= ( zSig1
!= 0 );
675 if ( ! roundNearestEven
) {
676 if ( roundingMode
== float_round_to_zero
) {
680 roundIncrement
= roundMask
;
682 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
685 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
689 roundBits
= zSig0
& roundMask
;
690 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
691 if ( ( 0x7FFE < zExp
)
692 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
697 if ( STATUS(flush_to_zero
) ) return packFloatx80( zSign
, 0, 0 );
699 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
701 || ( zSig0
<= zSig0
+ roundIncrement
);
702 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
704 roundBits
= zSig0
& roundMask
;
705 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
706 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
707 zSig0
+= roundIncrement
;
708 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
709 roundIncrement
= roundMask
+ 1;
710 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
711 roundMask
|= roundIncrement
;
713 zSig0
&= ~ roundMask
;
714 return packFloatx80( zSign
, zExp
, zSig0
);
717 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
718 zSig0
+= roundIncrement
;
719 if ( zSig0
< roundIncrement
) {
721 zSig0
= LIT64( 0x8000000000000000 );
723 roundIncrement
= roundMask
+ 1;
724 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
725 roundMask
|= roundIncrement
;
727 zSig0
&= ~ roundMask
;
728 if ( zSig0
== 0 ) zExp
= 0;
729 return packFloatx80( zSign
, zExp
, zSig0
);
731 increment
= ( (sbits64
) zSig1
< 0 );
732 if ( ! roundNearestEven
) {
733 if ( roundingMode
== float_round_to_zero
) {
738 increment
= ( roundingMode
== float_round_down
) && zSig1
;
741 increment
= ( roundingMode
== float_round_up
) && zSig1
;
745 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
746 if ( ( 0x7FFE < zExp
)
747 || ( ( zExp
== 0x7FFE )
748 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
754 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
755 if ( ( roundingMode
== float_round_to_zero
)
756 || ( zSign
&& ( roundingMode
== float_round_up
) )
757 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
759 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
761 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
765 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
768 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
769 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
771 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
772 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
773 if ( roundNearestEven
) {
774 increment
= ( (sbits64
) zSig1
< 0 );
778 increment
= ( roundingMode
== float_round_down
) && zSig1
;
781 increment
= ( roundingMode
== float_round_up
) && zSig1
;
787 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
788 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
790 return packFloatx80( zSign
, zExp
, zSig0
);
793 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
798 zSig0
= LIT64( 0x8000000000000000 );
801 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
805 if ( zSig0
== 0 ) zExp
= 0;
807 return packFloatx80( zSign
, zExp
, zSig0
);
811 /*----------------------------------------------------------------------------
812 | Takes an abstract floating-point value having sign `zSign', exponent
813 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
814 | and returns the proper extended double-precision floating-point value
815 | corresponding to the abstract input. This routine is just like
816 | `roundAndPackFloatx80' except that the input significand does not have to be
818 *----------------------------------------------------------------------------*/
821 normalizeRoundAndPackFloatx80(
822 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
832 shiftCount
= countLeadingZeros64( zSig0
);
833 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
836 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
844 /*----------------------------------------------------------------------------
845 | Returns the least-significant 64 fraction bits of the quadruple-precision
846 | floating-point value `a'.
847 *----------------------------------------------------------------------------*/
849 INLINE bits64
extractFloat128Frac1( float128 a
)
856 /*----------------------------------------------------------------------------
857 | Returns the most-significant 48 fraction bits of the quadruple-precision
858 | floating-point value `a'.
859 *----------------------------------------------------------------------------*/
861 INLINE bits64
extractFloat128Frac0( float128 a
)
864 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
868 /*----------------------------------------------------------------------------
869 | Returns the exponent bits of the quadruple-precision floating-point value
871 *----------------------------------------------------------------------------*/
873 INLINE int32
extractFloat128Exp( float128 a
)
876 return ( a
.high
>>48 ) & 0x7FFF;
880 /*----------------------------------------------------------------------------
881 | Returns the sign bit of the quadruple-precision floating-point value `a'.
882 *----------------------------------------------------------------------------*/
884 INLINE flag
extractFloat128Sign( float128 a
)
891 /*----------------------------------------------------------------------------
892 | Normalizes the subnormal quadruple-precision floating-point value
893 | represented by the denormalized significand formed by the concatenation of
894 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
895 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
896 | significand are stored at the location pointed to by `zSig0Ptr', and the
897 | least significant 64 bits of the normalized significand are stored at the
898 | location pointed to by `zSig1Ptr'.
899 *----------------------------------------------------------------------------*/
902 normalizeFloat128Subnormal(
913 shiftCount
= countLeadingZeros64( aSig1
) - 15;
914 if ( shiftCount
< 0 ) {
915 *zSig0Ptr
= aSig1
>>( - shiftCount
);
916 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
919 *zSig0Ptr
= aSig1
<<shiftCount
;
922 *zExpPtr
= - shiftCount
- 63;
925 shiftCount
= countLeadingZeros64( aSig0
) - 15;
926 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
927 *zExpPtr
= 1 - shiftCount
;
932 /*----------------------------------------------------------------------------
933 | Packs the sign `zSign', the exponent `zExp', and the significand formed
934 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
935 | floating-point value, returning the result. After being shifted into the
936 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
937 | added together to form the most significant 32 bits of the result. This
938 | means that any integer portion of `zSig0' will be added into the exponent.
939 | Since a properly normalized significand will have an integer portion equal
940 | to 1, the `zExp' input should be 1 less than the desired result exponent
941 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
943 *----------------------------------------------------------------------------*/
946 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
951 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
956 /*----------------------------------------------------------------------------
957 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
958 | and extended significand formed by the concatenation of `zSig0', `zSig1',
959 | and `zSig2', and returns the proper quadruple-precision floating-point value
960 | corresponding to the abstract input. Ordinarily, the abstract value is
961 | simply rounded and packed into the quadruple-precision format, with the
962 | inexact exception raised if the abstract input cannot be represented
963 | exactly. However, if the abstract value is too large, the overflow and
964 | inexact exceptions are raised and an infinity or maximal finite value is
965 | returned. If the abstract value is too small, the input value is rounded to
966 | a subnormal number, and the underflow and inexact exceptions are raised if
967 | the abstract input cannot be represented exactly as a subnormal quadruple-
968 | precision floating-point number.
969 | The input significand must be normalized or smaller. If the input
970 | significand is not normalized, `zExp' must be 0; in that case, the result
971 | returned is a subnormal number, and it must not require rounding. In the
972 | usual case that the input significand is normalized, `zExp' must be 1 less
973 | than the ``true'' floating-point exponent. The handling of underflow and
974 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
975 *----------------------------------------------------------------------------*/
978 roundAndPackFloat128(
979 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2 STATUS_PARAM
)
982 flag roundNearestEven
, increment
, isTiny
;
984 roundingMode
= STATUS(float_rounding_mode
);
985 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
986 increment
= ( (sbits64
) zSig2
< 0 );
987 if ( ! roundNearestEven
) {
988 if ( roundingMode
== float_round_to_zero
) {
993 increment
= ( roundingMode
== float_round_down
) && zSig2
;
996 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1000 if ( 0x7FFD <= (bits32
) zExp
) {
1001 if ( ( 0x7FFD < zExp
)
1002 || ( ( zExp
== 0x7FFD )
1004 LIT64( 0x0001FFFFFFFFFFFF ),
1005 LIT64( 0xFFFFFFFFFFFFFFFF ),
1012 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
1013 if ( ( roundingMode
== float_round_to_zero
)
1014 || ( zSign
&& ( roundingMode
== float_round_up
) )
1015 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1021 LIT64( 0x0000FFFFFFFFFFFF ),
1022 LIT64( 0xFFFFFFFFFFFFFFFF )
1025 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1028 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
1030 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
1036 LIT64( 0x0001FFFFFFFFFFFF ),
1037 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 shift128ExtraRightJamming(
1040 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1042 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
1043 if ( roundNearestEven
) {
1044 increment
= ( (sbits64
) zSig2
< 0 );
1048 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1051 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1056 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1058 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1059 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1062 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1064 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1068 /*----------------------------------------------------------------------------
1069 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1070 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1071 | returns the proper quadruple-precision floating-point value corresponding
1072 | to the abstract input. This routine is just like `roundAndPackFloat128'
1073 | except that the input significand has fewer bits and does not have to be
1074 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1076 *----------------------------------------------------------------------------*/
1079 normalizeRoundAndPackFloat128(
1080 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1 STATUS_PARAM
)
1090 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1091 if ( 0 <= shiftCount
) {
1093 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1096 shift128ExtraRightJamming(
1097 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1100 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1106 /*----------------------------------------------------------------------------
1107 | Returns the result of converting the 32-bit two's complement integer `a'
1108 | to the single-precision floating-point format. The conversion is performed
1109 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1110 *----------------------------------------------------------------------------*/
1112 float32
int32_to_float32( int32 a STATUS_PARAM
)
1116 if ( a
== 0 ) return float32_zero
;
1117 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1119 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the double-precision floating-point format. The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 float64
int32_to_float64( int32 a STATUS_PARAM
)
1136 if ( a
== 0 ) return float64_zero
;
1138 absA
= zSign
? - a
: a
;
1139 shiftCount
= countLeadingZeros32( absA
) + 21;
1141 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1147 /*----------------------------------------------------------------------------
1148 | Returns the result of converting the 32-bit two's complement integer `a'
1149 | to the extended double-precision floating-point format. The conversion
1150 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1152 *----------------------------------------------------------------------------*/
1154 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1161 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1163 absA
= zSign
? - a
: a
;
1164 shiftCount
= countLeadingZeros32( absA
) + 32;
1166 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1174 /*----------------------------------------------------------------------------
1175 | Returns the result of converting the 32-bit two's complement integer `a' to
1176 | the quadruple-precision floating-point format. The conversion is performed
1177 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1178 *----------------------------------------------------------------------------*/
1180 float128
int32_to_float128( int32 a STATUS_PARAM
)
1187 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1189 absA
= zSign
? - a
: a
;
1190 shiftCount
= countLeadingZeros32( absA
) + 17;
1192 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1198 /*----------------------------------------------------------------------------
1199 | Returns the result of converting the 64-bit two's complement integer `a'
1200 | to the single-precision floating-point format. The conversion is performed
1201 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1202 *----------------------------------------------------------------------------*/
1204 float32
int64_to_float32( int64 a STATUS_PARAM
)
1210 if ( a
== 0 ) return float32_zero
;
1212 absA
= zSign
? - a
: a
;
1213 shiftCount
= countLeadingZeros64( absA
) - 40;
1214 if ( 0 <= shiftCount
) {
1215 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1219 if ( shiftCount
< 0 ) {
1220 shift64RightJamming( absA
, - shiftCount
, &absA
);
1223 absA
<<= shiftCount
;
1225 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1230 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1234 if ( a
== 0 ) return float32_zero
;
1235 shiftCount
= countLeadingZeros64( a
) - 40;
1236 if ( 0 <= shiftCount
) {
1237 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1241 if ( shiftCount
< 0 ) {
1242 shift64RightJamming( a
, - shiftCount
, &a
);
1247 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1251 /*----------------------------------------------------------------------------
1252 | Returns the result of converting the 64-bit two's complement integer `a'
1253 | to the double-precision floating-point format. The conversion is performed
1254 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255 *----------------------------------------------------------------------------*/
1257 float64
int64_to_float64( int64 a STATUS_PARAM
)
1261 if ( a
== 0 ) return float64_zero
;
1262 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1263 return packFloat64( 1, 0x43E, 0 );
1266 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1270 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1272 if ( a
== 0 ) return float64_zero
;
1273 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1279 /*----------------------------------------------------------------------------
1280 | Returns the result of converting the 64-bit two's complement integer `a'
1281 | to the extended double-precision floating-point format. The conversion
1282 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1284 *----------------------------------------------------------------------------*/
1286 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1292 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1294 absA
= zSign
? - a
: a
;
1295 shiftCount
= countLeadingZeros64( absA
);
1296 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1304 /*----------------------------------------------------------------------------
1305 | Returns the result of converting the 64-bit two's complement integer `a' to
1306 | the quadruple-precision floating-point format. The conversion is performed
1307 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1308 *----------------------------------------------------------------------------*/
1310 float128
int64_to_float128( int64 a STATUS_PARAM
)
1316 bits64 zSig0
, zSig1
;
1318 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1320 absA
= zSign
? - a
: a
;
1321 shiftCount
= countLeadingZeros64( absA
) + 49;
1322 zExp
= 0x406E - shiftCount
;
1323 if ( 64 <= shiftCount
) {
1332 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1333 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1339 /*----------------------------------------------------------------------------
1340 | Returns the result of converting the single-precision floating-point value
1341 | `a' to the 32-bit two's complement integer format. The conversion is
1342 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1343 | Arithmetic---which means in particular that the conversion is rounded
1344 | according to the current rounding mode. If `a' is a NaN, the largest
1345 | positive integer is returned. Otherwise, if the conversion overflows, the
1346 | largest integer with the same sign as `a' is returned.
1347 *----------------------------------------------------------------------------*/
1349 int32
float32_to_int32( float32 a STATUS_PARAM
)
1352 int16 aExp
, shiftCount
;
1356 a
= float32_squash_input_denormal(a STATUS_VAR
);
1357 aSig
= extractFloat32Frac( a
);
1358 aExp
= extractFloat32Exp( a
);
1359 aSign
= extractFloat32Sign( a
);
1360 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1361 if ( aExp
) aSig
|= 0x00800000;
1362 shiftCount
= 0xAF - aExp
;
1365 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1366 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1370 /*----------------------------------------------------------------------------
1371 | Returns the result of converting the single-precision floating-point value
1372 | `a' to the 32-bit two's complement integer format. The conversion is
1373 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1374 | Arithmetic, except that the conversion is always rounded toward zero.
1375 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1376 | the conversion overflows, the largest integer with the same sign as `a' is
1378 *----------------------------------------------------------------------------*/
1380 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1383 int16 aExp
, shiftCount
;
1386 a
= float32_squash_input_denormal(a STATUS_VAR
);
1388 aSig
= extractFloat32Frac( a
);
1389 aExp
= extractFloat32Exp( a
);
1390 aSign
= extractFloat32Sign( a
);
1391 shiftCount
= aExp
- 0x9E;
1392 if ( 0 <= shiftCount
) {
1393 if ( float32_val(a
) != 0xCF000000 ) {
1394 float_raise( float_flag_invalid STATUS_VAR
);
1395 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1397 return (sbits32
) 0x80000000;
1399 else if ( aExp
<= 0x7E ) {
1400 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1403 aSig
= ( aSig
| 0x00800000 )<<8;
1404 z
= aSig
>>( - shiftCount
);
1405 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1406 STATUS(float_exception_flags
) |= float_flag_inexact
;
1408 if ( aSign
) z
= - z
;
1413 /*----------------------------------------------------------------------------
1414 | Returns the result of converting the single-precision floating-point value
1415 | `a' to the 16-bit two's complement integer format. The conversion is
1416 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1417 | Arithmetic, except that the conversion is always rounded toward zero.
1418 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1419 | the conversion overflows, the largest integer with the same sign as `a' is
1421 *----------------------------------------------------------------------------*/
1423 int16
float32_to_int16_round_to_zero( float32 a STATUS_PARAM
)
1426 int16 aExp
, shiftCount
;
1430 aSig
= extractFloat32Frac( a
);
1431 aExp
= extractFloat32Exp( a
);
1432 aSign
= extractFloat32Sign( a
);
1433 shiftCount
= aExp
- 0x8E;
1434 if ( 0 <= shiftCount
) {
1435 if ( float32_val(a
) != 0xC7000000 ) {
1436 float_raise( float_flag_invalid STATUS_VAR
);
1437 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1441 return (sbits32
) 0xffff8000;
1443 else if ( aExp
<= 0x7E ) {
1444 if ( aExp
| aSig
) {
1445 STATUS(float_exception_flags
) |= float_flag_inexact
;
1450 aSig
= ( aSig
| 0x00800000 )<<8;
1451 z
= aSig
>>( - shiftCount
);
1452 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1453 STATUS(float_exception_flags
) |= float_flag_inexact
;
1462 /*----------------------------------------------------------------------------
1463 | Returns the result of converting the single-precision floating-point value
1464 | `a' to the 64-bit two's complement integer format. The conversion is
1465 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 | Arithmetic---which means in particular that the conversion is rounded
1467 | according to the current rounding mode. If `a' is a NaN, the largest
1468 | positive integer is returned. Otherwise, if the conversion overflows, the
1469 | largest integer with the same sign as `a' is returned.
1470 *----------------------------------------------------------------------------*/
1472 int64
float32_to_int64( float32 a STATUS_PARAM
)
1475 int16 aExp
, shiftCount
;
1477 bits64 aSig64
, aSigExtra
;
1478 a
= float32_squash_input_denormal(a STATUS_VAR
);
1480 aSig
= extractFloat32Frac( a
);
1481 aExp
= extractFloat32Exp( a
);
1482 aSign
= extractFloat32Sign( a
);
1483 shiftCount
= 0xBE - aExp
;
1484 if ( shiftCount
< 0 ) {
1485 float_raise( float_flag_invalid STATUS_VAR
);
1486 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1487 return LIT64( 0x7FFFFFFFFFFFFFFF );
1489 return (sbits64
) LIT64( 0x8000000000000000 );
1491 if ( aExp
) aSig
|= 0x00800000;
1494 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1495 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1499 /*----------------------------------------------------------------------------
1500 | Returns the result of converting the single-precision floating-point value
1501 | `a' to the 64-bit two's complement integer format. The conversion is
1502 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1503 | Arithmetic, except that the conversion is always rounded toward zero. If
1504 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1505 | conversion overflows, the largest integer with the same sign as `a' is
1507 *----------------------------------------------------------------------------*/
1509 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1512 int16 aExp
, shiftCount
;
1516 a
= float32_squash_input_denormal(a STATUS_VAR
);
1518 aSig
= extractFloat32Frac( a
);
1519 aExp
= extractFloat32Exp( a
);
1520 aSign
= extractFloat32Sign( a
);
1521 shiftCount
= aExp
- 0xBE;
1522 if ( 0 <= shiftCount
) {
1523 if ( float32_val(a
) != 0xDF000000 ) {
1524 float_raise( float_flag_invalid STATUS_VAR
);
1525 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1526 return LIT64( 0x7FFFFFFFFFFFFFFF );
1529 return (sbits64
) LIT64( 0x8000000000000000 );
1531 else if ( aExp
<= 0x7E ) {
1532 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1535 aSig64
= aSig
| 0x00800000;
1537 z
= aSig64
>>( - shiftCount
);
1538 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1539 STATUS(float_exception_flags
) |= float_flag_inexact
;
1541 if ( aSign
) z
= - z
;
1546 /*----------------------------------------------------------------------------
1547 | Returns the result of converting the single-precision floating-point value
1548 | `a' to the double-precision floating-point format. The conversion is
1549 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1551 *----------------------------------------------------------------------------*/
1553 float64
float32_to_float64( float32 a STATUS_PARAM
)
1558 a
= float32_squash_input_denormal(a STATUS_VAR
);
1560 aSig
= extractFloat32Frac( a
);
1561 aExp
= extractFloat32Exp( a
);
1562 aSign
= extractFloat32Sign( a
);
1563 if ( aExp
== 0xFF ) {
1564 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
));
1565 return packFloat64( aSign
, 0x7FF, 0 );
1568 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1569 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1572 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1578 /*----------------------------------------------------------------------------
1579 | Returns the result of converting the single-precision floating-point value
1580 | `a' to the extended double-precision floating-point format. The conversion
1581 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1583 *----------------------------------------------------------------------------*/
1585 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1591 a
= float32_squash_input_denormal(a STATUS_VAR
);
1592 aSig
= extractFloat32Frac( a
);
1593 aExp
= extractFloat32Exp( a
);
1594 aSign
= extractFloat32Sign( a
);
1595 if ( aExp
== 0xFF ) {
1596 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) );
1597 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1600 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1601 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1604 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1612 /*----------------------------------------------------------------------------
1613 | Returns the result of converting the single-precision floating-point value
1614 | `a' to the double-precision floating-point format. The conversion is
1615 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1617 *----------------------------------------------------------------------------*/
1619 float128
float32_to_float128( float32 a STATUS_PARAM
)
1625 a
= float32_squash_input_denormal(a STATUS_VAR
);
1626 aSig
= extractFloat32Frac( a
);
1627 aExp
= extractFloat32Exp( a
);
1628 aSign
= extractFloat32Sign( a
);
1629 if ( aExp
== 0xFF ) {
1630 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) );
1631 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1634 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1635 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1638 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1644 /*----------------------------------------------------------------------------
1645 | Rounds the single-precision floating-point value `a' to an integer, and
1646 | returns the result as a single-precision floating-point value. The
1647 | operation is performed according to the IEC/IEEE Standard for Binary
1648 | Floating-Point Arithmetic.
1649 *----------------------------------------------------------------------------*/
1651 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1655 bits32 lastBitMask
, roundBitsMask
;
1658 a
= float32_squash_input_denormal(a STATUS_VAR
);
1660 aExp
= extractFloat32Exp( a
);
1661 if ( 0x96 <= aExp
) {
1662 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1663 return propagateFloat32NaN( a
, a STATUS_VAR
);
1667 if ( aExp
<= 0x7E ) {
1668 if ( (bits32
) ( float32_val(a
)<<1 ) == 0 ) return a
;
1669 STATUS(float_exception_flags
) |= float_flag_inexact
;
1670 aSign
= extractFloat32Sign( a
);
1671 switch ( STATUS(float_rounding_mode
) ) {
1672 case float_round_nearest_even
:
1673 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1674 return packFloat32( aSign
, 0x7F, 0 );
1677 case float_round_down
:
1678 return make_float32(aSign
? 0xBF800000 : 0);
1679 case float_round_up
:
1680 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1682 return packFloat32( aSign
, 0, 0 );
1685 lastBitMask
<<= 0x96 - aExp
;
1686 roundBitsMask
= lastBitMask
- 1;
1688 roundingMode
= STATUS(float_rounding_mode
);
1689 if ( roundingMode
== float_round_nearest_even
) {
1690 z
+= lastBitMask
>>1;
1691 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1693 else if ( roundingMode
!= float_round_to_zero
) {
1694 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1698 z
&= ~ roundBitsMask
;
1699 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1700 return make_float32(z
);
1704 /*----------------------------------------------------------------------------
1705 | Returns the result of adding the absolute values of the single-precision
1706 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1707 | before being returned. `zSign' is ignored if the result is a NaN.
1708 | The addition is performed according to the IEC/IEEE Standard for Binary
1709 | Floating-Point Arithmetic.
1710 *----------------------------------------------------------------------------*/
1712 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1714 int16 aExp
, bExp
, zExp
;
1715 bits32 aSig
, bSig
, zSig
;
1718 aSig
= extractFloat32Frac( a
);
1719 aExp
= extractFloat32Exp( a
);
1720 bSig
= extractFloat32Frac( b
);
1721 bExp
= extractFloat32Exp( b
);
1722 expDiff
= aExp
- bExp
;
1725 if ( 0 < expDiff
) {
1726 if ( aExp
== 0xFF ) {
1727 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1736 shift32RightJamming( bSig
, expDiff
, &bSig
);
1739 else if ( expDiff
< 0 ) {
1740 if ( bExp
== 0xFF ) {
1741 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1742 return packFloat32( zSign
, 0xFF, 0 );
1750 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1754 if ( aExp
== 0xFF ) {
1755 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1759 if ( STATUS(flush_to_zero
) ) return packFloat32( zSign
, 0, 0 );
1760 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1762 zSig
= 0x40000000 + aSig
+ bSig
;
1767 zSig
= ( aSig
+ bSig
)<<1;
1769 if ( (sbits32
) zSig
< 0 ) {
1774 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1778 /*----------------------------------------------------------------------------
1779 | Returns the result of subtracting the absolute values of the single-
1780 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1781 | difference is negated before being returned. `zSign' is ignored if the
1782 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1783 | Standard for Binary Floating-Point Arithmetic.
1784 *----------------------------------------------------------------------------*/
1786 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1788 int16 aExp
, bExp
, zExp
;
1789 bits32 aSig
, bSig
, zSig
;
1792 aSig
= extractFloat32Frac( a
);
1793 aExp
= extractFloat32Exp( a
);
1794 bSig
= extractFloat32Frac( b
);
1795 bExp
= extractFloat32Exp( b
);
1796 expDiff
= aExp
- bExp
;
1799 if ( 0 < expDiff
) goto aExpBigger
;
1800 if ( expDiff
< 0 ) goto bExpBigger
;
1801 if ( aExp
== 0xFF ) {
1802 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1803 float_raise( float_flag_invalid STATUS_VAR
);
1804 return float32_default_nan
;
1810 if ( bSig
< aSig
) goto aBigger
;
1811 if ( aSig
< bSig
) goto bBigger
;
1812 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1814 if ( bExp
== 0xFF ) {
1815 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1816 return packFloat32( zSign
^ 1, 0xFF, 0 );
1824 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1830 goto normalizeRoundAndPack
;
1832 if ( aExp
== 0xFF ) {
1833 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1842 shift32RightJamming( bSig
, expDiff
, &bSig
);
1847 normalizeRoundAndPack
:
1849 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1853 /*----------------------------------------------------------------------------
1854 | Returns the result of adding the single-precision floating-point values `a'
1855 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1856 | Binary Floating-Point Arithmetic.
1857 *----------------------------------------------------------------------------*/
1859 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1862 a
= float32_squash_input_denormal(a STATUS_VAR
);
1863 b
= float32_squash_input_denormal(b STATUS_VAR
);
1865 aSign
= extractFloat32Sign( a
);
1866 bSign
= extractFloat32Sign( b
);
1867 if ( aSign
== bSign
) {
1868 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1871 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1876 /*----------------------------------------------------------------------------
1877 | Returns the result of subtracting the single-precision floating-point values
1878 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1879 | for Binary Floating-Point Arithmetic.
1880 *----------------------------------------------------------------------------*/
1882 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1885 a
= float32_squash_input_denormal(a STATUS_VAR
);
1886 b
= float32_squash_input_denormal(b STATUS_VAR
);
1888 aSign
= extractFloat32Sign( a
);
1889 bSign
= extractFloat32Sign( b
);
1890 if ( aSign
== bSign
) {
1891 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1894 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1899 /*----------------------------------------------------------------------------
1900 | Returns the result of multiplying the single-precision floating-point values
1901 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1902 | for Binary Floating-Point Arithmetic.
1903 *----------------------------------------------------------------------------*/
1905 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1907 flag aSign
, bSign
, zSign
;
1908 int16 aExp
, bExp
, zExp
;
1913 a
= float32_squash_input_denormal(a STATUS_VAR
);
1914 b
= float32_squash_input_denormal(b STATUS_VAR
);
1916 aSig
= extractFloat32Frac( a
);
1917 aExp
= extractFloat32Exp( a
);
1918 aSign
= extractFloat32Sign( a
);
1919 bSig
= extractFloat32Frac( b
);
1920 bExp
= extractFloat32Exp( b
);
1921 bSign
= extractFloat32Sign( b
);
1922 zSign
= aSign
^ bSign
;
1923 if ( aExp
== 0xFF ) {
1924 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1925 return propagateFloat32NaN( a
, b STATUS_VAR
);
1927 if ( ( bExp
| bSig
) == 0 ) {
1928 float_raise( float_flag_invalid STATUS_VAR
);
1929 return float32_default_nan
;
1931 return packFloat32( zSign
, 0xFF, 0 );
1933 if ( bExp
== 0xFF ) {
1934 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1935 if ( ( aExp
| aSig
) == 0 ) {
1936 float_raise( float_flag_invalid STATUS_VAR
);
1937 return float32_default_nan
;
1939 return packFloat32( zSign
, 0xFF, 0 );
1942 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1943 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1946 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1947 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1949 zExp
= aExp
+ bExp
- 0x7F;
1950 aSig
= ( aSig
| 0x00800000 )<<7;
1951 bSig
= ( bSig
| 0x00800000 )<<8;
1952 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1954 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1958 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1962 /*----------------------------------------------------------------------------
1963 | Returns the result of dividing the single-precision floating-point value `a'
1964 | by the corresponding value `b'. The operation is performed according to the
1965 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1966 *----------------------------------------------------------------------------*/
1968 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1970 flag aSign
, bSign
, zSign
;
1971 int16 aExp
, bExp
, zExp
;
1972 bits32 aSig
, bSig
, zSig
;
1973 a
= float32_squash_input_denormal(a STATUS_VAR
);
1974 b
= float32_squash_input_denormal(b STATUS_VAR
);
1976 aSig
= extractFloat32Frac( a
);
1977 aExp
= extractFloat32Exp( a
);
1978 aSign
= extractFloat32Sign( a
);
1979 bSig
= extractFloat32Frac( b
);
1980 bExp
= extractFloat32Exp( b
);
1981 bSign
= extractFloat32Sign( b
);
1982 zSign
= aSign
^ bSign
;
1983 if ( aExp
== 0xFF ) {
1984 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1985 if ( bExp
== 0xFF ) {
1986 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1987 float_raise( float_flag_invalid STATUS_VAR
);
1988 return float32_default_nan
;
1990 return packFloat32( zSign
, 0xFF, 0 );
1992 if ( bExp
== 0xFF ) {
1993 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1994 return packFloat32( zSign
, 0, 0 );
1998 if ( ( aExp
| aSig
) == 0 ) {
1999 float_raise( float_flag_invalid STATUS_VAR
);
2000 return float32_default_nan
;
2002 float_raise( float_flag_divbyzero STATUS_VAR
);
2003 return packFloat32( zSign
, 0xFF, 0 );
2005 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2008 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2009 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2011 zExp
= aExp
- bExp
+ 0x7D;
2012 aSig
= ( aSig
| 0x00800000 )<<7;
2013 bSig
= ( bSig
| 0x00800000 )<<8;
2014 if ( bSig
<= ( aSig
+ aSig
) ) {
2018 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2019 if ( ( zSig
& 0x3F ) == 0 ) {
2020 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
2022 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2026 /*----------------------------------------------------------------------------
2027 | Returns the remainder of the single-precision floating-point value `a'
2028 | with respect to the corresponding value `b'. The operation is performed
2029 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2030 *----------------------------------------------------------------------------*/
2032 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
2035 int16 aExp
, bExp
, expDiff
;
2038 bits64 aSig64
, bSig64
, q64
;
2039 bits32 alternateASig
;
2041 a
= float32_squash_input_denormal(a STATUS_VAR
);
2042 b
= float32_squash_input_denormal(b STATUS_VAR
);
2044 aSig
= extractFloat32Frac( a
);
2045 aExp
= extractFloat32Exp( a
);
2046 aSign
= extractFloat32Sign( a
);
2047 bSig
= extractFloat32Frac( b
);
2048 bExp
= extractFloat32Exp( b
);
2049 if ( aExp
== 0xFF ) {
2050 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2051 return propagateFloat32NaN( a
, b STATUS_VAR
);
2053 float_raise( float_flag_invalid STATUS_VAR
);
2054 return float32_default_nan
;
2056 if ( bExp
== 0xFF ) {
2057 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2062 float_raise( float_flag_invalid STATUS_VAR
);
2063 return float32_default_nan
;
2065 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2068 if ( aSig
== 0 ) return a
;
2069 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2071 expDiff
= aExp
- bExp
;
2074 if ( expDiff
< 32 ) {
2077 if ( expDiff
< 0 ) {
2078 if ( expDiff
< -1 ) return a
;
2081 q
= ( bSig
<= aSig
);
2082 if ( q
) aSig
-= bSig
;
2083 if ( 0 < expDiff
) {
2084 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
2087 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2095 if ( bSig
<= aSig
) aSig
-= bSig
;
2096 aSig64
= ( (bits64
) aSig
)<<40;
2097 bSig64
= ( (bits64
) bSig
)<<40;
2099 while ( 0 < expDiff
) {
2100 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2101 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2102 aSig64
= - ( ( bSig
* q64
)<<38 );
2106 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2107 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2108 q
= q64
>>( 64 - expDiff
);
2110 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2113 alternateASig
= aSig
;
2116 } while ( 0 <= (sbits32
) aSig
);
2117 sigMean
= aSig
+ alternateASig
;
2118 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2119 aSig
= alternateASig
;
2121 zSign
= ( (sbits32
) aSig
< 0 );
2122 if ( zSign
) aSig
= - aSig
;
2123 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2127 /*----------------------------------------------------------------------------
2128 | Returns the square root of the single-precision floating-point value `a'.
2129 | The operation is performed according to the IEC/IEEE Standard for Binary
2130 | Floating-Point Arithmetic.
2131 *----------------------------------------------------------------------------*/
2133 float32
float32_sqrt( float32 a STATUS_PARAM
)
2139 a
= float32_squash_input_denormal(a STATUS_VAR
);
2141 aSig
= extractFloat32Frac( a
);
2142 aExp
= extractFloat32Exp( a
);
2143 aSign
= extractFloat32Sign( a
);
2144 if ( aExp
== 0xFF ) {
2145 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2146 if ( ! aSign
) return a
;
2147 float_raise( float_flag_invalid STATUS_VAR
);
2148 return float32_default_nan
;
2151 if ( ( aExp
| aSig
) == 0 ) return a
;
2152 float_raise( float_flag_invalid STATUS_VAR
);
2153 return float32_default_nan
;
2156 if ( aSig
== 0 ) return float32_zero
;
2157 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2159 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2160 aSig
= ( aSig
| 0x00800000 )<<8;
2161 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2162 if ( ( zSig
& 0x7F ) <= 5 ) {
2168 term
= ( (bits64
) zSig
) * zSig
;
2169 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2170 while ( (sbits64
) rem
< 0 ) {
2172 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2174 zSig
|= ( rem
!= 0 );
2176 shift32RightJamming( zSig
, 1, &zSig
);
2178 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2182 /*----------------------------------------------------------------------------
2183 | Returns the binary exponential of the single-precision floating-point value
2184 | `a'. The operation is performed according to the IEC/IEEE Standard for
2185 | Binary Floating-Point Arithmetic.
2187 | Uses the following identities:
2189 | 1. -------------------------------------------------------------------------
2193 | 2. -------------------------------------------------------------------------
2196 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2198 *----------------------------------------------------------------------------*/
2200 static const float64 float32_exp2_coefficients
[15] =
2202 make_float64( 0x3ff0000000000000ll
), /* 1 */
2203 make_float64( 0x3fe0000000000000ll
), /* 2 */
2204 make_float64( 0x3fc5555555555555ll
), /* 3 */
2205 make_float64( 0x3fa5555555555555ll
), /* 4 */
2206 make_float64( 0x3f81111111111111ll
), /* 5 */
2207 make_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2208 make_float64( 0x3f2a01a01a01a01all
), /* 7 */
2209 make_float64( 0x3efa01a01a01a01all
), /* 8 */
2210 make_float64( 0x3ec71de3a556c734ll
), /* 9 */
2211 make_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2212 make_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2213 make_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2214 make_float64( 0x3de6124613a86d09ll
), /* 13 */
2215 make_float64( 0x3da93974a8c07c9dll
), /* 14 */
2216 make_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2219 float32
float32_exp2( float32 a STATUS_PARAM
)
2226 a
= float32_squash_input_denormal(a STATUS_VAR
);
2228 aSig
= extractFloat32Frac( a
);
2229 aExp
= extractFloat32Exp( a
);
2230 aSign
= extractFloat32Sign( a
);
2232 if ( aExp
== 0xFF) {
2233 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2234 return (aSign
) ? float32_zero
: a
;
2237 if (aSig
== 0) return float32_one
;
2240 float_raise( float_flag_inexact STATUS_VAR
);
2242 /* ******************************* */
2243 /* using float64 for approximation */
2244 /* ******************************* */
2245 x
= float32_to_float64(a STATUS_VAR
);
2246 x
= float64_mul(x
, float64_ln2 STATUS_VAR
);
2250 for (i
= 0 ; i
< 15 ; i
++) {
2253 f
= float64_mul(xn
, float32_exp2_coefficients
[i
] STATUS_VAR
);
2254 r
= float64_add(r
, f STATUS_VAR
);
2256 xn
= float64_mul(xn
, x STATUS_VAR
);
2259 return float64_to_float32(r
, status
);
2262 /*----------------------------------------------------------------------------
2263 | Returns the binary log of the single-precision floating-point value `a'.
2264 | The operation is performed according to the IEC/IEEE Standard for Binary
2265 | Floating-Point Arithmetic.
2266 *----------------------------------------------------------------------------*/
2267 float32
float32_log2( float32 a STATUS_PARAM
)
2271 bits32 aSig
, zSig
, i
;
2273 a
= float32_squash_input_denormal(a STATUS_VAR
);
2274 aSig
= extractFloat32Frac( a
);
2275 aExp
= extractFloat32Exp( a
);
2276 aSign
= extractFloat32Sign( a
);
2279 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2280 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2283 float_raise( float_flag_invalid STATUS_VAR
);
2284 return float32_default_nan
;
2286 if ( aExp
== 0xFF ) {
2287 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2296 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2297 aSig
= ( (bits64
)aSig
* aSig
) >> 23;
2298 if ( aSig
& 0x01000000 ) {
2307 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2310 /*----------------------------------------------------------------------------
2311 | Returns 1 if the single-precision floating-point value `a' is equal to
2312 | the corresponding value `b', and 0 otherwise. The comparison is performed
2313 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2314 *----------------------------------------------------------------------------*/
2316 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2318 a
= float32_squash_input_denormal(a STATUS_VAR
);
2319 b
= float32_squash_input_denormal(b STATUS_VAR
);
2321 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2322 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2324 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2325 float_raise( float_flag_invalid STATUS_VAR
);
2329 return ( float32_val(a
) == float32_val(b
) ) ||
2330 ( (bits32
) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2334 /*----------------------------------------------------------------------------
2335 | Returns 1 if the single-precision floating-point value `a' is less than
2336 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2337 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2339 *----------------------------------------------------------------------------*/
2341 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2345 a
= float32_squash_input_denormal(a STATUS_VAR
);
2346 b
= float32_squash_input_denormal(b STATUS_VAR
);
2348 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2349 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2351 float_raise( float_flag_invalid STATUS_VAR
);
2354 aSign
= extractFloat32Sign( a
);
2355 bSign
= extractFloat32Sign( b
);
2356 av
= float32_val(a
);
2357 bv
= float32_val(b
);
2358 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2359 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2363 /*----------------------------------------------------------------------------
2364 | Returns 1 if the single-precision floating-point value `a' is less than
2365 | the corresponding value `b', and 0 otherwise. The comparison is performed
2366 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2367 *----------------------------------------------------------------------------*/
2369 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2373 a
= float32_squash_input_denormal(a STATUS_VAR
);
2374 b
= float32_squash_input_denormal(b STATUS_VAR
);
2376 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2377 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2379 float_raise( float_flag_invalid STATUS_VAR
);
2382 aSign
= extractFloat32Sign( a
);
2383 bSign
= extractFloat32Sign( b
);
2384 av
= float32_val(a
);
2385 bv
= float32_val(b
);
2386 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2387 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2391 /*----------------------------------------------------------------------------
2392 | Returns 1 if the single-precision floating-point value `a' is equal to
2393 | the corresponding value `b', and 0 otherwise. The invalid exception is
2394 | raised if either operand is a NaN. Otherwise, the comparison is performed
2395 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2396 *----------------------------------------------------------------------------*/
2398 int float32_eq_signaling( float32 a
, float32 b STATUS_PARAM
)
2401 a
= float32_squash_input_denormal(a STATUS_VAR
);
2402 b
= float32_squash_input_denormal(b STATUS_VAR
);
2404 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2405 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2407 float_raise( float_flag_invalid STATUS_VAR
);
2410 av
= float32_val(a
);
2411 bv
= float32_val(b
);
2412 return ( av
== bv
) || ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2416 /*----------------------------------------------------------------------------
2417 | Returns 1 if the single-precision floating-point value `a' is less than or
2418 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2419 | cause an exception. Otherwise, the comparison is performed according to the
2420 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2421 *----------------------------------------------------------------------------*/
2423 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2427 a
= float32_squash_input_denormal(a STATUS_VAR
);
2428 b
= float32_squash_input_denormal(b STATUS_VAR
);
2430 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2431 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2433 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2434 float_raise( float_flag_invalid STATUS_VAR
);
2438 aSign
= extractFloat32Sign( a
);
2439 bSign
= extractFloat32Sign( b
);
2440 av
= float32_val(a
);
2441 bv
= float32_val(b
);
2442 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( av
| bv
)<<1 ) == 0 );
2443 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2447 /*----------------------------------------------------------------------------
2448 | Returns 1 if the single-precision floating-point value `a' is less than
2449 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2450 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2451 | Standard for Binary Floating-Point Arithmetic.
2452 *----------------------------------------------------------------------------*/
2454 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2458 a
= float32_squash_input_denormal(a STATUS_VAR
);
2459 b
= float32_squash_input_denormal(b STATUS_VAR
);
2461 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2462 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2464 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2465 float_raise( float_flag_invalid STATUS_VAR
);
2469 aSign
= extractFloat32Sign( a
);
2470 bSign
= extractFloat32Sign( b
);
2471 av
= float32_val(a
);
2472 bv
= float32_val(b
);
2473 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( av
| bv
)<<1 ) != 0 );
2474 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2478 /*----------------------------------------------------------------------------
2479 | Returns the result of converting the double-precision floating-point value
2480 | `a' to the 32-bit two's complement integer format. The conversion is
2481 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2482 | Arithmetic---which means in particular that the conversion is rounded
2483 | according to the current rounding mode. If `a' is a NaN, the largest
2484 | positive integer is returned. Otherwise, if the conversion overflows, the
2485 | largest integer with the same sign as `a' is returned.
2486 *----------------------------------------------------------------------------*/
2488 int32
float64_to_int32( float64 a STATUS_PARAM
)
2491 int16 aExp
, shiftCount
;
2493 a
= float64_squash_input_denormal(a STATUS_VAR
);
2495 aSig
= extractFloat64Frac( a
);
2496 aExp
= extractFloat64Exp( a
);
2497 aSign
= extractFloat64Sign( a
);
2498 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2499 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2500 shiftCount
= 0x42C - aExp
;
2501 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2502 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2506 /*----------------------------------------------------------------------------
2507 | Returns the result of converting the double-precision floating-point value
2508 | `a' to the 32-bit two's complement integer format. The conversion is
2509 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2510 | Arithmetic, except that the conversion is always rounded toward zero.
2511 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2512 | the conversion overflows, the largest integer with the same sign as `a' is
2514 *----------------------------------------------------------------------------*/
2516 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2519 int16 aExp
, shiftCount
;
2520 bits64 aSig
, savedASig
;
2522 a
= float64_squash_input_denormal(a STATUS_VAR
);
2524 aSig
= extractFloat64Frac( a
);
2525 aExp
= extractFloat64Exp( a
);
2526 aSign
= extractFloat64Sign( a
);
2527 if ( 0x41E < aExp
) {
2528 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2531 else if ( aExp
< 0x3FF ) {
2532 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2535 aSig
|= LIT64( 0x0010000000000000 );
2536 shiftCount
= 0x433 - aExp
;
2538 aSig
>>= shiftCount
;
2540 if ( aSign
) z
= - z
;
2541 if ( ( z
< 0 ) ^ aSign
) {
2543 float_raise( float_flag_invalid STATUS_VAR
);
2544 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2546 if ( ( aSig
<<shiftCount
) != savedASig
) {
2547 STATUS(float_exception_flags
) |= float_flag_inexact
;
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of converting the double-precision floating-point value
2555 | `a' to the 16-bit two's complement integer format. The conversion is
2556 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2557 | Arithmetic, except that the conversion is always rounded toward zero.
2558 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2559 | the conversion overflows, the largest integer with the same sign as `a' is
2561 *----------------------------------------------------------------------------*/
2563 int16
float64_to_int16_round_to_zero( float64 a STATUS_PARAM
)
2566 int16 aExp
, shiftCount
;
2567 bits64 aSig
, savedASig
;
2570 aSig
= extractFloat64Frac( a
);
2571 aExp
= extractFloat64Exp( a
);
2572 aSign
= extractFloat64Sign( a
);
2573 if ( 0x40E < aExp
) {
2574 if ( ( aExp
== 0x7FF ) && aSig
) {
2579 else if ( aExp
< 0x3FF ) {
2580 if ( aExp
|| aSig
) {
2581 STATUS(float_exception_flags
) |= float_flag_inexact
;
2585 aSig
|= LIT64( 0x0010000000000000 );
2586 shiftCount
= 0x433 - aExp
;
2588 aSig
>>= shiftCount
;
2593 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
2595 float_raise( float_flag_invalid STATUS_VAR
);
2596 return aSign
? (sbits32
) 0xffff8000 : 0x7FFF;
2598 if ( ( aSig
<<shiftCount
) != savedASig
) {
2599 STATUS(float_exception_flags
) |= float_flag_inexact
;
2604 /*----------------------------------------------------------------------------
2605 | Returns the result of converting the double-precision floating-point value
2606 | `a' to the 64-bit two's complement integer format. The conversion is
2607 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2608 | Arithmetic---which means in particular that the conversion is rounded
2609 | according to the current rounding mode. If `a' is a NaN, the largest
2610 | positive integer is returned. Otherwise, if the conversion overflows, the
2611 | largest integer with the same sign as `a' is returned.
2612 *----------------------------------------------------------------------------*/
2614 int64
float64_to_int64( float64 a STATUS_PARAM
)
2617 int16 aExp
, shiftCount
;
2618 bits64 aSig
, aSigExtra
;
2619 a
= float64_squash_input_denormal(a STATUS_VAR
);
2621 aSig
= extractFloat64Frac( a
);
2622 aExp
= extractFloat64Exp( a
);
2623 aSign
= extractFloat64Sign( a
);
2624 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2625 shiftCount
= 0x433 - aExp
;
2626 if ( shiftCount
<= 0 ) {
2627 if ( 0x43E < aExp
) {
2628 float_raise( float_flag_invalid STATUS_VAR
);
2630 || ( ( aExp
== 0x7FF )
2631 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2633 return LIT64( 0x7FFFFFFFFFFFFFFF );
2635 return (sbits64
) LIT64( 0x8000000000000000 );
2638 aSig
<<= - shiftCount
;
2641 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2643 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2647 /*----------------------------------------------------------------------------
2648 | Returns the result of converting the double-precision floating-point value
2649 | `a' to the 64-bit two's complement integer format. The conversion is
2650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2651 | Arithmetic, except that the conversion is always rounded toward zero.
2652 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2653 | the conversion overflows, the largest integer with the same sign as `a' is
2655 *----------------------------------------------------------------------------*/
2657 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2660 int16 aExp
, shiftCount
;
2663 a
= float64_squash_input_denormal(a STATUS_VAR
);
2665 aSig
= extractFloat64Frac( a
);
2666 aExp
= extractFloat64Exp( a
);
2667 aSign
= extractFloat64Sign( a
);
2668 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2669 shiftCount
= aExp
- 0x433;
2670 if ( 0 <= shiftCount
) {
2671 if ( 0x43E <= aExp
) {
2672 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
2673 float_raise( float_flag_invalid STATUS_VAR
);
2675 || ( ( aExp
== 0x7FF )
2676 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2678 return LIT64( 0x7FFFFFFFFFFFFFFF );
2681 return (sbits64
) LIT64( 0x8000000000000000 );
2683 z
= aSig
<<shiftCount
;
2686 if ( aExp
< 0x3FE ) {
2687 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2690 z
= aSig
>>( - shiftCount
);
2691 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2692 STATUS(float_exception_flags
) |= float_flag_inexact
;
2695 if ( aSign
) z
= - z
;
2700 /*----------------------------------------------------------------------------
2701 | Returns the result of converting the double-precision floating-point value
2702 | `a' to the single-precision floating-point format. The conversion is
2703 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2705 *----------------------------------------------------------------------------*/
2707 float32
float64_to_float32( float64 a STATUS_PARAM
)
2713 a
= float64_squash_input_denormal(a STATUS_VAR
);
2715 aSig
= extractFloat64Frac( a
);
2716 aExp
= extractFloat64Exp( a
);
2717 aSign
= extractFloat64Sign( a
);
2718 if ( aExp
== 0x7FF ) {
2719 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) );
2720 return packFloat32( aSign
, 0xFF, 0 );
2722 shift64RightJamming( aSig
, 22, &aSig
);
2724 if ( aExp
|| zSig
) {
2728 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2733 /*----------------------------------------------------------------------------
2734 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2735 | half-precision floating-point value, returning the result. After being
2736 | shifted into the proper positions, the three fields are simply added
2737 | together to form the result. This means that any integer portion of `zSig'
2738 | will be added into the exponent. Since a properly normalized significand
2739 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2740 | than the desired result exponent whenever `zSig' is a complete, normalized
2742 *----------------------------------------------------------------------------*/
2743 static float16
packFloat16(flag zSign
, int16 zExp
, bits16 zSig
)
2745 return make_float16(
2746 (((bits32
)zSign
) << 15) + (((bits32
)zExp
) << 10) + zSig
);
2749 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2750 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2752 float32
float16_to_float32(float16 a
, flag ieee STATUS_PARAM
)
2758 aSign
= extractFloat16Sign(a
);
2759 aExp
= extractFloat16Exp(a
);
2760 aSig
= extractFloat16Frac(a
);
2762 if (aExp
== 0x1f && ieee
) {
2764 /* Make sure correct exceptions are raised. */
2765 float32ToCommonNaN(a STATUS_VAR
);
2768 return packFloat32(aSign
, 0xff, aSig
<< 13);
2774 return packFloat32(aSign
, 0, 0);
2777 shiftCount
= countLeadingZeros32( aSig
) - 21;
2778 aSig
= aSig
<< shiftCount
;
2781 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
2784 float16
float32_to_float16(float32 a
, flag ieee STATUS_PARAM
)
2792 a
= float32_squash_input_denormal(a STATUS_VAR
);
2794 aSig
= extractFloat32Frac( a
);
2795 aExp
= extractFloat32Exp( a
);
2796 aSign
= extractFloat32Sign( a
);
2797 if ( aExp
== 0xFF ) {
2799 /* Make sure correct exceptions are raised. */
2800 float32ToCommonNaN(a STATUS_VAR
);
2803 return packFloat16(aSign
, 0x1f, aSig
>> 13);
2805 if (aExp
== 0 && aSign
== 0) {
2806 return packFloat16(aSign
, 0, 0);
2808 /* Decimal point between bits 22 and 23. */
2822 float_raise( float_flag_underflow STATUS_VAR
);
2823 roundingMode
= STATUS(float_rounding_mode
);
2824 switch (roundingMode
) {
2825 case float_round_nearest_even
:
2826 increment
= (mask
+ 1) >> 1;
2827 if ((aSig
& mask
) == increment
) {
2828 increment
= aSig
& (increment
<< 1);
2831 case float_round_up
:
2832 increment
= aSign
? 0 : mask
;
2834 case float_round_down
:
2835 increment
= aSign
? mask
: 0;
2837 default: /* round_to_zero */
2842 if (aSig
>= 0x01000000) {
2846 } else if (aExp
< -14
2847 && STATUS(float_detect_tininess
) == float_tininess_before_rounding
) {
2848 float_raise( float_flag_underflow STATUS_VAR
);
2853 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2854 return packFloat16(aSign
, 0x1f, 0);
2858 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2859 return packFloat16(aSign
, 0x1f, 0x3ff);
2863 return packFloat16(aSign
, 0, 0);
2866 aSig
>>= -14 - aExp
;
2869 return packFloat16(aSign
, aExp
+ 14, aSig
>> 13);
2874 /*----------------------------------------------------------------------------
2875 | Returns the result of converting the double-precision floating-point value
2876 | `a' to the extended double-precision floating-point format. The conversion
2877 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2879 *----------------------------------------------------------------------------*/
2881 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2887 a
= float64_squash_input_denormal(a STATUS_VAR
);
2888 aSig
= extractFloat64Frac( a
);
2889 aExp
= extractFloat64Exp( a
);
2890 aSign
= extractFloat64Sign( a
);
2891 if ( aExp
== 0x7FF ) {
2892 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) );
2893 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2896 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2897 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2901 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2909 /*----------------------------------------------------------------------------
2910 | Returns the result of converting the double-precision floating-point value
2911 | `a' to the quadruple-precision floating-point format. The conversion is
2912 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2914 *----------------------------------------------------------------------------*/
2916 float128
float64_to_float128( float64 a STATUS_PARAM
)
2920 bits64 aSig
, zSig0
, zSig1
;
2922 a
= float64_squash_input_denormal(a STATUS_VAR
);
2923 aSig
= extractFloat64Frac( a
);
2924 aExp
= extractFloat64Exp( a
);
2925 aSign
= extractFloat64Sign( a
);
2926 if ( aExp
== 0x7FF ) {
2927 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) );
2928 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2931 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2932 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2935 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2936 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2942 /*----------------------------------------------------------------------------
2943 | Rounds the double-precision floating-point value `a' to an integer, and
2944 | returns the result as a double-precision floating-point value. The
2945 | operation is performed according to the IEC/IEEE Standard for Binary
2946 | Floating-Point Arithmetic.
2947 *----------------------------------------------------------------------------*/
2949 float64
float64_round_to_int( float64 a STATUS_PARAM
)
2953 bits64 lastBitMask
, roundBitsMask
;
2956 a
= float64_squash_input_denormal(a STATUS_VAR
);
2958 aExp
= extractFloat64Exp( a
);
2959 if ( 0x433 <= aExp
) {
2960 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2961 return propagateFloat64NaN( a
, a STATUS_VAR
);
2965 if ( aExp
< 0x3FF ) {
2966 if ( (bits64
) ( float64_val(a
)<<1 ) == 0 ) return a
;
2967 STATUS(float_exception_flags
) |= float_flag_inexact
;
2968 aSign
= extractFloat64Sign( a
);
2969 switch ( STATUS(float_rounding_mode
) ) {
2970 case float_round_nearest_even
:
2971 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2972 return packFloat64( aSign
, 0x3FF, 0 );
2975 case float_round_down
:
2976 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
2977 case float_round_up
:
2978 return make_float64(
2979 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
2981 return packFloat64( aSign
, 0, 0 );
2984 lastBitMask
<<= 0x433 - aExp
;
2985 roundBitsMask
= lastBitMask
- 1;
2987 roundingMode
= STATUS(float_rounding_mode
);
2988 if ( roundingMode
== float_round_nearest_even
) {
2989 z
+= lastBitMask
>>1;
2990 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2992 else if ( roundingMode
!= float_round_to_zero
) {
2993 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
2997 z
&= ~ roundBitsMask
;
2998 if ( z
!= float64_val(a
) )
2999 STATUS(float_exception_flags
) |= float_flag_inexact
;
3000 return make_float64(z
);
3004 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
3008 oldmode
= STATUS(float_rounding_mode
);
3009 STATUS(float_rounding_mode
) = float_round_to_zero
;
3010 res
= float64_round_to_int(a STATUS_VAR
);
3011 STATUS(float_rounding_mode
) = oldmode
;
3015 /*----------------------------------------------------------------------------
3016 | Returns the result of adding the absolute values of the double-precision
3017 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3018 | before being returned. `zSign' is ignored if the result is a NaN.
3019 | The addition is performed according to the IEC/IEEE Standard for Binary
3020 | Floating-Point Arithmetic.
3021 *----------------------------------------------------------------------------*/
3023 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3025 int16 aExp
, bExp
, zExp
;
3026 bits64 aSig
, bSig
, zSig
;
3029 aSig
= extractFloat64Frac( a
);
3030 aExp
= extractFloat64Exp( a
);
3031 bSig
= extractFloat64Frac( b
);
3032 bExp
= extractFloat64Exp( b
);
3033 expDiff
= aExp
- bExp
;
3036 if ( 0 < expDiff
) {
3037 if ( aExp
== 0x7FF ) {
3038 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3045 bSig
|= LIT64( 0x2000000000000000 );
3047 shift64RightJamming( bSig
, expDiff
, &bSig
);
3050 else if ( expDiff
< 0 ) {
3051 if ( bExp
== 0x7FF ) {
3052 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3053 return packFloat64( zSign
, 0x7FF, 0 );
3059 aSig
|= LIT64( 0x2000000000000000 );
3061 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3065 if ( aExp
== 0x7FF ) {
3066 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3070 if ( STATUS(flush_to_zero
) ) return packFloat64( zSign
, 0, 0 );
3071 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3073 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3077 aSig
|= LIT64( 0x2000000000000000 );
3078 zSig
= ( aSig
+ bSig
)<<1;
3080 if ( (sbits64
) zSig
< 0 ) {
3085 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3089 /*----------------------------------------------------------------------------
3090 | Returns the result of subtracting the absolute values of the double-
3091 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3092 | difference is negated before being returned. `zSign' is ignored if the
3093 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3094 | Standard for Binary Floating-Point Arithmetic.
3095 *----------------------------------------------------------------------------*/
3097 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3099 int16 aExp
, bExp
, zExp
;
3100 bits64 aSig
, bSig
, zSig
;
3103 aSig
= extractFloat64Frac( a
);
3104 aExp
= extractFloat64Exp( a
);
3105 bSig
= extractFloat64Frac( b
);
3106 bExp
= extractFloat64Exp( b
);
3107 expDiff
= aExp
- bExp
;
3110 if ( 0 < expDiff
) goto aExpBigger
;
3111 if ( expDiff
< 0 ) goto bExpBigger
;
3112 if ( aExp
== 0x7FF ) {
3113 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3114 float_raise( float_flag_invalid STATUS_VAR
);
3115 return float64_default_nan
;
3121 if ( bSig
< aSig
) goto aBigger
;
3122 if ( aSig
< bSig
) goto bBigger
;
3123 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3125 if ( bExp
== 0x7FF ) {
3126 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3127 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3133 aSig
|= LIT64( 0x4000000000000000 );
3135 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3136 bSig
|= LIT64( 0x4000000000000000 );
3141 goto normalizeRoundAndPack
;
3143 if ( aExp
== 0x7FF ) {
3144 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3151 bSig
|= LIT64( 0x4000000000000000 );
3153 shift64RightJamming( bSig
, expDiff
, &bSig
);
3154 aSig
|= LIT64( 0x4000000000000000 );
3158 normalizeRoundAndPack
:
3160 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3164 /*----------------------------------------------------------------------------
3165 | Returns the result of adding the double-precision floating-point values `a'
3166 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3167 | Binary Floating-Point Arithmetic.
3168 *----------------------------------------------------------------------------*/
3170 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
3173 a
= float64_squash_input_denormal(a STATUS_VAR
);
3174 b
= float64_squash_input_denormal(b STATUS_VAR
);
3176 aSign
= extractFloat64Sign( a
);
3177 bSign
= extractFloat64Sign( b
);
3178 if ( aSign
== bSign
) {
3179 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3182 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3187 /*----------------------------------------------------------------------------
3188 | Returns the result of subtracting the double-precision floating-point values
3189 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3190 | for Binary Floating-Point Arithmetic.
3191 *----------------------------------------------------------------------------*/
3193 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
3196 a
= float64_squash_input_denormal(a STATUS_VAR
);
3197 b
= float64_squash_input_denormal(b STATUS_VAR
);
3199 aSign
= extractFloat64Sign( a
);
3200 bSign
= extractFloat64Sign( b
);
3201 if ( aSign
== bSign
) {
3202 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3205 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3210 /*----------------------------------------------------------------------------
3211 | Returns the result of multiplying the double-precision floating-point values
3212 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3213 | for Binary Floating-Point Arithmetic.
3214 *----------------------------------------------------------------------------*/
3216 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
3218 flag aSign
, bSign
, zSign
;
3219 int16 aExp
, bExp
, zExp
;
3220 bits64 aSig
, bSig
, zSig0
, zSig1
;
3222 a
= float64_squash_input_denormal(a STATUS_VAR
);
3223 b
= float64_squash_input_denormal(b STATUS_VAR
);
3225 aSig
= extractFloat64Frac( a
);
3226 aExp
= extractFloat64Exp( a
);
3227 aSign
= extractFloat64Sign( a
);
3228 bSig
= extractFloat64Frac( b
);
3229 bExp
= extractFloat64Exp( b
);
3230 bSign
= extractFloat64Sign( b
);
3231 zSign
= aSign
^ bSign
;
3232 if ( aExp
== 0x7FF ) {
3233 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3234 return propagateFloat64NaN( a
, b STATUS_VAR
);
3236 if ( ( bExp
| bSig
) == 0 ) {
3237 float_raise( float_flag_invalid STATUS_VAR
);
3238 return float64_default_nan
;
3240 return packFloat64( zSign
, 0x7FF, 0 );
3242 if ( bExp
== 0x7FF ) {
3243 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3244 if ( ( aExp
| aSig
) == 0 ) {
3245 float_raise( float_flag_invalid STATUS_VAR
);
3246 return float64_default_nan
;
3248 return packFloat64( zSign
, 0x7FF, 0 );
3251 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3252 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3255 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3256 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3258 zExp
= aExp
+ bExp
- 0x3FF;
3259 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3260 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3261 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3262 zSig0
|= ( zSig1
!= 0 );
3263 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
3267 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
3271 /*----------------------------------------------------------------------------
3272 | Returns the result of dividing the double-precision floating-point value `a'
3273 | by the corresponding value `b'. The operation is performed according to
3274 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3275 *----------------------------------------------------------------------------*/
3277 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
3279 flag aSign
, bSign
, zSign
;
3280 int16 aExp
, bExp
, zExp
;
3281 bits64 aSig
, bSig
, zSig
;
3283 bits64 term0
, term1
;
3284 a
= float64_squash_input_denormal(a STATUS_VAR
);
3285 b
= float64_squash_input_denormal(b STATUS_VAR
);
3287 aSig
= extractFloat64Frac( a
);
3288 aExp
= extractFloat64Exp( a
);
3289 aSign
= extractFloat64Sign( a
);
3290 bSig
= extractFloat64Frac( b
);
3291 bExp
= extractFloat64Exp( b
);
3292 bSign
= extractFloat64Sign( b
);
3293 zSign
= aSign
^ bSign
;
3294 if ( aExp
== 0x7FF ) {
3295 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3296 if ( bExp
== 0x7FF ) {
3297 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3298 float_raise( float_flag_invalid STATUS_VAR
);
3299 return float64_default_nan
;
3301 return packFloat64( zSign
, 0x7FF, 0 );
3303 if ( bExp
== 0x7FF ) {
3304 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3305 return packFloat64( zSign
, 0, 0 );
3309 if ( ( aExp
| aSig
) == 0 ) {
3310 float_raise( float_flag_invalid STATUS_VAR
);
3311 return float64_default_nan
;
3313 float_raise( float_flag_divbyzero STATUS_VAR
);
3314 return packFloat64( zSign
, 0x7FF, 0 );
3316 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3319 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3320 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3322 zExp
= aExp
- bExp
+ 0x3FD;
3323 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3324 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3325 if ( bSig
<= ( aSig
+ aSig
) ) {
3329 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3330 if ( ( zSig
& 0x1FF ) <= 2 ) {
3331 mul64To128( bSig
, zSig
, &term0
, &term1
);
3332 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3333 while ( (sbits64
) rem0
< 0 ) {
3335 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3337 zSig
|= ( rem1
!= 0 );
3339 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3343 /*----------------------------------------------------------------------------
3344 | Returns the remainder of the double-precision floating-point value `a'
3345 | with respect to the corresponding value `b'. The operation is performed
3346 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3347 *----------------------------------------------------------------------------*/
3349 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3352 int16 aExp
, bExp
, expDiff
;
3354 bits64 q
, alternateASig
;
3357 a
= float64_squash_input_denormal(a STATUS_VAR
);
3358 b
= float64_squash_input_denormal(b STATUS_VAR
);
3359 aSig
= extractFloat64Frac( a
);
3360 aExp
= extractFloat64Exp( a
);
3361 aSign
= extractFloat64Sign( a
);
3362 bSig
= extractFloat64Frac( b
);
3363 bExp
= extractFloat64Exp( b
);
3364 if ( aExp
== 0x7FF ) {
3365 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3366 return propagateFloat64NaN( a
, b STATUS_VAR
);
3368 float_raise( float_flag_invalid STATUS_VAR
);
3369 return float64_default_nan
;
3371 if ( bExp
== 0x7FF ) {
3372 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3377 float_raise( float_flag_invalid STATUS_VAR
);
3378 return float64_default_nan
;
3380 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3383 if ( aSig
== 0 ) return a
;
3384 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3386 expDiff
= aExp
- bExp
;
3387 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3388 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3389 if ( expDiff
< 0 ) {
3390 if ( expDiff
< -1 ) return a
;
3393 q
= ( bSig
<= aSig
);
3394 if ( q
) aSig
-= bSig
;
3396 while ( 0 < expDiff
) {
3397 q
= estimateDiv128To64( aSig
, 0, bSig
);
3398 q
= ( 2 < q
) ? q
- 2 : 0;
3399 aSig
= - ( ( bSig
>>2 ) * q
);
3403 if ( 0 < expDiff
) {
3404 q
= estimateDiv128To64( aSig
, 0, bSig
);
3405 q
= ( 2 < q
) ? q
- 2 : 0;
3408 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3415 alternateASig
= aSig
;
3418 } while ( 0 <= (sbits64
) aSig
);
3419 sigMean
= aSig
+ alternateASig
;
3420 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3421 aSig
= alternateASig
;
3423 zSign
= ( (sbits64
) aSig
< 0 );
3424 if ( zSign
) aSig
= - aSig
;
3425 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
3429 /*----------------------------------------------------------------------------
3430 | Returns the square root of the double-precision floating-point value `a'.
3431 | The operation is performed according to the IEC/IEEE Standard for Binary
3432 | Floating-Point Arithmetic.
3433 *----------------------------------------------------------------------------*/
3435 float64
float64_sqrt( float64 a STATUS_PARAM
)
3439 bits64 aSig
, zSig
, doubleZSig
;
3440 bits64 rem0
, rem1
, term0
, term1
;
3441 a
= float64_squash_input_denormal(a STATUS_VAR
);
3443 aSig
= extractFloat64Frac( a
);
3444 aExp
= extractFloat64Exp( a
);
3445 aSign
= extractFloat64Sign( a
);
3446 if ( aExp
== 0x7FF ) {
3447 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
3448 if ( ! aSign
) return a
;
3449 float_raise( float_flag_invalid STATUS_VAR
);
3450 return float64_default_nan
;
3453 if ( ( aExp
| aSig
) == 0 ) return a
;
3454 float_raise( float_flag_invalid STATUS_VAR
);
3455 return float64_default_nan
;
3458 if ( aSig
== 0 ) return float64_zero
;
3459 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3461 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3462 aSig
|= LIT64( 0x0010000000000000 );
3463 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3464 aSig
<<= 9 - ( aExp
& 1 );
3465 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3466 if ( ( zSig
& 0x1FF ) <= 5 ) {
3467 doubleZSig
= zSig
<<1;
3468 mul64To128( zSig
, zSig
, &term0
, &term1
);
3469 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3470 while ( (sbits64
) rem0
< 0 ) {
3473 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3475 zSig
|= ( ( rem0
| rem1
) != 0 );
3477 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
3481 /*----------------------------------------------------------------------------
3482 | Returns the binary log of the double-precision floating-point value `a'.
3483 | The operation is performed according to the IEC/IEEE Standard for Binary
3484 | Floating-Point Arithmetic.
3485 *----------------------------------------------------------------------------*/
3486 float64
float64_log2( float64 a STATUS_PARAM
)
3490 bits64 aSig
, aSig0
, aSig1
, zSig
, i
;
3491 a
= float64_squash_input_denormal(a STATUS_VAR
);
3493 aSig
= extractFloat64Frac( a
);
3494 aExp
= extractFloat64Exp( a
);
3495 aSign
= extractFloat64Sign( a
);
3498 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
3499 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3502 float_raise( float_flag_invalid STATUS_VAR
);
3503 return float64_default_nan
;
3505 if ( aExp
== 0x7FF ) {
3506 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
3511 aSig
|= LIT64( 0x0010000000000000 );
3513 zSig
= (bits64
)aExp
<< 52;
3514 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
3515 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
3516 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
3517 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
3525 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
3528 /*----------------------------------------------------------------------------
3529 | Returns 1 if the double-precision floating-point value `a' is equal to the
3530 | corresponding value `b', and 0 otherwise. The comparison is performed
3531 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3532 *----------------------------------------------------------------------------*/
3534 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
3537 a
= float64_squash_input_denormal(a STATUS_VAR
);
3538 b
= float64_squash_input_denormal(b STATUS_VAR
);
3540 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3541 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3543 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3544 float_raise( float_flag_invalid STATUS_VAR
);
3548 av
= float64_val(a
);
3549 bv
= float64_val(b
);
3550 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3554 /*----------------------------------------------------------------------------
3555 | Returns 1 if the double-precision floating-point value `a' is less than or
3556 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3557 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3559 *----------------------------------------------------------------------------*/
3561 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3565 a
= float64_squash_input_denormal(a STATUS_VAR
);
3566 b
= float64_squash_input_denormal(b STATUS_VAR
);
3568 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3569 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3571 float_raise( float_flag_invalid STATUS_VAR
);
3574 aSign
= extractFloat64Sign( a
);
3575 bSign
= extractFloat64Sign( b
);
3576 av
= float64_val(a
);
3577 bv
= float64_val(b
);
3578 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3579 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3583 /*----------------------------------------------------------------------------
3584 | Returns 1 if the double-precision floating-point value `a' is less than
3585 | the corresponding value `b', and 0 otherwise. The comparison is performed
3586 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3587 *----------------------------------------------------------------------------*/
3589 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3594 a
= float64_squash_input_denormal(a STATUS_VAR
);
3595 b
= float64_squash_input_denormal(b STATUS_VAR
);
3596 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3597 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3599 float_raise( float_flag_invalid STATUS_VAR
);
3602 aSign
= extractFloat64Sign( a
);
3603 bSign
= extractFloat64Sign( b
);
3604 av
= float64_val(a
);
3605 bv
= float64_val(b
);
3606 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3607 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3611 /*----------------------------------------------------------------------------
3612 | Returns 1 if the double-precision floating-point value `a' is equal to the
3613 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3614 | if either operand is a NaN. Otherwise, the comparison is performed
3615 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3616 *----------------------------------------------------------------------------*/
3618 int float64_eq_signaling( float64 a
, float64 b STATUS_PARAM
)
3621 a
= float64_squash_input_denormal(a STATUS_VAR
);
3622 b
= float64_squash_input_denormal(b STATUS_VAR
);
3624 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3625 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3627 float_raise( float_flag_invalid STATUS_VAR
);
3630 av
= float64_val(a
);
3631 bv
= float64_val(b
);
3632 return ( av
== bv
) || ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3636 /*----------------------------------------------------------------------------
3637 | Returns 1 if the double-precision floating-point value `a' is less than or
3638 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3639 | cause an exception. Otherwise, the comparison is performed according to the
3640 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3641 *----------------------------------------------------------------------------*/
3643 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3647 a
= float64_squash_input_denormal(a STATUS_VAR
);
3648 b
= float64_squash_input_denormal(b STATUS_VAR
);
3650 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3651 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3653 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3654 float_raise( float_flag_invalid STATUS_VAR
);
3658 aSign
= extractFloat64Sign( a
);
3659 bSign
= extractFloat64Sign( b
);
3660 av
= float64_val(a
);
3661 bv
= float64_val(b
);
3662 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( av
| bv
)<<1 ) == 0 );
3663 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3667 /*----------------------------------------------------------------------------
3668 | Returns 1 if the double-precision floating-point value `a' is less than
3669 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3670 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3671 | Standard for Binary Floating-Point Arithmetic.
3672 *----------------------------------------------------------------------------*/
3674 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3678 a
= float64_squash_input_denormal(a STATUS_VAR
);
3679 b
= float64_squash_input_denormal(b STATUS_VAR
);
3681 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3682 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3684 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3685 float_raise( float_flag_invalid STATUS_VAR
);
3689 aSign
= extractFloat64Sign( a
);
3690 bSign
= extractFloat64Sign( b
);
3691 av
= float64_val(a
);
3692 bv
= float64_val(b
);
3693 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( av
| bv
)<<1 ) != 0 );
3694 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3700 /*----------------------------------------------------------------------------
3701 | Returns the result of converting the extended double-precision floating-
3702 | point value `a' to the 32-bit two's complement integer format. The
3703 | conversion is performed according to the IEC/IEEE Standard for Binary
3704 | Floating-Point Arithmetic---which means in particular that the conversion
3705 | is rounded according to the current rounding mode. If `a' is a NaN, the
3706 | largest positive integer is returned. Otherwise, if the conversion
3707 | overflows, the largest integer with the same sign as `a' is returned.
3708 *----------------------------------------------------------------------------*/
3710 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3713 int32 aExp
, shiftCount
;
3716 aSig
= extractFloatx80Frac( a
);
3717 aExp
= extractFloatx80Exp( a
);
3718 aSign
= extractFloatx80Sign( a
);
3719 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3720 shiftCount
= 0x4037 - aExp
;
3721 if ( shiftCount
<= 0 ) shiftCount
= 1;
3722 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3723 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3727 /*----------------------------------------------------------------------------
3728 | Returns the result of converting the extended double-precision floating-
3729 | point value `a' to the 32-bit two's complement integer format. The
3730 | conversion is performed according to the IEC/IEEE Standard for Binary
3731 | Floating-Point Arithmetic, except that the conversion is always rounded
3732 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3733 | Otherwise, if the conversion overflows, the largest integer with the same
3734 | sign as `a' is returned.
3735 *----------------------------------------------------------------------------*/
3737 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3740 int32 aExp
, shiftCount
;
3741 bits64 aSig
, savedASig
;
3744 aSig
= extractFloatx80Frac( a
);
3745 aExp
= extractFloatx80Exp( a
);
3746 aSign
= extractFloatx80Sign( a
);
3747 if ( 0x401E < aExp
) {
3748 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3751 else if ( aExp
< 0x3FFF ) {
3752 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3755 shiftCount
= 0x403E - aExp
;
3757 aSig
>>= shiftCount
;
3759 if ( aSign
) z
= - z
;
3760 if ( ( z
< 0 ) ^ aSign
) {
3762 float_raise( float_flag_invalid STATUS_VAR
);
3763 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3765 if ( ( aSig
<<shiftCount
) != savedASig
) {
3766 STATUS(float_exception_flags
) |= float_flag_inexact
;
3772 /*----------------------------------------------------------------------------
3773 | Returns the result of converting the extended double-precision floating-
3774 | point value `a' to the 64-bit two's complement integer format. The
3775 | conversion is performed according to the IEC/IEEE Standard for Binary
3776 | Floating-Point Arithmetic---which means in particular that the conversion
3777 | is rounded according to the current rounding mode. If `a' is a NaN,
3778 | the largest positive integer is returned. Otherwise, if the conversion
3779 | overflows, the largest integer with the same sign as `a' is returned.
3780 *----------------------------------------------------------------------------*/
3782 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3785 int32 aExp
, shiftCount
;
3786 bits64 aSig
, aSigExtra
;
3788 aSig
= extractFloatx80Frac( a
);
3789 aExp
= extractFloatx80Exp( a
);
3790 aSign
= extractFloatx80Sign( a
);
3791 shiftCount
= 0x403E - aExp
;
3792 if ( shiftCount
<= 0 ) {
3794 float_raise( float_flag_invalid STATUS_VAR
);
3796 || ( ( aExp
== 0x7FFF )
3797 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3799 return LIT64( 0x7FFFFFFFFFFFFFFF );
3801 return (sbits64
) LIT64( 0x8000000000000000 );
3806 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3808 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3812 /*----------------------------------------------------------------------------
3813 | Returns the result of converting the extended double-precision floating-
3814 | point value `a' to the 64-bit two's complement integer format. The
3815 | conversion is performed according to the IEC/IEEE Standard for Binary
3816 | Floating-Point Arithmetic, except that the conversion is always rounded
3817 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3818 | Otherwise, if the conversion overflows, the largest integer with the same
3819 | sign as `a' is returned.
3820 *----------------------------------------------------------------------------*/
3822 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3825 int32 aExp
, shiftCount
;
3829 aSig
= extractFloatx80Frac( a
);
3830 aExp
= extractFloatx80Exp( a
);
3831 aSign
= extractFloatx80Sign( a
);
3832 shiftCount
= aExp
- 0x403E;
3833 if ( 0 <= shiftCount
) {
3834 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3835 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3836 float_raise( float_flag_invalid STATUS_VAR
);
3837 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3838 return LIT64( 0x7FFFFFFFFFFFFFFF );
3841 return (sbits64
) LIT64( 0x8000000000000000 );
3843 else if ( aExp
< 0x3FFF ) {
3844 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3847 z
= aSig
>>( - shiftCount
);
3848 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3849 STATUS(float_exception_flags
) |= float_flag_inexact
;
3851 if ( aSign
) z
= - z
;
3856 /*----------------------------------------------------------------------------
3857 | Returns the result of converting the extended double-precision floating-
3858 | point value `a' to the single-precision floating-point format. The
3859 | conversion is performed according to the IEC/IEEE Standard for Binary
3860 | Floating-Point Arithmetic.
3861 *----------------------------------------------------------------------------*/
3863 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3869 aSig
= extractFloatx80Frac( a
);
3870 aExp
= extractFloatx80Exp( a
);
3871 aSign
= extractFloatx80Sign( a
);
3872 if ( aExp
== 0x7FFF ) {
3873 if ( (bits64
) ( aSig
<<1 ) ) {
3874 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) );
3876 return packFloat32( aSign
, 0xFF, 0 );
3878 shift64RightJamming( aSig
, 33, &aSig
);
3879 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3880 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
3884 /*----------------------------------------------------------------------------
3885 | Returns the result of converting the extended double-precision floating-
3886 | point value `a' to the double-precision floating-point format. The
3887 | conversion is performed according to the IEC/IEEE Standard for Binary
3888 | Floating-Point Arithmetic.
3889 *----------------------------------------------------------------------------*/
3891 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
3897 aSig
= extractFloatx80Frac( a
);
3898 aExp
= extractFloatx80Exp( a
);
3899 aSign
= extractFloatx80Sign( a
);
3900 if ( aExp
== 0x7FFF ) {
3901 if ( (bits64
) ( aSig
<<1 ) ) {
3902 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) );
3904 return packFloat64( aSign
, 0x7FF, 0 );
3906 shift64RightJamming( aSig
, 1, &zSig
);
3907 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3908 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
3914 /*----------------------------------------------------------------------------
3915 | Returns the result of converting the extended double-precision floating-
3916 | point value `a' to the quadruple-precision floating-point format. The
3917 | conversion is performed according to the IEC/IEEE Standard for Binary
3918 | Floating-Point Arithmetic.
3919 *----------------------------------------------------------------------------*/
3921 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
3925 bits64 aSig
, zSig0
, zSig1
;
3927 aSig
= extractFloatx80Frac( a
);
3928 aExp
= extractFloatx80Exp( a
);
3929 aSign
= extractFloatx80Sign( a
);
3930 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3931 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) );
3933 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3934 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3940 /*----------------------------------------------------------------------------
3941 | Rounds the extended double-precision floating-point value `a' to an integer,
3942 | and returns the result as an extended quadruple-precision floating-point
3943 | value. The operation is performed according to the IEC/IEEE Standard for
3944 | Binary Floating-Point Arithmetic.
3945 *----------------------------------------------------------------------------*/
3947 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
3951 bits64 lastBitMask
, roundBitsMask
;
3955 aExp
= extractFloatx80Exp( a
);
3956 if ( 0x403E <= aExp
) {
3957 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3958 return propagateFloatx80NaN( a
, a STATUS_VAR
);
3962 if ( aExp
< 0x3FFF ) {
3964 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3967 STATUS(float_exception_flags
) |= float_flag_inexact
;
3968 aSign
= extractFloatx80Sign( a
);
3969 switch ( STATUS(float_rounding_mode
) ) {
3970 case float_round_nearest_even
:
3971 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3974 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3977 case float_round_down
:
3980 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3981 : packFloatx80( 0, 0, 0 );
3982 case float_round_up
:
3984 aSign
? packFloatx80( 1, 0, 0 )
3985 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3987 return packFloatx80( aSign
, 0, 0 );
3990 lastBitMask
<<= 0x403E - aExp
;
3991 roundBitsMask
= lastBitMask
- 1;
3993 roundingMode
= STATUS(float_rounding_mode
);
3994 if ( roundingMode
== float_round_nearest_even
) {
3995 z
.low
+= lastBitMask
>>1;
3996 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3998 else if ( roundingMode
!= float_round_to_zero
) {
3999 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
4000 z
.low
+= roundBitsMask
;
4003 z
.low
&= ~ roundBitsMask
;
4006 z
.low
= LIT64( 0x8000000000000000 );
4008 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4013 /*----------------------------------------------------------------------------
4014 | Returns the result of adding the absolute values of the extended double-
4015 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4016 | negated before being returned. `zSign' is ignored if the result is a NaN.
4017 | The addition is performed according to the IEC/IEEE Standard for Binary
4018 | Floating-Point Arithmetic.
4019 *----------------------------------------------------------------------------*/
4021 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4023 int32 aExp
, bExp
, zExp
;
4024 bits64 aSig
, bSig
, zSig0
, zSig1
;
4027 aSig
= extractFloatx80Frac( a
);
4028 aExp
= extractFloatx80Exp( a
);
4029 bSig
= extractFloatx80Frac( b
);
4030 bExp
= extractFloatx80Exp( b
);
4031 expDiff
= aExp
- bExp
;
4032 if ( 0 < expDiff
) {
4033 if ( aExp
== 0x7FFF ) {
4034 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4037 if ( bExp
== 0 ) --expDiff
;
4038 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4041 else if ( expDiff
< 0 ) {
4042 if ( bExp
== 0x7FFF ) {
4043 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4044 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4046 if ( aExp
== 0 ) ++expDiff
;
4047 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4051 if ( aExp
== 0x7FFF ) {
4052 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
4053 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4058 zSig0
= aSig
+ bSig
;
4060 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4066 zSig0
= aSig
+ bSig
;
4067 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
4069 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4070 zSig0
|= LIT64( 0x8000000000000000 );
4074 roundAndPackFloatx80(
4075 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4079 /*----------------------------------------------------------------------------
4080 | Returns the result of subtracting the absolute values of the extended
4081 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4082 | difference is negated before being returned. `zSign' is ignored if the
4083 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4084 | Standard for Binary Floating-Point Arithmetic.
4085 *----------------------------------------------------------------------------*/
4087 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4089 int32 aExp
, bExp
, zExp
;
4090 bits64 aSig
, bSig
, zSig0
, zSig1
;
4094 aSig
= extractFloatx80Frac( a
);
4095 aExp
= extractFloatx80Exp( a
);
4096 bSig
= extractFloatx80Frac( b
);
4097 bExp
= extractFloatx80Exp( b
);
4098 expDiff
= aExp
- bExp
;
4099 if ( 0 < expDiff
) goto aExpBigger
;
4100 if ( expDiff
< 0 ) goto bExpBigger
;
4101 if ( aExp
== 0x7FFF ) {
4102 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
4103 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4105 float_raise( float_flag_invalid STATUS_VAR
);
4106 z
.low
= floatx80_default_nan_low
;
4107 z
.high
= floatx80_default_nan_high
;
4115 if ( bSig
< aSig
) goto aBigger
;
4116 if ( aSig
< bSig
) goto bBigger
;
4117 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
4119 if ( bExp
== 0x7FFF ) {
4120 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4121 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4123 if ( aExp
== 0 ) ++expDiff
;
4124 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4126 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4129 goto normalizeRoundAndPack
;
4131 if ( aExp
== 0x7FFF ) {
4132 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4135 if ( bExp
== 0 ) --expDiff
;
4136 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4138 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4140 normalizeRoundAndPack
:
4142 normalizeRoundAndPackFloatx80(
4143 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4147 /*----------------------------------------------------------------------------
4148 | Returns the result of adding the extended double-precision floating-point
4149 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4150 | Standard for Binary Floating-Point Arithmetic.
4151 *----------------------------------------------------------------------------*/
4153 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
4157 aSign
= extractFloatx80Sign( a
);
4158 bSign
= extractFloatx80Sign( b
);
4159 if ( aSign
== bSign
) {
4160 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4163 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4168 /*----------------------------------------------------------------------------
4169 | Returns the result of subtracting the extended double-precision floating-
4170 | point values `a' and `b'. The operation is performed according to the
4171 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4172 *----------------------------------------------------------------------------*/
4174 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
4178 aSign
= extractFloatx80Sign( a
);
4179 bSign
= extractFloatx80Sign( b
);
4180 if ( aSign
== bSign
) {
4181 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4184 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4189 /*----------------------------------------------------------------------------
4190 | Returns the result of multiplying the extended double-precision floating-
4191 | point values `a' and `b'. The operation is performed according to the
4192 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4193 *----------------------------------------------------------------------------*/
4195 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
4197 flag aSign
, bSign
, zSign
;
4198 int32 aExp
, bExp
, zExp
;
4199 bits64 aSig
, bSig
, zSig0
, zSig1
;
4202 aSig
= extractFloatx80Frac( a
);
4203 aExp
= extractFloatx80Exp( a
);
4204 aSign
= extractFloatx80Sign( a
);
4205 bSig
= extractFloatx80Frac( b
);
4206 bExp
= extractFloatx80Exp( b
);
4207 bSign
= extractFloatx80Sign( b
);
4208 zSign
= aSign
^ bSign
;
4209 if ( aExp
== 0x7FFF ) {
4210 if ( (bits64
) ( aSig
<<1 )
4211 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
4212 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4214 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
4215 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4217 if ( bExp
== 0x7FFF ) {
4218 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4219 if ( ( aExp
| aSig
) == 0 ) {
4221 float_raise( float_flag_invalid STATUS_VAR
);
4222 z
.low
= floatx80_default_nan_low
;
4223 z
.high
= floatx80_default_nan_high
;
4226 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4229 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4230 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4233 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4234 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4236 zExp
= aExp
+ bExp
- 0x3FFE;
4237 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4238 if ( 0 < (sbits64
) zSig0
) {
4239 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4243 roundAndPackFloatx80(
4244 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4248 /*----------------------------------------------------------------------------
4249 | Returns the result of dividing the extended double-precision floating-point
4250 | value `a' by the corresponding value `b'. The operation is performed
4251 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4252 *----------------------------------------------------------------------------*/
4254 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
4256 flag aSign
, bSign
, zSign
;
4257 int32 aExp
, bExp
, zExp
;
4258 bits64 aSig
, bSig
, zSig0
, zSig1
;
4259 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
4262 aSig
= extractFloatx80Frac( a
);
4263 aExp
= extractFloatx80Exp( a
);
4264 aSign
= extractFloatx80Sign( a
);
4265 bSig
= extractFloatx80Frac( b
);
4266 bExp
= extractFloatx80Exp( b
);
4267 bSign
= extractFloatx80Sign( b
);
4268 zSign
= aSign
^ bSign
;
4269 if ( aExp
== 0x7FFF ) {
4270 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4271 if ( bExp
== 0x7FFF ) {
4272 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4275 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4277 if ( bExp
== 0x7FFF ) {
4278 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4279 return packFloatx80( zSign
, 0, 0 );
4283 if ( ( aExp
| aSig
) == 0 ) {
4285 float_raise( float_flag_invalid STATUS_VAR
);
4286 z
.low
= floatx80_default_nan_low
;
4287 z
.high
= floatx80_default_nan_high
;
4290 float_raise( float_flag_divbyzero STATUS_VAR
);
4291 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4293 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4296 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4297 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4299 zExp
= aExp
- bExp
+ 0x3FFE;
4301 if ( bSig
<= aSig
) {
4302 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
4305 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
4306 mul64To128( bSig
, zSig0
, &term0
, &term1
);
4307 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
4308 while ( (sbits64
) rem0
< 0 ) {
4310 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4312 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
4313 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
4314 mul64To128( bSig
, zSig1
, &term1
, &term2
);
4315 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4316 while ( (sbits64
) rem1
< 0 ) {
4318 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
4320 zSig1
|= ( ( rem1
| rem2
) != 0 );
4323 roundAndPackFloatx80(
4324 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4328 /*----------------------------------------------------------------------------
4329 | Returns the remainder of the extended double-precision floating-point value
4330 | `a' with respect to the corresponding value `b'. The operation is performed
4331 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4332 *----------------------------------------------------------------------------*/
4334 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
4337 int32 aExp
, bExp
, expDiff
;
4338 bits64 aSig0
, aSig1
, bSig
;
4339 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
4342 aSig0
= extractFloatx80Frac( a
);
4343 aExp
= extractFloatx80Exp( a
);
4344 aSign
= extractFloatx80Sign( a
);
4345 bSig
= extractFloatx80Frac( b
);
4346 bExp
= extractFloatx80Exp( b
);
4347 if ( aExp
== 0x7FFF ) {
4348 if ( (bits64
) ( aSig0
<<1 )
4349 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
4350 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4354 if ( bExp
== 0x7FFF ) {
4355 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4361 float_raise( float_flag_invalid STATUS_VAR
);
4362 z
.low
= floatx80_default_nan_low
;
4363 z
.high
= floatx80_default_nan_high
;
4366 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4369 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
4370 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4372 bSig
|= LIT64( 0x8000000000000000 );
4374 expDiff
= aExp
- bExp
;
4376 if ( expDiff
< 0 ) {
4377 if ( expDiff
< -1 ) return a
;
4378 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
4381 q
= ( bSig
<= aSig0
);
4382 if ( q
) aSig0
-= bSig
;
4384 while ( 0 < expDiff
) {
4385 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4386 q
= ( 2 < q
) ? q
- 2 : 0;
4387 mul64To128( bSig
, q
, &term0
, &term1
);
4388 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4389 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
4393 if ( 0 < expDiff
) {
4394 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4395 q
= ( 2 < q
) ? q
- 2 : 0;
4397 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
4398 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4399 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
4400 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
4402 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4409 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
4410 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4411 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4414 aSig0
= alternateASig0
;
4415 aSig1
= alternateASig1
;
4419 normalizeRoundAndPackFloatx80(
4420 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
4424 /*----------------------------------------------------------------------------
4425 | Returns the square root of the extended double-precision floating-point
4426 | value `a'. The operation is performed according to the IEC/IEEE Standard
4427 | for Binary Floating-Point Arithmetic.
4428 *----------------------------------------------------------------------------*/
4430 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
4434 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
4435 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4438 aSig0
= extractFloatx80Frac( a
);
4439 aExp
= extractFloatx80Exp( a
);
4440 aSign
= extractFloatx80Sign( a
);
4441 if ( aExp
== 0x7FFF ) {
4442 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
4443 if ( ! aSign
) return a
;
4447 if ( ( aExp
| aSig0
) == 0 ) return a
;
4449 float_raise( float_flag_invalid STATUS_VAR
);
4450 z
.low
= floatx80_default_nan_low
;
4451 z
.high
= floatx80_default_nan_high
;
4455 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4456 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4458 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4459 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4460 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4461 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4462 doubleZSig0
= zSig0
<<1;
4463 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4464 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4465 while ( (sbits64
) rem0
< 0 ) {
4468 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4470 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4471 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4472 if ( zSig1
== 0 ) zSig1
= 1;
4473 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4474 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4475 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4476 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4477 while ( (sbits64
) rem1
< 0 ) {
4479 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4481 term2
|= doubleZSig0
;
4482 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4484 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4486 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4487 zSig0
|= doubleZSig0
;
4489 roundAndPackFloatx80(
4490 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
4494 /*----------------------------------------------------------------------------
4495 | Returns 1 if the extended double-precision floating-point value `a' is
4496 | equal to the corresponding value `b', and 0 otherwise. The comparison is
4497 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4499 *----------------------------------------------------------------------------*/
4501 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
4504 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4505 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4506 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4507 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4509 if ( floatx80_is_signaling_nan( a
)
4510 || floatx80_is_signaling_nan( b
) ) {
4511 float_raise( float_flag_invalid STATUS_VAR
);
4517 && ( ( a
.high
== b
.high
)
4519 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4524 /*----------------------------------------------------------------------------
4525 | Returns 1 if the extended double-precision floating-point value `a' is
4526 | less than or equal to the corresponding value `b', and 0 otherwise. The
4527 | comparison is performed according to the IEC/IEEE Standard for Binary
4528 | Floating-Point Arithmetic.
4529 *----------------------------------------------------------------------------*/
4531 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
4535 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4536 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4537 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4538 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4540 float_raise( float_flag_invalid STATUS_VAR
);
4543 aSign
= extractFloatx80Sign( a
);
4544 bSign
= extractFloatx80Sign( b
);
4545 if ( aSign
!= bSign
) {
4548 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4552 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4553 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4557 /*----------------------------------------------------------------------------
4558 | Returns 1 if the extended double-precision floating-point value `a' is
4559 | less than the corresponding value `b', and 0 otherwise. The comparison
4560 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4562 *----------------------------------------------------------------------------*/
4564 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
4568 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4569 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4570 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4571 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4573 float_raise( float_flag_invalid STATUS_VAR
);
4576 aSign
= extractFloatx80Sign( a
);
4577 bSign
= extractFloatx80Sign( b
);
4578 if ( aSign
!= bSign
) {
4581 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4585 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4586 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4590 /*----------------------------------------------------------------------------
4591 | Returns 1 if the extended double-precision floating-point value `a' is equal
4592 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4593 | raised if either operand is a NaN. Otherwise, the comparison is performed
4594 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4595 *----------------------------------------------------------------------------*/
4597 int floatx80_eq_signaling( floatx80 a
, floatx80 b STATUS_PARAM
)
4600 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4601 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4602 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4603 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4605 float_raise( float_flag_invalid STATUS_VAR
);
4610 && ( ( a
.high
== b
.high
)
4612 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4617 /*----------------------------------------------------------------------------
4618 | Returns 1 if the extended double-precision floating-point value `a' is less
4619 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4620 | do not cause an exception. Otherwise, the comparison is performed according
4621 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4622 *----------------------------------------------------------------------------*/
4624 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4628 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4629 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4630 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4631 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4633 if ( floatx80_is_signaling_nan( a
)
4634 || floatx80_is_signaling_nan( b
) ) {
4635 float_raise( float_flag_invalid STATUS_VAR
);
4639 aSign
= extractFloatx80Sign( a
);
4640 bSign
= extractFloatx80Sign( b
);
4641 if ( aSign
!= bSign
) {
4644 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4648 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4649 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4653 /*----------------------------------------------------------------------------
4654 | Returns 1 if the extended double-precision floating-point value `a' is less
4655 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4656 | an exception. Otherwise, the comparison is performed according to the
4657 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4658 *----------------------------------------------------------------------------*/
4660 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4664 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4665 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4666 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4667 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4669 if ( floatx80_is_signaling_nan( a
)
4670 || floatx80_is_signaling_nan( b
) ) {
4671 float_raise( float_flag_invalid STATUS_VAR
);
4675 aSign
= extractFloatx80Sign( a
);
4676 bSign
= extractFloatx80Sign( b
);
4677 if ( aSign
!= bSign
) {
4680 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4684 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4685 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4693 /*----------------------------------------------------------------------------
4694 | Returns the result of converting the quadruple-precision floating-point
4695 | value `a' to the 32-bit two's complement integer format. The conversion
4696 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4697 | Arithmetic---which means in particular that the conversion is rounded
4698 | according to the current rounding mode. If `a' is a NaN, the largest
4699 | positive integer is returned. Otherwise, if the conversion overflows, the
4700 | largest integer with the same sign as `a' is returned.
4701 *----------------------------------------------------------------------------*/
4703 int32
float128_to_int32( float128 a STATUS_PARAM
)
4706 int32 aExp
, shiftCount
;
4707 bits64 aSig0
, aSig1
;
4709 aSig1
= extractFloat128Frac1( a
);
4710 aSig0
= extractFloat128Frac0( a
);
4711 aExp
= extractFloat128Exp( a
);
4712 aSign
= extractFloat128Sign( a
);
4713 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4714 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4715 aSig0
|= ( aSig1
!= 0 );
4716 shiftCount
= 0x4028 - aExp
;
4717 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4718 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4722 /*----------------------------------------------------------------------------
4723 | Returns the result of converting the quadruple-precision floating-point
4724 | value `a' to the 32-bit two's complement integer format. The conversion
4725 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4726 | Arithmetic, except that the conversion is always rounded toward zero. If
4727 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4728 | conversion overflows, the largest integer with the same sign as `a' is
4730 *----------------------------------------------------------------------------*/
4732 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4735 int32 aExp
, shiftCount
;
4736 bits64 aSig0
, aSig1
, savedASig
;
4739 aSig1
= extractFloat128Frac1( a
);
4740 aSig0
= extractFloat128Frac0( a
);
4741 aExp
= extractFloat128Exp( a
);
4742 aSign
= extractFloat128Sign( a
);
4743 aSig0
|= ( aSig1
!= 0 );
4744 if ( 0x401E < aExp
) {
4745 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4748 else if ( aExp
< 0x3FFF ) {
4749 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4752 aSig0
|= LIT64( 0x0001000000000000 );
4753 shiftCount
= 0x402F - aExp
;
4755 aSig0
>>= shiftCount
;
4757 if ( aSign
) z
= - z
;
4758 if ( ( z
< 0 ) ^ aSign
) {
4760 float_raise( float_flag_invalid STATUS_VAR
);
4761 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4763 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4764 STATUS(float_exception_flags
) |= float_flag_inexact
;
4770 /*----------------------------------------------------------------------------
4771 | Returns the result of converting the quadruple-precision floating-point
4772 | value `a' to the 64-bit two's complement integer format. The conversion
4773 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4774 | Arithmetic---which means in particular that the conversion is rounded
4775 | according to the current rounding mode. If `a' is a NaN, the largest
4776 | positive integer is returned. Otherwise, if the conversion overflows, the
4777 | largest integer with the same sign as `a' is returned.
4778 *----------------------------------------------------------------------------*/
4780 int64
float128_to_int64( float128 a STATUS_PARAM
)
4783 int32 aExp
, shiftCount
;
4784 bits64 aSig0
, aSig1
;
4786 aSig1
= extractFloat128Frac1( a
);
4787 aSig0
= extractFloat128Frac0( a
);
4788 aExp
= extractFloat128Exp( a
);
4789 aSign
= extractFloat128Sign( a
);
4790 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4791 shiftCount
= 0x402F - aExp
;
4792 if ( shiftCount
<= 0 ) {
4793 if ( 0x403E < aExp
) {
4794 float_raise( float_flag_invalid STATUS_VAR
);
4796 || ( ( aExp
== 0x7FFF )
4797 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4800 return LIT64( 0x7FFFFFFFFFFFFFFF );
4802 return (sbits64
) LIT64( 0x8000000000000000 );
4804 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4807 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4809 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4813 /*----------------------------------------------------------------------------
4814 | Returns the result of converting the quadruple-precision floating-point
4815 | value `a' to the 64-bit two's complement integer format. The conversion
4816 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4817 | Arithmetic, except that the conversion is always rounded toward zero.
4818 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4819 | the conversion overflows, the largest integer with the same sign as `a' is
4821 *----------------------------------------------------------------------------*/
4823 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4826 int32 aExp
, shiftCount
;
4827 bits64 aSig0
, aSig1
;
4830 aSig1
= extractFloat128Frac1( a
);
4831 aSig0
= extractFloat128Frac0( a
);
4832 aExp
= extractFloat128Exp( a
);
4833 aSign
= extractFloat128Sign( a
);
4834 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4835 shiftCount
= aExp
- 0x402F;
4836 if ( 0 < shiftCount
) {
4837 if ( 0x403E <= aExp
) {
4838 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4839 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4840 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4841 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4844 float_raise( float_flag_invalid STATUS_VAR
);
4845 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4846 return LIT64( 0x7FFFFFFFFFFFFFFF );
4849 return (sbits64
) LIT64( 0x8000000000000000 );
4851 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4852 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4853 STATUS(float_exception_flags
) |= float_flag_inexact
;
4857 if ( aExp
< 0x3FFF ) {
4858 if ( aExp
| aSig0
| aSig1
) {
4859 STATUS(float_exception_flags
) |= float_flag_inexact
;
4863 z
= aSig0
>>( - shiftCount
);
4865 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4866 STATUS(float_exception_flags
) |= float_flag_inexact
;
4869 if ( aSign
) z
= - z
;
4874 /*----------------------------------------------------------------------------
4875 | Returns the result of converting the quadruple-precision floating-point
4876 | value `a' to the single-precision floating-point format. The conversion
4877 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4879 *----------------------------------------------------------------------------*/
4881 float32
float128_to_float32( float128 a STATUS_PARAM
)
4885 bits64 aSig0
, aSig1
;
4888 aSig1
= extractFloat128Frac1( a
);
4889 aSig0
= extractFloat128Frac0( a
);
4890 aExp
= extractFloat128Exp( a
);
4891 aSign
= extractFloat128Sign( a
);
4892 if ( aExp
== 0x7FFF ) {
4893 if ( aSig0
| aSig1
) {
4894 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) );
4896 return packFloat32( aSign
, 0xFF, 0 );
4898 aSig0
|= ( aSig1
!= 0 );
4899 shift64RightJamming( aSig0
, 18, &aSig0
);
4901 if ( aExp
|| zSig
) {
4905 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
4909 /*----------------------------------------------------------------------------
4910 | Returns the result of converting the quadruple-precision floating-point
4911 | value `a' to the double-precision floating-point format. The conversion
4912 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4914 *----------------------------------------------------------------------------*/
4916 float64
float128_to_float64( float128 a STATUS_PARAM
)
4920 bits64 aSig0
, aSig1
;
4922 aSig1
= extractFloat128Frac1( a
);
4923 aSig0
= extractFloat128Frac0( a
);
4924 aExp
= extractFloat128Exp( a
);
4925 aSign
= extractFloat128Sign( a
);
4926 if ( aExp
== 0x7FFF ) {
4927 if ( aSig0
| aSig1
) {
4928 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) );
4930 return packFloat64( aSign
, 0x7FF, 0 );
4932 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4933 aSig0
|= ( aSig1
!= 0 );
4934 if ( aExp
|| aSig0
) {
4935 aSig0
|= LIT64( 0x4000000000000000 );
4938 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
4944 /*----------------------------------------------------------------------------
4945 | Returns the result of converting the quadruple-precision floating-point
4946 | value `a' to the extended double-precision floating-point format. The
4947 | conversion is performed according to the IEC/IEEE Standard for Binary
4948 | Floating-Point Arithmetic.
4949 *----------------------------------------------------------------------------*/
4951 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
4955 bits64 aSig0
, aSig1
;
4957 aSig1
= extractFloat128Frac1( a
);
4958 aSig0
= extractFloat128Frac0( a
);
4959 aExp
= extractFloat128Exp( a
);
4960 aSign
= extractFloat128Sign( a
);
4961 if ( aExp
== 0x7FFF ) {
4962 if ( aSig0
| aSig1
) {
4963 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) );
4965 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4968 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4969 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4972 aSig0
|= LIT64( 0x0001000000000000 );
4974 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4975 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
4981 /*----------------------------------------------------------------------------
4982 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4983 | returns the result as a quadruple-precision floating-point value. The
4984 | operation is performed according to the IEC/IEEE Standard for Binary
4985 | Floating-Point Arithmetic.
4986 *----------------------------------------------------------------------------*/
4988 float128
float128_round_to_int( float128 a STATUS_PARAM
)
4992 bits64 lastBitMask
, roundBitsMask
;
4996 aExp
= extractFloat128Exp( a
);
4997 if ( 0x402F <= aExp
) {
4998 if ( 0x406F <= aExp
) {
4999 if ( ( aExp
== 0x7FFF )
5000 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5002 return propagateFloat128NaN( a
, a STATUS_VAR
);
5007 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
5008 roundBitsMask
= lastBitMask
- 1;
5010 roundingMode
= STATUS(float_rounding_mode
);
5011 if ( roundingMode
== float_round_nearest_even
) {
5012 if ( lastBitMask
) {
5013 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
5014 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
5017 if ( (sbits64
) z
.low
< 0 ) {
5019 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
5023 else if ( roundingMode
!= float_round_to_zero
) {
5024 if ( extractFloat128Sign( z
)
5025 ^ ( roundingMode
== float_round_up
) ) {
5026 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
5029 z
.low
&= ~ roundBitsMask
;
5032 if ( aExp
< 0x3FFF ) {
5033 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
5034 STATUS(float_exception_flags
) |= float_flag_inexact
;
5035 aSign
= extractFloat128Sign( a
);
5036 switch ( STATUS(float_rounding_mode
) ) {
5037 case float_round_nearest_even
:
5038 if ( ( aExp
== 0x3FFE )
5039 && ( extractFloat128Frac0( a
)
5040 | extractFloat128Frac1( a
) )
5042 return packFloat128( aSign
, 0x3FFF, 0, 0 );
5045 case float_round_down
:
5047 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
5048 : packFloat128( 0, 0, 0, 0 );
5049 case float_round_up
:
5051 aSign
? packFloat128( 1, 0, 0, 0 )
5052 : packFloat128( 0, 0x3FFF, 0, 0 );
5054 return packFloat128( aSign
, 0, 0, 0 );
5057 lastBitMask
<<= 0x402F - aExp
;
5058 roundBitsMask
= lastBitMask
- 1;
5061 roundingMode
= STATUS(float_rounding_mode
);
5062 if ( roundingMode
== float_round_nearest_even
) {
5063 z
.high
+= lastBitMask
>>1;
5064 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
5065 z
.high
&= ~ lastBitMask
;
5068 else if ( roundingMode
!= float_round_to_zero
) {
5069 if ( extractFloat128Sign( z
)
5070 ^ ( roundingMode
== float_round_up
) ) {
5071 z
.high
|= ( a
.low
!= 0 );
5072 z
.high
+= roundBitsMask
;
5075 z
.high
&= ~ roundBitsMask
;
5077 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
5078 STATUS(float_exception_flags
) |= float_flag_inexact
;
5084 /*----------------------------------------------------------------------------
5085 | Returns the result of adding the absolute values of the quadruple-precision
5086 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5087 | before being returned. `zSign' is ignored if the result is a NaN.
5088 | The addition is performed according to the IEC/IEEE Standard for Binary
5089 | Floating-Point Arithmetic.
5090 *----------------------------------------------------------------------------*/
5092 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5094 int32 aExp
, bExp
, zExp
;
5095 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5098 aSig1
= extractFloat128Frac1( a
);
5099 aSig0
= extractFloat128Frac0( a
);
5100 aExp
= extractFloat128Exp( a
);
5101 bSig1
= extractFloat128Frac1( b
);
5102 bSig0
= extractFloat128Frac0( b
);
5103 bExp
= extractFloat128Exp( b
);
5104 expDiff
= aExp
- bExp
;
5105 if ( 0 < expDiff
) {
5106 if ( aExp
== 0x7FFF ) {
5107 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5114 bSig0
|= LIT64( 0x0001000000000000 );
5116 shift128ExtraRightJamming(
5117 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
5120 else if ( expDiff
< 0 ) {
5121 if ( bExp
== 0x7FFF ) {
5122 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5123 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5129 aSig0
|= LIT64( 0x0001000000000000 );
5131 shift128ExtraRightJamming(
5132 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
5136 if ( aExp
== 0x7FFF ) {
5137 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5138 return propagateFloat128NaN( a
, b STATUS_VAR
);
5142 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5144 if ( STATUS(flush_to_zero
) ) return packFloat128( zSign
, 0, 0, 0 );
5145 return packFloat128( zSign
, 0, zSig0
, zSig1
);
5148 zSig0
|= LIT64( 0x0002000000000000 );
5152 aSig0
|= LIT64( 0x0001000000000000 );
5153 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5155 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
5158 shift128ExtraRightJamming(
5159 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5161 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5165 /*----------------------------------------------------------------------------
5166 | Returns the result of subtracting the absolute values of the quadruple-
5167 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5168 | difference is negated before being returned. `zSign' is ignored if the
5169 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5170 | Standard for Binary Floating-Point Arithmetic.
5171 *----------------------------------------------------------------------------*/
5173 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5175 int32 aExp
, bExp
, zExp
;
5176 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
5180 aSig1
= extractFloat128Frac1( a
);
5181 aSig0
= extractFloat128Frac0( a
);
5182 aExp
= extractFloat128Exp( a
);
5183 bSig1
= extractFloat128Frac1( b
);
5184 bSig0
= extractFloat128Frac0( b
);
5185 bExp
= extractFloat128Exp( b
);
5186 expDiff
= aExp
- bExp
;
5187 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5188 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
5189 if ( 0 < expDiff
) goto aExpBigger
;
5190 if ( expDiff
< 0 ) goto bExpBigger
;
5191 if ( aExp
== 0x7FFF ) {
5192 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5193 return propagateFloat128NaN( a
, b STATUS_VAR
);
5195 float_raise( float_flag_invalid STATUS_VAR
);
5196 z
.low
= float128_default_nan_low
;
5197 z
.high
= float128_default_nan_high
;
5204 if ( bSig0
< aSig0
) goto aBigger
;
5205 if ( aSig0
< bSig0
) goto bBigger
;
5206 if ( bSig1
< aSig1
) goto aBigger
;
5207 if ( aSig1
< bSig1
) goto bBigger
;
5208 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
5210 if ( bExp
== 0x7FFF ) {
5211 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5212 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
5218 aSig0
|= LIT64( 0x4000000000000000 );
5220 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5221 bSig0
|= LIT64( 0x4000000000000000 );
5223 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5226 goto normalizeRoundAndPack
;
5228 if ( aExp
== 0x7FFF ) {
5229 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5236 bSig0
|= LIT64( 0x4000000000000000 );
5238 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
5239 aSig0
|= LIT64( 0x4000000000000000 );
5241 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5243 normalizeRoundAndPack
:
5245 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
5249 /*----------------------------------------------------------------------------
5250 | Returns the result of adding the quadruple-precision floating-point values
5251 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5252 | for Binary Floating-Point Arithmetic.
5253 *----------------------------------------------------------------------------*/
5255 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
5259 aSign
= extractFloat128Sign( a
);
5260 bSign
= extractFloat128Sign( b
);
5261 if ( aSign
== bSign
) {
5262 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5265 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5270 /*----------------------------------------------------------------------------
5271 | Returns the result of subtracting the quadruple-precision floating-point
5272 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5273 | Standard for Binary Floating-Point Arithmetic.
5274 *----------------------------------------------------------------------------*/
5276 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
5280 aSign
= extractFloat128Sign( a
);
5281 bSign
= extractFloat128Sign( b
);
5282 if ( aSign
== bSign
) {
5283 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5286 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5291 /*----------------------------------------------------------------------------
5292 | Returns the result of multiplying the quadruple-precision floating-point
5293 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5294 | Standard for Binary Floating-Point Arithmetic.
5295 *----------------------------------------------------------------------------*/
5297 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
5299 flag aSign
, bSign
, zSign
;
5300 int32 aExp
, bExp
, zExp
;
5301 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
5304 aSig1
= extractFloat128Frac1( a
);
5305 aSig0
= extractFloat128Frac0( a
);
5306 aExp
= extractFloat128Exp( a
);
5307 aSign
= extractFloat128Sign( a
);
5308 bSig1
= extractFloat128Frac1( b
);
5309 bSig0
= extractFloat128Frac0( b
);
5310 bExp
= extractFloat128Exp( b
);
5311 bSign
= extractFloat128Sign( b
);
5312 zSign
= aSign
^ bSign
;
5313 if ( aExp
== 0x7FFF ) {
5314 if ( ( aSig0
| aSig1
)
5315 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5316 return propagateFloat128NaN( a
, b STATUS_VAR
);
5318 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
5319 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5321 if ( bExp
== 0x7FFF ) {
5322 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5323 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5325 float_raise( float_flag_invalid STATUS_VAR
);
5326 z
.low
= float128_default_nan_low
;
5327 z
.high
= float128_default_nan_high
;
5330 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5333 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5334 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5337 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5338 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5340 zExp
= aExp
+ bExp
- 0x4000;
5341 aSig0
|= LIT64( 0x0001000000000000 );
5342 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
5343 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
5344 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5345 zSig2
|= ( zSig3
!= 0 );
5346 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
5347 shift128ExtraRightJamming(
5348 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5351 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5355 /*----------------------------------------------------------------------------
5356 | Returns the result of dividing the quadruple-precision floating-point value
5357 | `a' by the corresponding value `b'. The operation is performed according to
5358 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5359 *----------------------------------------------------------------------------*/
5361 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
5363 flag aSign
, bSign
, zSign
;
5364 int32 aExp
, bExp
, zExp
;
5365 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5366 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5369 aSig1
= extractFloat128Frac1( a
);
5370 aSig0
= extractFloat128Frac0( a
);
5371 aExp
= extractFloat128Exp( a
);
5372 aSign
= extractFloat128Sign( a
);
5373 bSig1
= extractFloat128Frac1( b
);
5374 bSig0
= extractFloat128Frac0( b
);
5375 bExp
= extractFloat128Exp( b
);
5376 bSign
= extractFloat128Sign( b
);
5377 zSign
= aSign
^ bSign
;
5378 if ( aExp
== 0x7FFF ) {
5379 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5380 if ( bExp
== 0x7FFF ) {
5381 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5384 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5386 if ( bExp
== 0x7FFF ) {
5387 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5388 return packFloat128( zSign
, 0, 0, 0 );
5391 if ( ( bSig0
| bSig1
) == 0 ) {
5392 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5394 float_raise( float_flag_invalid STATUS_VAR
);
5395 z
.low
= float128_default_nan_low
;
5396 z
.high
= float128_default_nan_high
;
5399 float_raise( float_flag_divbyzero STATUS_VAR
);
5400 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5402 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5405 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5406 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5408 zExp
= aExp
- bExp
+ 0x3FFD;
5410 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5412 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5413 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5414 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5417 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5418 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5419 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5420 while ( (sbits64
) rem0
< 0 ) {
5422 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5424 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5425 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5426 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5427 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5428 while ( (sbits64
) rem1
< 0 ) {
5430 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5432 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5434 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5435 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5439 /*----------------------------------------------------------------------------
5440 | Returns the remainder of the quadruple-precision floating-point value `a'
5441 | with respect to the corresponding value `b'. The operation is performed
5442 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5443 *----------------------------------------------------------------------------*/
5445 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
5448 int32 aExp
, bExp
, expDiff
;
5449 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5450 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5454 aSig1
= extractFloat128Frac1( a
);
5455 aSig0
= extractFloat128Frac0( a
);
5456 aExp
= extractFloat128Exp( a
);
5457 aSign
= extractFloat128Sign( a
);
5458 bSig1
= extractFloat128Frac1( b
);
5459 bSig0
= extractFloat128Frac0( b
);
5460 bExp
= extractFloat128Exp( b
);
5461 if ( aExp
== 0x7FFF ) {
5462 if ( ( aSig0
| aSig1
)
5463 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5464 return propagateFloat128NaN( a
, b STATUS_VAR
);
5468 if ( bExp
== 0x7FFF ) {
5469 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5473 if ( ( bSig0
| bSig1
) == 0 ) {
5475 float_raise( float_flag_invalid STATUS_VAR
);
5476 z
.low
= float128_default_nan_low
;
5477 z
.high
= float128_default_nan_high
;
5480 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5483 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5484 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5486 expDiff
= aExp
- bExp
;
5487 if ( expDiff
< -1 ) return a
;
5489 aSig0
| LIT64( 0x0001000000000000 ),
5491 15 - ( expDiff
< 0 ),
5496 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5497 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5498 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5500 while ( 0 < expDiff
) {
5501 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5502 q
= ( 4 < q
) ? q
- 4 : 0;
5503 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5504 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5505 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5506 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5509 if ( -64 < expDiff
) {
5510 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5511 q
= ( 4 < q
) ? q
- 4 : 0;
5513 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5515 if ( expDiff
< 0 ) {
5516 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5519 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5521 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5522 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5525 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5526 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5529 alternateASig0
= aSig0
;
5530 alternateASig1
= aSig1
;
5532 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5533 } while ( 0 <= (sbits64
) aSig0
);
5535 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (bits64
*)&sigMean0
, &sigMean1
);
5536 if ( ( sigMean0
< 0 )
5537 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5538 aSig0
= alternateASig0
;
5539 aSig1
= alternateASig1
;
5541 zSign
= ( (sbits64
) aSig0
< 0 );
5542 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5544 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
5548 /*----------------------------------------------------------------------------
5549 | Returns the square root of the quadruple-precision floating-point value `a'.
5550 | The operation is performed according to the IEC/IEEE Standard for Binary
5551 | Floating-Point Arithmetic.
5552 *----------------------------------------------------------------------------*/
5554 float128
float128_sqrt( float128 a STATUS_PARAM
)
5558 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5559 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5562 aSig1
= extractFloat128Frac1( a
);
5563 aSig0
= extractFloat128Frac0( a
);
5564 aExp
= extractFloat128Exp( a
);
5565 aSign
= extractFloat128Sign( a
);
5566 if ( aExp
== 0x7FFF ) {
5567 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
5568 if ( ! aSign
) return a
;
5572 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5574 float_raise( float_flag_invalid STATUS_VAR
);
5575 z
.low
= float128_default_nan_low
;
5576 z
.high
= float128_default_nan_high
;
5580 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5581 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5583 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5584 aSig0
|= LIT64( 0x0001000000000000 );
5585 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5586 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5587 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5588 doubleZSig0
= zSig0
<<1;
5589 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5590 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5591 while ( (sbits64
) rem0
< 0 ) {
5594 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5596 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5597 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5598 if ( zSig1
== 0 ) zSig1
= 1;
5599 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5600 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5601 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5602 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5603 while ( (sbits64
) rem1
< 0 ) {
5605 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5607 term2
|= doubleZSig0
;
5608 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5610 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5612 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5613 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5617 /*----------------------------------------------------------------------------
5618 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5619 | the corresponding value `b', and 0 otherwise. The comparison is performed
5620 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5621 *----------------------------------------------------------------------------*/
5623 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5626 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5627 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5628 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5629 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5631 if ( float128_is_signaling_nan( a
)
5632 || float128_is_signaling_nan( b
) ) {
5633 float_raise( float_flag_invalid STATUS_VAR
);
5639 && ( ( a
.high
== b
.high
)
5641 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5646 /*----------------------------------------------------------------------------
5647 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5648 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5649 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5651 *----------------------------------------------------------------------------*/
5653 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5657 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5658 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5659 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5660 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5662 float_raise( float_flag_invalid STATUS_VAR
);
5665 aSign
= extractFloat128Sign( a
);
5666 bSign
= extractFloat128Sign( b
);
5667 if ( aSign
!= bSign
) {
5670 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5674 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5675 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5679 /*----------------------------------------------------------------------------
5680 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5681 | the corresponding value `b', and 0 otherwise. The comparison is performed
5682 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5683 *----------------------------------------------------------------------------*/
5685 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5689 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5690 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5691 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5692 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5694 float_raise( float_flag_invalid STATUS_VAR
);
5697 aSign
= extractFloat128Sign( a
);
5698 bSign
= extractFloat128Sign( b
);
5699 if ( aSign
!= bSign
) {
5702 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5706 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5707 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5711 /*----------------------------------------------------------------------------
5712 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5713 | the corresponding value `b', and 0 otherwise. The invalid exception is
5714 | raised if either operand is a NaN. Otherwise, the comparison is performed
5715 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5716 *----------------------------------------------------------------------------*/
5718 int float128_eq_signaling( float128 a
, float128 b STATUS_PARAM
)
5721 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5722 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5723 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5724 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5726 float_raise( float_flag_invalid STATUS_VAR
);
5731 && ( ( a
.high
== b
.high
)
5733 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5738 /*----------------------------------------------------------------------------
5739 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5740 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5741 | cause an exception. Otherwise, the comparison is performed according to the
5742 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5743 *----------------------------------------------------------------------------*/
5745 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5749 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5750 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5751 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5752 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5754 if ( float128_is_signaling_nan( a
)
5755 || float128_is_signaling_nan( b
) ) {
5756 float_raise( float_flag_invalid STATUS_VAR
);
5760 aSign
= extractFloat128Sign( a
);
5761 bSign
= extractFloat128Sign( b
);
5762 if ( aSign
!= bSign
) {
5765 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5769 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5770 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5774 /*----------------------------------------------------------------------------
5775 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5776 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5777 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5778 | Standard for Binary Floating-Point Arithmetic.
5779 *----------------------------------------------------------------------------*/
5781 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5785 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5786 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5787 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5788 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5790 if ( float128_is_signaling_nan( a
)
5791 || float128_is_signaling_nan( b
) ) {
5792 float_raise( float_flag_invalid STATUS_VAR
);
5796 aSign
= extractFloat128Sign( a
);
5797 bSign
= extractFloat128Sign( b
);
5798 if ( aSign
!= bSign
) {
5801 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5805 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5806 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5812 /* misc functions */
5813 float32
uint32_to_float32( unsigned int a STATUS_PARAM
)
5815 return int64_to_float32(a STATUS_VAR
);
5818 float64
uint32_to_float64( unsigned int a STATUS_PARAM
)
5820 return int64_to_float64(a STATUS_VAR
);
5823 unsigned int float32_to_uint32( float32 a STATUS_PARAM
)
5828 v
= float32_to_int64(a STATUS_VAR
);
5831 float_raise( float_flag_invalid STATUS_VAR
);
5832 } else if (v
> 0xffffffff) {
5834 float_raise( float_flag_invalid STATUS_VAR
);
5841 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
5846 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5849 float_raise( float_flag_invalid STATUS_VAR
);
5850 } else if (v
> 0xffffffff) {
5852 float_raise( float_flag_invalid STATUS_VAR
);
5859 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM
)
5864 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
5867 float_raise( float_flag_invalid STATUS_VAR
);
5868 } else if (v
> 0xffff) {
5870 float_raise( float_flag_invalid STATUS_VAR
);
5877 unsigned int float64_to_uint32( float64 a STATUS_PARAM
)
5882 v
= float64_to_int64(a STATUS_VAR
);
5885 float_raise( float_flag_invalid STATUS_VAR
);
5886 } else if (v
> 0xffffffff) {
5888 float_raise( float_flag_invalid STATUS_VAR
);
5895 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
5900 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5903 float_raise( float_flag_invalid STATUS_VAR
);
5904 } else if (v
> 0xffffffff) {
5906 float_raise( float_flag_invalid STATUS_VAR
);
5913 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM
)
5918 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
5921 float_raise( float_flag_invalid STATUS_VAR
);
5922 } else if (v
> 0xffff) {
5924 float_raise( float_flag_invalid STATUS_VAR
);
5931 /* FIXME: This looks broken. */
5932 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
5936 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5937 v
+= float64_val(a
);
5938 v
= float64_to_int64(make_float64(v
) STATUS_VAR
);
5940 return v
- INT64_MIN
;
5943 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
5947 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
5948 v
+= float64_val(a
);
5949 v
= float64_to_int64_round_to_zero(make_float64(v
) STATUS_VAR
);
5951 return v
- INT64_MIN
;
5954 #define COMPARE(s, nan_exp) \
5955 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
5956 int is_quiet STATUS_PARAM ) \
5958 flag aSign, bSign; \
5960 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
5961 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
5963 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
5964 extractFloat ## s ## Frac( a ) ) || \
5965 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
5966 extractFloat ## s ## Frac( b ) )) { \
5968 float ## s ## _is_signaling_nan( a ) || \
5969 float ## s ## _is_signaling_nan( b ) ) { \
5970 float_raise( float_flag_invalid STATUS_VAR); \
5972 return float_relation_unordered; \
5974 aSign = extractFloat ## s ## Sign( a ); \
5975 bSign = extractFloat ## s ## Sign( b ); \
5976 av = float ## s ## _val(a); \
5977 bv = float ## s ## _val(b); \
5978 if ( aSign != bSign ) { \
5979 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \
5981 return float_relation_equal; \
5983 return 1 - (2 * aSign); \
5987 return float_relation_equal; \
5989 return 1 - 2 * (aSign ^ ( av < bv )); \
5994 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
5996 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
5999 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6001 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6007 INLINE
int float128_compare_internal( float128 a
, float128 b
,
6008 int is_quiet STATUS_PARAM
)
6012 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6013 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6014 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6015 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6017 float128_is_signaling_nan( a
) ||
6018 float128_is_signaling_nan( b
) ) {
6019 float_raise( float_flag_invalid STATUS_VAR
);
6021 return float_relation_unordered
;
6023 aSign
= extractFloat128Sign( a
);
6024 bSign
= extractFloat128Sign( b
);
6025 if ( aSign
!= bSign
) {
6026 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6028 return float_relation_equal
;
6030 return 1 - (2 * aSign
);
6033 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6034 return float_relation_equal
;
6036 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6041 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
6043 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
6046 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
6048 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
6051 /* Multiply A by 2 raised to the power N. */
6052 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
6058 a
= float32_squash_input_denormal(a STATUS_VAR
);
6059 aSig
= extractFloat32Frac( a
);
6060 aExp
= extractFloat32Exp( a
);
6061 aSign
= extractFloat32Sign( a
);
6063 if ( aExp
== 0xFF ) {
6068 else if ( aSig
== 0 )
6073 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
6076 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
6082 a
= float64_squash_input_denormal(a STATUS_VAR
);
6083 aSig
= extractFloat64Frac( a
);
6084 aExp
= extractFloat64Exp( a
);
6085 aSign
= extractFloat64Sign( a
);
6087 if ( aExp
== 0x7FF ) {
6091 aSig
|= LIT64( 0x0010000000000000 );
6092 else if ( aSig
== 0 )
6097 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
6101 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
6107 aSig
= extractFloatx80Frac( a
);
6108 aExp
= extractFloatx80Exp( a
);
6109 aSign
= extractFloatx80Sign( a
);
6111 if ( aExp
== 0x7FF ) {
6114 if (aExp
== 0 && aSig
== 0)
6118 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
6119 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
6124 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
6128 bits64 aSig0
, aSig1
;
6130 aSig1
= extractFloat128Frac1( a
);
6131 aSig0
= extractFloat128Frac0( a
);
6132 aExp
= extractFloat128Exp( a
);
6133 aSign
= extractFloat128Sign( a
);
6134 if ( aExp
== 0x7FFF ) {
6138 aSig0
|= LIT64( 0x0001000000000000 );
6139 else if ( aSig0
== 0 && aSig1
== 0 )
6143 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1