4 * Derived from SoftFloat.
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 #include "softfloat.h"
40 /*----------------------------------------------------------------------------
41 | Primitive arithmetic functions, including multi-word arithmetic, and
42 | division and square root approximations. (Can be specialized to target if
44 *----------------------------------------------------------------------------*/
45 #include "softfloat-macros.h"
47 /*----------------------------------------------------------------------------
48 | Functions and definitions to determine: (1) whether tininess for underflow
49 | is detected before or after rounding by default, (2) what (if anything)
50 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
51 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
52 | are propagated from function inputs to output. These details are target-
54 *----------------------------------------------------------------------------*/
55 #include "softfloat-specialize.h"
57 void set_float_rounding_mode(int val STATUS_PARAM
)
59 STATUS(float_rounding_mode
) = val
;
62 void set_float_exception_flags(int val STATUS_PARAM
)
64 STATUS(float_exception_flags
) = val
;
68 void set_floatx80_rounding_precision(int val STATUS_PARAM
)
70 STATUS(floatx80_rounding_precision
) = val
;
74 /*----------------------------------------------------------------------------
75 | Returns the fraction bits of the half-precision floating-point value `a'.
76 *----------------------------------------------------------------------------*/
78 INLINE
uint32_t extractFloat16Frac(float16 a
)
80 return float16_val(a
) & 0x3ff;
83 /*----------------------------------------------------------------------------
84 | Returns the exponent bits of the half-precision floating-point value `a'.
85 *----------------------------------------------------------------------------*/
87 INLINE int16
extractFloat16Exp(float16 a
)
89 return (float16_val(a
) >> 10) & 0x1f;
92 /*----------------------------------------------------------------------------
93 | Returns the sign bit of the single-precision floating-point value `a'.
94 *----------------------------------------------------------------------------*/
96 INLINE flag
extractFloat16Sign(float16 a
)
98 return float16_val(a
)>>15;
101 /*----------------------------------------------------------------------------
102 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
103 | and 7, and returns the properly rounded 32-bit integer corresponding to the
104 | input. If `zSign' is 1, the input is negated before being converted to an
105 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
106 | is simply rounded to an integer, with the inexact exception raised if the
107 | input cannot be represented exactly as an integer. However, if the fixed-
108 | point input is too large, the invalid exception is raised and the largest
109 | positive or negative integer is returned.
110 *----------------------------------------------------------------------------*/
112 static int32
roundAndPackInt32( flag zSign
, uint64_t absZ STATUS_PARAM
)
115 flag roundNearestEven
;
116 int8 roundIncrement
, roundBits
;
119 roundingMode
= STATUS(float_rounding_mode
);
120 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
121 roundIncrement
= 0x40;
122 if ( ! roundNearestEven
) {
123 if ( roundingMode
== float_round_to_zero
) {
127 roundIncrement
= 0x7F;
129 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
132 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
136 roundBits
= absZ
& 0x7F;
137 absZ
= ( absZ
+ roundIncrement
)>>7;
138 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
140 if ( zSign
) z
= - z
;
141 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
142 float_raise( float_flag_invalid STATUS_VAR
);
143 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
145 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
150 /*----------------------------------------------------------------------------
151 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
152 | `absZ1', with binary point between bits 63 and 64 (between the input words),
153 | and returns the properly rounded 64-bit integer corresponding to the input.
154 | If `zSign' is 1, the input is negated before being converted to an integer.
155 | Ordinarily, the fixed-point input is simply rounded to an integer, with
156 | the inexact exception raised if the input cannot be represented exactly as
157 | an integer. However, if the fixed-point input is too large, the invalid
158 | exception is raised and the largest positive or negative integer is
160 *----------------------------------------------------------------------------*/
162 static int64
roundAndPackInt64( flag zSign
, uint64_t absZ0
, uint64_t absZ1 STATUS_PARAM
)
165 flag roundNearestEven
, increment
;
168 roundingMode
= STATUS(float_rounding_mode
);
169 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
170 increment
= ( (int64_t) absZ1
< 0 );
171 if ( ! roundNearestEven
) {
172 if ( roundingMode
== float_round_to_zero
) {
177 increment
= ( roundingMode
== float_round_down
) && absZ1
;
180 increment
= ( roundingMode
== float_round_up
) && absZ1
;
186 if ( absZ0
== 0 ) goto overflow
;
187 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
190 if ( zSign
) z
= - z
;
191 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
193 float_raise( float_flag_invalid STATUS_VAR
);
195 zSign
? (int64_t) LIT64( 0x8000000000000000 )
196 : LIT64( 0x7FFFFFFFFFFFFFFF );
198 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
203 /*----------------------------------------------------------------------------
204 | Returns the fraction bits of the single-precision floating-point value `a'.
205 *----------------------------------------------------------------------------*/
207 INLINE
uint32_t extractFloat32Frac( float32 a
)
210 return float32_val(a
) & 0x007FFFFF;
214 /*----------------------------------------------------------------------------
215 | Returns the exponent bits of the single-precision floating-point value `a'.
216 *----------------------------------------------------------------------------*/
218 INLINE int16
extractFloat32Exp( float32 a
)
221 return ( float32_val(a
)>>23 ) & 0xFF;
225 /*----------------------------------------------------------------------------
226 | Returns the sign bit of the single-precision floating-point value `a'.
227 *----------------------------------------------------------------------------*/
229 INLINE flag
extractFloat32Sign( float32 a
)
232 return float32_val(a
)>>31;
236 /*----------------------------------------------------------------------------
237 | If `a' is denormal and we are in flush-to-zero mode then set the
238 | input-denormal exception and return zero. Otherwise just return the value.
239 *----------------------------------------------------------------------------*/
240 static float32
float32_squash_input_denormal(float32 a STATUS_PARAM
)
242 if (STATUS(flush_inputs_to_zero
)) {
243 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
244 float_raise(float_flag_input_denormal STATUS_VAR
);
245 return make_float32(float32_val(a
) & 0x80000000);
251 /*----------------------------------------------------------------------------
252 | Normalizes the subnormal single-precision floating-point value represented
253 | by the denormalized significand `aSig'. The normalized exponent and
254 | significand are stored at the locations pointed to by `zExpPtr' and
255 | `zSigPtr', respectively.
256 *----------------------------------------------------------------------------*/
259 normalizeFloat32Subnormal( uint32_t aSig
, int16
*zExpPtr
, uint32_t *zSigPtr
)
263 shiftCount
= countLeadingZeros32( aSig
) - 8;
264 *zSigPtr
= aSig
<<shiftCount
;
265 *zExpPtr
= 1 - shiftCount
;
269 /*----------------------------------------------------------------------------
270 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
271 | single-precision floating-point value, returning the result. After being
272 | shifted into the proper positions, the three fields are simply added
273 | together to form the result. This means that any integer portion of `zSig'
274 | will be added into the exponent. Since a properly normalized significand
275 | will have an integer portion equal to 1, the `zExp' input should be 1 less
276 | than the desired result exponent whenever `zSig' is a complete, normalized
278 *----------------------------------------------------------------------------*/
280 INLINE float32
packFloat32( flag zSign
, int16 zExp
, uint32_t zSig
)
284 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
288 /*----------------------------------------------------------------------------
289 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
290 | and significand `zSig', and returns the proper single-precision floating-
291 | point value corresponding to the abstract input. Ordinarily, the abstract
292 | value is simply rounded and packed into the single-precision format, with
293 | the inexact exception raised if the abstract input cannot be represented
294 | exactly. However, if the abstract value is too large, the overflow and
295 | inexact exceptions are raised and an infinity or maximal finite value is
296 | returned. If the abstract value is too small, the input value is rounded to
297 | a subnormal number, and the underflow and inexact exceptions are raised if
298 | the abstract input cannot be represented exactly as a subnormal single-
299 | precision floating-point number.
300 | The input significand `zSig' has its binary point between bits 30
301 | and 29, which is 7 bits to the left of the usual location. This shifted
302 | significand must be normalized or smaller. If `zSig' is not normalized,
303 | `zExp' must be 0; in that case, the result returned is a subnormal number,
304 | and it must not require rounding. In the usual case that `zSig' is
305 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
306 | The handling of underflow and overflow follows the IEC/IEEE Standard for
307 | Binary Floating-Point Arithmetic.
308 *----------------------------------------------------------------------------*/
310 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, uint32_t zSig STATUS_PARAM
)
313 flag roundNearestEven
;
314 int8 roundIncrement
, roundBits
;
317 roundingMode
= STATUS(float_rounding_mode
);
318 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
319 roundIncrement
= 0x40;
320 if ( ! roundNearestEven
) {
321 if ( roundingMode
== float_round_to_zero
) {
325 roundIncrement
= 0x7F;
327 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
330 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
334 roundBits
= zSig
& 0x7F;
335 if ( 0xFD <= (uint16_t) zExp
) {
337 || ( ( zExp
== 0xFD )
338 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
340 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
341 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
344 if (STATUS(flush_to_zero
)) {
345 float_raise(float_flag_output_denormal STATUS_VAR
);
346 return packFloat32(zSign
, 0, 0);
349 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
351 || ( zSig
+ roundIncrement
< 0x80000000 );
352 shift32RightJamming( zSig
, - zExp
, &zSig
);
354 roundBits
= zSig
& 0x7F;
355 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
358 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
359 zSig
= ( zSig
+ roundIncrement
)>>7;
360 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
361 if ( zSig
== 0 ) zExp
= 0;
362 return packFloat32( zSign
, zExp
, zSig
);
366 /*----------------------------------------------------------------------------
367 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
368 | and significand `zSig', and returns the proper single-precision floating-
369 | point value corresponding to the abstract input. This routine is just like
370 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
371 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
372 | floating-point exponent.
373 *----------------------------------------------------------------------------*/
376 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, uint32_t zSig STATUS_PARAM
)
380 shiftCount
= countLeadingZeros32( zSig
) - 1;
381 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
385 /*----------------------------------------------------------------------------
386 | Returns the fraction bits of the double-precision floating-point value `a'.
387 *----------------------------------------------------------------------------*/
389 INLINE
uint64_t extractFloat64Frac( float64 a
)
392 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
396 /*----------------------------------------------------------------------------
397 | Returns the exponent bits of the double-precision floating-point value `a'.
398 *----------------------------------------------------------------------------*/
400 INLINE int16
extractFloat64Exp( float64 a
)
403 return ( float64_val(a
)>>52 ) & 0x7FF;
407 /*----------------------------------------------------------------------------
408 | Returns the sign bit of the double-precision floating-point value `a'.
409 *----------------------------------------------------------------------------*/
411 INLINE flag
extractFloat64Sign( float64 a
)
414 return float64_val(a
)>>63;
418 /*----------------------------------------------------------------------------
419 | If `a' is denormal and we are in flush-to-zero mode then set the
420 | input-denormal exception and return zero. Otherwise just return the value.
421 *----------------------------------------------------------------------------*/
422 static float64
float64_squash_input_denormal(float64 a STATUS_PARAM
)
424 if (STATUS(flush_inputs_to_zero
)) {
425 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
426 float_raise(float_flag_input_denormal STATUS_VAR
);
427 return make_float64(float64_val(a
) & (1ULL << 63));
433 /*----------------------------------------------------------------------------
434 | Normalizes the subnormal double-precision floating-point value represented
435 | by the denormalized significand `aSig'. The normalized exponent and
436 | significand are stored at the locations pointed to by `zExpPtr' and
437 | `zSigPtr', respectively.
438 *----------------------------------------------------------------------------*/
441 normalizeFloat64Subnormal( uint64_t aSig
, int16
*zExpPtr
, uint64_t *zSigPtr
)
445 shiftCount
= countLeadingZeros64( aSig
) - 11;
446 *zSigPtr
= aSig
<<shiftCount
;
447 *zExpPtr
= 1 - shiftCount
;
451 /*----------------------------------------------------------------------------
452 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
453 | double-precision floating-point value, returning the result. After being
454 | shifted into the proper positions, the three fields are simply added
455 | together to form the result. This means that any integer portion of `zSig'
456 | will be added into the exponent. Since a properly normalized significand
457 | will have an integer portion equal to 1, the `zExp' input should be 1 less
458 | than the desired result exponent whenever `zSig' is a complete, normalized
460 *----------------------------------------------------------------------------*/
462 INLINE float64
packFloat64( flag zSign
, int16 zExp
, uint64_t zSig
)
466 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
470 /*----------------------------------------------------------------------------
471 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
472 | and significand `zSig', and returns the proper double-precision floating-
473 | point value corresponding to the abstract input. Ordinarily, the abstract
474 | value is simply rounded and packed into the double-precision format, with
475 | the inexact exception raised if the abstract input cannot be represented
476 | exactly. However, if the abstract value is too large, the overflow and
477 | inexact exceptions are raised and an infinity or maximal finite value is
478 | returned. If the abstract value is too small, the input value is rounded
479 | to a subnormal number, and the underflow and inexact exceptions are raised
480 | if the abstract input cannot be represented exactly as a subnormal double-
481 | precision floating-point number.
482 | The input significand `zSig' has its binary point between bits 62
483 | and 61, which is 10 bits to the left of the usual location. This shifted
484 | significand must be normalized or smaller. If `zSig' is not normalized,
485 | `zExp' must be 0; in that case, the result returned is a subnormal number,
486 | and it must not require rounding. In the usual case that `zSig' is
487 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
488 | The handling of underflow and overflow follows the IEC/IEEE Standard for
489 | Binary Floating-Point Arithmetic.
490 *----------------------------------------------------------------------------*/
492 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, uint64_t zSig STATUS_PARAM
)
495 flag roundNearestEven
;
496 int16 roundIncrement
, roundBits
;
499 roundingMode
= STATUS(float_rounding_mode
);
500 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
501 roundIncrement
= 0x200;
502 if ( ! roundNearestEven
) {
503 if ( roundingMode
== float_round_to_zero
) {
507 roundIncrement
= 0x3FF;
509 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
512 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
516 roundBits
= zSig
& 0x3FF;
517 if ( 0x7FD <= (uint16_t) zExp
) {
518 if ( ( 0x7FD < zExp
)
519 || ( ( zExp
== 0x7FD )
520 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
522 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
523 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
526 if (STATUS(flush_to_zero
)) {
527 float_raise(float_flag_output_denormal STATUS_VAR
);
528 return packFloat64(zSign
, 0, 0);
531 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
533 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
534 shift64RightJamming( zSig
, - zExp
, &zSig
);
536 roundBits
= zSig
& 0x3FF;
537 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
540 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
541 zSig
= ( zSig
+ roundIncrement
)>>10;
542 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
543 if ( zSig
== 0 ) zExp
= 0;
544 return packFloat64( zSign
, zExp
, zSig
);
548 /*----------------------------------------------------------------------------
549 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
550 | and significand `zSig', and returns the proper double-precision floating-
551 | point value corresponding to the abstract input. This routine is just like
552 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
553 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
554 | floating-point exponent.
555 *----------------------------------------------------------------------------*/
558 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, uint64_t zSig STATUS_PARAM
)
562 shiftCount
= countLeadingZeros64( zSig
) - 1;
563 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
569 /*----------------------------------------------------------------------------
570 | Returns the fraction bits of the extended double-precision floating-point
572 *----------------------------------------------------------------------------*/
574 INLINE
uint64_t extractFloatx80Frac( floatx80 a
)
581 /*----------------------------------------------------------------------------
582 | Returns the exponent bits of the extended double-precision floating-point
584 *----------------------------------------------------------------------------*/
586 INLINE int32
extractFloatx80Exp( floatx80 a
)
589 return a
.high
& 0x7FFF;
593 /*----------------------------------------------------------------------------
594 | Returns the sign bit of the extended double-precision floating-point value
596 *----------------------------------------------------------------------------*/
598 INLINE flag
extractFloatx80Sign( floatx80 a
)
605 /*----------------------------------------------------------------------------
606 | Normalizes the subnormal extended double-precision floating-point value
607 | represented by the denormalized significand `aSig'. The normalized exponent
608 | and significand are stored at the locations pointed to by `zExpPtr' and
609 | `zSigPtr', respectively.
610 *----------------------------------------------------------------------------*/
613 normalizeFloatx80Subnormal( uint64_t aSig
, int32
*zExpPtr
, uint64_t *zSigPtr
)
617 shiftCount
= countLeadingZeros64( aSig
);
618 *zSigPtr
= aSig
<<shiftCount
;
619 *zExpPtr
= 1 - shiftCount
;
623 /*----------------------------------------------------------------------------
624 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
625 | extended double-precision floating-point value, returning the result.
626 *----------------------------------------------------------------------------*/
628 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, uint64_t zSig
)
633 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
638 /*----------------------------------------------------------------------------
639 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
640 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
641 | and returns the proper extended double-precision floating-point value
642 | corresponding to the abstract input. Ordinarily, the abstract value is
643 | rounded and packed into the extended double-precision format, with the
644 | inexact exception raised if the abstract input cannot be represented
645 | exactly. However, if the abstract value is too large, the overflow and
646 | inexact exceptions are raised and an infinity or maximal finite value is
647 | returned. If the abstract value is too small, the input value is rounded to
648 | a subnormal number, and the underflow and inexact exceptions are raised if
649 | the abstract input cannot be represented exactly as a subnormal extended
650 | double-precision floating-point number.
651 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
652 | number of bits as single or double precision, respectively. Otherwise, the
653 | result is rounded to the full precision of the extended double-precision
655 | The input significand must be normalized or smaller. If the input
656 | significand is not normalized, `zExp' must be 0; in that case, the result
657 | returned is a subnormal number, and it must not require rounding. The
658 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
659 | Floating-Point Arithmetic.
660 *----------------------------------------------------------------------------*/
663 roundAndPackFloatx80(
664 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
668 flag roundNearestEven
, increment
, isTiny
;
669 int64 roundIncrement
, roundMask
, roundBits
;
671 roundingMode
= STATUS(float_rounding_mode
);
672 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
673 if ( roundingPrecision
== 80 ) goto precision80
;
674 if ( roundingPrecision
== 64 ) {
675 roundIncrement
= LIT64( 0x0000000000000400 );
676 roundMask
= LIT64( 0x00000000000007FF );
678 else if ( roundingPrecision
== 32 ) {
679 roundIncrement
= LIT64( 0x0000008000000000 );
680 roundMask
= LIT64( 0x000000FFFFFFFFFF );
685 zSig0
|= ( zSig1
!= 0 );
686 if ( ! roundNearestEven
) {
687 if ( roundingMode
== float_round_to_zero
) {
691 roundIncrement
= roundMask
;
693 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
696 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
700 roundBits
= zSig0
& roundMask
;
701 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
702 if ( ( 0x7FFE < zExp
)
703 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
708 if (STATUS(flush_to_zero
)) {
709 float_raise(float_flag_output_denormal STATUS_VAR
);
710 return packFloatx80(zSign
, 0, 0);
713 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
715 || ( zSig0
<= zSig0
+ roundIncrement
);
716 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
718 roundBits
= zSig0
& roundMask
;
719 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
720 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
721 zSig0
+= roundIncrement
;
722 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
723 roundIncrement
= roundMask
+ 1;
724 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
725 roundMask
|= roundIncrement
;
727 zSig0
&= ~ roundMask
;
728 return packFloatx80( zSign
, zExp
, zSig0
);
731 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
732 zSig0
+= roundIncrement
;
733 if ( zSig0
< roundIncrement
) {
735 zSig0
= LIT64( 0x8000000000000000 );
737 roundIncrement
= roundMask
+ 1;
738 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
739 roundMask
|= roundIncrement
;
741 zSig0
&= ~ roundMask
;
742 if ( zSig0
== 0 ) zExp
= 0;
743 return packFloatx80( zSign
, zExp
, zSig0
);
745 increment
= ( (int64_t) zSig1
< 0 );
746 if ( ! roundNearestEven
) {
747 if ( roundingMode
== float_round_to_zero
) {
752 increment
= ( roundingMode
== float_round_down
) && zSig1
;
755 increment
= ( roundingMode
== float_round_up
) && zSig1
;
759 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
760 if ( ( 0x7FFE < zExp
)
761 || ( ( zExp
== 0x7FFE )
762 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
768 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
769 if ( ( roundingMode
== float_round_to_zero
)
770 || ( zSign
&& ( roundingMode
== float_round_up
) )
771 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
773 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
775 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
779 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
782 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
783 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
785 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
786 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
787 if ( roundNearestEven
) {
788 increment
= ( (int64_t) zSig1
< 0 );
792 increment
= ( roundingMode
== float_round_down
) && zSig1
;
795 increment
= ( roundingMode
== float_round_up
) && zSig1
;
801 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
802 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
804 return packFloatx80( zSign
, zExp
, zSig0
);
807 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
812 zSig0
= LIT64( 0x8000000000000000 );
815 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
819 if ( zSig0
== 0 ) zExp
= 0;
821 return packFloatx80( zSign
, zExp
, zSig0
);
825 /*----------------------------------------------------------------------------
826 | Takes an abstract floating-point value having sign `zSign', exponent
827 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
828 | and returns the proper extended double-precision floating-point value
829 | corresponding to the abstract input. This routine is just like
830 | `roundAndPackFloatx80' except that the input significand does not have to be
832 *----------------------------------------------------------------------------*/
835 normalizeRoundAndPackFloatx80(
836 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
846 shiftCount
= countLeadingZeros64( zSig0
);
847 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
850 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
858 /*----------------------------------------------------------------------------
859 | Returns the least-significant 64 fraction bits of the quadruple-precision
860 | floating-point value `a'.
861 *----------------------------------------------------------------------------*/
863 INLINE
uint64_t extractFloat128Frac1( float128 a
)
870 /*----------------------------------------------------------------------------
871 | Returns the most-significant 48 fraction bits of the quadruple-precision
872 | floating-point value `a'.
873 *----------------------------------------------------------------------------*/
875 INLINE
uint64_t extractFloat128Frac0( float128 a
)
878 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
882 /*----------------------------------------------------------------------------
883 | Returns the exponent bits of the quadruple-precision floating-point value
885 *----------------------------------------------------------------------------*/
887 INLINE int32
extractFloat128Exp( float128 a
)
890 return ( a
.high
>>48 ) & 0x7FFF;
894 /*----------------------------------------------------------------------------
895 | Returns the sign bit of the quadruple-precision floating-point value `a'.
896 *----------------------------------------------------------------------------*/
898 INLINE flag
extractFloat128Sign( float128 a
)
905 /*----------------------------------------------------------------------------
906 | Normalizes the subnormal quadruple-precision floating-point value
907 | represented by the denormalized significand formed by the concatenation of
908 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
909 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
910 | significand are stored at the location pointed to by `zSig0Ptr', and the
911 | least significant 64 bits of the normalized significand are stored at the
912 | location pointed to by `zSig1Ptr'.
913 *----------------------------------------------------------------------------*/
916 normalizeFloat128Subnormal(
927 shiftCount
= countLeadingZeros64( aSig1
) - 15;
928 if ( shiftCount
< 0 ) {
929 *zSig0Ptr
= aSig1
>>( - shiftCount
);
930 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
933 *zSig0Ptr
= aSig1
<<shiftCount
;
936 *zExpPtr
= - shiftCount
- 63;
939 shiftCount
= countLeadingZeros64( aSig0
) - 15;
940 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
941 *zExpPtr
= 1 - shiftCount
;
946 /*----------------------------------------------------------------------------
947 | Packs the sign `zSign', the exponent `zExp', and the significand formed
948 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
949 | floating-point value, returning the result. After being shifted into the
950 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
951 | added together to form the most significant 32 bits of the result. This
952 | means that any integer portion of `zSig0' will be added into the exponent.
953 | Since a properly normalized significand will have an integer portion equal
954 | to 1, the `zExp' input should be 1 less than the desired result exponent
955 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
957 *----------------------------------------------------------------------------*/
960 packFloat128( flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
)
965 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
970 /*----------------------------------------------------------------------------
971 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
972 | and extended significand formed by the concatenation of `zSig0', `zSig1',
973 | and `zSig2', and returns the proper quadruple-precision floating-point value
974 | corresponding to the abstract input. Ordinarily, the abstract value is
975 | simply rounded and packed into the quadruple-precision format, with the
976 | inexact exception raised if the abstract input cannot be represented
977 | exactly. However, if the abstract value is too large, the overflow and
978 | inexact exceptions are raised and an infinity or maximal finite value is
979 | returned. If the abstract value is too small, the input value is rounded to
980 | a subnormal number, and the underflow and inexact exceptions are raised if
981 | the abstract input cannot be represented exactly as a subnormal quadruple-
982 | precision floating-point number.
983 | The input significand must be normalized or smaller. If the input
984 | significand is not normalized, `zExp' must be 0; in that case, the result
985 | returned is a subnormal number, and it must not require rounding. In the
986 | usual case that the input significand is normalized, `zExp' must be 1 less
987 | than the ``true'' floating-point exponent. The handling of underflow and
988 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
989 *----------------------------------------------------------------------------*/
992 roundAndPackFloat128(
993 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
, uint64_t zSig2 STATUS_PARAM
)
996 flag roundNearestEven
, increment
, isTiny
;
998 roundingMode
= STATUS(float_rounding_mode
);
999 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
1000 increment
= ( (int64_t) zSig2
< 0 );
1001 if ( ! roundNearestEven
) {
1002 if ( roundingMode
== float_round_to_zero
) {
1007 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1010 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1014 if ( 0x7FFD <= (uint32_t) zExp
) {
1015 if ( ( 0x7FFD < zExp
)
1016 || ( ( zExp
== 0x7FFD )
1018 LIT64( 0x0001FFFFFFFFFFFF ),
1019 LIT64( 0xFFFFFFFFFFFFFFFF ),
1026 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
1027 if ( ( roundingMode
== float_round_to_zero
)
1028 || ( zSign
&& ( roundingMode
== float_round_up
) )
1029 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1035 LIT64( 0x0000FFFFFFFFFFFF ),
1036 LIT64( 0xFFFFFFFFFFFFFFFF )
1039 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1042 if (STATUS(flush_to_zero
)) {
1043 float_raise(float_flag_output_denormal STATUS_VAR
);
1044 return packFloat128(zSign
, 0, 0, 0);
1047 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
1053 LIT64( 0x0001FFFFFFFFFFFF ),
1054 LIT64( 0xFFFFFFFFFFFFFFFF )
1056 shift128ExtraRightJamming(
1057 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1059 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
1060 if ( roundNearestEven
) {
1061 increment
= ( (int64_t) zSig2
< 0 );
1065 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1068 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1073 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1075 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1076 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1079 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1081 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1085 /*----------------------------------------------------------------------------
1086 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1087 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1088 | returns the proper quadruple-precision floating-point value corresponding
1089 | to the abstract input. This routine is just like `roundAndPackFloat128'
1090 | except that the input significand has fewer bits and does not have to be
1091 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1093 *----------------------------------------------------------------------------*/
1096 normalizeRoundAndPackFloat128(
1097 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1 STATUS_PARAM
)
1107 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1108 if ( 0 <= shiftCount
) {
1110 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1113 shift128ExtraRightJamming(
1114 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1117 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1123 /*----------------------------------------------------------------------------
1124 | Returns the result of converting the 32-bit two's complement integer `a'
1125 | to the single-precision floating-point format. The conversion is performed
1126 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1127 *----------------------------------------------------------------------------*/
1129 float32
int32_to_float32( int32 a STATUS_PARAM
)
1133 if ( a
== 0 ) return float32_zero
;
1134 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1136 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1140 /*----------------------------------------------------------------------------
1141 | Returns the result of converting the 32-bit two's complement integer `a'
1142 | to the double-precision floating-point format. The conversion is performed
1143 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1144 *----------------------------------------------------------------------------*/
1146 float64
int32_to_float64( int32 a STATUS_PARAM
)
1153 if ( a
== 0 ) return float64_zero
;
1155 absA
= zSign
? - a
: a
;
1156 shiftCount
= countLeadingZeros32( absA
) + 21;
1158 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1164 /*----------------------------------------------------------------------------
1165 | Returns the result of converting the 32-bit two's complement integer `a'
1166 | to the extended double-precision floating-point format. The conversion
1167 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1169 *----------------------------------------------------------------------------*/
1171 floatx80
int32_to_floatx80( int32 a STATUS_PARAM
)
1178 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1180 absA
= zSign
? - a
: a
;
1181 shiftCount
= countLeadingZeros32( absA
) + 32;
1183 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1191 /*----------------------------------------------------------------------------
1192 | Returns the result of converting the 32-bit two's complement integer `a' to
1193 | the quadruple-precision floating-point format. The conversion is performed
1194 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1195 *----------------------------------------------------------------------------*/
1197 float128
int32_to_float128( int32 a STATUS_PARAM
)
1204 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1206 absA
= zSign
? - a
: a
;
1207 shiftCount
= countLeadingZeros32( absA
) + 17;
1209 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1215 /*----------------------------------------------------------------------------
1216 | Returns the result of converting the 64-bit two's complement integer `a'
1217 | to the single-precision floating-point format. The conversion is performed
1218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 *----------------------------------------------------------------------------*/
1221 float32
int64_to_float32( int64 a STATUS_PARAM
)
1227 if ( a
== 0 ) return float32_zero
;
1229 absA
= zSign
? - a
: a
;
1230 shiftCount
= countLeadingZeros64( absA
) - 40;
1231 if ( 0 <= shiftCount
) {
1232 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1236 if ( shiftCount
< 0 ) {
1237 shift64RightJamming( absA
, - shiftCount
, &absA
);
1240 absA
<<= shiftCount
;
1242 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1247 float32
uint64_to_float32( uint64 a STATUS_PARAM
)
1251 if ( a
== 0 ) return float32_zero
;
1252 shiftCount
= countLeadingZeros64( a
) - 40;
1253 if ( 0 <= shiftCount
) {
1254 return packFloat32( 1 > 0, 0x95 - shiftCount
, a
<<shiftCount
);
1258 if ( shiftCount
< 0 ) {
1259 shift64RightJamming( a
, - shiftCount
, &a
);
1264 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount
, a STATUS_VAR
);
1268 /*----------------------------------------------------------------------------
1269 | Returns the result of converting the 64-bit two's complement integer `a'
1270 | to the double-precision floating-point format. The conversion is performed
1271 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1272 *----------------------------------------------------------------------------*/
1274 float64
int64_to_float64( int64 a STATUS_PARAM
)
1278 if ( a
== 0 ) return float64_zero
;
1279 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1280 return packFloat64( 1, 0x43E, 0 );
1283 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1287 float64
uint64_to_float64( uint64 a STATUS_PARAM
)
1289 if ( a
== 0 ) return float64_zero
;
1290 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR
);
1296 /*----------------------------------------------------------------------------
1297 | Returns the result of converting the 64-bit two's complement integer `a'
1298 | to the extended double-precision floating-point format. The conversion
1299 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1301 *----------------------------------------------------------------------------*/
1303 floatx80
int64_to_floatx80( int64 a STATUS_PARAM
)
1309 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1311 absA
= zSign
? - a
: a
;
1312 shiftCount
= countLeadingZeros64( absA
);
1313 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1321 /*----------------------------------------------------------------------------
1322 | Returns the result of converting the 64-bit two's complement integer `a' to
1323 | the quadruple-precision floating-point format. The conversion is performed
1324 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1325 *----------------------------------------------------------------------------*/
1327 float128
int64_to_float128( int64 a STATUS_PARAM
)
1333 uint64_t zSig0
, zSig1
;
1335 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1337 absA
= zSign
? - a
: a
;
1338 shiftCount
= countLeadingZeros64( absA
) + 49;
1339 zExp
= 0x406E - shiftCount
;
1340 if ( 64 <= shiftCount
) {
1349 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1350 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1356 /*----------------------------------------------------------------------------
1357 | Returns the result of converting the single-precision floating-point value
1358 | `a' to the 32-bit two's complement integer format. The conversion is
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1360 | Arithmetic---which means in particular that the conversion is rounded
1361 | according to the current rounding mode. If `a' is a NaN, the largest
1362 | positive integer is returned. Otherwise, if the conversion overflows, the
1363 | largest integer with the same sign as `a' is returned.
1364 *----------------------------------------------------------------------------*/
1366 int32
float32_to_int32( float32 a STATUS_PARAM
)
1369 int16 aExp
, shiftCount
;
1373 a
= float32_squash_input_denormal(a STATUS_VAR
);
1374 aSig
= extractFloat32Frac( a
);
1375 aExp
= extractFloat32Exp( a
);
1376 aSign
= extractFloat32Sign( a
);
1377 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1378 if ( aExp
) aSig
|= 0x00800000;
1379 shiftCount
= 0xAF - aExp
;
1382 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1383 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1387 /*----------------------------------------------------------------------------
1388 | Returns the result of converting the single-precision floating-point value
1389 | `a' to the 32-bit two's complement integer format. The conversion is
1390 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1391 | Arithmetic, except that the conversion is always rounded toward zero.
1392 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1393 | the conversion overflows, the largest integer with the same sign as `a' is
1395 *----------------------------------------------------------------------------*/
1397 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1400 int16 aExp
, shiftCount
;
1403 a
= float32_squash_input_denormal(a STATUS_VAR
);
1405 aSig
= extractFloat32Frac( a
);
1406 aExp
= extractFloat32Exp( a
);
1407 aSign
= extractFloat32Sign( a
);
1408 shiftCount
= aExp
- 0x9E;
1409 if ( 0 <= shiftCount
) {
1410 if ( float32_val(a
) != 0xCF000000 ) {
1411 float_raise( float_flag_invalid STATUS_VAR
);
1412 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1414 return (int32_t) 0x80000000;
1416 else if ( aExp
<= 0x7E ) {
1417 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1420 aSig
= ( aSig
| 0x00800000 )<<8;
1421 z
= aSig
>>( - shiftCount
);
1422 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1423 STATUS(float_exception_flags
) |= float_flag_inexact
;
1425 if ( aSign
) z
= - z
;
1430 /*----------------------------------------------------------------------------
1431 | Returns the result of converting the single-precision floating-point value
1432 | `a' to the 16-bit two's complement integer format. The conversion is
1433 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1434 | Arithmetic, except that the conversion is always rounded toward zero.
1435 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1436 | the conversion overflows, the largest integer with the same sign as `a' is
1438 *----------------------------------------------------------------------------*/
1440 int16
float32_to_int16_round_to_zero( float32 a STATUS_PARAM
)
1443 int16 aExp
, shiftCount
;
1447 aSig
= extractFloat32Frac( a
);
1448 aExp
= extractFloat32Exp( a
);
1449 aSign
= extractFloat32Sign( a
);
1450 shiftCount
= aExp
- 0x8E;
1451 if ( 0 <= shiftCount
) {
1452 if ( float32_val(a
) != 0xC7000000 ) {
1453 float_raise( float_flag_invalid STATUS_VAR
);
1454 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1458 return (int32_t) 0xffff8000;
1460 else if ( aExp
<= 0x7E ) {
1461 if ( aExp
| aSig
) {
1462 STATUS(float_exception_flags
) |= float_flag_inexact
;
1467 aSig
= ( aSig
| 0x00800000 )<<8;
1468 z
= aSig
>>( - shiftCount
);
1469 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1470 STATUS(float_exception_flags
) |= float_flag_inexact
;
1479 /*----------------------------------------------------------------------------
1480 | Returns the result of converting the single-precision floating-point value
1481 | `a' to the 64-bit two's complement integer format. The conversion is
1482 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1483 | Arithmetic---which means in particular that the conversion is rounded
1484 | according to the current rounding mode. If `a' is a NaN, the largest
1485 | positive integer is returned. Otherwise, if the conversion overflows, the
1486 | largest integer with the same sign as `a' is returned.
1487 *----------------------------------------------------------------------------*/
1489 int64
float32_to_int64( float32 a STATUS_PARAM
)
1492 int16 aExp
, shiftCount
;
1494 uint64_t aSig64
, aSigExtra
;
1495 a
= float32_squash_input_denormal(a STATUS_VAR
);
1497 aSig
= extractFloat32Frac( a
);
1498 aExp
= extractFloat32Exp( a
);
1499 aSign
= extractFloat32Sign( a
);
1500 shiftCount
= 0xBE - aExp
;
1501 if ( shiftCount
< 0 ) {
1502 float_raise( float_flag_invalid STATUS_VAR
);
1503 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1504 return LIT64( 0x7FFFFFFFFFFFFFFF );
1506 return (int64_t) LIT64( 0x8000000000000000 );
1508 if ( aExp
) aSig
|= 0x00800000;
1511 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1512 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1516 /*----------------------------------------------------------------------------
1517 | Returns the result of converting the single-precision floating-point value
1518 | `a' to the 64-bit two's complement integer format. The conversion is
1519 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1520 | Arithmetic, except that the conversion is always rounded toward zero. If
1521 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1522 | conversion overflows, the largest integer with the same sign as `a' is
1524 *----------------------------------------------------------------------------*/
1526 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1529 int16 aExp
, shiftCount
;
1533 a
= float32_squash_input_denormal(a STATUS_VAR
);
1535 aSig
= extractFloat32Frac( a
);
1536 aExp
= extractFloat32Exp( a
);
1537 aSign
= extractFloat32Sign( a
);
1538 shiftCount
= aExp
- 0xBE;
1539 if ( 0 <= shiftCount
) {
1540 if ( float32_val(a
) != 0xDF000000 ) {
1541 float_raise( float_flag_invalid STATUS_VAR
);
1542 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1543 return LIT64( 0x7FFFFFFFFFFFFFFF );
1546 return (int64_t) LIT64( 0x8000000000000000 );
1548 else if ( aExp
<= 0x7E ) {
1549 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1552 aSig64
= aSig
| 0x00800000;
1554 z
= aSig64
>>( - shiftCount
);
1555 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1556 STATUS(float_exception_flags
) |= float_flag_inexact
;
1558 if ( aSign
) z
= - z
;
1563 /*----------------------------------------------------------------------------
1564 | Returns the result of converting the single-precision floating-point value
1565 | `a' to the double-precision floating-point format. The conversion is
1566 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1568 *----------------------------------------------------------------------------*/
1570 float64
float32_to_float64( float32 a STATUS_PARAM
)
1575 a
= float32_squash_input_denormal(a STATUS_VAR
);
1577 aSig
= extractFloat32Frac( a
);
1578 aExp
= extractFloat32Exp( a
);
1579 aSign
= extractFloat32Sign( a
);
1580 if ( aExp
== 0xFF ) {
1581 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1582 return packFloat64( aSign
, 0x7FF, 0 );
1585 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1586 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1589 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1595 /*----------------------------------------------------------------------------
1596 | Returns the result of converting the single-precision floating-point value
1597 | `a' to the extended double-precision floating-point format. The conversion
1598 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1600 *----------------------------------------------------------------------------*/
1602 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1608 a
= float32_squash_input_denormal(a STATUS_VAR
);
1609 aSig
= extractFloat32Frac( a
);
1610 aExp
= extractFloat32Exp( a
);
1611 aSign
= extractFloat32Sign( a
);
1612 if ( aExp
== 0xFF ) {
1613 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1614 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1617 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1618 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1621 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1629 /*----------------------------------------------------------------------------
1630 | Returns the result of converting the single-precision floating-point value
1631 | `a' to the double-precision floating-point format. The conversion is
1632 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1634 *----------------------------------------------------------------------------*/
1636 float128
float32_to_float128( float32 a STATUS_PARAM
)
1642 a
= float32_squash_input_denormal(a STATUS_VAR
);
1643 aSig
= extractFloat32Frac( a
);
1644 aExp
= extractFloat32Exp( a
);
1645 aSign
= extractFloat32Sign( a
);
1646 if ( aExp
== 0xFF ) {
1647 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1648 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1651 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1652 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1655 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1661 /*----------------------------------------------------------------------------
1662 | Rounds the single-precision floating-point value `a' to an integer, and
1663 | returns the result as a single-precision floating-point value. The
1664 | operation is performed according to the IEC/IEEE Standard for Binary
1665 | Floating-Point Arithmetic.
1666 *----------------------------------------------------------------------------*/
1668 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1672 uint32_t lastBitMask
, roundBitsMask
;
1675 a
= float32_squash_input_denormal(a STATUS_VAR
);
1677 aExp
= extractFloat32Exp( a
);
1678 if ( 0x96 <= aExp
) {
1679 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1680 return propagateFloat32NaN( a
, a STATUS_VAR
);
1684 if ( aExp
<= 0x7E ) {
1685 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1686 STATUS(float_exception_flags
) |= float_flag_inexact
;
1687 aSign
= extractFloat32Sign( a
);
1688 switch ( STATUS(float_rounding_mode
) ) {
1689 case float_round_nearest_even
:
1690 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1691 return packFloat32( aSign
, 0x7F, 0 );
1694 case float_round_down
:
1695 return make_float32(aSign
? 0xBF800000 : 0);
1696 case float_round_up
:
1697 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1699 return packFloat32( aSign
, 0, 0 );
1702 lastBitMask
<<= 0x96 - aExp
;
1703 roundBitsMask
= lastBitMask
- 1;
1705 roundingMode
= STATUS(float_rounding_mode
);
1706 if ( roundingMode
== float_round_nearest_even
) {
1707 z
+= lastBitMask
>>1;
1708 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1710 else if ( roundingMode
!= float_round_to_zero
) {
1711 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1715 z
&= ~ roundBitsMask
;
1716 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1717 return make_float32(z
);
1721 /*----------------------------------------------------------------------------
1722 | Returns the result of adding the absolute values of the single-precision
1723 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1724 | before being returned. `zSign' is ignored if the result is a NaN.
1725 | The addition is performed according to the IEC/IEEE Standard for Binary
1726 | Floating-Point Arithmetic.
1727 *----------------------------------------------------------------------------*/
1729 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1731 int16 aExp
, bExp
, zExp
;
1732 uint32_t aSig
, bSig
, zSig
;
1735 aSig
= extractFloat32Frac( a
);
1736 aExp
= extractFloat32Exp( a
);
1737 bSig
= extractFloat32Frac( b
);
1738 bExp
= extractFloat32Exp( b
);
1739 expDiff
= aExp
- bExp
;
1742 if ( 0 < expDiff
) {
1743 if ( aExp
== 0xFF ) {
1744 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1753 shift32RightJamming( bSig
, expDiff
, &bSig
);
1756 else if ( expDiff
< 0 ) {
1757 if ( bExp
== 0xFF ) {
1758 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1759 return packFloat32( zSign
, 0xFF, 0 );
1767 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1771 if ( aExp
== 0xFF ) {
1772 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1776 if (STATUS(flush_to_zero
)) {
1778 float_raise(float_flag_output_denormal STATUS_VAR
);
1780 return packFloat32(zSign
, 0, 0);
1782 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1784 zSig
= 0x40000000 + aSig
+ bSig
;
1789 zSig
= ( aSig
+ bSig
)<<1;
1791 if ( (int32_t) zSig
< 0 ) {
1796 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1800 /*----------------------------------------------------------------------------
1801 | Returns the result of subtracting the absolute values of the single-
1802 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1803 | difference is negated before being returned. `zSign' is ignored if the
1804 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1805 | Standard for Binary Floating-Point Arithmetic.
1806 *----------------------------------------------------------------------------*/
1808 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1810 int16 aExp
, bExp
, zExp
;
1811 uint32_t aSig
, bSig
, zSig
;
1814 aSig
= extractFloat32Frac( a
);
1815 aExp
= extractFloat32Exp( a
);
1816 bSig
= extractFloat32Frac( b
);
1817 bExp
= extractFloat32Exp( b
);
1818 expDiff
= aExp
- bExp
;
1821 if ( 0 < expDiff
) goto aExpBigger
;
1822 if ( expDiff
< 0 ) goto bExpBigger
;
1823 if ( aExp
== 0xFF ) {
1824 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1825 float_raise( float_flag_invalid STATUS_VAR
);
1826 return float32_default_nan
;
1832 if ( bSig
< aSig
) goto aBigger
;
1833 if ( aSig
< bSig
) goto bBigger
;
1834 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1836 if ( bExp
== 0xFF ) {
1837 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1838 return packFloat32( zSign
^ 1, 0xFF, 0 );
1846 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1852 goto normalizeRoundAndPack
;
1854 if ( aExp
== 0xFF ) {
1855 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1864 shift32RightJamming( bSig
, expDiff
, &bSig
);
1869 normalizeRoundAndPack
:
1871 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1875 /*----------------------------------------------------------------------------
1876 | Returns the result of adding the single-precision floating-point values `a'
1877 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1878 | Binary Floating-Point Arithmetic.
1879 *----------------------------------------------------------------------------*/
1881 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1884 a
= float32_squash_input_denormal(a STATUS_VAR
);
1885 b
= float32_squash_input_denormal(b STATUS_VAR
);
1887 aSign
= extractFloat32Sign( a
);
1888 bSign
= extractFloat32Sign( b
);
1889 if ( aSign
== bSign
) {
1890 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1893 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1898 /*----------------------------------------------------------------------------
1899 | Returns the result of subtracting the single-precision floating-point values
1900 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1901 | for Binary Floating-Point Arithmetic.
1902 *----------------------------------------------------------------------------*/
1904 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1907 a
= float32_squash_input_denormal(a STATUS_VAR
);
1908 b
= float32_squash_input_denormal(b STATUS_VAR
);
1910 aSign
= extractFloat32Sign( a
);
1911 bSign
= extractFloat32Sign( b
);
1912 if ( aSign
== bSign
) {
1913 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1916 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1921 /*----------------------------------------------------------------------------
1922 | Returns the result of multiplying the single-precision floating-point values
1923 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1924 | for Binary Floating-Point Arithmetic.
1925 *----------------------------------------------------------------------------*/
1927 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1929 flag aSign
, bSign
, zSign
;
1930 int16 aExp
, bExp
, zExp
;
1931 uint32_t aSig
, bSig
;
1935 a
= float32_squash_input_denormal(a STATUS_VAR
);
1936 b
= float32_squash_input_denormal(b STATUS_VAR
);
1938 aSig
= extractFloat32Frac( a
);
1939 aExp
= extractFloat32Exp( a
);
1940 aSign
= extractFloat32Sign( a
);
1941 bSig
= extractFloat32Frac( b
);
1942 bExp
= extractFloat32Exp( b
);
1943 bSign
= extractFloat32Sign( b
);
1944 zSign
= aSign
^ bSign
;
1945 if ( aExp
== 0xFF ) {
1946 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1947 return propagateFloat32NaN( a
, b STATUS_VAR
);
1949 if ( ( bExp
| bSig
) == 0 ) {
1950 float_raise( float_flag_invalid STATUS_VAR
);
1951 return float32_default_nan
;
1953 return packFloat32( zSign
, 0xFF, 0 );
1955 if ( bExp
== 0xFF ) {
1956 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1957 if ( ( aExp
| aSig
) == 0 ) {
1958 float_raise( float_flag_invalid STATUS_VAR
);
1959 return float32_default_nan
;
1961 return packFloat32( zSign
, 0xFF, 0 );
1964 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1965 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1968 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1969 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1971 zExp
= aExp
+ bExp
- 0x7F;
1972 aSig
= ( aSig
| 0x00800000 )<<7;
1973 bSig
= ( bSig
| 0x00800000 )<<8;
1974 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
1976 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
1980 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1984 /*----------------------------------------------------------------------------
1985 | Returns the result of dividing the single-precision floating-point value `a'
1986 | by the corresponding value `b'. The operation is performed according to the
1987 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1988 *----------------------------------------------------------------------------*/
1990 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
1992 flag aSign
, bSign
, zSign
;
1993 int16 aExp
, bExp
, zExp
;
1994 uint32_t aSig
, bSig
, zSig
;
1995 a
= float32_squash_input_denormal(a STATUS_VAR
);
1996 b
= float32_squash_input_denormal(b STATUS_VAR
);
1998 aSig
= extractFloat32Frac( a
);
1999 aExp
= extractFloat32Exp( a
);
2000 aSign
= extractFloat32Sign( a
);
2001 bSig
= extractFloat32Frac( b
);
2002 bExp
= extractFloat32Exp( b
);
2003 bSign
= extractFloat32Sign( b
);
2004 zSign
= aSign
^ bSign
;
2005 if ( aExp
== 0xFF ) {
2006 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2007 if ( bExp
== 0xFF ) {
2008 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2009 float_raise( float_flag_invalid STATUS_VAR
);
2010 return float32_default_nan
;
2012 return packFloat32( zSign
, 0xFF, 0 );
2014 if ( bExp
== 0xFF ) {
2015 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2016 return packFloat32( zSign
, 0, 0 );
2020 if ( ( aExp
| aSig
) == 0 ) {
2021 float_raise( float_flag_invalid STATUS_VAR
);
2022 return float32_default_nan
;
2024 float_raise( float_flag_divbyzero STATUS_VAR
);
2025 return packFloat32( zSign
, 0xFF, 0 );
2027 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2030 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2031 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2033 zExp
= aExp
- bExp
+ 0x7D;
2034 aSig
= ( aSig
| 0x00800000 )<<7;
2035 bSig
= ( bSig
| 0x00800000 )<<8;
2036 if ( bSig
<= ( aSig
+ aSig
) ) {
2040 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2041 if ( ( zSig
& 0x3F ) == 0 ) {
2042 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2044 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2048 /*----------------------------------------------------------------------------
2049 | Returns the remainder of the single-precision floating-point value `a'
2050 | with respect to the corresponding value `b'. The operation is performed
2051 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2052 *----------------------------------------------------------------------------*/
2054 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
2057 int16 aExp
, bExp
, expDiff
;
2058 uint32_t aSig
, bSig
;
2060 uint64_t aSig64
, bSig64
, q64
;
2061 uint32_t alternateASig
;
2063 a
= float32_squash_input_denormal(a STATUS_VAR
);
2064 b
= float32_squash_input_denormal(b STATUS_VAR
);
2066 aSig
= extractFloat32Frac( a
);
2067 aExp
= extractFloat32Exp( a
);
2068 aSign
= extractFloat32Sign( a
);
2069 bSig
= extractFloat32Frac( b
);
2070 bExp
= extractFloat32Exp( b
);
2071 if ( aExp
== 0xFF ) {
2072 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2073 return propagateFloat32NaN( a
, b STATUS_VAR
);
2075 float_raise( float_flag_invalid STATUS_VAR
);
2076 return float32_default_nan
;
2078 if ( bExp
== 0xFF ) {
2079 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2084 float_raise( float_flag_invalid STATUS_VAR
);
2085 return float32_default_nan
;
2087 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2090 if ( aSig
== 0 ) return a
;
2091 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2093 expDiff
= aExp
- bExp
;
2096 if ( expDiff
< 32 ) {
2099 if ( expDiff
< 0 ) {
2100 if ( expDiff
< -1 ) return a
;
2103 q
= ( bSig
<= aSig
);
2104 if ( q
) aSig
-= bSig
;
2105 if ( 0 < expDiff
) {
2106 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2109 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2117 if ( bSig
<= aSig
) aSig
-= bSig
;
2118 aSig64
= ( (uint64_t) aSig
)<<40;
2119 bSig64
= ( (uint64_t) bSig
)<<40;
2121 while ( 0 < expDiff
) {
2122 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2123 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2124 aSig64
= - ( ( bSig
* q64
)<<38 );
2128 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2129 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2130 q
= q64
>>( 64 - expDiff
);
2132 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2135 alternateASig
= aSig
;
2138 } while ( 0 <= (int32_t) aSig
);
2139 sigMean
= aSig
+ alternateASig
;
2140 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2141 aSig
= alternateASig
;
2143 zSign
= ( (int32_t) aSig
< 0 );
2144 if ( zSign
) aSig
= - aSig
;
2145 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2149 /*----------------------------------------------------------------------------
2150 | Returns the square root of the single-precision floating-point value `a'.
2151 | The operation is performed according to the IEC/IEEE Standard for Binary
2152 | Floating-Point Arithmetic.
2153 *----------------------------------------------------------------------------*/
2155 float32
float32_sqrt( float32 a STATUS_PARAM
)
2159 uint32_t aSig
, zSig
;
2161 a
= float32_squash_input_denormal(a STATUS_VAR
);
2163 aSig
= extractFloat32Frac( a
);
2164 aExp
= extractFloat32Exp( a
);
2165 aSign
= extractFloat32Sign( a
);
2166 if ( aExp
== 0xFF ) {
2167 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2168 if ( ! aSign
) return a
;
2169 float_raise( float_flag_invalid STATUS_VAR
);
2170 return float32_default_nan
;
2173 if ( ( aExp
| aSig
) == 0 ) return a
;
2174 float_raise( float_flag_invalid STATUS_VAR
);
2175 return float32_default_nan
;
2178 if ( aSig
== 0 ) return float32_zero
;
2179 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2181 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2182 aSig
= ( aSig
| 0x00800000 )<<8;
2183 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2184 if ( ( zSig
& 0x7F ) <= 5 ) {
2190 term
= ( (uint64_t) zSig
) * zSig
;
2191 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2192 while ( (int64_t) rem
< 0 ) {
2194 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2196 zSig
|= ( rem
!= 0 );
2198 shift32RightJamming( zSig
, 1, &zSig
);
2200 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2204 /*----------------------------------------------------------------------------
2205 | Returns the binary exponential of the single-precision floating-point value
2206 | `a'. The operation is performed according to the IEC/IEEE Standard for
2207 | Binary Floating-Point Arithmetic.
2209 | Uses the following identities:
2211 | 1. -------------------------------------------------------------------------
2215 | 2. -------------------------------------------------------------------------
2218 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2220 *----------------------------------------------------------------------------*/
2222 static const float64 float32_exp2_coefficients
[15] =
2224 const_float64( 0x3ff0000000000000ll
), /* 1 */
2225 const_float64( 0x3fe0000000000000ll
), /* 2 */
2226 const_float64( 0x3fc5555555555555ll
), /* 3 */
2227 const_float64( 0x3fa5555555555555ll
), /* 4 */
2228 const_float64( 0x3f81111111111111ll
), /* 5 */
2229 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2230 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2231 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2232 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2233 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2234 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2235 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2236 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2237 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2238 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2241 float32
float32_exp2( float32 a STATUS_PARAM
)
2248 a
= float32_squash_input_denormal(a STATUS_VAR
);
2250 aSig
= extractFloat32Frac( a
);
2251 aExp
= extractFloat32Exp( a
);
2252 aSign
= extractFloat32Sign( a
);
2254 if ( aExp
== 0xFF) {
2255 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2256 return (aSign
) ? float32_zero
: a
;
2259 if (aSig
== 0) return float32_one
;
2262 float_raise( float_flag_inexact STATUS_VAR
);
2264 /* ******************************* */
2265 /* using float64 for approximation */
2266 /* ******************************* */
2267 x
= float32_to_float64(a STATUS_VAR
);
2268 x
= float64_mul(x
, float64_ln2 STATUS_VAR
);
2272 for (i
= 0 ; i
< 15 ; i
++) {
2275 f
= float64_mul(xn
, float32_exp2_coefficients
[i
] STATUS_VAR
);
2276 r
= float64_add(r
, f STATUS_VAR
);
2278 xn
= float64_mul(xn
, x STATUS_VAR
);
2281 return float64_to_float32(r
, status
);
2284 /*----------------------------------------------------------------------------
2285 | Returns the binary log of the single-precision floating-point value `a'.
2286 | The operation is performed according to the IEC/IEEE Standard for Binary
2287 | Floating-Point Arithmetic.
2288 *----------------------------------------------------------------------------*/
2289 float32
float32_log2( float32 a STATUS_PARAM
)
2293 uint32_t aSig
, zSig
, i
;
2295 a
= float32_squash_input_denormal(a STATUS_VAR
);
2296 aSig
= extractFloat32Frac( a
);
2297 aExp
= extractFloat32Exp( a
);
2298 aSign
= extractFloat32Sign( a
);
2301 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2302 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2305 float_raise( float_flag_invalid STATUS_VAR
);
2306 return float32_default_nan
;
2308 if ( aExp
== 0xFF ) {
2309 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2318 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2319 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2320 if ( aSig
& 0x01000000 ) {
2329 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2332 /*----------------------------------------------------------------------------
2333 | Returns 1 if the single-precision floating-point value `a' is equal to
2334 | the corresponding value `b', and 0 otherwise. The invalid exception is
2335 | raised if either operand is a NaN. Otherwise, the comparison is performed
2336 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2337 *----------------------------------------------------------------------------*/
2339 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2342 a
= float32_squash_input_denormal(a STATUS_VAR
);
2343 b
= float32_squash_input_denormal(b STATUS_VAR
);
2345 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2346 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2348 float_raise( float_flag_invalid STATUS_VAR
);
2351 av
= float32_val(a
);
2352 bv
= float32_val(b
);
2353 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2356 /*----------------------------------------------------------------------------
2357 | Returns 1 if the single-precision floating-point value `a' is less than
2358 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2359 | exception is raised if either operand is a NaN. The comparison is performed
2360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2361 *----------------------------------------------------------------------------*/
2363 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2367 a
= float32_squash_input_denormal(a STATUS_VAR
);
2368 b
= float32_squash_input_denormal(b STATUS_VAR
);
2370 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2371 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2373 float_raise( float_flag_invalid STATUS_VAR
);
2376 aSign
= extractFloat32Sign( a
);
2377 bSign
= extractFloat32Sign( b
);
2378 av
= float32_val(a
);
2379 bv
= float32_val(b
);
2380 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2381 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2385 /*----------------------------------------------------------------------------
2386 | Returns 1 if the single-precision floating-point value `a' is less than
2387 | the corresponding value `b', and 0 otherwise. The invalid exception is
2388 | raised if either operand is a NaN. The comparison is performed according
2389 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2390 *----------------------------------------------------------------------------*/
2392 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2396 a
= float32_squash_input_denormal(a STATUS_VAR
);
2397 b
= float32_squash_input_denormal(b STATUS_VAR
);
2399 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2400 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2402 float_raise( float_flag_invalid STATUS_VAR
);
2405 aSign
= extractFloat32Sign( a
);
2406 bSign
= extractFloat32Sign( b
);
2407 av
= float32_val(a
);
2408 bv
= float32_val(b
);
2409 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2410 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2414 /*----------------------------------------------------------------------------
2415 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2416 | be compared, and 0 otherwise. The invalid exception is raised if either
2417 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2418 | Standard for Binary Floating-Point Arithmetic.
2419 *----------------------------------------------------------------------------*/
2421 int float32_unordered( float32 a
, float32 b STATUS_PARAM
)
2423 a
= float32_squash_input_denormal(a STATUS_VAR
);
2424 b
= float32_squash_input_denormal(b STATUS_VAR
);
2426 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2427 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2429 float_raise( float_flag_invalid STATUS_VAR
);
2435 /*----------------------------------------------------------------------------
2436 | Returns 1 if the single-precision floating-point value `a' is equal to
2437 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2438 | exception. The comparison is performed according to the IEC/IEEE Standard
2439 | for Binary Floating-Point Arithmetic.
2440 *----------------------------------------------------------------------------*/
2442 int float32_eq_quiet( float32 a
, float32 b STATUS_PARAM
)
2444 a
= float32_squash_input_denormal(a STATUS_VAR
);
2445 b
= float32_squash_input_denormal(b STATUS_VAR
);
2447 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2448 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2450 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2451 float_raise( float_flag_invalid STATUS_VAR
);
2455 return ( float32_val(a
) == float32_val(b
) ) ||
2456 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2459 /*----------------------------------------------------------------------------
2460 | Returns 1 if the single-precision floating-point value `a' is less than or
2461 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2462 | cause an exception. Otherwise, the comparison is performed according to the
2463 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2464 *----------------------------------------------------------------------------*/
2466 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2470 a
= float32_squash_input_denormal(a STATUS_VAR
);
2471 b
= float32_squash_input_denormal(b STATUS_VAR
);
2473 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2474 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2476 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2477 float_raise( float_flag_invalid STATUS_VAR
);
2481 aSign
= extractFloat32Sign( a
);
2482 bSign
= extractFloat32Sign( b
);
2483 av
= float32_val(a
);
2484 bv
= float32_val(b
);
2485 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2486 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2490 /*----------------------------------------------------------------------------
2491 | Returns 1 if the single-precision floating-point value `a' is less than
2492 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2493 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2494 | Standard for Binary Floating-Point Arithmetic.
2495 *----------------------------------------------------------------------------*/
2497 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2501 a
= float32_squash_input_denormal(a STATUS_VAR
);
2502 b
= float32_squash_input_denormal(b STATUS_VAR
);
2504 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2505 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2507 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2508 float_raise( float_flag_invalid STATUS_VAR
);
2512 aSign
= extractFloat32Sign( a
);
2513 bSign
= extractFloat32Sign( b
);
2514 av
= float32_val(a
);
2515 bv
= float32_val(b
);
2516 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2517 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2521 /*----------------------------------------------------------------------------
2522 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2523 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2524 | comparison is performed according to the IEC/IEEE Standard for Binary
2525 | Floating-Point Arithmetic.
2526 *----------------------------------------------------------------------------*/
2528 int float32_unordered_quiet( float32 a
, float32 b STATUS_PARAM
)
2530 a
= float32_squash_input_denormal(a STATUS_VAR
);
2531 b
= float32_squash_input_denormal(b STATUS_VAR
);
2533 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2534 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2536 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2537 float_raise( float_flag_invalid STATUS_VAR
);
2544 /*----------------------------------------------------------------------------
2545 | Returns the result of converting the double-precision floating-point value
2546 | `a' to the 32-bit two's complement integer format. The conversion is
2547 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2548 | Arithmetic---which means in particular that the conversion is rounded
2549 | according to the current rounding mode. If `a' is a NaN, the largest
2550 | positive integer is returned. Otherwise, if the conversion overflows, the
2551 | largest integer with the same sign as `a' is returned.
2552 *----------------------------------------------------------------------------*/
2554 int32
float64_to_int32( float64 a STATUS_PARAM
)
2557 int16 aExp
, shiftCount
;
2559 a
= float64_squash_input_denormal(a STATUS_VAR
);
2561 aSig
= extractFloat64Frac( a
);
2562 aExp
= extractFloat64Exp( a
);
2563 aSign
= extractFloat64Sign( a
);
2564 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2565 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2566 shiftCount
= 0x42C - aExp
;
2567 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2568 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2572 /*----------------------------------------------------------------------------
2573 | Returns the result of converting the double-precision floating-point value
2574 | `a' to the 32-bit two's complement integer format. The conversion is
2575 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2576 | Arithmetic, except that the conversion is always rounded toward zero.
2577 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2578 | the conversion overflows, the largest integer with the same sign as `a' is
2580 *----------------------------------------------------------------------------*/
2582 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2585 int16 aExp
, shiftCount
;
2586 uint64_t aSig
, savedASig
;
2588 a
= float64_squash_input_denormal(a STATUS_VAR
);
2590 aSig
= extractFloat64Frac( a
);
2591 aExp
= extractFloat64Exp( a
);
2592 aSign
= extractFloat64Sign( a
);
2593 if ( 0x41E < aExp
) {
2594 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2597 else if ( aExp
< 0x3FF ) {
2598 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2601 aSig
|= LIT64( 0x0010000000000000 );
2602 shiftCount
= 0x433 - aExp
;
2604 aSig
>>= shiftCount
;
2606 if ( aSign
) z
= - z
;
2607 if ( ( z
< 0 ) ^ aSign
) {
2609 float_raise( float_flag_invalid STATUS_VAR
);
2610 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2612 if ( ( aSig
<<shiftCount
) != savedASig
) {
2613 STATUS(float_exception_flags
) |= float_flag_inexact
;
2619 /*----------------------------------------------------------------------------
2620 | Returns the result of converting the double-precision floating-point value
2621 | `a' to the 16-bit two's complement integer format. The conversion is
2622 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2623 | Arithmetic, except that the conversion is always rounded toward zero.
2624 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2625 | the conversion overflows, the largest integer with the same sign as `a' is
2627 *----------------------------------------------------------------------------*/
2629 int16
float64_to_int16_round_to_zero( float64 a STATUS_PARAM
)
2632 int16 aExp
, shiftCount
;
2633 uint64_t aSig
, savedASig
;
2636 aSig
= extractFloat64Frac( a
);
2637 aExp
= extractFloat64Exp( a
);
2638 aSign
= extractFloat64Sign( a
);
2639 if ( 0x40E < aExp
) {
2640 if ( ( aExp
== 0x7FF ) && aSig
) {
2645 else if ( aExp
< 0x3FF ) {
2646 if ( aExp
|| aSig
) {
2647 STATUS(float_exception_flags
) |= float_flag_inexact
;
2651 aSig
|= LIT64( 0x0010000000000000 );
2652 shiftCount
= 0x433 - aExp
;
2654 aSig
>>= shiftCount
;
2659 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
2661 float_raise( float_flag_invalid STATUS_VAR
);
2662 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
2664 if ( ( aSig
<<shiftCount
) != savedASig
) {
2665 STATUS(float_exception_flags
) |= float_flag_inexact
;
2670 /*----------------------------------------------------------------------------
2671 | Returns the result of converting the double-precision floating-point value
2672 | `a' to the 64-bit two's complement integer format. The conversion is
2673 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2674 | Arithmetic---which means in particular that the conversion is rounded
2675 | according to the current rounding mode. If `a' is a NaN, the largest
2676 | positive integer is returned. Otherwise, if the conversion overflows, the
2677 | largest integer with the same sign as `a' is returned.
2678 *----------------------------------------------------------------------------*/
2680 int64
float64_to_int64( float64 a STATUS_PARAM
)
2683 int16 aExp
, shiftCount
;
2684 uint64_t aSig
, aSigExtra
;
2685 a
= float64_squash_input_denormal(a STATUS_VAR
);
2687 aSig
= extractFloat64Frac( a
);
2688 aExp
= extractFloat64Exp( a
);
2689 aSign
= extractFloat64Sign( a
);
2690 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2691 shiftCount
= 0x433 - aExp
;
2692 if ( shiftCount
<= 0 ) {
2693 if ( 0x43E < aExp
) {
2694 float_raise( float_flag_invalid STATUS_VAR
);
2696 || ( ( aExp
== 0x7FF )
2697 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2699 return LIT64( 0x7FFFFFFFFFFFFFFF );
2701 return (int64_t) LIT64( 0x8000000000000000 );
2704 aSig
<<= - shiftCount
;
2707 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2709 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2713 /*----------------------------------------------------------------------------
2714 | Returns the result of converting the double-precision floating-point value
2715 | `a' to the 64-bit two's complement integer format. The conversion is
2716 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2717 | Arithmetic, except that the conversion is always rounded toward zero.
2718 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2719 | the conversion overflows, the largest integer with the same sign as `a' is
2721 *----------------------------------------------------------------------------*/
2723 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
2726 int16 aExp
, shiftCount
;
2729 a
= float64_squash_input_denormal(a STATUS_VAR
);
2731 aSig
= extractFloat64Frac( a
);
2732 aExp
= extractFloat64Exp( a
);
2733 aSign
= extractFloat64Sign( a
);
2734 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2735 shiftCount
= aExp
- 0x433;
2736 if ( 0 <= shiftCount
) {
2737 if ( 0x43E <= aExp
) {
2738 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
2739 float_raise( float_flag_invalid STATUS_VAR
);
2741 || ( ( aExp
== 0x7FF )
2742 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2744 return LIT64( 0x7FFFFFFFFFFFFFFF );
2747 return (int64_t) LIT64( 0x8000000000000000 );
2749 z
= aSig
<<shiftCount
;
2752 if ( aExp
< 0x3FE ) {
2753 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2756 z
= aSig
>>( - shiftCount
);
2757 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
2758 STATUS(float_exception_flags
) |= float_flag_inexact
;
2761 if ( aSign
) z
= - z
;
2766 /*----------------------------------------------------------------------------
2767 | Returns the result of converting the double-precision floating-point value
2768 | `a' to the single-precision floating-point format. The conversion is
2769 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2771 *----------------------------------------------------------------------------*/
2773 float32
float64_to_float32( float64 a STATUS_PARAM
)
2779 a
= float64_squash_input_denormal(a STATUS_VAR
);
2781 aSig
= extractFloat64Frac( a
);
2782 aExp
= extractFloat64Exp( a
);
2783 aSign
= extractFloat64Sign( a
);
2784 if ( aExp
== 0x7FF ) {
2785 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2786 return packFloat32( aSign
, 0xFF, 0 );
2788 shift64RightJamming( aSig
, 22, &aSig
);
2790 if ( aExp
|| zSig
) {
2794 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
2799 /*----------------------------------------------------------------------------
2800 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
2801 | half-precision floating-point value, returning the result. After being
2802 | shifted into the proper positions, the three fields are simply added
2803 | together to form the result. This means that any integer portion of `zSig'
2804 | will be added into the exponent. Since a properly normalized significand
2805 | will have an integer portion equal to 1, the `zExp' input should be 1 less
2806 | than the desired result exponent whenever `zSig' is a complete, normalized
2808 *----------------------------------------------------------------------------*/
2809 static float16
packFloat16(flag zSign
, int16 zExp
, uint16_t zSig
)
2811 return make_float16(
2812 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
2815 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
2816 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
2818 float32
float16_to_float32(float16 a
, flag ieee STATUS_PARAM
)
2824 aSign
= extractFloat16Sign(a
);
2825 aExp
= extractFloat16Exp(a
);
2826 aSig
= extractFloat16Frac(a
);
2828 if (aExp
== 0x1f && ieee
) {
2830 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
2832 return packFloat32(aSign
, 0xff, aSig
<< 13);
2838 return packFloat32(aSign
, 0, 0);
2841 shiftCount
= countLeadingZeros32( aSig
) - 21;
2842 aSig
= aSig
<< shiftCount
;
2845 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
2848 float16
float32_to_float16(float32 a
, flag ieee STATUS_PARAM
)
2856 a
= float32_squash_input_denormal(a STATUS_VAR
);
2858 aSig
= extractFloat32Frac( a
);
2859 aExp
= extractFloat32Exp( a
);
2860 aSign
= extractFloat32Sign( a
);
2861 if ( aExp
== 0xFF ) {
2863 /* Input is a NaN */
2864 float16 r
= commonNaNToFloat16( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2866 return packFloat16(aSign
, 0, 0);
2872 float_raise(float_flag_invalid STATUS_VAR
);
2873 return packFloat16(aSign
, 0x1f, 0x3ff);
2875 return packFloat16(aSign
, 0x1f, 0);
2877 if (aExp
== 0 && aSig
== 0) {
2878 return packFloat16(aSign
, 0, 0);
2880 /* Decimal point between bits 22 and 23. */
2892 float_raise( float_flag_underflow STATUS_VAR
);
2893 roundingMode
= STATUS(float_rounding_mode
);
2894 switch (roundingMode
) {
2895 case float_round_nearest_even
:
2896 increment
= (mask
+ 1) >> 1;
2897 if ((aSig
& mask
) == increment
) {
2898 increment
= aSig
& (increment
<< 1);
2901 case float_round_up
:
2902 increment
= aSign
? 0 : mask
;
2904 case float_round_down
:
2905 increment
= aSign
? mask
: 0;
2907 default: /* round_to_zero */
2912 if (aSig
>= 0x01000000) {
2916 } else if (aExp
< -14
2917 && STATUS(float_detect_tininess
) == float_tininess_before_rounding
) {
2918 float_raise( float_flag_underflow STATUS_VAR
);
2923 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
2924 return packFloat16(aSign
, 0x1f, 0);
2928 float_raise(float_flag_invalid
| float_flag_inexact STATUS_VAR
);
2929 return packFloat16(aSign
, 0x1f, 0x3ff);
2933 return packFloat16(aSign
, 0, 0);
2936 aSig
>>= -14 - aExp
;
2939 return packFloat16(aSign
, aExp
+ 14, aSig
>> 13);
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the extended double-precision floating-point format. The conversion
2947 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2949 *----------------------------------------------------------------------------*/
2951 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
2957 a
= float64_squash_input_denormal(a STATUS_VAR
);
2958 aSig
= extractFloat64Frac( a
);
2959 aExp
= extractFloat64Exp( a
);
2960 aSign
= extractFloat64Sign( a
);
2961 if ( aExp
== 0x7FF ) {
2962 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2963 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2966 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2967 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2971 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2979 /*----------------------------------------------------------------------------
2980 | Returns the result of converting the double-precision floating-point value
2981 | `a' to the quadruple-precision floating-point format. The conversion is
2982 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2984 *----------------------------------------------------------------------------*/
2986 float128
float64_to_float128( float64 a STATUS_PARAM
)
2990 uint64_t aSig
, zSig0
, zSig1
;
2992 a
= float64_squash_input_denormal(a STATUS_VAR
);
2993 aSig
= extractFloat64Frac( a
);
2994 aExp
= extractFloat64Exp( a
);
2995 aSign
= extractFloat64Sign( a
);
2996 if ( aExp
== 0x7FF ) {
2997 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
2998 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3001 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3002 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3005 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
3006 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
3012 /*----------------------------------------------------------------------------
3013 | Rounds the double-precision floating-point value `a' to an integer, and
3014 | returns the result as a double-precision floating-point value. The
3015 | operation is performed according to the IEC/IEEE Standard for Binary
3016 | Floating-Point Arithmetic.
3017 *----------------------------------------------------------------------------*/
3019 float64
float64_round_to_int( float64 a STATUS_PARAM
)
3023 uint64_t lastBitMask
, roundBitsMask
;
3026 a
= float64_squash_input_denormal(a STATUS_VAR
);
3028 aExp
= extractFloat64Exp( a
);
3029 if ( 0x433 <= aExp
) {
3030 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
3031 return propagateFloat64NaN( a
, a STATUS_VAR
);
3035 if ( aExp
< 0x3FF ) {
3036 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
3037 STATUS(float_exception_flags
) |= float_flag_inexact
;
3038 aSign
= extractFloat64Sign( a
);
3039 switch ( STATUS(float_rounding_mode
) ) {
3040 case float_round_nearest_even
:
3041 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3042 return packFloat64( aSign
, 0x3FF, 0 );
3045 case float_round_down
:
3046 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3047 case float_round_up
:
3048 return make_float64(
3049 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3051 return packFloat64( aSign
, 0, 0 );
3054 lastBitMask
<<= 0x433 - aExp
;
3055 roundBitsMask
= lastBitMask
- 1;
3057 roundingMode
= STATUS(float_rounding_mode
);
3058 if ( roundingMode
== float_round_nearest_even
) {
3059 z
+= lastBitMask
>>1;
3060 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
3062 else if ( roundingMode
!= float_round_to_zero
) {
3063 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
3067 z
&= ~ roundBitsMask
;
3068 if ( z
!= float64_val(a
) )
3069 STATUS(float_exception_flags
) |= float_flag_inexact
;
3070 return make_float64(z
);
3074 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
3078 oldmode
= STATUS(float_rounding_mode
);
3079 STATUS(float_rounding_mode
) = float_round_to_zero
;
3080 res
= float64_round_to_int(a STATUS_VAR
);
3081 STATUS(float_rounding_mode
) = oldmode
;
3085 /*----------------------------------------------------------------------------
3086 | Returns the result of adding the absolute values of the double-precision
3087 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3088 | before being returned. `zSign' is ignored if the result is a NaN.
3089 | The addition is performed according to the IEC/IEEE Standard for Binary
3090 | Floating-Point Arithmetic.
3091 *----------------------------------------------------------------------------*/
3093 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3095 int16 aExp
, bExp
, zExp
;
3096 uint64_t aSig
, bSig
, zSig
;
3099 aSig
= extractFloat64Frac( a
);
3100 aExp
= extractFloat64Exp( a
);
3101 bSig
= extractFloat64Frac( b
);
3102 bExp
= extractFloat64Exp( b
);
3103 expDiff
= aExp
- bExp
;
3106 if ( 0 < expDiff
) {
3107 if ( aExp
== 0x7FF ) {
3108 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3115 bSig
|= LIT64( 0x2000000000000000 );
3117 shift64RightJamming( bSig
, expDiff
, &bSig
);
3120 else if ( expDiff
< 0 ) {
3121 if ( bExp
== 0x7FF ) {
3122 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3123 return packFloat64( zSign
, 0x7FF, 0 );
3129 aSig
|= LIT64( 0x2000000000000000 );
3131 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3135 if ( aExp
== 0x7FF ) {
3136 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3140 if (STATUS(flush_to_zero
)) {
3142 float_raise(float_flag_output_denormal STATUS_VAR
);
3144 return packFloat64(zSign
, 0, 0);
3146 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3148 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3152 aSig
|= LIT64( 0x2000000000000000 );
3153 zSig
= ( aSig
+ bSig
)<<1;
3155 if ( (int64_t) zSig
< 0 ) {
3160 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3164 /*----------------------------------------------------------------------------
3165 | Returns the result of subtracting the absolute values of the double-
3166 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3167 | difference is negated before being returned. `zSign' is ignored if the
3168 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3169 | Standard for Binary Floating-Point Arithmetic.
3170 *----------------------------------------------------------------------------*/
3172 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3174 int16 aExp
, bExp
, zExp
;
3175 uint64_t aSig
, bSig
, zSig
;
3178 aSig
= extractFloat64Frac( a
);
3179 aExp
= extractFloat64Exp( a
);
3180 bSig
= extractFloat64Frac( b
);
3181 bExp
= extractFloat64Exp( b
);
3182 expDiff
= aExp
- bExp
;
3185 if ( 0 < expDiff
) goto aExpBigger
;
3186 if ( expDiff
< 0 ) goto bExpBigger
;
3187 if ( aExp
== 0x7FF ) {
3188 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3189 float_raise( float_flag_invalid STATUS_VAR
);
3190 return float64_default_nan
;
3196 if ( bSig
< aSig
) goto aBigger
;
3197 if ( aSig
< bSig
) goto bBigger
;
3198 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3200 if ( bExp
== 0x7FF ) {
3201 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3202 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3208 aSig
|= LIT64( 0x4000000000000000 );
3210 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3211 bSig
|= LIT64( 0x4000000000000000 );
3216 goto normalizeRoundAndPack
;
3218 if ( aExp
== 0x7FF ) {
3219 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3226 bSig
|= LIT64( 0x4000000000000000 );
3228 shift64RightJamming( bSig
, expDiff
, &bSig
);
3229 aSig
|= LIT64( 0x4000000000000000 );
3233 normalizeRoundAndPack
:
3235 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3239 /*----------------------------------------------------------------------------
3240 | Returns the result of adding the double-precision floating-point values `a'
3241 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3242 | Binary Floating-Point Arithmetic.
3243 *----------------------------------------------------------------------------*/
3245 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
3248 a
= float64_squash_input_denormal(a STATUS_VAR
);
3249 b
= float64_squash_input_denormal(b STATUS_VAR
);
3251 aSign
= extractFloat64Sign( a
);
3252 bSign
= extractFloat64Sign( b
);
3253 if ( aSign
== bSign
) {
3254 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3257 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3262 /*----------------------------------------------------------------------------
3263 | Returns the result of subtracting the double-precision floating-point values
3264 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3265 | for Binary Floating-Point Arithmetic.
3266 *----------------------------------------------------------------------------*/
3268 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
3271 a
= float64_squash_input_denormal(a STATUS_VAR
);
3272 b
= float64_squash_input_denormal(b STATUS_VAR
);
3274 aSign
= extractFloat64Sign( a
);
3275 bSign
= extractFloat64Sign( b
);
3276 if ( aSign
== bSign
) {
3277 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3280 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3285 /*----------------------------------------------------------------------------
3286 | Returns the result of multiplying the double-precision floating-point values
3287 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3288 | for Binary Floating-Point Arithmetic.
3289 *----------------------------------------------------------------------------*/
3291 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
3293 flag aSign
, bSign
, zSign
;
3294 int16 aExp
, bExp
, zExp
;
3295 uint64_t aSig
, bSig
, zSig0
, zSig1
;
3297 a
= float64_squash_input_denormal(a STATUS_VAR
);
3298 b
= float64_squash_input_denormal(b STATUS_VAR
);
3300 aSig
= extractFloat64Frac( a
);
3301 aExp
= extractFloat64Exp( a
);
3302 aSign
= extractFloat64Sign( a
);
3303 bSig
= extractFloat64Frac( b
);
3304 bExp
= extractFloat64Exp( b
);
3305 bSign
= extractFloat64Sign( b
);
3306 zSign
= aSign
^ bSign
;
3307 if ( aExp
== 0x7FF ) {
3308 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3309 return propagateFloat64NaN( a
, b STATUS_VAR
);
3311 if ( ( bExp
| bSig
) == 0 ) {
3312 float_raise( float_flag_invalid STATUS_VAR
);
3313 return float64_default_nan
;
3315 return packFloat64( zSign
, 0x7FF, 0 );
3317 if ( bExp
== 0x7FF ) {
3318 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3319 if ( ( aExp
| aSig
) == 0 ) {
3320 float_raise( float_flag_invalid STATUS_VAR
);
3321 return float64_default_nan
;
3323 return packFloat64( zSign
, 0x7FF, 0 );
3326 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3327 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3330 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3331 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3333 zExp
= aExp
+ bExp
- 0x3FF;
3334 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3335 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3336 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3337 zSig0
|= ( zSig1
!= 0 );
3338 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
3342 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
3346 /*----------------------------------------------------------------------------
3347 | Returns the result of dividing the double-precision floating-point value `a'
3348 | by the corresponding value `b'. The operation is performed according to
3349 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3350 *----------------------------------------------------------------------------*/
3352 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
3354 flag aSign
, bSign
, zSign
;
3355 int16 aExp
, bExp
, zExp
;
3356 uint64_t aSig
, bSig
, zSig
;
3357 uint64_t rem0
, rem1
;
3358 uint64_t term0
, term1
;
3359 a
= float64_squash_input_denormal(a STATUS_VAR
);
3360 b
= float64_squash_input_denormal(b STATUS_VAR
);
3362 aSig
= extractFloat64Frac( a
);
3363 aExp
= extractFloat64Exp( a
);
3364 aSign
= extractFloat64Sign( a
);
3365 bSig
= extractFloat64Frac( b
);
3366 bExp
= extractFloat64Exp( b
);
3367 bSign
= extractFloat64Sign( b
);
3368 zSign
= aSign
^ bSign
;
3369 if ( aExp
== 0x7FF ) {
3370 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3371 if ( bExp
== 0x7FF ) {
3372 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3373 float_raise( float_flag_invalid STATUS_VAR
);
3374 return float64_default_nan
;
3376 return packFloat64( zSign
, 0x7FF, 0 );
3378 if ( bExp
== 0x7FF ) {
3379 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3380 return packFloat64( zSign
, 0, 0 );
3384 if ( ( aExp
| aSig
) == 0 ) {
3385 float_raise( float_flag_invalid STATUS_VAR
);
3386 return float64_default_nan
;
3388 float_raise( float_flag_divbyzero STATUS_VAR
);
3389 return packFloat64( zSign
, 0x7FF, 0 );
3391 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3394 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3395 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3397 zExp
= aExp
- bExp
+ 0x3FD;
3398 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3399 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3400 if ( bSig
<= ( aSig
+ aSig
) ) {
3404 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3405 if ( ( zSig
& 0x1FF ) <= 2 ) {
3406 mul64To128( bSig
, zSig
, &term0
, &term1
);
3407 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3408 while ( (int64_t) rem0
< 0 ) {
3410 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3412 zSig
|= ( rem1
!= 0 );
3414 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3418 /*----------------------------------------------------------------------------
3419 | Returns the remainder of the double-precision floating-point value `a'
3420 | with respect to the corresponding value `b'. The operation is performed
3421 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3422 *----------------------------------------------------------------------------*/
3424 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3427 int16 aExp
, bExp
, expDiff
;
3428 uint64_t aSig
, bSig
;
3429 uint64_t q
, alternateASig
;
3432 a
= float64_squash_input_denormal(a STATUS_VAR
);
3433 b
= float64_squash_input_denormal(b STATUS_VAR
);
3434 aSig
= extractFloat64Frac( a
);
3435 aExp
= extractFloat64Exp( a
);
3436 aSign
= extractFloat64Sign( a
);
3437 bSig
= extractFloat64Frac( b
);
3438 bExp
= extractFloat64Exp( b
);
3439 if ( aExp
== 0x7FF ) {
3440 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3441 return propagateFloat64NaN( a
, b STATUS_VAR
);
3443 float_raise( float_flag_invalid STATUS_VAR
);
3444 return float64_default_nan
;
3446 if ( bExp
== 0x7FF ) {
3447 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3452 float_raise( float_flag_invalid STATUS_VAR
);
3453 return float64_default_nan
;
3455 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3458 if ( aSig
== 0 ) return a
;
3459 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3461 expDiff
= aExp
- bExp
;
3462 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3463 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3464 if ( expDiff
< 0 ) {
3465 if ( expDiff
< -1 ) return a
;
3468 q
= ( bSig
<= aSig
);
3469 if ( q
) aSig
-= bSig
;
3471 while ( 0 < expDiff
) {
3472 q
= estimateDiv128To64( aSig
, 0, bSig
);
3473 q
= ( 2 < q
) ? q
- 2 : 0;
3474 aSig
= - ( ( bSig
>>2 ) * q
);
3478 if ( 0 < expDiff
) {
3479 q
= estimateDiv128To64( aSig
, 0, bSig
);
3480 q
= ( 2 < q
) ? q
- 2 : 0;
3483 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3490 alternateASig
= aSig
;
3493 } while ( 0 <= (int64_t) aSig
);
3494 sigMean
= aSig
+ alternateASig
;
3495 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3496 aSig
= alternateASig
;
3498 zSign
= ( (int64_t) aSig
< 0 );
3499 if ( zSign
) aSig
= - aSig
;
3500 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
3504 /*----------------------------------------------------------------------------
3505 | Returns the square root of the double-precision floating-point value `a'.
3506 | The operation is performed according to the IEC/IEEE Standard for Binary
3507 | Floating-Point Arithmetic.
3508 *----------------------------------------------------------------------------*/
3510 float64
float64_sqrt( float64 a STATUS_PARAM
)
3514 uint64_t aSig
, zSig
, doubleZSig
;
3515 uint64_t rem0
, rem1
, term0
, term1
;
3516 a
= float64_squash_input_denormal(a STATUS_VAR
);
3518 aSig
= extractFloat64Frac( a
);
3519 aExp
= extractFloat64Exp( a
);
3520 aSign
= extractFloat64Sign( a
);
3521 if ( aExp
== 0x7FF ) {
3522 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
3523 if ( ! aSign
) return a
;
3524 float_raise( float_flag_invalid STATUS_VAR
);
3525 return float64_default_nan
;
3528 if ( ( aExp
| aSig
) == 0 ) return a
;
3529 float_raise( float_flag_invalid STATUS_VAR
);
3530 return float64_default_nan
;
3533 if ( aSig
== 0 ) return float64_zero
;
3534 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3536 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
3537 aSig
|= LIT64( 0x0010000000000000 );
3538 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
3539 aSig
<<= 9 - ( aExp
& 1 );
3540 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
3541 if ( ( zSig
& 0x1FF ) <= 5 ) {
3542 doubleZSig
= zSig
<<1;
3543 mul64To128( zSig
, zSig
, &term0
, &term1
);
3544 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3545 while ( (int64_t) rem0
< 0 ) {
3548 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
3550 zSig
|= ( ( rem0
| rem1
) != 0 );
3552 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
3556 /*----------------------------------------------------------------------------
3557 | Returns the binary log of the double-precision floating-point value `a'.
3558 | The operation is performed according to the IEC/IEEE Standard for Binary
3559 | Floating-Point Arithmetic.
3560 *----------------------------------------------------------------------------*/
3561 float64
float64_log2( float64 a STATUS_PARAM
)
3565 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
3566 a
= float64_squash_input_denormal(a STATUS_VAR
);
3568 aSig
= extractFloat64Frac( a
);
3569 aExp
= extractFloat64Exp( a
);
3570 aSign
= extractFloat64Sign( a
);
3573 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
3574 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3577 float_raise( float_flag_invalid STATUS_VAR
);
3578 return float64_default_nan
;
3580 if ( aExp
== 0x7FF ) {
3581 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
3586 aSig
|= LIT64( 0x0010000000000000 );
3588 zSig
= (uint64_t)aExp
<< 52;
3589 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
3590 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
3591 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
3592 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
3600 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
3603 /*----------------------------------------------------------------------------
3604 | Returns 1 if the double-precision floating-point value `a' is equal to the
3605 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3606 | if either operand is a NaN. Otherwise, the comparison is performed
3607 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3608 *----------------------------------------------------------------------------*/
3610 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
3613 a
= float64_squash_input_denormal(a STATUS_VAR
);
3614 b
= float64_squash_input_denormal(b STATUS_VAR
);
3616 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3617 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3619 float_raise( float_flag_invalid STATUS_VAR
);
3622 av
= float64_val(a
);
3623 bv
= float64_val(b
);
3624 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3628 /*----------------------------------------------------------------------------
3629 | Returns 1 if the double-precision floating-point value `a' is less than or
3630 | equal to the corresponding value `b', and 0 otherwise. The invalid
3631 | exception is raised if either operand is a NaN. The comparison is performed
3632 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3633 *----------------------------------------------------------------------------*/
3635 int float64_le( float64 a
, float64 b STATUS_PARAM
)
3639 a
= float64_squash_input_denormal(a STATUS_VAR
);
3640 b
= float64_squash_input_denormal(b STATUS_VAR
);
3642 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3643 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3645 float_raise( float_flag_invalid STATUS_VAR
);
3648 aSign
= extractFloat64Sign( a
);
3649 bSign
= extractFloat64Sign( b
);
3650 av
= float64_val(a
);
3651 bv
= float64_val(b
);
3652 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3653 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3657 /*----------------------------------------------------------------------------
3658 | Returns 1 if the double-precision floating-point value `a' is less than
3659 | the corresponding value `b', and 0 otherwise. The invalid exception is
3660 | raised if either operand is a NaN. The comparison is performed according
3661 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3662 *----------------------------------------------------------------------------*/
3664 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
3669 a
= float64_squash_input_denormal(a STATUS_VAR
);
3670 b
= float64_squash_input_denormal(b STATUS_VAR
);
3671 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3672 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3674 float_raise( float_flag_invalid STATUS_VAR
);
3677 aSign
= extractFloat64Sign( a
);
3678 bSign
= extractFloat64Sign( b
);
3679 av
= float64_val(a
);
3680 bv
= float64_val(b
);
3681 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
3682 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3686 /*----------------------------------------------------------------------------
3687 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3688 | be compared, and 0 otherwise. The invalid exception is raised if either
3689 | operand is a NaN. The comparison is performed according to the IEC/IEEE
3690 | Standard for Binary Floating-Point Arithmetic.
3691 *----------------------------------------------------------------------------*/
3693 int float64_unordered( float64 a
, float64 b STATUS_PARAM
)
3695 a
= float64_squash_input_denormal(a STATUS_VAR
);
3696 b
= float64_squash_input_denormal(b STATUS_VAR
);
3698 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3699 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3701 float_raise( float_flag_invalid STATUS_VAR
);
3707 /*----------------------------------------------------------------------------
3708 | Returns 1 if the double-precision floating-point value `a' is equal to the
3709 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3710 | exception.The comparison is performed according to the IEC/IEEE Standard
3711 | for Binary Floating-Point Arithmetic.
3712 *----------------------------------------------------------------------------*/
3714 int float64_eq_quiet( float64 a
, float64 b STATUS_PARAM
)
3717 a
= float64_squash_input_denormal(a STATUS_VAR
);
3718 b
= float64_squash_input_denormal(b STATUS_VAR
);
3720 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3721 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3723 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3724 float_raise( float_flag_invalid STATUS_VAR
);
3728 av
= float64_val(a
);
3729 bv
= float64_val(b
);
3730 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3734 /*----------------------------------------------------------------------------
3735 | Returns 1 if the double-precision floating-point value `a' is less than or
3736 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3737 | cause an exception. Otherwise, the comparison is performed according to the
3738 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3739 *----------------------------------------------------------------------------*/
3741 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
3745 a
= float64_squash_input_denormal(a STATUS_VAR
);
3746 b
= float64_squash_input_denormal(b STATUS_VAR
);
3748 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3749 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3751 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3752 float_raise( float_flag_invalid STATUS_VAR
);
3756 aSign
= extractFloat64Sign( a
);
3757 bSign
= extractFloat64Sign( b
);
3758 av
= float64_val(a
);
3759 bv
= float64_val(b
);
3760 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
3761 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
3765 /*----------------------------------------------------------------------------
3766 | Returns 1 if the double-precision floating-point value `a' is less than
3767 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3768 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3769 | Standard for Binary Floating-Point Arithmetic.
3770 *----------------------------------------------------------------------------*/
3772 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
3776 a
= float64_squash_input_denormal(a STATUS_VAR
);
3777 b
= float64_squash_input_denormal(b STATUS_VAR
);
3779 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3780 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3782 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3783 float_raise( float_flag_invalid STATUS_VAR
);
3787 aSign
= extractFloat64Sign( a
);
3788 bSign
= extractFloat64Sign( b
);
3789 av
= float64_val(a
);
3790 bv
= float64_val(b
);
3791 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
3792 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
3796 /*----------------------------------------------------------------------------
3797 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
3798 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3799 | comparison is performed according to the IEC/IEEE Standard for Binary
3800 | Floating-Point Arithmetic.
3801 *----------------------------------------------------------------------------*/
3803 int float64_unordered_quiet( float64 a
, float64 b STATUS_PARAM
)
3805 a
= float64_squash_input_denormal(a STATUS_VAR
);
3806 b
= float64_squash_input_denormal(b STATUS_VAR
);
3808 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3809 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3811 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3812 float_raise( float_flag_invalid STATUS_VAR
);
3821 /*----------------------------------------------------------------------------
3822 | Returns the result of converting the extended double-precision floating-
3823 | point value `a' to the 32-bit two's complement integer format. The
3824 | conversion is performed according to the IEC/IEEE Standard for Binary
3825 | Floating-Point Arithmetic---which means in particular that the conversion
3826 | is rounded according to the current rounding mode. If `a' is a NaN, the
3827 | largest positive integer is returned. Otherwise, if the conversion
3828 | overflows, the largest integer with the same sign as `a' is returned.
3829 *----------------------------------------------------------------------------*/
3831 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
3834 int32 aExp
, shiftCount
;
3837 aSig
= extractFloatx80Frac( a
);
3838 aExp
= extractFloatx80Exp( a
);
3839 aSign
= extractFloatx80Sign( a
);
3840 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
3841 shiftCount
= 0x4037 - aExp
;
3842 if ( shiftCount
<= 0 ) shiftCount
= 1;
3843 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3844 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
3848 /*----------------------------------------------------------------------------
3849 | Returns the result of converting the extended double-precision floating-
3850 | point value `a' to the 32-bit two's complement integer format. The
3851 | conversion is performed according to the IEC/IEEE Standard for Binary
3852 | Floating-Point Arithmetic, except that the conversion is always rounded
3853 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3854 | Otherwise, if the conversion overflows, the largest integer with the same
3855 | sign as `a' is returned.
3856 *----------------------------------------------------------------------------*/
3858 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
3861 int32 aExp
, shiftCount
;
3862 uint64_t aSig
, savedASig
;
3865 aSig
= extractFloatx80Frac( a
);
3866 aExp
= extractFloatx80Exp( a
);
3867 aSign
= extractFloatx80Sign( a
);
3868 if ( 0x401E < aExp
) {
3869 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
3872 else if ( aExp
< 0x3FFF ) {
3873 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3876 shiftCount
= 0x403E - aExp
;
3878 aSig
>>= shiftCount
;
3880 if ( aSign
) z
= - z
;
3881 if ( ( z
< 0 ) ^ aSign
) {
3883 float_raise( float_flag_invalid STATUS_VAR
);
3884 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
3886 if ( ( aSig
<<shiftCount
) != savedASig
) {
3887 STATUS(float_exception_flags
) |= float_flag_inexact
;
3893 /*----------------------------------------------------------------------------
3894 | Returns the result of converting the extended double-precision floating-
3895 | point value `a' to the 64-bit two's complement integer format. The
3896 | conversion is performed according to the IEC/IEEE Standard for Binary
3897 | Floating-Point Arithmetic---which means in particular that the conversion
3898 | is rounded according to the current rounding mode. If `a' is a NaN,
3899 | the largest positive integer is returned. Otherwise, if the conversion
3900 | overflows, the largest integer with the same sign as `a' is returned.
3901 *----------------------------------------------------------------------------*/
3903 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
3906 int32 aExp
, shiftCount
;
3907 uint64_t aSig
, aSigExtra
;
3909 aSig
= extractFloatx80Frac( a
);
3910 aExp
= extractFloatx80Exp( a
);
3911 aSign
= extractFloatx80Sign( a
);
3912 shiftCount
= 0x403E - aExp
;
3913 if ( shiftCount
<= 0 ) {
3915 float_raise( float_flag_invalid STATUS_VAR
);
3917 || ( ( aExp
== 0x7FFF )
3918 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3920 return LIT64( 0x7FFFFFFFFFFFFFFF );
3922 return (int64_t) LIT64( 0x8000000000000000 );
3927 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3929 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
3933 /*----------------------------------------------------------------------------
3934 | Returns the result of converting the extended double-precision floating-
3935 | point value `a' to the 64-bit two's complement integer format. The
3936 | conversion is performed according to the IEC/IEEE Standard for Binary
3937 | Floating-Point Arithmetic, except that the conversion is always rounded
3938 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3939 | Otherwise, if the conversion overflows, the largest integer with the same
3940 | sign as `a' is returned.
3941 *----------------------------------------------------------------------------*/
3943 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
3946 int32 aExp
, shiftCount
;
3950 aSig
= extractFloatx80Frac( a
);
3951 aExp
= extractFloatx80Exp( a
);
3952 aSign
= extractFloatx80Sign( a
);
3953 shiftCount
= aExp
- 0x403E;
3954 if ( 0 <= shiftCount
) {
3955 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3956 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3957 float_raise( float_flag_invalid STATUS_VAR
);
3958 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3959 return LIT64( 0x7FFFFFFFFFFFFFFF );
3962 return (int64_t) LIT64( 0x8000000000000000 );
3964 else if ( aExp
< 0x3FFF ) {
3965 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3968 z
= aSig
>>( - shiftCount
);
3969 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3970 STATUS(float_exception_flags
) |= float_flag_inexact
;
3972 if ( aSign
) z
= - z
;
3977 /*----------------------------------------------------------------------------
3978 | Returns the result of converting the extended double-precision floating-
3979 | point value `a' to the single-precision floating-point format. The
3980 | conversion is performed according to the IEC/IEEE Standard for Binary
3981 | Floating-Point Arithmetic.
3982 *----------------------------------------------------------------------------*/
3984 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
3990 aSig
= extractFloatx80Frac( a
);
3991 aExp
= extractFloatx80Exp( a
);
3992 aSign
= extractFloatx80Sign( a
);
3993 if ( aExp
== 0x7FFF ) {
3994 if ( (uint64_t) ( aSig
<<1 ) ) {
3995 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3997 return packFloat32( aSign
, 0xFF, 0 );
3999 shift64RightJamming( aSig
, 33, &aSig
);
4000 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4001 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
4005 /*----------------------------------------------------------------------------
4006 | Returns the result of converting the extended double-precision floating-
4007 | point value `a' to the double-precision floating-point format. The
4008 | conversion is performed according to the IEC/IEEE Standard for Binary
4009 | Floating-Point Arithmetic.
4010 *----------------------------------------------------------------------------*/
4012 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
4016 uint64_t aSig
, zSig
;
4018 aSig
= extractFloatx80Frac( a
);
4019 aExp
= extractFloatx80Exp( a
);
4020 aSign
= extractFloatx80Sign( a
);
4021 if ( aExp
== 0x7FFF ) {
4022 if ( (uint64_t) ( aSig
<<1 ) ) {
4023 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4025 return packFloat64( aSign
, 0x7FF, 0 );
4027 shift64RightJamming( aSig
, 1, &zSig
);
4028 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4029 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
4035 /*----------------------------------------------------------------------------
4036 | Returns the result of converting the extended double-precision floating-
4037 | point value `a' to the quadruple-precision floating-point format. The
4038 | conversion is performed according to the IEC/IEEE Standard for Binary
4039 | Floating-Point Arithmetic.
4040 *----------------------------------------------------------------------------*/
4042 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
4046 uint64_t aSig
, zSig0
, zSig1
;
4048 aSig
= extractFloatx80Frac( a
);
4049 aExp
= extractFloatx80Exp( a
);
4050 aSign
= extractFloatx80Sign( a
);
4051 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4052 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4054 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4055 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4061 /*----------------------------------------------------------------------------
4062 | Rounds the extended double-precision floating-point value `a' to an integer,
4063 | and returns the result as an extended quadruple-precision floating-point
4064 | value. The operation is performed according to the IEC/IEEE Standard for
4065 | Binary Floating-Point Arithmetic.
4066 *----------------------------------------------------------------------------*/
4068 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
4072 uint64_t lastBitMask
, roundBitsMask
;
4076 aExp
= extractFloatx80Exp( a
);
4077 if ( 0x403E <= aExp
) {
4078 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4079 return propagateFloatx80NaN( a
, a STATUS_VAR
);
4083 if ( aExp
< 0x3FFF ) {
4085 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4088 STATUS(float_exception_flags
) |= float_flag_inexact
;
4089 aSign
= extractFloatx80Sign( a
);
4090 switch ( STATUS(float_rounding_mode
) ) {
4091 case float_round_nearest_even
:
4092 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4095 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4098 case float_round_down
:
4101 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4102 : packFloatx80( 0, 0, 0 );
4103 case float_round_up
:
4105 aSign
? packFloatx80( 1, 0, 0 )
4106 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4108 return packFloatx80( aSign
, 0, 0 );
4111 lastBitMask
<<= 0x403E - aExp
;
4112 roundBitsMask
= lastBitMask
- 1;
4114 roundingMode
= STATUS(float_rounding_mode
);
4115 if ( roundingMode
== float_round_nearest_even
) {
4116 z
.low
+= lastBitMask
>>1;
4117 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4119 else if ( roundingMode
!= float_round_to_zero
) {
4120 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
4121 z
.low
+= roundBitsMask
;
4124 z
.low
&= ~ roundBitsMask
;
4127 z
.low
= LIT64( 0x8000000000000000 );
4129 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4134 /*----------------------------------------------------------------------------
4135 | Returns the result of adding the absolute values of the extended double-
4136 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4137 | negated before being returned. `zSign' is ignored if the result is a NaN.
4138 | The addition is performed according to the IEC/IEEE Standard for Binary
4139 | Floating-Point Arithmetic.
4140 *----------------------------------------------------------------------------*/
4142 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4144 int32 aExp
, bExp
, zExp
;
4145 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4148 aSig
= extractFloatx80Frac( a
);
4149 aExp
= extractFloatx80Exp( a
);
4150 bSig
= extractFloatx80Frac( b
);
4151 bExp
= extractFloatx80Exp( b
);
4152 expDiff
= aExp
- bExp
;
4153 if ( 0 < expDiff
) {
4154 if ( aExp
== 0x7FFF ) {
4155 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4158 if ( bExp
== 0 ) --expDiff
;
4159 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4162 else if ( expDiff
< 0 ) {
4163 if ( bExp
== 0x7FFF ) {
4164 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4165 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4167 if ( aExp
== 0 ) ++expDiff
;
4168 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4172 if ( aExp
== 0x7FFF ) {
4173 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4174 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4179 zSig0
= aSig
+ bSig
;
4181 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4187 zSig0
= aSig
+ bSig
;
4188 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4190 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4191 zSig0
|= LIT64( 0x8000000000000000 );
4195 roundAndPackFloatx80(
4196 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4200 /*----------------------------------------------------------------------------
4201 | Returns the result of subtracting the absolute values of the extended
4202 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4203 | difference is negated before being returned. `zSign' is ignored if the
4204 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4205 | Standard for Binary Floating-Point Arithmetic.
4206 *----------------------------------------------------------------------------*/
4208 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4210 int32 aExp
, bExp
, zExp
;
4211 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4215 aSig
= extractFloatx80Frac( a
);
4216 aExp
= extractFloatx80Exp( a
);
4217 bSig
= extractFloatx80Frac( b
);
4218 bExp
= extractFloatx80Exp( b
);
4219 expDiff
= aExp
- bExp
;
4220 if ( 0 < expDiff
) goto aExpBigger
;
4221 if ( expDiff
< 0 ) goto bExpBigger
;
4222 if ( aExp
== 0x7FFF ) {
4223 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4224 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4226 float_raise( float_flag_invalid STATUS_VAR
);
4227 z
.low
= floatx80_default_nan_low
;
4228 z
.high
= floatx80_default_nan_high
;
4236 if ( bSig
< aSig
) goto aBigger
;
4237 if ( aSig
< bSig
) goto bBigger
;
4238 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
4240 if ( bExp
== 0x7FFF ) {
4241 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4242 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4244 if ( aExp
== 0 ) ++expDiff
;
4245 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4247 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4250 goto normalizeRoundAndPack
;
4252 if ( aExp
== 0x7FFF ) {
4253 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4256 if ( bExp
== 0 ) --expDiff
;
4257 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4259 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4261 normalizeRoundAndPack
:
4263 normalizeRoundAndPackFloatx80(
4264 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4268 /*----------------------------------------------------------------------------
4269 | Returns the result of adding the extended double-precision floating-point
4270 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4271 | Standard for Binary Floating-Point Arithmetic.
4272 *----------------------------------------------------------------------------*/
4274 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
4278 aSign
= extractFloatx80Sign( a
);
4279 bSign
= extractFloatx80Sign( b
);
4280 if ( aSign
== bSign
) {
4281 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4284 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4289 /*----------------------------------------------------------------------------
4290 | Returns the result of subtracting the extended double-precision floating-
4291 | point values `a' and `b'. The operation is performed according to the
4292 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4293 *----------------------------------------------------------------------------*/
4295 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
4299 aSign
= extractFloatx80Sign( a
);
4300 bSign
= extractFloatx80Sign( b
);
4301 if ( aSign
== bSign
) {
4302 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4305 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4310 /*----------------------------------------------------------------------------
4311 | Returns the result of multiplying the extended double-precision floating-
4312 | point values `a' and `b'. The operation is performed according to the
4313 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4314 *----------------------------------------------------------------------------*/
4316 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
4318 flag aSign
, bSign
, zSign
;
4319 int32 aExp
, bExp
, zExp
;
4320 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4323 aSig
= extractFloatx80Frac( a
);
4324 aExp
= extractFloatx80Exp( a
);
4325 aSign
= extractFloatx80Sign( a
);
4326 bSig
= extractFloatx80Frac( b
);
4327 bExp
= extractFloatx80Exp( b
);
4328 bSign
= extractFloatx80Sign( b
);
4329 zSign
= aSign
^ bSign
;
4330 if ( aExp
== 0x7FFF ) {
4331 if ( (uint64_t) ( aSig
<<1 )
4332 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
4333 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4335 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
4336 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4338 if ( bExp
== 0x7FFF ) {
4339 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4340 if ( ( aExp
| aSig
) == 0 ) {
4342 float_raise( float_flag_invalid STATUS_VAR
);
4343 z
.low
= floatx80_default_nan_low
;
4344 z
.high
= floatx80_default_nan_high
;
4347 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4350 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4351 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4354 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4355 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4357 zExp
= aExp
+ bExp
- 0x3FFE;
4358 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4359 if ( 0 < (int64_t) zSig0
) {
4360 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4364 roundAndPackFloatx80(
4365 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4369 /*----------------------------------------------------------------------------
4370 | Returns the result of dividing the extended double-precision floating-point
4371 | value `a' by the corresponding value `b'. The operation is performed
4372 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4373 *----------------------------------------------------------------------------*/
4375 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
4377 flag aSign
, bSign
, zSign
;
4378 int32 aExp
, bExp
, zExp
;
4379 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4380 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
4383 aSig
= extractFloatx80Frac( a
);
4384 aExp
= extractFloatx80Exp( a
);
4385 aSign
= extractFloatx80Sign( a
);
4386 bSig
= extractFloatx80Frac( b
);
4387 bExp
= extractFloatx80Exp( b
);
4388 bSign
= extractFloatx80Sign( b
);
4389 zSign
= aSign
^ bSign
;
4390 if ( aExp
== 0x7FFF ) {
4391 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4392 if ( bExp
== 0x7FFF ) {
4393 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4396 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4398 if ( bExp
== 0x7FFF ) {
4399 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4400 return packFloatx80( zSign
, 0, 0 );
4404 if ( ( aExp
| aSig
) == 0 ) {
4406 float_raise( float_flag_invalid STATUS_VAR
);
4407 z
.low
= floatx80_default_nan_low
;
4408 z
.high
= floatx80_default_nan_high
;
4411 float_raise( float_flag_divbyzero STATUS_VAR
);
4412 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4414 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4417 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4418 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4420 zExp
= aExp
- bExp
+ 0x3FFE;
4422 if ( bSig
<= aSig
) {
4423 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
4426 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
4427 mul64To128( bSig
, zSig0
, &term0
, &term1
);
4428 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
4429 while ( (int64_t) rem0
< 0 ) {
4431 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
4433 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
4434 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
4435 mul64To128( bSig
, zSig1
, &term1
, &term2
);
4436 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4437 while ( (int64_t) rem1
< 0 ) {
4439 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
4441 zSig1
|= ( ( rem1
| rem2
) != 0 );
4444 roundAndPackFloatx80(
4445 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4449 /*----------------------------------------------------------------------------
4450 | Returns the remainder of the extended double-precision floating-point value
4451 | `a' with respect to the corresponding value `b'. The operation is performed
4452 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4453 *----------------------------------------------------------------------------*/
4455 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
4458 int32 aExp
, bExp
, expDiff
;
4459 uint64_t aSig0
, aSig1
, bSig
;
4460 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
4463 aSig0
= extractFloatx80Frac( a
);
4464 aExp
= extractFloatx80Exp( a
);
4465 aSign
= extractFloatx80Sign( a
);
4466 bSig
= extractFloatx80Frac( b
);
4467 bExp
= extractFloatx80Exp( b
);
4468 if ( aExp
== 0x7FFF ) {
4469 if ( (uint64_t) ( aSig0
<<1 )
4470 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
4471 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4475 if ( bExp
== 0x7FFF ) {
4476 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4482 float_raise( float_flag_invalid STATUS_VAR
);
4483 z
.low
= floatx80_default_nan_low
;
4484 z
.high
= floatx80_default_nan_high
;
4487 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4490 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
4491 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4493 bSig
|= LIT64( 0x8000000000000000 );
4495 expDiff
= aExp
- bExp
;
4497 if ( expDiff
< 0 ) {
4498 if ( expDiff
< -1 ) return a
;
4499 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
4502 q
= ( bSig
<= aSig0
);
4503 if ( q
) aSig0
-= bSig
;
4505 while ( 0 < expDiff
) {
4506 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4507 q
= ( 2 < q
) ? q
- 2 : 0;
4508 mul64To128( bSig
, q
, &term0
, &term1
);
4509 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4510 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
4514 if ( 0 < expDiff
) {
4515 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
4516 q
= ( 2 < q
) ? q
- 2 : 0;
4518 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
4519 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4520 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
4521 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
4523 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
4530 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
4531 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4532 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
4535 aSig0
= alternateASig0
;
4536 aSig1
= alternateASig1
;
4540 normalizeRoundAndPackFloatx80(
4541 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
4545 /*----------------------------------------------------------------------------
4546 | Returns the square root of the extended double-precision floating-point
4547 | value `a'. The operation is performed according to the IEC/IEEE Standard
4548 | for Binary Floating-Point Arithmetic.
4549 *----------------------------------------------------------------------------*/
4551 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
4555 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
4556 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4559 aSig0
= extractFloatx80Frac( a
);
4560 aExp
= extractFloatx80Exp( a
);
4561 aSign
= extractFloatx80Sign( a
);
4562 if ( aExp
== 0x7FFF ) {
4563 if ( (uint64_t) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
4564 if ( ! aSign
) return a
;
4568 if ( ( aExp
| aSig0
) == 0 ) return a
;
4570 float_raise( float_flag_invalid STATUS_VAR
);
4571 z
.low
= floatx80_default_nan_low
;
4572 z
.high
= floatx80_default_nan_high
;
4576 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
4577 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
4579 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
4580 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
4581 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
4582 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4583 doubleZSig0
= zSig0
<<1;
4584 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4585 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4586 while ( (int64_t) rem0
< 0 ) {
4589 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4591 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4592 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
4593 if ( zSig1
== 0 ) zSig1
= 1;
4594 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4595 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4596 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4597 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4598 while ( (int64_t) rem1
< 0 ) {
4600 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4602 term2
|= doubleZSig0
;
4603 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4605 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4607 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
4608 zSig0
|= doubleZSig0
;
4610 roundAndPackFloatx80(
4611 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
4615 /*----------------------------------------------------------------------------
4616 | Returns 1 if the extended double-precision floating-point value `a' is equal
4617 | to the corresponding value `b', and 0 otherwise. The invalid exception is
4618 | raised if either operand is a NaN. Otherwise, the comparison is performed
4619 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4620 *----------------------------------------------------------------------------*/
4622 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
4625 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4626 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4627 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4628 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4630 float_raise( float_flag_invalid STATUS_VAR
);
4635 && ( ( a
.high
== b
.high
)
4637 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4642 /*----------------------------------------------------------------------------
4643 | Returns 1 if the extended double-precision floating-point value `a' is
4644 | less than or equal to the corresponding value `b', and 0 otherwise. The
4645 | invalid exception is raised if either operand is a NaN. The comparison is
4646 | performed according to the IEC/IEEE Standard for Binary Floating-Point
4648 *----------------------------------------------------------------------------*/
4650 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
4654 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4655 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4656 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4657 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4659 float_raise( float_flag_invalid STATUS_VAR
);
4662 aSign
= extractFloatx80Sign( a
);
4663 bSign
= extractFloatx80Sign( b
);
4664 if ( aSign
!= bSign
) {
4667 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4671 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4672 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4676 /*----------------------------------------------------------------------------
4677 | Returns 1 if the extended double-precision floating-point value `a' is
4678 | less than the corresponding value `b', and 0 otherwise. The invalid
4679 | exception is raised if either operand is a NaN. The comparison is performed
4680 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4681 *----------------------------------------------------------------------------*/
4683 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
4687 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4688 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4689 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4690 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4692 float_raise( float_flag_invalid STATUS_VAR
);
4695 aSign
= extractFloatx80Sign( a
);
4696 bSign
= extractFloatx80Sign( b
);
4697 if ( aSign
!= bSign
) {
4700 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4704 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4705 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4709 /*----------------------------------------------------------------------------
4710 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4711 | cannot be compared, and 0 otherwise. The invalid exception is raised if
4712 | either operand is a NaN. The comparison is performed according to the
4713 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4714 *----------------------------------------------------------------------------*/
4715 int floatx80_unordered( floatx80 a
, floatx80 b STATUS_PARAM
)
4717 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4718 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4719 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4720 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4722 float_raise( float_flag_invalid STATUS_VAR
);
4728 /*----------------------------------------------------------------------------
4729 | Returns 1 if the extended double-precision floating-point value `a' is
4730 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4731 | cause an exception. The comparison is performed according to the IEC/IEEE
4732 | Standard for Binary Floating-Point Arithmetic.
4733 *----------------------------------------------------------------------------*/
4735 int floatx80_eq_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4738 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4739 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4740 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4741 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4743 if ( floatx80_is_signaling_nan( a
)
4744 || floatx80_is_signaling_nan( b
) ) {
4745 float_raise( float_flag_invalid STATUS_VAR
);
4751 && ( ( a
.high
== b
.high
)
4753 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
4758 /*----------------------------------------------------------------------------
4759 | Returns 1 if the extended double-precision floating-point value `a' is less
4760 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
4761 | do not cause an exception. Otherwise, the comparison is performed according
4762 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4763 *----------------------------------------------------------------------------*/
4765 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4769 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4770 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4771 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4772 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4774 if ( floatx80_is_signaling_nan( a
)
4775 || floatx80_is_signaling_nan( b
) ) {
4776 float_raise( float_flag_invalid STATUS_VAR
);
4780 aSign
= extractFloatx80Sign( a
);
4781 bSign
= extractFloatx80Sign( b
);
4782 if ( aSign
!= bSign
) {
4785 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4789 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4790 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4794 /*----------------------------------------------------------------------------
4795 | Returns 1 if the extended double-precision floating-point value `a' is less
4796 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4797 | an exception. Otherwise, the comparison is performed according to the
4798 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4799 *----------------------------------------------------------------------------*/
4801 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4805 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4806 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4807 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4808 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4810 if ( floatx80_is_signaling_nan( a
)
4811 || floatx80_is_signaling_nan( b
) ) {
4812 float_raise( float_flag_invalid STATUS_VAR
);
4816 aSign
= extractFloatx80Sign( a
);
4817 bSign
= extractFloatx80Sign( b
);
4818 if ( aSign
!= bSign
) {
4821 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4825 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4826 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4830 /*----------------------------------------------------------------------------
4831 | Returns 1 if the extended double-precision floating-point values `a' and `b'
4832 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
4833 | The comparison is performed according to the IEC/IEEE Standard for Binary
4834 | Floating-Point Arithmetic.
4835 *----------------------------------------------------------------------------*/
4836 int floatx80_unordered_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
4838 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4839 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
4840 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4841 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
4843 if ( floatx80_is_signaling_nan( a
)
4844 || floatx80_is_signaling_nan( b
) ) {
4845 float_raise( float_flag_invalid STATUS_VAR
);
4856 /*----------------------------------------------------------------------------
4857 | Returns the result of converting the quadruple-precision floating-point
4858 | value `a' to the 32-bit two's complement integer format. The conversion
4859 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4860 | Arithmetic---which means in particular that the conversion is rounded
4861 | according to the current rounding mode. If `a' is a NaN, the largest
4862 | positive integer is returned. Otherwise, if the conversion overflows, the
4863 | largest integer with the same sign as `a' is returned.
4864 *----------------------------------------------------------------------------*/
4866 int32
float128_to_int32( float128 a STATUS_PARAM
)
4869 int32 aExp
, shiftCount
;
4870 uint64_t aSig0
, aSig1
;
4872 aSig1
= extractFloat128Frac1( a
);
4873 aSig0
= extractFloat128Frac0( a
);
4874 aExp
= extractFloat128Exp( a
);
4875 aSign
= extractFloat128Sign( a
);
4876 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4877 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4878 aSig0
|= ( aSig1
!= 0 );
4879 shiftCount
= 0x4028 - aExp
;
4880 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4881 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
4885 /*----------------------------------------------------------------------------
4886 | Returns the result of converting the quadruple-precision floating-point
4887 | value `a' to the 32-bit two's complement integer format. The conversion
4888 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4889 | Arithmetic, except that the conversion is always rounded toward zero. If
4890 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4891 | conversion overflows, the largest integer with the same sign as `a' is
4893 *----------------------------------------------------------------------------*/
4895 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
4898 int32 aExp
, shiftCount
;
4899 uint64_t aSig0
, aSig1
, savedASig
;
4902 aSig1
= extractFloat128Frac1( a
);
4903 aSig0
= extractFloat128Frac0( a
);
4904 aExp
= extractFloat128Exp( a
);
4905 aSign
= extractFloat128Sign( a
);
4906 aSig0
|= ( aSig1
!= 0 );
4907 if ( 0x401E < aExp
) {
4908 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4911 else if ( aExp
< 0x3FFF ) {
4912 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4915 aSig0
|= LIT64( 0x0001000000000000 );
4916 shiftCount
= 0x402F - aExp
;
4918 aSig0
>>= shiftCount
;
4920 if ( aSign
) z
= - z
;
4921 if ( ( z
< 0 ) ^ aSign
) {
4923 float_raise( float_flag_invalid STATUS_VAR
);
4924 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4926 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4927 STATUS(float_exception_flags
) |= float_flag_inexact
;
4933 /*----------------------------------------------------------------------------
4934 | Returns the result of converting the quadruple-precision floating-point
4935 | value `a' to the 64-bit two's complement integer format. The conversion
4936 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4937 | Arithmetic---which means in particular that the conversion is rounded
4938 | according to the current rounding mode. If `a' is a NaN, the largest
4939 | positive integer is returned. Otherwise, if the conversion overflows, the
4940 | largest integer with the same sign as `a' is returned.
4941 *----------------------------------------------------------------------------*/
4943 int64
float128_to_int64( float128 a STATUS_PARAM
)
4946 int32 aExp
, shiftCount
;
4947 uint64_t aSig0
, aSig1
;
4949 aSig1
= extractFloat128Frac1( a
);
4950 aSig0
= extractFloat128Frac0( a
);
4951 aExp
= extractFloat128Exp( a
);
4952 aSign
= extractFloat128Sign( a
);
4953 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4954 shiftCount
= 0x402F - aExp
;
4955 if ( shiftCount
<= 0 ) {
4956 if ( 0x403E < aExp
) {
4957 float_raise( float_flag_invalid STATUS_VAR
);
4959 || ( ( aExp
== 0x7FFF )
4960 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4963 return LIT64( 0x7FFFFFFFFFFFFFFF );
4965 return (int64_t) LIT64( 0x8000000000000000 );
4967 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4970 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4972 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
4976 /*----------------------------------------------------------------------------
4977 | Returns the result of converting the quadruple-precision floating-point
4978 | value `a' to the 64-bit two's complement integer format. The conversion
4979 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4980 | Arithmetic, except that the conversion is always rounded toward zero.
4981 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4982 | the conversion overflows, the largest integer with the same sign as `a' is
4984 *----------------------------------------------------------------------------*/
4986 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
4989 int32 aExp
, shiftCount
;
4990 uint64_t aSig0
, aSig1
;
4993 aSig1
= extractFloat128Frac1( a
);
4994 aSig0
= extractFloat128Frac0( a
);
4995 aExp
= extractFloat128Exp( a
);
4996 aSign
= extractFloat128Sign( a
);
4997 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4998 shiftCount
= aExp
- 0x402F;
4999 if ( 0 < shiftCount
) {
5000 if ( 0x403E <= aExp
) {
5001 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5002 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5003 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5004 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
5007 float_raise( float_flag_invalid STATUS_VAR
);
5008 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5009 return LIT64( 0x7FFFFFFFFFFFFFFF );
5012 return (int64_t) LIT64( 0x8000000000000000 );
5014 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5015 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5016 STATUS(float_exception_flags
) |= float_flag_inexact
;
5020 if ( aExp
< 0x3FFF ) {
5021 if ( aExp
| aSig0
| aSig1
) {
5022 STATUS(float_exception_flags
) |= float_flag_inexact
;
5026 z
= aSig0
>>( - shiftCount
);
5028 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5029 STATUS(float_exception_flags
) |= float_flag_inexact
;
5032 if ( aSign
) z
= - z
;
5037 /*----------------------------------------------------------------------------
5038 | Returns the result of converting the quadruple-precision floating-point
5039 | value `a' to the single-precision floating-point format. The conversion
5040 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5042 *----------------------------------------------------------------------------*/
5044 float32
float128_to_float32( float128 a STATUS_PARAM
)
5048 uint64_t aSig0
, aSig1
;
5051 aSig1
= extractFloat128Frac1( a
);
5052 aSig0
= extractFloat128Frac0( a
);
5053 aExp
= extractFloat128Exp( a
);
5054 aSign
= extractFloat128Sign( a
);
5055 if ( aExp
== 0x7FFF ) {
5056 if ( aSig0
| aSig1
) {
5057 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5059 return packFloat32( aSign
, 0xFF, 0 );
5061 aSig0
|= ( aSig1
!= 0 );
5062 shift64RightJamming( aSig0
, 18, &aSig0
);
5064 if ( aExp
|| zSig
) {
5068 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
5072 /*----------------------------------------------------------------------------
5073 | Returns the result of converting the quadruple-precision floating-point
5074 | value `a' to the double-precision floating-point format. The conversion
5075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5077 *----------------------------------------------------------------------------*/
5079 float64
float128_to_float64( float128 a STATUS_PARAM
)
5083 uint64_t aSig0
, aSig1
;
5085 aSig1
= extractFloat128Frac1( a
);
5086 aSig0
= extractFloat128Frac0( a
);
5087 aExp
= extractFloat128Exp( a
);
5088 aSign
= extractFloat128Sign( a
);
5089 if ( aExp
== 0x7FFF ) {
5090 if ( aSig0
| aSig1
) {
5091 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5093 return packFloat64( aSign
, 0x7FF, 0 );
5095 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5096 aSig0
|= ( aSig1
!= 0 );
5097 if ( aExp
|| aSig0
) {
5098 aSig0
|= LIT64( 0x4000000000000000 );
5101 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
5107 /*----------------------------------------------------------------------------
5108 | Returns the result of converting the quadruple-precision floating-point
5109 | value `a' to the extended double-precision floating-point format. The
5110 | conversion is performed according to the IEC/IEEE Standard for Binary
5111 | Floating-Point Arithmetic.
5112 *----------------------------------------------------------------------------*/
5114 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
5118 uint64_t aSig0
, aSig1
;
5120 aSig1
= extractFloat128Frac1( a
);
5121 aSig0
= extractFloat128Frac0( a
);
5122 aExp
= extractFloat128Exp( a
);
5123 aSign
= extractFloat128Sign( a
);
5124 if ( aExp
== 0x7FFF ) {
5125 if ( aSig0
| aSig1
) {
5126 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5128 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5131 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5132 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5135 aSig0
|= LIT64( 0x0001000000000000 );
5137 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5138 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
5144 /*----------------------------------------------------------------------------
5145 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5146 | returns the result as a quadruple-precision floating-point value. The
5147 | operation is performed according to the IEC/IEEE Standard for Binary
5148 | Floating-Point Arithmetic.
5149 *----------------------------------------------------------------------------*/
5151 float128
float128_round_to_int( float128 a STATUS_PARAM
)
5155 uint64_t lastBitMask
, roundBitsMask
;
5159 aExp
= extractFloat128Exp( a
);
5160 if ( 0x402F <= aExp
) {
5161 if ( 0x406F <= aExp
) {
5162 if ( ( aExp
== 0x7FFF )
5163 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5165 return propagateFloat128NaN( a
, a STATUS_VAR
);
5170 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
5171 roundBitsMask
= lastBitMask
- 1;
5173 roundingMode
= STATUS(float_rounding_mode
);
5174 if ( roundingMode
== float_round_nearest_even
) {
5175 if ( lastBitMask
) {
5176 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
5177 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
5180 if ( (int64_t) z
.low
< 0 ) {
5182 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
5186 else if ( roundingMode
!= float_round_to_zero
) {
5187 if ( extractFloat128Sign( z
)
5188 ^ ( roundingMode
== float_round_up
) ) {
5189 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
5192 z
.low
&= ~ roundBitsMask
;
5195 if ( aExp
< 0x3FFF ) {
5196 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
5197 STATUS(float_exception_flags
) |= float_flag_inexact
;
5198 aSign
= extractFloat128Sign( a
);
5199 switch ( STATUS(float_rounding_mode
) ) {
5200 case float_round_nearest_even
:
5201 if ( ( aExp
== 0x3FFE )
5202 && ( extractFloat128Frac0( a
)
5203 | extractFloat128Frac1( a
) )
5205 return packFloat128( aSign
, 0x3FFF, 0, 0 );
5208 case float_round_down
:
5210 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
5211 : packFloat128( 0, 0, 0, 0 );
5212 case float_round_up
:
5214 aSign
? packFloat128( 1, 0, 0, 0 )
5215 : packFloat128( 0, 0x3FFF, 0, 0 );
5217 return packFloat128( aSign
, 0, 0, 0 );
5220 lastBitMask
<<= 0x402F - aExp
;
5221 roundBitsMask
= lastBitMask
- 1;
5224 roundingMode
= STATUS(float_rounding_mode
);
5225 if ( roundingMode
== float_round_nearest_even
) {
5226 z
.high
+= lastBitMask
>>1;
5227 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
5228 z
.high
&= ~ lastBitMask
;
5231 else if ( roundingMode
!= float_round_to_zero
) {
5232 if ( extractFloat128Sign( z
)
5233 ^ ( roundingMode
== float_round_up
) ) {
5234 z
.high
|= ( a
.low
!= 0 );
5235 z
.high
+= roundBitsMask
;
5238 z
.high
&= ~ roundBitsMask
;
5240 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
5241 STATUS(float_exception_flags
) |= float_flag_inexact
;
5247 /*----------------------------------------------------------------------------
5248 | Returns the result of adding the absolute values of the quadruple-precision
5249 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5250 | before being returned. `zSign' is ignored if the result is a NaN.
5251 | The addition is performed according to the IEC/IEEE Standard for Binary
5252 | Floating-Point Arithmetic.
5253 *----------------------------------------------------------------------------*/
5255 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5257 int32 aExp
, bExp
, zExp
;
5258 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5261 aSig1
= extractFloat128Frac1( a
);
5262 aSig0
= extractFloat128Frac0( a
);
5263 aExp
= extractFloat128Exp( a
);
5264 bSig1
= extractFloat128Frac1( b
);
5265 bSig0
= extractFloat128Frac0( b
);
5266 bExp
= extractFloat128Exp( b
);
5267 expDiff
= aExp
- bExp
;
5268 if ( 0 < expDiff
) {
5269 if ( aExp
== 0x7FFF ) {
5270 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5277 bSig0
|= LIT64( 0x0001000000000000 );
5279 shift128ExtraRightJamming(
5280 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
5283 else if ( expDiff
< 0 ) {
5284 if ( bExp
== 0x7FFF ) {
5285 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5286 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5292 aSig0
|= LIT64( 0x0001000000000000 );
5294 shift128ExtraRightJamming(
5295 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
5299 if ( aExp
== 0x7FFF ) {
5300 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5301 return propagateFloat128NaN( a
, b STATUS_VAR
);
5305 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5307 if (STATUS(flush_to_zero
)) {
5308 if (zSig0
| zSig1
) {
5309 float_raise(float_flag_output_denormal STATUS_VAR
);
5311 return packFloat128(zSign
, 0, 0, 0);
5313 return packFloat128( zSign
, 0, zSig0
, zSig1
);
5316 zSig0
|= LIT64( 0x0002000000000000 );
5320 aSig0
|= LIT64( 0x0001000000000000 );
5321 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5323 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
5326 shift128ExtraRightJamming(
5327 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5329 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5333 /*----------------------------------------------------------------------------
5334 | Returns the result of subtracting the absolute values of the quadruple-
5335 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5336 | difference is negated before being returned. `zSign' is ignored if the
5337 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5338 | Standard for Binary Floating-Point Arithmetic.
5339 *----------------------------------------------------------------------------*/
5341 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5343 int32 aExp
, bExp
, zExp
;
5344 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
5348 aSig1
= extractFloat128Frac1( a
);
5349 aSig0
= extractFloat128Frac0( a
);
5350 aExp
= extractFloat128Exp( a
);
5351 bSig1
= extractFloat128Frac1( b
);
5352 bSig0
= extractFloat128Frac0( b
);
5353 bExp
= extractFloat128Exp( b
);
5354 expDiff
= aExp
- bExp
;
5355 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5356 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
5357 if ( 0 < expDiff
) goto aExpBigger
;
5358 if ( expDiff
< 0 ) goto bExpBigger
;
5359 if ( aExp
== 0x7FFF ) {
5360 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5361 return propagateFloat128NaN( a
, b STATUS_VAR
);
5363 float_raise( float_flag_invalid STATUS_VAR
);
5364 z
.low
= float128_default_nan_low
;
5365 z
.high
= float128_default_nan_high
;
5372 if ( bSig0
< aSig0
) goto aBigger
;
5373 if ( aSig0
< bSig0
) goto bBigger
;
5374 if ( bSig1
< aSig1
) goto aBigger
;
5375 if ( aSig1
< bSig1
) goto bBigger
;
5376 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
5378 if ( bExp
== 0x7FFF ) {
5379 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5380 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
5386 aSig0
|= LIT64( 0x4000000000000000 );
5388 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5389 bSig0
|= LIT64( 0x4000000000000000 );
5391 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5394 goto normalizeRoundAndPack
;
5396 if ( aExp
== 0x7FFF ) {
5397 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5404 bSig0
|= LIT64( 0x4000000000000000 );
5406 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
5407 aSig0
|= LIT64( 0x4000000000000000 );
5409 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5411 normalizeRoundAndPack
:
5413 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
5417 /*----------------------------------------------------------------------------
5418 | Returns the result of adding the quadruple-precision floating-point values
5419 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
5420 | for Binary Floating-Point Arithmetic.
5421 *----------------------------------------------------------------------------*/
5423 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
5427 aSign
= extractFloat128Sign( a
);
5428 bSign
= extractFloat128Sign( b
);
5429 if ( aSign
== bSign
) {
5430 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5433 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5438 /*----------------------------------------------------------------------------
5439 | Returns the result of subtracting the quadruple-precision floating-point
5440 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5441 | Standard for Binary Floating-Point Arithmetic.
5442 *----------------------------------------------------------------------------*/
5444 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
5448 aSign
= extractFloat128Sign( a
);
5449 bSign
= extractFloat128Sign( b
);
5450 if ( aSign
== bSign
) {
5451 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5454 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
5459 /*----------------------------------------------------------------------------
5460 | Returns the result of multiplying the quadruple-precision floating-point
5461 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5462 | Standard for Binary Floating-Point Arithmetic.
5463 *----------------------------------------------------------------------------*/
5465 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
5467 flag aSign
, bSign
, zSign
;
5468 int32 aExp
, bExp
, zExp
;
5469 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
5472 aSig1
= extractFloat128Frac1( a
);
5473 aSig0
= extractFloat128Frac0( a
);
5474 aExp
= extractFloat128Exp( a
);
5475 aSign
= extractFloat128Sign( a
);
5476 bSig1
= extractFloat128Frac1( b
);
5477 bSig0
= extractFloat128Frac0( b
);
5478 bExp
= extractFloat128Exp( b
);
5479 bSign
= extractFloat128Sign( b
);
5480 zSign
= aSign
^ bSign
;
5481 if ( aExp
== 0x7FFF ) {
5482 if ( ( aSig0
| aSig1
)
5483 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5484 return propagateFloat128NaN( a
, b STATUS_VAR
);
5486 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
5487 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5489 if ( bExp
== 0x7FFF ) {
5490 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5491 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5493 float_raise( float_flag_invalid STATUS_VAR
);
5494 z
.low
= float128_default_nan_low
;
5495 z
.high
= float128_default_nan_high
;
5498 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5501 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5502 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5505 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5506 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5508 zExp
= aExp
+ bExp
- 0x4000;
5509 aSig0
|= LIT64( 0x0001000000000000 );
5510 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
5511 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
5512 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
5513 zSig2
|= ( zSig3
!= 0 );
5514 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
5515 shift128ExtraRightJamming(
5516 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5519 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5523 /*----------------------------------------------------------------------------
5524 | Returns the result of dividing the quadruple-precision floating-point value
5525 | `a' by the corresponding value `b'. The operation is performed according to
5526 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5527 *----------------------------------------------------------------------------*/
5529 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
5531 flag aSign
, bSign
, zSign
;
5532 int32 aExp
, bExp
, zExp
;
5533 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5534 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5537 aSig1
= extractFloat128Frac1( a
);
5538 aSig0
= extractFloat128Frac0( a
);
5539 aExp
= extractFloat128Exp( a
);
5540 aSign
= extractFloat128Sign( a
);
5541 bSig1
= extractFloat128Frac1( b
);
5542 bSig0
= extractFloat128Frac0( b
);
5543 bExp
= extractFloat128Exp( b
);
5544 bSign
= extractFloat128Sign( b
);
5545 zSign
= aSign
^ bSign
;
5546 if ( aExp
== 0x7FFF ) {
5547 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5548 if ( bExp
== 0x7FFF ) {
5549 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5552 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5554 if ( bExp
== 0x7FFF ) {
5555 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5556 return packFloat128( zSign
, 0, 0, 0 );
5559 if ( ( bSig0
| bSig1
) == 0 ) {
5560 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
5562 float_raise( float_flag_invalid STATUS_VAR
);
5563 z
.low
= float128_default_nan_low
;
5564 z
.high
= float128_default_nan_high
;
5567 float_raise( float_flag_divbyzero STATUS_VAR
);
5568 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5570 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5573 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
5574 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5576 zExp
= aExp
- bExp
+ 0x3FFD;
5578 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
5580 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5581 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
5582 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
5585 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5586 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
5587 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
5588 while ( (int64_t) rem0
< 0 ) {
5590 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
5592 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
5593 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
5594 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
5595 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
5596 while ( (int64_t) rem1
< 0 ) {
5598 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
5600 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5602 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
5603 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5607 /*----------------------------------------------------------------------------
5608 | Returns the remainder of the quadruple-precision floating-point value `a'
5609 | with respect to the corresponding value `b'. The operation is performed
5610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5611 *----------------------------------------------------------------------------*/
5613 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
5616 int32 aExp
, bExp
, expDiff
;
5617 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
5618 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
5622 aSig1
= extractFloat128Frac1( a
);
5623 aSig0
= extractFloat128Frac0( a
);
5624 aExp
= extractFloat128Exp( a
);
5625 aSign
= extractFloat128Sign( a
);
5626 bSig1
= extractFloat128Frac1( b
);
5627 bSig0
= extractFloat128Frac0( b
);
5628 bExp
= extractFloat128Exp( b
);
5629 if ( aExp
== 0x7FFF ) {
5630 if ( ( aSig0
| aSig1
)
5631 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
5632 return propagateFloat128NaN( a
, b STATUS_VAR
);
5636 if ( bExp
== 0x7FFF ) {
5637 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5641 if ( ( bSig0
| bSig1
) == 0 ) {
5643 float_raise( float_flag_invalid STATUS_VAR
);
5644 z
.low
= float128_default_nan_low
;
5645 z
.high
= float128_default_nan_high
;
5648 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
5651 if ( ( aSig0
| aSig1
) == 0 ) return a
;
5652 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5654 expDiff
= aExp
- bExp
;
5655 if ( expDiff
< -1 ) return a
;
5657 aSig0
| LIT64( 0x0001000000000000 ),
5659 15 - ( expDiff
< 0 ),
5664 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
5665 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
5666 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5668 while ( 0 < expDiff
) {
5669 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5670 q
= ( 4 < q
) ? q
- 4 : 0;
5671 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5672 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
5673 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
5674 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
5677 if ( -64 < expDiff
) {
5678 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
5679 q
= ( 4 < q
) ? q
- 4 : 0;
5681 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5683 if ( expDiff
< 0 ) {
5684 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
5687 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
5689 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
5690 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
5693 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
5694 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
5697 alternateASig0
= aSig0
;
5698 alternateASig1
= aSig1
;
5700 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
5701 } while ( 0 <= (int64_t) aSig0
);
5703 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
5704 if ( ( sigMean0
< 0 )
5705 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
5706 aSig0
= alternateASig0
;
5707 aSig1
= alternateASig1
;
5709 zSign
= ( (int64_t) aSig0
< 0 );
5710 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
5712 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
5716 /*----------------------------------------------------------------------------
5717 | Returns the square root of the quadruple-precision floating-point value `a'.
5718 | The operation is performed according to the IEC/IEEE Standard for Binary
5719 | Floating-Point Arithmetic.
5720 *----------------------------------------------------------------------------*/
5722 float128
float128_sqrt( float128 a STATUS_PARAM
)
5726 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
5727 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5730 aSig1
= extractFloat128Frac1( a
);
5731 aSig0
= extractFloat128Frac0( a
);
5732 aExp
= extractFloat128Exp( a
);
5733 aSign
= extractFloat128Sign( a
);
5734 if ( aExp
== 0x7FFF ) {
5735 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
5736 if ( ! aSign
) return a
;
5740 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
5742 float_raise( float_flag_invalid STATUS_VAR
);
5743 z
.low
= float128_default_nan_low
;
5744 z
.high
= float128_default_nan_high
;
5748 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
5749 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5751 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
5752 aSig0
|= LIT64( 0x0001000000000000 );
5753 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
5754 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
5755 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5756 doubleZSig0
= zSig0
<<1;
5757 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5758 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5759 while ( (int64_t) rem0
< 0 ) {
5762 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5764 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5765 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
5766 if ( zSig1
== 0 ) zSig1
= 1;
5767 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5768 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5769 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5770 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5771 while ( (int64_t) rem1
< 0 ) {
5773 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5775 term2
|= doubleZSig0
;
5776 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5778 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5780 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
5781 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5785 /*----------------------------------------------------------------------------
5786 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5787 | the corresponding value `b', and 0 otherwise. The invalid exception is
5788 | raised if either operand is a NaN. Otherwise, the comparison is performed
5789 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5790 *----------------------------------------------------------------------------*/
5792 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
5795 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5796 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5797 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5798 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5800 float_raise( float_flag_invalid STATUS_VAR
);
5805 && ( ( a
.high
== b
.high
)
5807 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5812 /*----------------------------------------------------------------------------
5813 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5814 | or equal to the corresponding value `b', and 0 otherwise. The invalid
5815 | exception is raised if either operand is a NaN. The comparison is performed
5816 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5817 *----------------------------------------------------------------------------*/
5819 int float128_le( float128 a
, float128 b STATUS_PARAM
)
5823 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5824 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5825 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5826 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5828 float_raise( float_flag_invalid STATUS_VAR
);
5831 aSign
= extractFloat128Sign( a
);
5832 bSign
= extractFloat128Sign( b
);
5833 if ( aSign
!= bSign
) {
5836 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5840 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5841 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5845 /*----------------------------------------------------------------------------
5846 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5847 | the corresponding value `b', and 0 otherwise. The invalid exception is
5848 | raised if either operand is a NaN. The comparison is performed according
5849 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5850 *----------------------------------------------------------------------------*/
5852 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
5856 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5857 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5858 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5859 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5861 float_raise( float_flag_invalid STATUS_VAR
);
5864 aSign
= extractFloat128Sign( a
);
5865 bSign
= extractFloat128Sign( b
);
5866 if ( aSign
!= bSign
) {
5869 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5873 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5874 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5878 /*----------------------------------------------------------------------------
5879 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
5880 | be compared, and 0 otherwise. The invalid exception is raised if either
5881 | operand is a NaN. The comparison is performed according to the IEC/IEEE
5882 | Standard for Binary Floating-Point Arithmetic.
5883 *----------------------------------------------------------------------------*/
5885 int float128_unordered( float128 a
, float128 b STATUS_PARAM
)
5887 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5888 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5889 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5890 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5892 float_raise( float_flag_invalid STATUS_VAR
);
5898 /*----------------------------------------------------------------------------
5899 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5900 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5901 | exception. The comparison is performed according to the IEC/IEEE Standard
5902 | for Binary Floating-Point Arithmetic.
5903 *----------------------------------------------------------------------------*/
5905 int float128_eq_quiet( float128 a
, float128 b STATUS_PARAM
)
5908 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5909 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5910 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5911 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5913 if ( float128_is_signaling_nan( a
)
5914 || float128_is_signaling_nan( b
) ) {
5915 float_raise( float_flag_invalid STATUS_VAR
);
5921 && ( ( a
.high
== b
.high
)
5923 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5928 /*----------------------------------------------------------------------------
5929 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5930 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5931 | cause an exception. Otherwise, the comparison is performed according to the
5932 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5933 *----------------------------------------------------------------------------*/
5935 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
5939 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5940 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5941 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5942 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5944 if ( float128_is_signaling_nan( a
)
5945 || float128_is_signaling_nan( b
) ) {
5946 float_raise( float_flag_invalid STATUS_VAR
);
5950 aSign
= extractFloat128Sign( a
);
5951 bSign
= extractFloat128Sign( b
);
5952 if ( aSign
!= bSign
) {
5955 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5959 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5960 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5964 /*----------------------------------------------------------------------------
5965 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5966 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5967 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5968 | Standard for Binary Floating-Point Arithmetic.
5969 *----------------------------------------------------------------------------*/
5971 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
5975 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5976 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5977 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5978 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5980 if ( float128_is_signaling_nan( a
)
5981 || float128_is_signaling_nan( b
) ) {
5982 float_raise( float_flag_invalid STATUS_VAR
);
5986 aSign
= extractFloat128Sign( a
);
5987 bSign
= extractFloat128Sign( b
);
5988 if ( aSign
!= bSign
) {
5991 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5995 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5996 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6000 /*----------------------------------------------------------------------------
6001 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6002 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6003 | comparison is performed according to the IEC/IEEE Standard for Binary
6004 | Floating-Point Arithmetic.
6005 *----------------------------------------------------------------------------*/
6007 int float128_unordered_quiet( float128 a
, float128 b STATUS_PARAM
)
6009 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6010 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6011 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6012 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6014 if ( float128_is_signaling_nan( a
)
6015 || float128_is_signaling_nan( b
) ) {
6016 float_raise( float_flag_invalid STATUS_VAR
);
6025 /* misc functions */
6026 float32
uint32_to_float32( unsigned int a STATUS_PARAM
)
6028 return int64_to_float32(a STATUS_VAR
);
6031 float64
uint32_to_float64( unsigned int a STATUS_PARAM
)
6033 return int64_to_float64(a STATUS_VAR
);
6036 unsigned int float32_to_uint32( float32 a STATUS_PARAM
)
6041 v
= float32_to_int64(a STATUS_VAR
);
6044 float_raise( float_flag_invalid STATUS_VAR
);
6045 } else if (v
> 0xffffffff) {
6047 float_raise( float_flag_invalid STATUS_VAR
);
6054 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
6059 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6062 float_raise( float_flag_invalid STATUS_VAR
);
6063 } else if (v
> 0xffffffff) {
6065 float_raise( float_flag_invalid STATUS_VAR
);
6072 unsigned int float32_to_uint16_round_to_zero( float32 a STATUS_PARAM
)
6077 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6080 float_raise( float_flag_invalid STATUS_VAR
);
6081 } else if (v
> 0xffff) {
6083 float_raise( float_flag_invalid STATUS_VAR
);
6090 unsigned int float64_to_uint32( float64 a STATUS_PARAM
)
6095 v
= float64_to_int64(a STATUS_VAR
);
6098 float_raise( float_flag_invalid STATUS_VAR
);
6099 } else if (v
> 0xffffffff) {
6101 float_raise( float_flag_invalid STATUS_VAR
);
6108 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
6113 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
6116 float_raise( float_flag_invalid STATUS_VAR
);
6117 } else if (v
> 0xffffffff) {
6119 float_raise( float_flag_invalid STATUS_VAR
);
6126 unsigned int float64_to_uint16_round_to_zero( float64 a STATUS_PARAM
)
6131 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
6134 float_raise( float_flag_invalid STATUS_VAR
);
6135 } else if (v
> 0xffff) {
6137 float_raise( float_flag_invalid STATUS_VAR
);
6144 /* FIXME: This looks broken. */
6145 uint64_t float64_to_uint64 (float64 a STATUS_PARAM
)
6149 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
6150 v
+= float64_val(a
);
6151 v
= float64_to_int64(make_float64(v
) STATUS_VAR
);
6153 return v
- INT64_MIN
;
6156 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
6160 v
= float64_val(int64_to_float64(INT64_MIN STATUS_VAR
));
6161 v
+= float64_val(a
);
6162 v
= float64_to_int64_round_to_zero(make_float64(v
) STATUS_VAR
);
6164 return v
- INT64_MIN
;
6167 #define COMPARE(s, nan_exp) \
6168 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6169 int is_quiet STATUS_PARAM ) \
6171 flag aSign, bSign; \
6172 uint ## s ## _t av, bv; \
6173 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6174 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6176 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6177 extractFloat ## s ## Frac( a ) ) || \
6178 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6179 extractFloat ## s ## Frac( b ) )) { \
6181 float ## s ## _is_signaling_nan( a ) || \
6182 float ## s ## _is_signaling_nan( b ) ) { \
6183 float_raise( float_flag_invalid STATUS_VAR); \
6185 return float_relation_unordered; \
6187 aSign = extractFloat ## s ## Sign( a ); \
6188 bSign = extractFloat ## s ## Sign( b ); \
6189 av = float ## s ## _val(a); \
6190 bv = float ## s ## _val(b); \
6191 if ( aSign != bSign ) { \
6192 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6194 return float_relation_equal; \
6196 return 1 - (2 * aSign); \
6200 return float_relation_equal; \
6202 return 1 - 2 * (aSign ^ ( av < bv )); \
6207 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6209 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6212 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6214 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6220 INLINE
int floatx80_compare_internal( floatx80 a
, floatx80 b
,
6221 int is_quiet STATUS_PARAM
)
6225 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6226 ( extractFloatx80Frac( a
)<<1 ) ) ||
6227 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6228 ( extractFloatx80Frac( b
)<<1 ) )) {
6230 floatx80_is_signaling_nan( a
) ||
6231 floatx80_is_signaling_nan( b
) ) {
6232 float_raise( float_flag_invalid STATUS_VAR
);
6234 return float_relation_unordered
;
6236 aSign
= extractFloatx80Sign( a
);
6237 bSign
= extractFloatx80Sign( b
);
6238 if ( aSign
!= bSign
) {
6240 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6241 ( ( a
.low
| b
.low
) == 0 ) ) {
6243 return float_relation_equal
;
6245 return 1 - (2 * aSign
);
6248 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6249 return float_relation_equal
;
6251 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6256 int floatx80_compare( floatx80 a
, floatx80 b STATUS_PARAM
)
6258 return floatx80_compare_internal(a
, b
, 0 STATUS_VAR
);
6261 int floatx80_compare_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
6263 return floatx80_compare_internal(a
, b
, 1 STATUS_VAR
);
6266 INLINE
int float128_compare_internal( float128 a
, float128 b
,
6267 int is_quiet STATUS_PARAM
)
6271 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
6272 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
6273 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
6274 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
6276 float128_is_signaling_nan( a
) ||
6277 float128_is_signaling_nan( b
) ) {
6278 float_raise( float_flag_invalid STATUS_VAR
);
6280 return float_relation_unordered
;
6282 aSign
= extractFloat128Sign( a
);
6283 bSign
= extractFloat128Sign( b
);
6284 if ( aSign
!= bSign
) {
6285 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
6287 return float_relation_equal
;
6289 return 1 - (2 * aSign
);
6292 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6293 return float_relation_equal
;
6295 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6300 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
6302 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
6305 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
6307 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
6310 /* min() and max() functions. These can't be implemented as
6311 * 'compare and pick one input' because that would mishandle
6312 * NaNs and +0 vs -0.
6314 #define MINMAX(s, nan_exp) \
6315 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
6316 int ismin STATUS_PARAM ) \
6318 flag aSign, bSign; \
6319 uint ## s ## _t av, bv; \
6320 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6321 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6322 if (float ## s ## _is_any_nan(a) || \
6323 float ## s ## _is_any_nan(b)) { \
6324 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
6326 aSign = extractFloat ## s ## Sign(a); \
6327 bSign = extractFloat ## s ## Sign(b); \
6328 av = float ## s ## _val(a); \
6329 bv = float ## s ## _val(b); \
6330 if (aSign != bSign) { \
6332 return aSign ? a : b; \
6334 return aSign ? b : a; \
6338 return (aSign ^ (av < bv)) ? a : b; \
6340 return (aSign ^ (av < bv)) ? b : a; \
6345 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
6347 return float ## s ## _minmax(a, b, 1 STATUS_VAR); \
6350 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
6352 return float ## s ## _minmax(a, b, 0 STATUS_VAR); \
6359 /* Multiply A by 2 raised to the power N. */
6360 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
6366 a
= float32_squash_input_denormal(a STATUS_VAR
);
6367 aSig
= extractFloat32Frac( a
);
6368 aExp
= extractFloat32Exp( a
);
6369 aSign
= extractFloat32Sign( a
);
6371 if ( aExp
== 0xFF ) {
6373 return propagateFloat32NaN( a
, a STATUS_VAR
);
6379 else if ( aSig
== 0 )
6384 } else if (n
< -0x200) {
6390 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
6393 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
6399 a
= float64_squash_input_denormal(a STATUS_VAR
);
6400 aSig
= extractFloat64Frac( a
);
6401 aExp
= extractFloat64Exp( a
);
6402 aSign
= extractFloat64Sign( a
);
6404 if ( aExp
== 0x7FF ) {
6406 return propagateFloat64NaN( a
, a STATUS_VAR
);
6411 aSig
|= LIT64( 0x0010000000000000 );
6412 else if ( aSig
== 0 )
6417 } else if (n
< -0x1000) {
6423 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
6427 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
6433 aSig
= extractFloatx80Frac( a
);
6434 aExp
= extractFloatx80Exp( a
);
6435 aSign
= extractFloatx80Sign( a
);
6437 if ( aExp
== 0x7FFF ) {
6439 return propagateFloatx80NaN( a
, a STATUS_VAR
);
6444 if (aExp
== 0 && aSig
== 0)
6449 } else if (n
< -0x10000) {
6454 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
6455 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
6460 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
6464 uint64_t aSig0
, aSig1
;
6466 aSig1
= extractFloat128Frac1( a
);
6467 aSig0
= extractFloat128Frac0( a
);
6468 aExp
= extractFloat128Exp( a
);
6469 aSign
= extractFloat128Sign( a
);
6470 if ( aExp
== 0x7FFF ) {
6471 if ( aSig0
| aSig1
) {
6472 return propagateFloat128NaN( a
, a STATUS_VAR
);
6477 aSig0
|= LIT64( 0x0001000000000000 );
6478 else if ( aSig0
== 0 && aSig1
== 0 )
6483 } else if (n
< -0x10000) {
6488 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1