4 * Derived from SoftFloat.
7 /*============================================================================
9 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
12 Written by John R. Hauser. This work was made possible in part by the
13 International Computer Science Institute, located at Suite 600, 1947 Center
14 Street, Berkeley, California 94704. Funding was partially provided by the
15 National Science Foundation under grant MIP-9311980. The original version
16 of this code was written as part of a project to build a fixed-point vector
17 processor in collaboration with the University of California at Berkeley,
18 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
19 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
20 arithmetic/SoftFloat.html'.
22 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
23 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
24 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
25 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
26 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
27 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
28 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
29 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
31 Derivative works are acceptable, even for commercial purposes, so long as
32 (1) the source code for the derivative work includes prominent notice that
33 the work is derivative, and (2) the source code includes prominent notice with
34 these four paragraphs for those parts of this code that are retained.
36 =============================================================================*/
38 /* softfloat (and in particular the code in softfloat-specialize.h) is
39 * target-dependent and needs the TARGET_* macros.
43 #include "fpu/softfloat.h"
45 /*----------------------------------------------------------------------------
46 | Primitive arithmetic functions, including multi-word arithmetic, and
47 | division and square root approximations. (Can be specialized to target if
49 *----------------------------------------------------------------------------*/
50 #include "softfloat-macros.h"
52 /*----------------------------------------------------------------------------
53 | Functions and definitions to determine: (1) whether tininess for underflow
54 | is detected before or after rounding by default, (2) what (if anything)
55 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
56 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
57 | are propagated from function inputs to output. These details are target-
59 *----------------------------------------------------------------------------*/
60 #include "softfloat-specialize.h"
62 /*----------------------------------------------------------------------------
63 | Returns the fraction bits of the half-precision floating-point value `a'.
64 *----------------------------------------------------------------------------*/
66 INLINE
uint32_t extractFloat16Frac(float16 a
)
68 return float16_val(a
) & 0x3ff;
71 /*----------------------------------------------------------------------------
72 | Returns the exponent bits of the half-precision floating-point value `a'.
73 *----------------------------------------------------------------------------*/
75 INLINE
int_fast16_t extractFloat16Exp(float16 a
)
77 return (float16_val(a
) >> 10) & 0x1f;
80 /*----------------------------------------------------------------------------
81 | Returns the sign bit of the single-precision floating-point value `a'.
82 *----------------------------------------------------------------------------*/
84 INLINE flag
extractFloat16Sign(float16 a
)
86 return float16_val(a
)>>15;
89 /*----------------------------------------------------------------------------
90 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
91 | and 7, and returns the properly rounded 32-bit integer corresponding to the
92 | input. If `zSign' is 1, the input is negated before being converted to an
93 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
94 | is simply rounded to an integer, with the inexact exception raised if the
95 | input cannot be represented exactly as an integer. However, if the fixed-
96 | point input is too large, the invalid exception is raised and the largest
97 | positive or negative integer is returned.
98 *----------------------------------------------------------------------------*/
100 static int32
roundAndPackInt32( flag zSign
, uint64_t absZ STATUS_PARAM
)
103 flag roundNearestEven
;
104 int8 roundIncrement
, roundBits
;
107 roundingMode
= STATUS(float_rounding_mode
);
108 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
109 roundIncrement
= 0x40;
110 if ( ! roundNearestEven
) {
111 if ( roundingMode
== float_round_to_zero
) {
115 roundIncrement
= 0x7F;
117 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
120 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
124 roundBits
= absZ
& 0x7F;
125 absZ
= ( absZ
+ roundIncrement
)>>7;
126 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
128 if ( zSign
) z
= - z
;
129 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
130 float_raise( float_flag_invalid STATUS_VAR
);
131 return zSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
133 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
138 /*----------------------------------------------------------------------------
139 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
140 | `absZ1', with binary point between bits 63 and 64 (between the input words),
141 | and returns the properly rounded 64-bit integer corresponding to the input.
142 | If `zSign' is 1, the input is negated before being converted to an integer.
143 | Ordinarily, the fixed-point input is simply rounded to an integer, with
144 | the inexact exception raised if the input cannot be represented exactly as
145 | an integer. However, if the fixed-point input is too large, the invalid
146 | exception is raised and the largest positive or negative integer is
148 *----------------------------------------------------------------------------*/
150 static int64
roundAndPackInt64( flag zSign
, uint64_t absZ0
, uint64_t absZ1 STATUS_PARAM
)
153 flag roundNearestEven
, increment
;
156 roundingMode
= STATUS(float_rounding_mode
);
157 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
158 increment
= ( (int64_t) absZ1
< 0 );
159 if ( ! roundNearestEven
) {
160 if ( roundingMode
== float_round_to_zero
) {
165 increment
= ( roundingMode
== float_round_down
) && absZ1
;
168 increment
= ( roundingMode
== float_round_up
) && absZ1
;
174 if ( absZ0
== 0 ) goto overflow
;
175 absZ0
&= ~ ( ( (uint64_t) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
178 if ( zSign
) z
= - z
;
179 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
181 float_raise( float_flag_invalid STATUS_VAR
);
183 zSign
? (int64_t) LIT64( 0x8000000000000000 )
184 : LIT64( 0x7FFFFFFFFFFFFFFF );
186 if ( absZ1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
191 /*----------------------------------------------------------------------------
192 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
193 | `absZ1', with binary point between bits 63 and 64 (between the input words),
194 | and returns the properly rounded 64-bit unsigned integer corresponding to the
195 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
196 | with the inexact exception raised if the input cannot be represented exactly
197 | as an integer. However, if the fixed-point input is too large, the invalid
198 | exception is raised and the largest unsigned integer is returned.
199 *----------------------------------------------------------------------------*/
201 static int64
roundAndPackUint64(flag zSign
, uint64_t absZ0
,
202 uint64_t absZ1 STATUS_PARAM
)
205 flag roundNearestEven
, increment
;
207 roundingMode
= STATUS(float_rounding_mode
);
208 roundNearestEven
= (roundingMode
== float_round_nearest_even
);
209 increment
= ((int64_t)absZ1
< 0);
210 if (!roundNearestEven
) {
211 if (roundingMode
== float_round_to_zero
) {
215 increment
= (roundingMode
== float_round_down
) && absZ1
;
217 increment
= (roundingMode
== float_round_up
) && absZ1
;
224 float_raise(float_flag_invalid STATUS_VAR
);
225 return LIT64(0xFFFFFFFFFFFFFFFF);
227 absZ0
&= ~(((uint64_t)(absZ1
<<1) == 0) & roundNearestEven
);
230 if (zSign
&& absZ0
) {
231 float_raise(float_flag_invalid STATUS_VAR
);
236 STATUS(float_exception_flags
) |= float_flag_inexact
;
241 /*----------------------------------------------------------------------------
242 | Returns the fraction bits of the single-precision floating-point value `a'.
243 *----------------------------------------------------------------------------*/
245 INLINE
uint32_t extractFloat32Frac( float32 a
)
248 return float32_val(a
) & 0x007FFFFF;
252 /*----------------------------------------------------------------------------
253 | Returns the exponent bits of the single-precision floating-point value `a'.
254 *----------------------------------------------------------------------------*/
256 INLINE
int_fast16_t extractFloat32Exp(float32 a
)
259 return ( float32_val(a
)>>23 ) & 0xFF;
263 /*----------------------------------------------------------------------------
264 | Returns the sign bit of the single-precision floating-point value `a'.
265 *----------------------------------------------------------------------------*/
267 INLINE flag
extractFloat32Sign( float32 a
)
270 return float32_val(a
)>>31;
274 /*----------------------------------------------------------------------------
275 | If `a' is denormal and we are in flush-to-zero mode then set the
276 | input-denormal exception and return zero. Otherwise just return the value.
277 *----------------------------------------------------------------------------*/
278 static float32
float32_squash_input_denormal(float32 a STATUS_PARAM
)
280 if (STATUS(flush_inputs_to_zero
)) {
281 if (extractFloat32Exp(a
) == 0 && extractFloat32Frac(a
) != 0) {
282 float_raise(float_flag_input_denormal STATUS_VAR
);
283 return make_float32(float32_val(a
) & 0x80000000);
289 /*----------------------------------------------------------------------------
290 | Normalizes the subnormal single-precision floating-point value represented
291 | by the denormalized significand `aSig'. The normalized exponent and
292 | significand are stored at the locations pointed to by `zExpPtr' and
293 | `zSigPtr', respectively.
294 *----------------------------------------------------------------------------*/
297 normalizeFloat32Subnormal(uint32_t aSig
, int_fast16_t *zExpPtr
, uint32_t *zSigPtr
)
301 shiftCount
= countLeadingZeros32( aSig
) - 8;
302 *zSigPtr
= aSig
<<shiftCount
;
303 *zExpPtr
= 1 - shiftCount
;
307 /*----------------------------------------------------------------------------
308 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
309 | single-precision floating-point value, returning the result. After being
310 | shifted into the proper positions, the three fields are simply added
311 | together to form the result. This means that any integer portion of `zSig'
312 | will be added into the exponent. Since a properly normalized significand
313 | will have an integer portion equal to 1, the `zExp' input should be 1 less
314 | than the desired result exponent whenever `zSig' is a complete, normalized
316 *----------------------------------------------------------------------------*/
318 INLINE float32
packFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig
)
322 ( ( (uint32_t) zSign
)<<31 ) + ( ( (uint32_t) zExp
)<<23 ) + zSig
);
326 /*----------------------------------------------------------------------------
327 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
328 | and significand `zSig', and returns the proper single-precision floating-
329 | point value corresponding to the abstract input. Ordinarily, the abstract
330 | value is simply rounded and packed into the single-precision format, with
331 | the inexact exception raised if the abstract input cannot be represented
332 | exactly. However, if the abstract value is too large, the overflow and
333 | inexact exceptions are raised and an infinity or maximal finite value is
334 | returned. If the abstract value is too small, the input value is rounded to
335 | a subnormal number, and the underflow and inexact exceptions are raised if
336 | the abstract input cannot be represented exactly as a subnormal single-
337 | precision floating-point number.
338 | The input significand `zSig' has its binary point between bits 30
339 | and 29, which is 7 bits to the left of the usual location. This shifted
340 | significand must be normalized or smaller. If `zSig' is not normalized,
341 | `zExp' must be 0; in that case, the result returned is a subnormal number,
342 | and it must not require rounding. In the usual case that `zSig' is
343 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
344 | The handling of underflow and overflow follows the IEC/IEEE Standard for
345 | Binary Floating-Point Arithmetic.
346 *----------------------------------------------------------------------------*/
348 static float32
roundAndPackFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig STATUS_PARAM
)
351 flag roundNearestEven
;
352 int8 roundIncrement
, roundBits
;
355 roundingMode
= STATUS(float_rounding_mode
);
356 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
357 roundIncrement
= 0x40;
358 if ( ! roundNearestEven
) {
359 if ( roundingMode
== float_round_to_zero
) {
363 roundIncrement
= 0x7F;
365 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
368 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
372 roundBits
= zSig
& 0x7F;
373 if ( 0xFD <= (uint16_t) zExp
) {
375 || ( ( zExp
== 0xFD )
376 && ( (int32_t) ( zSig
+ roundIncrement
) < 0 ) )
378 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
379 return packFloat32( zSign
, 0xFF, - ( roundIncrement
== 0 ));
382 if (STATUS(flush_to_zero
)) {
383 float_raise(float_flag_output_denormal STATUS_VAR
);
384 return packFloat32(zSign
, 0, 0);
387 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
389 || ( zSig
+ roundIncrement
< 0x80000000 );
390 shift32RightJamming( zSig
, - zExp
, &zSig
);
392 roundBits
= zSig
& 0x7F;
393 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
396 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
397 zSig
= ( zSig
+ roundIncrement
)>>7;
398 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
399 if ( zSig
== 0 ) zExp
= 0;
400 return packFloat32( zSign
, zExp
, zSig
);
404 /*----------------------------------------------------------------------------
405 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
406 | and significand `zSig', and returns the proper single-precision floating-
407 | point value corresponding to the abstract input. This routine is just like
408 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
409 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
410 | floating-point exponent.
411 *----------------------------------------------------------------------------*/
414 normalizeRoundAndPackFloat32(flag zSign
, int_fast16_t zExp
, uint32_t zSig STATUS_PARAM
)
418 shiftCount
= countLeadingZeros32( zSig
) - 1;
419 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
423 /*----------------------------------------------------------------------------
424 | Returns the fraction bits of the double-precision floating-point value `a'.
425 *----------------------------------------------------------------------------*/
427 INLINE
uint64_t extractFloat64Frac( float64 a
)
430 return float64_val(a
) & LIT64( 0x000FFFFFFFFFFFFF );
434 /*----------------------------------------------------------------------------
435 | Returns the exponent bits of the double-precision floating-point value `a'.
436 *----------------------------------------------------------------------------*/
438 INLINE
int_fast16_t extractFloat64Exp(float64 a
)
441 return ( float64_val(a
)>>52 ) & 0x7FF;
445 /*----------------------------------------------------------------------------
446 | Returns the sign bit of the double-precision floating-point value `a'.
447 *----------------------------------------------------------------------------*/
449 INLINE flag
extractFloat64Sign( float64 a
)
452 return float64_val(a
)>>63;
456 /*----------------------------------------------------------------------------
457 | If `a' is denormal and we are in flush-to-zero mode then set the
458 | input-denormal exception and return zero. Otherwise just return the value.
459 *----------------------------------------------------------------------------*/
460 static float64
float64_squash_input_denormal(float64 a STATUS_PARAM
)
462 if (STATUS(flush_inputs_to_zero
)) {
463 if (extractFloat64Exp(a
) == 0 && extractFloat64Frac(a
) != 0) {
464 float_raise(float_flag_input_denormal STATUS_VAR
);
465 return make_float64(float64_val(a
) & (1ULL << 63));
471 /*----------------------------------------------------------------------------
472 | Normalizes the subnormal double-precision floating-point value represented
473 | by the denormalized significand `aSig'. The normalized exponent and
474 | significand are stored at the locations pointed to by `zExpPtr' and
475 | `zSigPtr', respectively.
476 *----------------------------------------------------------------------------*/
479 normalizeFloat64Subnormal(uint64_t aSig
, int_fast16_t *zExpPtr
, uint64_t *zSigPtr
)
483 shiftCount
= countLeadingZeros64( aSig
) - 11;
484 *zSigPtr
= aSig
<<shiftCount
;
485 *zExpPtr
= 1 - shiftCount
;
489 /*----------------------------------------------------------------------------
490 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
491 | double-precision floating-point value, returning the result. After being
492 | shifted into the proper positions, the three fields are simply added
493 | together to form the result. This means that any integer portion of `zSig'
494 | will be added into the exponent. Since a properly normalized significand
495 | will have an integer portion equal to 1, the `zExp' input should be 1 less
496 | than the desired result exponent whenever `zSig' is a complete, normalized
498 *----------------------------------------------------------------------------*/
500 INLINE float64
packFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig
)
504 ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<52 ) + zSig
);
508 /*----------------------------------------------------------------------------
509 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
510 | and significand `zSig', and returns the proper double-precision floating-
511 | point value corresponding to the abstract input. Ordinarily, the abstract
512 | value is simply rounded and packed into the double-precision format, with
513 | the inexact exception raised if the abstract input cannot be represented
514 | exactly. However, if the abstract value is too large, the overflow and
515 | inexact exceptions are raised and an infinity or maximal finite value is
516 | returned. If the abstract value is too small, the input value is rounded
517 | to a subnormal number, and the underflow and inexact exceptions are raised
518 | if the abstract input cannot be represented exactly as a subnormal double-
519 | precision floating-point number.
520 | The input significand `zSig' has its binary point between bits 62
521 | and 61, which is 10 bits to the left of the usual location. This shifted
522 | significand must be normalized or smaller. If `zSig' is not normalized,
523 | `zExp' must be 0; in that case, the result returned is a subnormal number,
524 | and it must not require rounding. In the usual case that `zSig' is
525 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
526 | The handling of underflow and overflow follows the IEC/IEEE Standard for
527 | Binary Floating-Point Arithmetic.
528 *----------------------------------------------------------------------------*/
530 static float64
roundAndPackFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig STATUS_PARAM
)
533 flag roundNearestEven
;
534 int_fast16_t roundIncrement
, roundBits
;
537 roundingMode
= STATUS(float_rounding_mode
);
538 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
539 roundIncrement
= 0x200;
540 if ( ! roundNearestEven
) {
541 if ( roundingMode
== float_round_to_zero
) {
545 roundIncrement
= 0x3FF;
547 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
550 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
554 roundBits
= zSig
& 0x3FF;
555 if ( 0x7FD <= (uint16_t) zExp
) {
556 if ( ( 0x7FD < zExp
)
557 || ( ( zExp
== 0x7FD )
558 && ( (int64_t) ( zSig
+ roundIncrement
) < 0 ) )
560 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
561 return packFloat64( zSign
, 0x7FF, - ( roundIncrement
== 0 ));
564 if (STATUS(flush_to_zero
)) {
565 float_raise(float_flag_output_denormal STATUS_VAR
);
566 return packFloat64(zSign
, 0, 0);
569 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
571 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
572 shift64RightJamming( zSig
, - zExp
, &zSig
);
574 roundBits
= zSig
& 0x3FF;
575 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
578 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
579 zSig
= ( zSig
+ roundIncrement
)>>10;
580 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
581 if ( zSig
== 0 ) zExp
= 0;
582 return packFloat64( zSign
, zExp
, zSig
);
586 /*----------------------------------------------------------------------------
587 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
588 | and significand `zSig', and returns the proper double-precision floating-
589 | point value corresponding to the abstract input. This routine is just like
590 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
591 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
592 | floating-point exponent.
593 *----------------------------------------------------------------------------*/
596 normalizeRoundAndPackFloat64(flag zSign
, int_fast16_t zExp
, uint64_t zSig STATUS_PARAM
)
600 shiftCount
= countLeadingZeros64( zSig
) - 1;
601 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount STATUS_VAR
);
605 /*----------------------------------------------------------------------------
606 | Returns the fraction bits of the extended double-precision floating-point
608 *----------------------------------------------------------------------------*/
610 INLINE
uint64_t extractFloatx80Frac( floatx80 a
)
617 /*----------------------------------------------------------------------------
618 | Returns the exponent bits of the extended double-precision floating-point
620 *----------------------------------------------------------------------------*/
622 INLINE int32
extractFloatx80Exp( floatx80 a
)
625 return a
.high
& 0x7FFF;
629 /*----------------------------------------------------------------------------
630 | Returns the sign bit of the extended double-precision floating-point value
632 *----------------------------------------------------------------------------*/
634 INLINE flag
extractFloatx80Sign( floatx80 a
)
641 /*----------------------------------------------------------------------------
642 | Normalizes the subnormal extended double-precision floating-point value
643 | represented by the denormalized significand `aSig'. The normalized exponent
644 | and significand are stored at the locations pointed to by `zExpPtr' and
645 | `zSigPtr', respectively.
646 *----------------------------------------------------------------------------*/
649 normalizeFloatx80Subnormal( uint64_t aSig
, int32
*zExpPtr
, uint64_t *zSigPtr
)
653 shiftCount
= countLeadingZeros64( aSig
);
654 *zSigPtr
= aSig
<<shiftCount
;
655 *zExpPtr
= 1 - shiftCount
;
659 /*----------------------------------------------------------------------------
660 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
661 | extended double-precision floating-point value, returning the result.
662 *----------------------------------------------------------------------------*/
664 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, uint64_t zSig
)
669 z
.high
= ( ( (uint16_t) zSign
)<<15 ) + zExp
;
674 /*----------------------------------------------------------------------------
675 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
676 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
677 | and returns the proper extended double-precision floating-point value
678 | corresponding to the abstract input. Ordinarily, the abstract value is
679 | rounded and packed into the extended double-precision format, with the
680 | inexact exception raised if the abstract input cannot be represented
681 | exactly. However, if the abstract value is too large, the overflow and
682 | inexact exceptions are raised and an infinity or maximal finite value is
683 | returned. If the abstract value is too small, the input value is rounded to
684 | a subnormal number, and the underflow and inexact exceptions are raised if
685 | the abstract input cannot be represented exactly as a subnormal extended
686 | double-precision floating-point number.
687 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
688 | number of bits as single or double precision, respectively. Otherwise, the
689 | result is rounded to the full precision of the extended double-precision
691 | The input significand must be normalized or smaller. If the input
692 | significand is not normalized, `zExp' must be 0; in that case, the result
693 | returned is a subnormal number, and it must not require rounding. The
694 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
695 | Floating-Point Arithmetic.
696 *----------------------------------------------------------------------------*/
699 roundAndPackFloatx80(
700 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
704 flag roundNearestEven
, increment
, isTiny
;
705 int64 roundIncrement
, roundMask
, roundBits
;
707 roundingMode
= STATUS(float_rounding_mode
);
708 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
709 if ( roundingPrecision
== 80 ) goto precision80
;
710 if ( roundingPrecision
== 64 ) {
711 roundIncrement
= LIT64( 0x0000000000000400 );
712 roundMask
= LIT64( 0x00000000000007FF );
714 else if ( roundingPrecision
== 32 ) {
715 roundIncrement
= LIT64( 0x0000008000000000 );
716 roundMask
= LIT64( 0x000000FFFFFFFFFF );
721 zSig0
|= ( zSig1
!= 0 );
722 if ( ! roundNearestEven
) {
723 if ( roundingMode
== float_round_to_zero
) {
727 roundIncrement
= roundMask
;
729 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
732 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
736 roundBits
= zSig0
& roundMask
;
737 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
738 if ( ( 0x7FFE < zExp
)
739 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
744 if (STATUS(flush_to_zero
)) {
745 float_raise(float_flag_output_denormal STATUS_VAR
);
746 return packFloatx80(zSign
, 0, 0);
749 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
751 || ( zSig0
<= zSig0
+ roundIncrement
);
752 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
754 roundBits
= zSig0
& roundMask
;
755 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow STATUS_VAR
);
756 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
757 zSig0
+= roundIncrement
;
758 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
759 roundIncrement
= roundMask
+ 1;
760 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
761 roundMask
|= roundIncrement
;
763 zSig0
&= ~ roundMask
;
764 return packFloatx80( zSign
, zExp
, zSig0
);
767 if ( roundBits
) STATUS(float_exception_flags
) |= float_flag_inexact
;
768 zSig0
+= roundIncrement
;
769 if ( zSig0
< roundIncrement
) {
771 zSig0
= LIT64( 0x8000000000000000 );
773 roundIncrement
= roundMask
+ 1;
774 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
775 roundMask
|= roundIncrement
;
777 zSig0
&= ~ roundMask
;
778 if ( zSig0
== 0 ) zExp
= 0;
779 return packFloatx80( zSign
, zExp
, zSig0
);
781 increment
= ( (int64_t) zSig1
< 0 );
782 if ( ! roundNearestEven
) {
783 if ( roundingMode
== float_round_to_zero
) {
788 increment
= ( roundingMode
== float_round_down
) && zSig1
;
791 increment
= ( roundingMode
== float_round_up
) && zSig1
;
795 if ( 0x7FFD <= (uint32_t) ( zExp
- 1 ) ) {
796 if ( ( 0x7FFE < zExp
)
797 || ( ( zExp
== 0x7FFE )
798 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
804 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
805 if ( ( roundingMode
== float_round_to_zero
)
806 || ( zSign
&& ( roundingMode
== float_round_up
) )
807 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
809 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
811 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
815 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
818 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
819 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
821 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow STATUS_VAR
);
822 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
823 if ( roundNearestEven
) {
824 increment
= ( (int64_t) zSig1
< 0 );
828 increment
= ( roundingMode
== float_round_down
) && zSig1
;
831 increment
= ( roundingMode
== float_round_up
) && zSig1
;
837 ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
838 if ( (int64_t) zSig0
< 0 ) zExp
= 1;
840 return packFloatx80( zSign
, zExp
, zSig0
);
843 if ( zSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
848 zSig0
= LIT64( 0x8000000000000000 );
851 zSig0
&= ~ ( ( (uint64_t) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
855 if ( zSig0
== 0 ) zExp
= 0;
857 return packFloatx80( zSign
, zExp
, zSig0
);
861 /*----------------------------------------------------------------------------
862 | Takes an abstract floating-point value having sign `zSign', exponent
863 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
864 | and returns the proper extended double-precision floating-point value
865 | corresponding to the abstract input. This routine is just like
866 | `roundAndPackFloatx80' except that the input significand does not have to be
868 *----------------------------------------------------------------------------*/
871 normalizeRoundAndPackFloatx80(
872 int8 roundingPrecision
, flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
882 shiftCount
= countLeadingZeros64( zSig0
);
883 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
886 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
890 /*----------------------------------------------------------------------------
891 | Returns the least-significant 64 fraction bits of the quadruple-precision
892 | floating-point value `a'.
893 *----------------------------------------------------------------------------*/
895 INLINE
uint64_t extractFloat128Frac1( float128 a
)
902 /*----------------------------------------------------------------------------
903 | Returns the most-significant 48 fraction bits of the quadruple-precision
904 | floating-point value `a'.
905 *----------------------------------------------------------------------------*/
907 INLINE
uint64_t extractFloat128Frac0( float128 a
)
910 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
914 /*----------------------------------------------------------------------------
915 | Returns the exponent bits of the quadruple-precision floating-point value
917 *----------------------------------------------------------------------------*/
919 INLINE int32
extractFloat128Exp( float128 a
)
922 return ( a
.high
>>48 ) & 0x7FFF;
926 /*----------------------------------------------------------------------------
927 | Returns the sign bit of the quadruple-precision floating-point value `a'.
928 *----------------------------------------------------------------------------*/
930 INLINE flag
extractFloat128Sign( float128 a
)
937 /*----------------------------------------------------------------------------
938 | Normalizes the subnormal quadruple-precision floating-point value
939 | represented by the denormalized significand formed by the concatenation of
940 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
941 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
942 | significand are stored at the location pointed to by `zSig0Ptr', and the
943 | least significant 64 bits of the normalized significand are stored at the
944 | location pointed to by `zSig1Ptr'.
945 *----------------------------------------------------------------------------*/
948 normalizeFloat128Subnormal(
959 shiftCount
= countLeadingZeros64( aSig1
) - 15;
960 if ( shiftCount
< 0 ) {
961 *zSig0Ptr
= aSig1
>>( - shiftCount
);
962 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
965 *zSig0Ptr
= aSig1
<<shiftCount
;
968 *zExpPtr
= - shiftCount
- 63;
971 shiftCount
= countLeadingZeros64( aSig0
) - 15;
972 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
973 *zExpPtr
= 1 - shiftCount
;
978 /*----------------------------------------------------------------------------
979 | Packs the sign `zSign', the exponent `zExp', and the significand formed
980 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
981 | floating-point value, returning the result. After being shifted into the
982 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
983 | added together to form the most significant 32 bits of the result. This
984 | means that any integer portion of `zSig0' will be added into the exponent.
985 | Since a properly normalized significand will have an integer portion equal
986 | to 1, the `zExp' input should be 1 less than the desired result exponent
987 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
989 *----------------------------------------------------------------------------*/
992 packFloat128( flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
)
997 z
.high
= ( ( (uint64_t) zSign
)<<63 ) + ( ( (uint64_t) zExp
)<<48 ) + zSig0
;
1002 /*----------------------------------------------------------------------------
1003 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1004 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1005 | and `zSig2', and returns the proper quadruple-precision floating-point value
1006 | corresponding to the abstract input. Ordinarily, the abstract value is
1007 | simply rounded and packed into the quadruple-precision format, with the
1008 | inexact exception raised if the abstract input cannot be represented
1009 | exactly. However, if the abstract value is too large, the overflow and
1010 | inexact exceptions are raised and an infinity or maximal finite value is
1011 | returned. If the abstract value is too small, the input value is rounded to
1012 | a subnormal number, and the underflow and inexact exceptions are raised if
1013 | the abstract input cannot be represented exactly as a subnormal quadruple-
1014 | precision floating-point number.
1015 | The input significand must be normalized or smaller. If the input
1016 | significand is not normalized, `zExp' must be 0; in that case, the result
1017 | returned is a subnormal number, and it must not require rounding. In the
1018 | usual case that the input significand is normalized, `zExp' must be 1 less
1019 | than the ``true'' floating-point exponent. The handling of underflow and
1020 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1021 *----------------------------------------------------------------------------*/
1024 roundAndPackFloat128(
1025 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1
, uint64_t zSig2 STATUS_PARAM
)
1028 flag roundNearestEven
, increment
, isTiny
;
1030 roundingMode
= STATUS(float_rounding_mode
);
1031 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
1032 increment
= ( (int64_t) zSig2
< 0 );
1033 if ( ! roundNearestEven
) {
1034 if ( roundingMode
== float_round_to_zero
) {
1039 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1042 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1046 if ( 0x7FFD <= (uint32_t) zExp
) {
1047 if ( ( 0x7FFD < zExp
)
1048 || ( ( zExp
== 0x7FFD )
1050 LIT64( 0x0001FFFFFFFFFFFF ),
1051 LIT64( 0xFFFFFFFFFFFFFFFF ),
1058 float_raise( float_flag_overflow
| float_flag_inexact STATUS_VAR
);
1059 if ( ( roundingMode
== float_round_to_zero
)
1060 || ( zSign
&& ( roundingMode
== float_round_up
) )
1061 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
1067 LIT64( 0x0000FFFFFFFFFFFF ),
1068 LIT64( 0xFFFFFFFFFFFFFFFF )
1071 return packFloat128( zSign
, 0x7FFF, 0, 0 );
1074 if (STATUS(flush_to_zero
)) {
1075 float_raise(float_flag_output_denormal STATUS_VAR
);
1076 return packFloat128(zSign
, 0, 0, 0);
1079 ( STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
1085 LIT64( 0x0001FFFFFFFFFFFF ),
1086 LIT64( 0xFFFFFFFFFFFFFFFF )
1088 shift128ExtraRightJamming(
1089 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
1091 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow STATUS_VAR
);
1092 if ( roundNearestEven
) {
1093 increment
= ( (int64_t) zSig2
< 0 );
1097 increment
= ( roundingMode
== float_round_down
) && zSig2
;
1100 increment
= ( roundingMode
== float_round_up
) && zSig2
;
1105 if ( zSig2
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1107 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
1108 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
1111 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
1113 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1117 /*----------------------------------------------------------------------------
1118 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1119 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1120 | returns the proper quadruple-precision floating-point value corresponding
1121 | to the abstract input. This routine is just like `roundAndPackFloat128'
1122 | except that the input significand has fewer bits and does not have to be
1123 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1125 *----------------------------------------------------------------------------*/
1128 normalizeRoundAndPackFloat128(
1129 flag zSign
, int32 zExp
, uint64_t zSig0
, uint64_t zSig1 STATUS_PARAM
)
1139 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1140 if ( 0 <= shiftCount
) {
1142 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1145 shift128ExtraRightJamming(
1146 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1149 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
1153 /*----------------------------------------------------------------------------
1154 | Returns the result of converting the 32-bit two's complement integer `a'
1155 | to the single-precision floating-point format. The conversion is performed
1156 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1157 *----------------------------------------------------------------------------*/
1159 float32
int32_to_float32(int32_t a STATUS_PARAM
)
1163 if ( a
== 0 ) return float32_zero
;
1164 if ( a
== (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1166 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a STATUS_VAR
);
1170 /*----------------------------------------------------------------------------
1171 | Returns the result of converting the 32-bit two's complement integer `a'
1172 | to the double-precision floating-point format. The conversion is performed
1173 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1174 *----------------------------------------------------------------------------*/
1176 float64
int32_to_float64(int32_t a STATUS_PARAM
)
1183 if ( a
== 0 ) return float64_zero
;
1185 absA
= zSign
? - a
: a
;
1186 shiftCount
= countLeadingZeros32( absA
) + 21;
1188 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1192 /*----------------------------------------------------------------------------
1193 | Returns the result of converting the 32-bit two's complement integer `a'
1194 | to the extended double-precision floating-point format. The conversion
1195 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1197 *----------------------------------------------------------------------------*/
1199 floatx80
int32_to_floatx80(int32_t a STATUS_PARAM
)
1206 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1208 absA
= zSign
? - a
: a
;
1209 shiftCount
= countLeadingZeros32( absA
) + 32;
1211 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1215 /*----------------------------------------------------------------------------
1216 | Returns the result of converting the 32-bit two's complement integer `a' to
1217 | the quadruple-precision floating-point format. The conversion is performed
1218 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1219 *----------------------------------------------------------------------------*/
1221 float128
int32_to_float128(int32_t a STATUS_PARAM
)
1228 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1230 absA
= zSign
? - a
: a
;
1231 shiftCount
= countLeadingZeros32( absA
) + 17;
1233 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1237 /*----------------------------------------------------------------------------
1238 | Returns the result of converting the 64-bit two's complement integer `a'
1239 | to the single-precision floating-point format. The conversion is performed
1240 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1241 *----------------------------------------------------------------------------*/
1243 float32
int64_to_float32(int64_t a STATUS_PARAM
)
1249 if ( a
== 0 ) return float32_zero
;
1251 absA
= zSign
? - a
: a
;
1252 shiftCount
= countLeadingZeros64( absA
) - 40;
1253 if ( 0 <= shiftCount
) {
1254 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1258 if ( shiftCount
< 0 ) {
1259 shift64RightJamming( absA
, - shiftCount
, &absA
);
1262 absA
<<= shiftCount
;
1264 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA STATUS_VAR
);
1269 float32
uint64_to_float32(uint64_t a STATUS_PARAM
)
1273 if ( a
== 0 ) return float32_zero
;
1274 shiftCount
= countLeadingZeros64( a
) - 40;
1275 if ( 0 <= shiftCount
) {
1276 return packFloat32(0, 0x95 - shiftCount
, a
<<shiftCount
);
1280 if ( shiftCount
< 0 ) {
1281 shift64RightJamming( a
, - shiftCount
, &a
);
1286 return roundAndPackFloat32(0, 0x9C - shiftCount
, a STATUS_VAR
);
1290 /*----------------------------------------------------------------------------
1291 | Returns the result of converting the 64-bit two's complement integer `a'
1292 | to the double-precision floating-point format. The conversion is performed
1293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294 *----------------------------------------------------------------------------*/
1296 float64
int64_to_float64(int64_t a STATUS_PARAM
)
1300 if ( a
== 0 ) return float64_zero
;
1301 if ( a
== (int64_t) LIT64( 0x8000000000000000 ) ) {
1302 return packFloat64( 1, 0x43E, 0 );
1305 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a STATUS_VAR
);
1309 float64
uint64_to_float64(uint64_t a STATUS_PARAM
)
1314 return float64_zero
;
1316 if ((int64_t)a
< 0) {
1317 shift64RightJamming(a
, 1, &a
);
1320 return normalizeRoundAndPackFloat64(0, exp
, a STATUS_VAR
);
1323 /*----------------------------------------------------------------------------
1324 | Returns the result of converting the 64-bit two's complement integer `a'
1325 | to the extended double-precision floating-point format. The conversion
1326 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1328 *----------------------------------------------------------------------------*/
1330 floatx80
int64_to_floatx80(int64_t a STATUS_PARAM
)
1336 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1338 absA
= zSign
? - a
: a
;
1339 shiftCount
= countLeadingZeros64( absA
);
1340 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1344 /*----------------------------------------------------------------------------
1345 | Returns the result of converting the 64-bit two's complement integer `a' to
1346 | the quadruple-precision floating-point format. The conversion is performed
1347 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1348 *----------------------------------------------------------------------------*/
1350 float128
int64_to_float128(int64_t a STATUS_PARAM
)
1356 uint64_t zSig0
, zSig1
;
1358 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1360 absA
= zSign
? - a
: a
;
1361 shiftCount
= countLeadingZeros64( absA
) + 49;
1362 zExp
= 0x406E - shiftCount
;
1363 if ( 64 <= shiftCount
) {
1372 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1373 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1377 float128
uint64_to_float128(uint64_t a STATUS_PARAM
)
1380 return float128_zero
;
1382 return normalizeRoundAndPackFloat128(0, 0x406E, a
, 0 STATUS_VAR
);
1385 /*----------------------------------------------------------------------------
1386 | Returns the result of converting the single-precision floating-point value
1387 | `a' to the 32-bit two's complement integer format. The conversion is
1388 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1389 | Arithmetic---which means in particular that the conversion is rounded
1390 | according to the current rounding mode. If `a' is a NaN, the largest
1391 | positive integer is returned. Otherwise, if the conversion overflows, the
1392 | largest integer with the same sign as `a' is returned.
1393 *----------------------------------------------------------------------------*/
1395 int32
float32_to_int32( float32 a STATUS_PARAM
)
1398 int_fast16_t aExp
, shiftCount
;
1402 a
= float32_squash_input_denormal(a STATUS_VAR
);
1403 aSig
= extractFloat32Frac( a
);
1404 aExp
= extractFloat32Exp( a
);
1405 aSign
= extractFloat32Sign( a
);
1406 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1407 if ( aExp
) aSig
|= 0x00800000;
1408 shiftCount
= 0xAF - aExp
;
1411 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1412 return roundAndPackInt32( aSign
, aSig64 STATUS_VAR
);
1416 /*----------------------------------------------------------------------------
1417 | Returns the result of converting the single-precision floating-point value
1418 | `a' to the 32-bit two's complement integer format. The conversion is
1419 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1420 | Arithmetic, except that the conversion is always rounded toward zero.
1421 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1422 | the conversion overflows, the largest integer with the same sign as `a' is
1424 *----------------------------------------------------------------------------*/
1426 int32
float32_to_int32_round_to_zero( float32 a STATUS_PARAM
)
1429 int_fast16_t aExp
, shiftCount
;
1432 a
= float32_squash_input_denormal(a STATUS_VAR
);
1434 aSig
= extractFloat32Frac( a
);
1435 aExp
= extractFloat32Exp( a
);
1436 aSign
= extractFloat32Sign( a
);
1437 shiftCount
= aExp
- 0x9E;
1438 if ( 0 <= shiftCount
) {
1439 if ( float32_val(a
) != 0xCF000000 ) {
1440 float_raise( float_flag_invalid STATUS_VAR
);
1441 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1443 return (int32_t) 0x80000000;
1445 else if ( aExp
<= 0x7E ) {
1446 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1449 aSig
= ( aSig
| 0x00800000 )<<8;
1450 z
= aSig
>>( - shiftCount
);
1451 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1452 STATUS(float_exception_flags
) |= float_flag_inexact
;
1454 if ( aSign
) z
= - z
;
1459 /*----------------------------------------------------------------------------
1460 | Returns the result of converting the single-precision floating-point value
1461 | `a' to the 16-bit two's complement integer format. The conversion is
1462 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1463 | Arithmetic, except that the conversion is always rounded toward zero.
1464 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1465 | the conversion overflows, the largest integer with the same sign as `a' is
1467 *----------------------------------------------------------------------------*/
1469 int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM
)
1472 int_fast16_t aExp
, shiftCount
;
1476 aSig
= extractFloat32Frac( a
);
1477 aExp
= extractFloat32Exp( a
);
1478 aSign
= extractFloat32Sign( a
);
1479 shiftCount
= aExp
- 0x8E;
1480 if ( 0 <= shiftCount
) {
1481 if ( float32_val(a
) != 0xC7000000 ) {
1482 float_raise( float_flag_invalid STATUS_VAR
);
1483 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1487 return (int32_t) 0xffff8000;
1489 else if ( aExp
<= 0x7E ) {
1490 if ( aExp
| aSig
) {
1491 STATUS(float_exception_flags
) |= float_flag_inexact
;
1496 aSig
= ( aSig
| 0x00800000 )<<8;
1497 z
= aSig
>>( - shiftCount
);
1498 if ( (uint32_t) ( aSig
<<( shiftCount
& 31 ) ) ) {
1499 STATUS(float_exception_flags
) |= float_flag_inexact
;
1508 /*----------------------------------------------------------------------------
1509 | Returns the result of converting the single-precision floating-point value
1510 | `a' to the 64-bit two's complement integer format. The conversion is
1511 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1512 | Arithmetic---which means in particular that the conversion is rounded
1513 | according to the current rounding mode. If `a' is a NaN, the largest
1514 | positive integer is returned. Otherwise, if the conversion overflows, the
1515 | largest integer with the same sign as `a' is returned.
1516 *----------------------------------------------------------------------------*/
1518 int64
float32_to_int64( float32 a STATUS_PARAM
)
1521 int_fast16_t aExp
, shiftCount
;
1523 uint64_t aSig64
, aSigExtra
;
1524 a
= float32_squash_input_denormal(a STATUS_VAR
);
1526 aSig
= extractFloat32Frac( a
);
1527 aExp
= extractFloat32Exp( a
);
1528 aSign
= extractFloat32Sign( a
);
1529 shiftCount
= 0xBE - aExp
;
1530 if ( shiftCount
< 0 ) {
1531 float_raise( float_flag_invalid STATUS_VAR
);
1532 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1533 return LIT64( 0x7FFFFFFFFFFFFFFF );
1535 return (int64_t) LIT64( 0x8000000000000000 );
1537 if ( aExp
) aSig
|= 0x00800000;
1540 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1541 return roundAndPackInt64( aSign
, aSig64
, aSigExtra STATUS_VAR
);
1545 /*----------------------------------------------------------------------------
1546 | Returns the result of converting the single-precision floating-point value
1547 | `a' to the 64-bit unsigned integer format. The conversion is
1548 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1549 | Arithmetic---which means in particular that the conversion is rounded
1550 | according to the current rounding mode. If `a' is a NaN, the largest
1551 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1552 | largest unsigned integer is returned. If the 'a' is negative, the result
1553 | is rounded and zero is returned; values that do not round to zero will
1554 | raise the inexact exception flag.
1555 *----------------------------------------------------------------------------*/
1557 uint64
float32_to_uint64(float32 a STATUS_PARAM
)
1560 int_fast16_t aExp
, shiftCount
;
1562 uint64_t aSig64
, aSigExtra
;
1563 a
= float32_squash_input_denormal(a STATUS_VAR
);
1565 aSig
= extractFloat32Frac(a
);
1566 aExp
= extractFloat32Exp(a
);
1567 aSign
= extractFloat32Sign(a
);
1568 if ((aSign
) && (aExp
> 126)) {
1569 float_raise(float_flag_invalid STATUS_VAR
);
1570 if (float32_is_any_nan(a
)) {
1571 return LIT64(0xFFFFFFFFFFFFFFFF);
1576 shiftCount
= 0xBE - aExp
;
1580 if (shiftCount
< 0) {
1581 float_raise(float_flag_invalid STATUS_VAR
);
1582 return LIT64(0xFFFFFFFFFFFFFFFF);
1587 shift64ExtraRightJamming(aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1588 return roundAndPackUint64(aSign
, aSig64
, aSigExtra STATUS_VAR
);
1591 /*----------------------------------------------------------------------------
1592 | Returns the result of converting the single-precision floating-point value
1593 | `a' to the 64-bit two's complement integer format. The conversion is
1594 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1595 | Arithmetic, except that the conversion is always rounded toward zero. If
1596 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1597 | conversion overflows, the largest integer with the same sign as `a' is
1599 *----------------------------------------------------------------------------*/
1601 int64
float32_to_int64_round_to_zero( float32 a STATUS_PARAM
)
1604 int_fast16_t aExp
, shiftCount
;
1608 a
= float32_squash_input_denormal(a STATUS_VAR
);
1610 aSig
= extractFloat32Frac( a
);
1611 aExp
= extractFloat32Exp( a
);
1612 aSign
= extractFloat32Sign( a
);
1613 shiftCount
= aExp
- 0xBE;
1614 if ( 0 <= shiftCount
) {
1615 if ( float32_val(a
) != 0xDF000000 ) {
1616 float_raise( float_flag_invalid STATUS_VAR
);
1617 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1618 return LIT64( 0x7FFFFFFFFFFFFFFF );
1621 return (int64_t) LIT64( 0x8000000000000000 );
1623 else if ( aExp
<= 0x7E ) {
1624 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
1627 aSig64
= aSig
| 0x00800000;
1629 z
= aSig64
>>( - shiftCount
);
1630 if ( (uint64_t) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1631 STATUS(float_exception_flags
) |= float_flag_inexact
;
1633 if ( aSign
) z
= - z
;
1638 /*----------------------------------------------------------------------------
1639 | Returns the result of converting the single-precision floating-point value
1640 | `a' to the double-precision floating-point format. The conversion is
1641 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1643 *----------------------------------------------------------------------------*/
1645 float64
float32_to_float64( float32 a STATUS_PARAM
)
1650 a
= float32_squash_input_denormal(a STATUS_VAR
);
1652 aSig
= extractFloat32Frac( a
);
1653 aExp
= extractFloat32Exp( a
);
1654 aSign
= extractFloat32Sign( a
);
1655 if ( aExp
== 0xFF ) {
1656 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1657 return packFloat64( aSign
, 0x7FF, 0 );
1660 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1661 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1664 return packFloat64( aSign
, aExp
+ 0x380, ( (uint64_t) aSig
)<<29 );
1668 /*----------------------------------------------------------------------------
1669 | Returns the result of converting the single-precision floating-point value
1670 | `a' to the extended double-precision floating-point format. The conversion
1671 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1673 *----------------------------------------------------------------------------*/
1675 floatx80
float32_to_floatx80( float32 a STATUS_PARAM
)
1681 a
= float32_squash_input_denormal(a STATUS_VAR
);
1682 aSig
= extractFloat32Frac( a
);
1683 aExp
= extractFloat32Exp( a
);
1684 aSign
= extractFloat32Sign( a
);
1685 if ( aExp
== 0xFF ) {
1686 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1687 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1690 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1691 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1694 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<40 );
1698 /*----------------------------------------------------------------------------
1699 | Returns the result of converting the single-precision floating-point value
1700 | `a' to the double-precision floating-point format. The conversion is
1701 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1703 *----------------------------------------------------------------------------*/
1705 float128
float32_to_float128( float32 a STATUS_PARAM
)
1711 a
= float32_squash_input_denormal(a STATUS_VAR
);
1712 aSig
= extractFloat32Frac( a
);
1713 aExp
= extractFloat32Exp( a
);
1714 aSign
= extractFloat32Sign( a
);
1715 if ( aExp
== 0xFF ) {
1716 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
1717 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1720 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1721 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1724 return packFloat128( aSign
, aExp
+ 0x3F80, ( (uint64_t) aSig
)<<25, 0 );
1728 /*----------------------------------------------------------------------------
1729 | Rounds the single-precision floating-point value `a' to an integer, and
1730 | returns the result as a single-precision floating-point value. The
1731 | operation is performed according to the IEC/IEEE Standard for Binary
1732 | Floating-Point Arithmetic.
1733 *----------------------------------------------------------------------------*/
1735 float32
float32_round_to_int( float32 a STATUS_PARAM
)
1739 uint32_t lastBitMask
, roundBitsMask
;
1742 a
= float32_squash_input_denormal(a STATUS_VAR
);
1744 aExp
= extractFloat32Exp( a
);
1745 if ( 0x96 <= aExp
) {
1746 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1747 return propagateFloat32NaN( a
, a STATUS_VAR
);
1751 if ( aExp
<= 0x7E ) {
1752 if ( (uint32_t) ( float32_val(a
)<<1 ) == 0 ) return a
;
1753 STATUS(float_exception_flags
) |= float_flag_inexact
;
1754 aSign
= extractFloat32Sign( a
);
1755 switch ( STATUS(float_rounding_mode
) ) {
1756 case float_round_nearest_even
:
1757 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1758 return packFloat32( aSign
, 0x7F, 0 );
1761 case float_round_down
:
1762 return make_float32(aSign
? 0xBF800000 : 0);
1763 case float_round_up
:
1764 return make_float32(aSign
? 0x80000000 : 0x3F800000);
1766 return packFloat32( aSign
, 0, 0 );
1769 lastBitMask
<<= 0x96 - aExp
;
1770 roundBitsMask
= lastBitMask
- 1;
1772 roundingMode
= STATUS(float_rounding_mode
);
1773 if ( roundingMode
== float_round_nearest_even
) {
1774 z
+= lastBitMask
>>1;
1775 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1777 else if ( roundingMode
!= float_round_to_zero
) {
1778 if ( extractFloat32Sign( make_float32(z
) ) ^ ( roundingMode
== float_round_up
) ) {
1782 z
&= ~ roundBitsMask
;
1783 if ( z
!= float32_val(a
) ) STATUS(float_exception_flags
) |= float_flag_inexact
;
1784 return make_float32(z
);
1788 /*----------------------------------------------------------------------------
1789 | Returns the result of adding the absolute values of the single-precision
1790 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1791 | before being returned. `zSign' is ignored if the result is a NaN.
1792 | The addition is performed according to the IEC/IEEE Standard for Binary
1793 | Floating-Point Arithmetic.
1794 *----------------------------------------------------------------------------*/
1796 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1798 int_fast16_t aExp
, bExp
, zExp
;
1799 uint32_t aSig
, bSig
, zSig
;
1800 int_fast16_t expDiff
;
1802 aSig
= extractFloat32Frac( a
);
1803 aExp
= extractFloat32Exp( a
);
1804 bSig
= extractFloat32Frac( b
);
1805 bExp
= extractFloat32Exp( b
);
1806 expDiff
= aExp
- bExp
;
1809 if ( 0 < expDiff
) {
1810 if ( aExp
== 0xFF ) {
1811 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1820 shift32RightJamming( bSig
, expDiff
, &bSig
);
1823 else if ( expDiff
< 0 ) {
1824 if ( bExp
== 0xFF ) {
1825 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1826 return packFloat32( zSign
, 0xFF, 0 );
1834 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1838 if ( aExp
== 0xFF ) {
1839 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1843 if (STATUS(flush_to_zero
)) {
1845 float_raise(float_flag_output_denormal STATUS_VAR
);
1847 return packFloat32(zSign
, 0, 0);
1849 return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1851 zSig
= 0x40000000 + aSig
+ bSig
;
1856 zSig
= ( aSig
+ bSig
)<<1;
1858 if ( (int32_t) zSig
< 0 ) {
1863 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1867 /*----------------------------------------------------------------------------
1868 | Returns the result of subtracting the absolute values of the single-
1869 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1870 | difference is negated before being returned. `zSign' is ignored if the
1871 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1872 | Standard for Binary Floating-Point Arithmetic.
1873 *----------------------------------------------------------------------------*/
1875 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign STATUS_PARAM
)
1877 int_fast16_t aExp
, bExp
, zExp
;
1878 uint32_t aSig
, bSig
, zSig
;
1879 int_fast16_t expDiff
;
1881 aSig
= extractFloat32Frac( a
);
1882 aExp
= extractFloat32Exp( a
);
1883 bSig
= extractFloat32Frac( b
);
1884 bExp
= extractFloat32Exp( b
);
1885 expDiff
= aExp
- bExp
;
1888 if ( 0 < expDiff
) goto aExpBigger
;
1889 if ( expDiff
< 0 ) goto bExpBigger
;
1890 if ( aExp
== 0xFF ) {
1891 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1892 float_raise( float_flag_invalid STATUS_VAR
);
1893 return float32_default_nan
;
1899 if ( bSig
< aSig
) goto aBigger
;
1900 if ( aSig
< bSig
) goto bBigger
;
1901 return packFloat32( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
1903 if ( bExp
== 0xFF ) {
1904 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1905 return packFloat32( zSign
^ 1, 0xFF, 0 );
1913 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1919 goto normalizeRoundAndPack
;
1921 if ( aExp
== 0xFF ) {
1922 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
1931 shift32RightJamming( bSig
, expDiff
, &bSig
);
1936 normalizeRoundAndPack
:
1938 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
1942 /*----------------------------------------------------------------------------
1943 | Returns the result of adding the single-precision floating-point values `a'
1944 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1945 | Binary Floating-Point Arithmetic.
1946 *----------------------------------------------------------------------------*/
1948 float32
float32_add( float32 a
, float32 b STATUS_PARAM
)
1951 a
= float32_squash_input_denormal(a STATUS_VAR
);
1952 b
= float32_squash_input_denormal(b STATUS_VAR
);
1954 aSign
= extractFloat32Sign( a
);
1955 bSign
= extractFloat32Sign( b
);
1956 if ( aSign
== bSign
) {
1957 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1960 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1965 /*----------------------------------------------------------------------------
1966 | Returns the result of subtracting the single-precision floating-point values
1967 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1968 | for Binary Floating-Point Arithmetic.
1969 *----------------------------------------------------------------------------*/
1971 float32
float32_sub( float32 a
, float32 b STATUS_PARAM
)
1974 a
= float32_squash_input_denormal(a STATUS_VAR
);
1975 b
= float32_squash_input_denormal(b STATUS_VAR
);
1977 aSign
= extractFloat32Sign( a
);
1978 bSign
= extractFloat32Sign( b
);
1979 if ( aSign
== bSign
) {
1980 return subFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1983 return addFloat32Sigs( a
, b
, aSign STATUS_VAR
);
1988 /*----------------------------------------------------------------------------
1989 | Returns the result of multiplying the single-precision floating-point values
1990 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1991 | for Binary Floating-Point Arithmetic.
1992 *----------------------------------------------------------------------------*/
1994 float32
float32_mul( float32 a
, float32 b STATUS_PARAM
)
1996 flag aSign
, bSign
, zSign
;
1997 int_fast16_t aExp
, bExp
, zExp
;
1998 uint32_t aSig
, bSig
;
2002 a
= float32_squash_input_denormal(a STATUS_VAR
);
2003 b
= float32_squash_input_denormal(b STATUS_VAR
);
2005 aSig
= extractFloat32Frac( a
);
2006 aExp
= extractFloat32Exp( a
);
2007 aSign
= extractFloat32Sign( a
);
2008 bSig
= extractFloat32Frac( b
);
2009 bExp
= extractFloat32Exp( b
);
2010 bSign
= extractFloat32Sign( b
);
2011 zSign
= aSign
^ bSign
;
2012 if ( aExp
== 0xFF ) {
2013 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2014 return propagateFloat32NaN( a
, b STATUS_VAR
);
2016 if ( ( bExp
| bSig
) == 0 ) {
2017 float_raise( float_flag_invalid STATUS_VAR
);
2018 return float32_default_nan
;
2020 return packFloat32( zSign
, 0xFF, 0 );
2022 if ( bExp
== 0xFF ) {
2023 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2024 if ( ( aExp
| aSig
) == 0 ) {
2025 float_raise( float_flag_invalid STATUS_VAR
);
2026 return float32_default_nan
;
2028 return packFloat32( zSign
, 0xFF, 0 );
2031 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2032 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2035 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2036 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2038 zExp
= aExp
+ bExp
- 0x7F;
2039 aSig
= ( aSig
| 0x00800000 )<<7;
2040 bSig
= ( bSig
| 0x00800000 )<<8;
2041 shift64RightJamming( ( (uint64_t) aSig
) * bSig
, 32, &zSig64
);
2043 if ( 0 <= (int32_t) ( zSig
<<1 ) ) {
2047 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2051 /*----------------------------------------------------------------------------
2052 | Returns the result of dividing the single-precision floating-point value `a'
2053 | by the corresponding value `b'. The operation is performed according to the
2054 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2055 *----------------------------------------------------------------------------*/
2057 float32
float32_div( float32 a
, float32 b STATUS_PARAM
)
2059 flag aSign
, bSign
, zSign
;
2060 int_fast16_t aExp
, bExp
, zExp
;
2061 uint32_t aSig
, bSig
, zSig
;
2062 a
= float32_squash_input_denormal(a STATUS_VAR
);
2063 b
= float32_squash_input_denormal(b STATUS_VAR
);
2065 aSig
= extractFloat32Frac( a
);
2066 aExp
= extractFloat32Exp( a
);
2067 aSign
= extractFloat32Sign( a
);
2068 bSig
= extractFloat32Frac( b
);
2069 bExp
= extractFloat32Exp( b
);
2070 bSign
= extractFloat32Sign( b
);
2071 zSign
= aSign
^ bSign
;
2072 if ( aExp
== 0xFF ) {
2073 if ( aSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2074 if ( bExp
== 0xFF ) {
2075 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2076 float_raise( float_flag_invalid STATUS_VAR
);
2077 return float32_default_nan
;
2079 return packFloat32( zSign
, 0xFF, 0 );
2081 if ( bExp
== 0xFF ) {
2082 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2083 return packFloat32( zSign
, 0, 0 );
2087 if ( ( aExp
| aSig
) == 0 ) {
2088 float_raise( float_flag_invalid STATUS_VAR
);
2089 return float32_default_nan
;
2091 float_raise( float_flag_divbyzero STATUS_VAR
);
2092 return packFloat32( zSign
, 0xFF, 0 );
2094 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2097 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
2098 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2100 zExp
= aExp
- bExp
+ 0x7D;
2101 aSig
= ( aSig
| 0x00800000 )<<7;
2102 bSig
= ( bSig
| 0x00800000 )<<8;
2103 if ( bSig
<= ( aSig
+ aSig
) ) {
2107 zSig
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2108 if ( ( zSig
& 0x3F ) == 0 ) {
2109 zSig
|= ( (uint64_t) bSig
* zSig
!= ( (uint64_t) aSig
)<<32 );
2111 return roundAndPackFloat32( zSign
, zExp
, zSig STATUS_VAR
);
2115 /*----------------------------------------------------------------------------
2116 | Returns the remainder of the single-precision floating-point value `a'
2117 | with respect to the corresponding value `b'. The operation is performed
2118 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2119 *----------------------------------------------------------------------------*/
2121 float32
float32_rem( float32 a
, float32 b STATUS_PARAM
)
2124 int_fast16_t aExp
, bExp
, expDiff
;
2125 uint32_t aSig
, bSig
;
2127 uint64_t aSig64
, bSig64
, q64
;
2128 uint32_t alternateASig
;
2130 a
= float32_squash_input_denormal(a STATUS_VAR
);
2131 b
= float32_squash_input_denormal(b STATUS_VAR
);
2133 aSig
= extractFloat32Frac( a
);
2134 aExp
= extractFloat32Exp( a
);
2135 aSign
= extractFloat32Sign( a
);
2136 bSig
= extractFloat32Frac( b
);
2137 bExp
= extractFloat32Exp( b
);
2138 if ( aExp
== 0xFF ) {
2139 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
2140 return propagateFloat32NaN( a
, b STATUS_VAR
);
2142 float_raise( float_flag_invalid STATUS_VAR
);
2143 return float32_default_nan
;
2145 if ( bExp
== 0xFF ) {
2146 if ( bSig
) return propagateFloat32NaN( a
, b STATUS_VAR
);
2151 float_raise( float_flag_invalid STATUS_VAR
);
2152 return float32_default_nan
;
2154 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
2157 if ( aSig
== 0 ) return a
;
2158 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2160 expDiff
= aExp
- bExp
;
2163 if ( expDiff
< 32 ) {
2166 if ( expDiff
< 0 ) {
2167 if ( expDiff
< -1 ) return a
;
2170 q
= ( bSig
<= aSig
);
2171 if ( q
) aSig
-= bSig
;
2172 if ( 0 < expDiff
) {
2173 q
= ( ( (uint64_t) aSig
)<<32 ) / bSig
;
2176 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2184 if ( bSig
<= aSig
) aSig
-= bSig
;
2185 aSig64
= ( (uint64_t) aSig
)<<40;
2186 bSig64
= ( (uint64_t) bSig
)<<40;
2188 while ( 0 < expDiff
) {
2189 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2190 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2191 aSig64
= - ( ( bSig
* q64
)<<38 );
2195 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
2196 q64
= ( 2 < q64
) ? q64
- 2 : 0;
2197 q
= q64
>>( 64 - expDiff
);
2199 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
2202 alternateASig
= aSig
;
2205 } while ( 0 <= (int32_t) aSig
);
2206 sigMean
= aSig
+ alternateASig
;
2207 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2208 aSig
= alternateASig
;
2210 zSign
= ( (int32_t) aSig
< 0 );
2211 if ( zSign
) aSig
= - aSig
;
2212 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
2216 /*----------------------------------------------------------------------------
2217 | Returns the result of multiplying the single-precision floating-point values
2218 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2219 | multiplication. The operation is performed according to the IEC/IEEE
2220 | Standard for Binary Floating-Point Arithmetic 754-2008.
2221 | The flags argument allows the caller to select negation of the
2222 | addend, the intermediate product, or the final result. (The difference
2223 | between this and having the caller do a separate negation is that negating
2224 | externally will flip the sign bit on NaNs.)
2225 *----------------------------------------------------------------------------*/
2227 float32
float32_muladd(float32 a
, float32 b
, float32 c
, int flags STATUS_PARAM
)
2229 flag aSign
, bSign
, cSign
, zSign
;
2230 int_fast16_t aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
2231 uint32_t aSig
, bSig
, cSig
;
2232 flag pInf
, pZero
, pSign
;
2233 uint64_t pSig64
, cSig64
, zSig64
;
2236 flag signflip
, infzero
;
2238 a
= float32_squash_input_denormal(a STATUS_VAR
);
2239 b
= float32_squash_input_denormal(b STATUS_VAR
);
2240 c
= float32_squash_input_denormal(c STATUS_VAR
);
2241 aSig
= extractFloat32Frac(a
);
2242 aExp
= extractFloat32Exp(a
);
2243 aSign
= extractFloat32Sign(a
);
2244 bSig
= extractFloat32Frac(b
);
2245 bExp
= extractFloat32Exp(b
);
2246 bSign
= extractFloat32Sign(b
);
2247 cSig
= extractFloat32Frac(c
);
2248 cExp
= extractFloat32Exp(c
);
2249 cSign
= extractFloat32Sign(c
);
2251 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0xff && bSig
== 0) ||
2252 (aExp
== 0xff && aSig
== 0 && bExp
== 0 && bSig
== 0));
2254 /* It is implementation-defined whether the cases of (0,inf,qnan)
2255 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2256 * they return if they do), so we have to hand this information
2257 * off to the target-specific pick-a-NaN routine.
2259 if (((aExp
== 0xff) && aSig
) ||
2260 ((bExp
== 0xff) && bSig
) ||
2261 ((cExp
== 0xff) && cSig
)) {
2262 return propagateFloat32MulAddNaN(a
, b
, c
, infzero STATUS_VAR
);
2266 float_raise(float_flag_invalid STATUS_VAR
);
2267 return float32_default_nan
;
2270 if (flags
& float_muladd_negate_c
) {
2274 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
2276 /* Work out the sign and type of the product */
2277 pSign
= aSign
^ bSign
;
2278 if (flags
& float_muladd_negate_product
) {
2281 pInf
= (aExp
== 0xff) || (bExp
== 0xff);
2282 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
2285 if (pInf
&& (pSign
^ cSign
)) {
2286 /* addition of opposite-signed infinities => InvalidOperation */
2287 float_raise(float_flag_invalid STATUS_VAR
);
2288 return float32_default_nan
;
2290 /* Otherwise generate an infinity of the same sign */
2291 return packFloat32(cSign
^ signflip
, 0xff, 0);
2295 return packFloat32(pSign
^ signflip
, 0xff, 0);
2301 /* Adding two exact zeroes */
2302 if (pSign
== cSign
) {
2304 } else if (STATUS(float_rounding_mode
) == float_round_down
) {
2309 return packFloat32(zSign
^ signflip
, 0, 0);
2311 /* Exact zero plus a denorm */
2312 if (STATUS(flush_to_zero
)) {
2313 float_raise(float_flag_output_denormal STATUS_VAR
);
2314 return packFloat32(cSign
^ signflip
, 0, 0);
2317 /* Zero plus something non-zero : just return the something */
2318 return packFloat32(cSign
^ signflip
, cExp
, cSig
);
2322 normalizeFloat32Subnormal(aSig
, &aExp
, &aSig
);
2325 normalizeFloat32Subnormal(bSig
, &bExp
, &bSig
);
2328 /* Calculate the actual result a * b + c */
2330 /* Multiply first; this is easy. */
2331 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2332 * because we want the true exponent, not the "one-less-than"
2333 * flavour that roundAndPackFloat32() takes.
2335 pExp
= aExp
+ bExp
- 0x7e;
2336 aSig
= (aSig
| 0x00800000) << 7;
2337 bSig
= (bSig
| 0x00800000) << 8;
2338 pSig64
= (uint64_t)aSig
* bSig
;
2339 if ((int64_t)(pSig64
<< 1) >= 0) {
2344 zSign
= pSign
^ signflip
;
2346 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2351 /* Throw out the special case of c being an exact zero now */
2352 shift64RightJamming(pSig64
, 32, &pSig64
);
2354 return roundAndPackFloat32(zSign
, pExp
- 1,
2357 normalizeFloat32Subnormal(cSig
, &cExp
, &cSig
);
2360 cSig64
= (uint64_t)cSig
<< (62 - 23);
2361 cSig64
|= LIT64(0x4000000000000000);
2362 expDiff
= pExp
- cExp
;
2364 if (pSign
== cSign
) {
2367 /* scale c to match p */
2368 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2370 } else if (expDiff
< 0) {
2371 /* scale p to match c */
2372 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2375 /* no scaling needed */
2378 /* Add significands and make sure explicit bit ends up in posn 62 */
2379 zSig64
= pSig64
+ cSig64
;
2380 if ((int64_t)zSig64
< 0) {
2381 shift64RightJamming(zSig64
, 1, &zSig64
);
2388 shift64RightJamming(cSig64
, expDiff
, &cSig64
);
2389 zSig64
= pSig64
- cSig64
;
2391 } else if (expDiff
< 0) {
2392 shift64RightJamming(pSig64
, -expDiff
, &pSig64
);
2393 zSig64
= cSig64
- pSig64
;
2398 if (cSig64
< pSig64
) {
2399 zSig64
= pSig64
- cSig64
;
2400 } else if (pSig64
< cSig64
) {
2401 zSig64
= cSig64
- pSig64
;
2406 if (STATUS(float_rounding_mode
) == float_round_down
) {
2409 return packFloat32(zSign
, 0, 0);
2413 /* Normalize to put the explicit bit back into bit 62. */
2414 shiftcount
= countLeadingZeros64(zSig64
) - 1;
2415 zSig64
<<= shiftcount
;
2418 shift64RightJamming(zSig64
, 32, &zSig64
);
2419 return roundAndPackFloat32(zSign
, zExp
, zSig64 STATUS_VAR
);
2423 /*----------------------------------------------------------------------------
2424 | Returns the square root of the single-precision floating-point value `a'.
2425 | The operation is performed according to the IEC/IEEE Standard for Binary
2426 | Floating-Point Arithmetic.
2427 *----------------------------------------------------------------------------*/
2429 float32
float32_sqrt( float32 a STATUS_PARAM
)
2432 int_fast16_t aExp
, zExp
;
2433 uint32_t aSig
, zSig
;
2435 a
= float32_squash_input_denormal(a STATUS_VAR
);
2437 aSig
= extractFloat32Frac( a
);
2438 aExp
= extractFloat32Exp( a
);
2439 aSign
= extractFloat32Sign( a
);
2440 if ( aExp
== 0xFF ) {
2441 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2442 if ( ! aSign
) return a
;
2443 float_raise( float_flag_invalid STATUS_VAR
);
2444 return float32_default_nan
;
2447 if ( ( aExp
| aSig
) == 0 ) return a
;
2448 float_raise( float_flag_invalid STATUS_VAR
);
2449 return float32_default_nan
;
2452 if ( aSig
== 0 ) return float32_zero
;
2453 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2455 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
2456 aSig
= ( aSig
| 0x00800000 )<<8;
2457 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
2458 if ( ( zSig
& 0x7F ) <= 5 ) {
2464 term
= ( (uint64_t) zSig
) * zSig
;
2465 rem
= ( ( (uint64_t) aSig
)<<32 ) - term
;
2466 while ( (int64_t) rem
< 0 ) {
2468 rem
+= ( ( (uint64_t) zSig
)<<1 ) | 1;
2470 zSig
|= ( rem
!= 0 );
2472 shift32RightJamming( zSig
, 1, &zSig
);
2474 return roundAndPackFloat32( 0, zExp
, zSig STATUS_VAR
);
2478 /*----------------------------------------------------------------------------
2479 | Returns the binary exponential of the single-precision floating-point value
2480 | `a'. The operation is performed according to the IEC/IEEE Standard for
2481 | Binary Floating-Point Arithmetic.
2483 | Uses the following identities:
2485 | 1. -------------------------------------------------------------------------
2489 | 2. -------------------------------------------------------------------------
2492 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2494 *----------------------------------------------------------------------------*/
2496 static const float64 float32_exp2_coefficients
[15] =
2498 const_float64( 0x3ff0000000000000ll
), /* 1 */
2499 const_float64( 0x3fe0000000000000ll
), /* 2 */
2500 const_float64( 0x3fc5555555555555ll
), /* 3 */
2501 const_float64( 0x3fa5555555555555ll
), /* 4 */
2502 const_float64( 0x3f81111111111111ll
), /* 5 */
2503 const_float64( 0x3f56c16c16c16c17ll
), /* 6 */
2504 const_float64( 0x3f2a01a01a01a01all
), /* 7 */
2505 const_float64( 0x3efa01a01a01a01all
), /* 8 */
2506 const_float64( 0x3ec71de3a556c734ll
), /* 9 */
2507 const_float64( 0x3e927e4fb7789f5cll
), /* 10 */
2508 const_float64( 0x3e5ae64567f544e4ll
), /* 11 */
2509 const_float64( 0x3e21eed8eff8d898ll
), /* 12 */
2510 const_float64( 0x3de6124613a86d09ll
), /* 13 */
2511 const_float64( 0x3da93974a8c07c9dll
), /* 14 */
2512 const_float64( 0x3d6ae7f3e733b81fll
), /* 15 */
2515 float32
float32_exp2( float32 a STATUS_PARAM
)
2522 a
= float32_squash_input_denormal(a STATUS_VAR
);
2524 aSig
= extractFloat32Frac( a
);
2525 aExp
= extractFloat32Exp( a
);
2526 aSign
= extractFloat32Sign( a
);
2528 if ( aExp
== 0xFF) {
2529 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2530 return (aSign
) ? float32_zero
: a
;
2533 if (aSig
== 0) return float32_one
;
2536 float_raise( float_flag_inexact STATUS_VAR
);
2538 /* ******************************* */
2539 /* using float64 for approximation */
2540 /* ******************************* */
2541 x
= float32_to_float64(a STATUS_VAR
);
2542 x
= float64_mul(x
, float64_ln2 STATUS_VAR
);
2546 for (i
= 0 ; i
< 15 ; i
++) {
2549 f
= float64_mul(xn
, float32_exp2_coefficients
[i
] STATUS_VAR
);
2550 r
= float64_add(r
, f STATUS_VAR
);
2552 xn
= float64_mul(xn
, x STATUS_VAR
);
2555 return float64_to_float32(r
, status
);
2558 /*----------------------------------------------------------------------------
2559 | Returns the binary log of the single-precision floating-point value `a'.
2560 | The operation is performed according to the IEC/IEEE Standard for Binary
2561 | Floating-Point Arithmetic.
2562 *----------------------------------------------------------------------------*/
2563 float32
float32_log2( float32 a STATUS_PARAM
)
2567 uint32_t aSig
, zSig
, i
;
2569 a
= float32_squash_input_denormal(a STATUS_VAR
);
2570 aSig
= extractFloat32Frac( a
);
2571 aExp
= extractFloat32Exp( a
);
2572 aSign
= extractFloat32Sign( a
);
2575 if ( aSig
== 0 ) return packFloat32( 1, 0xFF, 0 );
2576 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
2579 float_raise( float_flag_invalid STATUS_VAR
);
2580 return float32_default_nan
;
2582 if ( aExp
== 0xFF ) {
2583 if ( aSig
) return propagateFloat32NaN( a
, float32_zero STATUS_VAR
);
2592 for (i
= 1 << 22; i
> 0; i
>>= 1) {
2593 aSig
= ( (uint64_t)aSig
* aSig
) >> 23;
2594 if ( aSig
& 0x01000000 ) {
2603 return normalizeRoundAndPackFloat32( zSign
, 0x85, zSig STATUS_VAR
);
2606 /*----------------------------------------------------------------------------
2607 | Returns 1 if the single-precision floating-point value `a' is equal to
2608 | the corresponding value `b', and 0 otherwise. The invalid exception is
2609 | raised if either operand is a NaN. Otherwise, the comparison is performed
2610 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2611 *----------------------------------------------------------------------------*/
2613 int float32_eq( float32 a
, float32 b STATUS_PARAM
)
2616 a
= float32_squash_input_denormal(a STATUS_VAR
);
2617 b
= float32_squash_input_denormal(b STATUS_VAR
);
2619 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2620 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2622 float_raise( float_flag_invalid STATUS_VAR
);
2625 av
= float32_val(a
);
2626 bv
= float32_val(b
);
2627 return ( av
== bv
) || ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2630 /*----------------------------------------------------------------------------
2631 | Returns 1 if the single-precision floating-point value `a' is less than
2632 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2633 | exception is raised if either operand is a NaN. The comparison is performed
2634 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2635 *----------------------------------------------------------------------------*/
2637 int float32_le( float32 a
, float32 b STATUS_PARAM
)
2641 a
= float32_squash_input_denormal(a STATUS_VAR
);
2642 b
= float32_squash_input_denormal(b STATUS_VAR
);
2644 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2645 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2647 float_raise( float_flag_invalid STATUS_VAR
);
2650 aSign
= extractFloat32Sign( a
);
2651 bSign
= extractFloat32Sign( b
);
2652 av
= float32_val(a
);
2653 bv
= float32_val(b
);
2654 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2655 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2659 /*----------------------------------------------------------------------------
2660 | Returns 1 if the single-precision floating-point value `a' is less than
2661 | the corresponding value `b', and 0 otherwise. The invalid exception is
2662 | raised if either operand is a NaN. The comparison is performed according
2663 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2664 *----------------------------------------------------------------------------*/
2666 int float32_lt( float32 a
, float32 b STATUS_PARAM
)
2670 a
= float32_squash_input_denormal(a STATUS_VAR
);
2671 b
= float32_squash_input_denormal(b STATUS_VAR
);
2673 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2674 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2676 float_raise( float_flag_invalid STATUS_VAR
);
2679 aSign
= extractFloat32Sign( a
);
2680 bSign
= extractFloat32Sign( b
);
2681 av
= float32_val(a
);
2682 bv
= float32_val(b
);
2683 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2684 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2688 /*----------------------------------------------------------------------------
2689 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2690 | be compared, and 0 otherwise. The invalid exception is raised if either
2691 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2692 | Standard for Binary Floating-Point Arithmetic.
2693 *----------------------------------------------------------------------------*/
2695 int float32_unordered( float32 a
, float32 b STATUS_PARAM
)
2697 a
= float32_squash_input_denormal(a STATUS_VAR
);
2698 b
= float32_squash_input_denormal(b STATUS_VAR
);
2700 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2701 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2703 float_raise( float_flag_invalid STATUS_VAR
);
2709 /*----------------------------------------------------------------------------
2710 | Returns 1 if the single-precision floating-point value `a' is equal to
2711 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2712 | exception. The comparison is performed according to the IEC/IEEE Standard
2713 | for Binary Floating-Point Arithmetic.
2714 *----------------------------------------------------------------------------*/
2716 int float32_eq_quiet( float32 a
, float32 b STATUS_PARAM
)
2718 a
= float32_squash_input_denormal(a STATUS_VAR
);
2719 b
= float32_squash_input_denormal(b STATUS_VAR
);
2721 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2722 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2724 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2725 float_raise( float_flag_invalid STATUS_VAR
);
2729 return ( float32_val(a
) == float32_val(b
) ) ||
2730 ( (uint32_t) ( ( float32_val(a
) | float32_val(b
) )<<1 ) == 0 );
2733 /*----------------------------------------------------------------------------
2734 | Returns 1 if the single-precision floating-point value `a' is less than or
2735 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2736 | cause an exception. Otherwise, the comparison is performed according to the
2737 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2738 *----------------------------------------------------------------------------*/
2740 int float32_le_quiet( float32 a
, float32 b STATUS_PARAM
)
2744 a
= float32_squash_input_denormal(a STATUS_VAR
);
2745 b
= float32_squash_input_denormal(b STATUS_VAR
);
2747 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2748 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2750 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2751 float_raise( float_flag_invalid STATUS_VAR
);
2755 aSign
= extractFloat32Sign( a
);
2756 bSign
= extractFloat32Sign( b
);
2757 av
= float32_val(a
);
2758 bv
= float32_val(b
);
2759 if ( aSign
!= bSign
) return aSign
|| ( (uint32_t) ( ( av
| bv
)<<1 ) == 0 );
2760 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
2764 /*----------------------------------------------------------------------------
2765 | Returns 1 if the single-precision floating-point value `a' is less than
2766 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2767 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2768 | Standard for Binary Floating-Point Arithmetic.
2769 *----------------------------------------------------------------------------*/
2771 int float32_lt_quiet( float32 a
, float32 b STATUS_PARAM
)
2775 a
= float32_squash_input_denormal(a STATUS_VAR
);
2776 b
= float32_squash_input_denormal(b STATUS_VAR
);
2778 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2779 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2781 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2782 float_raise( float_flag_invalid STATUS_VAR
);
2786 aSign
= extractFloat32Sign( a
);
2787 bSign
= extractFloat32Sign( b
);
2788 av
= float32_val(a
);
2789 bv
= float32_val(b
);
2790 if ( aSign
!= bSign
) return aSign
&& ( (uint32_t) ( ( av
| bv
)<<1 ) != 0 );
2791 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
2795 /*----------------------------------------------------------------------------
2796 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2797 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2798 | comparison is performed according to the IEC/IEEE Standard for Binary
2799 | Floating-Point Arithmetic.
2800 *----------------------------------------------------------------------------*/
2802 int float32_unordered_quiet( float32 a
, float32 b STATUS_PARAM
)
2804 a
= float32_squash_input_denormal(a STATUS_VAR
);
2805 b
= float32_squash_input_denormal(b STATUS_VAR
);
2807 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2808 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2810 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2811 float_raise( float_flag_invalid STATUS_VAR
);
2818 /*----------------------------------------------------------------------------
2819 | Returns the result of converting the double-precision floating-point value
2820 | `a' to the 32-bit two's complement integer format. The conversion is
2821 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2822 | Arithmetic---which means in particular that the conversion is rounded
2823 | according to the current rounding mode. If `a' is a NaN, the largest
2824 | positive integer is returned. Otherwise, if the conversion overflows, the
2825 | largest integer with the same sign as `a' is returned.
2826 *----------------------------------------------------------------------------*/
2828 int32
float64_to_int32( float64 a STATUS_PARAM
)
2831 int_fast16_t aExp
, shiftCount
;
2833 a
= float64_squash_input_denormal(a STATUS_VAR
);
2835 aSig
= extractFloat64Frac( a
);
2836 aExp
= extractFloat64Exp( a
);
2837 aSign
= extractFloat64Sign( a
);
2838 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2839 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2840 shiftCount
= 0x42C - aExp
;
2841 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2842 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
2846 /*----------------------------------------------------------------------------
2847 | Returns the result of converting the double-precision floating-point value
2848 | `a' to the 32-bit two's complement integer format. The conversion is
2849 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2850 | Arithmetic, except that the conversion is always rounded toward zero.
2851 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2852 | the conversion overflows, the largest integer with the same sign as `a' is
2854 *----------------------------------------------------------------------------*/
2856 int32
float64_to_int32_round_to_zero( float64 a STATUS_PARAM
)
2859 int_fast16_t aExp
, shiftCount
;
2860 uint64_t aSig
, savedASig
;
2862 a
= float64_squash_input_denormal(a STATUS_VAR
);
2864 aSig
= extractFloat64Frac( a
);
2865 aExp
= extractFloat64Exp( a
);
2866 aSign
= extractFloat64Sign( a
);
2867 if ( 0x41E < aExp
) {
2868 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2871 else if ( aExp
< 0x3FF ) {
2872 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
2875 aSig
|= LIT64( 0x0010000000000000 );
2876 shiftCount
= 0x433 - aExp
;
2878 aSig
>>= shiftCount
;
2880 if ( aSign
) z
= - z
;
2881 if ( ( z
< 0 ) ^ aSign
) {
2883 float_raise( float_flag_invalid STATUS_VAR
);
2884 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
2886 if ( ( aSig
<<shiftCount
) != savedASig
) {
2887 STATUS(float_exception_flags
) |= float_flag_inexact
;
2893 /*----------------------------------------------------------------------------
2894 | Returns the result of converting the double-precision floating-point value
2895 | `a' to the 16-bit two's complement integer format. The conversion is
2896 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2897 | Arithmetic, except that the conversion is always rounded toward zero.
2898 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2899 | the conversion overflows, the largest integer with the same sign as `a' is
2901 *----------------------------------------------------------------------------*/
2903 int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM
)
2906 int_fast16_t aExp
, shiftCount
;
2907 uint64_t aSig
, savedASig
;
2910 aSig
= extractFloat64Frac( a
);
2911 aExp
= extractFloat64Exp( a
);
2912 aSign
= extractFloat64Sign( a
);
2913 if ( 0x40E < aExp
) {
2914 if ( ( aExp
== 0x7FF ) && aSig
) {
2919 else if ( aExp
< 0x3FF ) {
2920 if ( aExp
|| aSig
) {
2921 STATUS(float_exception_flags
) |= float_flag_inexact
;
2925 aSig
|= LIT64( 0x0010000000000000 );
2926 shiftCount
= 0x433 - aExp
;
2928 aSig
>>= shiftCount
;
2933 if ( ( (int16_t)z
< 0 ) ^ aSign
) {
2935 float_raise( float_flag_invalid STATUS_VAR
);
2936 return aSign
? (int32_t) 0xffff8000 : 0x7FFF;
2938 if ( ( aSig
<<shiftCount
) != savedASig
) {
2939 STATUS(float_exception_flags
) |= float_flag_inexact
;
2944 /*----------------------------------------------------------------------------
2945 | Returns the result of converting the double-precision floating-point value
2946 | `a' to the 64-bit two's complement integer format. The conversion is
2947 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2948 | Arithmetic---which means in particular that the conversion is rounded
2949 | according to the current rounding mode. If `a' is a NaN, the largest
2950 | positive integer is returned. Otherwise, if the conversion overflows, the
2951 | largest integer with the same sign as `a' is returned.
2952 *----------------------------------------------------------------------------*/
2954 int64
float64_to_int64( float64 a STATUS_PARAM
)
2957 int_fast16_t aExp
, shiftCount
;
2958 uint64_t aSig
, aSigExtra
;
2959 a
= float64_squash_input_denormal(a STATUS_VAR
);
2961 aSig
= extractFloat64Frac( a
);
2962 aExp
= extractFloat64Exp( a
);
2963 aSign
= extractFloat64Sign( a
);
2964 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2965 shiftCount
= 0x433 - aExp
;
2966 if ( shiftCount
<= 0 ) {
2967 if ( 0x43E < aExp
) {
2968 float_raise( float_flag_invalid STATUS_VAR
);
2970 || ( ( aExp
== 0x7FF )
2971 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2973 return LIT64( 0x7FFFFFFFFFFFFFFF );
2975 return (int64_t) LIT64( 0x8000000000000000 );
2978 aSig
<<= - shiftCount
;
2981 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2983 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
2987 /*----------------------------------------------------------------------------
2988 | Returns the result of converting the double-precision floating-point value
2989 | `a' to the 64-bit two's complement integer format. The conversion is
2990 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2991 | Arithmetic, except that the conversion is always rounded toward zero.
2992 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2993 | the conversion overflows, the largest integer with the same sign as `a' is
2995 *----------------------------------------------------------------------------*/
2997 int64
float64_to_int64_round_to_zero( float64 a STATUS_PARAM
)
3000 int_fast16_t aExp
, shiftCount
;
3003 a
= float64_squash_input_denormal(a STATUS_VAR
);
3005 aSig
= extractFloat64Frac( a
);
3006 aExp
= extractFloat64Exp( a
);
3007 aSign
= extractFloat64Sign( a
);
3008 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
3009 shiftCount
= aExp
- 0x433;
3010 if ( 0 <= shiftCount
) {
3011 if ( 0x43E <= aExp
) {
3012 if ( float64_val(a
) != LIT64( 0xC3E0000000000000 ) ) {
3013 float_raise( float_flag_invalid STATUS_VAR
);
3015 || ( ( aExp
== 0x7FF )
3016 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
3018 return LIT64( 0x7FFFFFFFFFFFFFFF );
3021 return (int64_t) LIT64( 0x8000000000000000 );
3023 z
= aSig
<<shiftCount
;
3026 if ( aExp
< 0x3FE ) {
3027 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
3030 z
= aSig
>>( - shiftCount
);
3031 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
3032 STATUS(float_exception_flags
) |= float_flag_inexact
;
3035 if ( aSign
) z
= - z
;
3040 /*----------------------------------------------------------------------------
3041 | Returns the result of converting the double-precision floating-point value
3042 | `a' to the single-precision floating-point format. The conversion is
3043 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3045 *----------------------------------------------------------------------------*/
3047 float32
float64_to_float32( float64 a STATUS_PARAM
)
3053 a
= float64_squash_input_denormal(a STATUS_VAR
);
3055 aSig
= extractFloat64Frac( a
);
3056 aExp
= extractFloat64Exp( a
);
3057 aSign
= extractFloat64Sign( a
);
3058 if ( aExp
== 0x7FF ) {
3059 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3060 return packFloat32( aSign
, 0xFF, 0 );
3062 shift64RightJamming( aSig
, 22, &aSig
);
3064 if ( aExp
|| zSig
) {
3068 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
3073 /*----------------------------------------------------------------------------
3074 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3075 | half-precision floating-point value, returning the result. After being
3076 | shifted into the proper positions, the three fields are simply added
3077 | together to form the result. This means that any integer portion of `zSig'
3078 | will be added into the exponent. Since a properly normalized significand
3079 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3080 | than the desired result exponent whenever `zSig' is a complete, normalized
3082 *----------------------------------------------------------------------------*/
3083 static float16
packFloat16(flag zSign
, int_fast16_t zExp
, uint16_t zSig
)
3085 return make_float16(
3086 (((uint32_t)zSign
) << 15) + (((uint32_t)zExp
) << 10) + zSig
);
3089 /*----------------------------------------------------------------------------
3090 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3091 | and significand `zSig', and returns the proper half-precision floating-
3092 | point value corresponding to the abstract input. Ordinarily, the abstract
3093 | value is simply rounded and packed into the half-precision format, with
3094 | the inexact exception raised if the abstract input cannot be represented
3095 | exactly. However, if the abstract value is too large, the overflow and
3096 | inexact exceptions are raised and an infinity or maximal finite value is
3097 | returned. If the abstract value is too small, the input value is rounded to
3098 | a subnormal number, and the underflow and inexact exceptions are raised if
3099 | the abstract input cannot be represented exactly as a subnormal half-
3100 | precision floating-point number.
3101 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3102 | ARM-style "alternative representation", which omits the NaN and Inf
3103 | encodings in order to raise the maximum representable exponent by one.
3104 | The input significand `zSig' has its binary point between bits 22
3105 | and 23, which is 13 bits to the left of the usual location. This shifted
3106 | significand must be normalized or smaller. If `zSig' is not normalized,
3107 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3108 | and it must not require rounding. In the usual case that `zSig' is
3109 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3110 | Note the slightly odd position of the binary point in zSig compared with the
3111 | other roundAndPackFloat functions. This should probably be fixed if we
3112 | need to implement more float16 routines than just conversion.
3113 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3114 | Binary Floating-Point Arithmetic.
3115 *----------------------------------------------------------------------------*/
3117 static float32
roundAndPackFloat16(flag zSign
, int_fast16_t zExp
,
3118 uint32_t zSig
, flag ieee STATUS_PARAM
)
3120 int maxexp
= ieee
? 29 : 30;
3124 bool rounding_bumps_exp
;
3125 bool is_tiny
= false;
3127 /* Calculate the mask of bits of the mantissa which are not
3128 * representable in half-precision and will be lost.
3131 /* Will be denormal in halfprec */
3137 /* Normal number in halfprec */
3141 roundingMode
= STATUS(float_rounding_mode
);
3142 switch (roundingMode
) {
3143 case float_round_nearest_even
:
3144 increment
= (mask
+ 1) >> 1;
3145 if ((zSig
& mask
) == increment
) {
3146 increment
= zSig
& (increment
<< 1);
3149 case float_round_up
:
3150 increment
= zSign
? 0 : mask
;
3152 case float_round_down
:
3153 increment
= zSign
? mask
: 0;
3155 default: /* round_to_zero */
3160 rounding_bumps_exp
= (zSig
+ increment
>= 0x01000000);
3162 if (zExp
> maxexp
|| (zExp
== maxexp
&& rounding_bumps_exp
)) {
3164 float_raise(float_flag_overflow
| float_flag_inexact STATUS_VAR
);
3165 return packFloat16(zSign
, 0x1f, 0);
3167 float_raise(float_flag_invalid STATUS_VAR
);
3168 return packFloat16(zSign
, 0x1f, 0x3ff);
3173 /* Note that flush-to-zero does not affect half-precision results */
3175 (STATUS(float_detect_tininess
) == float_tininess_before_rounding
)
3177 || (!rounding_bumps_exp
);
3180 float_raise(float_flag_inexact STATUS_VAR
);
3182 float_raise(float_flag_underflow STATUS_VAR
);
3187 if (rounding_bumps_exp
) {
3193 return packFloat16(zSign
, 0, 0);
3199 return packFloat16(zSign
, zExp
, zSig
>> 13);
3202 static void normalizeFloat16Subnormal(uint32_t aSig
, int_fast16_t *zExpPtr
,
3205 int8_t shiftCount
= countLeadingZeros32(aSig
) - 21;
3206 *zSigPtr
= aSig
<< shiftCount
;
3207 *zExpPtr
= 1 - shiftCount
;
3210 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3211 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3213 float32
float16_to_float32(float16 a
, flag ieee STATUS_PARAM
)
3219 aSign
= extractFloat16Sign(a
);
3220 aExp
= extractFloat16Exp(a
);
3221 aSig
= extractFloat16Frac(a
);
3223 if (aExp
== 0x1f && ieee
) {
3225 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3227 return packFloat32(aSign
, 0xff, 0);
3231 return packFloat32(aSign
, 0, 0);
3234 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3237 return packFloat32( aSign
, aExp
+ 0x70, aSig
<< 13);
3240 float16
float32_to_float16(float32 a
, flag ieee STATUS_PARAM
)
3246 a
= float32_squash_input_denormal(a STATUS_VAR
);
3248 aSig
= extractFloat32Frac( a
);
3249 aExp
= extractFloat32Exp( a
);
3250 aSign
= extractFloat32Sign( a
);
3251 if ( aExp
== 0xFF ) {
3253 /* Input is a NaN */
3255 float_raise(float_flag_invalid STATUS_VAR
);
3256 return packFloat16(aSign
, 0, 0);
3258 return commonNaNToFloat16(
3259 float32ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3263 float_raise(float_flag_invalid STATUS_VAR
);
3264 return packFloat16(aSign
, 0x1f, 0x3ff);
3266 return packFloat16(aSign
, 0x1f, 0);
3268 if (aExp
== 0 && aSig
== 0) {
3269 return packFloat16(aSign
, 0, 0);
3271 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3272 * even if the input is denormal; however this is harmless because
3273 * the largest possible single-precision denormal is still smaller
3274 * than the smallest representable half-precision denormal, and so we
3275 * will end up ignoring aSig and returning via the "always return zero"
3281 return roundAndPackFloat16(aSign
, aExp
, aSig
, ieee STATUS_VAR
);
3284 float64
float16_to_float64(float16 a
, flag ieee STATUS_PARAM
)
3290 aSign
= extractFloat16Sign(a
);
3291 aExp
= extractFloat16Exp(a
);
3292 aSig
= extractFloat16Frac(a
);
3294 if (aExp
== 0x1f && ieee
) {
3296 return commonNaNToFloat64(
3297 float16ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3299 return packFloat64(aSign
, 0x7ff, 0);
3303 return packFloat64(aSign
, 0, 0);
3306 normalizeFloat16Subnormal(aSig
, &aExp
, &aSig
);
3309 return packFloat64(aSign
, aExp
+ 0x3f0, ((uint64_t)aSig
) << 42);
3312 float16
float64_to_float16(float64 a
, flag ieee STATUS_PARAM
)
3319 a
= float64_squash_input_denormal(a STATUS_VAR
);
3321 aSig
= extractFloat64Frac(a
);
3322 aExp
= extractFloat64Exp(a
);
3323 aSign
= extractFloat64Sign(a
);
3324 if (aExp
== 0x7FF) {
3326 /* Input is a NaN */
3328 float_raise(float_flag_invalid STATUS_VAR
);
3329 return packFloat16(aSign
, 0, 0);
3331 return commonNaNToFloat16(
3332 float64ToCommonNaN(a STATUS_VAR
) STATUS_VAR
);
3336 float_raise(float_flag_invalid STATUS_VAR
);
3337 return packFloat16(aSign
, 0x1f, 0x3ff);
3339 return packFloat16(aSign
, 0x1f, 0);
3341 shift64RightJamming(aSig
, 29, &aSig
);
3343 if (aExp
== 0 && zSig
== 0) {
3344 return packFloat16(aSign
, 0, 0);
3346 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3347 * even if the input is denormal; however this is harmless because
3348 * the largest possible single-precision denormal is still smaller
3349 * than the smallest representable half-precision denormal, and so we
3350 * will end up ignoring aSig and returning via the "always return zero"
3356 return roundAndPackFloat16(aSign
, aExp
, zSig
, ieee STATUS_VAR
);
3359 /*----------------------------------------------------------------------------
3360 | Returns the result of converting the double-precision floating-point value
3361 | `a' to the extended double-precision floating-point format. The conversion
3362 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3364 *----------------------------------------------------------------------------*/
3366 floatx80
float64_to_floatx80( float64 a STATUS_PARAM
)
3372 a
= float64_squash_input_denormal(a STATUS_VAR
);
3373 aSig
= extractFloat64Frac( a
);
3374 aExp
= extractFloat64Exp( a
);
3375 aSign
= extractFloat64Sign( a
);
3376 if ( aExp
== 0x7FF ) {
3377 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3378 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3381 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
3382 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3386 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
3390 /*----------------------------------------------------------------------------
3391 | Returns the result of converting the double-precision floating-point value
3392 | `a' to the quadruple-precision floating-point format. The conversion is
3393 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3395 *----------------------------------------------------------------------------*/
3397 float128
float64_to_float128( float64 a STATUS_PARAM
)
3401 uint64_t aSig
, zSig0
, zSig1
;
3403 a
= float64_squash_input_denormal(a STATUS_VAR
);
3404 aSig
= extractFloat64Frac( a
);
3405 aExp
= extractFloat64Exp( a
);
3406 aSign
= extractFloat64Sign( a
);
3407 if ( aExp
== 0x7FF ) {
3408 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
3409 return packFloat128( aSign
, 0x7FFF, 0, 0 );
3412 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
3413 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3416 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
3417 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
3421 /*----------------------------------------------------------------------------
3422 | Rounds the double-precision floating-point value `a' to an integer, and
3423 | returns the result as a double-precision floating-point value. The
3424 | operation is performed according to the IEC/IEEE Standard for Binary
3425 | Floating-Point Arithmetic.
3426 *----------------------------------------------------------------------------*/
3428 float64
float64_round_to_int( float64 a STATUS_PARAM
)
3432 uint64_t lastBitMask
, roundBitsMask
;
3435 a
= float64_squash_input_denormal(a STATUS_VAR
);
3437 aExp
= extractFloat64Exp( a
);
3438 if ( 0x433 <= aExp
) {
3439 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
3440 return propagateFloat64NaN( a
, a STATUS_VAR
);
3444 if ( aExp
< 0x3FF ) {
3445 if ( (uint64_t) ( float64_val(a
)<<1 ) == 0 ) return a
;
3446 STATUS(float_exception_flags
) |= float_flag_inexact
;
3447 aSign
= extractFloat64Sign( a
);
3448 switch ( STATUS(float_rounding_mode
) ) {
3449 case float_round_nearest_even
:
3450 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
3451 return packFloat64( aSign
, 0x3FF, 0 );
3454 case float_round_down
:
3455 return make_float64(aSign
? LIT64( 0xBFF0000000000000 ) : 0);
3456 case float_round_up
:
3457 return make_float64(
3458 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3460 return packFloat64( aSign
, 0, 0 );
3463 lastBitMask
<<= 0x433 - aExp
;
3464 roundBitsMask
= lastBitMask
- 1;
3466 roundingMode
= STATUS(float_rounding_mode
);
3467 if ( roundingMode
== float_round_nearest_even
) {
3468 z
+= lastBitMask
>>1;
3469 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
3471 else if ( roundingMode
!= float_round_to_zero
) {
3472 if ( extractFloat64Sign( make_float64(z
) ) ^ ( roundingMode
== float_round_up
) ) {
3476 z
&= ~ roundBitsMask
;
3477 if ( z
!= float64_val(a
) )
3478 STATUS(float_exception_flags
) |= float_flag_inexact
;
3479 return make_float64(z
);
3483 float64
float64_trunc_to_int( float64 a STATUS_PARAM
)
3487 oldmode
= STATUS(float_rounding_mode
);
3488 STATUS(float_rounding_mode
) = float_round_to_zero
;
3489 res
= float64_round_to_int(a STATUS_VAR
);
3490 STATUS(float_rounding_mode
) = oldmode
;
3494 /*----------------------------------------------------------------------------
3495 | Returns the result of adding the absolute values of the double-precision
3496 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3497 | before being returned. `zSign' is ignored if the result is a NaN.
3498 | The addition is performed according to the IEC/IEEE Standard for Binary
3499 | Floating-Point Arithmetic.
3500 *----------------------------------------------------------------------------*/
3502 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3504 int_fast16_t aExp
, bExp
, zExp
;
3505 uint64_t aSig
, bSig
, zSig
;
3506 int_fast16_t expDiff
;
3508 aSig
= extractFloat64Frac( a
);
3509 aExp
= extractFloat64Exp( a
);
3510 bSig
= extractFloat64Frac( b
);
3511 bExp
= extractFloat64Exp( b
);
3512 expDiff
= aExp
- bExp
;
3515 if ( 0 < expDiff
) {
3516 if ( aExp
== 0x7FF ) {
3517 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3524 bSig
|= LIT64( 0x2000000000000000 );
3526 shift64RightJamming( bSig
, expDiff
, &bSig
);
3529 else if ( expDiff
< 0 ) {
3530 if ( bExp
== 0x7FF ) {
3531 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3532 return packFloat64( zSign
, 0x7FF, 0 );
3538 aSig
|= LIT64( 0x2000000000000000 );
3540 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3544 if ( aExp
== 0x7FF ) {
3545 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3549 if (STATUS(flush_to_zero
)) {
3551 float_raise(float_flag_output_denormal STATUS_VAR
);
3553 return packFloat64(zSign
, 0, 0);
3555 return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
3557 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
3561 aSig
|= LIT64( 0x2000000000000000 );
3562 zSig
= ( aSig
+ bSig
)<<1;
3564 if ( (int64_t) zSig
< 0 ) {
3569 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3573 /*----------------------------------------------------------------------------
3574 | Returns the result of subtracting the absolute values of the double-
3575 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3576 | difference is negated before being returned. `zSign' is ignored if the
3577 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3578 | Standard for Binary Floating-Point Arithmetic.
3579 *----------------------------------------------------------------------------*/
3581 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign STATUS_PARAM
)
3583 int_fast16_t aExp
, bExp
, zExp
;
3584 uint64_t aSig
, bSig
, zSig
;
3585 int_fast16_t expDiff
;
3587 aSig
= extractFloat64Frac( a
);
3588 aExp
= extractFloat64Exp( a
);
3589 bSig
= extractFloat64Frac( b
);
3590 bExp
= extractFloat64Exp( b
);
3591 expDiff
= aExp
- bExp
;
3594 if ( 0 < expDiff
) goto aExpBigger
;
3595 if ( expDiff
< 0 ) goto bExpBigger
;
3596 if ( aExp
== 0x7FF ) {
3597 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3598 float_raise( float_flag_invalid STATUS_VAR
);
3599 return float64_default_nan
;
3605 if ( bSig
< aSig
) goto aBigger
;
3606 if ( aSig
< bSig
) goto bBigger
;
3607 return packFloat64( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
3609 if ( bExp
== 0x7FF ) {
3610 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3611 return packFloat64( zSign
^ 1, 0x7FF, 0 );
3617 aSig
|= LIT64( 0x4000000000000000 );
3619 shift64RightJamming( aSig
, - expDiff
, &aSig
);
3620 bSig
|= LIT64( 0x4000000000000000 );
3625 goto normalizeRoundAndPack
;
3627 if ( aExp
== 0x7FF ) {
3628 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3635 bSig
|= LIT64( 0x4000000000000000 );
3637 shift64RightJamming( bSig
, expDiff
, &bSig
);
3638 aSig
|= LIT64( 0x4000000000000000 );
3642 normalizeRoundAndPack
:
3644 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of adding the double-precision floating-point values `a'
3650 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3651 | Binary Floating-Point Arithmetic.
3652 *----------------------------------------------------------------------------*/
3654 float64
float64_add( float64 a
, float64 b STATUS_PARAM
)
3657 a
= float64_squash_input_denormal(a STATUS_VAR
);
3658 b
= float64_squash_input_denormal(b STATUS_VAR
);
3660 aSign
= extractFloat64Sign( a
);
3661 bSign
= extractFloat64Sign( b
);
3662 if ( aSign
== bSign
) {
3663 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3666 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3671 /*----------------------------------------------------------------------------
3672 | Returns the result of subtracting the double-precision floating-point values
3673 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3674 | for Binary Floating-Point Arithmetic.
3675 *----------------------------------------------------------------------------*/
3677 float64
float64_sub( float64 a
, float64 b STATUS_PARAM
)
3680 a
= float64_squash_input_denormal(a STATUS_VAR
);
3681 b
= float64_squash_input_denormal(b STATUS_VAR
);
3683 aSign
= extractFloat64Sign( a
);
3684 bSign
= extractFloat64Sign( b
);
3685 if ( aSign
== bSign
) {
3686 return subFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3689 return addFloat64Sigs( a
, b
, aSign STATUS_VAR
);
3694 /*----------------------------------------------------------------------------
3695 | Returns the result of multiplying the double-precision floating-point values
3696 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3697 | for Binary Floating-Point Arithmetic.
3698 *----------------------------------------------------------------------------*/
3700 float64
float64_mul( float64 a
, float64 b STATUS_PARAM
)
3702 flag aSign
, bSign
, zSign
;
3703 int_fast16_t aExp
, bExp
, zExp
;
3704 uint64_t aSig
, bSig
, zSig0
, zSig1
;
3706 a
= float64_squash_input_denormal(a STATUS_VAR
);
3707 b
= float64_squash_input_denormal(b STATUS_VAR
);
3709 aSig
= extractFloat64Frac( a
);
3710 aExp
= extractFloat64Exp( a
);
3711 aSign
= extractFloat64Sign( a
);
3712 bSig
= extractFloat64Frac( b
);
3713 bExp
= extractFloat64Exp( b
);
3714 bSign
= extractFloat64Sign( b
);
3715 zSign
= aSign
^ bSign
;
3716 if ( aExp
== 0x7FF ) {
3717 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3718 return propagateFloat64NaN( a
, b STATUS_VAR
);
3720 if ( ( bExp
| bSig
) == 0 ) {
3721 float_raise( float_flag_invalid STATUS_VAR
);
3722 return float64_default_nan
;
3724 return packFloat64( zSign
, 0x7FF, 0 );
3726 if ( bExp
== 0x7FF ) {
3727 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3728 if ( ( aExp
| aSig
) == 0 ) {
3729 float_raise( float_flag_invalid STATUS_VAR
);
3730 return float64_default_nan
;
3732 return packFloat64( zSign
, 0x7FF, 0 );
3735 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3736 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3739 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3740 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3742 zExp
= aExp
+ bExp
- 0x3FF;
3743 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3744 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3745 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3746 zSig0
|= ( zSig1
!= 0 );
3747 if ( 0 <= (int64_t) ( zSig0
<<1 ) ) {
3751 return roundAndPackFloat64( zSign
, zExp
, zSig0 STATUS_VAR
);
3755 /*----------------------------------------------------------------------------
3756 | Returns the result of dividing the double-precision floating-point value `a'
3757 | by the corresponding value `b'. The operation is performed according to
3758 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3759 *----------------------------------------------------------------------------*/
3761 float64
float64_div( float64 a
, float64 b STATUS_PARAM
)
3763 flag aSign
, bSign
, zSign
;
3764 int_fast16_t aExp
, bExp
, zExp
;
3765 uint64_t aSig
, bSig
, zSig
;
3766 uint64_t rem0
, rem1
;
3767 uint64_t term0
, term1
;
3768 a
= float64_squash_input_denormal(a STATUS_VAR
);
3769 b
= float64_squash_input_denormal(b STATUS_VAR
);
3771 aSig
= extractFloat64Frac( a
);
3772 aExp
= extractFloat64Exp( a
);
3773 aSign
= extractFloat64Sign( a
);
3774 bSig
= extractFloat64Frac( b
);
3775 bExp
= extractFloat64Exp( b
);
3776 bSign
= extractFloat64Sign( b
);
3777 zSign
= aSign
^ bSign
;
3778 if ( aExp
== 0x7FF ) {
3779 if ( aSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3780 if ( bExp
== 0x7FF ) {
3781 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3782 float_raise( float_flag_invalid STATUS_VAR
);
3783 return float64_default_nan
;
3785 return packFloat64( zSign
, 0x7FF, 0 );
3787 if ( bExp
== 0x7FF ) {
3788 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3789 return packFloat64( zSign
, 0, 0 );
3793 if ( ( aExp
| aSig
) == 0 ) {
3794 float_raise( float_flag_invalid STATUS_VAR
);
3795 return float64_default_nan
;
3797 float_raise( float_flag_divbyzero STATUS_VAR
);
3798 return packFloat64( zSign
, 0x7FF, 0 );
3800 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3803 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
3804 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3806 zExp
= aExp
- bExp
+ 0x3FD;
3807 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
3808 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3809 if ( bSig
<= ( aSig
+ aSig
) ) {
3813 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
3814 if ( ( zSig
& 0x1FF ) <= 2 ) {
3815 mul64To128( bSig
, zSig
, &term0
, &term1
);
3816 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
3817 while ( (int64_t) rem0
< 0 ) {
3819 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3821 zSig
|= ( rem1
!= 0 );
3823 return roundAndPackFloat64( zSign
, zExp
, zSig STATUS_VAR
);
3827 /*----------------------------------------------------------------------------
3828 | Returns the remainder of the double-precision floating-point value `a'
3829 | with respect to the corresponding value `b'. The operation is performed
3830 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3831 *----------------------------------------------------------------------------*/
3833 float64
float64_rem( float64 a
, float64 b STATUS_PARAM
)
3836 int_fast16_t aExp
, bExp
, expDiff
;
3837 uint64_t aSig
, bSig
;
3838 uint64_t q
, alternateASig
;
3841 a
= float64_squash_input_denormal(a STATUS_VAR
);
3842 b
= float64_squash_input_denormal(b STATUS_VAR
);
3843 aSig
= extractFloat64Frac( a
);
3844 aExp
= extractFloat64Exp( a
);
3845 aSign
= extractFloat64Sign( a
);
3846 bSig
= extractFloat64Frac( b
);
3847 bExp
= extractFloat64Exp( b
);
3848 if ( aExp
== 0x7FF ) {
3849 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
3850 return propagateFloat64NaN( a
, b STATUS_VAR
);
3852 float_raise( float_flag_invalid STATUS_VAR
);
3853 return float64_default_nan
;
3855 if ( bExp
== 0x7FF ) {
3856 if ( bSig
) return propagateFloat64NaN( a
, b STATUS_VAR
);
3861 float_raise( float_flag_invalid STATUS_VAR
);
3862 return float64_default_nan
;
3864 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
3867 if ( aSig
== 0 ) return a
;
3868 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
3870 expDiff
= aExp
- bExp
;
3871 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
3872 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
3873 if ( expDiff
< 0 ) {
3874 if ( expDiff
< -1 ) return a
;
3877 q
= ( bSig
<= aSig
);
3878 if ( q
) aSig
-= bSig
;
3880 while ( 0 < expDiff
) {
3881 q
= estimateDiv128To64( aSig
, 0, bSig
);
3882 q
= ( 2 < q
) ? q
- 2 : 0;
3883 aSig
= - ( ( bSig
>>2 ) * q
);
3887 if ( 0 < expDiff
) {
3888 q
= estimateDiv128To64( aSig
, 0, bSig
);
3889 q
= ( 2 < q
) ? q
- 2 : 0;
3892 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
3899 alternateASig
= aSig
;
3902 } while ( 0 <= (int64_t) aSig
);
3903 sigMean
= aSig
+ alternateASig
;
3904 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
3905 aSig
= alternateASig
;
3907 zSign
= ( (int64_t) aSig
< 0 );
3908 if ( zSign
) aSig
= - aSig
;
3909 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig STATUS_VAR
);
3913 /*----------------------------------------------------------------------------
3914 | Returns the result of multiplying the double-precision floating-point values
3915 | `a' and `b' then adding 'c', with no intermediate rounding step after the
3916 | multiplication. The operation is performed according to the IEC/IEEE
3917 | Standard for Binary Floating-Point Arithmetic 754-2008.
3918 | The flags argument allows the caller to select negation of the
3919 | addend, the intermediate product, or the final result. (The difference
3920 | between this and having the caller do a separate negation is that negating
3921 | externally will flip the sign bit on NaNs.)
3922 *----------------------------------------------------------------------------*/
3924 float64
float64_muladd(float64 a
, float64 b
, float64 c
, int flags STATUS_PARAM
)
3926 flag aSign
, bSign
, cSign
, zSign
;
3927 int_fast16_t aExp
, bExp
, cExp
, pExp
, zExp
, expDiff
;
3928 uint64_t aSig
, bSig
, cSig
;
3929 flag pInf
, pZero
, pSign
;
3930 uint64_t pSig0
, pSig1
, cSig0
, cSig1
, zSig0
, zSig1
;
3932 flag signflip
, infzero
;
3934 a
= float64_squash_input_denormal(a STATUS_VAR
);
3935 b
= float64_squash_input_denormal(b STATUS_VAR
);
3936 c
= float64_squash_input_denormal(c STATUS_VAR
);
3937 aSig
= extractFloat64Frac(a
);
3938 aExp
= extractFloat64Exp(a
);
3939 aSign
= extractFloat64Sign(a
);
3940 bSig
= extractFloat64Frac(b
);
3941 bExp
= extractFloat64Exp(b
);
3942 bSign
= extractFloat64Sign(b
);
3943 cSig
= extractFloat64Frac(c
);
3944 cExp
= extractFloat64Exp(c
);
3945 cSign
= extractFloat64Sign(c
);
3947 infzero
= ((aExp
== 0 && aSig
== 0 && bExp
== 0x7ff && bSig
== 0) ||
3948 (aExp
== 0x7ff && aSig
== 0 && bExp
== 0 && bSig
== 0));
3950 /* It is implementation-defined whether the cases of (0,inf,qnan)
3951 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
3952 * they return if they do), so we have to hand this information
3953 * off to the target-specific pick-a-NaN routine.
3955 if (((aExp
== 0x7ff) && aSig
) ||
3956 ((bExp
== 0x7ff) && bSig
) ||
3957 ((cExp
== 0x7ff) && cSig
)) {
3958 return propagateFloat64MulAddNaN(a
, b
, c
, infzero STATUS_VAR
);
3962 float_raise(float_flag_invalid STATUS_VAR
);
3963 return float64_default_nan
;
3966 if (flags
& float_muladd_negate_c
) {
3970 signflip
= (flags
& float_muladd_negate_result
) ? 1 : 0;
3972 /* Work out the sign and type of the product */
3973 pSign
= aSign
^ bSign
;
3974 if (flags
& float_muladd_negate_product
) {
3977 pInf
= (aExp
== 0x7ff) || (bExp
== 0x7ff);
3978 pZero
= ((aExp
| aSig
) == 0) || ((bExp
| bSig
) == 0);
3980 if (cExp
== 0x7ff) {
3981 if (pInf
&& (pSign
^ cSign
)) {
3982 /* addition of opposite-signed infinities => InvalidOperation */
3983 float_raise(float_flag_invalid STATUS_VAR
);
3984 return float64_default_nan
;
3986 /* Otherwise generate an infinity of the same sign */
3987 return packFloat64(cSign
^ signflip
, 0x7ff, 0);
3991 return packFloat64(pSign
^ signflip
, 0x7ff, 0);
3997 /* Adding two exact zeroes */
3998 if (pSign
== cSign
) {
4000 } else if (STATUS(float_rounding_mode
) == float_round_down
) {
4005 return packFloat64(zSign
^ signflip
, 0, 0);
4007 /* Exact zero plus a denorm */
4008 if (STATUS(flush_to_zero
)) {
4009 float_raise(float_flag_output_denormal STATUS_VAR
);
4010 return packFloat64(cSign
^ signflip
, 0, 0);
4013 /* Zero plus something non-zero : just return the something */
4014 return packFloat64(cSign
^ signflip
, cExp
, cSig
);
4018 normalizeFloat64Subnormal(aSig
, &aExp
, &aSig
);
4021 normalizeFloat64Subnormal(bSig
, &bExp
, &bSig
);
4024 /* Calculate the actual result a * b + c */
4026 /* Multiply first; this is easy. */
4027 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4028 * because we want the true exponent, not the "one-less-than"
4029 * flavour that roundAndPackFloat64() takes.
4031 pExp
= aExp
+ bExp
- 0x3fe;
4032 aSig
= (aSig
| LIT64(0x0010000000000000))<<10;
4033 bSig
= (bSig
| LIT64(0x0010000000000000))<<11;
4034 mul64To128(aSig
, bSig
, &pSig0
, &pSig1
);
4035 if ((int64_t)(pSig0
<< 1) >= 0) {
4036 shortShift128Left(pSig0
, pSig1
, 1, &pSig0
, &pSig1
);
4040 zSign
= pSign
^ signflip
;
4042 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4043 * bit in position 126.
4047 /* Throw out the special case of c being an exact zero now */
4048 shift128RightJamming(pSig0
, pSig1
, 64, &pSig0
, &pSig1
);
4049 return roundAndPackFloat64(zSign
, pExp
- 1,
4052 normalizeFloat64Subnormal(cSig
, &cExp
, &cSig
);
4055 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4056 * significand of the addend, with the explicit bit in position 126.
4058 cSig0
= cSig
<< (126 - 64 - 52);
4060 cSig0
|= LIT64(0x4000000000000000);
4061 expDiff
= pExp
- cExp
;
4063 if (pSign
== cSign
) {
4066 /* scale c to match p */
4067 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4069 } else if (expDiff
< 0) {
4070 /* scale p to match c */
4071 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4074 /* no scaling needed */
4077 /* Add significands and make sure explicit bit ends up in posn 126 */
4078 add128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4079 if ((int64_t)zSig0
< 0) {
4080 shift128RightJamming(zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4084 shift128RightJamming(zSig0
, zSig1
, 64, &zSig0
, &zSig1
);
4085 return roundAndPackFloat64(zSign
, zExp
, zSig1 STATUS_VAR
);
4089 shift128RightJamming(cSig0
, cSig1
, expDiff
, &cSig0
, &cSig1
);
4090 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4092 } else if (expDiff
< 0) {
4093 shift128RightJamming(pSig0
, pSig1
, -expDiff
, &pSig0
, &pSig1
);
4094 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4099 if (lt128(cSig0
, cSig1
, pSig0
, pSig1
)) {
4100 sub128(pSig0
, pSig1
, cSig0
, cSig1
, &zSig0
, &zSig1
);
4101 } else if (lt128(pSig0
, pSig1
, cSig0
, cSig1
)) {
4102 sub128(cSig0
, cSig1
, pSig0
, pSig1
, &zSig0
, &zSig1
);
4107 if (STATUS(float_rounding_mode
) == float_round_down
) {
4110 return packFloat64(zSign
, 0, 0);
4114 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4115 * starting with the significand in a pair of uint64_t.
4118 shiftcount
= countLeadingZeros64(zSig0
) - 1;
4119 shortShift128Left(zSig0
, zSig1
, shiftcount
, &zSig0
, &zSig1
);
4125 shiftcount
= countLeadingZeros64(zSig1
);
4126 if (shiftcount
== 0) {
4127 zSig0
= (zSig1
>> 1) | (zSig1
& 1);
4131 zSig0
= zSig1
<< shiftcount
;
4132 zExp
-= (shiftcount
+ 64);
4135 return roundAndPackFloat64(zSign
, zExp
, zSig0 STATUS_VAR
);
4139 /*----------------------------------------------------------------------------
4140 | Returns the square root of the double-precision floating-point value `a'.
4141 | The operation is performed according to the IEC/IEEE Standard for Binary
4142 | Floating-Point Arithmetic.
4143 *----------------------------------------------------------------------------*/
4145 float64
float64_sqrt( float64 a STATUS_PARAM
)
4148 int_fast16_t aExp
, zExp
;
4149 uint64_t aSig
, zSig
, doubleZSig
;
4150 uint64_t rem0
, rem1
, term0
, term1
;
4151 a
= float64_squash_input_denormal(a STATUS_VAR
);
4153 aSig
= extractFloat64Frac( a
);
4154 aExp
= extractFloat64Exp( a
);
4155 aSign
= extractFloat64Sign( a
);
4156 if ( aExp
== 0x7FF ) {
4157 if ( aSig
) return propagateFloat64NaN( a
, a STATUS_VAR
);
4158 if ( ! aSign
) return a
;
4159 float_raise( float_flag_invalid STATUS_VAR
);
4160 return float64_default_nan
;
4163 if ( ( aExp
| aSig
) == 0 ) return a
;
4164 float_raise( float_flag_invalid STATUS_VAR
);
4165 return float64_default_nan
;
4168 if ( aSig
== 0 ) return float64_zero
;
4169 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4171 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
4172 aSig
|= LIT64( 0x0010000000000000 );
4173 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
4174 aSig
<<= 9 - ( aExp
& 1 );
4175 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
4176 if ( ( zSig
& 0x1FF ) <= 5 ) {
4177 doubleZSig
= zSig
<<1;
4178 mul64To128( zSig
, zSig
, &term0
, &term1
);
4179 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
4180 while ( (int64_t) rem0
< 0 ) {
4183 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
4185 zSig
|= ( ( rem0
| rem1
) != 0 );
4187 return roundAndPackFloat64( 0, zExp
, zSig STATUS_VAR
);
4191 /*----------------------------------------------------------------------------
4192 | Returns the binary log of the double-precision floating-point value `a'.
4193 | The operation is performed according to the IEC/IEEE Standard for Binary
4194 | Floating-Point Arithmetic.
4195 *----------------------------------------------------------------------------*/
4196 float64
float64_log2( float64 a STATUS_PARAM
)
4200 uint64_t aSig
, aSig0
, aSig1
, zSig
, i
;
4201 a
= float64_squash_input_denormal(a STATUS_VAR
);
4203 aSig
= extractFloat64Frac( a
);
4204 aExp
= extractFloat64Exp( a
);
4205 aSign
= extractFloat64Sign( a
);
4208 if ( aSig
== 0 ) return packFloat64( 1, 0x7FF, 0 );
4209 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
4212 float_raise( float_flag_invalid STATUS_VAR
);
4213 return float64_default_nan
;
4215 if ( aExp
== 0x7FF ) {
4216 if ( aSig
) return propagateFloat64NaN( a
, float64_zero STATUS_VAR
);
4221 aSig
|= LIT64( 0x0010000000000000 );
4223 zSig
= (uint64_t)aExp
<< 52;
4224 for (i
= 1LL << 51; i
> 0; i
>>= 1) {
4225 mul64To128( aSig
, aSig
, &aSig0
, &aSig1
);
4226 aSig
= ( aSig0
<< 12 ) | ( aSig1
>> 52 );
4227 if ( aSig
& LIT64( 0x0020000000000000 ) ) {
4235 return normalizeRoundAndPackFloat64( zSign
, 0x408, zSig STATUS_VAR
);
4238 /*----------------------------------------------------------------------------
4239 | Returns 1 if the double-precision floating-point value `a' is equal to the
4240 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4241 | if either operand is a NaN. Otherwise, the comparison is performed
4242 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4243 *----------------------------------------------------------------------------*/
4245 int float64_eq( float64 a
, float64 b STATUS_PARAM
)
4248 a
= float64_squash_input_denormal(a STATUS_VAR
);
4249 b
= float64_squash_input_denormal(b STATUS_VAR
);
4251 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4252 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4254 float_raise( float_flag_invalid STATUS_VAR
);
4257 av
= float64_val(a
);
4258 bv
= float64_val(b
);
4259 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4263 /*----------------------------------------------------------------------------
4264 | Returns 1 if the double-precision floating-point value `a' is less than or
4265 | equal to the corresponding value `b', and 0 otherwise. The invalid
4266 | exception is raised if either operand is a NaN. The comparison is performed
4267 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4268 *----------------------------------------------------------------------------*/
4270 int float64_le( float64 a
, float64 b STATUS_PARAM
)
4274 a
= float64_squash_input_denormal(a STATUS_VAR
);
4275 b
= float64_squash_input_denormal(b STATUS_VAR
);
4277 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4278 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4280 float_raise( float_flag_invalid STATUS_VAR
);
4283 aSign
= extractFloat64Sign( a
);
4284 bSign
= extractFloat64Sign( b
);
4285 av
= float64_val(a
);
4286 bv
= float64_val(b
);
4287 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4288 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4292 /*----------------------------------------------------------------------------
4293 | Returns 1 if the double-precision floating-point value `a' is less than
4294 | the corresponding value `b', and 0 otherwise. The invalid exception is
4295 | raised if either operand is a NaN. The comparison is performed according
4296 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4297 *----------------------------------------------------------------------------*/
4299 int float64_lt( float64 a
, float64 b STATUS_PARAM
)
4304 a
= float64_squash_input_denormal(a STATUS_VAR
);
4305 b
= float64_squash_input_denormal(b STATUS_VAR
);
4306 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4307 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4309 float_raise( float_flag_invalid STATUS_VAR
);
4312 aSign
= extractFloat64Sign( a
);
4313 bSign
= extractFloat64Sign( b
);
4314 av
= float64_val(a
);
4315 bv
= float64_val(b
);
4316 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4317 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4321 /*----------------------------------------------------------------------------
4322 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4323 | be compared, and 0 otherwise. The invalid exception is raised if either
4324 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4325 | Standard for Binary Floating-Point Arithmetic.
4326 *----------------------------------------------------------------------------*/
4328 int float64_unordered( float64 a
, float64 b STATUS_PARAM
)
4330 a
= float64_squash_input_denormal(a STATUS_VAR
);
4331 b
= float64_squash_input_denormal(b STATUS_VAR
);
4333 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4334 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4336 float_raise( float_flag_invalid STATUS_VAR
);
4342 /*----------------------------------------------------------------------------
4343 | Returns 1 if the double-precision floating-point value `a' is equal to the
4344 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4345 | exception.The comparison is performed according to the IEC/IEEE Standard
4346 | for Binary Floating-Point Arithmetic.
4347 *----------------------------------------------------------------------------*/
4349 int float64_eq_quiet( float64 a
, float64 b STATUS_PARAM
)
4352 a
= float64_squash_input_denormal(a STATUS_VAR
);
4353 b
= float64_squash_input_denormal(b STATUS_VAR
);
4355 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4356 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4358 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4359 float_raise( float_flag_invalid STATUS_VAR
);
4363 av
= float64_val(a
);
4364 bv
= float64_val(b
);
4365 return ( av
== bv
) || ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4369 /*----------------------------------------------------------------------------
4370 | Returns 1 if the double-precision floating-point value `a' is less than or
4371 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4372 | cause an exception. Otherwise, the comparison is performed according to the
4373 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4374 *----------------------------------------------------------------------------*/
4376 int float64_le_quiet( float64 a
, float64 b STATUS_PARAM
)
4380 a
= float64_squash_input_denormal(a STATUS_VAR
);
4381 b
= float64_squash_input_denormal(b STATUS_VAR
);
4383 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4384 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4386 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4387 float_raise( float_flag_invalid STATUS_VAR
);
4391 aSign
= extractFloat64Sign( a
);
4392 bSign
= extractFloat64Sign( b
);
4393 av
= float64_val(a
);
4394 bv
= float64_val(b
);
4395 if ( aSign
!= bSign
) return aSign
|| ( (uint64_t) ( ( av
| bv
)<<1 ) == 0 );
4396 return ( av
== bv
) || ( aSign
^ ( av
< bv
) );
4400 /*----------------------------------------------------------------------------
4401 | Returns 1 if the double-precision floating-point value `a' is less than
4402 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4403 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4404 | Standard for Binary Floating-Point Arithmetic.
4405 *----------------------------------------------------------------------------*/
4407 int float64_lt_quiet( float64 a
, float64 b STATUS_PARAM
)
4411 a
= float64_squash_input_denormal(a STATUS_VAR
);
4412 b
= float64_squash_input_denormal(b STATUS_VAR
);
4414 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4415 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4417 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4418 float_raise( float_flag_invalid STATUS_VAR
);
4422 aSign
= extractFloat64Sign( a
);
4423 bSign
= extractFloat64Sign( b
);
4424 av
= float64_val(a
);
4425 bv
= float64_val(b
);
4426 if ( aSign
!= bSign
) return aSign
&& ( (uint64_t) ( ( av
| bv
)<<1 ) != 0 );
4427 return ( av
!= bv
) && ( aSign
^ ( av
< bv
) );
4431 /*----------------------------------------------------------------------------
4432 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4433 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4434 | comparison is performed according to the IEC/IEEE Standard for Binary
4435 | Floating-Point Arithmetic.
4436 *----------------------------------------------------------------------------*/
4438 int float64_unordered_quiet( float64 a
, float64 b STATUS_PARAM
)
4440 a
= float64_squash_input_denormal(a STATUS_VAR
);
4441 b
= float64_squash_input_denormal(b STATUS_VAR
);
4443 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
4444 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
4446 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
4447 float_raise( float_flag_invalid STATUS_VAR
);
4454 /*----------------------------------------------------------------------------
4455 | Returns the result of converting the extended double-precision floating-
4456 | point value `a' to the 32-bit two's complement integer format. The
4457 | conversion is performed according to the IEC/IEEE Standard for Binary
4458 | Floating-Point Arithmetic---which means in particular that the conversion
4459 | is rounded according to the current rounding mode. If `a' is a NaN, the
4460 | largest positive integer is returned. Otherwise, if the conversion
4461 | overflows, the largest integer with the same sign as `a' is returned.
4462 *----------------------------------------------------------------------------*/
4464 int32
floatx80_to_int32( floatx80 a STATUS_PARAM
)
4467 int32 aExp
, shiftCount
;
4470 aSig
= extractFloatx80Frac( a
);
4471 aExp
= extractFloatx80Exp( a
);
4472 aSign
= extractFloatx80Sign( a
);
4473 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4474 shiftCount
= 0x4037 - aExp
;
4475 if ( shiftCount
<= 0 ) shiftCount
= 1;
4476 shift64RightJamming( aSig
, shiftCount
, &aSig
);
4477 return roundAndPackInt32( aSign
, aSig STATUS_VAR
);
4481 /*----------------------------------------------------------------------------
4482 | Returns the result of converting the extended double-precision floating-
4483 | point value `a' to the 32-bit two's complement integer format. The
4484 | conversion is performed according to the IEC/IEEE Standard for Binary
4485 | Floating-Point Arithmetic, except that the conversion is always rounded
4486 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4487 | Otherwise, if the conversion overflows, the largest integer with the same
4488 | sign as `a' is returned.
4489 *----------------------------------------------------------------------------*/
4491 int32
floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM
)
4494 int32 aExp
, shiftCount
;
4495 uint64_t aSig
, savedASig
;
4498 aSig
= extractFloatx80Frac( a
);
4499 aExp
= extractFloatx80Exp( a
);
4500 aSign
= extractFloatx80Sign( a
);
4501 if ( 0x401E < aExp
) {
4502 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) aSign
= 0;
4505 else if ( aExp
< 0x3FFF ) {
4506 if ( aExp
|| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4509 shiftCount
= 0x403E - aExp
;
4511 aSig
>>= shiftCount
;
4513 if ( aSign
) z
= - z
;
4514 if ( ( z
< 0 ) ^ aSign
) {
4516 float_raise( float_flag_invalid STATUS_VAR
);
4517 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
4519 if ( ( aSig
<<shiftCount
) != savedASig
) {
4520 STATUS(float_exception_flags
) |= float_flag_inexact
;
4526 /*----------------------------------------------------------------------------
4527 | Returns the result of converting the extended double-precision floating-
4528 | point value `a' to the 64-bit two's complement integer format. The
4529 | conversion is performed according to the IEC/IEEE Standard for Binary
4530 | Floating-Point Arithmetic---which means in particular that the conversion
4531 | is rounded according to the current rounding mode. If `a' is a NaN,
4532 | the largest positive integer is returned. Otherwise, if the conversion
4533 | overflows, the largest integer with the same sign as `a' is returned.
4534 *----------------------------------------------------------------------------*/
4536 int64
floatx80_to_int64( floatx80 a STATUS_PARAM
)
4539 int32 aExp
, shiftCount
;
4540 uint64_t aSig
, aSigExtra
;
4542 aSig
= extractFloatx80Frac( a
);
4543 aExp
= extractFloatx80Exp( a
);
4544 aSign
= extractFloatx80Sign( a
);
4545 shiftCount
= 0x403E - aExp
;
4546 if ( shiftCount
<= 0 ) {
4548 float_raise( float_flag_invalid STATUS_VAR
);
4550 || ( ( aExp
== 0x7FFF )
4551 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
4553 return LIT64( 0x7FFFFFFFFFFFFFFF );
4555 return (int64_t) LIT64( 0x8000000000000000 );
4560 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
4562 return roundAndPackInt64( aSign
, aSig
, aSigExtra STATUS_VAR
);
4566 /*----------------------------------------------------------------------------
4567 | Returns the result of converting the extended double-precision floating-
4568 | point value `a' to the 64-bit two's complement integer format. The
4569 | conversion is performed according to the IEC/IEEE Standard for Binary
4570 | Floating-Point Arithmetic, except that the conversion is always rounded
4571 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4572 | Otherwise, if the conversion overflows, the largest integer with the same
4573 | sign as `a' is returned.
4574 *----------------------------------------------------------------------------*/
4576 int64
floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM
)
4579 int32 aExp
, shiftCount
;
4583 aSig
= extractFloatx80Frac( a
);
4584 aExp
= extractFloatx80Exp( a
);
4585 aSign
= extractFloatx80Sign( a
);
4586 shiftCount
= aExp
- 0x403E;
4587 if ( 0 <= shiftCount
) {
4588 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
4589 if ( ( a
.high
!= 0xC03E ) || aSig
) {
4590 float_raise( float_flag_invalid STATUS_VAR
);
4591 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
4592 return LIT64( 0x7FFFFFFFFFFFFFFF );
4595 return (int64_t) LIT64( 0x8000000000000000 );
4597 else if ( aExp
< 0x3FFF ) {
4598 if ( aExp
| aSig
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4601 z
= aSig
>>( - shiftCount
);
4602 if ( (uint64_t) ( aSig
<<( shiftCount
& 63 ) ) ) {
4603 STATUS(float_exception_flags
) |= float_flag_inexact
;
4605 if ( aSign
) z
= - z
;
4610 /*----------------------------------------------------------------------------
4611 | Returns the result of converting the extended double-precision floating-
4612 | point value `a' to the single-precision floating-point format. The
4613 | conversion is performed according to the IEC/IEEE Standard for Binary
4614 | Floating-Point Arithmetic.
4615 *----------------------------------------------------------------------------*/
4617 float32
floatx80_to_float32( floatx80 a STATUS_PARAM
)
4623 aSig
= extractFloatx80Frac( a
);
4624 aExp
= extractFloatx80Exp( a
);
4625 aSign
= extractFloatx80Sign( a
);
4626 if ( aExp
== 0x7FFF ) {
4627 if ( (uint64_t) ( aSig
<<1 ) ) {
4628 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4630 return packFloat32( aSign
, 0xFF, 0 );
4632 shift64RightJamming( aSig
, 33, &aSig
);
4633 if ( aExp
|| aSig
) aExp
-= 0x3F81;
4634 return roundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
4638 /*----------------------------------------------------------------------------
4639 | Returns the result of converting the extended double-precision floating-
4640 | point value `a' to the double-precision floating-point format. The
4641 | conversion is performed according to the IEC/IEEE Standard for Binary
4642 | Floating-Point Arithmetic.
4643 *----------------------------------------------------------------------------*/
4645 float64
floatx80_to_float64( floatx80 a STATUS_PARAM
)
4649 uint64_t aSig
, zSig
;
4651 aSig
= extractFloatx80Frac( a
);
4652 aExp
= extractFloatx80Exp( a
);
4653 aSign
= extractFloatx80Sign( a
);
4654 if ( aExp
== 0x7FFF ) {
4655 if ( (uint64_t) ( aSig
<<1 ) ) {
4656 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4658 return packFloat64( aSign
, 0x7FF, 0 );
4660 shift64RightJamming( aSig
, 1, &zSig
);
4661 if ( aExp
|| aSig
) aExp
-= 0x3C01;
4662 return roundAndPackFloat64( aSign
, aExp
, zSig STATUS_VAR
);
4666 /*----------------------------------------------------------------------------
4667 | Returns the result of converting the extended double-precision floating-
4668 | point value `a' to the quadruple-precision floating-point format. The
4669 | conversion is performed according to the IEC/IEEE Standard for Binary
4670 | Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4673 float128
floatx80_to_float128( floatx80 a STATUS_PARAM
)
4677 uint64_t aSig
, zSig0
, zSig1
;
4679 aSig
= extractFloatx80Frac( a
);
4680 aExp
= extractFloatx80Exp( a
);
4681 aSign
= extractFloatx80Sign( a
);
4682 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( aSig
<<1 ) ) {
4683 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
4685 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
4686 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
4690 /*----------------------------------------------------------------------------
4691 | Rounds the extended double-precision floating-point value `a' to an integer,
4692 | and returns the result as an extended quadruple-precision floating-point
4693 | value. The operation is performed according to the IEC/IEEE Standard for
4694 | Binary Floating-Point Arithmetic.
4695 *----------------------------------------------------------------------------*/
4697 floatx80
floatx80_round_to_int( floatx80 a STATUS_PARAM
)
4701 uint64_t lastBitMask
, roundBitsMask
;
4705 aExp
= extractFloatx80Exp( a
);
4706 if ( 0x403E <= aExp
) {
4707 if ( ( aExp
== 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) ) {
4708 return propagateFloatx80NaN( a
, a STATUS_VAR
);
4712 if ( aExp
< 0x3FFF ) {
4714 && ( (uint64_t) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
4717 STATUS(float_exception_flags
) |= float_flag_inexact
;
4718 aSign
= extractFloatx80Sign( a
);
4719 switch ( STATUS(float_rounding_mode
) ) {
4720 case float_round_nearest_even
:
4721 if ( ( aExp
== 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a
)<<1 )
4724 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
4727 case float_round_down
:
4730 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4731 : packFloatx80( 0, 0, 0 );
4732 case float_round_up
:
4734 aSign
? packFloatx80( 1, 0, 0 )
4735 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4737 return packFloatx80( aSign
, 0, 0 );
4740 lastBitMask
<<= 0x403E - aExp
;
4741 roundBitsMask
= lastBitMask
- 1;
4743 roundingMode
= STATUS(float_rounding_mode
);
4744 if ( roundingMode
== float_round_nearest_even
) {
4745 z
.low
+= lastBitMask
>>1;
4746 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4748 else if ( roundingMode
!= float_round_to_zero
) {
4749 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
4750 z
.low
+= roundBitsMask
;
4753 z
.low
&= ~ roundBitsMask
;
4756 z
.low
= LIT64( 0x8000000000000000 );
4758 if ( z
.low
!= a
.low
) STATUS(float_exception_flags
) |= float_flag_inexact
;
4763 /*----------------------------------------------------------------------------
4764 | Returns the result of adding the absolute values of the extended double-
4765 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4766 | negated before being returned. `zSign' is ignored if the result is a NaN.
4767 | The addition is performed according to the IEC/IEEE Standard for Binary
4768 | Floating-Point Arithmetic.
4769 *----------------------------------------------------------------------------*/
4771 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4773 int32 aExp
, bExp
, zExp
;
4774 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4777 aSig
= extractFloatx80Frac( a
);
4778 aExp
= extractFloatx80Exp( a
);
4779 bSig
= extractFloatx80Frac( b
);
4780 bExp
= extractFloatx80Exp( b
);
4781 expDiff
= aExp
- bExp
;
4782 if ( 0 < expDiff
) {
4783 if ( aExp
== 0x7FFF ) {
4784 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4787 if ( bExp
== 0 ) --expDiff
;
4788 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4791 else if ( expDiff
< 0 ) {
4792 if ( bExp
== 0x7FFF ) {
4793 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4794 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4796 if ( aExp
== 0 ) ++expDiff
;
4797 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4801 if ( aExp
== 0x7FFF ) {
4802 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4803 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4808 zSig0
= aSig
+ bSig
;
4810 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
4816 zSig0
= aSig
+ bSig
;
4817 if ( (int64_t) zSig0
< 0 ) goto roundAndPack
;
4819 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4820 zSig0
|= LIT64( 0x8000000000000000 );
4824 roundAndPackFloatx80(
4825 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4829 /*----------------------------------------------------------------------------
4830 | Returns the result of subtracting the absolute values of the extended
4831 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
4832 | difference is negated before being returned. `zSign' is ignored if the
4833 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4834 | Standard for Binary Floating-Point Arithmetic.
4835 *----------------------------------------------------------------------------*/
4837 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign STATUS_PARAM
)
4839 int32 aExp
, bExp
, zExp
;
4840 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4844 aSig
= extractFloatx80Frac( a
);
4845 aExp
= extractFloatx80Exp( a
);
4846 bSig
= extractFloatx80Frac( b
);
4847 bExp
= extractFloatx80Exp( b
);
4848 expDiff
= aExp
- bExp
;
4849 if ( 0 < expDiff
) goto aExpBigger
;
4850 if ( expDiff
< 0 ) goto bExpBigger
;
4851 if ( aExp
== 0x7FFF ) {
4852 if ( (uint64_t) ( ( aSig
| bSig
)<<1 ) ) {
4853 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4855 float_raise( float_flag_invalid STATUS_VAR
);
4856 z
.low
= floatx80_default_nan_low
;
4857 z
.high
= floatx80_default_nan_high
;
4865 if ( bSig
< aSig
) goto aBigger
;
4866 if ( aSig
< bSig
) goto bBigger
;
4867 return packFloatx80( STATUS(float_rounding_mode
) == float_round_down
, 0, 0 );
4869 if ( bExp
== 0x7FFF ) {
4870 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4871 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
4873 if ( aExp
== 0 ) ++expDiff
;
4874 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
4876 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
4879 goto normalizeRoundAndPack
;
4881 if ( aExp
== 0x7FFF ) {
4882 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4885 if ( bExp
== 0 ) --expDiff
;
4886 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
4888 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
4890 normalizeRoundAndPack
:
4892 normalizeRoundAndPackFloatx80(
4893 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4897 /*----------------------------------------------------------------------------
4898 | Returns the result of adding the extended double-precision floating-point
4899 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4900 | Standard for Binary Floating-Point Arithmetic.
4901 *----------------------------------------------------------------------------*/
4903 floatx80
floatx80_add( floatx80 a
, floatx80 b STATUS_PARAM
)
4907 aSign
= extractFloatx80Sign( a
);
4908 bSign
= extractFloatx80Sign( b
);
4909 if ( aSign
== bSign
) {
4910 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4913 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4918 /*----------------------------------------------------------------------------
4919 | Returns the result of subtracting the extended double-precision floating-
4920 | point values `a' and `b'. The operation is performed according to the
4921 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4922 *----------------------------------------------------------------------------*/
4924 floatx80
floatx80_sub( floatx80 a
, floatx80 b STATUS_PARAM
)
4928 aSign
= extractFloatx80Sign( a
);
4929 bSign
= extractFloatx80Sign( b
);
4930 if ( aSign
== bSign
) {
4931 return subFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4934 return addFloatx80Sigs( a
, b
, aSign STATUS_VAR
);
4939 /*----------------------------------------------------------------------------
4940 | Returns the result of multiplying the extended double-precision floating-
4941 | point values `a' and `b'. The operation is performed according to the
4942 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4943 *----------------------------------------------------------------------------*/
4945 floatx80
floatx80_mul( floatx80 a
, floatx80 b STATUS_PARAM
)
4947 flag aSign
, bSign
, zSign
;
4948 int32 aExp
, bExp
, zExp
;
4949 uint64_t aSig
, bSig
, zSig0
, zSig1
;
4952 aSig
= extractFloatx80Frac( a
);
4953 aExp
= extractFloatx80Exp( a
);
4954 aSign
= extractFloatx80Sign( a
);
4955 bSig
= extractFloatx80Frac( b
);
4956 bExp
= extractFloatx80Exp( b
);
4957 bSign
= extractFloatx80Sign( b
);
4958 zSign
= aSign
^ bSign
;
4959 if ( aExp
== 0x7FFF ) {
4960 if ( (uint64_t) ( aSig
<<1 )
4961 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
4962 return propagateFloatx80NaN( a
, b STATUS_VAR
);
4964 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
4965 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4967 if ( bExp
== 0x7FFF ) {
4968 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
4969 if ( ( aExp
| aSig
) == 0 ) {
4971 float_raise( float_flag_invalid STATUS_VAR
);
4972 z
.low
= floatx80_default_nan_low
;
4973 z
.high
= floatx80_default_nan_high
;
4976 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4979 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4980 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
4983 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
4984 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
4986 zExp
= aExp
+ bExp
- 0x3FFE;
4987 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
4988 if ( 0 < (int64_t) zSig0
) {
4989 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
4993 roundAndPackFloatx80(
4994 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
4998 /*----------------------------------------------------------------------------
4999 | Returns the result of dividing the extended double-precision floating-point
5000 | value `a' by the corresponding value `b'. The operation is performed
5001 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5002 *----------------------------------------------------------------------------*/
5004 floatx80
floatx80_div( floatx80 a
, floatx80 b STATUS_PARAM
)
5006 flag aSign
, bSign
, zSign
;
5007 int32 aExp
, bExp
, zExp
;
5008 uint64_t aSig
, bSig
, zSig0
, zSig1
;
5009 uint64_t rem0
, rem1
, rem2
, term0
, term1
, term2
;
5012 aSig
= extractFloatx80Frac( a
);
5013 aExp
= extractFloatx80Exp( a
);
5014 aSign
= extractFloatx80Sign( a
);
5015 bSig
= extractFloatx80Frac( b
);
5016 bExp
= extractFloatx80Exp( b
);
5017 bSign
= extractFloatx80Sign( b
);
5018 zSign
= aSign
^ bSign
;
5019 if ( aExp
== 0x7FFF ) {
5020 if ( (uint64_t) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5021 if ( bExp
== 0x7FFF ) {
5022 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5025 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5027 if ( bExp
== 0x7FFF ) {
5028 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5029 return packFloatx80( zSign
, 0, 0 );
5033 if ( ( aExp
| aSig
) == 0 ) {
5035 float_raise( float_flag_invalid STATUS_VAR
);
5036 z
.low
= floatx80_default_nan_low
;
5037 z
.high
= floatx80_default_nan_high
;
5040 float_raise( float_flag_divbyzero STATUS_VAR
);
5041 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5043 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5046 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
5047 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
5049 zExp
= aExp
- bExp
+ 0x3FFE;
5051 if ( bSig
<= aSig
) {
5052 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
5055 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
5056 mul64To128( bSig
, zSig0
, &term0
, &term1
);
5057 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
5058 while ( (int64_t) rem0
< 0 ) {
5060 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
5062 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
5063 if ( (uint64_t) ( zSig1
<<1 ) <= 8 ) {
5064 mul64To128( bSig
, zSig1
, &term1
, &term2
);
5065 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5066 while ( (int64_t) rem1
< 0 ) {
5068 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
5070 zSig1
|= ( ( rem1
| rem2
) != 0 );
5073 roundAndPackFloatx80(
5074 STATUS(floatx80_rounding_precision
), zSign
, zExp
, zSig0
, zSig1 STATUS_VAR
);
5078 /*----------------------------------------------------------------------------
5079 | Returns the remainder of the extended double-precision floating-point value
5080 | `a' with respect to the corresponding value `b'. The operation is performed
5081 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5082 *----------------------------------------------------------------------------*/
5084 floatx80
floatx80_rem( floatx80 a
, floatx80 b STATUS_PARAM
)
5087 int32 aExp
, bExp
, expDiff
;
5088 uint64_t aSig0
, aSig1
, bSig
;
5089 uint64_t q
, term0
, term1
, alternateASig0
, alternateASig1
;
5092 aSig0
= extractFloatx80Frac( a
);
5093 aExp
= extractFloatx80Exp( a
);
5094 aSign
= extractFloatx80Sign( a
);
5095 bSig
= extractFloatx80Frac( b
);
5096 bExp
= extractFloatx80Exp( b
);
5097 if ( aExp
== 0x7FFF ) {
5098 if ( (uint64_t) ( aSig0
<<1 )
5099 || ( ( bExp
== 0x7FFF ) && (uint64_t) ( bSig
<<1 ) ) ) {
5100 return propagateFloatx80NaN( a
, b STATUS_VAR
);
5104 if ( bExp
== 0x7FFF ) {
5105 if ( (uint64_t) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b STATUS_VAR
);
5111 float_raise( float_flag_invalid STATUS_VAR
);
5112 z
.low
= floatx80_default_nan_low
;
5113 z
.high
= floatx80_default_nan_high
;
5116 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
5119 if ( (uint64_t) ( aSig0
<<1 ) == 0 ) return a
;
5120 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5122 bSig
|= LIT64( 0x8000000000000000 );
5124 expDiff
= aExp
- bExp
;
5126 if ( expDiff
< 0 ) {
5127 if ( expDiff
< -1 ) return a
;
5128 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
5131 q
= ( bSig
<= aSig0
);
5132 if ( q
) aSig0
-= bSig
;
5134 while ( 0 < expDiff
) {
5135 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5136 q
= ( 2 < q
) ? q
- 2 : 0;
5137 mul64To128( bSig
, q
, &term0
, &term1
);
5138 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5139 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
5143 if ( 0 < expDiff
) {
5144 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
5145 q
= ( 2 < q
) ? q
- 2 : 0;
5147 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
5148 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5149 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
5150 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
5152 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
5159 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
5160 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5161 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
5164 aSig0
= alternateASig0
;
5165 aSig1
= alternateASig1
;
5169 normalizeRoundAndPackFloatx80(
5170 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1 STATUS_VAR
);
5174 /*----------------------------------------------------------------------------
5175 | Returns the square root of the extended double-precision floating-point
5176 | value `a'. The operation is performed according to the IEC/IEEE Standard
5177 | for Binary Floating-Point Arithmetic.
5178 *----------------------------------------------------------------------------*/
5180 floatx80
floatx80_sqrt( floatx80 a STATUS_PARAM
)
5184 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
5185 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
5188 aSig0
= extractFloatx80Frac( a
);
5189 aExp
= extractFloatx80Exp( a
);
5190 aSign
= extractFloatx80Sign( a
);
5191 if ( aExp
== 0x7FFF ) {
5192 if ( (uint64_t) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a STATUS_VAR
);
5193 if ( ! aSign
) return a
;
5197 if ( ( aExp
| aSig0
) == 0 ) return a
;
5199 float_raise( float_flag_invalid STATUS_VAR
);
5200 z
.low
= floatx80_default_nan_low
;
5201 z
.high
= floatx80_default_nan_high
;
5205 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
5206 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
5208 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
5209 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
5210 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
5211 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
5212 doubleZSig0
= zSig0
<<1;
5213 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
5214 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
5215 while ( (int64_t) rem0
< 0 ) {
5218 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
5220 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
5221 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5222 if ( zSig1
== 0 ) zSig1
= 1;
5223 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
5224 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
5225 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
5226 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5227 while ( (int64_t) rem1
< 0 ) {
5229 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
5231 term2
|= doubleZSig0
;
5232 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
5234 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
5236 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
5237 zSig0
|= doubleZSig0
;
5239 roundAndPackFloatx80(
5240 STATUS(floatx80_rounding_precision
), 0, zExp
, zSig0
, zSig1 STATUS_VAR
);
5244 /*----------------------------------------------------------------------------
5245 | Returns 1 if the extended double-precision floating-point value `a' is equal
5246 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5247 | raised if either operand is a NaN. Otherwise, the comparison is performed
5248 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5249 *----------------------------------------------------------------------------*/
5251 int floatx80_eq( floatx80 a
, floatx80 b STATUS_PARAM
)
5254 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5255 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5256 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5257 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5259 float_raise( float_flag_invalid STATUS_VAR
);
5264 && ( ( a
.high
== b
.high
)
5266 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5271 /*----------------------------------------------------------------------------
5272 | Returns 1 if the extended double-precision floating-point value `a' is
5273 | less than or equal to the corresponding value `b', and 0 otherwise. The
5274 | invalid exception is raised if either operand is a NaN. The comparison is
5275 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5277 *----------------------------------------------------------------------------*/
5279 int floatx80_le( floatx80 a
, floatx80 b STATUS_PARAM
)
5283 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5284 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5285 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5286 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5288 float_raise( float_flag_invalid STATUS_VAR
);
5291 aSign
= extractFloatx80Sign( a
);
5292 bSign
= extractFloatx80Sign( b
);
5293 if ( aSign
!= bSign
) {
5296 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5300 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5301 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5305 /*----------------------------------------------------------------------------
5306 | Returns 1 if the extended double-precision floating-point value `a' is
5307 | less than the corresponding value `b', and 0 otherwise. The invalid
5308 | exception is raised if either operand is a NaN. The comparison is performed
5309 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5310 *----------------------------------------------------------------------------*/
5312 int floatx80_lt( floatx80 a
, floatx80 b STATUS_PARAM
)
5316 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5317 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5318 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5319 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5321 float_raise( float_flag_invalid STATUS_VAR
);
5324 aSign
= extractFloatx80Sign( a
);
5325 bSign
= extractFloatx80Sign( b
);
5326 if ( aSign
!= bSign
) {
5329 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5333 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5334 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5338 /*----------------------------------------------------------------------------
5339 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5340 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5341 | either operand is a NaN. The comparison is performed according to the
5342 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5343 *----------------------------------------------------------------------------*/
5344 int floatx80_unordered( floatx80 a
, floatx80 b STATUS_PARAM
)
5346 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5347 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5348 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5349 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5351 float_raise( float_flag_invalid STATUS_VAR
);
5357 /*----------------------------------------------------------------------------
5358 | Returns 1 if the extended double-precision floating-point value `a' is
5359 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5360 | cause an exception. The comparison is performed according to the IEC/IEEE
5361 | Standard for Binary Floating-Point Arithmetic.
5362 *----------------------------------------------------------------------------*/
5364 int floatx80_eq_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5367 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5368 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5369 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5370 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5372 if ( floatx80_is_signaling_nan( a
)
5373 || floatx80_is_signaling_nan( b
) ) {
5374 float_raise( float_flag_invalid STATUS_VAR
);
5380 && ( ( a
.high
== b
.high
)
5382 && ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5387 /*----------------------------------------------------------------------------
5388 | Returns 1 if the extended double-precision floating-point value `a' is less
5389 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5390 | do not cause an exception. Otherwise, the comparison is performed according
5391 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5392 *----------------------------------------------------------------------------*/
5394 int floatx80_le_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5398 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5399 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5400 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5401 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5403 if ( floatx80_is_signaling_nan( a
)
5404 || floatx80_is_signaling_nan( b
) ) {
5405 float_raise( float_flag_invalid STATUS_VAR
);
5409 aSign
= extractFloatx80Sign( a
);
5410 bSign
= extractFloatx80Sign( b
);
5411 if ( aSign
!= bSign
) {
5414 || ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5418 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5419 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5423 /*----------------------------------------------------------------------------
5424 | Returns 1 if the extended double-precision floating-point value `a' is less
5425 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5426 | an exception. Otherwise, the comparison is performed according to the
5427 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5428 *----------------------------------------------------------------------------*/
5430 int floatx80_lt_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5434 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5435 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5436 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5437 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5439 if ( floatx80_is_signaling_nan( a
)
5440 || floatx80_is_signaling_nan( b
) ) {
5441 float_raise( float_flag_invalid STATUS_VAR
);
5445 aSign
= extractFloatx80Sign( a
);
5446 bSign
= extractFloatx80Sign( b
);
5447 if ( aSign
!= bSign
) {
5450 && ( ( ( (uint16_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5454 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5455 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5459 /*----------------------------------------------------------------------------
5460 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5461 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5462 | The comparison is performed according to the IEC/IEEE Standard for Binary
5463 | Floating-Point Arithmetic.
5464 *----------------------------------------------------------------------------*/
5465 int floatx80_unordered_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
5467 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
5468 && (uint64_t) ( extractFloatx80Frac( a
)<<1 ) )
5469 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
5470 && (uint64_t) ( extractFloatx80Frac( b
)<<1 ) )
5472 if ( floatx80_is_signaling_nan( a
)
5473 || floatx80_is_signaling_nan( b
) ) {
5474 float_raise( float_flag_invalid STATUS_VAR
);
5481 /*----------------------------------------------------------------------------
5482 | Returns the result of converting the quadruple-precision floating-point
5483 | value `a' to the 32-bit two's complement integer format. The conversion
5484 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5485 | Arithmetic---which means in particular that the conversion is rounded
5486 | according to the current rounding mode. If `a' is a NaN, the largest
5487 | positive integer is returned. Otherwise, if the conversion overflows, the
5488 | largest integer with the same sign as `a' is returned.
5489 *----------------------------------------------------------------------------*/
5491 int32
float128_to_int32( float128 a STATUS_PARAM
)
5494 int32 aExp
, shiftCount
;
5495 uint64_t aSig0
, aSig1
;
5497 aSig1
= extractFloat128Frac1( a
);
5498 aSig0
= extractFloat128Frac0( a
);
5499 aExp
= extractFloat128Exp( a
);
5500 aSign
= extractFloat128Sign( a
);
5501 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
5502 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5503 aSig0
|= ( aSig1
!= 0 );
5504 shiftCount
= 0x4028 - aExp
;
5505 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
5506 return roundAndPackInt32( aSign
, aSig0 STATUS_VAR
);
5510 /*----------------------------------------------------------------------------
5511 | Returns the result of converting the quadruple-precision floating-point
5512 | value `a' to the 32-bit two's complement integer format. The conversion
5513 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5514 | Arithmetic, except that the conversion is always rounded toward zero. If
5515 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5516 | conversion overflows, the largest integer with the same sign as `a' is
5518 *----------------------------------------------------------------------------*/
5520 int32
float128_to_int32_round_to_zero( float128 a STATUS_PARAM
)
5523 int32 aExp
, shiftCount
;
5524 uint64_t aSig0
, aSig1
, savedASig
;
5527 aSig1
= extractFloat128Frac1( a
);
5528 aSig0
= extractFloat128Frac0( a
);
5529 aExp
= extractFloat128Exp( a
);
5530 aSign
= extractFloat128Sign( a
);
5531 aSig0
|= ( aSig1
!= 0 );
5532 if ( 0x401E < aExp
) {
5533 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
5536 else if ( aExp
< 0x3FFF ) {
5537 if ( aExp
|| aSig0
) STATUS(float_exception_flags
) |= float_flag_inexact
;
5540 aSig0
|= LIT64( 0x0001000000000000 );
5541 shiftCount
= 0x402F - aExp
;
5543 aSig0
>>= shiftCount
;
5545 if ( aSign
) z
= - z
;
5546 if ( ( z
< 0 ) ^ aSign
) {
5548 float_raise( float_flag_invalid STATUS_VAR
);
5549 return aSign
? (int32_t) 0x80000000 : 0x7FFFFFFF;
5551 if ( ( aSig0
<<shiftCount
) != savedASig
) {
5552 STATUS(float_exception_flags
) |= float_flag_inexact
;
5558 /*----------------------------------------------------------------------------
5559 | Returns the result of converting the quadruple-precision floating-point
5560 | value `a' to the 64-bit two's complement integer format. The conversion
5561 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5562 | Arithmetic---which means in particular that the conversion is rounded
5563 | according to the current rounding mode. If `a' is a NaN, the largest
5564 | positive integer is returned. Otherwise, if the conversion overflows, the
5565 | largest integer with the same sign as `a' is returned.
5566 *----------------------------------------------------------------------------*/
5568 int64
float128_to_int64( float128 a STATUS_PARAM
)
5571 int32 aExp
, shiftCount
;
5572 uint64_t aSig0
, aSig1
;
5574 aSig1
= extractFloat128Frac1( a
);
5575 aSig0
= extractFloat128Frac0( a
);
5576 aExp
= extractFloat128Exp( a
);
5577 aSign
= extractFloat128Sign( a
);
5578 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5579 shiftCount
= 0x402F - aExp
;
5580 if ( shiftCount
<= 0 ) {
5581 if ( 0x403E < aExp
) {
5582 float_raise( float_flag_invalid STATUS_VAR
);
5584 || ( ( aExp
== 0x7FFF )
5585 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
5588 return LIT64( 0x7FFFFFFFFFFFFFFF );
5590 return (int64_t) LIT64( 0x8000000000000000 );
5592 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
5595 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
5597 return roundAndPackInt64( aSign
, aSig0
, aSig1 STATUS_VAR
);
5601 /*----------------------------------------------------------------------------
5602 | Returns the result of converting the quadruple-precision floating-point
5603 | value `a' to the 64-bit two's complement integer format. The conversion
5604 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5605 | Arithmetic, except that the conversion is always rounded toward zero.
5606 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5607 | the conversion overflows, the largest integer with the same sign as `a' is
5609 *----------------------------------------------------------------------------*/
5611 int64
float128_to_int64_round_to_zero( float128 a STATUS_PARAM
)
5614 int32 aExp
, shiftCount
;
5615 uint64_t aSig0
, aSig1
;
5618 aSig1
= extractFloat128Frac1( a
);
5619 aSig0
= extractFloat128Frac0( a
);
5620 aExp
= extractFloat128Exp( a
);
5621 aSign
= extractFloat128Sign( a
);
5622 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
5623 shiftCount
= aExp
- 0x402F;
5624 if ( 0 < shiftCount
) {
5625 if ( 0x403E <= aExp
) {
5626 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
5627 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
5628 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
5629 if ( aSig1
) STATUS(float_exception_flags
) |= float_flag_inexact
;
5632 float_raise( float_flag_invalid STATUS_VAR
);
5633 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
5634 return LIT64( 0x7FFFFFFFFFFFFFFF );
5637 return (int64_t) LIT64( 0x8000000000000000 );
5639 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
5640 if ( (uint64_t) ( aSig1
<<shiftCount
) ) {
5641 STATUS(float_exception_flags
) |= float_flag_inexact
;
5645 if ( aExp
< 0x3FFF ) {
5646 if ( aExp
| aSig0
| aSig1
) {
5647 STATUS(float_exception_flags
) |= float_flag_inexact
;
5651 z
= aSig0
>>( - shiftCount
);
5653 || ( shiftCount
&& (uint64_t) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
5654 STATUS(float_exception_flags
) |= float_flag_inexact
;
5657 if ( aSign
) z
= - z
;
5662 /*----------------------------------------------------------------------------
5663 | Returns the result of converting the quadruple-precision floating-point
5664 | value `a' to the single-precision floating-point format. The conversion
5665 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5667 *----------------------------------------------------------------------------*/
5669 float32
float128_to_float32( float128 a STATUS_PARAM
)
5673 uint64_t aSig0
, aSig1
;
5676 aSig1
= extractFloat128Frac1( a
);
5677 aSig0
= extractFloat128Frac0( a
);
5678 aExp
= extractFloat128Exp( a
);
5679 aSign
= extractFloat128Sign( a
);
5680 if ( aExp
== 0x7FFF ) {
5681 if ( aSig0
| aSig1
) {
5682 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5684 return packFloat32( aSign
, 0xFF, 0 );
5686 aSig0
|= ( aSig1
!= 0 );
5687 shift64RightJamming( aSig0
, 18, &aSig0
);
5689 if ( aExp
|| zSig
) {
5693 return roundAndPackFloat32( aSign
, aExp
, zSig STATUS_VAR
);
5697 /*----------------------------------------------------------------------------
5698 | Returns the result of converting the quadruple-precision floating-point
5699 | value `a' to the double-precision floating-point format. The conversion
5700 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5702 *----------------------------------------------------------------------------*/
5704 float64
float128_to_float64( float128 a STATUS_PARAM
)
5708 uint64_t aSig0
, aSig1
;
5710 aSig1
= extractFloat128Frac1( a
);
5711 aSig0
= extractFloat128Frac0( a
);
5712 aExp
= extractFloat128Exp( a
);
5713 aSign
= extractFloat128Sign( a
);
5714 if ( aExp
== 0x7FFF ) {
5715 if ( aSig0
| aSig1
) {
5716 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5718 return packFloat64( aSign
, 0x7FF, 0 );
5720 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5721 aSig0
|= ( aSig1
!= 0 );
5722 if ( aExp
|| aSig0
) {
5723 aSig0
|= LIT64( 0x4000000000000000 );
5726 return roundAndPackFloat64( aSign
, aExp
, aSig0 STATUS_VAR
);
5730 /*----------------------------------------------------------------------------
5731 | Returns the result of converting the quadruple-precision floating-point
5732 | value `a' to the extended double-precision floating-point format. The
5733 | conversion is performed according to the IEC/IEEE Standard for Binary
5734 | Floating-Point Arithmetic.
5735 *----------------------------------------------------------------------------*/
5737 floatx80
float128_to_floatx80( float128 a STATUS_PARAM
)
5741 uint64_t aSig0
, aSig1
;
5743 aSig1
= extractFloat128Frac1( a
);
5744 aSig0
= extractFloat128Frac0( a
);
5745 aExp
= extractFloat128Exp( a
);
5746 aSign
= extractFloat128Sign( a
);
5747 if ( aExp
== 0x7FFF ) {
5748 if ( aSig0
| aSig1
) {
5749 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR
) STATUS_VAR
);
5751 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
5754 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
5755 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
5758 aSig0
|= LIT64( 0x0001000000000000 );
5760 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
5761 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1 STATUS_VAR
);
5765 /*----------------------------------------------------------------------------
5766 | Rounds the quadruple-precision floating-point value `a' to an integer, and
5767 | returns the result as a quadruple-precision floating-point value. The
5768 | operation is performed according to the IEC/IEEE Standard for Binary
5769 | Floating-Point Arithmetic.
5770 *----------------------------------------------------------------------------*/
5772 float128
float128_round_to_int( float128 a STATUS_PARAM
)
5776 uint64_t lastBitMask
, roundBitsMask
;
5780 aExp
= extractFloat128Exp( a
);
5781 if ( 0x402F <= aExp
) {
5782 if ( 0x406F <= aExp
) {
5783 if ( ( aExp
== 0x7FFF )
5784 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
5786 return propagateFloat128NaN( a
, a STATUS_VAR
);
5791 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
5792 roundBitsMask
= lastBitMask
- 1;
5794 roundingMode
= STATUS(float_rounding_mode
);
5795 if ( roundingMode
== float_round_nearest_even
) {
5796 if ( lastBitMask
) {
5797 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
5798 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
5801 if ( (int64_t) z
.low
< 0 ) {
5803 if ( (uint64_t) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
5807 else if ( roundingMode
!= float_round_to_zero
) {
5808 if ( extractFloat128Sign( z
)
5809 ^ ( roundingMode
== float_round_up
) ) {
5810 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
5813 z
.low
&= ~ roundBitsMask
;
5816 if ( aExp
< 0x3FFF ) {
5817 if ( ( ( (uint64_t) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
5818 STATUS(float_exception_flags
) |= float_flag_inexact
;
5819 aSign
= extractFloat128Sign( a
);
5820 switch ( STATUS(float_rounding_mode
) ) {
5821 case float_round_nearest_even
:
5822 if ( ( aExp
== 0x3FFE )
5823 && ( extractFloat128Frac0( a
)
5824 | extractFloat128Frac1( a
) )
5826 return packFloat128( aSign
, 0x3FFF, 0, 0 );
5829 case float_round_down
:
5831 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
5832 : packFloat128( 0, 0, 0, 0 );
5833 case float_round_up
:
5835 aSign
? packFloat128( 1, 0, 0, 0 )
5836 : packFloat128( 0, 0x3FFF, 0, 0 );
5838 return packFloat128( aSign
, 0, 0, 0 );
5841 lastBitMask
<<= 0x402F - aExp
;
5842 roundBitsMask
= lastBitMask
- 1;
5845 roundingMode
= STATUS(float_rounding_mode
);
5846 if ( roundingMode
== float_round_nearest_even
) {
5847 z
.high
+= lastBitMask
>>1;
5848 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
5849 z
.high
&= ~ lastBitMask
;
5852 else if ( roundingMode
!= float_round_to_zero
) {
5853 if ( extractFloat128Sign( z
)
5854 ^ ( roundingMode
== float_round_up
) ) {
5855 z
.high
|= ( a
.low
!= 0 );
5856 z
.high
+= roundBitsMask
;
5859 z
.high
&= ~ roundBitsMask
;
5861 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
5862 STATUS(float_exception_flags
) |= float_flag_inexact
;
5868 /*----------------------------------------------------------------------------
5869 | Returns the result of adding the absolute values of the quadruple-precision
5870 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
5871 | before being returned. `zSign' is ignored if the result is a NaN.
5872 | The addition is performed according to the IEC/IEEE Standard for Binary
5873 | Floating-Point Arithmetic.
5874 *----------------------------------------------------------------------------*/
5876 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5878 int32 aExp
, bExp
, zExp
;
5879 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
5882 aSig1
= extractFloat128Frac1( a
);
5883 aSig0
= extractFloat128Frac0( a
);
5884 aExp
= extractFloat128Exp( a
);
5885 bSig1
= extractFloat128Frac1( b
);
5886 bSig0
= extractFloat128Frac0( b
);
5887 bExp
= extractFloat128Exp( b
);
5888 expDiff
= aExp
- bExp
;
5889 if ( 0 < expDiff
) {
5890 if ( aExp
== 0x7FFF ) {
5891 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5898 bSig0
|= LIT64( 0x0001000000000000 );
5900 shift128ExtraRightJamming(
5901 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
5904 else if ( expDiff
< 0 ) {
5905 if ( bExp
== 0x7FFF ) {
5906 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
5907 return packFloat128( zSign
, 0x7FFF, 0, 0 );
5913 aSig0
|= LIT64( 0x0001000000000000 );
5915 shift128ExtraRightJamming(
5916 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
5920 if ( aExp
== 0x7FFF ) {
5921 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5922 return propagateFloat128NaN( a
, b STATUS_VAR
);
5926 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5928 if (STATUS(flush_to_zero
)) {
5929 if (zSig0
| zSig1
) {
5930 float_raise(float_flag_output_denormal STATUS_VAR
);
5932 return packFloat128(zSign
, 0, 0, 0);
5934 return packFloat128( zSign
, 0, zSig0
, zSig1
);
5937 zSig0
|= LIT64( 0x0002000000000000 );
5941 aSig0
|= LIT64( 0x0001000000000000 );
5942 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
5944 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
5947 shift128ExtraRightJamming(
5948 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
5950 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
5954 /*----------------------------------------------------------------------------
5955 | Returns the result of subtracting the absolute values of the quadruple-
5956 | precision floating-point values `a' and `b'. If `zSign' is 1, the
5957 | difference is negated before being returned. `zSign' is ignored if the
5958 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5959 | Standard for Binary Floating-Point Arithmetic.
5960 *----------------------------------------------------------------------------*/
5962 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign STATUS_PARAM
)
5964 int32 aExp
, bExp
, zExp
;
5965 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
5969 aSig1
= extractFloat128Frac1( a
);
5970 aSig0
= extractFloat128Frac0( a
);
5971 aExp
= extractFloat128Exp( a
);
5972 bSig1
= extractFloat128Frac1( b
);
5973 bSig0
= extractFloat128Frac0( b
);
5974 bExp
= extractFloat128Exp( b
);
5975 expDiff
= aExp
- bExp
;
5976 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
5977 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
5978 if ( 0 < expDiff
) goto aExpBigger
;
5979 if ( expDiff
< 0 ) goto bExpBigger
;
5980 if ( aExp
== 0x7FFF ) {
5981 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
5982 return propagateFloat128NaN( a
, b STATUS_VAR
);
5984 float_raise( float_flag_invalid STATUS_VAR
);
5985 z
.low
= float128_default_nan_low
;
5986 z
.high
= float128_default_nan_high
;
5993 if ( bSig0
< aSig0
) goto aBigger
;
5994 if ( aSig0
< bSig0
) goto bBigger
;
5995 if ( bSig1
< aSig1
) goto aBigger
;
5996 if ( aSig1
< bSig1
) goto bBigger
;
5997 return packFloat128( STATUS(float_rounding_mode
) == float_round_down
, 0, 0, 0 );
5999 if ( bExp
== 0x7FFF ) {
6000 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6001 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
6007 aSig0
|= LIT64( 0x4000000000000000 );
6009 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6010 bSig0
|= LIT64( 0x4000000000000000 );
6012 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6015 goto normalizeRoundAndPack
;
6017 if ( aExp
== 0x7FFF ) {
6018 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6025 bSig0
|= LIT64( 0x4000000000000000 );
6027 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
6028 aSig0
|= LIT64( 0x4000000000000000 );
6030 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
6032 normalizeRoundAndPack
:
6034 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1 STATUS_VAR
);
6038 /*----------------------------------------------------------------------------
6039 | Returns the result of adding the quadruple-precision floating-point values
6040 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6041 | for Binary Floating-Point Arithmetic.
6042 *----------------------------------------------------------------------------*/
6044 float128
float128_add( float128 a
, float128 b STATUS_PARAM
)
6048 aSign
= extractFloat128Sign( a
);
6049 bSign
= extractFloat128Sign( b
);
6050 if ( aSign
== bSign
) {
6051 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6054 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6059 /*----------------------------------------------------------------------------
6060 | Returns the result of subtracting the quadruple-precision floating-point
6061 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6062 | Standard for Binary Floating-Point Arithmetic.
6063 *----------------------------------------------------------------------------*/
6065 float128
float128_sub( float128 a
, float128 b STATUS_PARAM
)
6069 aSign
= extractFloat128Sign( a
);
6070 bSign
= extractFloat128Sign( b
);
6071 if ( aSign
== bSign
) {
6072 return subFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6075 return addFloat128Sigs( a
, b
, aSign STATUS_VAR
);
6080 /*----------------------------------------------------------------------------
6081 | Returns the result of multiplying the quadruple-precision floating-point
6082 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6083 | Standard for Binary Floating-Point Arithmetic.
6084 *----------------------------------------------------------------------------*/
6086 float128
float128_mul( float128 a
, float128 b STATUS_PARAM
)
6088 flag aSign
, bSign
, zSign
;
6089 int32 aExp
, bExp
, zExp
;
6090 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
6093 aSig1
= extractFloat128Frac1( a
);
6094 aSig0
= extractFloat128Frac0( a
);
6095 aExp
= extractFloat128Exp( a
);
6096 aSign
= extractFloat128Sign( a
);
6097 bSig1
= extractFloat128Frac1( b
);
6098 bSig0
= extractFloat128Frac0( b
);
6099 bExp
= extractFloat128Exp( b
);
6100 bSign
= extractFloat128Sign( b
);
6101 zSign
= aSign
^ bSign
;
6102 if ( aExp
== 0x7FFF ) {
6103 if ( ( aSig0
| aSig1
)
6104 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6105 return propagateFloat128NaN( a
, b STATUS_VAR
);
6107 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
6108 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6110 if ( bExp
== 0x7FFF ) {
6111 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6112 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6114 float_raise( float_flag_invalid STATUS_VAR
);
6115 z
.low
= float128_default_nan_low
;
6116 z
.high
= float128_default_nan_high
;
6119 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6122 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6123 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6126 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6127 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6129 zExp
= aExp
+ bExp
- 0x4000;
6130 aSig0
|= LIT64( 0x0001000000000000 );
6131 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
6132 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
6133 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
6134 zSig2
|= ( zSig3
!= 0 );
6135 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
6136 shift128ExtraRightJamming(
6137 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
6140 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6144 /*----------------------------------------------------------------------------
6145 | Returns the result of dividing the quadruple-precision floating-point value
6146 | `a' by the corresponding value `b'. The operation is performed according to
6147 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6148 *----------------------------------------------------------------------------*/
6150 float128
float128_div( float128 a
, float128 b STATUS_PARAM
)
6152 flag aSign
, bSign
, zSign
;
6153 int32 aExp
, bExp
, zExp
;
6154 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
6155 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6158 aSig1
= extractFloat128Frac1( a
);
6159 aSig0
= extractFloat128Frac0( a
);
6160 aExp
= extractFloat128Exp( a
);
6161 aSign
= extractFloat128Sign( a
);
6162 bSig1
= extractFloat128Frac1( b
);
6163 bSig0
= extractFloat128Frac0( b
);
6164 bExp
= extractFloat128Exp( b
);
6165 bSign
= extractFloat128Sign( b
);
6166 zSign
= aSign
^ bSign
;
6167 if ( aExp
== 0x7FFF ) {
6168 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6169 if ( bExp
== 0x7FFF ) {
6170 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6173 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6175 if ( bExp
== 0x7FFF ) {
6176 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6177 return packFloat128( zSign
, 0, 0, 0 );
6180 if ( ( bSig0
| bSig1
) == 0 ) {
6181 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
6183 float_raise( float_flag_invalid STATUS_VAR
);
6184 z
.low
= float128_default_nan_low
;
6185 z
.high
= float128_default_nan_high
;
6188 float_raise( float_flag_divbyzero STATUS_VAR
);
6189 return packFloat128( zSign
, 0x7FFF, 0, 0 );
6191 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6194 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
6195 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6197 zExp
= aExp
- bExp
+ 0x3FFD;
6199 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
6201 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6202 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
6203 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
6206 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6207 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
6208 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
6209 while ( (int64_t) rem0
< 0 ) {
6211 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
6213 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
6214 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
6215 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
6216 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
6217 while ( (int64_t) rem1
< 0 ) {
6219 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
6221 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6223 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
6224 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6228 /*----------------------------------------------------------------------------
6229 | Returns the remainder of the quadruple-precision floating-point value `a'
6230 | with respect to the corresponding value `b'. The operation is performed
6231 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6232 *----------------------------------------------------------------------------*/
6234 float128
float128_rem( float128 a
, float128 b STATUS_PARAM
)
6237 int32 aExp
, bExp
, expDiff
;
6238 uint64_t aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
6239 uint64_t allZero
, alternateASig0
, alternateASig1
, sigMean1
;
6243 aSig1
= extractFloat128Frac1( a
);
6244 aSig0
= extractFloat128Frac0( a
);
6245 aExp
= extractFloat128Exp( a
);
6246 aSign
= extractFloat128Sign( a
);
6247 bSig1
= extractFloat128Frac1( b
);
6248 bSig0
= extractFloat128Frac0( b
);
6249 bExp
= extractFloat128Exp( b
);
6250 if ( aExp
== 0x7FFF ) {
6251 if ( ( aSig0
| aSig1
)
6252 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
6253 return propagateFloat128NaN( a
, b STATUS_VAR
);
6257 if ( bExp
== 0x7FFF ) {
6258 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b STATUS_VAR
);
6262 if ( ( bSig0
| bSig1
) == 0 ) {
6264 float_raise( float_flag_invalid STATUS_VAR
);
6265 z
.low
= float128_default_nan_low
;
6266 z
.high
= float128_default_nan_high
;
6269 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
6272 if ( ( aSig0
| aSig1
) == 0 ) return a
;
6273 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6275 expDiff
= aExp
- bExp
;
6276 if ( expDiff
< -1 ) return a
;
6278 aSig0
| LIT64( 0x0001000000000000 ),
6280 15 - ( expDiff
< 0 ),
6285 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
6286 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
6287 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6289 while ( 0 < expDiff
) {
6290 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6291 q
= ( 4 < q
) ? q
- 4 : 0;
6292 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6293 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
6294 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
6295 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
6298 if ( -64 < expDiff
) {
6299 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
6300 q
= ( 4 < q
) ? q
- 4 : 0;
6302 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6304 if ( expDiff
< 0 ) {
6305 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
6308 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
6310 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
6311 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
6314 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
6315 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
6318 alternateASig0
= aSig0
;
6319 alternateASig1
= aSig1
;
6321 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
6322 } while ( 0 <= (int64_t) aSig0
);
6324 aSig0
, aSig1
, alternateASig0
, alternateASig1
, (uint64_t *)&sigMean0
, &sigMean1
);
6325 if ( ( sigMean0
< 0 )
6326 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
6327 aSig0
= alternateASig0
;
6328 aSig1
= alternateASig1
;
6330 zSign
= ( (int64_t) aSig0
< 0 );
6331 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
6333 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1 STATUS_VAR
);
6337 /*----------------------------------------------------------------------------
6338 | Returns the square root of the quadruple-precision floating-point value `a'.
6339 | The operation is performed according to the IEC/IEEE Standard for Binary
6340 | Floating-Point Arithmetic.
6341 *----------------------------------------------------------------------------*/
6343 float128
float128_sqrt( float128 a STATUS_PARAM
)
6347 uint64_t aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
6348 uint64_t rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
6351 aSig1
= extractFloat128Frac1( a
);
6352 aSig0
= extractFloat128Frac0( a
);
6353 aExp
= extractFloat128Exp( a
);
6354 aSign
= extractFloat128Sign( a
);
6355 if ( aExp
== 0x7FFF ) {
6356 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a STATUS_VAR
);
6357 if ( ! aSign
) return a
;
6361 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
6363 float_raise( float_flag_invalid STATUS_VAR
);
6364 z
.low
= float128_default_nan_low
;
6365 z
.high
= float128_default_nan_high
;
6369 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
6370 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
6372 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
6373 aSig0
|= LIT64( 0x0001000000000000 );
6374 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
6375 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
6376 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
6377 doubleZSig0
= zSig0
<<1;
6378 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
6379 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
6380 while ( (int64_t) rem0
< 0 ) {
6383 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
6385 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
6386 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
6387 if ( zSig1
== 0 ) zSig1
= 1;
6388 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
6389 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
6390 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
6391 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6392 while ( (int64_t) rem1
< 0 ) {
6394 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
6396 term2
|= doubleZSig0
;
6397 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
6399 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
6401 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
6402 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2 STATUS_VAR
);
6406 /*----------------------------------------------------------------------------
6407 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6408 | the corresponding value `b', and 0 otherwise. The invalid exception is
6409 | raised if either operand is a NaN. Otherwise, the comparison is performed
6410 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6411 *----------------------------------------------------------------------------*/
6413 int float128_eq( float128 a
, float128 b STATUS_PARAM
)
6416 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6417 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6418 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6419 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6421 float_raise( float_flag_invalid STATUS_VAR
);
6426 && ( ( a
.high
== b
.high
)
6428 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6433 /*----------------------------------------------------------------------------
6434 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6435 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6436 | exception is raised if either operand is a NaN. The comparison is performed
6437 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6438 *----------------------------------------------------------------------------*/
6440 int float128_le( float128 a
, float128 b STATUS_PARAM
)
6444 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6445 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6446 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6447 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6449 float_raise( float_flag_invalid STATUS_VAR
);
6452 aSign
= extractFloat128Sign( a
);
6453 bSign
= extractFloat128Sign( b
);
6454 if ( aSign
!= bSign
) {
6457 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6461 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6462 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6466 /*----------------------------------------------------------------------------
6467 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6468 | the corresponding value `b', and 0 otherwise. The invalid exception is
6469 | raised if either operand is a NaN. The comparison is performed according
6470 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6471 *----------------------------------------------------------------------------*/
6473 int float128_lt( float128 a
, float128 b STATUS_PARAM
)
6477 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6478 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6479 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6480 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6482 float_raise( float_flag_invalid STATUS_VAR
);
6485 aSign
= extractFloat128Sign( a
);
6486 bSign
= extractFloat128Sign( b
);
6487 if ( aSign
!= bSign
) {
6490 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6494 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6495 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6499 /*----------------------------------------------------------------------------
6500 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6501 | be compared, and 0 otherwise. The invalid exception is raised if either
6502 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6503 | Standard for Binary Floating-Point Arithmetic.
6504 *----------------------------------------------------------------------------*/
6506 int float128_unordered( float128 a
, float128 b STATUS_PARAM
)
6508 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6509 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6510 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6511 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6513 float_raise( float_flag_invalid STATUS_VAR
);
6519 /*----------------------------------------------------------------------------
6520 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6521 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6522 | exception. The comparison is performed according to the IEC/IEEE Standard
6523 | for Binary Floating-Point Arithmetic.
6524 *----------------------------------------------------------------------------*/
6526 int float128_eq_quiet( float128 a
, float128 b STATUS_PARAM
)
6529 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6530 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6531 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6532 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6534 if ( float128_is_signaling_nan( a
)
6535 || float128_is_signaling_nan( b
) ) {
6536 float_raise( float_flag_invalid STATUS_VAR
);
6542 && ( ( a
.high
== b
.high
)
6544 && ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
6549 /*----------------------------------------------------------------------------
6550 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6551 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6552 | cause an exception. Otherwise, the comparison is performed according to the
6553 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6554 *----------------------------------------------------------------------------*/
6556 int float128_le_quiet( float128 a
, float128 b STATUS_PARAM
)
6560 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6561 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6562 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6563 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6565 if ( float128_is_signaling_nan( a
)
6566 || float128_is_signaling_nan( b
) ) {
6567 float_raise( float_flag_invalid STATUS_VAR
);
6571 aSign
= extractFloat128Sign( a
);
6572 bSign
= extractFloat128Sign( b
);
6573 if ( aSign
!= bSign
) {
6576 || ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6580 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
6581 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
6585 /*----------------------------------------------------------------------------
6586 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6587 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6588 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
6589 | Standard for Binary Floating-Point Arithmetic.
6590 *----------------------------------------------------------------------------*/
6592 int float128_lt_quiet( float128 a
, float128 b STATUS_PARAM
)
6596 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6597 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6598 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6599 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6601 if ( float128_is_signaling_nan( a
)
6602 || float128_is_signaling_nan( b
) ) {
6603 float_raise( float_flag_invalid STATUS_VAR
);
6607 aSign
= extractFloat128Sign( a
);
6608 bSign
= extractFloat128Sign( b
);
6609 if ( aSign
!= bSign
) {
6612 && ( ( ( (uint64_t) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
6616 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
6617 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
6621 /*----------------------------------------------------------------------------
6622 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6623 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6624 | comparison is performed according to the IEC/IEEE Standard for Binary
6625 | Floating-Point Arithmetic.
6626 *----------------------------------------------------------------------------*/
6628 int float128_unordered_quiet( float128 a
, float128 b STATUS_PARAM
)
6630 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
6631 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
6632 || ( ( extractFloat128Exp( b
) == 0x7FFF )
6633 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
6635 if ( float128_is_signaling_nan( a
)
6636 || float128_is_signaling_nan( b
) ) {
6637 float_raise( float_flag_invalid STATUS_VAR
);
6644 /* misc functions */
6645 float32
uint32_to_float32(uint32_t a STATUS_PARAM
)
6647 return int64_to_float32(a STATUS_VAR
);
6650 float64
uint32_to_float64(uint32_t a STATUS_PARAM
)
6652 return int64_to_float64(a STATUS_VAR
);
6655 uint32
float32_to_uint32( float32 a STATUS_PARAM
)
6659 int old_exc_flags
= get_float_exception_flags(status
);
6661 v
= float32_to_int64(a STATUS_VAR
);
6664 } else if (v
> 0xffffffff) {
6669 set_float_exception_flags(old_exc_flags
, status
);
6670 float_raise(float_flag_invalid STATUS_VAR
);
6674 uint32
float32_to_uint32_round_to_zero( float32 a STATUS_PARAM
)
6678 int old_exc_flags
= get_float_exception_flags(status
);
6680 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6683 } else if (v
> 0xffffffff) {
6688 set_float_exception_flags(old_exc_flags
, status
);
6689 float_raise(float_flag_invalid STATUS_VAR
);
6693 int_fast16_t float32_to_int16(float32 a STATUS_PARAM
)
6697 int old_exc_flags
= get_float_exception_flags(status
);
6699 v
= float32_to_int32(a STATUS_VAR
);
6702 } else if (v
> 0x7fff) {
6708 set_float_exception_flags(old_exc_flags
, status
);
6709 float_raise(float_flag_invalid STATUS_VAR
);
6713 uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM
)
6717 int old_exc_flags
= get_float_exception_flags(status
);
6719 v
= float32_to_int32(a STATUS_VAR
);
6722 } else if (v
> 0xffff) {
6728 set_float_exception_flags(old_exc_flags
, status
);
6729 float_raise(float_flag_invalid STATUS_VAR
);
6733 uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM
)
6737 int old_exc_flags
= get_float_exception_flags(status
);
6739 v
= float32_to_int64_round_to_zero(a STATUS_VAR
);
6742 } else if (v
> 0xffff) {
6747 set_float_exception_flags(old_exc_flags
, status
);
6748 float_raise(float_flag_invalid STATUS_VAR
);
6752 uint32
float64_to_uint32( float64 a STATUS_PARAM
)
6756 int old_exc_flags
= get_float_exception_flags(status
);
6758 v
= float64_to_uint64(a STATUS_VAR
);
6759 if (v
> 0xffffffff) {
6764 set_float_exception_flags(old_exc_flags
, status
);
6765 float_raise(float_flag_invalid STATUS_VAR
);
6769 uint32
float64_to_uint32_round_to_zero( float64 a STATUS_PARAM
)
6773 int old_exc_flags
= get_float_exception_flags(status
);
6775 v
= float64_to_uint64_round_to_zero(a STATUS_VAR
);
6776 if (v
> 0xffffffff) {
6781 set_float_exception_flags(old_exc_flags
, status
);
6782 float_raise(float_flag_invalid STATUS_VAR
);
6786 int_fast16_t float64_to_int16(float64 a STATUS_PARAM
)
6790 int old_exc_flags
= get_float_exception_flags(status
);
6792 v
= float64_to_int32(a STATUS_VAR
);
6795 } else if (v
> 0x7fff) {
6801 set_float_exception_flags(old_exc_flags
, status
);
6802 float_raise(float_flag_invalid STATUS_VAR
);
6806 uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM
)
6810 int old_exc_flags
= get_float_exception_flags(status
);
6812 v
= float64_to_int32(a STATUS_VAR
);
6815 } else if (v
> 0xffff) {
6821 set_float_exception_flags(old_exc_flags
, status
);
6822 float_raise(float_flag_invalid STATUS_VAR
);
6826 uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM
)
6830 int old_exc_flags
= get_float_exception_flags(status
);
6832 v
= float64_to_int64_round_to_zero(a STATUS_VAR
);
6835 } else if (v
> 0xffff) {
6840 set_float_exception_flags(old_exc_flags
, status
);
6841 float_raise(float_flag_invalid STATUS_VAR
);
6845 /*----------------------------------------------------------------------------
6846 | Returns the result of converting the double-precision floating-point value
6847 | `a' to the 64-bit unsigned integer format. The conversion is
6848 | performed according to the IEC/IEEE Standard for Binary Floating-Point
6849 | Arithmetic---which means in particular that the conversion is rounded
6850 | according to the current rounding mode. If `a' is a NaN, the largest
6851 | positive integer is returned. If the conversion overflows, the
6852 | largest unsigned integer is returned. If 'a' is negative, the value is
6853 | rounded and zero is returned; negative values that do not round to zero
6854 | will raise the inexact exception.
6855 *----------------------------------------------------------------------------*/
6857 uint64_t float64_to_uint64(float64 a STATUS_PARAM
)
6860 int_fast16_t aExp
, shiftCount
;
6861 uint64_t aSig
, aSigExtra
;
6862 a
= float64_squash_input_denormal(a STATUS_VAR
);
6864 aSig
= extractFloat64Frac(a
);
6865 aExp
= extractFloat64Exp(a
);
6866 aSign
= extractFloat64Sign(a
);
6867 if (aSign
&& (aExp
> 1022)) {
6868 float_raise(float_flag_invalid STATUS_VAR
);
6869 if (float64_is_any_nan(a
)) {
6870 return LIT64(0xFFFFFFFFFFFFFFFF);
6876 aSig
|= LIT64(0x0010000000000000);
6878 shiftCount
= 0x433 - aExp
;
6879 if (shiftCount
<= 0) {
6881 float_raise(float_flag_invalid STATUS_VAR
);
6882 return LIT64(0xFFFFFFFFFFFFFFFF);
6885 aSig
<<= -shiftCount
;
6887 shift64ExtraRightJamming(aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
6889 return roundAndPackUint64(aSign
, aSig
, aSigExtra STATUS_VAR
);
6892 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM
)
6894 signed char current_rounding_mode
= STATUS(float_rounding_mode
);
6895 set_float_rounding_mode(float_round_to_zero STATUS_VAR
);
6896 int64_t v
= float64_to_uint64(a STATUS_VAR
);
6897 set_float_rounding_mode(current_rounding_mode STATUS_VAR
);
6901 #define COMPARE(s, nan_exp) \
6902 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \
6903 int is_quiet STATUS_PARAM ) \
6905 flag aSign, bSign; \
6906 uint ## s ## _t av, bv; \
6907 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
6908 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
6910 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
6911 extractFloat ## s ## Frac( a ) ) || \
6912 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
6913 extractFloat ## s ## Frac( b ) )) { \
6915 float ## s ## _is_signaling_nan( a ) || \
6916 float ## s ## _is_signaling_nan( b ) ) { \
6917 float_raise( float_flag_invalid STATUS_VAR); \
6919 return float_relation_unordered; \
6921 aSign = extractFloat ## s ## Sign( a ); \
6922 bSign = extractFloat ## s ## Sign( b ); \
6923 av = float ## s ## _val(a); \
6924 bv = float ## s ## _val(b); \
6925 if ( aSign != bSign ) { \
6926 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
6928 return float_relation_equal; \
6930 return 1 - (2 * aSign); \
6934 return float_relation_equal; \
6936 return 1 - 2 * (aSign ^ ( av < bv )); \
6941 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
6943 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
6946 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
6948 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
6954 INLINE
int floatx80_compare_internal( floatx80 a
, floatx80 b
,
6955 int is_quiet STATUS_PARAM
)
6959 if (( ( extractFloatx80Exp( a
) == 0x7fff ) &&
6960 ( extractFloatx80Frac( a
)<<1 ) ) ||
6961 ( ( extractFloatx80Exp( b
) == 0x7fff ) &&
6962 ( extractFloatx80Frac( b
)<<1 ) )) {
6964 floatx80_is_signaling_nan( a
) ||
6965 floatx80_is_signaling_nan( b
) ) {
6966 float_raise( float_flag_invalid STATUS_VAR
);
6968 return float_relation_unordered
;
6970 aSign
= extractFloatx80Sign( a
);
6971 bSign
= extractFloatx80Sign( b
);
6972 if ( aSign
!= bSign
) {
6974 if ( ( ( (uint16_t) ( ( a
.high
| b
.high
) << 1 ) ) == 0) &&
6975 ( ( a
.low
| b
.low
) == 0 ) ) {
6977 return float_relation_equal
;
6979 return 1 - (2 * aSign
);
6982 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
6983 return float_relation_equal
;
6985 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
6990 int floatx80_compare( floatx80 a
, floatx80 b STATUS_PARAM
)
6992 return floatx80_compare_internal(a
, b
, 0 STATUS_VAR
);
6995 int floatx80_compare_quiet( floatx80 a
, floatx80 b STATUS_PARAM
)
6997 return floatx80_compare_internal(a
, b
, 1 STATUS_VAR
);
7000 INLINE
int float128_compare_internal( float128 a
, float128 b
,
7001 int is_quiet STATUS_PARAM
)
7005 if (( ( extractFloat128Exp( a
) == 0x7fff ) &&
7006 ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) ) ||
7007 ( ( extractFloat128Exp( b
) == 0x7fff ) &&
7008 ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )) {
7010 float128_is_signaling_nan( a
) ||
7011 float128_is_signaling_nan( b
) ) {
7012 float_raise( float_flag_invalid STATUS_VAR
);
7014 return float_relation_unordered
;
7016 aSign
= extractFloat128Sign( a
);
7017 bSign
= extractFloat128Sign( b
);
7018 if ( aSign
!= bSign
) {
7019 if ( ( ( ( a
.high
| b
.high
)<<1 ) | a
.low
| b
.low
) == 0 ) {
7021 return float_relation_equal
;
7023 return 1 - (2 * aSign
);
7026 if (a
.low
== b
.low
&& a
.high
== b
.high
) {
7027 return float_relation_equal
;
7029 return 1 - 2 * (aSign
^ ( lt128( a
.high
, a
.low
, b
.high
, b
.low
) ));
7034 int float128_compare( float128 a
, float128 b STATUS_PARAM
)
7036 return float128_compare_internal(a
, b
, 0 STATUS_VAR
);
7039 int float128_compare_quiet( float128 a
, float128 b STATUS_PARAM
)
7041 return float128_compare_internal(a
, b
, 1 STATUS_VAR
);
7044 /* min() and max() functions. These can't be implemented as
7045 * 'compare and pick one input' because that would mishandle
7046 * NaNs and +0 vs -0.
7048 * minnum() and maxnum() functions. These are similar to the min()
7049 * and max() functions but if one of the arguments is a QNaN and
7050 * the other is numerical then the numerical argument is returned.
7051 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7052 * and maxNum() operations. min() and max() are the typical min/max
7053 * semantics provided by many CPUs which predate that specification.
7056 INLINE float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7057 int ismin, int isieee STATUS_PARAM) \
7059 flag aSign, bSign; \
7060 uint ## s ## _t av, bv; \
7061 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7062 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7063 if (float ## s ## _is_any_nan(a) || \
7064 float ## s ## _is_any_nan(b)) { \
7066 if (float ## s ## _is_quiet_nan(a) && \
7067 !float ## s ##_is_any_nan(b)) { \
7069 } else if (float ## s ## _is_quiet_nan(b) && \
7070 !float ## s ## _is_any_nan(a)) { \
7074 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7076 aSign = extractFloat ## s ## Sign(a); \
7077 bSign = extractFloat ## s ## Sign(b); \
7078 av = float ## s ## _val(a); \
7079 bv = float ## s ## _val(b); \
7080 if (aSign != bSign) { \
7082 return aSign ? a : b; \
7084 return aSign ? b : a; \
7088 return (aSign ^ (av < bv)) ? a : b; \
7090 return (aSign ^ (av < bv)) ? b : a; \
7095 float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7097 return float ## s ## _minmax(a, b, 1, 0 STATUS_VAR); \
7100 float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7102 return float ## s ## _minmax(a, b, 0, 0 STATUS_VAR); \
7105 float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7107 return float ## s ## _minmax(a, b, 1, 1 STATUS_VAR); \
7110 float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7112 return float ## s ## _minmax(a, b, 0, 1 STATUS_VAR); \
7119 /* Multiply A by 2 raised to the power N. */
7120 float32
float32_scalbn( float32 a
, int n STATUS_PARAM
)
7126 a
= float32_squash_input_denormal(a STATUS_VAR
);
7127 aSig
= extractFloat32Frac( a
);
7128 aExp
= extractFloat32Exp( a
);
7129 aSign
= extractFloat32Sign( a
);
7131 if ( aExp
== 0xFF ) {
7133 return propagateFloat32NaN( a
, a STATUS_VAR
);
7139 } else if (aSig
== 0) {
7147 } else if (n
< -0x200) {
7153 return normalizeRoundAndPackFloat32( aSign
, aExp
, aSig STATUS_VAR
);
7156 float64
float64_scalbn( float64 a
, int n STATUS_PARAM
)
7162 a
= float64_squash_input_denormal(a STATUS_VAR
);
7163 aSig
= extractFloat64Frac( a
);
7164 aExp
= extractFloat64Exp( a
);
7165 aSign
= extractFloat64Sign( a
);
7167 if ( aExp
== 0x7FF ) {
7169 return propagateFloat64NaN( a
, a STATUS_VAR
);
7174 aSig
|= LIT64( 0x0010000000000000 );
7175 } else if (aSig
== 0) {
7183 } else if (n
< -0x1000) {
7189 return normalizeRoundAndPackFloat64( aSign
, aExp
, aSig STATUS_VAR
);
7192 floatx80
floatx80_scalbn( floatx80 a
, int n STATUS_PARAM
)
7198 aSig
= extractFloatx80Frac( a
);
7199 aExp
= extractFloatx80Exp( a
);
7200 aSign
= extractFloatx80Sign( a
);
7202 if ( aExp
== 0x7FFF ) {
7204 return propagateFloatx80NaN( a
, a STATUS_VAR
);
7218 } else if (n
< -0x10000) {
7223 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision
),
7224 aSign
, aExp
, aSig
, 0 STATUS_VAR
);
7227 float128
float128_scalbn( float128 a
, int n STATUS_PARAM
)
7231 uint64_t aSig0
, aSig1
;
7233 aSig1
= extractFloat128Frac1( a
);
7234 aSig0
= extractFloat128Frac0( a
);
7235 aExp
= extractFloat128Exp( a
);
7236 aSign
= extractFloat128Sign( a
);
7237 if ( aExp
== 0x7FFF ) {
7238 if ( aSig0
| aSig1
) {
7239 return propagateFloat128NaN( a
, a STATUS_VAR
);
7244 aSig0
|= LIT64( 0x0001000000000000 );
7245 } else if (aSig0
== 0 && aSig1
== 0) {
7253 } else if (n
< -0x10000) {
7258 return normalizeRoundAndPackFloat128( aSign
, aExp
, aSig0
, aSig1