]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
softfloat: Revert and reimplement remaining portions of 75d62a5856 and 3430b0be36f
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
4 * Derived from SoftFloat.
5 */
158142c2 6
a7d1ac78
PM
7/*
8===============================================================================
9This C source file is part of the SoftFloat IEC/IEEE Floating-point
10Arithmetic Package, Release 2a.
158142c2
FB
11
12Written by John R. Hauser. This work was made possible in part by the
13International Computer Science Institute, located at Suite 600, 1947 Center
14Street, Berkeley, California 94704. Funding was partially provided by the
15National Science Foundation under grant MIP-9311980. The original version
16of this code was written as part of a project to build a fixed-point vector
17processor in collaboration with the University of California at Berkeley,
18overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 19is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
20arithmetic/SoftFloat.html'.
21
a7d1ac78
PM
22THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
23has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
24TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
25PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
26AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
27
28Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
29(1) they include prominent notice that the work is derivative, and (2) they
30include prominent notice akin to these four paragraphs for those parts of
31this code that are retained.
158142c2 32
a7d1ac78
PM
33===============================================================================
34*/
158142c2 35
2ac8bd03
PM
36/* softfloat (and in particular the code in softfloat-specialize.h) is
37 * target-dependent and needs the TARGET_* macros.
38 */
39#include "config.h"
40
6b4c305c 41#include "fpu/softfloat.h"
158142c2 42
dc355b76
PM
43/* We only need stdlib for abort() */
44#include <stdlib.h>
45
158142c2
FB
46/*----------------------------------------------------------------------------
47| Primitive arithmetic functions, including multi-word arithmetic, and
48| division and square root approximations. (Can be specialized to target if
49| desired.)
50*----------------------------------------------------------------------------*/
51#include "softfloat-macros.h"
52
53/*----------------------------------------------------------------------------
54| Functions and definitions to determine: (1) whether tininess for underflow
55| is detected before or after rounding by default, (2) what (if anything)
56| happens when exceptions are raised, (3) how signaling NaNs are distinguished
57| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
58| are propagated from function inputs to output. These details are target-
59| specific.
60*----------------------------------------------------------------------------*/
61#include "softfloat-specialize.h"
62
bb4d4bb3
PM
63/*----------------------------------------------------------------------------
64| Returns the fraction bits of the half-precision floating-point value `a'.
65*----------------------------------------------------------------------------*/
66
a49db98d 67static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
68{
69 return float16_val(a) & 0x3ff;
70}
71
72/*----------------------------------------------------------------------------
73| Returns the exponent bits of the half-precision floating-point value `a'.
74*----------------------------------------------------------------------------*/
75
a49db98d 76static inline int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
77{
78 return (float16_val(a) >> 10) & 0x1f;
79}
80
81/*----------------------------------------------------------------------------
82| Returns the sign bit of the half-precision floating-point value `a'.
83*----------------------------------------------------------------------------*/
84
a49db98d 85static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
86{
87 return float16_val(a)>>15;
88}
89
158142c2
FB
90/*----------------------------------------------------------------------------
91| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
92| and 7, and returns the properly rounded 32-bit integer corresponding to the
93| input. If `zSign' is 1, the input is negated before being converted to an
94| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
95| is simply rounded to an integer, with the inexact exception raised if the
96| input cannot be represented exactly as an integer. However, if the fixed-
97| point input is too large, the invalid exception is raised and the largest
98| positive or negative integer is returned.
99*----------------------------------------------------------------------------*/
100
bb98fe42 101static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
102{
103 int8 roundingMode;
104 flag roundNearestEven;
105 int8 roundIncrement, roundBits;
760e1416 106 int32_t z;
158142c2
FB
107
108 roundingMode = STATUS(float_rounding_mode);
109 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
110 switch (roundingMode) {
111 case float_round_nearest_even:
f9288a76 112 case float_round_ties_away:
dc355b76
PM
113 roundIncrement = 0x40;
114 break;
115 case float_round_to_zero:
116 roundIncrement = 0;
117 break;
118 case float_round_up:
119 roundIncrement = zSign ? 0 : 0x7f;
120 break;
121 case float_round_down:
122 roundIncrement = zSign ? 0x7f : 0;
123 break;
124 default:
125 abort();
158142c2
FB
126 }
127 roundBits = absZ & 0x7F;
128 absZ = ( absZ + roundIncrement )>>7;
129 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
130 z = absZ;
131 if ( zSign ) z = - z;
132 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
133 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 134 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
135 }
136 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
137 return z;
138
139}
140
141/*----------------------------------------------------------------------------
142| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
143| `absZ1', with binary point between bits 63 and 64 (between the input words),
144| and returns the properly rounded 64-bit integer corresponding to the input.
145| If `zSign' is 1, the input is negated before being converted to an integer.
146| Ordinarily, the fixed-point input is simply rounded to an integer, with
147| the inexact exception raised if the input cannot be represented exactly as
148| an integer. However, if the fixed-point input is too large, the invalid
149| exception is raised and the largest positive or negative integer is
150| returned.
151*----------------------------------------------------------------------------*/
152
bb98fe42 153static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
154{
155 int8 roundingMode;
156 flag roundNearestEven, increment;
760e1416 157 int64_t z;
158142c2
FB
158
159 roundingMode = STATUS(float_rounding_mode);
160 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
161 switch (roundingMode) {
162 case float_round_nearest_even:
f9288a76 163 case float_round_ties_away:
dc355b76
PM
164 increment = ((int64_t) absZ1 < 0);
165 break;
166 case float_round_to_zero:
167 increment = 0;
168 break;
169 case float_round_up:
170 increment = !zSign && absZ1;
171 break;
172 case float_round_down:
173 increment = zSign && absZ1;
174 break;
175 default:
176 abort();
158142c2
FB
177 }
178 if ( increment ) {
179 ++absZ0;
180 if ( absZ0 == 0 ) goto overflow;
bb98fe42 181 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
182 }
183 z = absZ0;
184 if ( zSign ) z = - z;
185 if ( z && ( ( z < 0 ) ^ zSign ) ) {
186 overflow:
187 float_raise( float_flag_invalid STATUS_VAR);
188 return
bb98fe42 189 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
190 : LIT64( 0x7FFFFFFFFFFFFFFF );
191 }
192 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
193 return z;
194
195}
196
fb3ea83a
TM
197/*----------------------------------------------------------------------------
198| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
199| `absZ1', with binary point between bits 63 and 64 (between the input words),
200| and returns the properly rounded 64-bit unsigned integer corresponding to the
201| input. Ordinarily, the fixed-point input is simply rounded to an integer,
202| with the inexact exception raised if the input cannot be represented exactly
203| as an integer. However, if the fixed-point input is too large, the invalid
204| exception is raised and the largest unsigned integer is returned.
205*----------------------------------------------------------------------------*/
206
207static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
208 uint64_t absZ1 STATUS_PARAM)
209{
210 int8 roundingMode;
211 flag roundNearestEven, increment;
212
213 roundingMode = STATUS(float_rounding_mode);
214 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
215 switch (roundingMode) {
216 case float_round_nearest_even:
f9288a76 217 case float_round_ties_away:
dc355b76
PM
218 increment = ((int64_t)absZ1 < 0);
219 break;
220 case float_round_to_zero:
221 increment = 0;
222 break;
223 case float_round_up:
224 increment = !zSign && absZ1;
225 break;
226 case float_round_down:
227 increment = zSign && absZ1;
228 break;
229 default:
230 abort();
fb3ea83a
TM
231 }
232 if (increment) {
233 ++absZ0;
234 if (absZ0 == 0) {
235 float_raise(float_flag_invalid STATUS_VAR);
236 return LIT64(0xFFFFFFFFFFFFFFFF);
237 }
238 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
239 }
240
241 if (zSign && absZ0) {
242 float_raise(float_flag_invalid STATUS_VAR);
243 return 0;
244 }
245
246 if (absZ1) {
247 STATUS(float_exception_flags) |= float_flag_inexact;
248 }
249 return absZ0;
250}
251
158142c2
FB
252/*----------------------------------------------------------------------------
253| Returns the fraction bits of the single-precision floating-point value `a'.
254*----------------------------------------------------------------------------*/
255
a49db98d 256static inline uint32_t extractFloat32Frac( float32 a )
158142c2
FB
257{
258
f090c9d4 259 return float32_val(a) & 0x007FFFFF;
158142c2
FB
260
261}
262
263/*----------------------------------------------------------------------------
264| Returns the exponent bits of the single-precision floating-point value `a'.
265*----------------------------------------------------------------------------*/
266
a49db98d 267static inline int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
268{
269
f090c9d4 270 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
271
272}
273
274/*----------------------------------------------------------------------------
275| Returns the sign bit of the single-precision floating-point value `a'.
276*----------------------------------------------------------------------------*/
277
a49db98d 278static inline flag extractFloat32Sign( float32 a )
158142c2
FB
279{
280
f090c9d4 281 return float32_val(a)>>31;
158142c2
FB
282
283}
284
37d18660
PM
285/*----------------------------------------------------------------------------
286| If `a' is denormal and we are in flush-to-zero mode then set the
287| input-denormal exception and return zero. Otherwise just return the value.
288*----------------------------------------------------------------------------*/
7baeabce 289float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
37d18660
PM
290{
291 if (STATUS(flush_inputs_to_zero)) {
292 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
293 float_raise(float_flag_input_denormal STATUS_VAR);
294 return make_float32(float32_val(a) & 0x80000000);
295 }
296 }
297 return a;
298}
299
158142c2
FB
300/*----------------------------------------------------------------------------
301| Normalizes the subnormal single-precision floating-point value represented
302| by the denormalized significand `aSig'. The normalized exponent and
303| significand are stored at the locations pointed to by `zExpPtr' and
304| `zSigPtr', respectively.
305*----------------------------------------------------------------------------*/
306
307static void
94a49d86 308 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
158142c2
FB
309{
310 int8 shiftCount;
311
312 shiftCount = countLeadingZeros32( aSig ) - 8;
313 *zSigPtr = aSig<<shiftCount;
314 *zExpPtr = 1 - shiftCount;
315
316}
317
318/*----------------------------------------------------------------------------
319| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
320| single-precision floating-point value, returning the result. After being
321| shifted into the proper positions, the three fields are simply added
322| together to form the result. This means that any integer portion of `zSig'
323| will be added into the exponent. Since a properly normalized significand
324| will have an integer portion equal to 1, the `zExp' input should be 1 less
325| than the desired result exponent whenever `zSig' is a complete, normalized
326| significand.
327*----------------------------------------------------------------------------*/
328
a49db98d 329static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
330{
331
f090c9d4 332 return make_float32(
bb98fe42 333 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
334
335}
336
337/*----------------------------------------------------------------------------
338| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
339| and significand `zSig', and returns the proper single-precision floating-
340| point value corresponding to the abstract input. Ordinarily, the abstract
341| value is simply rounded and packed into the single-precision format, with
342| the inexact exception raised if the abstract input cannot be represented
343| exactly. However, if the abstract value is too large, the overflow and
344| inexact exceptions are raised and an infinity or maximal finite value is
345| returned. If the abstract value is too small, the input value is rounded to
346| a subnormal number, and the underflow and inexact exceptions are raised if
347| the abstract input cannot be represented exactly as a subnormal single-
348| precision floating-point number.
349| The input significand `zSig' has its binary point between bits 30
350| and 29, which is 7 bits to the left of the usual location. This shifted
351| significand must be normalized or smaller. If `zSig' is not normalized,
352| `zExp' must be 0; in that case, the result returned is a subnormal number,
353| and it must not require rounding. In the usual case that `zSig' is
354| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
355| The handling of underflow and overflow follows the IEC/IEEE Standard for
356| Binary Floating-Point Arithmetic.
357*----------------------------------------------------------------------------*/
358
94a49d86 359static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
360{
361 int8 roundingMode;
362 flag roundNearestEven;
363 int8 roundIncrement, roundBits;
364 flag isTiny;
365
366 roundingMode = STATUS(float_rounding_mode);
367 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
368 switch (roundingMode) {
369 case float_round_nearest_even:
f9288a76 370 case float_round_ties_away:
dc355b76
PM
371 roundIncrement = 0x40;
372 break;
373 case float_round_to_zero:
374 roundIncrement = 0;
375 break;
376 case float_round_up:
377 roundIncrement = zSign ? 0 : 0x7f;
378 break;
379 case float_round_down:
380 roundIncrement = zSign ? 0x7f : 0;
381 break;
382 default:
383 abort();
384 break;
158142c2
FB
385 }
386 roundBits = zSig & 0x7F;
bb98fe42 387 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
388 if ( ( 0xFD < zExp )
389 || ( ( zExp == 0xFD )
bb98fe42 390 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
391 ) {
392 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 393 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
394 }
395 if ( zExp < 0 ) {
e6afc87f
PM
396 if (STATUS(flush_to_zero)) {
397 float_raise(float_flag_output_denormal STATUS_VAR);
398 return packFloat32(zSign, 0, 0);
399 }
158142c2
FB
400 isTiny =
401 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
402 || ( zExp < -1 )
403 || ( zSig + roundIncrement < 0x80000000 );
404 shift32RightJamming( zSig, - zExp, &zSig );
405 zExp = 0;
406 roundBits = zSig & 0x7F;
407 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
408 }
409 }
410 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
411 zSig = ( zSig + roundIncrement )>>7;
412 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
413 if ( zSig == 0 ) zExp = 0;
414 return packFloat32( zSign, zExp, zSig );
415
416}
417
418/*----------------------------------------------------------------------------
419| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
420| and significand `zSig', and returns the proper single-precision floating-
421| point value corresponding to the abstract input. This routine is just like
422| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
423| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
424| floating-point exponent.
425*----------------------------------------------------------------------------*/
426
427static float32
94a49d86 428 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
429{
430 int8 shiftCount;
431
432 shiftCount = countLeadingZeros32( zSig ) - 1;
433 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
434
435}
436
437/*----------------------------------------------------------------------------
438| Returns the fraction bits of the double-precision floating-point value `a'.
439*----------------------------------------------------------------------------*/
440
a49db98d 441static inline uint64_t extractFloat64Frac( float64 a )
158142c2
FB
442{
443
f090c9d4 444 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
445
446}
447
448/*----------------------------------------------------------------------------
449| Returns the exponent bits of the double-precision floating-point value `a'.
450*----------------------------------------------------------------------------*/
451
a49db98d 452static inline int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
453{
454
f090c9d4 455 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
456
457}
458
459/*----------------------------------------------------------------------------
460| Returns the sign bit of the double-precision floating-point value `a'.
461*----------------------------------------------------------------------------*/
462
a49db98d 463static inline flag extractFloat64Sign( float64 a )
158142c2
FB
464{
465
f090c9d4 466 return float64_val(a)>>63;
158142c2
FB
467
468}
469
37d18660
PM
470/*----------------------------------------------------------------------------
471| If `a' is denormal and we are in flush-to-zero mode then set the
472| input-denormal exception and return zero. Otherwise just return the value.
473*----------------------------------------------------------------------------*/
7baeabce 474float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
37d18660
PM
475{
476 if (STATUS(flush_inputs_to_zero)) {
477 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
478 float_raise(float_flag_input_denormal STATUS_VAR);
479 return make_float64(float64_val(a) & (1ULL << 63));
480 }
481 }
482 return a;
483}
484
158142c2
FB
485/*----------------------------------------------------------------------------
486| Normalizes the subnormal double-precision floating-point value represented
487| by the denormalized significand `aSig'. The normalized exponent and
488| significand are stored at the locations pointed to by `zExpPtr' and
489| `zSigPtr', respectively.
490*----------------------------------------------------------------------------*/
491
492static void
94a49d86 493 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
158142c2
FB
494{
495 int8 shiftCount;
496
497 shiftCount = countLeadingZeros64( aSig ) - 11;
498 *zSigPtr = aSig<<shiftCount;
499 *zExpPtr = 1 - shiftCount;
500
501}
502
503/*----------------------------------------------------------------------------
504| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
505| double-precision floating-point value, returning the result. After being
506| shifted into the proper positions, the three fields are simply added
507| together to form the result. This means that any integer portion of `zSig'
508| will be added into the exponent. Since a properly normalized significand
509| will have an integer portion equal to 1, the `zExp' input should be 1 less
510| than the desired result exponent whenever `zSig' is a complete, normalized
511| significand.
512*----------------------------------------------------------------------------*/
513
a49db98d 514static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
515{
516
f090c9d4 517 return make_float64(
bb98fe42 518 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
519
520}
521
522/*----------------------------------------------------------------------------
523| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
524| and significand `zSig', and returns the proper double-precision floating-
525| point value corresponding to the abstract input. Ordinarily, the abstract
526| value is simply rounded and packed into the double-precision format, with
527| the inexact exception raised if the abstract input cannot be represented
528| exactly. However, if the abstract value is too large, the overflow and
529| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
530| returned. If the abstract value is too small, the input value is rounded to
531| a subnormal number, and the underflow and inexact exceptions are raised if
532| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
533| precision floating-point number.
534| The input significand `zSig' has its binary point between bits 62
535| and 61, which is 10 bits to the left of the usual location. This shifted
536| significand must be normalized or smaller. If `zSig' is not normalized,
537| `zExp' must be 0; in that case, the result returned is a subnormal number,
538| and it must not require rounding. In the usual case that `zSig' is
539| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
540| The handling of underflow and overflow follows the IEC/IEEE Standard for
541| Binary Floating-Point Arithmetic.
542*----------------------------------------------------------------------------*/
543
94a49d86 544static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
545{
546 int8 roundingMode;
547 flag roundNearestEven;
94a49d86 548 int_fast16_t roundIncrement, roundBits;
158142c2
FB
549 flag isTiny;
550
551 roundingMode = STATUS(float_rounding_mode);
552 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
553 switch (roundingMode) {
554 case float_round_nearest_even:
f9288a76 555 case float_round_ties_away:
dc355b76
PM
556 roundIncrement = 0x200;
557 break;
558 case float_round_to_zero:
559 roundIncrement = 0;
560 break;
561 case float_round_up:
562 roundIncrement = zSign ? 0 : 0x3ff;
563 break;
564 case float_round_down:
565 roundIncrement = zSign ? 0x3ff : 0;
566 break;
567 default:
568 abort();
158142c2
FB
569 }
570 roundBits = zSig & 0x3FF;
bb98fe42 571 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
572 if ( ( 0x7FD < zExp )
573 || ( ( zExp == 0x7FD )
bb98fe42 574 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
575 ) {
576 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 577 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
578 }
579 if ( zExp < 0 ) {
e6afc87f
PM
580 if (STATUS(flush_to_zero)) {
581 float_raise(float_flag_output_denormal STATUS_VAR);
582 return packFloat64(zSign, 0, 0);
583 }
158142c2
FB
584 isTiny =
585 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
586 || ( zExp < -1 )
587 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
588 shift64RightJamming( zSig, - zExp, &zSig );
589 zExp = 0;
590 roundBits = zSig & 0x3FF;
591 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
592 }
593 }
594 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
595 zSig = ( zSig + roundIncrement )>>10;
596 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
597 if ( zSig == 0 ) zExp = 0;
598 return packFloat64( zSign, zExp, zSig );
599
600}
601
602/*----------------------------------------------------------------------------
603| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
604| and significand `zSig', and returns the proper double-precision floating-
605| point value corresponding to the abstract input. This routine is just like
606| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
607| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
608| floating-point exponent.
609*----------------------------------------------------------------------------*/
610
611static float64
94a49d86 612 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
613{
614 int8 shiftCount;
615
616 shiftCount = countLeadingZeros64( zSig ) - 1;
617 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
618
619}
620
158142c2
FB
621/*----------------------------------------------------------------------------
622| Returns the fraction bits of the extended double-precision floating-point
623| value `a'.
624*----------------------------------------------------------------------------*/
625
a49db98d 626static inline uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
627{
628
629 return a.low;
630
631}
632
633/*----------------------------------------------------------------------------
634| Returns the exponent bits of the extended double-precision floating-point
635| value `a'.
636*----------------------------------------------------------------------------*/
637
a49db98d 638static inline int32 extractFloatx80Exp( floatx80 a )
158142c2
FB
639{
640
641 return a.high & 0x7FFF;
642
643}
644
645/*----------------------------------------------------------------------------
646| Returns the sign bit of the extended double-precision floating-point value
647| `a'.
648*----------------------------------------------------------------------------*/
649
a49db98d 650static inline flag extractFloatx80Sign( floatx80 a )
158142c2
FB
651{
652
653 return a.high>>15;
654
655}
656
657/*----------------------------------------------------------------------------
658| Normalizes the subnormal extended double-precision floating-point value
659| represented by the denormalized significand `aSig'. The normalized exponent
660| and significand are stored at the locations pointed to by `zExpPtr' and
661| `zSigPtr', respectively.
662*----------------------------------------------------------------------------*/
663
664static void
bb98fe42 665 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
666{
667 int8 shiftCount;
668
669 shiftCount = countLeadingZeros64( aSig );
670 *zSigPtr = aSig<<shiftCount;
671 *zExpPtr = 1 - shiftCount;
672
673}
674
675/*----------------------------------------------------------------------------
676| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
677| extended double-precision floating-point value, returning the result.
678*----------------------------------------------------------------------------*/
679
a49db98d 680static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
681{
682 floatx80 z;
683
684 z.low = zSig;
bb98fe42 685 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
686 return z;
687
688}
689
690/*----------------------------------------------------------------------------
691| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
692| and extended significand formed by the concatenation of `zSig0' and `zSig1',
693| and returns the proper extended double-precision floating-point value
694| corresponding to the abstract input. Ordinarily, the abstract value is
695| rounded and packed into the extended double-precision format, with the
696| inexact exception raised if the abstract input cannot be represented
697| exactly. However, if the abstract value is too large, the overflow and
698| inexact exceptions are raised and an infinity or maximal finite value is
699| returned. If the abstract value is too small, the input value is rounded to
700| a subnormal number, and the underflow and inexact exceptions are raised if
701| the abstract input cannot be represented exactly as a subnormal extended
702| double-precision floating-point number.
703| If `roundingPrecision' is 32 or 64, the result is rounded to the same
704| number of bits as single or double precision, respectively. Otherwise, the
705| result is rounded to the full precision of the extended double-precision
706| format.
707| The input significand must be normalized or smaller. If the input
708| significand is not normalized, `zExp' must be 0; in that case, the result
709| returned is a subnormal number, and it must not require rounding. The
710| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
711| Floating-Point Arithmetic.
712*----------------------------------------------------------------------------*/
713
714static floatx80
715 roundAndPackFloatx80(
bb98fe42 716 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
717 STATUS_PARAM)
718{
719 int8 roundingMode;
720 flag roundNearestEven, increment, isTiny;
721 int64 roundIncrement, roundMask, roundBits;
722
723 roundingMode = STATUS(float_rounding_mode);
724 roundNearestEven = ( roundingMode == float_round_nearest_even );
725 if ( roundingPrecision == 80 ) goto precision80;
726 if ( roundingPrecision == 64 ) {
727 roundIncrement = LIT64( 0x0000000000000400 );
728 roundMask = LIT64( 0x00000000000007FF );
729 }
730 else if ( roundingPrecision == 32 ) {
731 roundIncrement = LIT64( 0x0000008000000000 );
732 roundMask = LIT64( 0x000000FFFFFFFFFF );
733 }
734 else {
735 goto precision80;
736 }
737 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
738 switch (roundingMode) {
739 case float_round_nearest_even:
f9288a76 740 case float_round_ties_away:
dc355b76
PM
741 break;
742 case float_round_to_zero:
743 roundIncrement = 0;
744 break;
745 case float_round_up:
746 roundIncrement = zSign ? 0 : roundMask;
747 break;
748 case float_round_down:
749 roundIncrement = zSign ? roundMask : 0;
750 break;
751 default:
752 abort();
158142c2
FB
753 }
754 roundBits = zSig0 & roundMask;
bb98fe42 755 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
756 if ( ( 0x7FFE < zExp )
757 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
758 ) {
759 goto overflow;
760 }
761 if ( zExp <= 0 ) {
e6afc87f
PM
762 if (STATUS(flush_to_zero)) {
763 float_raise(float_flag_output_denormal STATUS_VAR);
764 return packFloatx80(zSign, 0, 0);
765 }
158142c2
FB
766 isTiny =
767 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
768 || ( zExp < 0 )
769 || ( zSig0 <= zSig0 + roundIncrement );
770 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
771 zExp = 0;
772 roundBits = zSig0 & roundMask;
773 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
774 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
775 zSig0 += roundIncrement;
bb98fe42 776 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
777 roundIncrement = roundMask + 1;
778 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
779 roundMask |= roundIncrement;
780 }
781 zSig0 &= ~ roundMask;
782 return packFloatx80( zSign, zExp, zSig0 );
783 }
784 }
785 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
786 zSig0 += roundIncrement;
787 if ( zSig0 < roundIncrement ) {
788 ++zExp;
789 zSig0 = LIT64( 0x8000000000000000 );
790 }
791 roundIncrement = roundMask + 1;
792 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
793 roundMask |= roundIncrement;
794 }
795 zSig0 &= ~ roundMask;
796 if ( zSig0 == 0 ) zExp = 0;
797 return packFloatx80( zSign, zExp, zSig0 );
798 precision80:
dc355b76
PM
799 switch (roundingMode) {
800 case float_round_nearest_even:
f9288a76 801 case float_round_ties_away:
dc355b76
PM
802 increment = ((int64_t)zSig1 < 0);
803 break;
804 case float_round_to_zero:
805 increment = 0;
806 break;
807 case float_round_up:
808 increment = !zSign && zSig1;
809 break;
810 case float_round_down:
811 increment = zSign && zSig1;
812 break;
813 default:
814 abort();
158142c2 815 }
bb98fe42 816 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
817 if ( ( 0x7FFE < zExp )
818 || ( ( zExp == 0x7FFE )
819 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
820 && increment
821 )
822 ) {
823 roundMask = 0;
824 overflow:
825 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
826 if ( ( roundingMode == float_round_to_zero )
827 || ( zSign && ( roundingMode == float_round_up ) )
828 || ( ! zSign && ( roundingMode == float_round_down ) )
829 ) {
830 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
831 }
832 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
833 }
834 if ( zExp <= 0 ) {
835 isTiny =
836 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
837 || ( zExp < 0 )
838 || ! increment
839 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
840 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
841 zExp = 0;
842 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
843 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
dc355b76
PM
844 switch (roundingMode) {
845 case float_round_nearest_even:
f9288a76 846 case float_round_ties_away:
dc355b76
PM
847 increment = ((int64_t)zSig1 < 0);
848 break;
849 case float_round_to_zero:
850 increment = 0;
851 break;
852 case float_round_up:
853 increment = !zSign && zSig1;
854 break;
855 case float_round_down:
856 increment = zSign && zSig1;
857 break;
858 default:
859 abort();
158142c2
FB
860 }
861 if ( increment ) {
862 ++zSig0;
863 zSig0 &=
bb98fe42
AF
864 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
865 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
866 }
867 return packFloatx80( zSign, zExp, zSig0 );
868 }
869 }
870 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
871 if ( increment ) {
872 ++zSig0;
873 if ( zSig0 == 0 ) {
874 ++zExp;
875 zSig0 = LIT64( 0x8000000000000000 );
876 }
877 else {
bb98fe42 878 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
879 }
880 }
881 else {
882 if ( zSig0 == 0 ) zExp = 0;
883 }
884 return packFloatx80( zSign, zExp, zSig0 );
885
886}
887
888/*----------------------------------------------------------------------------
889| Takes an abstract floating-point value having sign `zSign', exponent
890| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
891| and returns the proper extended double-precision floating-point value
892| corresponding to the abstract input. This routine is just like
893| `roundAndPackFloatx80' except that the input significand does not have to be
894| normalized.
895*----------------------------------------------------------------------------*/
896
897static floatx80
898 normalizeRoundAndPackFloatx80(
bb98fe42 899 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
900 STATUS_PARAM)
901{
902 int8 shiftCount;
903
904 if ( zSig0 == 0 ) {
905 zSig0 = zSig1;
906 zSig1 = 0;
907 zExp -= 64;
908 }
909 shiftCount = countLeadingZeros64( zSig0 );
910 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
911 zExp -= shiftCount;
912 return
913 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
914
915}
916
158142c2
FB
917/*----------------------------------------------------------------------------
918| Returns the least-significant 64 fraction bits of the quadruple-precision
919| floating-point value `a'.
920*----------------------------------------------------------------------------*/
921
a49db98d 922static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
923{
924
925 return a.low;
926
927}
928
929/*----------------------------------------------------------------------------
930| Returns the most-significant 48 fraction bits of the quadruple-precision
931| floating-point value `a'.
932*----------------------------------------------------------------------------*/
933
a49db98d 934static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
935{
936
937 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
938
939}
940
941/*----------------------------------------------------------------------------
942| Returns the exponent bits of the quadruple-precision floating-point value
943| `a'.
944*----------------------------------------------------------------------------*/
945
a49db98d 946static inline int32 extractFloat128Exp( float128 a )
158142c2
FB
947{
948
949 return ( a.high>>48 ) & 0x7FFF;
950
951}
952
953/*----------------------------------------------------------------------------
954| Returns the sign bit of the quadruple-precision floating-point value `a'.
955*----------------------------------------------------------------------------*/
956
a49db98d 957static inline flag extractFloat128Sign( float128 a )
158142c2
FB
958{
959
960 return a.high>>63;
961
962}
963
964/*----------------------------------------------------------------------------
965| Normalizes the subnormal quadruple-precision floating-point value
966| represented by the denormalized significand formed by the concatenation of
967| `aSig0' and `aSig1'. The normalized exponent is stored at the location
968| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
969| significand are stored at the location pointed to by `zSig0Ptr', and the
970| least significant 64 bits of the normalized significand are stored at the
971| location pointed to by `zSig1Ptr'.
972*----------------------------------------------------------------------------*/
973
974static void
975 normalizeFloat128Subnormal(
bb98fe42
AF
976 uint64_t aSig0,
977 uint64_t aSig1,
158142c2 978 int32 *zExpPtr,
bb98fe42
AF
979 uint64_t *zSig0Ptr,
980 uint64_t *zSig1Ptr
158142c2
FB
981 )
982{
983 int8 shiftCount;
984
985 if ( aSig0 == 0 ) {
986 shiftCount = countLeadingZeros64( aSig1 ) - 15;
987 if ( shiftCount < 0 ) {
988 *zSig0Ptr = aSig1>>( - shiftCount );
989 *zSig1Ptr = aSig1<<( shiftCount & 63 );
990 }
991 else {
992 *zSig0Ptr = aSig1<<shiftCount;
993 *zSig1Ptr = 0;
994 }
995 *zExpPtr = - shiftCount - 63;
996 }
997 else {
998 shiftCount = countLeadingZeros64( aSig0 ) - 15;
999 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1000 *zExpPtr = 1 - shiftCount;
1001 }
1002
1003}
1004
1005/*----------------------------------------------------------------------------
1006| Packs the sign `zSign', the exponent `zExp', and the significand formed
1007| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1008| floating-point value, returning the result. After being shifted into the
1009| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1010| added together to form the most significant 64 bits of the result. This
1011| means that any integer portion of `zSig0' will be added into the exponent.
1012| Since a properly normalized significand will have an integer portion equal
1013| to 1, the `zExp' input should be 1 less than the desired result exponent
1014| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1015| significand.
1016*----------------------------------------------------------------------------*/
1017
a49db98d 1018static inline float128
bb98fe42 1019 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
1020{
1021 float128 z;
1022
1023 z.low = zSig1;
bb98fe42 1024 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
1025 return z;
1026
1027}
1028
1029/*----------------------------------------------------------------------------
1030| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1031| and extended significand formed by the concatenation of `zSig0', `zSig1',
1032| and `zSig2', and returns the proper quadruple-precision floating-point value
1033| corresponding to the abstract input. Ordinarily, the abstract value is
1034| simply rounded and packed into the quadruple-precision format, with the
1035| inexact exception raised if the abstract input cannot be represented
1036| exactly. However, if the abstract value is too large, the overflow and
1037| inexact exceptions are raised and an infinity or maximal finite value is
1038| returned. If the abstract value is too small, the input value is rounded to
1039| a subnormal number, and the underflow and inexact exceptions are raised if
1040| the abstract input cannot be represented exactly as a subnormal quadruple-
1041| precision floating-point number.
1042| The input significand must be normalized or smaller. If the input
1043| significand is not normalized, `zExp' must be 0; in that case, the result
1044| returned is a subnormal number, and it must not require rounding. In the
1045| usual case that the input significand is normalized, `zExp' must be 1 less
1046| than the ``true'' floating-point exponent. The handling of underflow and
1047| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1048*----------------------------------------------------------------------------*/
1049
1050static float128
1051 roundAndPackFloat128(
bb98fe42 1052 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
1053{
1054 int8 roundingMode;
1055 flag roundNearestEven, increment, isTiny;
1056
1057 roundingMode = STATUS(float_rounding_mode);
1058 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
1059 switch (roundingMode) {
1060 case float_round_nearest_even:
f9288a76 1061 case float_round_ties_away:
dc355b76
PM
1062 increment = ((int64_t)zSig2 < 0);
1063 break;
1064 case float_round_to_zero:
1065 increment = 0;
1066 break;
1067 case float_round_up:
1068 increment = !zSign && zSig2;
1069 break;
1070 case float_round_down:
1071 increment = zSign && zSig2;
1072 break;
1073 default:
1074 abort();
158142c2 1075 }
bb98fe42 1076 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
1077 if ( ( 0x7FFD < zExp )
1078 || ( ( zExp == 0x7FFD )
1079 && eq128(
1080 LIT64( 0x0001FFFFFFFFFFFF ),
1081 LIT64( 0xFFFFFFFFFFFFFFFF ),
1082 zSig0,
1083 zSig1
1084 )
1085 && increment
1086 )
1087 ) {
1088 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
1089 if ( ( roundingMode == float_round_to_zero )
1090 || ( zSign && ( roundingMode == float_round_up ) )
1091 || ( ! zSign && ( roundingMode == float_round_down ) )
1092 ) {
1093 return
1094 packFloat128(
1095 zSign,
1096 0x7FFE,
1097 LIT64( 0x0000FFFFFFFFFFFF ),
1098 LIT64( 0xFFFFFFFFFFFFFFFF )
1099 );
1100 }
1101 return packFloat128( zSign, 0x7FFF, 0, 0 );
1102 }
1103 if ( zExp < 0 ) {
e6afc87f
PM
1104 if (STATUS(flush_to_zero)) {
1105 float_raise(float_flag_output_denormal STATUS_VAR);
1106 return packFloat128(zSign, 0, 0, 0);
1107 }
158142c2
FB
1108 isTiny =
1109 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1110 || ( zExp < -1 )
1111 || ! increment
1112 || lt128(
1113 zSig0,
1114 zSig1,
1115 LIT64( 0x0001FFFFFFFFFFFF ),
1116 LIT64( 0xFFFFFFFFFFFFFFFF )
1117 );
1118 shift128ExtraRightJamming(
1119 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1120 zExp = 0;
1121 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
dc355b76
PM
1122 switch (roundingMode) {
1123 case float_round_nearest_even:
f9288a76 1124 case float_round_ties_away:
dc355b76
PM
1125 increment = ((int64_t)zSig2 < 0);
1126 break;
1127 case float_round_to_zero:
1128 increment = 0;
1129 break;
1130 case float_round_up:
1131 increment = !zSign && zSig2;
1132 break;
1133 case float_round_down:
1134 increment = zSign && zSig2;
1135 break;
1136 default:
1137 abort();
158142c2
FB
1138 }
1139 }
1140 }
1141 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1142 if ( increment ) {
1143 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1144 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1145 }
1146 else {
1147 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1148 }
1149 return packFloat128( zSign, zExp, zSig0, zSig1 );
1150
1151}
1152
1153/*----------------------------------------------------------------------------
1154| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1155| and significand formed by the concatenation of `zSig0' and `zSig1', and
1156| returns the proper quadruple-precision floating-point value corresponding
1157| to the abstract input. This routine is just like `roundAndPackFloat128'
1158| except that the input significand has fewer bits and does not have to be
1159| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1160| point exponent.
1161*----------------------------------------------------------------------------*/
1162
1163static float128
1164 normalizeRoundAndPackFloat128(
bb98fe42 1165 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1166{
1167 int8 shiftCount;
bb98fe42 1168 uint64_t zSig2;
158142c2
FB
1169
1170 if ( zSig0 == 0 ) {
1171 zSig0 = zSig1;
1172 zSig1 = 0;
1173 zExp -= 64;
1174 }
1175 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1176 if ( 0 <= shiftCount ) {
1177 zSig2 = 0;
1178 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1179 }
1180 else {
1181 shift128ExtraRightJamming(
1182 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1183 }
1184 zExp -= shiftCount;
1185 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1186
1187}
1188
158142c2
FB
1189/*----------------------------------------------------------------------------
1190| Returns the result of converting the 32-bit two's complement integer `a'
1191| to the single-precision floating-point format. The conversion is performed
1192| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1193*----------------------------------------------------------------------------*/
1194
c4850f9e 1195float32 int32_to_float32(int32_t a STATUS_PARAM)
158142c2
FB
1196{
1197 flag zSign;
1198
f090c9d4 1199 if ( a == 0 ) return float32_zero;
bb98fe42 1200 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1201 zSign = ( a < 0 );
1202 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1203
1204}
1205
1206/*----------------------------------------------------------------------------
1207| Returns the result of converting the 32-bit two's complement integer `a'
1208| to the double-precision floating-point format. The conversion is performed
1209| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1210*----------------------------------------------------------------------------*/
1211
c4850f9e 1212float64 int32_to_float64(int32_t a STATUS_PARAM)
158142c2
FB
1213{
1214 flag zSign;
1215 uint32 absA;
1216 int8 shiftCount;
bb98fe42 1217 uint64_t zSig;
158142c2 1218
f090c9d4 1219 if ( a == 0 ) return float64_zero;
158142c2
FB
1220 zSign = ( a < 0 );
1221 absA = zSign ? - a : a;
1222 shiftCount = countLeadingZeros32( absA ) + 21;
1223 zSig = absA;
1224 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1225
1226}
1227
158142c2
FB
1228/*----------------------------------------------------------------------------
1229| Returns the result of converting the 32-bit two's complement integer `a'
1230| to the extended double-precision floating-point format. The conversion
1231| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1232| Arithmetic.
1233*----------------------------------------------------------------------------*/
1234
c4850f9e 1235floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
158142c2
FB
1236{
1237 flag zSign;
1238 uint32 absA;
1239 int8 shiftCount;
bb98fe42 1240 uint64_t zSig;
158142c2
FB
1241
1242 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1243 zSign = ( a < 0 );
1244 absA = zSign ? - a : a;
1245 shiftCount = countLeadingZeros32( absA ) + 32;
1246 zSig = absA;
1247 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1248
1249}
1250
158142c2
FB
1251/*----------------------------------------------------------------------------
1252| Returns the result of converting the 32-bit two's complement integer `a' to
1253| the quadruple-precision floating-point format. The conversion is performed
1254| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1255*----------------------------------------------------------------------------*/
1256
c4850f9e 1257float128 int32_to_float128(int32_t a STATUS_PARAM)
158142c2
FB
1258{
1259 flag zSign;
1260 uint32 absA;
1261 int8 shiftCount;
bb98fe42 1262 uint64_t zSig0;
158142c2
FB
1263
1264 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1265 zSign = ( a < 0 );
1266 absA = zSign ? - a : a;
1267 shiftCount = countLeadingZeros32( absA ) + 17;
1268 zSig0 = absA;
1269 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1270
1271}
1272
158142c2
FB
1273/*----------------------------------------------------------------------------
1274| Returns the result of converting the 64-bit two's complement integer `a'
1275| to the single-precision floating-point format. The conversion is performed
1276| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1277*----------------------------------------------------------------------------*/
1278
c4850f9e 1279float32 int64_to_float32(int64_t a STATUS_PARAM)
158142c2
FB
1280{
1281 flag zSign;
1282 uint64 absA;
1283 int8 shiftCount;
1284
f090c9d4 1285 if ( a == 0 ) return float32_zero;
158142c2
FB
1286 zSign = ( a < 0 );
1287 absA = zSign ? - a : a;
1288 shiftCount = countLeadingZeros64( absA ) - 40;
1289 if ( 0 <= shiftCount ) {
1290 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1291 }
1292 else {
1293 shiftCount += 7;
1294 if ( shiftCount < 0 ) {
1295 shift64RightJamming( absA, - shiftCount, &absA );
1296 }
1297 else {
1298 absA <<= shiftCount;
1299 }
1300 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1301 }
1302
1303}
1304
1305/*----------------------------------------------------------------------------
1306| Returns the result of converting the 64-bit two's complement integer `a'
1307| to the double-precision floating-point format. The conversion is performed
1308| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1309*----------------------------------------------------------------------------*/
1310
c4850f9e 1311float64 int64_to_float64(int64_t a STATUS_PARAM)
158142c2
FB
1312{
1313 flag zSign;
1314
f090c9d4 1315 if ( a == 0 ) return float64_zero;
bb98fe42 1316 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1317 return packFloat64( 1, 0x43E, 0 );
1318 }
1319 zSign = ( a < 0 );
1320 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1321
1322}
1323
158142c2
FB
1324/*----------------------------------------------------------------------------
1325| Returns the result of converting the 64-bit two's complement integer `a'
1326| to the extended double-precision floating-point format. The conversion
1327| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1328| Arithmetic.
1329*----------------------------------------------------------------------------*/
1330
c4850f9e 1331floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
158142c2
FB
1332{
1333 flag zSign;
1334 uint64 absA;
1335 int8 shiftCount;
1336
1337 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1338 zSign = ( a < 0 );
1339 absA = zSign ? - a : a;
1340 shiftCount = countLeadingZeros64( absA );
1341 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1342
1343}
1344
158142c2
FB
1345/*----------------------------------------------------------------------------
1346| Returns the result of converting the 64-bit two's complement integer `a' to
1347| the quadruple-precision floating-point format. The conversion is performed
1348| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1349*----------------------------------------------------------------------------*/
1350
c4850f9e 1351float128 int64_to_float128(int64_t a STATUS_PARAM)
158142c2
FB
1352{
1353 flag zSign;
1354 uint64 absA;
1355 int8 shiftCount;
1356 int32 zExp;
bb98fe42 1357 uint64_t zSig0, zSig1;
158142c2
FB
1358
1359 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1360 zSign = ( a < 0 );
1361 absA = zSign ? - a : a;
1362 shiftCount = countLeadingZeros64( absA ) + 49;
1363 zExp = 0x406E - shiftCount;
1364 if ( 64 <= shiftCount ) {
1365 zSig1 = 0;
1366 zSig0 = absA;
1367 shiftCount -= 64;
1368 }
1369 else {
1370 zSig1 = absA;
1371 zSig0 = 0;
1372 }
1373 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1374 return packFloat128( zSign, zExp, zSig0, zSig1 );
1375
1376}
1377
6bb8e0f1
PM
1378/*----------------------------------------------------------------------------
1379| Returns the result of converting the 64-bit unsigned integer `a'
1380| to the single-precision floating-point format. The conversion is performed
1381| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1382*----------------------------------------------------------------------------*/
1383
1384float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1385{
1386 int shiftcount;
1387
1388 if (a == 0) {
1389 return float32_zero;
1390 }
1391
1392 /* Determine (left) shift needed to put first set bit into bit posn 23
1393 * (since packFloat32() expects the binary point between bits 23 and 22);
1394 * this is the fast case for smallish numbers.
1395 */
1396 shiftcount = countLeadingZeros64(a) - 40;
1397 if (shiftcount >= 0) {
1398 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1399 }
1400 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1401 * expects the binary point between bits 30 and 29, hence the + 7.
1402 */
1403 shiftcount += 7;
1404 if (shiftcount < 0) {
1405 shift64RightJamming(a, -shiftcount, &a);
1406 } else {
1407 a <<= shiftcount;
1408 }
1409
1410 return roundAndPackFloat32(0, 0x9c - shiftcount, a STATUS_VAR);
1411}
1412
1413/*----------------------------------------------------------------------------
1414| Returns the result of converting the 64-bit unsigned integer `a'
1415| to the double-precision floating-point format. The conversion is performed
1416| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1417*----------------------------------------------------------------------------*/
1418
1419float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1420{
1421 int exp = 0x43C;
1422 int shiftcount;
1423
1424 if (a == 0) {
1425 return float64_zero;
1426 }
1427
1428 shiftcount = countLeadingZeros64(a) - 1;
1429 if (shiftcount < 0) {
1430 shift64RightJamming(a, -shiftcount, &a);
1431 } else {
1432 a <<= shiftcount;
1433 }
1434 return roundAndPackFloat64(0, exp - shiftcount, a STATUS_VAR);
1435}
1436
1437/*----------------------------------------------------------------------------
1438| Returns the result of converting the 64-bit unsigned integer `a'
1439| to the quadruple-precision floating-point format. The conversion is performed
1440| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1441*----------------------------------------------------------------------------*/
1442
c4850f9e 1443float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1e397ead
RH
1444{
1445 if (a == 0) {
1446 return float128_zero;
1447 }
1448 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1449}
1450
158142c2
FB
1451/*----------------------------------------------------------------------------
1452| Returns the result of converting the single-precision floating-point value
1453| `a' to the 32-bit two's complement integer format. The conversion is
1454| performed according to the IEC/IEEE Standard for Binary Floating-Point
1455| Arithmetic---which means in particular that the conversion is rounded
1456| according to the current rounding mode. If `a' is a NaN, the largest
1457| positive integer is returned. Otherwise, if the conversion overflows, the
1458| largest integer with the same sign as `a' is returned.
1459*----------------------------------------------------------------------------*/
1460
1461int32 float32_to_int32( float32 a STATUS_PARAM )
1462{
1463 flag aSign;
94a49d86 1464 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1465 uint32_t aSig;
1466 uint64_t aSig64;
158142c2 1467
37d18660 1468 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1469 aSig = extractFloat32Frac( a );
1470 aExp = extractFloat32Exp( a );
1471 aSign = extractFloat32Sign( a );
1472 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1473 if ( aExp ) aSig |= 0x00800000;
1474 shiftCount = 0xAF - aExp;
1475 aSig64 = aSig;
1476 aSig64 <<= 32;
1477 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1478 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1479
1480}
1481
1482/*----------------------------------------------------------------------------
1483| Returns the result of converting the single-precision floating-point value
1484| `a' to the 32-bit two's complement integer format. The conversion is
1485| performed according to the IEC/IEEE Standard for Binary Floating-Point
1486| Arithmetic, except that the conversion is always rounded toward zero.
1487| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1488| the conversion overflows, the largest integer with the same sign as `a' is
1489| returned.
1490*----------------------------------------------------------------------------*/
1491
1492int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1493{
1494 flag aSign;
94a49d86 1495 int_fast16_t aExp, shiftCount;
bb98fe42 1496 uint32_t aSig;
b3a6a2e0 1497 int32_t z;
37d18660 1498 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1499
1500 aSig = extractFloat32Frac( a );
1501 aExp = extractFloat32Exp( a );
1502 aSign = extractFloat32Sign( a );
1503 shiftCount = aExp - 0x9E;
1504 if ( 0 <= shiftCount ) {
f090c9d4 1505 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1506 float_raise( float_flag_invalid STATUS_VAR);
1507 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1508 }
bb98fe42 1509 return (int32_t) 0x80000000;
158142c2
FB
1510 }
1511 else if ( aExp <= 0x7E ) {
1512 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1513 return 0;
1514 }
1515 aSig = ( aSig | 0x00800000 )<<8;
1516 z = aSig>>( - shiftCount );
bb98fe42 1517 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1518 STATUS(float_exception_flags) |= float_flag_inexact;
1519 }
1520 if ( aSign ) z = - z;
1521 return z;
1522
1523}
1524
cbcef455
PM
1525/*----------------------------------------------------------------------------
1526| Returns the result of converting the single-precision floating-point value
1527| `a' to the 16-bit two's complement integer format. The conversion is
1528| performed according to the IEC/IEEE Standard for Binary Floating-Point
1529| Arithmetic, except that the conversion is always rounded toward zero.
1530| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1531| the conversion overflows, the largest integer with the same sign as `a' is
1532| returned.
1533*----------------------------------------------------------------------------*/
1534
94a49d86 1535int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
1536{
1537 flag aSign;
94a49d86 1538 int_fast16_t aExp, shiftCount;
bb98fe42 1539 uint32_t aSig;
cbcef455
PM
1540 int32 z;
1541
1542 aSig = extractFloat32Frac( a );
1543 aExp = extractFloat32Exp( a );
1544 aSign = extractFloat32Sign( a );
1545 shiftCount = aExp - 0x8E;
1546 if ( 0 <= shiftCount ) {
1547 if ( float32_val(a) != 0xC7000000 ) {
1548 float_raise( float_flag_invalid STATUS_VAR);
1549 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1550 return 0x7FFF;
1551 }
1552 }
bb98fe42 1553 return (int32_t) 0xffff8000;
cbcef455
PM
1554 }
1555 else if ( aExp <= 0x7E ) {
1556 if ( aExp | aSig ) {
1557 STATUS(float_exception_flags) |= float_flag_inexact;
1558 }
1559 return 0;
1560 }
1561 shiftCount -= 0x10;
1562 aSig = ( aSig | 0x00800000 )<<8;
1563 z = aSig>>( - shiftCount );
bb98fe42 1564 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1565 STATUS(float_exception_flags) |= float_flag_inexact;
1566 }
1567 if ( aSign ) {
1568 z = - z;
1569 }
1570 return z;
1571
1572}
1573
158142c2
FB
1574/*----------------------------------------------------------------------------
1575| Returns the result of converting the single-precision floating-point value
1576| `a' to the 64-bit two's complement integer format. The conversion is
1577| performed according to the IEC/IEEE Standard for Binary Floating-Point
1578| Arithmetic---which means in particular that the conversion is rounded
1579| according to the current rounding mode. If `a' is a NaN, the largest
1580| positive integer is returned. Otherwise, if the conversion overflows, the
1581| largest integer with the same sign as `a' is returned.
1582*----------------------------------------------------------------------------*/
1583
1584int64 float32_to_int64( float32 a STATUS_PARAM )
1585{
1586 flag aSign;
94a49d86 1587 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1588 uint32_t aSig;
1589 uint64_t aSig64, aSigExtra;
37d18660 1590 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1591
1592 aSig = extractFloat32Frac( a );
1593 aExp = extractFloat32Exp( a );
1594 aSign = extractFloat32Sign( a );
1595 shiftCount = 0xBE - aExp;
1596 if ( shiftCount < 0 ) {
1597 float_raise( float_flag_invalid STATUS_VAR);
1598 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1599 return LIT64( 0x7FFFFFFFFFFFFFFF );
1600 }
bb98fe42 1601 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1602 }
1603 if ( aExp ) aSig |= 0x00800000;
1604 aSig64 = aSig;
1605 aSig64 <<= 40;
1606 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1607 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1608
1609}
1610
2f18bbf9
TM
1611/*----------------------------------------------------------------------------
1612| Returns the result of converting the single-precision floating-point value
1613| `a' to the 64-bit unsigned integer format. The conversion is
1614| performed according to the IEC/IEEE Standard for Binary Floating-Point
1615| Arithmetic---which means in particular that the conversion is rounded
1616| according to the current rounding mode. If `a' is a NaN, the largest
1617| unsigned integer is returned. Otherwise, if the conversion overflows, the
1618| largest unsigned integer is returned. If the 'a' is negative, the result
1619| is rounded and zero is returned; values that do not round to zero will
1620| raise the inexact exception flag.
1621*----------------------------------------------------------------------------*/
1622
1623uint64 float32_to_uint64(float32 a STATUS_PARAM)
1624{
1625 flag aSign;
1626 int_fast16_t aExp, shiftCount;
1627 uint32_t aSig;
1628 uint64_t aSig64, aSigExtra;
1629 a = float32_squash_input_denormal(a STATUS_VAR);
1630
1631 aSig = extractFloat32Frac(a);
1632 aExp = extractFloat32Exp(a);
1633 aSign = extractFloat32Sign(a);
1634 if ((aSign) && (aExp > 126)) {
1635 float_raise(float_flag_invalid STATUS_VAR);
1636 if (float32_is_any_nan(a)) {
1637 return LIT64(0xFFFFFFFFFFFFFFFF);
1638 } else {
1639 return 0;
1640 }
1641 }
1642 shiftCount = 0xBE - aExp;
1643 if (aExp) {
1644 aSig |= 0x00800000;
1645 }
1646 if (shiftCount < 0) {
1647 float_raise(float_flag_invalid STATUS_VAR);
1648 return LIT64(0xFFFFFFFFFFFFFFFF);
1649 }
1650
1651 aSig64 = aSig;
1652 aSig64 <<= 40;
1653 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1654 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1655}
1656
a13d4489
TM
1657/*----------------------------------------------------------------------------
1658| Returns the result of converting the single-precision floating-point value
1659| `a' to the 64-bit unsigned integer format. The conversion is
1660| performed according to the IEC/IEEE Standard for Binary Floating-Point
1661| Arithmetic, except that the conversion is always rounded toward zero. If
1662| `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1663| conversion overflows, the largest unsigned integer is returned. If the
1664| 'a' is negative, the result is rounded and zero is returned; values that do
1665| not round to zero will raise the inexact flag.
1666*----------------------------------------------------------------------------*/
1667
1668uint64 float32_to_uint64_round_to_zero(float32 a STATUS_PARAM)
1669{
1670 signed char current_rounding_mode = STATUS(float_rounding_mode);
1671 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
1672 int64_t v = float32_to_uint64(a STATUS_VAR);
1673 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
1674 return v;
1675}
1676
158142c2
FB
1677/*----------------------------------------------------------------------------
1678| Returns the result of converting the single-precision floating-point value
1679| `a' to the 64-bit two's complement integer format. The conversion is
1680| performed according to the IEC/IEEE Standard for Binary Floating-Point
1681| Arithmetic, except that the conversion is always rounded toward zero. If
1682| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1683| conversion overflows, the largest integer with the same sign as `a' is
1684| returned.
1685*----------------------------------------------------------------------------*/
1686
1687int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1688{
1689 flag aSign;
94a49d86 1690 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1691 uint32_t aSig;
1692 uint64_t aSig64;
158142c2 1693 int64 z;
37d18660 1694 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1695
1696 aSig = extractFloat32Frac( a );
1697 aExp = extractFloat32Exp( a );
1698 aSign = extractFloat32Sign( a );
1699 shiftCount = aExp - 0xBE;
1700 if ( 0 <= shiftCount ) {
f090c9d4 1701 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1702 float_raise( float_flag_invalid STATUS_VAR);
1703 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1704 return LIT64( 0x7FFFFFFFFFFFFFFF );
1705 }
1706 }
bb98fe42 1707 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1708 }
1709 else if ( aExp <= 0x7E ) {
1710 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1711 return 0;
1712 }
1713 aSig64 = aSig | 0x00800000;
1714 aSig64 <<= 40;
1715 z = aSig64>>( - shiftCount );
bb98fe42 1716 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1717 STATUS(float_exception_flags) |= float_flag_inexact;
1718 }
1719 if ( aSign ) z = - z;
1720 return z;
1721
1722}
1723
1724/*----------------------------------------------------------------------------
1725| Returns the result of converting the single-precision floating-point value
1726| `a' to the double-precision floating-point format. The conversion is
1727| performed according to the IEC/IEEE Standard for Binary Floating-Point
1728| Arithmetic.
1729*----------------------------------------------------------------------------*/
1730
1731float64 float32_to_float64( float32 a STATUS_PARAM )
1732{
1733 flag aSign;
94a49d86 1734 int_fast16_t aExp;
bb98fe42 1735 uint32_t aSig;
37d18660 1736 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1737
1738 aSig = extractFloat32Frac( a );
1739 aExp = extractFloat32Exp( a );
1740 aSign = extractFloat32Sign( a );
1741 if ( aExp == 0xFF ) {
bcd4d9af 1742 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1743 return packFloat64( aSign, 0x7FF, 0 );
1744 }
1745 if ( aExp == 0 ) {
1746 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1747 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1748 --aExp;
1749 }
bb98fe42 1750 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1751
1752}
1753
158142c2
FB
1754/*----------------------------------------------------------------------------
1755| Returns the result of converting the single-precision floating-point value
1756| `a' to the extended double-precision floating-point format. The conversion
1757| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1758| Arithmetic.
1759*----------------------------------------------------------------------------*/
1760
1761floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1762{
1763 flag aSign;
94a49d86 1764 int_fast16_t aExp;
bb98fe42 1765 uint32_t aSig;
158142c2 1766
37d18660 1767 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1768 aSig = extractFloat32Frac( a );
1769 aExp = extractFloat32Exp( a );
1770 aSign = extractFloat32Sign( a );
1771 if ( aExp == 0xFF ) {
bcd4d9af 1772 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1773 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1774 }
1775 if ( aExp == 0 ) {
1776 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1777 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1778 }
1779 aSig |= 0x00800000;
bb98fe42 1780 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1781
1782}
1783
158142c2
FB
1784/*----------------------------------------------------------------------------
1785| Returns the result of converting the single-precision floating-point value
1786| `a' to the double-precision floating-point format. The conversion is
1787| performed according to the IEC/IEEE Standard for Binary Floating-Point
1788| Arithmetic.
1789*----------------------------------------------------------------------------*/
1790
1791float128 float32_to_float128( float32 a STATUS_PARAM )
1792{
1793 flag aSign;
94a49d86 1794 int_fast16_t aExp;
bb98fe42 1795 uint32_t aSig;
158142c2 1796
37d18660 1797 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1798 aSig = extractFloat32Frac( a );
1799 aExp = extractFloat32Exp( a );
1800 aSign = extractFloat32Sign( a );
1801 if ( aExp == 0xFF ) {
bcd4d9af 1802 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1803 return packFloat128( aSign, 0x7FFF, 0, 0 );
1804 }
1805 if ( aExp == 0 ) {
1806 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1807 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1808 --aExp;
1809 }
bb98fe42 1810 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1811
1812}
1813
158142c2
FB
1814/*----------------------------------------------------------------------------
1815| Rounds the single-precision floating-point value `a' to an integer, and
1816| returns the result as a single-precision floating-point value. The
1817| operation is performed according to the IEC/IEEE Standard for Binary
1818| Floating-Point Arithmetic.
1819*----------------------------------------------------------------------------*/
1820
1821float32 float32_round_to_int( float32 a STATUS_PARAM)
1822{
1823 flag aSign;
94a49d86 1824 int_fast16_t aExp;
bb98fe42 1825 uint32_t lastBitMask, roundBitsMask;
bb98fe42 1826 uint32_t z;
37d18660 1827 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1828
1829 aExp = extractFloat32Exp( a );
1830 if ( 0x96 <= aExp ) {
1831 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1832 return propagateFloat32NaN( a, a STATUS_VAR );
1833 }
1834 return a;
1835 }
1836 if ( aExp <= 0x7E ) {
bb98fe42 1837 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1838 STATUS(float_exception_flags) |= float_flag_inexact;
1839 aSign = extractFloat32Sign( a );
1840 switch ( STATUS(float_rounding_mode) ) {
1841 case float_round_nearest_even:
1842 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1843 return packFloat32( aSign, 0x7F, 0 );
1844 }
1845 break;
f9288a76
PM
1846 case float_round_ties_away:
1847 if (aExp == 0x7E) {
1848 return packFloat32(aSign, 0x7F, 0);
1849 }
1850 break;
158142c2 1851 case float_round_down:
f090c9d4 1852 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1853 case float_round_up:
f090c9d4 1854 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1855 }
1856 return packFloat32( aSign, 0, 0 );
1857 }
1858 lastBitMask = 1;
1859 lastBitMask <<= 0x96 - aExp;
1860 roundBitsMask = lastBitMask - 1;
f090c9d4 1861 z = float32_val(a);
dc355b76
PM
1862 switch (STATUS(float_rounding_mode)) {
1863 case float_round_nearest_even:
158142c2 1864 z += lastBitMask>>1;
dc355b76
PM
1865 if ((z & roundBitsMask) == 0) {
1866 z &= ~lastBitMask;
1867 }
1868 break;
f9288a76
PM
1869 case float_round_ties_away:
1870 z += lastBitMask >> 1;
1871 break;
dc355b76
PM
1872 case float_round_to_zero:
1873 break;
1874 case float_round_up:
1875 if (!extractFloat32Sign(make_float32(z))) {
1876 z += roundBitsMask;
1877 }
1878 break;
1879 case float_round_down:
1880 if (extractFloat32Sign(make_float32(z))) {
158142c2
FB
1881 z += roundBitsMask;
1882 }
dc355b76
PM
1883 break;
1884 default:
1885 abort();
158142c2
FB
1886 }
1887 z &= ~ roundBitsMask;
f090c9d4
PB
1888 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1889 return make_float32(z);
158142c2
FB
1890
1891}
1892
1893/*----------------------------------------------------------------------------
1894| Returns the result of adding the absolute values of the single-precision
1895| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1896| before being returned. `zSign' is ignored if the result is a NaN.
1897| The addition is performed according to the IEC/IEEE Standard for Binary
1898| Floating-Point Arithmetic.
1899*----------------------------------------------------------------------------*/
1900
1901static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1902{
94a49d86 1903 int_fast16_t aExp, bExp, zExp;
bb98fe42 1904 uint32_t aSig, bSig, zSig;
94a49d86 1905 int_fast16_t expDiff;
158142c2
FB
1906
1907 aSig = extractFloat32Frac( a );
1908 aExp = extractFloat32Exp( a );
1909 bSig = extractFloat32Frac( b );
1910 bExp = extractFloat32Exp( b );
1911 expDiff = aExp - bExp;
1912 aSig <<= 6;
1913 bSig <<= 6;
1914 if ( 0 < expDiff ) {
1915 if ( aExp == 0xFF ) {
1916 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1917 return a;
1918 }
1919 if ( bExp == 0 ) {
1920 --expDiff;
1921 }
1922 else {
1923 bSig |= 0x20000000;
1924 }
1925 shift32RightJamming( bSig, expDiff, &bSig );
1926 zExp = aExp;
1927 }
1928 else if ( expDiff < 0 ) {
1929 if ( bExp == 0xFF ) {
1930 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1931 return packFloat32( zSign, 0xFF, 0 );
1932 }
1933 if ( aExp == 0 ) {
1934 ++expDiff;
1935 }
1936 else {
1937 aSig |= 0x20000000;
1938 }
1939 shift32RightJamming( aSig, - expDiff, &aSig );
1940 zExp = bExp;
1941 }
1942 else {
1943 if ( aExp == 0xFF ) {
1944 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1945 return a;
1946 }
fe76d976 1947 if ( aExp == 0 ) {
e6afc87f
PM
1948 if (STATUS(flush_to_zero)) {
1949 if (aSig | bSig) {
1950 float_raise(float_flag_output_denormal STATUS_VAR);
1951 }
1952 return packFloat32(zSign, 0, 0);
1953 }
fe76d976
PB
1954 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1955 }
158142c2
FB
1956 zSig = 0x40000000 + aSig + bSig;
1957 zExp = aExp;
1958 goto roundAndPack;
1959 }
1960 aSig |= 0x20000000;
1961 zSig = ( aSig + bSig )<<1;
1962 --zExp;
bb98fe42 1963 if ( (int32_t) zSig < 0 ) {
158142c2
FB
1964 zSig = aSig + bSig;
1965 ++zExp;
1966 }
1967 roundAndPack:
1968 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
1969
1970}
1971
1972/*----------------------------------------------------------------------------
| Returns the result of subtracting the absolute values of the single-
| precision floating-point values `a' and `b'.  If `zSign' is 1, the
| difference is negated before being returned.  `zSign' is ignored if the
| result is a NaN.  The subtraction is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic.
|
| Special results visible in the code below: subtracting two infinities
| raises the invalid exception and returns the default NaN; operands of
| equal magnitude give a zero whose sign bit is set only in the
| float_round_down rounding mode; when only `b' is infinite the result is an
| infinity with sign `zSign ^ 1'.
*----------------------------------------------------------------------------*/
1979
1980static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1981{
94a49d86 1982 int_fast16_t aExp, bExp, zExp;
bb98fe42 1983 uint32_t aSig, bSig, zSig;
94a49d86 1984 int_fast16_t expDiff;
158142c2
FB
1985
1986 aSig = extractFloat32Frac( a );
1987 aExp = extractFloat32Exp( a );
1988 bSig = extractFloat32Frac( b );
1989 bExp = extractFloat32Exp( b );
1990 expDiff = aExp - bExp;
 /* Both fractions are moved up by 7 bits before any alignment shift. */
1991 aSig <<= 7;
1992 bSig <<= 7;
1993 if ( 0 < expDiff ) goto aExpBigger;
1994 if ( expDiff < 0 ) goto bExpBigger;
 /* From here on the exponents are equal. */
1995 if ( aExp == 0xFF ) {
1996 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
 /* Both fractions zero with exponent 0xFF: infinity minus infinity. */
1997 float_raise( float_flag_invalid STATUS_VAR);
1998 return float32_default_nan;
1999 }
 /* Both exponents zero: continue with an exponent of 1 for both. */
2000 if ( aExp == 0 ) {
2001 aExp = 1;
2002 bExp = 1;
2003 }
2004 if ( bSig < aSig ) goto aBigger;
2005 if ( aSig < bSig ) goto bBigger;
 /* Equal magnitudes: zero, negative only when rounding down. */
2006 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2007 bExpBigger:
2008 if ( bExp == 0xFF ) {
2009 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2010 return packFloat32( zSign ^ 1, 0xFF, 0 );
2011 }
 /* A zero aExp shortens the alignment shift by one instead of setting
  * bit 30 in aSig. */
2012 if ( aExp == 0 ) {
2013 ++expDiff;
2014 }
2015 else {
2016 aSig |= 0x40000000;
2017 }
2018 shift32RightJamming( aSig, - expDiff, &aSig );
2019 bSig |= 0x40000000;
2020 bBigger:
 /* |b| > |a|: the difference takes b's exponent and the opposite sign. */
2021 zSig = bSig - aSig;
2022 zExp = bExp;
2023 zSign ^= 1;
2024 goto normalizeRoundAndPack;
2025 aExpBigger:
2026 if ( aExp == 0xFF ) {
2027 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2028 return a;
2029 }
 /* Mirror image of the bExpBigger case, with the roles of a and b swapped. */
2030 if ( bExp == 0 ) {
2031 --expDiff;
2032 }
2033 else {
2034 bSig |= 0x40000000;
2035 }
2036 shift32RightJamming( bSig, expDiff, &bSig );
2037 aSig |= 0x40000000;
2038 aBigger:
 /* |a| > |b|: the difference takes a's exponent and keeps zSign. */
2039 zSig = aSig - bSig;
2040 zExp = aExp;
2041 normalizeRoundAndPack:
 /* The exponent is decremented once before normalization and rounding. */
2042 --zExp;
2043 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2044
2045}
2046
2047/*----------------------------------------------------------------------------
2048| Returns the result of adding the single-precision floating-point values `a'
2049| and `b'. The operation is performed according to the IEC/IEEE Standard for
2050| Binary Floating-Point Arithmetic.
2051*----------------------------------------------------------------------------*/
2052
2053float32 float32_add( float32 a, float32 b STATUS_PARAM )
2054{
2055 flag aSign, bSign;
37d18660
PM
2056 a = float32_squash_input_denormal(a STATUS_VAR);
2057 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2058
2059 aSign = extractFloat32Sign( a );
2060 bSign = extractFloat32Sign( b );
2061 if ( aSign == bSign ) {
2062 return addFloat32Sigs( a, b, aSign STATUS_VAR);
2063 }
2064 else {
2065 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2066 }
2067
2068}
2069
2070/*----------------------------------------------------------------------------
2071| Returns the result of subtracting the single-precision floating-point values
2072| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2073| for Binary Floating-Point Arithmetic.
2074*----------------------------------------------------------------------------*/
2075
2076float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2077{
2078 flag aSign, bSign;
37d18660
PM
2079 a = float32_squash_input_denormal(a STATUS_VAR);
2080 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2081
2082 aSign = extractFloat32Sign( a );
2083 bSign = extractFloat32Sign( b );
2084 if ( aSign == bSign ) {
2085 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2086 }
2087 else {
2088 return addFloat32Sigs( a, b, aSign STATUS_VAR );
2089 }
2090
2091}
2092
2093/*----------------------------------------------------------------------------
2094| Returns the result of multiplying the single-precision floating-point values
2095| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2096| for Binary Floating-Point Arithmetic.
2097*----------------------------------------------------------------------------*/
2098
2099float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2100{
2101 flag aSign, bSign, zSign;
94a49d86 2102 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
2103 uint32_t aSig, bSig;
2104 uint64_t zSig64;
2105 uint32_t zSig;
158142c2 2106
37d18660
PM
2107 a = float32_squash_input_denormal(a STATUS_VAR);
2108 b = float32_squash_input_denormal(b STATUS_VAR);
2109
158142c2
FB
2110 aSig = extractFloat32Frac( a );
2111 aExp = extractFloat32Exp( a );
2112 aSign = extractFloat32Sign( a );
2113 bSig = extractFloat32Frac( b );
2114 bExp = extractFloat32Exp( b );
2115 bSign = extractFloat32Sign( b );
2116 zSign = aSign ^ bSign;
2117 if ( aExp == 0xFF ) {
2118 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2119 return propagateFloat32NaN( a, b STATUS_VAR );
2120 }
2121 if ( ( bExp | bSig ) == 0 ) {
2122 float_raise( float_flag_invalid STATUS_VAR);
2123 return float32_default_nan;
2124 }
2125 return packFloat32( zSign, 0xFF, 0 );
2126 }
2127 if ( bExp == 0xFF ) {
2128 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2129 if ( ( aExp | aSig ) == 0 ) {
2130 float_raise( float_flag_invalid STATUS_VAR);
2131 return float32_default_nan;
2132 }
2133 return packFloat32( zSign, 0xFF, 0 );
2134 }
2135 if ( aExp == 0 ) {
2136 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2137 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2138 }
2139 if ( bExp == 0 ) {
2140 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2141 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2142 }
2143 zExp = aExp + bExp - 0x7F;
2144 aSig = ( aSig | 0x00800000 )<<7;
2145 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 2146 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 2147 zSig = zSig64;
bb98fe42 2148 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
2149 zSig <<= 1;
2150 --zExp;
2151 }
2152 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2153
2154}
2155
2156/*----------------------------------------------------------------------------
2157| Returns the result of dividing the single-precision floating-point value `a'
2158| by the corresponding value `b'. The operation is performed according to the
2159| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2160*----------------------------------------------------------------------------*/
2161
2162float32 float32_div( float32 a, float32 b STATUS_PARAM )
2163{
2164 flag aSign, bSign, zSign;
94a49d86 2165 int_fast16_t aExp, bExp, zExp;
bb98fe42 2166 uint32_t aSig, bSig, zSig;
37d18660
PM
2167 a = float32_squash_input_denormal(a STATUS_VAR);
2168 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2169
2170 aSig = extractFloat32Frac( a );
2171 aExp = extractFloat32Exp( a );
2172 aSign = extractFloat32Sign( a );
2173 bSig = extractFloat32Frac( b );
2174 bExp = extractFloat32Exp( b );
2175 bSign = extractFloat32Sign( b );
2176 zSign = aSign ^ bSign;
2177 if ( aExp == 0xFF ) {
2178 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2179 if ( bExp == 0xFF ) {
2180 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2181 float_raise( float_flag_invalid STATUS_VAR);
2182 return float32_default_nan;
2183 }
2184 return packFloat32( zSign, 0xFF, 0 );
2185 }
2186 if ( bExp == 0xFF ) {
2187 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2188 return packFloat32( zSign, 0, 0 );
2189 }
2190 if ( bExp == 0 ) {
2191 if ( bSig == 0 ) {
2192 if ( ( aExp | aSig ) == 0 ) {
2193 float_raise( float_flag_invalid STATUS_VAR);
2194 return float32_default_nan;
2195 }
2196 float_raise( float_flag_divbyzero STATUS_VAR);
2197 return packFloat32( zSign, 0xFF, 0 );
2198 }
2199 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2200 }
2201 if ( aExp == 0 ) {
2202 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2203 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2204 }
2205 zExp = aExp - bExp + 0x7D;
2206 aSig = ( aSig | 0x00800000 )<<7;
2207 bSig = ( bSig | 0x00800000 )<<8;
2208 if ( bSig <= ( aSig + aSig ) ) {
2209 aSig >>= 1;
2210 ++zExp;
2211 }
bb98fe42 2212 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2213 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2214 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2215 }
2216 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2217
2218}
2219
2220/*----------------------------------------------------------------------------
| Returns the remainder of the single-precision floating-point value `a'
| with respect to the corresponding value `b'.  The operation is performed
| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
|
| Special cases visible in the code below: a NaN operand is propagated; an
| infinite `a' or a zero `b' raises the invalid exception and returns the
| default NaN; an infinite `b' (with finite `a') or a zero `a' returns `a'
| unchanged, as does an `a' whose exponent is at least two below that of
| `b'.
*----------------------------------------------------------------------------*/
2225
2226float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2227{
ed086f3d 2228 flag aSign, zSign;
94a49d86 2229 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2230 uint32_t aSig, bSig;
2231 uint32_t q;
2232 uint64_t aSig64, bSig64, q64;
2233 uint32_t alternateASig;
2234 int32_t sigMean;
37d18660
PM
2235 a = float32_squash_input_denormal(a STATUS_VAR);
2236 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2237
2238 aSig = extractFloat32Frac( a );
2239 aExp = extractFloat32Exp( a );
2240 aSign = extractFloat32Sign( a );
2241 bSig = extractFloat32Frac( b );
2242 bExp = extractFloat32Exp( b );
158142c2
FB
 /* a is infinity or NaN: propagate any NaN operand, otherwise raise
  * invalid and return the default NaN. */
2243 if ( aExp == 0xFF ) {
2244 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2245 return propagateFloat32NaN( a, b STATUS_VAR );
2246 }
2247 float_raise( float_flag_invalid STATUS_VAR);
2248 return float32_default_nan;
2249 }
 /* b is infinity or NaN while a is finite: a NaN is propagated, and for
  * an infinite b the result is a itself. */
2250 if ( bExp == 0xFF ) {
2251 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2252 return a;
2253 }
 /* b is zero (invalid, default NaN) or subnormal (normalized first). */
2254 if ( bExp == 0 ) {
2255 if ( bSig == 0 ) {
2256 float_raise( float_flag_invalid STATUS_VAR);
2257 return float32_default_nan;
2258 }
2259 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2260 }
 /* a is zero (returned as is) or subnormal (normalized first). */
2261 if ( aExp == 0 ) {
2262 if ( aSig == 0 ) return a;
2263 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2264 }
2265 expDiff = aExp - bExp;
2266 aSig |= 0x00800000;
2267 bSig |= 0x00800000;
 /* Exponent difference below 32: the quotient bits needed fit in the
  * 32-bit variable q. */
2268 if ( expDiff < 32 ) {
2269 aSig <<= 8;
2270 bSig <<= 8;
2271 if ( expDiff < 0 ) {
 /* Exponent of a at least two below that of b: a is returned unchanged. */
2272 if ( expDiff < -1 ) return a;
2273 aSig >>= 1;
2274 }
2275 q = ( bSig <= aSig );
2276 if ( q ) aSig -= bSig;
2277 if ( 0 < expDiff ) {
bb98fe42 2278 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2279 q >>= 32 - expDiff;
2280 bSig >>= 2;
2281 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2282 }
2283 else {
2284 aSig >>= 2;
2285 bSig >>= 2;
2286 }
2287 }
 /* Exponent difference of 32 or more: the reduction is carried out on
  * 64-bit significands, 62 exponent steps per loop iteration, using
  * quotient estimates from estimateDiv128To64() lowered by 2 (clamped
  * at 0). */
2288 else {
2289 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2290 aSig64 = ( (uint64_t) aSig )<<40;
2291 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2292 expDiff -= 64;
2293 while ( 0 < expDiff ) {
2294 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2295 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2296 aSig64 = - ( ( bSig * q64 )<<38 );
2297 expDiff -= 62;
2298 }
2299 expDiff += 64;
2300 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2301 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2302 q = q64>>( 64 - expDiff );
2303 bSig <<= 6;
2304 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2305 }
 /* Keep subtracting bSig, counting in q, until aSig viewed as a signed
  * 32-bit value turns negative; alternateASig holds the value from just
  * before the last subtraction. */
2306 do {
2307 alternateASig = aSig;
2308 ++q;
2309 aSig -= bSig;
bb98fe42 2310 } while ( 0 <= (int32_t) aSig );
158142c2
FB
 /* Choose between the last two candidates: alternateASig is taken when
  * the sum of the two is negative, or when the sum is zero and q is odd. */
2311 sigMean = aSig + alternateASig;
2312 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2313 aSig = alternateASig;
2314 }
 /* A negative candidate is negated and flips the sign handed to
  * normalizeRoundAndPackFloat32(). */
bb98fe42 2315 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2316 if ( zSign ) aSig = - aSig;
2317 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2318
2319}
2320
369be8f6
PM
2321/*----------------------------------------------------------------------------
| Returns the result of multiplying the single-precision floating-point values
| `a' and `b' then adding 'c', with no intermediate rounding step after the
| multiplication.  The operation is performed according to the IEC/IEEE
| Standard for Binary Floating-Point Arithmetic 754-2008.
| The flags argument allows the caller to select negation of the
| addend, the intermediate product, or the final result.  (The difference
| between this and having the caller do a separate negation is that negating
| externally will flip the sign bit on NaNs.)
|
| Bits tested in `flags' by the code below:
|   float_muladd_negate_c        the sign of `c' is inverted before use
|   float_muladd_negate_product  the sign of the product a * b is inverted
|   float_muladd_negate_result   the sign of every non-NaN result is inverted
|   float_muladd_halve_result    the result exponent is lowered by one before
|                                rounding (not applied on the early returns
|                                for infinities and exact zeroes)
| If any operand is a NaN the result comes from propagateFloat32MulAddNaN().
| Zero times infinity (no NaN operand), and an infinite product added to an
| infinite `c' of the opposite sign, raise the invalid exception and return
| the default NaN.
*----------------------------------------------------------------------------*/
2331
2332float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2333{
2334 flag aSign, bSign, cSign, zSign;
94a49d86 2335 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2336 uint32_t aSig, bSig, cSig;
2337 flag pInf, pZero, pSign;
2338 uint64_t pSig64, cSig64, zSig64;
2339 uint32_t pSig;
2340 int shiftcount;
2341 flag signflip, infzero;
2342
2343 a = float32_squash_input_denormal(a STATUS_VAR);
2344 b = float32_squash_input_denormal(b STATUS_VAR);
2345 c = float32_squash_input_denormal(c STATUS_VAR);
2346 aSig = extractFloat32Frac(a);
2347 aExp = extractFloat32Exp(a);
2348 aSign = extractFloat32Sign(a);
2349 bSig = extractFloat32Frac(b);
2350 bExp = extractFloat32Exp(b);
2351 bSign = extractFloat32Sign(b);
2352 cSig = extractFloat32Frac(c);
2353 cExp = extractFloat32Exp(c);
2354 cSign = extractFloat32Sign(c);
2355
 /* infzero: one of a, b is an exact zero and the other an infinity */
2356 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2357 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2358
2359 /* It is implementation-defined whether the cases of (0,inf,qnan)
2360 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2361 * they return if they do), so we have to hand this information
2362 * off to the target-specific pick-a-NaN routine.
2363 */
2364 if (((aExp == 0xff) && aSig) ||
2365 ((bExp == 0xff) && bSig) ||
2366 ((cExp == 0xff) && cSig)) {
2367 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2368 }
2369
2370 if (infzero) {
2371 float_raise(float_flag_invalid STATUS_VAR);
2372 return float32_default_nan;
2373 }
2374
2375 if (flags & float_muladd_negate_c) {
2376 cSign ^= 1;
2377 }
2378
 /* signflip is XORed into the sign of every non-NaN result below */
2379 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2380
2381 /* Work out the sign and type of the product */
2382 pSign = aSign ^ bSign;
2383 if (flags & float_muladd_negate_product) {
2384 pSign ^= 1;
2385 }
2386 pInf = (aExp == 0xff) || (bExp == 0xff);
2387 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2388
 /* c is an infinity (NaNs were dealt with above) */
2389 if (cExp == 0xff) {
2390 if (pInf && (pSign ^ cSign)) {
2391 /* addition of opposite-signed infinities => InvalidOperation */
2392 float_raise(float_flag_invalid STATUS_VAR);
2393 return float32_default_nan;
2394 }
2395 /* Otherwise generate an infinity of the same sign */
2396 return packFloat32(cSign ^ signflip, 0xff, 0);
2397 }
2398
 /* Infinite product with finite c: infinity with the product's sign */
2399 if (pInf) {
2400 return packFloat32(pSign ^ signflip, 0xff, 0);
2401 }
2402
2403 if (pZero) {
2404 if (cExp == 0) {
2405 if (cSig == 0) {
2406 /* Adding two exact zeroes */
2407 if (pSign == cSign) {
2408 zSign = pSign;
2409 } else if (STATUS(float_rounding_mode) == float_round_down) {
2410 zSign = 1;
2411 } else {
2412 zSign = 0;
2413 }
2414 return packFloat32(zSign ^ signflip, 0, 0);
2415 }
2416 /* Exact zero plus a denorm */
2417 if (STATUS(flush_to_zero)) {
2418 float_raise(float_flag_output_denormal STATUS_VAR);
2419 return packFloat32(cSign ^ signflip, 0, 0);
2420 }
2421 }
2422 /* Zero plus something non-zero : just return the something */
67d43538
PM
2423 if (flags & float_muladd_halve_result) {
2424 if (cExp == 0) {
2425 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2426 }
2427 /* Subtract one to halve, and one again because roundAndPackFloat32
2428 * wants one less than the true exponent.
2429 */
2430 cExp -= 2;
2431 cSig = (cSig | 0x00800000) << 7;
2432 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2433 }
a6e7c184 2434 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2435 }
2436
 /* Neither a nor b is zero here, so a zero exponent means a subnormal */
2437 if (aExp == 0) {
2438 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2439 }
2440 if (bExp == 0) {
2441 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2442 }
2443
2444 /* Calculate the actual result a * b + c */
2445
2446 /* Multiply first; this is easy. */
2447 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2448 * because we want the true exponent, not the "one-less-than"
2449 * flavour that roundAndPackFloat32() takes.
2450 */
2451 pExp = aExp + bExp - 0x7e;
2452 aSig = (aSig | 0x00800000) << 7;
2453 bSig = (bSig | 0x00800000) << 8;
2454 pSig64 = (uint64_t)aSig * bSig;
 /* If bit 62 of the product is clear, shift left once and adjust pExp */
2455 if ((int64_t)(pSig64 << 1) >= 0) {
2456 pSig64 <<= 1;
2457 pExp--;
2458 }
2459
2460 zSign = pSign ^ signflip;
2461
2462 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2463 * position 62.
2464 */
2465 if (cExp == 0) {
2466 if (!cSig) {
2467 /* Throw out the special case of c being an exact zero now */
2468 shift64RightJamming(pSig64, 32, &pSig64);
2469 pSig = pSig64;
67d43538
PM
2470 if (flags & float_muladd_halve_result) {
2471 pExp--;
2472 }
369be8f6
PM
2473 return roundAndPackFloat32(zSign, pExp - 1,
2474 pSig STATUS_VAR);
2475 }
2476 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2477 }
2478
 /* Widen c's significand to 64 bits with its explicit bit in position 62,
  * matching the layout of pSig64 */
2479 cSig64 = (uint64_t)cSig << (62 - 23);
2480 cSig64 |= LIT64(0x4000000000000000);
2481 expDiff = pExp - cExp;
2482
2483 if (pSign == cSign) {
2484 /* Addition */
2485 if (expDiff > 0) {
2486 /* scale c to match p */
2487 shift64RightJamming(cSig64, expDiff, &cSig64);
2488 zExp = pExp;
2489 } else if (expDiff < 0) {
2490 /* scale p to match c */
2491 shift64RightJamming(pSig64, -expDiff, &pSig64);
2492 zExp = cExp;
2493 } else {
2494 /* no scaling needed */
2495 zExp = cExp;
2496 }
2497 /* Add significands and make sure explicit bit ends up in posn 62 */
2498 zSig64 = pSig64 + cSig64;
2499 if ((int64_t)zSig64 < 0) {
2500 shift64RightJamming(zSig64, 1, &zSig64);
2501 } else {
2502 zExp--;
2503 }
2504 } else {
2505 /* Subtraction */
2506 if (expDiff > 0) {
2507 shift64RightJamming(cSig64, expDiff, &cSig64);
2508 zSig64 = pSig64 - cSig64;
2509 zExp = pExp;
2510 } else if (expDiff < 0) {
2511 shift64RightJamming(pSig64, -expDiff, &pSig64);
2512 zSig64 = cSig64 - pSig64;
2513 zExp = cExp;
2514 zSign ^= 1;
2515 } else {
2516 zExp = pExp;
2517 if (cSig64 < pSig64) {
2518 zSig64 = pSig64 - cSig64;
2519 } else if (pSig64 < cSig64) {
2520 zSig64 = cSig64 - pSig64;
2521 zSign ^= 1;
2522 } else {
2523 /* Exact zero */
2524 zSign = signflip;
2525 if (STATUS(float_rounding_mode) == float_round_down) {
2526 zSign ^= 1;
2527 }
2528 return packFloat32(zSign, 0, 0);
2529 }
2530 }
2531 --zExp;
2532 /* Normalize to put the explicit bit back into bit 62. */
2533 shiftcount = countLeadingZeros64(zSig64) - 1;
2534 zSig64 <<= shiftcount;
2535 zExp -= shiftcount;
2536 }
67d43538
PM
2537 if (flags & float_muladd_halve_result) {
2538 zExp--;
2539 }
2540
369be8f6
PM
 /* Reduce the 64-bit significand to 32 bits for roundAndPackFloat32() */
2541 shift64RightJamming(zSig64, 32, &zSig64);
2542 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2543}
2544
2545
158142c2
FB
2546/*----------------------------------------------------------------------------
2547| Returns the square root of the single-precision floating-point value `a'.
2548| The operation is performed according to the IEC/IEEE Standard for Binary
2549| Floating-Point Arithmetic.
2550*----------------------------------------------------------------------------*/
2551
2552float32 float32_sqrt( float32 a STATUS_PARAM )
2553{
2554 flag aSign;
94a49d86 2555 int_fast16_t aExp, zExp;
bb98fe42
AF
2556 uint32_t aSig, zSig;
2557 uint64_t rem, term;
37d18660 2558 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2559
2560 aSig = extractFloat32Frac( a );
2561 aExp = extractFloat32Exp( a );
2562 aSign = extractFloat32Sign( a );
2563 if ( aExp == 0xFF ) {
f090c9d4 2564 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2565 if ( ! aSign ) return a;
2566 float_raise( float_flag_invalid STATUS_VAR);
2567 return float32_default_nan;
2568 }
2569 if ( aSign ) {
2570 if ( ( aExp | aSig ) == 0 ) return a;
2571 float_raise( float_flag_invalid STATUS_VAR);
2572 return float32_default_nan;
2573 }
2574 if ( aExp == 0 ) {
f090c9d4 2575 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2576 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2577 }
2578 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2579 aSig = ( aSig | 0x00800000 )<<8;
2580 zSig = estimateSqrt32( aExp, aSig ) + 2;
2581 if ( ( zSig & 0x7F ) <= 5 ) {
2582 if ( zSig < 2 ) {
2583 zSig = 0x7FFFFFFF;
2584 goto roundAndPack;
2585 }
2586 aSig >>= aExp & 1;
bb98fe42
AF
2587 term = ( (uint64_t) zSig ) * zSig;
2588 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2589 while ( (int64_t) rem < 0 ) {
158142c2 2590 --zSig;
bb98fe42 2591 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2592 }
2593 zSig |= ( rem != 0 );
2594 }
2595 shift32RightJamming( zSig, 1, &zSig );
2596 roundAndPack:
2597 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2598
2599}
2600
8229c991
AJ
/*----------------------------------------------------------------------------
| Returns the binary exponential of the single-precision floating-point value
| `a'. The operation is performed according to the IEC/IEEE Standard for
| Binary Floating-Point Arithmetic.
|
| Uses the following identities:
|
| 1. -------------------------------------------------------------------------
|      x    x*ln(2)
|     2  = e
|
| 2. -------------------------------------------------------------------------
|                      2     3     4     5           n
|      x        x     x     x     x     x           x
|     e  = 1 + --- + --- + --- + --- + --- + ... + --- + ...
|               1!    2!    3!    4!    5!          n!
*----------------------------------------------------------------------------*/
2618
2619static const float64 float32_exp2_coefficients[15] =
2620{
d5138cf4
PM
2621 const_float64( 0x3ff0000000000000ll ), /* 1 */
2622 const_float64( 0x3fe0000000000000ll ), /* 2 */
2623 const_float64( 0x3fc5555555555555ll ), /* 3 */
2624 const_float64( 0x3fa5555555555555ll ), /* 4 */
2625 const_float64( 0x3f81111111111111ll ), /* 5 */
2626 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2627 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2628 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2629 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2630 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2631 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2632 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2633 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2634 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2635 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2636};
2637
2638float32 float32_exp2( float32 a STATUS_PARAM )
2639{
2640 flag aSign;
94a49d86 2641 int_fast16_t aExp;
bb98fe42 2642 uint32_t aSig;
8229c991
AJ
2643 float64 r, x, xn;
2644 int i;
37d18660 2645 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2646
2647 aSig = extractFloat32Frac( a );
2648 aExp = extractFloat32Exp( a );
2649 aSign = extractFloat32Sign( a );
2650
2651 if ( aExp == 0xFF) {
2652 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2653 return (aSign) ? float32_zero : a;
2654 }
2655 if (aExp == 0) {
2656 if (aSig == 0) return float32_one;
2657 }
2658
2659 float_raise( float_flag_inexact STATUS_VAR);
2660
2661 /* ******************************* */
2662 /* using float64 for approximation */
2663 /* ******************************* */
2664 x = float32_to_float64(a STATUS_VAR);
2665 x = float64_mul(x, float64_ln2 STATUS_VAR);
2666
2667 xn = x;
2668 r = float64_one;
2669 for (i = 0 ; i < 15 ; i++) {
2670 float64 f;
2671
2672 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2673 r = float64_add(r, f STATUS_VAR);
2674
2675 xn = float64_mul(xn, x STATUS_VAR);
2676 }
2677
2678 return float64_to_float32(r, status);
2679}
2680
374dfc33
AJ
2681/*----------------------------------------------------------------------------
2682| Returns the binary log of the single-precision floating-point value `a'.
2683| The operation is performed according to the IEC/IEEE Standard for Binary
2684| Floating-Point Arithmetic.
2685*----------------------------------------------------------------------------*/
2686float32 float32_log2( float32 a STATUS_PARAM )
2687{
2688 flag aSign, zSign;
94a49d86 2689 int_fast16_t aExp;
bb98fe42 2690 uint32_t aSig, zSig, i;
374dfc33 2691
37d18660 2692 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2693 aSig = extractFloat32Frac( a );
2694 aExp = extractFloat32Exp( a );
2695 aSign = extractFloat32Sign( a );
2696
2697 if ( aExp == 0 ) {
2698 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2699 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2700 }
2701 if ( aSign ) {
2702 float_raise( float_flag_invalid STATUS_VAR);
2703 return float32_default_nan;
2704 }
2705 if ( aExp == 0xFF ) {
2706 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2707 return a;
2708 }
2709
2710 aExp -= 0x7F;
2711 aSig |= 0x00800000;
2712 zSign = aExp < 0;
2713 zSig = aExp << 23;
2714
2715 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2716 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2717 if ( aSig & 0x01000000 ) {
2718 aSig >>= 1;
2719 zSig |= i;
2720 }
2721 }
2722
2723 if ( zSign )
2724 zSig = -zSig;
2725
2726 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2727}
2728
158142c2
FB
2729/*----------------------------------------------------------------------------
2730| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2731| the corresponding value `b', and 0 otherwise. The invalid exception is
2732| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2733| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2734*----------------------------------------------------------------------------*/
2735
b689362d 2736int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2737{
b689362d 2738 uint32_t av, bv;
37d18660
PM
2739 a = float32_squash_input_denormal(a STATUS_VAR);
2740 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2741
2742 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2743 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2744 ) {
b689362d 2745 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
2746 return 0;
2747 }
b689362d
AJ
2748 av = float32_val(a);
2749 bv = float32_val(b);
2750 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2751}
2752
2753/*----------------------------------------------------------------------------
2754| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2755| or equal to the corresponding value `b', and 0 otherwise. The invalid
2756| exception is raised if either operand is a NaN. The comparison is performed
2757| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2758*----------------------------------------------------------------------------*/
2759
750afe93 2760int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2761{
2762 flag aSign, bSign;
bb98fe42 2763 uint32_t av, bv;
37d18660
PM
2764 a = float32_squash_input_denormal(a STATUS_VAR);
2765 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2766
2767 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2768 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2769 ) {
2770 float_raise( float_flag_invalid STATUS_VAR);
2771 return 0;
2772 }
2773 aSign = extractFloat32Sign( a );
2774 bSign = extractFloat32Sign( b );
f090c9d4
PB
2775 av = float32_val(a);
2776 bv = float32_val(b);
bb98fe42 2777 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2778 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2779
2780}
2781
2782/*----------------------------------------------------------------------------
2783| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2784| the corresponding value `b', and 0 otherwise. The invalid exception is
2785| raised if either operand is a NaN. The comparison is performed according
2786| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2787*----------------------------------------------------------------------------*/
2788
750afe93 2789int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2790{
2791 flag aSign, bSign;
bb98fe42 2792 uint32_t av, bv;
37d18660
PM
2793 a = float32_squash_input_denormal(a STATUS_VAR);
2794 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2795
2796 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2797 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2798 ) {
2799 float_raise( float_flag_invalid STATUS_VAR);
2800 return 0;
2801 }
2802 aSign = extractFloat32Sign( a );
2803 bSign = extractFloat32Sign( b );
f090c9d4
PB
2804 av = float32_val(a);
2805 bv = float32_val(b);
bb98fe42 2806 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2807 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2808
2809}
2810
67b7861d
AJ
2811/*----------------------------------------------------------------------------
2812| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2813| be compared, and 0 otherwise. The invalid exception is raised if either
2814| operand is a NaN. The comparison is performed according to the IEC/IEEE
2815| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2816*----------------------------------------------------------------------------*/
2817
2818int float32_unordered( float32 a, float32 b STATUS_PARAM )
2819{
2820 a = float32_squash_input_denormal(a STATUS_VAR);
2821 b = float32_squash_input_denormal(b STATUS_VAR);
2822
2823 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2824 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2825 ) {
2826 float_raise( float_flag_invalid STATUS_VAR);
2827 return 1;
2828 }
2829 return 0;
2830}
b689362d 2831
158142c2
FB
2832/*----------------------------------------------------------------------------
2833| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2834| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2835| exception. The comparison is performed according to the IEC/IEEE Standard
2836| for Binary Floating-Point Arithmetic.
158142c2
FB
2837*----------------------------------------------------------------------------*/
2838
b689362d 2839int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2840{
37d18660
PM
2841 a = float32_squash_input_denormal(a STATUS_VAR);
2842 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2843
2844 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2845 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2846 ) {
b689362d
AJ
2847 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2848 float_raise( float_flag_invalid STATUS_VAR);
2849 }
158142c2
FB
2850 return 0;
2851 }
b689362d
AJ
2852 return ( float32_val(a) == float32_val(b) ) ||
2853 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2854}
2855
2856/*----------------------------------------------------------------------------
2857| Returns 1 if the single-precision floating-point value `a' is less than or
2858| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2859| cause an exception. Otherwise, the comparison is performed according to the
2860| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2861*----------------------------------------------------------------------------*/
2862
750afe93 2863int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2864{
2865 flag aSign, bSign;
bb98fe42 2866 uint32_t av, bv;
37d18660
PM
2867 a = float32_squash_input_denormal(a STATUS_VAR);
2868 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2869
2870 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2871 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2872 ) {
2873 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2874 float_raise( float_flag_invalid STATUS_VAR);
2875 }
2876 return 0;
2877 }
2878 aSign = extractFloat32Sign( a );
2879 bSign = extractFloat32Sign( b );
f090c9d4
PB
2880 av = float32_val(a);
2881 bv = float32_val(b);
bb98fe42 2882 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2883 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2884
2885}
2886
2887/*----------------------------------------------------------------------------
2888| Returns 1 if the single-precision floating-point value `a' is less than
2889| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2890| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2891| Standard for Binary Floating-Point Arithmetic.
2892*----------------------------------------------------------------------------*/
2893
750afe93 2894int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2895{
2896 flag aSign, bSign;
bb98fe42 2897 uint32_t av, bv;
37d18660
PM
2898 a = float32_squash_input_denormal(a STATUS_VAR);
2899 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2900
2901 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2902 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2903 ) {
2904 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2905 float_raise( float_flag_invalid STATUS_VAR);
2906 }
2907 return 0;
2908 }
2909 aSign = extractFloat32Sign( a );
2910 bSign = extractFloat32Sign( b );
f090c9d4
PB
2911 av = float32_val(a);
2912 bv = float32_val(b);
bb98fe42 2913 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2914 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2915
2916}
2917
67b7861d
AJ
2918/*----------------------------------------------------------------------------
2919| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2920| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2921| comparison is performed according to the IEC/IEEE Standard for Binary
2922| Floating-Point Arithmetic.
2923*----------------------------------------------------------------------------*/
2924
2925int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2926{
2927 a = float32_squash_input_denormal(a STATUS_VAR);
2928 b = float32_squash_input_denormal(b STATUS_VAR);
2929
2930 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2931 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2932 ) {
2933 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2934 float_raise( float_flag_invalid STATUS_VAR);
2935 }
2936 return 1;
2937 }
2938 return 0;
2939}
2940
158142c2
FB
2941/*----------------------------------------------------------------------------
2942| Returns the result of converting the double-precision floating-point value
2943| `a' to the 32-bit two's complement integer format. The conversion is
2944| performed according to the IEC/IEEE Standard for Binary Floating-Point
2945| Arithmetic---which means in particular that the conversion is rounded
2946| according to the current rounding mode. If `a' is a NaN, the largest
2947| positive integer is returned. Otherwise, if the conversion overflows, the
2948| largest integer with the same sign as `a' is returned.
2949*----------------------------------------------------------------------------*/
2950
2951int32 float64_to_int32( float64 a STATUS_PARAM )
2952{
2953 flag aSign;
94a49d86 2954 int_fast16_t aExp, shiftCount;
bb98fe42 2955 uint64_t aSig;
37d18660 2956 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2957
2958 aSig = extractFloat64Frac( a );
2959 aExp = extractFloat64Exp( a );
2960 aSign = extractFloat64Sign( a );
2961 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2962 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2963 shiftCount = 0x42C - aExp;
2964 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2965 return roundAndPackInt32( aSign, aSig STATUS_VAR );
2966
2967}
2968
2969/*----------------------------------------------------------------------------
2970| Returns the result of converting the double-precision floating-point value
2971| `a' to the 32-bit two's complement integer format. The conversion is
2972| performed according to the IEC/IEEE Standard for Binary Floating-Point
2973| Arithmetic, except that the conversion is always rounded toward zero.
2974| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2975| the conversion overflows, the largest integer with the same sign as `a' is
2976| returned.
2977*----------------------------------------------------------------------------*/
2978
2979int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
2980{
2981 flag aSign;
94a49d86 2982 int_fast16_t aExp, shiftCount;
bb98fe42 2983 uint64_t aSig, savedASig;
b3a6a2e0 2984 int32_t z;
37d18660 2985 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2986
2987 aSig = extractFloat64Frac( a );
2988 aExp = extractFloat64Exp( a );
2989 aSign = extractFloat64Sign( a );
2990 if ( 0x41E < aExp ) {
2991 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2992 goto invalid;
2993 }
2994 else if ( aExp < 0x3FF ) {
2995 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
2996 return 0;
2997 }
2998 aSig |= LIT64( 0x0010000000000000 );
2999 shiftCount = 0x433 - aExp;
3000 savedASig = aSig;
3001 aSig >>= shiftCount;
3002 z = aSig;
3003 if ( aSign ) z = - z;
3004 if ( ( z < 0 ) ^ aSign ) {
3005 invalid:
3006 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 3007 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3008 }
3009 if ( ( aSig<<shiftCount ) != savedASig ) {
3010 STATUS(float_exception_flags) |= float_flag_inexact;
3011 }
3012 return z;
3013
3014}
3015
cbcef455
PM
3016/*----------------------------------------------------------------------------
3017| Returns the result of converting the double-precision floating-point value
3018| `a' to the 16-bit two's complement integer format. The conversion is
3019| performed according to the IEC/IEEE Standard for Binary Floating-Point
3020| Arithmetic, except that the conversion is always rounded toward zero.
3021| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3022| the conversion overflows, the largest integer with the same sign as `a' is
3023| returned.
3024*----------------------------------------------------------------------------*/
3025
94a49d86 3026int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
3027{
3028 flag aSign;
94a49d86 3029 int_fast16_t aExp, shiftCount;
bb98fe42 3030 uint64_t aSig, savedASig;
cbcef455
PM
3031 int32 z;
3032
3033 aSig = extractFloat64Frac( a );
3034 aExp = extractFloat64Exp( a );
3035 aSign = extractFloat64Sign( a );
3036 if ( 0x40E < aExp ) {
3037 if ( ( aExp == 0x7FF ) && aSig ) {
3038 aSign = 0;
3039 }
3040 goto invalid;
3041 }
3042 else if ( aExp < 0x3FF ) {
3043 if ( aExp || aSig ) {
3044 STATUS(float_exception_flags) |= float_flag_inexact;
3045 }
3046 return 0;
3047 }
3048 aSig |= LIT64( 0x0010000000000000 );
3049 shiftCount = 0x433 - aExp;
3050 savedASig = aSig;
3051 aSig >>= shiftCount;
3052 z = aSig;
3053 if ( aSign ) {
3054 z = - z;
3055 }
3056 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3057 invalid:
3058 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 3059 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
3060 }
3061 if ( ( aSig<<shiftCount ) != savedASig ) {
3062 STATUS(float_exception_flags) |= float_flag_inexact;
3063 }
3064 return z;
3065}
3066
158142c2
FB
3067/*----------------------------------------------------------------------------
3068| Returns the result of converting the double-precision floating-point value
3069| `a' to the 64-bit two's complement integer format. The conversion is
3070| performed according to the IEC/IEEE Standard for Binary Floating-Point
3071| Arithmetic---which means in particular that the conversion is rounded
3072| according to the current rounding mode. If `a' is a NaN, the largest
3073| positive integer is returned. Otherwise, if the conversion overflows, the
3074| largest integer with the same sign as `a' is returned.
3075*----------------------------------------------------------------------------*/
3076
3077int64 float64_to_int64( float64 a STATUS_PARAM )
3078{
3079 flag aSign;
94a49d86 3080 int_fast16_t aExp, shiftCount;
bb98fe42 3081 uint64_t aSig, aSigExtra;
37d18660 3082 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3083
3084 aSig = extractFloat64Frac( a );
3085 aExp = extractFloat64Exp( a );
3086 aSign = extractFloat64Sign( a );
3087 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3088 shiftCount = 0x433 - aExp;
3089 if ( shiftCount <= 0 ) {
3090 if ( 0x43E < aExp ) {
3091 float_raise( float_flag_invalid STATUS_VAR);
3092 if ( ! aSign
3093 || ( ( aExp == 0x7FF )
3094 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3095 ) {
3096 return LIT64( 0x7FFFFFFFFFFFFFFF );
3097 }
bb98fe42 3098 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3099 }
3100 aSigExtra = 0;
3101 aSig <<= - shiftCount;
3102 }
3103 else {
3104 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3105 }
3106 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3107
3108}
3109
3110/*----------------------------------------------------------------------------
3111| Returns the result of converting the double-precision floating-point value
3112| `a' to the 64-bit two's complement integer format. The conversion is
3113| performed according to the IEC/IEEE Standard for Binary Floating-Point
3114| Arithmetic, except that the conversion is always rounded toward zero.
3115| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3116| the conversion overflows, the largest integer with the same sign as `a' is
3117| returned.
3118*----------------------------------------------------------------------------*/
3119
3120int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3121{
3122 flag aSign;
94a49d86 3123 int_fast16_t aExp, shiftCount;
bb98fe42 3124 uint64_t aSig;
158142c2 3125 int64 z;
37d18660 3126 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3127
3128 aSig = extractFloat64Frac( a );
3129 aExp = extractFloat64Exp( a );
3130 aSign = extractFloat64Sign( a );
3131 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3132 shiftCount = aExp - 0x433;
3133 if ( 0 <= shiftCount ) {
3134 if ( 0x43E <= aExp ) {
f090c9d4 3135 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
3136 float_raise( float_flag_invalid STATUS_VAR);
3137 if ( ! aSign
3138 || ( ( aExp == 0x7FF )
3139 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3140 ) {
3141 return LIT64( 0x7FFFFFFFFFFFFFFF );
3142 }
3143 }
bb98fe42 3144 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3145 }
3146 z = aSig<<shiftCount;
3147 }
3148 else {
3149 if ( aExp < 0x3FE ) {
3150 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3151 return 0;
3152 }
3153 z = aSig>>( - shiftCount );
bb98fe42 3154 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
3155 STATUS(float_exception_flags) |= float_flag_inexact;
3156 }
3157 }
3158 if ( aSign ) z = - z;
3159 return z;
3160
3161}
3162
3163/*----------------------------------------------------------------------------
3164| Returns the result of converting the double-precision floating-point value
3165| `a' to the single-precision floating-point format. The conversion is
3166| performed according to the IEC/IEEE Standard for Binary Floating-Point
3167| Arithmetic.
3168*----------------------------------------------------------------------------*/
3169
3170float32 float64_to_float32( float64 a STATUS_PARAM )
3171{
3172 flag aSign;
94a49d86 3173 int_fast16_t aExp;
bb98fe42
AF
3174 uint64_t aSig;
3175 uint32_t zSig;
37d18660 3176 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3177
3178 aSig = extractFloat64Frac( a );
3179 aExp = extractFloat64Exp( a );
3180 aSign = extractFloat64Sign( a );
3181 if ( aExp == 0x7FF ) {
bcd4d9af 3182 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3183 return packFloat32( aSign, 0xFF, 0 );
3184 }
3185 shift64RightJamming( aSig, 22, &aSig );
3186 zSig = aSig;
3187 if ( aExp || zSig ) {
3188 zSig |= 0x40000000;
3189 aExp -= 0x381;
3190 }
3191 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3192
3193}
3194
60011498
PB
3195
3196/*----------------------------------------------------------------------------
3197| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3198| half-precision floating-point value, returning the result. After being
3199| shifted into the proper positions, the three fields are simply added
3200| together to form the result. This means that any integer portion of `zSig'
3201| will be added into the exponent. Since a properly normalized significand
3202| will have an integer portion equal to 1, the `zExp' input should be 1 less
3203| than the desired result exponent whenever `zSig' is a complete, normalized
3204| significand.
3205*----------------------------------------------------------------------------*/
94a49d86 3206static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3207{
bb4d4bb3 3208 return make_float16(
bb98fe42 3209 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3210}
3211
c4a1c5e7
PM
3212/*----------------------------------------------------------------------------
3213| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3214| and significand `zSig', and returns the proper half-precision floating-
3215| point value corresponding to the abstract input. Ordinarily, the abstract
3216| value is simply rounded and packed into the half-precision format, with
3217| the inexact exception raised if the abstract input cannot be represented
3218| exactly. However, if the abstract value is too large, the overflow and
3219| inexact exceptions are raised and an infinity or maximal finite value is
3220| returned. If the abstract value is too small, the input value is rounded to
3221| a subnormal number, and the underflow and inexact exceptions are raised if
3222| the abstract input cannot be represented exactly as a subnormal half-
3223| precision floating-point number.
3224| The `ieee' flag indicates whether to use IEEE standard half precision, or
3225| ARM-style "alternative representation", which omits the NaN and Inf
3226| encodings in order to raise the maximum representable exponent by one.
3227| The input significand `zSig' has its binary point between bits 22
3228| and 23, which is 13 bits to the left of the usual location. This shifted
3229| significand must be normalized or smaller. If `zSig' is not normalized,
3230| `zExp' must be 0; in that case, the result returned is a subnormal number,
3231| and it must not require rounding. In the usual case that `zSig' is
3232| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3233| Note the slightly odd position of the binary point in zSig compared with the
3234| other roundAndPackFloat functions. This should probably be fixed if we
3235| need to implement more float16 routines than just conversion.
3236| The handling of underflow and overflow follows the IEC/IEEE Standard for
3237| Binary Floating-Point Arithmetic.
3238*----------------------------------------------------------------------------*/
3239
3240static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3241 uint32_t zSig, flag ieee STATUS_PARAM)
3242{
3243 int maxexp = ieee ? 29 : 30;
3244 uint32_t mask;
3245 uint32_t increment;
c4a1c5e7
PM
3246 bool rounding_bumps_exp;
3247 bool is_tiny = false;
3248
3249 /* Calculate the mask of bits of the mantissa which are not
3250 * representable in half-precision and will be lost.
3251 */
3252 if (zExp < 1) {
3253 /* Will be denormal in halfprec */
3254 mask = 0x00ffffff;
3255 if (zExp >= -11) {
3256 mask >>= 11 + zExp;
3257 }
3258 } else {
3259 /* Normal number in halfprec */
3260 mask = 0x00001fff;
3261 }
3262
dc355b76 3263 switch (STATUS(float_rounding_mode)) {
c4a1c5e7
PM
3264 case float_round_nearest_even:
3265 increment = (mask + 1) >> 1;
3266 if ((zSig & mask) == increment) {
3267 increment = zSig & (increment << 1);
3268 }
3269 break;
f9288a76
PM
3270 case float_round_ties_away:
3271 increment = (mask + 1) >> 1;
3272 break;
c4a1c5e7
PM
3273 case float_round_up:
3274 increment = zSign ? 0 : mask;
3275 break;
3276 case float_round_down:
3277 increment = zSign ? mask : 0;
3278 break;
3279 default: /* round_to_zero */
3280 increment = 0;
3281 break;
3282 }
3283
3284 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3285
3286 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3287 if (ieee) {
3288 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3289 return packFloat16(zSign, 0x1f, 0);
3290 } else {
3291 float_raise(float_flag_invalid STATUS_VAR);
3292 return packFloat16(zSign, 0x1f, 0x3ff);
3293 }
3294 }
3295
3296 if (zExp < 0) {
3297 /* Note that flush-to-zero does not affect half-precision results */
3298 is_tiny =
3299 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3300 || (zExp < -1)
3301 || (!rounding_bumps_exp);
3302 }
3303 if (zSig & mask) {
3304 float_raise(float_flag_inexact STATUS_VAR);
3305 if (is_tiny) {
3306 float_raise(float_flag_underflow STATUS_VAR);
3307 }
3308 }
3309
3310 zSig += increment;
3311 if (rounding_bumps_exp) {
3312 zSig >>= 1;
3313 zExp++;
3314 }
3315
3316 if (zExp < -10) {
3317 return packFloat16(zSign, 0, 0);
3318 }
3319 if (zExp < 0) {
3320 zSig >>= -zExp;
3321 zExp = 0;
3322 }
3323 return packFloat16(zSign, zExp, zSig >> 13);
3324}
3325
/* Shifts the subnormal half-precision significand `aSig' left until its
 * leading one reaches bit 10, storing the shifted significand in *zSigPtr
 * and the matching exponent (1 - shift) in *zExpPtr.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    /* 21 leading zeros in a 32-bit word put the top set bit at bit 10. */
    int8_t leftShift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - leftShift;
    *zSigPtr = aSig << leftShift;
}
3333
60011498
PB
3334/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3335 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
3336
3337float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
3338{
3339 flag aSign;
94a49d86 3340 int_fast16_t aExp;
bb98fe42 3341 uint32_t aSig;
60011498 3342
bb4d4bb3
PM
3343 aSign = extractFloat16Sign(a);
3344 aExp = extractFloat16Exp(a);
3345 aSig = extractFloat16Frac(a);
60011498
PB
3346
3347 if (aExp == 0x1f && ieee) {
3348 if (aSig) {
f591e1be 3349 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3350 }
4be8eeac 3351 return packFloat32(aSign, 0xff, 0);
60011498
PB
3352 }
3353 if (aExp == 0) {
60011498
PB
3354 if (aSig == 0) {
3355 return packFloat32(aSign, 0, 0);
3356 }
3357
c4a1c5e7
PM
3358 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3359 aExp--;
60011498
PB
3360 }
3361 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3362}
3363
bb4d4bb3 3364float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
3365{
3366 flag aSign;
94a49d86 3367 int_fast16_t aExp;
bb98fe42 3368 uint32_t aSig;
38970efa 3369
37d18660 3370 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
3371
3372 aSig = extractFloat32Frac( a );
3373 aExp = extractFloat32Exp( a );
3374 aSign = extractFloat32Sign( a );
3375 if ( aExp == 0xFF ) {
3376 if (aSig) {
600e30d2 3377 /* Input is a NaN */
600e30d2 3378 if (!ieee) {
38970efa 3379 float_raise(float_flag_invalid STATUS_VAR);
600e30d2
PM
3380 return packFloat16(aSign, 0, 0);
3381 }
38970efa
PM
3382 return commonNaNToFloat16(
3383 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3384 }
600e30d2
PM
3385 /* Infinity */
3386 if (!ieee) {
3387 float_raise(float_flag_invalid STATUS_VAR);
3388 return packFloat16(aSign, 0x1f, 0x3ff);
3389 }
3390 return packFloat16(aSign, 0x1f, 0);
60011498 3391 }
600e30d2 3392 if (aExp == 0 && aSig == 0) {
60011498
PB
3393 return packFloat16(aSign, 0, 0);
3394 }
38970efa
PM
3395 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3396 * even if the input is denormal; however this is harmless because
3397 * the largest possible single-precision denormal is still smaller
3398 * than the smallest representable half-precision denormal, and so we
3399 * will end up ignoring aSig and returning via the "always return zero"
3400 * codepath.
3401 */
60011498 3402 aSig |= 0x00800000;
c4a1c5e7 3403 aExp -= 0x71;
60011498 3404
c4a1c5e7 3405 return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
60011498
PB
3406}
3407
14c9a07e
PM
3408float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3409{
3410 flag aSign;
3411 int_fast16_t aExp;
3412 uint32_t aSig;
3413
3414 aSign = extractFloat16Sign(a);
3415 aExp = extractFloat16Exp(a);
3416 aSig = extractFloat16Frac(a);
3417
3418 if (aExp == 0x1f && ieee) {
3419 if (aSig) {
3420 return commonNaNToFloat64(
3421 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3422 }
3423 return packFloat64(aSign, 0x7ff, 0);
3424 }
3425 if (aExp == 0) {
3426 if (aSig == 0) {
3427 return packFloat64(aSign, 0, 0);
3428 }
3429
3430 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3431 aExp--;
3432 }
3433 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3434}
3435
3436float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3437{
3438 flag aSign;
3439 int_fast16_t aExp;
3440 uint64_t aSig;
3441 uint32_t zSig;
3442
3443 a = float64_squash_input_denormal(a STATUS_VAR);
3444
3445 aSig = extractFloat64Frac(a);
3446 aExp = extractFloat64Exp(a);
3447 aSign = extractFloat64Sign(a);
3448 if (aExp == 0x7FF) {
3449 if (aSig) {
3450 /* Input is a NaN */
3451 if (!ieee) {
3452 float_raise(float_flag_invalid STATUS_VAR);
3453 return packFloat16(aSign, 0, 0);
3454 }
3455 return commonNaNToFloat16(
3456 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3457 }
3458 /* Infinity */
3459 if (!ieee) {
3460 float_raise(float_flag_invalid STATUS_VAR);
3461 return packFloat16(aSign, 0x1f, 0x3ff);
3462 }
3463 return packFloat16(aSign, 0x1f, 0);
3464 }
3465 shift64RightJamming(aSig, 29, &aSig);
3466 zSig = aSig;
3467 if (aExp == 0 && zSig == 0) {
3468 return packFloat16(aSign, 0, 0);
3469 }
3470 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3471 * even if the input is denormal; however this is harmless because
3472 * the largest possible single-precision denormal is still smaller
3473 * than the smallest representable half-precision denormal, and so we
3474 * will end up ignoring aSig and returning via the "always return zero"
3475 * codepath.
3476 */
3477 zSig |= 0x00800000;
3478 aExp -= 0x3F1;
3479
3480 return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3481}
3482
158142c2
FB
3483/*----------------------------------------------------------------------------
3484| Returns the result of converting the double-precision floating-point value
3485| `a' to the extended double-precision floating-point format. The conversion
3486| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3487| Arithmetic.
3488*----------------------------------------------------------------------------*/
3489
3490floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3491{
3492 flag aSign;
94a49d86 3493 int_fast16_t aExp;
bb98fe42 3494 uint64_t aSig;
158142c2 3495
37d18660 3496 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3497 aSig = extractFloat64Frac( a );
3498 aExp = extractFloat64Exp( a );
3499 aSign = extractFloat64Sign( a );
3500 if ( aExp == 0x7FF ) {
bcd4d9af 3501 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3502 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3503 }
3504 if ( aExp == 0 ) {
3505 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3506 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3507 }
3508 return
3509 packFloatx80(
3510 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3511
3512}
3513
158142c2
FB
3514/*----------------------------------------------------------------------------
3515| Returns the result of converting the double-precision floating-point value
3516| `a' to the quadruple-precision floating-point format. The conversion is
3517| performed according to the IEC/IEEE Standard for Binary Floating-Point
3518| Arithmetic.
3519*----------------------------------------------------------------------------*/
3520
3521float128 float64_to_float128( float64 a STATUS_PARAM )
3522{
3523 flag aSign;
94a49d86 3524 int_fast16_t aExp;
bb98fe42 3525 uint64_t aSig, zSig0, zSig1;
158142c2 3526
37d18660 3527 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3528 aSig = extractFloat64Frac( a );
3529 aExp = extractFloat64Exp( a );
3530 aSign = extractFloat64Sign( a );
3531 if ( aExp == 0x7FF ) {
bcd4d9af 3532 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3533 return packFloat128( aSign, 0x7FFF, 0, 0 );
3534 }
3535 if ( aExp == 0 ) {
3536 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3537 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3538 --aExp;
3539 }
3540 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3541 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3542
3543}
3544
158142c2
FB
3545/*----------------------------------------------------------------------------
3546| Rounds the double-precision floating-point value `a' to an integer, and
3547| returns the result as a double-precision floating-point value. The
3548| operation is performed according to the IEC/IEEE Standard for Binary
3549| Floating-Point Arithmetic.
3550*----------------------------------------------------------------------------*/
3551
3552float64 float64_round_to_int( float64 a STATUS_PARAM )
3553{
3554 flag aSign;
94a49d86 3555 int_fast16_t aExp;
bb98fe42 3556 uint64_t lastBitMask, roundBitsMask;
bb98fe42 3557 uint64_t z;
37d18660 3558 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3559
3560 aExp = extractFloat64Exp( a );
3561 if ( 0x433 <= aExp ) {
3562 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3563 return propagateFloat64NaN( a, a STATUS_VAR );
3564 }
3565 return a;
3566 }
3567 if ( aExp < 0x3FF ) {
bb98fe42 3568 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3569 STATUS(float_exception_flags) |= float_flag_inexact;
3570 aSign = extractFloat64Sign( a );
3571 switch ( STATUS(float_rounding_mode) ) {
3572 case float_round_nearest_even:
3573 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3574 return packFloat64( aSign, 0x3FF, 0 );
3575 }
3576 break;
f9288a76
PM
3577 case float_round_ties_away:
3578 if (aExp == 0x3FE) {
3579 return packFloat64(aSign, 0x3ff, 0);
3580 }
3581 break;
158142c2 3582 case float_round_down:
f090c9d4 3583 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3584 case float_round_up:
f090c9d4
PB
3585 return make_float64(
3586 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3587 }
3588 return packFloat64( aSign, 0, 0 );
3589 }
3590 lastBitMask = 1;
3591 lastBitMask <<= 0x433 - aExp;
3592 roundBitsMask = lastBitMask - 1;
f090c9d4 3593 z = float64_val(a);
dc355b76
PM
3594 switch (STATUS(float_rounding_mode)) {
3595 case float_round_nearest_even:
3596 z += lastBitMask >> 1;
3597 if ((z & roundBitsMask) == 0) {
3598 z &= ~lastBitMask;
3599 }
3600 break;
f9288a76
PM
3601 case float_round_ties_away:
3602 z += lastBitMask >> 1;
3603 break;
dc355b76
PM
3604 case float_round_to_zero:
3605 break;
3606 case float_round_up:
3607 if (!extractFloat64Sign(make_float64(z))) {
3608 z += roundBitsMask;
3609 }
3610 break;
3611 case float_round_down:
3612 if (extractFloat64Sign(make_float64(z))) {
158142c2
FB
3613 z += roundBitsMask;
3614 }
dc355b76
PM
3615 break;
3616 default:
3617 abort();
158142c2
FB
3618 }
3619 z &= ~ roundBitsMask;
f090c9d4
PB
3620 if ( z != float64_val(a) )
3621 STATUS(float_exception_flags) |= float_flag_inexact;
3622 return make_float64(z);
158142c2
FB
3623
3624}
3625
e6e5906b
PB
3626float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3627{
3628 int oldmode;
3629 float64 res;
3630 oldmode = STATUS(float_rounding_mode);
3631 STATUS(float_rounding_mode) = float_round_to_zero;
3632 res = float64_round_to_int(a STATUS_VAR);
3633 STATUS(float_rounding_mode) = oldmode;
3634 return res;
3635}
3636
158142c2
FB
3637/*----------------------------------------------------------------------------
3638| Returns the result of adding the absolute values of the double-precision
3639| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3640| before being returned. `zSign' is ignored if the result is a NaN.
3641| The addition is performed according to the IEC/IEEE Standard for Binary
3642| Floating-Point Arithmetic.
3643*----------------------------------------------------------------------------*/
3644
3645static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3646{
94a49d86 3647 int_fast16_t aExp, bExp, zExp;
bb98fe42 3648 uint64_t aSig, bSig, zSig;
94a49d86 3649 int_fast16_t expDiff;
158142c2
FB
3650
3651 aSig = extractFloat64Frac( a );
3652 aExp = extractFloat64Exp( a );
3653 bSig = extractFloat64Frac( b );
3654 bExp = extractFloat64Exp( b );
3655 expDiff = aExp - bExp;
3656 aSig <<= 9;
3657 bSig <<= 9;
3658 if ( 0 < expDiff ) {
3659 if ( aExp == 0x7FF ) {
3660 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3661 return a;
3662 }
3663 if ( bExp == 0 ) {
3664 --expDiff;
3665 }
3666 else {
3667 bSig |= LIT64( 0x2000000000000000 );
3668 }
3669 shift64RightJamming( bSig, expDiff, &bSig );
3670 zExp = aExp;
3671 }
3672 else if ( expDiff < 0 ) {
3673 if ( bExp == 0x7FF ) {
3674 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3675 return packFloat64( zSign, 0x7FF, 0 );
3676 }
3677 if ( aExp == 0 ) {
3678 ++expDiff;
3679 }
3680 else {
3681 aSig |= LIT64( 0x2000000000000000 );
3682 }
3683 shift64RightJamming( aSig, - expDiff, &aSig );
3684 zExp = bExp;
3685 }
3686 else {
3687 if ( aExp == 0x7FF ) {
3688 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3689 return a;
3690 }
fe76d976 3691 if ( aExp == 0 ) {
e6afc87f
PM
3692 if (STATUS(flush_to_zero)) {
3693 if (aSig | bSig) {
3694 float_raise(float_flag_output_denormal STATUS_VAR);
3695 }
3696 return packFloat64(zSign, 0, 0);
3697 }
fe76d976
PB
3698 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3699 }
158142c2
FB
3700 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3701 zExp = aExp;
3702 goto roundAndPack;
3703 }
3704 aSig |= LIT64( 0x2000000000000000 );
3705 zSig = ( aSig + bSig )<<1;
3706 --zExp;
bb98fe42 3707 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3708 zSig = aSig + bSig;
3709 ++zExp;
3710 }
3711 roundAndPack:
3712 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3713
3714}
3715
3716/*----------------------------------------------------------------------------
3717| Returns the result of subtracting the absolute values of the double-
3718| precision floating-point values `a' and `b'. If `zSign' is 1, the
3719| difference is negated before being returned. `zSign' is ignored if the
3720| result is a NaN. The subtraction is performed according to the IEC/IEEE
3721| Standard for Binary Floating-Point Arithmetic.
3722*----------------------------------------------------------------------------*/
3723
3724static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3725{
94a49d86 3726 int_fast16_t aExp, bExp, zExp;
bb98fe42 3727 uint64_t aSig, bSig, zSig;
94a49d86 3728 int_fast16_t expDiff;
158142c2
FB
3729
3730 aSig = extractFloat64Frac( a );
3731 aExp = extractFloat64Exp( a );
3732 bSig = extractFloat64Frac( b );
3733 bExp = extractFloat64Exp( b );
3734 expDiff = aExp - bExp;
3735 aSig <<= 10;
3736 bSig <<= 10;
3737 if ( 0 < expDiff ) goto aExpBigger;
3738 if ( expDiff < 0 ) goto bExpBigger;
3739 if ( aExp == 0x7FF ) {
3740 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3741 float_raise( float_flag_invalid STATUS_VAR);
3742 return float64_default_nan;
3743 }
3744 if ( aExp == 0 ) {
3745 aExp = 1;
3746 bExp = 1;
3747 }
3748 if ( bSig < aSig ) goto aBigger;
3749 if ( aSig < bSig ) goto bBigger;
3750 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3751 bExpBigger:
3752 if ( bExp == 0x7FF ) {
3753 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3754 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3755 }
3756 if ( aExp == 0 ) {
3757 ++expDiff;
3758 }
3759 else {
3760 aSig |= LIT64( 0x4000000000000000 );
3761 }
3762 shift64RightJamming( aSig, - expDiff, &aSig );
3763 bSig |= LIT64( 0x4000000000000000 );
3764 bBigger:
3765 zSig = bSig - aSig;
3766 zExp = bExp;
3767 zSign ^= 1;
3768 goto normalizeRoundAndPack;
3769 aExpBigger:
3770 if ( aExp == 0x7FF ) {
3771 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3772 return a;
3773 }
3774 if ( bExp == 0 ) {
3775 --expDiff;
3776 }
3777 else {
3778 bSig |= LIT64( 0x4000000000000000 );
3779 }
3780 shift64RightJamming( bSig, expDiff, &bSig );
3781 aSig |= LIT64( 0x4000000000000000 );
3782 aBigger:
3783 zSig = aSig - bSig;
3784 zExp = aExp;
3785 normalizeRoundAndPack:
3786 --zExp;
3787 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3788
3789}
3790
3791/*----------------------------------------------------------------------------
3792| Returns the result of adding the double-precision floating-point values `a'
3793| and `b'. The operation is performed according to the IEC/IEEE Standard for
3794| Binary Floating-Point Arithmetic.
3795*----------------------------------------------------------------------------*/
3796
3797float64 float64_add( float64 a, float64 b STATUS_PARAM )
3798{
3799 flag aSign, bSign;
37d18660
PM
3800 a = float64_squash_input_denormal(a STATUS_VAR);
3801 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3802
3803 aSign = extractFloat64Sign( a );
3804 bSign = extractFloat64Sign( b );
3805 if ( aSign == bSign ) {
3806 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3807 }
3808 else {
3809 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3810 }
3811
3812}
3813
3814/*----------------------------------------------------------------------------
3815| Returns the result of subtracting the double-precision floating-point values
3816| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3817| for Binary Floating-Point Arithmetic.
3818*----------------------------------------------------------------------------*/
3819
3820float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3821{
3822 flag aSign, bSign;
37d18660
PM
3823 a = float64_squash_input_denormal(a STATUS_VAR);
3824 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3825
3826 aSign = extractFloat64Sign( a );
3827 bSign = extractFloat64Sign( b );
3828 if ( aSign == bSign ) {
3829 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3830 }
3831 else {
3832 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3833 }
3834
3835}
3836
3837/*----------------------------------------------------------------------------
3838| Returns the result of multiplying the double-precision floating-point values
3839| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3840| for Binary Floating-Point Arithmetic.
3841*----------------------------------------------------------------------------*/
3842
3843float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3844{
3845 flag aSign, bSign, zSign;
94a49d86 3846 int_fast16_t aExp, bExp, zExp;
bb98fe42 3847 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3848
37d18660
PM
3849 a = float64_squash_input_denormal(a STATUS_VAR);
3850 b = float64_squash_input_denormal(b STATUS_VAR);
3851
158142c2
FB
3852 aSig = extractFloat64Frac( a );
3853 aExp = extractFloat64Exp( a );
3854 aSign = extractFloat64Sign( a );
3855 bSig = extractFloat64Frac( b );
3856 bExp = extractFloat64Exp( b );
3857 bSign = extractFloat64Sign( b );
3858 zSign = aSign ^ bSign;
3859 if ( aExp == 0x7FF ) {
3860 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3861 return propagateFloat64NaN( a, b STATUS_VAR );
3862 }
3863 if ( ( bExp | bSig ) == 0 ) {
3864 float_raise( float_flag_invalid STATUS_VAR);
3865 return float64_default_nan;
3866 }
3867 return packFloat64( zSign, 0x7FF, 0 );
3868 }
3869 if ( bExp == 0x7FF ) {
3870 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3871 if ( ( aExp | aSig ) == 0 ) {
3872 float_raise( float_flag_invalid STATUS_VAR);
3873 return float64_default_nan;
3874 }
3875 return packFloat64( zSign, 0x7FF, 0 );
3876 }
3877 if ( aExp == 0 ) {
3878 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3879 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3880 }
3881 if ( bExp == 0 ) {
3882 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3883 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3884 }
3885 zExp = aExp + bExp - 0x3FF;
3886 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3887 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3888 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3889 zSig0 |= ( zSig1 != 0 );
bb98fe42 3890 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3891 zSig0 <<= 1;
3892 --zExp;
3893 }
3894 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3895
3896}
3897
3898/*----------------------------------------------------------------------------
3899| Returns the result of dividing the double-precision floating-point value `a'
3900| by the corresponding value `b'. The operation is performed according to
3901| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3902*----------------------------------------------------------------------------*/
3903
3904float64 float64_div( float64 a, float64 b STATUS_PARAM )
3905{
3906 flag aSign, bSign, zSign;
94a49d86 3907 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
3908 uint64_t aSig, bSig, zSig;
3909 uint64_t rem0, rem1;
3910 uint64_t term0, term1;
37d18660
PM
3911 a = float64_squash_input_denormal(a STATUS_VAR);
3912 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3913
3914 aSig = extractFloat64Frac( a );
3915 aExp = extractFloat64Exp( a );
3916 aSign = extractFloat64Sign( a );
3917 bSig = extractFloat64Frac( b );
3918 bExp = extractFloat64Exp( b );
3919 bSign = extractFloat64Sign( b );
3920 zSign = aSign ^ bSign;
3921 if ( aExp == 0x7FF ) {
3922 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3923 if ( bExp == 0x7FF ) {
3924 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3925 float_raise( float_flag_invalid STATUS_VAR);
3926 return float64_default_nan;
3927 }
3928 return packFloat64( zSign, 0x7FF, 0 );
3929 }
3930 if ( bExp == 0x7FF ) {
3931 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3932 return packFloat64( zSign, 0, 0 );
3933 }
3934 if ( bExp == 0 ) {
3935 if ( bSig == 0 ) {
3936 if ( ( aExp | aSig ) == 0 ) {
3937 float_raise( float_flag_invalid STATUS_VAR);
3938 return float64_default_nan;
3939 }
3940 float_raise( float_flag_divbyzero STATUS_VAR);
3941 return packFloat64( zSign, 0x7FF, 0 );
3942 }
3943 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3944 }
3945 if ( aExp == 0 ) {
3946 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3947 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3948 }
3949 zExp = aExp - bExp + 0x3FD;
3950 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3951 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3952 if ( bSig <= ( aSig + aSig ) ) {
3953 aSig >>= 1;
3954 ++zExp;
3955 }
3956 zSig = estimateDiv128To64( aSig, 0, bSig );
3957 if ( ( zSig & 0x1FF ) <= 2 ) {
3958 mul64To128( bSig, zSig, &term0, &term1 );
3959 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 3960 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
3961 --zSig;
3962 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3963 }
3964 zSig |= ( rem1 != 0 );
3965 }
3966 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3967
3968}
3969
3970/*----------------------------------------------------------------------------
3971| Returns the remainder of the double-precision floating-point value `a'
3972| with respect to the corresponding value `b'. The operation is performed
3973| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3974*----------------------------------------------------------------------------*/
3975
3976float64 float64_rem( float64 a, float64 b STATUS_PARAM )
3977{
ed086f3d 3978 flag aSign, zSign;
94a49d86 3979 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
3980 uint64_t aSig, bSig;
3981 uint64_t q, alternateASig;
3982 int64_t sigMean;
158142c2 3983
37d18660
PM
3984 a = float64_squash_input_denormal(a STATUS_VAR);
3985 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3986 aSig = extractFloat64Frac( a );
3987 aExp = extractFloat64Exp( a );
3988 aSign = extractFloat64Sign( a );
3989 bSig = extractFloat64Frac( b );
3990 bExp = extractFloat64Exp( b );
158142c2
FB
3991 if ( aExp == 0x7FF ) {
3992 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3993 return propagateFloat64NaN( a, b STATUS_VAR );
3994 }
3995 float_raise( float_flag_invalid STATUS_VAR);
3996 return float64_default_nan;
3997 }
3998 if ( bExp == 0x7FF ) {
3999 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
4000 return a;
4001 }
4002 if ( bExp == 0 ) {
4003 if ( bSig == 0 ) {
4004 float_raise( float_flag_invalid STATUS_VAR);
4005 return float64_default_nan;
4006 }
4007 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4008 }
4009 if ( aExp == 0 ) {
4010 if ( aSig == 0 ) return a;
4011 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4012 }
4013 expDiff = aExp - bExp;
4014 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4015 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4016 if ( expDiff < 0 ) {
4017 if ( expDiff < -1 ) return a;
4018 aSig >>= 1;
4019 }
4020 q = ( bSig <= aSig );
4021 if ( q ) aSig -= bSig;
4022 expDiff -= 64;
4023 while ( 0 < expDiff ) {
4024 q = estimateDiv128To64( aSig, 0, bSig );
4025 q = ( 2 < q ) ? q - 2 : 0;
4026 aSig = - ( ( bSig>>2 ) * q );
4027 expDiff -= 62;
4028 }
4029 expDiff += 64;
4030 if ( 0 < expDiff ) {
4031 q = estimateDiv128To64( aSig, 0, bSig );
4032 q = ( 2 < q ) ? q - 2 : 0;
4033 q >>= 64 - expDiff;
4034 bSig >>= 2;
4035 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4036 }
4037 else {
4038 aSig >>= 2;
4039 bSig >>= 2;
4040 }
4041 do {
4042 alternateASig = aSig;
4043 ++q;
4044 aSig -= bSig;
bb98fe42 4045 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4046 sigMean = aSig + alternateASig;
4047 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4048 aSig = alternateASig;
4049 }
bb98fe42 4050 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
4051 if ( zSign ) aSig = - aSig;
4052 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
4053
4054}
4055
369be8f6
PM
4056/*----------------------------------------------------------------------------
4057| Returns the result of multiplying the double-precision floating-point values
4058| `a' and `b' then adding 'c', with no intermediate rounding step after the
4059| multiplication. The operation is performed according to the IEC/IEEE
4060| Standard for Binary Floating-Point Arithmetic 754-2008.
4061| The flags argument allows the caller to select negation of the
4062| addend, the intermediate product, or the final result. (The difference
4063| between this and having the caller do a separate negation is that negating
4064| externally will flip the sign bit on NaNs.)
4065*----------------------------------------------------------------------------*/
4066
4067float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
4068{
4069 flag aSign, bSign, cSign, zSign;
94a49d86 4070 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
4071 uint64_t aSig, bSig, cSig;
4072 flag pInf, pZero, pSign;
4073 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4074 int shiftcount;
4075 flag signflip, infzero;
4076
4077 a = float64_squash_input_denormal(a STATUS_VAR);
4078 b = float64_squash_input_denormal(b STATUS_VAR);
4079 c = float64_squash_input_denormal(c STATUS_VAR);
4080 aSig = extractFloat64Frac(a);
4081 aExp = extractFloat64Exp(a);
4082 aSign = extractFloat64Sign(a);
4083 bSig = extractFloat64Frac(b);
4084 bExp = extractFloat64Exp(b);
4085 bSign = extractFloat64Sign(b);
4086 cSig = extractFloat64Frac(c);
4087 cExp = extractFloat64Exp(c);
4088 cSign = extractFloat64Sign(c);
4089
4090 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4091 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4092
4093 /* It is implementation-defined whether the cases of (0,inf,qnan)
4094 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4095 * they return if they do), so we have to hand this information
4096 * off to the target-specific pick-a-NaN routine.
4097 */
4098 if (((aExp == 0x7ff) && aSig) ||
4099 ((bExp == 0x7ff) && bSig) ||
4100 ((cExp == 0x7ff) && cSig)) {
4101 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4102 }
4103
4104 if (infzero) {
4105 float_raise(float_flag_invalid STATUS_VAR);
4106 return float64_default_nan;
4107 }
4108
4109 if (flags & float_muladd_negate_c) {
4110 cSign ^= 1;
4111 }
4112
4113 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4114
4115 /* Work out the sign and type of the product */
4116 pSign = aSign ^ bSign;
4117 if (flags & float_muladd_negate_product) {
4118 pSign ^= 1;
4119 }
4120 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4121 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4122
4123 if (cExp == 0x7ff) {
4124 if (pInf && (pSign ^ cSign)) {
4125 /* addition of opposite-signed infinities => InvalidOperation */
4126 float_raise(float_flag_invalid STATUS_VAR);
4127 return float64_default_nan;
4128 }
4129 /* Otherwise generate an infinity of the same sign */
4130 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4131 }
4132
4133 if (pInf) {
4134 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4135 }
4136
4137 if (pZero) {
4138 if (cExp == 0) {
4139 if (cSig == 0) {
4140 /* Adding two exact zeroes */
4141 if (pSign == cSign) {
4142 zSign = pSign;
4143 } else if (STATUS(float_rounding_mode) == float_round_down) {
4144 zSign = 1;
4145 } else {
4146 zSign = 0;
4147 }
4148 return packFloat64(zSign ^ signflip, 0, 0);
4149 }
4150 /* Exact zero plus a denorm */
4151 if (STATUS(flush_to_zero)) {
4152 float_raise(float_flag_output_denormal STATUS_VAR);
4153 return packFloat64(cSign ^ signflip, 0, 0);
4154 }
4155 }
4156 /* Zero plus something non-zero : just return the something */
67d43538
PM
4157 if (flags & float_muladd_halve_result) {
4158 if (cExp == 0) {
4159 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4160 }
4161 /* Subtract one to halve, and one again because roundAndPackFloat64
4162 * wants one less than the true exponent.
4163 */
4164 cExp -= 2;
4165 cSig = (cSig | 0x0010000000000000ULL) << 10;
4166 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
4167 }
a6e7c184 4168 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
4169 }
4170
4171 if (aExp == 0) {
4172 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4173 }
4174 if (bExp == 0) {
4175 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4176 }
4177
4178 /* Calculate the actual result a * b + c */
4179
4180 /* Multiply first; this is easy. */
4181 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4182 * because we want the true exponent, not the "one-less-than"
4183 * flavour that roundAndPackFloat64() takes.
4184 */
4185 pExp = aExp + bExp - 0x3fe;
4186 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4187 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4188 mul64To128(aSig, bSig, &pSig0, &pSig1);
4189 if ((int64_t)(pSig0 << 1) >= 0) {
4190 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4191 pExp--;
4192 }
4193
4194 zSign = pSign ^ signflip;
4195
4196 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4197 * bit in position 126.
4198 */
4199 if (cExp == 0) {
4200 if (!cSig) {
4201 /* Throw out the special case of c being an exact zero now */
4202 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
67d43538
PM
4203 if (flags & float_muladd_halve_result) {
4204 pExp--;
4205 }
369be8f6
PM
4206 return roundAndPackFloat64(zSign, pExp - 1,
4207 pSig1 STATUS_VAR);
4208 }
4209 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4210 }
4211
4212 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4213 * significand of the addend, with the explicit bit in position 126.
4214 */
4215 cSig0 = cSig << (126 - 64 - 52);
4216 cSig1 = 0;
4217 cSig0 |= LIT64(0x4000000000000000);
4218 expDiff = pExp - cExp;
4219
4220 if (pSign == cSign) {
4221 /* Addition */
4222 if (expDiff > 0) {
4223 /* scale c to match p */
4224 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4225 zExp = pExp;
4226 } else if (expDiff < 0) {
4227 /* scale p to match c */
4228 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4229 zExp = cExp;
4230 } else {
4231 /* no scaling needed */
4232 zExp = cExp;
4233 }
4234 /* Add significands and make sure explicit bit ends up in posn 126 */
4235 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4236 if ((int64_t)zSig0 < 0) {
4237 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4238 } else {
4239 zExp--;
4240 }
4241 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
67d43538
PM
4242 if (flags & float_muladd_halve_result) {
4243 zExp--;
4244 }
369be8f6
PM
4245 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4246 } else {
4247 /* Subtraction */
4248 if (expDiff > 0) {
4249 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4250 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4251 zExp = pExp;
4252 } else if (expDiff < 0) {
4253 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4254 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4255 zExp = cExp;
4256 zSign ^= 1;
4257 } else {
4258 zExp = pExp;
4259 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4260 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4261 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4262 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4263 zSign ^= 1;
4264 } else {
4265 /* Exact zero */
4266 zSign = signflip;
4267 if (STATUS(float_rounding_mode) == float_round_down) {
4268 zSign ^= 1;
4269 }
4270 return packFloat64(zSign, 0, 0);
4271 }
4272 }
4273 --zExp;
4274 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4275 * starting with the significand in a pair of uint64_t.
4276 */
4277 if (zSig0) {
4278 shiftcount = countLeadingZeros64(zSig0) - 1;
4279 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4280 if (zSig1) {
4281 zSig0 |= 1;
4282 }
4283 zExp -= shiftcount;
4284 } else {
e3d142d0
PM
4285 shiftcount = countLeadingZeros64(zSig1);
4286 if (shiftcount == 0) {
4287 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4288 zExp -= 63;
4289 } else {
4290 shiftcount--;
4291 zSig0 = zSig1 << shiftcount;
4292 zExp -= (shiftcount + 64);
4293 }
369be8f6 4294 }
67d43538
PM
4295 if (flags & float_muladd_halve_result) {
4296 zExp--;
4297 }
369be8f6
PM
4298 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4299 }
4300}
4301
158142c2
FB
4302/*----------------------------------------------------------------------------
4303| Returns the square root of the double-precision floating-point value `a'.
4304| The operation is performed according to the IEC/IEEE Standard for Binary
4305| Floating-Point Arithmetic.
4306*----------------------------------------------------------------------------*/
4307
4308float64 float64_sqrt( float64 a STATUS_PARAM )
4309{
4310 flag aSign;
94a49d86 4311 int_fast16_t aExp, zExp;
bb98fe42
AF
4312 uint64_t aSig, zSig, doubleZSig;
4313 uint64_t rem0, rem1, term0, term1;
37d18660 4314 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
4315
4316 aSig = extractFloat64Frac( a );
4317 aExp = extractFloat64Exp( a );
4318 aSign = extractFloat64Sign( a );
4319 if ( aExp == 0x7FF ) {
4320 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4321 if ( ! aSign ) return a;
4322 float_raise( float_flag_invalid STATUS_VAR);
4323 return float64_default_nan;
4324 }
4325 if ( aSign ) {
4326 if ( ( aExp | aSig ) == 0 ) return a;
4327 float_raise( float_flag_invalid STATUS_VAR);
4328 return float64_default_nan;
4329 }
4330 if ( aExp == 0 ) {
f090c9d4 4331 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4332 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4333 }
4334 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4335 aSig |= LIT64( 0x0010000000000000 );
4336 zSig = estimateSqrt32( aExp, aSig>>21 );
4337 aSig <<= 9 - ( aExp & 1 );
4338 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4339 if ( ( zSig & 0x1FF ) <= 5 ) {
4340 doubleZSig = zSig<<1;
4341 mul64To128( zSig, zSig, &term0, &term1 );
4342 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4343 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4344 --zSig;
4345 doubleZSig -= 2;
4346 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4347 }
4348 zSig |= ( ( rem0 | rem1 ) != 0 );
4349 }
4350 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4351
4352}
4353
374dfc33
AJ
4354/*----------------------------------------------------------------------------
4355| Returns the binary log of the double-precision floating-point value `a'.
4356| The operation is performed according to the IEC/IEEE Standard for Binary
4357| Floating-Point Arithmetic.
4358*----------------------------------------------------------------------------*/
4359float64 float64_log2( float64 a STATUS_PARAM )
4360{
4361 flag aSign, zSign;
94a49d86 4362 int_fast16_t aExp;
bb98fe42 4363 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 4364 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
4365
4366 aSig = extractFloat64Frac( a );
4367 aExp = extractFloat64Exp( a );
4368 aSign = extractFloat64Sign( a );
4369
4370 if ( aExp == 0 ) {
4371 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4372 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4373 }
4374 if ( aSign ) {
4375 float_raise( float_flag_invalid STATUS_VAR);
4376 return float64_default_nan;
4377 }
4378 if ( aExp == 0x7FF ) {
4379 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4380 return a;
4381 }
4382
4383 aExp -= 0x3FF;
4384 aSig |= LIT64( 0x0010000000000000 );
4385 zSign = aExp < 0;
bb98fe42 4386 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4387 for (i = 1LL << 51; i > 0; i >>= 1) {
4388 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4389 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4390 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4391 aSig >>= 1;
4392 zSig |= i;
4393 }
4394 }
4395
4396 if ( zSign )
4397 zSig = -zSig;
4398 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4399}
4400
158142c2
FB
4401/*----------------------------------------------------------------------------
4402| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4403| corresponding value `b', and 0 otherwise. The invalid exception is raised
4404| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4405| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4406*----------------------------------------------------------------------------*/
4407
b689362d 4408int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 4409{
bb98fe42 4410 uint64_t av, bv;
37d18660
PM
4411 a = float64_squash_input_denormal(a STATUS_VAR);
4412 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4413
4414 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4415 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4416 ) {
b689362d 4417 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
4418 return 0;
4419 }
f090c9d4 4420 av = float64_val(a);
a1b91bb4 4421 bv = float64_val(b);
bb98fe42 4422 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4423
4424}
4425
4426/*----------------------------------------------------------------------------
4427| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4428| equal to the corresponding value `b', and 0 otherwise. The invalid
4429| exception is raised if either operand is a NaN. The comparison is performed
4430| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4431*----------------------------------------------------------------------------*/
4432
750afe93 4433int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4434{
4435 flag aSign, bSign;
bb98fe42 4436 uint64_t av, bv;
37d18660
PM
4437 a = float64_squash_input_denormal(a STATUS_VAR);
4438 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4439
4440 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4441 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4442 ) {
4443 float_raise( float_flag_invalid STATUS_VAR);
4444 return 0;
4445 }
4446 aSign = extractFloat64Sign( a );
4447 bSign = extractFloat64Sign( b );
f090c9d4 4448 av = float64_val(a);
a1b91bb4 4449 bv = float64_val(b);
bb98fe42 4450 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4451 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4452
4453}
4454
4455/*----------------------------------------------------------------------------
4456| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4457| the corresponding value `b', and 0 otherwise. The invalid exception is
4458| raised if either operand is a NaN. The comparison is performed according
4459| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4460*----------------------------------------------------------------------------*/
4461
750afe93 4462int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4463{
4464 flag aSign, bSign;
bb98fe42 4465 uint64_t av, bv;
158142c2 4466
37d18660
PM
4467 a = float64_squash_input_denormal(a STATUS_VAR);
4468 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4469 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4470 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4471 ) {
4472 float_raise( float_flag_invalid STATUS_VAR);
4473 return 0;
4474 }
4475 aSign = extractFloat64Sign( a );
4476 bSign = extractFloat64Sign( b );
f090c9d4 4477 av = float64_val(a);
a1b91bb4 4478 bv = float64_val(b);
bb98fe42 4479 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4480 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4481
4482}
4483
67b7861d
AJ
4484/*----------------------------------------------------------------------------
4485| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4486| be compared, and 0 otherwise. The invalid exception is raised if either
4487| operand is a NaN. The comparison is performed according to the IEC/IEEE
4488| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4489*----------------------------------------------------------------------------*/
4490
4491int float64_unordered( float64 a, float64 b STATUS_PARAM )
4492{
4493 a = float64_squash_input_denormal(a STATUS_VAR);
4494 b = float64_squash_input_denormal(b STATUS_VAR);
4495
4496 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4497 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4498 ) {
4499 float_raise( float_flag_invalid STATUS_VAR);
4500 return 1;
4501 }
4502 return 0;
4503}
4504
158142c2
FB
4505/*----------------------------------------------------------------------------
4506| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4507| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4508| exception.The comparison is performed according to the IEC/IEEE Standard
4509| for Binary Floating-Point Arithmetic.
158142c2
FB
4510*----------------------------------------------------------------------------*/
4511
b689362d 4512int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 4513{
bb98fe42 4514 uint64_t av, bv;
37d18660
PM
4515 a = float64_squash_input_denormal(a STATUS_VAR);
4516 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4517
4518 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4519 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4520 ) {
b689362d
AJ
4521 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4522 float_raise( float_flag_invalid STATUS_VAR);
4523 }
158142c2
FB
4524 return 0;
4525 }
f090c9d4 4526 av = float64_val(a);
a1b91bb4 4527 bv = float64_val(b);
bb98fe42 4528 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4529
4530}
4531
4532/*----------------------------------------------------------------------------
4533| Returns 1 if the double-precision floating-point value `a' is less than or
4534| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4535| cause an exception. Otherwise, the comparison is performed according to the
4536| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4537*----------------------------------------------------------------------------*/
4538
750afe93 4539int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4540{
4541 flag aSign, bSign;
bb98fe42 4542 uint64_t av, bv;
37d18660
PM
4543 a = float64_squash_input_denormal(a STATUS_VAR);
4544 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4545
4546 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4547 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4548 ) {
4549 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4550 float_raise( float_flag_invalid STATUS_VAR);
4551 }
4552 return 0;
4553 }
4554 aSign = extractFloat64Sign( a );
4555 bSign = extractFloat64Sign( b );
f090c9d4 4556 av = float64_val(a);
a1b91bb4 4557 bv = float64_val(b);
bb98fe42 4558 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4559 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4560
4561}
4562
4563/*----------------------------------------------------------------------------
4564| Returns 1 if the double-precision floating-point value `a' is less than
4565| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4566| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4567| Standard for Binary Floating-Point Arithmetic.
4568*----------------------------------------------------------------------------*/
4569
750afe93 4570int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4571{
4572 flag aSign, bSign;
bb98fe42 4573 uint64_t av, bv;
37d18660
PM
4574 a = float64_squash_input_denormal(a STATUS_VAR);
4575 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4576
4577 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4578 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4579 ) {
4580 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4581 float_raise( float_flag_invalid STATUS_VAR);
4582 }
4583 return 0;
4584 }
4585 aSign = extractFloat64Sign( a );
4586 bSign = extractFloat64Sign( b );
f090c9d4 4587 av = float64_val(a);
a1b91bb4 4588 bv = float64_val(b);
bb98fe42 4589 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4590 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4591
4592}
4593
67b7861d
AJ
4594/*----------------------------------------------------------------------------
4595| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4596| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4597| comparison is performed according to the IEC/IEEE Standard for Binary
4598| Floating-Point Arithmetic.
4599*----------------------------------------------------------------------------*/
4600
4601int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4602{
4603 a = float64_squash_input_denormal(a STATUS_VAR);
4604 b = float64_squash_input_denormal(b STATUS_VAR);
4605
4606 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4607 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4608 ) {
4609 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4610 float_raise( float_flag_invalid STATUS_VAR);
4611 }
4612 return 1;
4613 }
4614 return 0;
4615}
4616
158142c2
FB
4617/*----------------------------------------------------------------------------
4618| Returns the result of converting the extended double-precision floating-
4619| point value `a' to the 32-bit two's complement integer format. The
4620| conversion is performed according to the IEC/IEEE Standard for Binary
4621| Floating-Point Arithmetic---which means in particular that the conversion
4622| is rounded according to the current rounding mode. If `a' is a NaN, the
4623| largest positive integer is returned. Otherwise, if the conversion
4624| overflows, the largest integer with the same sign as `a' is returned.
4625*----------------------------------------------------------------------------*/
4626
4627int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4628{
4629 flag aSign;
4630 int32 aExp, shiftCount;
bb98fe42 4631 uint64_t aSig;
158142c2
FB
4632
4633 aSig = extractFloatx80Frac( a );
4634 aExp = extractFloatx80Exp( a );
4635 aSign = extractFloatx80Sign( a );
bb98fe42 4636 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4637 shiftCount = 0x4037 - aExp;
4638 if ( shiftCount <= 0 ) shiftCount = 1;
4639 shift64RightJamming( aSig, shiftCount, &aSig );
4640 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4641
4642}
4643
4644/*----------------------------------------------------------------------------
4645| Returns the result of converting the extended double-precision floating-
4646| point value `a' to the 32-bit two's complement integer format. The
4647| conversion is performed according to the IEC/IEEE Standard for Binary
4648| Floating-Point Arithmetic, except that the conversion is always rounded
4649| toward zero. If `a' is a NaN, the largest positive integer is returned.
4650| Otherwise, if the conversion overflows, the largest integer with the same
4651| sign as `a' is returned.
4652*----------------------------------------------------------------------------*/
4653
4654int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4655{
4656 flag aSign;
4657 int32 aExp, shiftCount;
bb98fe42 4658 uint64_t aSig, savedASig;
b3a6a2e0 4659 int32_t z;
158142c2
FB
4660
4661 aSig = extractFloatx80Frac( a );
4662 aExp = extractFloatx80Exp( a );
4663 aSign = extractFloatx80Sign( a );
4664 if ( 0x401E < aExp ) {
bb98fe42 4665 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4666 goto invalid;
4667 }
4668 else if ( aExp < 0x3FFF ) {
4669 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4670 return 0;
4671 }
4672 shiftCount = 0x403E - aExp;
4673 savedASig = aSig;
4674 aSig >>= shiftCount;
4675 z = aSig;
4676 if ( aSign ) z = - z;
4677 if ( ( z < 0 ) ^ aSign ) {
4678 invalid:
4679 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4680 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4681 }
4682 if ( ( aSig<<shiftCount ) != savedASig ) {
4683 STATUS(float_exception_flags) |= float_flag_inexact;
4684 }
4685 return z;
4686
4687}
4688
4689/*----------------------------------------------------------------------------
4690| Returns the result of converting the extended double-precision floating-
4691| point value `a' to the 64-bit two's complement integer format. The
4692| conversion is performed according to the IEC/IEEE Standard for Binary
4693| Floating-Point Arithmetic---which means in particular that the conversion
4694| is rounded according to the current rounding mode. If `a' is a NaN,
4695| the largest positive integer is returned. Otherwise, if the conversion
4696| overflows, the largest integer with the same sign as `a' is returned.
4697*----------------------------------------------------------------------------*/
4698
4699int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4700{
4701 flag aSign;
4702 int32 aExp, shiftCount;
bb98fe42 4703 uint64_t aSig, aSigExtra;
158142c2
FB
4704
4705 aSig = extractFloatx80Frac( a );
4706 aExp = extractFloatx80Exp( a );
4707 aSign = extractFloatx80Sign( a );
4708 shiftCount = 0x403E - aExp;
4709 if ( shiftCount <= 0 ) {
4710 if ( shiftCount ) {
4711 float_raise( float_flag_invalid STATUS_VAR);
4712 if ( ! aSign
4713 || ( ( aExp == 0x7FFF )
4714 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4715 ) {
4716 return LIT64( 0x7FFFFFFFFFFFFFFF );
4717 }
bb98fe42 4718 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4719 }
4720 aSigExtra = 0;
4721 }
4722 else {
4723 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4724 }
4725 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4726
4727}
4728
4729/*----------------------------------------------------------------------------
4730| Returns the result of converting the extended double-precision floating-
4731| point value `a' to the 64-bit two's complement integer format. The
4732| conversion is performed according to the IEC/IEEE Standard for Binary
4733| Floating-Point Arithmetic, except that the conversion is always rounded
4734| toward zero. If `a' is a NaN, the largest positive integer is returned.
4735| Otherwise, if the conversion overflows, the largest integer with the same
4736| sign as `a' is returned.
4737*----------------------------------------------------------------------------*/
4738
4739int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4740{
4741 flag aSign;
4742 int32 aExp, shiftCount;
bb98fe42 4743 uint64_t aSig;
158142c2
FB
4744 int64 z;
4745
4746 aSig = extractFloatx80Frac( a );
4747 aExp = extractFloatx80Exp( a );
4748 aSign = extractFloatx80Sign( a );
4749 shiftCount = aExp - 0x403E;
4750 if ( 0 <= shiftCount ) {
4751 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4752 if ( ( a.high != 0xC03E ) || aSig ) {
4753 float_raise( float_flag_invalid STATUS_VAR);
4754 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4755 return LIT64( 0x7FFFFFFFFFFFFFFF );
4756 }
4757 }
bb98fe42 4758 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4759 }
4760 else if ( aExp < 0x3FFF ) {
4761 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4762 return 0;
4763 }
4764 z = aSig>>( - shiftCount );
bb98fe42 4765 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
4766 STATUS(float_exception_flags) |= float_flag_inexact;
4767 }
4768 if ( aSign ) z = - z;
4769 return z;
4770
4771}
4772
4773/*----------------------------------------------------------------------------
4774| Returns the result of converting the extended double-precision floating-
4775| point value `a' to the single-precision floating-point format. The
4776| conversion is performed according to the IEC/IEEE Standard for Binary
4777| Floating-Point Arithmetic.
4778*----------------------------------------------------------------------------*/
4779
4780float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4781{
4782 flag aSign;
4783 int32 aExp;
bb98fe42 4784 uint64_t aSig;
158142c2
FB
4785
4786 aSig = extractFloatx80Frac( a );
4787 aExp = extractFloatx80Exp( a );
4788 aSign = extractFloatx80Sign( a );
4789 if ( aExp == 0x7FFF ) {
bb98fe42 4790 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4791 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4792 }
4793 return packFloat32( aSign, 0xFF, 0 );
4794 }
4795 shift64RightJamming( aSig, 33, &aSig );
4796 if ( aExp || aSig ) aExp -= 0x3F81;
4797 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4798
4799}
4800
4801/*----------------------------------------------------------------------------
4802| Returns the result of converting the extended double-precision floating-
4803| point value `a' to the double-precision floating-point format. The
4804| conversion is performed according to the IEC/IEEE Standard for Binary
4805| Floating-Point Arithmetic.
4806*----------------------------------------------------------------------------*/
4807
4808float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4809{
4810 flag aSign;
4811 int32 aExp;
bb98fe42 4812 uint64_t aSig, zSig;
158142c2
FB
4813
4814 aSig = extractFloatx80Frac( a );
4815 aExp = extractFloatx80Exp( a );
4816 aSign = extractFloatx80Sign( a );
4817 if ( aExp == 0x7FFF ) {
bb98fe42 4818 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4819 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4820 }
4821 return packFloat64( aSign, 0x7FF, 0 );
4822 }
4823 shift64RightJamming( aSig, 1, &zSig );
4824 if ( aExp || aSig ) aExp -= 0x3C01;
4825 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4826
4827}
4828
158142c2
FB
4829/*----------------------------------------------------------------------------
4830| Returns the result of converting the extended double-precision floating-
4831| point value `a' to the quadruple-precision floating-point format. The
4832| conversion is performed according to the IEC/IEEE Standard for Binary
4833| Floating-Point Arithmetic.
4834*----------------------------------------------------------------------------*/
4835
4836float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4837{
4838 flag aSign;
94a49d86 4839 int_fast16_t aExp;
bb98fe42 4840 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4841
4842 aSig = extractFloatx80Frac( a );
4843 aExp = extractFloatx80Exp( a );
4844 aSign = extractFloatx80Sign( a );
bb98fe42 4845 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4846 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4847 }
4848 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4849 return packFloat128( aSign, aExp, zSig0, zSig1 );
4850
4851}
4852
158142c2
FB
4853/*----------------------------------------------------------------------------
4854| Rounds the extended double-precision floating-point value `a' to an integer,
4855| and returns the result as an extended quadruple-precision floating-point
4856| value. The operation is performed according to the IEC/IEEE Standard for
4857| Binary Floating-Point Arithmetic.
4858*----------------------------------------------------------------------------*/
4859
4860floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4861{
4862 flag aSign;
4863 int32 aExp;
bb98fe42 4864 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4865 floatx80 z;
4866
4867 aExp = extractFloatx80Exp( a );
4868 if ( 0x403E <= aExp ) {
bb98fe42 4869 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4870 return propagateFloatx80NaN( a, a STATUS_VAR );
4871 }
4872 return a;
4873 }
4874 if ( aExp < 0x3FFF ) {
4875 if ( ( aExp == 0 )
bb98fe42 4876 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4877 return a;
4878 }
4879 STATUS(float_exception_flags) |= float_flag_inexact;
4880 aSign = extractFloatx80Sign( a );
4881 switch ( STATUS(float_rounding_mode) ) {
4882 case float_round_nearest_even:
bb98fe42 4883 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4884 ) {
4885 return
4886 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4887 }
4888 break;
f9288a76
PM
4889 case float_round_ties_away:
4890 if (aExp == 0x3FFE) {
4891 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4892 }
4893 break;
158142c2
FB
4894 case float_round_down:
4895 return
4896 aSign ?
4897 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4898 : packFloatx80( 0, 0, 0 );
4899 case float_round_up:
4900 return
4901 aSign ? packFloatx80( 1, 0, 0 )
4902 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4903 }
4904 return packFloatx80( aSign, 0, 0 );
4905 }
4906 lastBitMask = 1;
4907 lastBitMask <<= 0x403E - aExp;
4908 roundBitsMask = lastBitMask - 1;
4909 z = a;
dc355b76
PM
4910 switch (STATUS(float_rounding_mode)) {
4911 case float_round_nearest_even:
158142c2 4912 z.low += lastBitMask>>1;
dc355b76
PM
4913 if ((z.low & roundBitsMask) == 0) {
4914 z.low &= ~lastBitMask;
4915 }
4916 break;
f9288a76
PM
4917 case float_round_ties_away:
4918 z.low += lastBitMask >> 1;
4919 break;
dc355b76
PM
4920 case float_round_to_zero:
4921 break;
4922 case float_round_up:
4923 if (!extractFloatx80Sign(z)) {
4924 z.low += roundBitsMask;
4925 }
4926 break;
4927 case float_round_down:
4928 if (extractFloatx80Sign(z)) {
158142c2
FB
4929 z.low += roundBitsMask;
4930 }
dc355b76
PM
4931 break;
4932 default:
4933 abort();
158142c2
FB
4934 }
4935 z.low &= ~ roundBitsMask;
4936 if ( z.low == 0 ) {
4937 ++z.high;
4938 z.low = LIT64( 0x8000000000000000 );
4939 }
4940 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4941 return z;
4942
4943}
4944
4945/*----------------------------------------------------------------------------
4946| Returns the result of adding the absolute values of the extended double-
4947| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4948| negated before being returned. `zSign' is ignored if the result is a NaN.
4949| The addition is performed according to the IEC/IEEE Standard for Binary
4950| Floating-Point Arithmetic.
4951*----------------------------------------------------------------------------*/
4952
4953static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
4954{
4955 int32 aExp, bExp, zExp;
bb98fe42 4956 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
4957 int32 expDiff;
4958
4959 aSig = extractFloatx80Frac( a );
4960 aExp = extractFloatx80Exp( a );
4961 bSig = extractFloatx80Frac( b );
4962 bExp = extractFloatx80Exp( b );
4963 expDiff = aExp - bExp;
4964 if ( 0 < expDiff ) {
4965 if ( aExp == 0x7FFF ) {
bb98fe42 4966 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4967 return a;
4968 }
4969 if ( bExp == 0 ) --expDiff;
4970 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
4971 zExp = aExp;
4972 }
4973 else if ( expDiff < 0 ) {
4974 if ( bExp == 0x7FFF ) {
bb98fe42 4975 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
4976 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4977 }
4978 if ( aExp == 0 ) ++expDiff;
4979 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
4980 zExp = bExp;
4981 }
4982 else {
4983 if ( aExp == 0x7FFF ) {
bb98fe42 4984 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
4985 return propagateFloatx80NaN( a, b STATUS_VAR );
4986 }
4987 return a;
4988 }
4989 zSig1 = 0;
4990 zSig0 = aSig + bSig;
4991 if ( aExp == 0 ) {
4992 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
4993 goto roundAndPack;
4994 }
4995 zExp = aExp;
4996 goto shiftRight1;
4997 }
4998 zSig0 = aSig + bSig;
bb98fe42 4999 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
5000 shiftRight1:
5001 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5002 zSig0 |= LIT64( 0x8000000000000000 );
5003 ++zExp;
5004 roundAndPack:
5005 return
5006 roundAndPackFloatx80(
5007 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5008
5009}
5010
5011/*----------------------------------------------------------------------------
5012| Returns the result of subtracting the absolute values of the extended
5013| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5014| difference is negated before being returned. `zSign' is ignored if the
5015| result is a NaN. The subtraction is performed according to the IEC/IEEE
5016| Standard for Binary Floating-Point Arithmetic.
5017*----------------------------------------------------------------------------*/
5018
5019static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
5020{
5021 int32 aExp, bExp, zExp;
bb98fe42 5022 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5023 int32 expDiff;
5024 floatx80 z;
5025
5026 aSig = extractFloatx80Frac( a );
5027 aExp = extractFloatx80Exp( a );
5028 bSig = extractFloatx80Frac( b );
5029 bExp = extractFloatx80Exp( b );
5030 expDiff = aExp - bExp;
5031 if ( 0 < expDiff ) goto aExpBigger;
5032 if ( expDiff < 0 ) goto bExpBigger;
5033 if ( aExp == 0x7FFF ) {
bb98fe42 5034 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
5035 return propagateFloatx80NaN( a, b STATUS_VAR );
5036 }
5037 float_raise( float_flag_invalid STATUS_VAR);
5038 z.low = floatx80_default_nan_low;
5039 z.high = floatx80_default_nan_high;
5040 return z;
5041 }
5042 if ( aExp == 0 ) {
5043 aExp = 1;
5044 bExp = 1;
5045 }
5046 zSig1 = 0;
5047 if ( bSig < aSig ) goto aBigger;
5048 if ( aSig < bSig ) goto bBigger;
5049 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5050 bExpBigger:
5051 if ( bExp == 0x7FFF ) {
bb98fe42 5052 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5053 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5054 }
5055 if ( aExp == 0 ) ++expDiff;
5056 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5057 bBigger:
5058 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5059 zExp = bExp;
5060 zSign ^= 1;
5061 goto normalizeRoundAndPack;
5062 aExpBigger:
5063 if ( aExp == 0x7FFF ) {
bb98fe42 5064 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5065 return a;
5066 }
5067 if ( bExp == 0 ) --expDiff;
5068 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5069 aBigger:
5070 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5071 zExp = aExp;
5072 normalizeRoundAndPack:
5073 return
5074 normalizeRoundAndPackFloatx80(
5075 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5076
5077}
5078
5079/*----------------------------------------------------------------------------
5080| Returns the result of adding the extended double-precision floating-point
5081| values `a' and `b'. The operation is performed according to the IEC/IEEE
5082| Standard for Binary Floating-Point Arithmetic.
5083*----------------------------------------------------------------------------*/
5084
5085floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
5086{
5087 flag aSign, bSign;
5088
5089 aSign = extractFloatx80Sign( a );
5090 bSign = extractFloatx80Sign( b );
5091 if ( aSign == bSign ) {
5092 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5093 }
5094 else {
5095 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5096 }
5097
5098}
5099
5100/*----------------------------------------------------------------------------
5101| Returns the result of subtracting the extended double-precision floating-
5102| point values `a' and `b'. The operation is performed according to the
5103| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5104*----------------------------------------------------------------------------*/
5105
5106floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
5107{
5108 flag aSign, bSign;
5109
5110 aSign = extractFloatx80Sign( a );
5111 bSign = extractFloatx80Sign( b );
5112 if ( aSign == bSign ) {
5113 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5114 }
5115 else {
5116 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5117 }
5118
5119}
5120
5121/*----------------------------------------------------------------------------
5122| Returns the result of multiplying the extended double-precision floating-
5123| point values `a' and `b'. The operation is performed according to the
5124| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5125*----------------------------------------------------------------------------*/
5126
5127floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5128{
5129 flag aSign, bSign, zSign;
5130 int32 aExp, bExp, zExp;
bb98fe42 5131 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5132 floatx80 z;
5133
5134 aSig = extractFloatx80Frac( a );
5135 aExp = extractFloatx80Exp( a );
5136 aSign = extractFloatx80Sign( a );
5137 bSig = extractFloatx80Frac( b );
5138 bExp = extractFloatx80Exp( b );
5139 bSign = extractFloatx80Sign( b );
5140 zSign = aSign ^ bSign;
5141 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5142 if ( (uint64_t) ( aSig<<1 )
5143 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
5144 return propagateFloatx80NaN( a, b STATUS_VAR );
5145 }
5146 if ( ( bExp | bSig ) == 0 ) goto invalid;
5147 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5148 }
5149 if ( bExp == 0x7FFF ) {
bb98fe42 5150 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5151 if ( ( aExp | aSig ) == 0 ) {
5152 invalid:
5153 float_raise( float_flag_invalid STATUS_VAR);
5154 z.low = floatx80_default_nan_low;
5155 z.high = floatx80_default_nan_high;
5156 return z;
5157 }
5158 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5159 }
5160 if ( aExp == 0 ) {
5161 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5162 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5163 }
5164 if ( bExp == 0 ) {
5165 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5166 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5167 }
5168 zExp = aExp + bExp - 0x3FFE;
5169 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5170 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5171 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5172 --zExp;
5173 }
5174 return
5175 roundAndPackFloatx80(
5176 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5177
5178}
5179
5180/*----------------------------------------------------------------------------
5181| Returns the result of dividing the extended double-precision floating-point
5182| value `a' by the corresponding value `b'. The operation is performed
5183| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5184*----------------------------------------------------------------------------*/
5185
5186floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5187{
5188 flag aSign, bSign, zSign;
5189 int32 aExp, bExp, zExp;
bb98fe42
AF
5190 uint64_t aSig, bSig, zSig0, zSig1;
5191 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
5192 floatx80 z;
5193
5194 aSig = extractFloatx80Frac( a );
5195 aExp = extractFloatx80Exp( a );
5196 aSign = extractFloatx80Sign( a );
5197 bSig = extractFloatx80Frac( b );
5198 bExp = extractFloatx80Exp( b );
5199 bSign = extractFloatx80Sign( b );
5200 zSign = aSign ^ bSign;
5201 if ( aExp == 0x7FFF ) {
bb98fe42 5202 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 5203 if ( bExp == 0x7FFF ) {
bb98fe42 5204 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5205 goto invalid;
5206 }
5207 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5208 }
5209 if ( bExp == 0x7FFF ) {
bb98fe42 5210 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5211 return packFloatx80( zSign, 0, 0 );
5212 }
5213 if ( bExp == 0 ) {
5214 if ( bSig == 0 ) {
5215 if ( ( aExp | aSig ) == 0 ) {
5216 invalid:
5217 float_raise( float_flag_invalid STATUS_VAR);
5218 z.low = floatx80_default_nan_low;
5219 z.high = floatx80_default_nan_high;
5220 return z;
5221 }
5222 float_raise( float_flag_divbyzero STATUS_VAR);
5223 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5224 }
5225 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5226 }
5227 if ( aExp == 0 ) {
5228 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5229 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5230 }
5231 zExp = aExp - bExp + 0x3FFE;
5232 rem1 = 0;
5233 if ( bSig <= aSig ) {
5234 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5235 ++zExp;
5236 }
5237 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5238 mul64To128( bSig, zSig0, &term0, &term1 );
5239 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5240 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5241 --zSig0;
5242 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5243 }
5244 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5245 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5246 mul64To128( bSig, zSig1, &term1, &term2 );
5247 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5248 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5249 --zSig1;
5250 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5251 }
5252 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5253 }
5254 return
5255 roundAndPackFloatx80(
5256 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5257
5258}
5259
5260/*----------------------------------------------------------------------------
5261| Returns the remainder of the extended double-precision floating-point value
5262| `a' with respect to the corresponding value `b'. The operation is performed
5263| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5264*----------------------------------------------------------------------------*/
5265
5266floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5267{
ed086f3d 5268 flag aSign, zSign;
158142c2 5269 int32 aExp, bExp, expDiff;
bb98fe42
AF
5270 uint64_t aSig0, aSig1, bSig;
5271 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
5272 floatx80 z;
5273
5274 aSig0 = extractFloatx80Frac( a );
5275 aExp = extractFloatx80Exp( a );
5276 aSign = extractFloatx80Sign( a );
5277 bSig = extractFloatx80Frac( b );
5278 bExp = extractFloatx80Exp( b );
158142c2 5279 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5280 if ( (uint64_t) ( aSig0<<1 )
5281 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
5282 return propagateFloatx80NaN( a, b STATUS_VAR );
5283 }
5284 goto invalid;
5285 }
5286 if ( bExp == 0x7FFF ) {
bb98fe42 5287 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5288 return a;
5289 }
5290 if ( bExp == 0 ) {
5291 if ( bSig == 0 ) {
5292 invalid:
5293 float_raise( float_flag_invalid STATUS_VAR);
5294 z.low = floatx80_default_nan_low;
5295 z.high = floatx80_default_nan_high;
5296 return z;
5297 }
5298 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5299 }
5300 if ( aExp == 0 ) {
bb98fe42 5301 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5302 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5303 }
5304 bSig |= LIT64( 0x8000000000000000 );
5305 zSign = aSign;
5306 expDiff = aExp - bExp;
5307 aSig1 = 0;
5308 if ( expDiff < 0 ) {
5309 if ( expDiff < -1 ) return a;
5310 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5311 expDiff = 0;
5312 }
5313 q = ( bSig <= aSig0 );
5314 if ( q ) aSig0 -= bSig;
5315 expDiff -= 64;
5316 while ( 0 < expDiff ) {
5317 q = estimateDiv128To64( aSig0, aSig1, bSig );
5318 q = ( 2 < q ) ? q - 2 : 0;
5319 mul64To128( bSig, q, &term0, &term1 );
5320 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5321 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5322 expDiff -= 62;
5323 }
5324 expDiff += 64;
5325 if ( 0 < expDiff ) {
5326 q = estimateDiv128To64( aSig0, aSig1, bSig );
5327 q = ( 2 < q ) ? q - 2 : 0;
5328 q >>= 64 - expDiff;
5329 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5330 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5331 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5332 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5333 ++q;
5334 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5335 }
5336 }
5337 else {
5338 term1 = 0;
5339 term0 = bSig;
5340 }
5341 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5342 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5343 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5344 && ( q & 1 ) )
5345 ) {
5346 aSig0 = alternateASig0;
5347 aSig1 = alternateASig1;
5348 zSign = ! zSign;
5349 }
5350 return
5351 normalizeRoundAndPackFloatx80(
5352 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5353
5354}
5355
5356/*----------------------------------------------------------------------------
5357| Returns the square root of the extended double-precision floating-point
5358| value `a'. The operation is performed according to the IEC/IEEE Standard
5359| for Binary Floating-Point Arithmetic.
5360*----------------------------------------------------------------------------*/
5361
5362floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5363{
5364 flag aSign;
5365 int32 aExp, zExp;
bb98fe42
AF
5366 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5367 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5368 floatx80 z;
5369
5370 aSig0 = extractFloatx80Frac( a );
5371 aExp = extractFloatx80Exp( a );
5372 aSign = extractFloatx80Sign( a );
5373 if ( aExp == 0x7FFF ) {
bb98fe42 5374 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
5375 if ( ! aSign ) return a;
5376 goto invalid;
5377 }
5378 if ( aSign ) {
5379 if ( ( aExp | aSig0 ) == 0 ) return a;
5380 invalid:
5381 float_raise( float_flag_invalid STATUS_VAR);
5382 z.low = floatx80_default_nan_low;
5383 z.high = floatx80_default_nan_high;
5384 return z;
5385 }
5386 if ( aExp == 0 ) {
5387 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5388 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5389 }
5390 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5391 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5392 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5393 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5394 doubleZSig0 = zSig0<<1;
5395 mul64To128( zSig0, zSig0, &term0, &term1 );
5396 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5397 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5398 --zSig0;
5399 doubleZSig0 -= 2;
5400 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5401 }
5402 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5403 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5404 if ( zSig1 == 0 ) zSig1 = 1;
5405 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5406 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5407 mul64To128( zSig1, zSig1, &term2, &term3 );
5408 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5409 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5410 --zSig1;
5411 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5412 term3 |= 1;
5413 term2 |= doubleZSig0;
5414 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5415 }
5416 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5417 }
5418 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5419 zSig0 |= doubleZSig0;
5420 return
5421 roundAndPackFloatx80(
5422 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5423
5424}
5425
5426/*----------------------------------------------------------------------------
b689362d
AJ
5427| Returns 1 if the extended double-precision floating-point value `a' is equal
5428| to the corresponding value `b', and 0 otherwise. The invalid exception is
5429| raised if either operand is a NaN. Otherwise, the comparison is performed
5430| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5431*----------------------------------------------------------------------------*/
5432
b689362d 5433int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5434{
5435
5436 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5437 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5438 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5439 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5440 ) {
b689362d 5441 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
5442 return 0;
5443 }
5444 return
5445 ( a.low == b.low )
5446 && ( ( a.high == b.high )
5447 || ( ( a.low == 0 )
bb98fe42 5448 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5449 );
5450
5451}
5452
5453/*----------------------------------------------------------------------------
5454| Returns 1 if the extended double-precision floating-point value `a' is
5455| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5456| invalid exception is raised if either operand is a NaN. The comparison is
5457| performed according to the IEC/IEEE Standard for Binary Floating-Point
5458| Arithmetic.
158142c2
FB
5459*----------------------------------------------------------------------------*/
5460
750afe93 5461int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5462{
5463 flag aSign, bSign;
5464
5465 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5466 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5467 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5468 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5469 ) {
5470 float_raise( float_flag_invalid STATUS_VAR);
5471 return 0;
5472 }
5473 aSign = extractFloatx80Sign( a );
5474 bSign = extractFloatx80Sign( b );
5475 if ( aSign != bSign ) {
5476 return
5477 aSign
bb98fe42 5478 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5479 == 0 );
5480 }
5481 return
5482 aSign ? le128( b.high, b.low, a.high, a.low )
5483 : le128( a.high, a.low, b.high, b.low );
5484
5485}
5486
5487/*----------------------------------------------------------------------------
5488| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5489| less than the corresponding value `b', and 0 otherwise. The invalid
5490| exception is raised if either operand is a NaN. The comparison is performed
5491| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5492*----------------------------------------------------------------------------*/
5493
750afe93 5494int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5495{
5496 flag aSign, bSign;
5497
5498 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5499 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5500 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5501 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5502 ) {
5503 float_raise( float_flag_invalid STATUS_VAR);
5504 return 0;
5505 }
5506 aSign = extractFloatx80Sign( a );
5507 bSign = extractFloatx80Sign( b );
5508 if ( aSign != bSign ) {
5509 return
5510 aSign
bb98fe42 5511 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5512 != 0 );
5513 }
5514 return
5515 aSign ? lt128( b.high, b.low, a.high, a.low )
5516 : lt128( a.high, a.low, b.high, b.low );
5517
5518}
5519
67b7861d
AJ
5520/*----------------------------------------------------------------------------
5521| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5522| cannot be compared, and 0 otherwise. The invalid exception is raised if
5523| either operand is a NaN. The comparison is performed according to the
5524| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
5525*----------------------------------------------------------------------------*/
5526int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5527{
5528 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5529 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5530 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5531 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5532 ) {
5533 float_raise( float_flag_invalid STATUS_VAR);
5534 return 1;
5535 }
5536 return 0;
5537}
5538
158142c2 5539/*----------------------------------------------------------------------------
b689362d 5540| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5541| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5542| cause an exception. The comparison is performed according to the IEC/IEEE
5543| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5544*----------------------------------------------------------------------------*/
5545
b689362d 5546int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5547{
5548
5549 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5550 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5551 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5552 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5553 ) {
b689362d
AJ
5554 if ( floatx80_is_signaling_nan( a )
5555 || floatx80_is_signaling_nan( b ) ) {
5556 float_raise( float_flag_invalid STATUS_VAR);
5557 }
158142c2
FB
5558 return 0;
5559 }
5560 return
5561 ( a.low == b.low )
5562 && ( ( a.high == b.high )
5563 || ( ( a.low == 0 )
bb98fe42 5564 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5565 );
5566
5567}
5568
5569/*----------------------------------------------------------------------------
5570| Returns 1 if the extended double-precision floating-point value `a' is less
5571| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5572| do not cause an exception. Otherwise, the comparison is performed according
5573| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5574*----------------------------------------------------------------------------*/
5575
750afe93 5576int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5577{
5578 flag aSign, bSign;
5579
5580 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5581 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5582 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5583 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5584 ) {
5585 if ( floatx80_is_signaling_nan( a )
5586 || floatx80_is_signaling_nan( b ) ) {
5587 float_raise( float_flag_invalid STATUS_VAR);
5588 }
5589 return 0;
5590 }
5591 aSign = extractFloatx80Sign( a );
5592 bSign = extractFloatx80Sign( b );
5593 if ( aSign != bSign ) {
5594 return
5595 aSign
bb98fe42 5596 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5597 == 0 );
5598 }
5599 return
5600 aSign ? le128( b.high, b.low, a.high, a.low )
5601 : le128( a.high, a.low, b.high, b.low );
5602
5603}
5604
5605/*----------------------------------------------------------------------------
5606| Returns 1 if the extended double-precision floating-point value `a' is less
5607| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5608| an exception. Otherwise, the comparison is performed according to the
5609| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5610*----------------------------------------------------------------------------*/
5611
750afe93 5612int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5613{
5614 flag aSign, bSign;
5615
5616 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5617 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5618 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5619 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5620 ) {
5621 if ( floatx80_is_signaling_nan( a )
5622 || floatx80_is_signaling_nan( b ) ) {
5623 float_raise( float_flag_invalid STATUS_VAR);
5624 }
5625 return 0;
5626 }
5627 aSign = extractFloatx80Sign( a );
5628 bSign = extractFloatx80Sign( b );
5629 if ( aSign != bSign ) {
5630 return
5631 aSign
bb98fe42 5632 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5633 != 0 );
5634 }
5635 return
5636 aSign ? lt128( b.high, b.low, a.high, a.low )
5637 : lt128( a.high, a.low, b.high, b.low );
5638
5639}
5640
67b7861d
AJ
5641/*----------------------------------------------------------------------------
5642| Returns 1 if the extended double-precision floating-point values `a' and `b'
5643| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5644| The comparison is performed according to the IEC/IEEE Standard for Binary
5645| Floating-Point Arithmetic.
5646*----------------------------------------------------------------------------*/
5647int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5648{
5649 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5650 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5651 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5652 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5653 ) {
5654 if ( floatx80_is_signaling_nan( a )
5655 || floatx80_is_signaling_nan( b ) ) {
5656 float_raise( float_flag_invalid STATUS_VAR);
5657 }
5658 return 1;
5659 }
5660 return 0;
5661}
5662
158142c2
FB
5663/*----------------------------------------------------------------------------
5664| Returns the result of converting the quadruple-precision floating-point
5665| value `a' to the 32-bit two's complement integer format. The conversion
5666| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5667| Arithmetic---which means in particular that the conversion is rounded
5668| according to the current rounding mode. If `a' is a NaN, the largest
5669| positive integer is returned. Otherwise, if the conversion overflows, the
5670| largest integer with the same sign as `a' is returned.
5671*----------------------------------------------------------------------------*/
5672
5673int32 float128_to_int32( float128 a STATUS_PARAM )
5674{
5675 flag aSign;
5676 int32 aExp, shiftCount;
bb98fe42 5677 uint64_t aSig0, aSig1;
158142c2
FB
5678
5679 aSig1 = extractFloat128Frac1( a );
5680 aSig0 = extractFloat128Frac0( a );
5681 aExp = extractFloat128Exp( a );
5682 aSign = extractFloat128Sign( a );
5683 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5684 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5685 aSig0 |= ( aSig1 != 0 );
5686 shiftCount = 0x4028 - aExp;
5687 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5688 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5689
5690}
5691
5692/*----------------------------------------------------------------------------
5693| Returns the result of converting the quadruple-precision floating-point
5694| value `a' to the 32-bit two's complement integer format. The conversion
5695| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5696| Arithmetic, except that the conversion is always rounded toward zero. If
5697| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5698| conversion overflows, the largest integer with the same sign as `a' is
5699| returned.
5700*----------------------------------------------------------------------------*/
5701
5702int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5703{
5704 flag aSign;
5705 int32 aExp, shiftCount;
bb98fe42 5706 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5707 int32_t z;
158142c2
FB
5708
5709 aSig1 = extractFloat128Frac1( a );
5710 aSig0 = extractFloat128Frac0( a );
5711 aExp = extractFloat128Exp( a );
5712 aSign = extractFloat128Sign( a );
5713 aSig0 |= ( aSig1 != 0 );
5714 if ( 0x401E < aExp ) {
5715 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5716 goto invalid;
5717 }
5718 else if ( aExp < 0x3FFF ) {
5719 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5720 return 0;
5721 }
5722 aSig0 |= LIT64( 0x0001000000000000 );
5723 shiftCount = 0x402F - aExp;
5724 savedASig = aSig0;
5725 aSig0 >>= shiftCount;
5726 z = aSig0;
5727 if ( aSign ) z = - z;
5728 if ( ( z < 0 ) ^ aSign ) {
5729 invalid:
5730 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 5731 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5732 }
5733 if ( ( aSig0<<shiftCount ) != savedASig ) {
5734 STATUS(float_exception_flags) |= float_flag_inexact;
5735 }
5736 return z;
5737
5738}
5739
5740/*----------------------------------------------------------------------------
5741| Returns the result of converting the quadruple-precision floating-point
5742| value `a' to the 64-bit two's complement integer format. The conversion
5743| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5744| Arithmetic---which means in particular that the conversion is rounded
5745| according to the current rounding mode. If `a' is a NaN, the largest
5746| positive integer is returned. Otherwise, if the conversion overflows, the
5747| largest integer with the same sign as `a' is returned.
5748*----------------------------------------------------------------------------*/
5749
5750int64 float128_to_int64( float128 a STATUS_PARAM )
5751{
5752 flag aSign;
5753 int32 aExp, shiftCount;
bb98fe42 5754 uint64_t aSig0, aSig1;
158142c2
FB
5755
5756 aSig1 = extractFloat128Frac1( a );
5757 aSig0 = extractFloat128Frac0( a );
5758 aExp = extractFloat128Exp( a );
5759 aSign = extractFloat128Sign( a );
5760 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5761 shiftCount = 0x402F - aExp;
5762 if ( shiftCount <= 0 ) {
5763 if ( 0x403E < aExp ) {
5764 float_raise( float_flag_invalid STATUS_VAR);
5765 if ( ! aSign
5766 || ( ( aExp == 0x7FFF )
5767 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5768 )
5769 ) {
5770 return LIT64( 0x7FFFFFFFFFFFFFFF );
5771 }
bb98fe42 5772 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5773 }
5774 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5775 }
5776 else {
5777 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5778 }
5779 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5780
5781}
5782
5783/*----------------------------------------------------------------------------
5784| Returns the result of converting the quadruple-precision floating-point
5785| value `a' to the 64-bit two's complement integer format. The conversion
5786| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5787| Arithmetic, except that the conversion is always rounded toward zero.
5788| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5789| the conversion overflows, the largest integer with the same sign as `a' is
5790| returned.
5791*----------------------------------------------------------------------------*/
5792
5793int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5794{
5795 flag aSign;
5796 int32 aExp, shiftCount;
bb98fe42 5797 uint64_t aSig0, aSig1;
158142c2
FB
5798 int64 z;
5799
5800 aSig1 = extractFloat128Frac1( a );
5801 aSig0 = extractFloat128Frac0( a );
5802 aExp = extractFloat128Exp( a );
5803 aSign = extractFloat128Sign( a );
5804 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5805 shiftCount = aExp - 0x402F;
5806 if ( 0 < shiftCount ) {
5807 if ( 0x403E <= aExp ) {
5808 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5809 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5810 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5811 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5812 }
5813 else {
5814 float_raise( float_flag_invalid STATUS_VAR);
5815 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5816 return LIT64( 0x7FFFFFFFFFFFFFFF );
5817 }
5818 }
bb98fe42 5819 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5820 }
5821 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5822 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
5823 STATUS(float_exception_flags) |= float_flag_inexact;
5824 }
5825 }
5826 else {
5827 if ( aExp < 0x3FFF ) {
5828 if ( aExp | aSig0 | aSig1 ) {
5829 STATUS(float_exception_flags) |= float_flag_inexact;
5830 }
5831 return 0;
5832 }
5833 z = aSig0>>( - shiftCount );
5834 if ( aSig1
bb98fe42 5835 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
5836 STATUS(float_exception_flags) |= float_flag_inexact;
5837 }
5838 }
5839 if ( aSign ) z = - z;
5840 return z;
5841
5842}
5843
5844/*----------------------------------------------------------------------------
5845| Returns the result of converting the quadruple-precision floating-point
5846| value `a' to the single-precision floating-point format. The conversion
5847| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5848| Arithmetic.
5849*----------------------------------------------------------------------------*/
5850
5851float32 float128_to_float32( float128 a STATUS_PARAM )
5852{
5853 flag aSign;
5854 int32 aExp;
bb98fe42
AF
5855 uint64_t aSig0, aSig1;
5856 uint32_t zSig;
158142c2
FB
5857
5858 aSig1 = extractFloat128Frac1( a );
5859 aSig0 = extractFloat128Frac0( a );
5860 aExp = extractFloat128Exp( a );
5861 aSign = extractFloat128Sign( a );
5862 if ( aExp == 0x7FFF ) {
5863 if ( aSig0 | aSig1 ) {
bcd4d9af 5864 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5865 }
5866 return packFloat32( aSign, 0xFF, 0 );
5867 }
5868 aSig0 |= ( aSig1 != 0 );
5869 shift64RightJamming( aSig0, 18, &aSig0 );
5870 zSig = aSig0;
5871 if ( aExp || zSig ) {
5872 zSig |= 0x40000000;
5873 aExp -= 0x3F81;
5874 }
5875 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5876
5877}
5878
5879/*----------------------------------------------------------------------------
5880| Returns the result of converting the quadruple-precision floating-point
5881| value `a' to the double-precision floating-point format. The conversion
5882| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5883| Arithmetic.
5884*----------------------------------------------------------------------------*/
5885
5886float64 float128_to_float64( float128 a STATUS_PARAM )
5887{
5888 flag aSign;
5889 int32 aExp;
bb98fe42 5890 uint64_t aSig0, aSig1;
158142c2
FB
5891
5892 aSig1 = extractFloat128Frac1( a );
5893 aSig0 = extractFloat128Frac0( a );
5894 aExp = extractFloat128Exp( a );
5895 aSign = extractFloat128Sign( a );
5896 if ( aExp == 0x7FFF ) {
5897 if ( aSig0 | aSig1 ) {
bcd4d9af 5898 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5899 }
5900 return packFloat64( aSign, 0x7FF, 0 );
5901 }
5902 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5903 aSig0 |= ( aSig1 != 0 );
5904 if ( aExp || aSig0 ) {
5905 aSig0 |= LIT64( 0x4000000000000000 );
5906 aExp -= 0x3C01;
5907 }
5908 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5909
5910}
5911
158142c2
FB
5912/*----------------------------------------------------------------------------
5913| Returns the result of converting the quadruple-precision floating-point
5914| value `a' to the extended double-precision floating-point format. The
5915| conversion is performed according to the IEC/IEEE Standard for Binary
5916| Floating-Point Arithmetic.
5917*----------------------------------------------------------------------------*/
5918
5919floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5920{
5921 flag aSign;
5922 int32 aExp;
bb98fe42 5923 uint64_t aSig0, aSig1;
158142c2
FB
5924
5925 aSig1 = extractFloat128Frac1( a );
5926 aSig0 = extractFloat128Frac0( a );
5927 aExp = extractFloat128Exp( a );
5928 aSign = extractFloat128Sign( a );
5929 if ( aExp == 0x7FFF ) {
5930 if ( aSig0 | aSig1 ) {
bcd4d9af 5931 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5932 }
5933 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5934 }
5935 if ( aExp == 0 ) {
5936 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5937 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5938 }
5939 else {
5940 aSig0 |= LIT64( 0x0001000000000000 );
5941 }
5942 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5943 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5944
5945}
5946
158142c2
FB
5947/*----------------------------------------------------------------------------
5948| Rounds the quadruple-precision floating-point value `a' to an integer, and
5949| returns the result as a quadruple-precision floating-point value. The
5950| operation is performed according to the IEC/IEEE Standard for Binary
5951| Floating-Point Arithmetic.
5952*----------------------------------------------------------------------------*/
5953
5954float128 float128_round_to_int( float128 a STATUS_PARAM )
5955{
5956 flag aSign;
5957 int32 aExp;
bb98fe42 5958 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
5959 float128 z;
5960
5961 aExp = extractFloat128Exp( a );
5962 if ( 0x402F <= aExp ) {
5963 if ( 0x406F <= aExp ) {
5964 if ( ( aExp == 0x7FFF )
5965 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
5966 ) {
5967 return propagateFloat128NaN( a, a STATUS_VAR );
5968 }
5969 return a;
5970 }
5971 lastBitMask = 1;
5972 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
5973 roundBitsMask = lastBitMask - 1;
5974 z = a;
dc355b76
PM
5975 switch (STATUS(float_rounding_mode)) {
5976 case float_round_nearest_even:
158142c2
FB
5977 if ( lastBitMask ) {
5978 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
5979 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
5980 }
5981 else {
bb98fe42 5982 if ( (int64_t) z.low < 0 ) {
158142c2 5983 ++z.high;
bb98fe42 5984 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
5985 }
5986 }
dc355b76 5987 break;
f9288a76
PM
5988 case float_round_ties_away:
5989 if (lastBitMask) {
5990 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
5991 } else {
5992 if ((int64_t) z.low < 0) {
5993 ++z.high;
5994 }
5995 }
5996 break;
dc355b76
PM
5997 case float_round_to_zero:
5998 break;
5999 case float_round_up:
6000 if (!extractFloat128Sign(z)) {
6001 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6002 }
6003 break;
6004 case float_round_down:
6005 if (extractFloat128Sign(z)) {
6006 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6007 }
dc355b76
PM
6008 break;
6009 default:
6010 abort();
158142c2
FB
6011 }
6012 z.low &= ~ roundBitsMask;
6013 }
6014 else {
6015 if ( aExp < 0x3FFF ) {
bb98fe42 6016 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
6017 STATUS(float_exception_flags) |= float_flag_inexact;
6018 aSign = extractFloat128Sign( a );
6019 switch ( STATUS(float_rounding_mode) ) {
6020 case float_round_nearest_even:
6021 if ( ( aExp == 0x3FFE )
6022 && ( extractFloat128Frac0( a )
6023 | extractFloat128Frac1( a ) )
6024 ) {
6025 return packFloat128( aSign, 0x3FFF, 0, 0 );
6026 }
6027 break;
f9288a76
PM
6028 case float_round_ties_away:
6029 if (aExp == 0x3FFE) {
6030 return packFloat128(aSign, 0x3FFF, 0, 0);
6031 }
6032 break;
158142c2
FB
6033 case float_round_down:
6034 return
6035 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6036 : packFloat128( 0, 0, 0, 0 );
6037 case float_round_up:
6038 return
6039 aSign ? packFloat128( 1, 0, 0, 0 )
6040 : packFloat128( 0, 0x3FFF, 0, 0 );
6041 }
6042 return packFloat128( aSign, 0, 0, 0 );
6043 }
6044 lastBitMask = 1;
6045 lastBitMask <<= 0x402F - aExp;
6046 roundBitsMask = lastBitMask - 1;
6047 z.low = 0;
6048 z.high = a.high;
dc355b76
PM
6049 switch (STATUS(float_rounding_mode)) {
6050 case float_round_nearest_even:
158142c2
FB
6051 z.high += lastBitMask>>1;
6052 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6053 z.high &= ~ lastBitMask;
6054 }
dc355b76 6055 break;
f9288a76
PM
6056 case float_round_ties_away:
6057 z.high += lastBitMask>>1;
6058 break;
dc355b76
PM
6059 case float_round_to_zero:
6060 break;
6061 case float_round_up:
6062 if (!extractFloat128Sign(z)) {
158142c2
FB
6063 z.high |= ( a.low != 0 );
6064 z.high += roundBitsMask;
6065 }
dc355b76
PM
6066 break;
6067 case float_round_down:
6068 if (extractFloat128Sign(z)) {
6069 z.high |= (a.low != 0);
6070 z.high += roundBitsMask;
6071 }
6072 break;
6073 default:
6074 abort();
158142c2
FB
6075 }
6076 z.high &= ~ roundBitsMask;
6077 }
6078 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6079 STATUS(float_exception_flags) |= float_flag_inexact;
6080 }
6081 return z;
6082
6083}
6084
6085/*----------------------------------------------------------------------------
6086| Returns the result of adding the absolute values of the quadruple-precision
6087| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6088| before being returned. `zSign' is ignored if the result is a NaN.
6089| The addition is performed according to the IEC/IEEE Standard for Binary
6090| Floating-Point Arithmetic.
6091*----------------------------------------------------------------------------*/
6092
6093static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6094{
6095 int32 aExp, bExp, zExp;
bb98fe42 6096 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
6097 int32 expDiff;
6098
6099 aSig1 = extractFloat128Frac1( a );
6100 aSig0 = extractFloat128Frac0( a );
6101 aExp = extractFloat128Exp( a );
6102 bSig1 = extractFloat128Frac1( b );
6103 bSig0 = extractFloat128Frac0( b );
6104 bExp = extractFloat128Exp( b );
6105 expDiff = aExp - bExp;
6106 if ( 0 < expDiff ) {
6107 if ( aExp == 0x7FFF ) {
6108 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6109 return a;
6110 }
6111 if ( bExp == 0 ) {
6112 --expDiff;
6113 }
6114 else {
6115 bSig0 |= LIT64( 0x0001000000000000 );
6116 }
6117 shift128ExtraRightJamming(
6118 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6119 zExp = aExp;
6120 }
6121 else if ( expDiff < 0 ) {
6122 if ( bExp == 0x7FFF ) {
6123 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6124 return packFloat128( zSign, 0x7FFF, 0, 0 );
6125 }
6126 if ( aExp == 0 ) {
6127 ++expDiff;
6128 }
6129 else {
6130 aSig0 |= LIT64( 0x0001000000000000 );
6131 }
6132 shift128ExtraRightJamming(
6133 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6134 zExp = bExp;
6135 }
6136 else {
6137 if ( aExp == 0x7FFF ) {
6138 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6139 return propagateFloat128NaN( a, b STATUS_VAR );
6140 }
6141 return a;
6142 }
6143 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 6144 if ( aExp == 0 ) {
e6afc87f
PM
6145 if (STATUS(flush_to_zero)) {
6146 if (zSig0 | zSig1) {
6147 float_raise(float_flag_output_denormal STATUS_VAR);
6148 }
6149 return packFloat128(zSign, 0, 0, 0);
6150 }
fe76d976
PB
6151 return packFloat128( zSign, 0, zSig0, zSig1 );
6152 }
158142c2
FB
6153 zSig2 = 0;
6154 zSig0 |= LIT64( 0x0002000000000000 );
6155 zExp = aExp;
6156 goto shiftRight1;
6157 }
6158 aSig0 |= LIT64( 0x0001000000000000 );
6159 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6160 --zExp;
6161 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6162 ++zExp;
6163 shiftRight1:
6164 shift128ExtraRightJamming(
6165 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6166 roundAndPack:
6167 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6168
6169}
6170
6171/*----------------------------------------------------------------------------
6172| Returns the result of subtracting the absolute values of the quadruple-
6173| precision floating-point values `a' and `b'. If `zSign' is 1, the
6174| difference is negated before being returned. `zSign' is ignored if the
6175| result is a NaN. The subtraction is performed according to the IEC/IEEE
6176| Standard for Binary Floating-Point Arithmetic.
6177*----------------------------------------------------------------------------*/
6178
6179static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6180{
6181 int32 aExp, bExp, zExp;
bb98fe42 6182 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
6183 int32 expDiff;
6184 float128 z;
6185
6186 aSig1 = extractFloat128Frac1( a );
6187 aSig0 = extractFloat128Frac0( a );
6188 aExp = extractFloat128Exp( a );
6189 bSig1 = extractFloat128Frac1( b );
6190 bSig0 = extractFloat128Frac0( b );
6191 bExp = extractFloat128Exp( b );
6192 expDiff = aExp - bExp;
6193 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6194 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6195 if ( 0 < expDiff ) goto aExpBigger;
6196 if ( expDiff < 0 ) goto bExpBigger;
6197 if ( aExp == 0x7FFF ) {
6198 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6199 return propagateFloat128NaN( a, b STATUS_VAR );
6200 }
6201 float_raise( float_flag_invalid STATUS_VAR);
6202 z.low = float128_default_nan_low;
6203 z.high = float128_default_nan_high;
6204 return z;
6205 }
6206 if ( aExp == 0 ) {
6207 aExp = 1;
6208 bExp = 1;
6209 }
6210 if ( bSig0 < aSig0 ) goto aBigger;
6211 if ( aSig0 < bSig0 ) goto bBigger;
6212 if ( bSig1 < aSig1 ) goto aBigger;
6213 if ( aSig1 < bSig1 ) goto bBigger;
6214 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6215 bExpBigger:
6216 if ( bExp == 0x7FFF ) {
6217 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6218 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6219 }
6220 if ( aExp == 0 ) {
6221 ++expDiff;
6222 }
6223 else {
6224 aSig0 |= LIT64( 0x4000000000000000 );
6225 }
6226 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6227 bSig0 |= LIT64( 0x4000000000000000 );
6228 bBigger:
6229 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6230 zExp = bExp;
6231 zSign ^= 1;
6232 goto normalizeRoundAndPack;
6233 aExpBigger:
6234 if ( aExp == 0x7FFF ) {
6235 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6236 return a;
6237 }
6238 if ( bExp == 0 ) {
6239 --expDiff;
6240 }
6241 else {
6242 bSig0 |= LIT64( 0x4000000000000000 );
6243 }
6244 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6245 aSig0 |= LIT64( 0x4000000000000000 );
6246 aBigger:
6247 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6248 zExp = aExp;
6249 normalizeRoundAndPack:
6250 --zExp;
6251 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6252
6253}
6254
6255/*----------------------------------------------------------------------------
6256| Returns the result of adding the quadruple-precision floating-point values
6257| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6258| for Binary Floating-Point Arithmetic.
6259*----------------------------------------------------------------------------*/
6260
6261float128 float128_add( float128 a, float128 b STATUS_PARAM )
6262{
6263 flag aSign, bSign;
6264
6265 aSign = extractFloat128Sign( a );
6266 bSign = extractFloat128Sign( b );
6267 if ( aSign == bSign ) {
6268 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6269 }
6270 else {
6271 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6272 }
6273
6274}
6275
6276/*----------------------------------------------------------------------------
6277| Returns the result of subtracting the quadruple-precision floating-point
6278| values `a' and `b'. The operation is performed according to the IEC/IEEE
6279| Standard for Binary Floating-Point Arithmetic.
6280*----------------------------------------------------------------------------*/
6281
6282float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6283{
6284 flag aSign, bSign;
6285
6286 aSign = extractFloat128Sign( a );
6287 bSign = extractFloat128Sign( b );
6288 if ( aSign == bSign ) {
6289 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6290 }
6291 else {
6292 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6293 }
6294
6295}
6296
6297/*----------------------------------------------------------------------------
6298| Returns the result of multiplying the quadruple-precision floating-point
6299| values `a' and `b'. The operation is performed according to the IEC/IEEE
6300| Standard for Binary Floating-Point Arithmetic.
6301*----------------------------------------------------------------------------*/
6302
6303float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6304{
6305 flag aSign, bSign, zSign;
6306 int32 aExp, bExp, zExp;
bb98fe42 6307 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6308 float128 z;
6309
6310 aSig1 = extractFloat128Frac1( a );
6311 aSig0 = extractFloat128Frac0( a );
6312 aExp = extractFloat128Exp( a );
6313 aSign = extractFloat128Sign( a );
6314 bSig1 = extractFloat128Frac1( b );
6315 bSig0 = extractFloat128Frac0( b );
6316 bExp = extractFloat128Exp( b );
6317 bSign = extractFloat128Sign( b );
6318 zSign = aSign ^ bSign;
6319 if ( aExp == 0x7FFF ) {
6320 if ( ( aSig0 | aSig1 )
6321 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6322 return propagateFloat128NaN( a, b STATUS_VAR );
6323 }
6324 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6325 return packFloat128( zSign, 0x7FFF, 0, 0 );
6326 }
6327 if ( bExp == 0x7FFF ) {
6328 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6329 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6330 invalid:
6331 float_raise( float_flag_invalid STATUS_VAR);
6332 z.low = float128_default_nan_low;
6333 z.high = float128_default_nan_high;
6334 return z;
6335 }
6336 return packFloat128( zSign, 0x7FFF, 0, 0 );
6337 }
6338 if ( aExp == 0 ) {
6339 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6340 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6341 }
6342 if ( bExp == 0 ) {
6343 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6344 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6345 }
6346 zExp = aExp + bExp - 0x4000;
6347 aSig0 |= LIT64( 0x0001000000000000 );
6348 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6349 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6350 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6351 zSig2 |= ( zSig3 != 0 );
6352 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6353 shift128ExtraRightJamming(
6354 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6355 ++zExp;
6356 }
6357 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6358
6359}
6360
6361/*----------------------------------------------------------------------------
6362| Returns the result of dividing the quadruple-precision floating-point value
6363| `a' by the corresponding value `b'. The operation is performed according to
6364| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6365*----------------------------------------------------------------------------*/
6366
6367float128 float128_div( float128 a, float128 b STATUS_PARAM )
6368{
6369 flag aSign, bSign, zSign;
6370 int32 aExp, bExp, zExp;
bb98fe42
AF
6371 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6372 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6373 float128 z;
6374
6375 aSig1 = extractFloat128Frac1( a );
6376 aSig0 = extractFloat128Frac0( a );
6377 aExp = extractFloat128Exp( a );
6378 aSign = extractFloat128Sign( a );
6379 bSig1 = extractFloat128Frac1( b );
6380 bSig0 = extractFloat128Frac0( b );
6381 bExp = extractFloat128Exp( b );
6382 bSign = extractFloat128Sign( b );
6383 zSign = aSign ^ bSign;
6384 if ( aExp == 0x7FFF ) {
6385 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6386 if ( bExp == 0x7FFF ) {
6387 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6388 goto invalid;
6389 }
6390 return packFloat128( zSign, 0x7FFF, 0, 0 );
6391 }
6392 if ( bExp == 0x7FFF ) {
6393 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6394 return packFloat128( zSign, 0, 0, 0 );
6395 }
6396 if ( bExp == 0 ) {
6397 if ( ( bSig0 | bSig1 ) == 0 ) {
6398 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6399 invalid:
6400 float_raise( float_flag_invalid STATUS_VAR);
6401 z.low = float128_default_nan_low;
6402 z.high = float128_default_nan_high;
6403 return z;
6404 }
6405 float_raise( float_flag_divbyzero STATUS_VAR);
6406 return packFloat128( zSign, 0x7FFF, 0, 0 );
6407 }
6408 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6409 }
6410 if ( aExp == 0 ) {
6411 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6412 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6413 }
6414 zExp = aExp - bExp + 0x3FFD;
6415 shortShift128Left(
6416 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6417 shortShift128Left(
6418 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6419 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6420 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6421 ++zExp;
6422 }
6423 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6424 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6425 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6426 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6427 --zSig0;
6428 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6429 }
6430 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6431 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6432 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6433 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6434 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6435 --zSig1;
6436 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6437 }
6438 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6439 }
6440 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6441 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6442
6443}
6444
6445/*----------------------------------------------------------------------------
6446| Returns the remainder of the quadruple-precision floating-point value `a'
6447| with respect to the corresponding value `b'. The operation is performed
6448| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6449*----------------------------------------------------------------------------*/
6450
6451float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6452{
ed086f3d 6453 flag aSign, zSign;
158142c2 6454 int32 aExp, bExp, expDiff;
bb98fe42
AF
6455 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6456 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6457 int64_t sigMean0;
158142c2
FB
6458 float128 z;
6459
6460 aSig1 = extractFloat128Frac1( a );
6461 aSig0 = extractFloat128Frac0( a );
6462 aExp = extractFloat128Exp( a );
6463 aSign = extractFloat128Sign( a );
6464 bSig1 = extractFloat128Frac1( b );
6465 bSig0 = extractFloat128Frac0( b );
6466 bExp = extractFloat128Exp( b );
158142c2
FB
6467 if ( aExp == 0x7FFF ) {
6468 if ( ( aSig0 | aSig1 )
6469 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6470 return propagateFloat128NaN( a, b STATUS_VAR );
6471 }
6472 goto invalid;
6473 }
6474 if ( bExp == 0x7FFF ) {
6475 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6476 return a;
6477 }
6478 if ( bExp == 0 ) {
6479 if ( ( bSig0 | bSig1 ) == 0 ) {
6480 invalid:
6481 float_raise( float_flag_invalid STATUS_VAR);
6482 z.low = float128_default_nan_low;
6483 z.high = float128_default_nan_high;
6484 return z;
6485 }
6486 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6487 }
6488 if ( aExp == 0 ) {
6489 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6490 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6491 }
6492 expDiff = aExp - bExp;
6493 if ( expDiff < -1 ) return a;
6494 shortShift128Left(
6495 aSig0 | LIT64( 0x0001000000000000 ),
6496 aSig1,
6497 15 - ( expDiff < 0 ),
6498 &aSig0,
6499 &aSig1
6500 );
6501 shortShift128Left(
6502 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6503 q = le128( bSig0, bSig1, aSig0, aSig1 );
6504 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6505 expDiff -= 64;
6506 while ( 0 < expDiff ) {
6507 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6508 q = ( 4 < q ) ? q - 4 : 0;
6509 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6510 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6511 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6512 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6513 expDiff -= 61;
6514 }
6515 if ( -64 < expDiff ) {
6516 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6517 q = ( 4 < q ) ? q - 4 : 0;
6518 q >>= - expDiff;
6519 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6520 expDiff += 52;
6521 if ( expDiff < 0 ) {
6522 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6523 }
6524 else {
6525 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6526 }
6527 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6528 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6529 }
6530 else {
6531 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6532 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6533 }
6534 do {
6535 alternateASig0 = aSig0;
6536 alternateASig1 = aSig1;
6537 ++q;
6538 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6539 } while ( 0 <= (int64_t) aSig0 );
158142c2 6540 add128(
bb98fe42 6541 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6542 if ( ( sigMean0 < 0 )
6543 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6544 aSig0 = alternateASig0;
6545 aSig1 = alternateASig1;
6546 }
bb98fe42 6547 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
6548 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6549 return
6550 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6551
6552}
6553
6554/*----------------------------------------------------------------------------
6555| Returns the square root of the quadruple-precision floating-point value `a'.
6556| The operation is performed according to the IEC/IEEE Standard for Binary
6557| Floating-Point Arithmetic.
6558*----------------------------------------------------------------------------*/
6559
6560float128 float128_sqrt( float128 a STATUS_PARAM )
6561{
6562 flag aSign;
6563 int32 aExp, zExp;
bb98fe42
AF
6564 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6565 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6566 float128 z;
6567
6568 aSig1 = extractFloat128Frac1( a );
6569 aSig0 = extractFloat128Frac0( a );
6570 aExp = extractFloat128Exp( a );
6571 aSign = extractFloat128Sign( a );
6572 if ( aExp == 0x7FFF ) {
6573 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6574 if ( ! aSign ) return a;
6575 goto invalid;
6576 }
6577 if ( aSign ) {
6578 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6579 invalid:
6580 float_raise( float_flag_invalid STATUS_VAR);
6581 z.low = float128_default_nan_low;
6582 z.high = float128_default_nan_high;
6583 return z;
6584 }
6585 if ( aExp == 0 ) {
6586 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6587 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6588 }
6589 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6590 aSig0 |= LIT64( 0x0001000000000000 );
6591 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6592 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6593 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6594 doubleZSig0 = zSig0<<1;
6595 mul64To128( zSig0, zSig0, &term0, &term1 );
6596 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6597 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6598 --zSig0;
6599 doubleZSig0 -= 2;
6600 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6601 }
6602 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6603 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6604 if ( zSig1 == 0 ) zSig1 = 1;
6605 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6606 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6607 mul64To128( zSig1, zSig1, &term2, &term3 );
6608 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6609 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6610 --zSig1;
6611 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6612 term3 |= 1;
6613 term2 |= doubleZSig0;
6614 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6615 }
6616 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6617 }
6618 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6619 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6620
6621}
6622
6623/*----------------------------------------------------------------------------
6624| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6625| the corresponding value `b', and 0 otherwise. The invalid exception is
6626| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6627| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6628*----------------------------------------------------------------------------*/
6629
b689362d 6630int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6631{
6632
6633 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6634 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6635 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6636 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6637 ) {
b689362d 6638 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
6639 return 0;
6640 }
6641 return
6642 ( a.low == b.low )
6643 && ( ( a.high == b.high )
6644 || ( ( a.low == 0 )
bb98fe42 6645 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6646 );
6647
6648}
6649
6650/*----------------------------------------------------------------------------
6651| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6652| or equal to the corresponding value `b', and 0 otherwise. The invalid
6653| exception is raised if either operand is a NaN. The comparison is performed
6654| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6655*----------------------------------------------------------------------------*/
6656
750afe93 6657int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6658{
6659 flag aSign, bSign;
6660
6661 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6662 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6663 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6664 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6665 ) {
6666 float_raise( float_flag_invalid STATUS_VAR);
6667 return 0;
6668 }
6669 aSign = extractFloat128Sign( a );
6670 bSign = extractFloat128Sign( b );
6671 if ( aSign != bSign ) {
6672 return
6673 aSign
bb98fe42 6674 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6675 == 0 );
6676 }
6677 return
6678 aSign ? le128( b.high, b.low, a.high, a.low )
6679 : le128( a.high, a.low, b.high, b.low );
6680
6681}
6682
6683/*----------------------------------------------------------------------------
6684| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6685| the corresponding value `b', and 0 otherwise. The invalid exception is
6686| raised if either operand is a NaN. The comparison is performed according
6687| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6688*----------------------------------------------------------------------------*/
6689
750afe93 6690int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6691{
6692 flag aSign, bSign;
6693
6694 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6695 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6696 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6697 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6698 ) {
6699 float_raise( float_flag_invalid STATUS_VAR);
6700 return 0;
6701 }
6702 aSign = extractFloat128Sign( a );
6703 bSign = extractFloat128Sign( b );
6704 if ( aSign != bSign ) {
6705 return
6706 aSign
bb98fe42 6707 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6708 != 0 );
6709 }
6710 return
6711 aSign ? lt128( b.high, b.low, a.high, a.low )
6712 : lt128( a.high, a.low, b.high, b.low );
6713
6714}
6715
67b7861d
AJ
6716/*----------------------------------------------------------------------------
6717| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6718| be compared, and 0 otherwise. The invalid exception is raised if either
6719| operand is a NaN. The comparison is performed according to the IEC/IEEE
6720| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6721*----------------------------------------------------------------------------*/
6722
6723int float128_unordered( float128 a, float128 b STATUS_PARAM )
6724{
6725 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6726 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6727 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6728 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6729 ) {
6730 float_raise( float_flag_invalid STATUS_VAR);
6731 return 1;
6732 }
6733 return 0;
6734}
6735
158142c2
FB
6736/*----------------------------------------------------------------------------
6737| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6738| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6739| exception. The comparison is performed according to the IEC/IEEE Standard
6740| for Binary Floating-Point Arithmetic.
158142c2
FB
6741*----------------------------------------------------------------------------*/
6742
b689362d 6743int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6744{
6745
6746 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6747 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6748 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6749 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6750 ) {
b689362d
AJ
6751 if ( float128_is_signaling_nan( a )
6752 || float128_is_signaling_nan( b ) ) {
6753 float_raise( float_flag_invalid STATUS_VAR);
6754 }
158142c2
FB
6755 return 0;
6756 }
6757 return
6758 ( a.low == b.low )
6759 && ( ( a.high == b.high )
6760 || ( ( a.low == 0 )
bb98fe42 6761 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6762 );
6763
6764}
6765
6766/*----------------------------------------------------------------------------
6767| Returns 1 if the quadruple-precision floating-point value `a' is less than
6768| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6769| cause an exception. Otherwise, the comparison is performed according to the
6770| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6771*----------------------------------------------------------------------------*/
6772
750afe93 6773int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6774{
6775 flag aSign, bSign;
6776
6777 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6778 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6779 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6780 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6781 ) {
6782 if ( float128_is_signaling_nan( a )
6783 || float128_is_signaling_nan( b ) ) {
6784 float_raise( float_flag_invalid STATUS_VAR);
6785 }
6786 return 0;
6787 }
6788 aSign = extractFloat128Sign( a );
6789 bSign = extractFloat128Sign( b );
6790 if ( aSign != bSign ) {
6791 return
6792 aSign
bb98fe42 6793 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6794 == 0 );
6795 }
6796 return
6797 aSign ? le128( b.high, b.low, a.high, a.low )
6798 : le128( a.high, a.low, b.high, b.low );
6799
6800}
6801
6802/*----------------------------------------------------------------------------
6803| Returns 1 if the quadruple-precision floating-point value `a' is less than
6804| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6805| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6806| Standard for Binary Floating-Point Arithmetic.
6807*----------------------------------------------------------------------------*/
6808
750afe93 6809int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6810{
6811 flag aSign, bSign;
6812
6813 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6814 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6815 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6816 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6817 ) {
6818 if ( float128_is_signaling_nan( a )
6819 || float128_is_signaling_nan( b ) ) {
6820 float_raise( float_flag_invalid STATUS_VAR);
6821 }
6822 return 0;
6823 }
6824 aSign = extractFloat128Sign( a );
6825 bSign = extractFloat128Sign( b );
6826 if ( aSign != bSign ) {
6827 return
6828 aSign
bb98fe42 6829 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6830 != 0 );
6831 }
6832 return
6833 aSign ? lt128( b.high, b.low, a.high, a.low )
6834 : lt128( a.high, a.low, b.high, b.low );
6835
6836}
6837
67b7861d
AJ
6838/*----------------------------------------------------------------------------
6839| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6840| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6841| comparison is performed according to the IEC/IEEE Standard for Binary
6842| Floating-Point Arithmetic.
6843*----------------------------------------------------------------------------*/
6844
6845int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6846{
6847 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6848 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6849 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6850 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6851 ) {
6852 if ( float128_is_signaling_nan( a )
6853 || float128_is_signaling_nan( b ) ) {
6854 float_raise( float_flag_invalid STATUS_VAR);
6855 }
6856 return 1;
6857 }
6858 return 0;
6859}
6860
1d6bda35 6861/* misc functions */
c4850f9e 6862float32 uint32_to_float32(uint32_t a STATUS_PARAM)
1d6bda35
FB
6863{
6864 return int64_to_float32(a STATUS_VAR);
6865}
6866
c4850f9e 6867float64 uint32_to_float64(uint32_t a STATUS_PARAM)
1d6bda35
FB
6868{
6869 return int64_to_float64(a STATUS_VAR);
6870}
6871
9f8d2a09 6872uint32 float32_to_uint32( float32 a STATUS_PARAM )
1d6bda35
FB
6873{
6874 int64_t v;
9f8d2a09 6875 uint32 res;
34e1c27b 6876 int old_exc_flags = get_float_exception_flags(status);
1d6bda35
FB
6877
6878 v = float32_to_int64(a STATUS_VAR);
6879 if (v < 0) {
6880 res = 0;
1d6bda35
FB
6881 } else if (v > 0xffffffff) {
6882 res = 0xffffffff;
1d6bda35 6883 } else {
34e1c27b 6884 return v;
1d6bda35 6885 }
34e1c27b
PM
6886 set_float_exception_flags(old_exc_flags, status);
6887 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6888 return res;
6889}
6890
9f8d2a09 6891uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
1d6bda35
FB
6892{
6893 int64_t v;
9f8d2a09 6894 uint32 res;
34e1c27b 6895 int old_exc_flags = get_float_exception_flags(status);
1d6bda35
FB
6896
6897 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6898 if (v < 0) {
6899 res = 0;
1d6bda35
FB
6900 } else if (v > 0xffffffff) {
6901 res = 0xffffffff;
1d6bda35 6902 } else {
34e1c27b 6903 return v;
1d6bda35 6904 }
34e1c27b
PM
6905 set_float_exception_flags(old_exc_flags, status);
6906 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6907 return res;
6908}
6909
f581bf54
WN
6910int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6911{
6912 int32_t v;
6913 int_fast16_t res;
6914 int old_exc_flags = get_float_exception_flags(status);
6915
6916 v = float32_to_int32(a STATUS_VAR);
6917 if (v < -0x8000) {
6918 res = -0x8000;
6919 } else if (v > 0x7fff) {
6920 res = 0x7fff;
6921 } else {
6922 return v;
6923 }
6924
6925 set_float_exception_flags(old_exc_flags, status);
6926 float_raise(float_flag_invalid STATUS_VAR);
6927 return res;
6928}
6929
6930uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6931{
6932 int32_t v;
6933 uint_fast16_t res;
6934 int old_exc_flags = get_float_exception_flags(status);
6935
6936 v = float32_to_int32(a STATUS_VAR);
6937 if (v < 0) {
6938 res = 0;
6939 } else if (v > 0xffff) {
6940 res = 0xffff;
6941 } else {
6942 return v;
6943 }
6944
6945 set_float_exception_flags(old_exc_flags, status);
6946 float_raise(float_flag_invalid STATUS_VAR);
6947 return res;
6948}
6949
5aea4c58 6950uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
6951{
6952 int64_t v;
5aea4c58 6953 uint_fast16_t res;
34e1c27b 6954 int old_exc_flags = get_float_exception_flags(status);
cbcef455
PM
6955
6956 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6957 if (v < 0) {
6958 res = 0;
cbcef455
PM
6959 } else if (v > 0xffff) {
6960 res = 0xffff;
cbcef455 6961 } else {
34e1c27b 6962 return v;
cbcef455 6963 }
34e1c27b
PM
6964 set_float_exception_flags(old_exc_flags, status);
6965 float_raise(float_flag_invalid STATUS_VAR);
cbcef455
PM
6966 return res;
6967}
6968
9f8d2a09 6969uint32 float64_to_uint32( float64 a STATUS_PARAM )
1d6bda35 6970{
5e7f654f 6971 uint64_t v;
9f8d2a09 6972 uint32 res;
5e7f654f 6973 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 6974
5e7f654f
TM
6975 v = float64_to_uint64(a STATUS_VAR);
6976 if (v > 0xffffffff) {
1d6bda35 6977 res = 0xffffffff;
1d6bda35 6978 } else {
5e7f654f 6979 return v;
1d6bda35 6980 }
5e7f654f
TM
6981 set_float_exception_flags(old_exc_flags, status);
6982 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6983 return res;
6984}
6985
9f8d2a09 6986uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
1d6bda35 6987{
fd728f2f 6988 uint64_t v;
9f8d2a09 6989 uint32 res;
fd728f2f 6990 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 6991
fd728f2f
TM
6992 v = float64_to_uint64_round_to_zero(a STATUS_VAR);
6993 if (v > 0xffffffff) {
1d6bda35 6994 res = 0xffffffff;
1d6bda35 6995 } else {
fd728f2f 6996 return v;
1d6bda35 6997 }
fd728f2f
TM
6998 set_float_exception_flags(old_exc_flags, status);
6999 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
7000 return res;
7001}
7002
f581bf54
WN
7003int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
7004{
7005 int64_t v;
7006 int_fast16_t res;
7007 int old_exc_flags = get_float_exception_flags(status);
7008
7009 v = float64_to_int32(a STATUS_VAR);
7010 if (v < -0x8000) {
7011 res = -0x8000;
7012 } else if (v > 0x7fff) {
7013 res = 0x7fff;
7014 } else {
7015 return v;
7016 }
7017
7018 set_float_exception_flags(old_exc_flags, status);
7019 float_raise(float_flag_invalid STATUS_VAR);
7020 return res;
7021}
7022
7023uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
7024{
7025 int64_t v;
7026 uint_fast16_t res;
7027 int old_exc_flags = get_float_exception_flags(status);
7028
7029 v = float64_to_int32(a STATUS_VAR);
7030 if (v < 0) {
7031 res = 0;
7032 } else if (v > 0xffff) {
7033 res = 0xffff;
7034 } else {
7035 return v;
7036 }
7037
7038 set_float_exception_flags(old_exc_flags, status);
7039 float_raise(float_flag_invalid STATUS_VAR);
7040 return res;
7041}
7042
5aea4c58 7043uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
7044{
7045 int64_t v;
5aea4c58 7046 uint_fast16_t res;
34e1c27b 7047 int old_exc_flags = get_float_exception_flags(status);
cbcef455
PM
7048
7049 v = float64_to_int64_round_to_zero(a STATUS_VAR);
7050 if (v < 0) {
7051 res = 0;
cbcef455
PM
7052 } else if (v > 0xffff) {
7053 res = 0xffff;
cbcef455 7054 } else {
34e1c27b 7055 return v;
cbcef455 7056 }
34e1c27b
PM
7057 set_float_exception_flags(old_exc_flags, status);
7058 float_raise(float_flag_invalid STATUS_VAR);
cbcef455
PM
7059 return res;
7060}
7061
fb3ea83a
TM
7062/*----------------------------------------------------------------------------
7063| Returns the result of converting the double-precision floating-point value
7064| `a' to the 64-bit unsigned integer format. The conversion is
7065| performed according to the IEC/IEEE Standard for Binary Floating-Point
7066| Arithmetic---which means in particular that the conversion is rounded
7067| according to the current rounding mode. If `a' is a NaN, the largest
7068| positive integer is returned. If the conversion overflows, the
7069| largest unsigned integer is returned. If 'a' is negative, the value is
7070| rounded and zero is returned; negative values that do not round to zero
7071| will raise the inexact exception.
7072*----------------------------------------------------------------------------*/
75d62a58 7073
fb3ea83a
TM
7074uint64_t float64_to_uint64(float64 a STATUS_PARAM)
7075{
7076 flag aSign;
7077 int_fast16_t aExp, shiftCount;
7078 uint64_t aSig, aSigExtra;
7079 a = float64_squash_input_denormal(a STATUS_VAR);
75d62a58 7080
fb3ea83a
TM
7081 aSig = extractFloat64Frac(a);
7082 aExp = extractFloat64Exp(a);
7083 aSign = extractFloat64Sign(a);
7084 if (aSign && (aExp > 1022)) {
7085 float_raise(float_flag_invalid STATUS_VAR);
7086 if (float64_is_any_nan(a)) {
7087 return LIT64(0xFFFFFFFFFFFFFFFF);
7088 } else {
7089 return 0;
7090 }
7091 }
7092 if (aExp) {
7093 aSig |= LIT64(0x0010000000000000);
7094 }
7095 shiftCount = 0x433 - aExp;
7096 if (shiftCount <= 0) {
7097 if (0x43E < aExp) {
7098 float_raise(float_flag_invalid STATUS_VAR);
7099 return LIT64(0xFFFFFFFFFFFFFFFF);
7100 }
7101 aSigExtra = 0;
7102 aSig <<= -shiftCount;
7103 } else {
7104 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7105 }
7106 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
75d62a58
JM
7107}
7108
7109uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
7110{
0a87a310
TM
7111 signed char current_rounding_mode = STATUS(float_rounding_mode);
7112 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7113 int64_t v = float64_to_uint64(a STATUS_VAR);
7114 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7115 return v;
75d62a58
JM
7116}
7117
1d6bda35 7118#define COMPARE(s, nan_exp) \
a49db98d 7119static inline int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
7120 int is_quiet STATUS_PARAM ) \
7121{ \
7122 flag aSign, bSign; \
bb98fe42 7123 uint ## s ## _t av, bv; \
37d18660
PM
7124 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7125 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
7126 \
7127 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7128 extractFloat ## s ## Frac( a ) ) || \
7129 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7130 extractFloat ## s ## Frac( b ) )) { \
7131 if (!is_quiet || \
7132 float ## s ## _is_signaling_nan( a ) || \
7133 float ## s ## _is_signaling_nan( b ) ) { \
7134 float_raise( float_flag_invalid STATUS_VAR); \
7135 } \
7136 return float_relation_unordered; \
7137 } \
7138 aSign = extractFloat ## s ## Sign( a ); \
7139 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 7140 av = float ## s ## _val(a); \
cd8a2533 7141 bv = float ## s ## _val(b); \
1d6bda35 7142 if ( aSign != bSign ) { \
bb98fe42 7143 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
7144 /* zero case */ \
7145 return float_relation_equal; \
7146 } else { \
7147 return 1 - (2 * aSign); \
7148 } \
7149 } else { \
f090c9d4 7150 if (av == bv) { \
1d6bda35
FB
7151 return float_relation_equal; \
7152 } else { \
f090c9d4 7153 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
7154 } \
7155 } \
7156} \
7157 \
750afe93 7158int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
7159{ \
7160 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
7161} \
7162 \
750afe93 7163int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
7164{ \
7165 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
7166}
7167
7168COMPARE(32, 0xff)
7169COMPARE(64, 0x7ff)
9ee6e8bb 7170
a49db98d 7171static inline int floatx80_compare_internal( floatx80 a, floatx80 b,
f6714d36
AJ
7172 int is_quiet STATUS_PARAM )
7173{
7174 flag aSign, bSign;
7175
7176 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7177 ( extractFloatx80Frac( a )<<1 ) ) ||
7178 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7179 ( extractFloatx80Frac( b )<<1 ) )) {
7180 if (!is_quiet ||
7181 floatx80_is_signaling_nan( a ) ||
7182 floatx80_is_signaling_nan( b ) ) {
7183 float_raise( float_flag_invalid STATUS_VAR);
7184 }
7185 return float_relation_unordered;
7186 }
7187 aSign = extractFloatx80Sign( a );
7188 bSign = extractFloatx80Sign( b );
7189 if ( aSign != bSign ) {
7190
7191 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7192 ( ( a.low | b.low ) == 0 ) ) {
7193 /* zero case */
7194 return float_relation_equal;
7195 } else {
7196 return 1 - (2 * aSign);
7197 }
7198 } else {
7199 if (a.low == b.low && a.high == b.high) {
7200 return float_relation_equal;
7201 } else {
7202 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7203 }
7204 }
7205}
7206
7207int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
7208{
7209 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7210}
7211
7212int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
7213{
7214 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7215}
7216
a49db98d 7217static inline int float128_compare_internal( float128 a, float128 b,
1f587329
BS
7218 int is_quiet STATUS_PARAM )
7219{
7220 flag aSign, bSign;
7221
7222 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7223 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7224 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7225 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7226 if (!is_quiet ||
7227 float128_is_signaling_nan( a ) ||
7228 float128_is_signaling_nan( b ) ) {
7229 float_raise( float_flag_invalid STATUS_VAR);
7230 }
7231 return float_relation_unordered;
7232 }
7233 aSign = extractFloat128Sign( a );
7234 bSign = extractFloat128Sign( b );
7235 if ( aSign != bSign ) {
7236 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7237 /* zero case */
7238 return float_relation_equal;
7239 } else {
7240 return 1 - (2 * aSign);
7241 }
7242 } else {
7243 if (a.low == b.low && a.high == b.high) {
7244 return float_relation_equal;
7245 } else {
7246 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7247 }
7248 }
7249}
7250
7251int float128_compare( float128 a, float128 b STATUS_PARAM )
7252{
7253 return float128_compare_internal(a, b, 0 STATUS_VAR);
7254}
7255
7256int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7257{
7258 return float128_compare_internal(a, b, 1 STATUS_VAR);
7259}
7260
274f1b04
PM
/* min() and max() functions. These can't be implemented as
 * 'compare and pick one input' because that would mishandle
 * NaNs and +0 vs -0.
 *
 * minnum() and maxnum() functions. These are similar to the min()
 * and max() functions but if one of the arguments is a QNaN and
 * the other is numerical then the numerical argument is returned.
 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
 * and maxNum() operations. min() and max() are the typical min/max
 * semantics provided by many CPUs which predate that specification.
 *
 * minnummag() and maxnummag() functions correspond to minNumMag()
 * and maxNumMag() from the IEEE-754 2008.
 */
e70614ea 7275#define MINMAX(s) \
a49db98d 7276static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
2d31e060
LA
7277 int ismin, int isieee, \
7278 int ismag STATUS_PARAM) \
274f1b04
PM
7279{ \
7280 flag aSign, bSign; \
2d31e060 7281 uint ## s ## _t av, bv, aav, abv; \
274f1b04
PM
7282 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7283 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7284 if (float ## s ## _is_any_nan(a) || \
7285 float ## s ## _is_any_nan(b)) { \
e17ab310
WN
7286 if (isieee) { \
7287 if (float ## s ## _is_quiet_nan(a) && \
7288 !float ## s ##_is_any_nan(b)) { \
7289 return b; \
7290 } else if (float ## s ## _is_quiet_nan(b) && \
7291 !float ## s ## _is_any_nan(a)) { \
7292 return a; \
7293 } \
7294 } \
274f1b04
PM
7295 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7296 } \
7297 aSign = extractFloat ## s ## Sign(a); \
7298 bSign = extractFloat ## s ## Sign(b); \
7299 av = float ## s ## _val(a); \
7300 bv = float ## s ## _val(b); \
2d31e060
LA
7301 if (ismag) { \
7302 aav = float ## s ## _abs(av); \
7303 abv = float ## s ## _abs(bv); \
7304 if (aav != abv) { \
7305 if (ismin) { \
7306 return (aav < abv) ? a : b; \
7307 } else { \
7308 return (aav < abv) ? b : a; \
7309 } \
7310 } \
7311 } \
274f1b04
PM
7312 if (aSign != bSign) { \
7313 if (ismin) { \
7314 return aSign ? a : b; \
7315 } else { \
7316 return aSign ? b : a; \
7317 } \
7318 } else { \
7319 if (ismin) { \
7320 return (aSign ^ (av < bv)) ? a : b; \
7321 } else { \
7322 return (aSign ^ (av < bv)) ? b : a; \
7323 } \
7324 } \
7325} \
7326 \
7327float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7328{ \
2d31e060 7329 return float ## s ## _minmax(a, b, 1, 0, 0 STATUS_VAR); \
274f1b04
PM
7330} \
7331 \
7332float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7333{ \
2d31e060 7334 return float ## s ## _minmax(a, b, 0, 0, 0 STATUS_VAR); \
e17ab310
WN
7335} \
7336 \
7337float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7338{ \
2d31e060 7339 return float ## s ## _minmax(a, b, 1, 1, 0 STATUS_VAR); \
e17ab310
WN
7340} \
7341 \
7342float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7343{ \
2d31e060
LA
7344 return float ## s ## _minmax(a, b, 0, 1, 0 STATUS_VAR); \
7345} \
7346 \
7347float ## s float ## s ## _minnummag(float ## s a, float ## s b STATUS_PARAM) \
7348{ \
7349 return float ## s ## _minmax(a, b, 1, 1, 1 STATUS_VAR); \
7350} \
7351 \
7352float ## s float ## s ## _maxnummag(float ## s a, float ## s b STATUS_PARAM) \
7353{ \
7354 return float ## s ## _minmax(a, b, 0, 1, 1 STATUS_VAR); \
274f1b04
PM
7355}
7356
e70614ea
WN
7357MINMAX(32)
7358MINMAX(64)
274f1b04
PM
7359
7360
9ee6e8bb
PB
7361/* Multiply A by 2 raised to the power N. */
7362float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7363{
7364 flag aSign;
326b9e98 7365 int16_t aExp;
bb98fe42 7366 uint32_t aSig;
9ee6e8bb 7367
37d18660 7368 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
7369 aSig = extractFloat32Frac( a );
7370 aExp = extractFloat32Exp( a );
7371 aSign = extractFloat32Sign( a );
7372
7373 if ( aExp == 0xFF ) {
326b9e98
AJ
7374 if ( aSig ) {
7375 return propagateFloat32NaN( a, a STATUS_VAR );
7376 }
9ee6e8bb
PB
7377 return a;
7378 }
3c85c37f 7379 if (aExp != 0) {
69397542 7380 aSig |= 0x00800000;
3c85c37f 7381 } else if (aSig == 0) {
69397542 7382 return a;
3c85c37f
PM
7383 } else {
7384 aExp++;
7385 }
69397542 7386
326b9e98
AJ
7387 if (n > 0x200) {
7388 n = 0x200;
7389 } else if (n < -0x200) {
7390 n = -0x200;
7391 }
7392
69397542
PB
7393 aExp += n - 1;
7394 aSig <<= 7;
7395 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7396}
7397
7398float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7399{
7400 flag aSign;
326b9e98 7401 int16_t aExp;
bb98fe42 7402 uint64_t aSig;
9ee6e8bb 7403
37d18660 7404 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
7405 aSig = extractFloat64Frac( a );
7406 aExp = extractFloat64Exp( a );
7407 aSign = extractFloat64Sign( a );
7408
7409 if ( aExp == 0x7FF ) {
326b9e98
AJ
7410 if ( aSig ) {
7411 return propagateFloat64NaN( a, a STATUS_VAR );
7412 }
9ee6e8bb
PB
7413 return a;
7414 }
3c85c37f 7415 if (aExp != 0) {
69397542 7416 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7417 } else if (aSig == 0) {
69397542 7418 return a;
3c85c37f
PM
7419 } else {
7420 aExp++;
7421 }
69397542 7422
326b9e98
AJ
7423 if (n > 0x1000) {
7424 n = 0x1000;
7425 } else if (n < -0x1000) {
7426 n = -0x1000;
7427 }
7428
69397542
PB
7429 aExp += n - 1;
7430 aSig <<= 10;
7431 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7432}
7433
9ee6e8bb
PB
7434floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7435{
7436 flag aSign;
326b9e98 7437 int32_t aExp;
bb98fe42 7438 uint64_t aSig;
9ee6e8bb
PB
7439
7440 aSig = extractFloatx80Frac( a );
7441 aExp = extractFloatx80Exp( a );
7442 aSign = extractFloatx80Sign( a );
7443
326b9e98
AJ
7444 if ( aExp == 0x7FFF ) {
7445 if ( aSig<<1 ) {
7446 return propagateFloatx80NaN( a, a STATUS_VAR );
7447 }
9ee6e8bb
PB
7448 return a;
7449 }
326b9e98 7450
3c85c37f
PM
7451 if (aExp == 0) {
7452 if (aSig == 0) {
7453 return a;
7454 }
7455 aExp++;
7456 }
69397542 7457
326b9e98
AJ
7458 if (n > 0x10000) {
7459 n = 0x10000;
7460 } else if (n < -0x10000) {
7461 n = -0x10000;
7462 }
7463
9ee6e8bb 7464 aExp += n;
69397542
PB
7465 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7466 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb 7467}
9ee6e8bb 7468
9ee6e8bb
PB
7469float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7470{
7471 flag aSign;
326b9e98 7472 int32_t aExp;
bb98fe42 7473 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7474
7475 aSig1 = extractFloat128Frac1( a );
7476 aSig0 = extractFloat128Frac0( a );
7477 aExp = extractFloat128Exp( a );
7478 aSign = extractFloat128Sign( a );
7479 if ( aExp == 0x7FFF ) {
326b9e98
AJ
7480 if ( aSig0 | aSig1 ) {
7481 return propagateFloat128NaN( a, a STATUS_VAR );
7482 }
9ee6e8bb
PB
7483 return a;
7484 }
3c85c37f 7485 if (aExp != 0) {
69397542 7486 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7487 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7488 return a;
3c85c37f
PM
7489 } else {
7490 aExp++;
7491 }
69397542 7492
326b9e98
AJ
7493 if (n > 0x10000) {
7494 n = 0x10000;
7495 } else if (n < -0x10000) {
7496 n = -0x10000;
7497 }
7498
69397542
PB
7499 aExp += n - 1;
7500 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7501 STATUS_VAR );
9ee6e8bb
PB
7502
7503}